diff --git a/.gitignore b/.gitignore
index dd0f148c66c9..eeebe0d05267 100644
--- a/.gitignore
+++ b/.gitignore
@@ -45,5 +45,7 @@ tools/clang
tools/lldb
# lld, which is tracked independently.
tools/lld
+# Polly, which is tracked independently.
+tools/polly
# Sphinx build tree, if building in-source dir.
docs/_build
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 36e6ed8afa7e..b9fca2a386d1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -114,6 +114,12 @@ string(TOUPPER "${CMAKE_BUILD_TYPE}" uppercase_CMAKE_BUILD_TYPE)
# They are used as destination of target generators.
set(LLVM_RUNTIME_OUTPUT_INTDIR ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_CFG_INTDIR}/bin)
set(LLVM_LIBRARY_OUTPUT_INTDIR ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_CFG_INTDIR}/lib)
+if(WIN32 OR CYGWIN)
+  # DLL platform -- put DLLs into bin.
+  set(LLVM_SHLIB_OUTPUT_INTDIR ${LLVM_RUNTIME_OUTPUT_INTDIR})
+else()
+  set(LLVM_SHLIB_OUTPUT_INTDIR ${LLVM_LIBRARY_OUTPUT_INTDIR})
+endif()
# Each of them corresponds to llvm-config's.
set(LLVM_TOOLS_BINARY_DIR ${LLVM_RUNTIME_OUTPUT_INTDIR}) # --bindir
@@ -499,7 +505,6 @@ add_subdirectory(lib)
if( LLVM_INCLUDE_UTILS )
  add_subdirectory(utils/FileCheck)
-  add_subdirectory(utils/FileUpdate)
  add_subdirectory(utils/PerfectShuffle)
  add_subdirectory(utils/count)
  add_subdirectory(utils/not)
diff --git a/CODE_OWNERS.TXT b/CODE_OWNERS.TXT
index ee9148715901..86d401ee4498 100644
--- a/CODE_OWNERS.TXT
+++ b/CODE_OWNERS.TXT
@@ -70,8 +70,8 @@ N: Justin Holewinski
E: jholewinski@nvidia.com
D: NVPTX Target (lib/Target/NVPTX/*)
-N: Andy Kaylor
-E: andrew.kaylor@intel.com
+N: Lang Hames
+E: lhames@gmail.com
D: MCJIT, RuntimeDyld and JIT event listeners
N: Galina Kistanova
diff --git a/Makefile.config.in b/Makefile.config.in
index 1c364121ca34..b98ebc6f017d 100644
--- a/Makefile.config.in
+++ b/Makefile.config.in
@@ -259,9 +259,6 @@ CLANG_PLUGIN_SUPPORT = @CLANG_PLUGIN_SUPPORT@
# When ENABLE_CLANG_ARCMT is enabled, clang will have ARCMigrationTool.
ENABLE_CLANG_ARCMT = @ENABLE_CLANG_ARCMT@
-# When ENABLE_CLANG_REWRITER is enabled, clang will have Rewriter.
-ENABLE_CLANG_REWRITER = @ENABLE_CLANG_REWRITER@
-
# When ENABLE_CLANG_STATIC_ANALYZER is enabled, clang will have StaticAnalyzer.
ENABLE_CLANG_STATIC_ANALYZER = @ENABLE_CLANG_STATIC_ANALYZER@
diff --git a/Makefile.rules b/Makefile.rules
index ff0a3e3f8191..ebebc0a85c4f 100644
--- a/Makefile.rules
+++ b/Makefile.rules
@@ -1725,7 +1725,7 @@ $(ObjDir)/%GenDFAPacketizer.inc.tmp : %.td $(ObjDir)/.dir $(LLVM_TBLGEN)
# Dump all the records to .td.expanded.  This is useful for debugging.
$(TARGET:%=%.td.expanded): \
-%.td.expanded : %.td $(LLVM_TBLGEN)
+%.td.expanded : %.td $(LLVM_TBLGEN) $(TDFiles)
	$(Echo) "Building a fully expanded version of $(<F)"
diff --git a/autoconf/configure.ac b/autoconf/configure.ac
--- a/autoconf/configure.ac
+++ b/autoconf/configure.ac
-AC_ARG_WITH(udis86,
-  AS_HELP_STRING([--with-udis86=<path>], [Use udis86 external x86 disassembler library]),
-  [
-    AC_SUBST(USE_UDIS86, [1])
-    case "$withval" in
-      /usr/lib|yes) ;;
-      *) LDFLAGS="$LDFLAGS -L${withval}" ;;
-    esac
-    AC_CHECK_LIB(udis86, ud_init, [], [
-      echo "Error! You need to have libudis86 around."
-      exit -1
-    ])
-  ],
-  AC_SUBST(USE_UDIS86, [0]))
-AC_DEFINE_UNQUOTED([USE_UDIS86],$USE_UDIS86,
-                   [Define if use udis86 library])
-
dnl Allow OProfile support for JIT output.
AC_ARG_WITH(oprofile,
  AS_HELP_STRING([--with-oprofile=<prefix>],
diff --git a/bindings/ocaml/executionengine/llvm_executionengine.mli b/bindings/ocaml/executionengine/llvm_executionengine.mli
index 16f08930a75b..74a606287f42 100644
--- a/bindings/ocaml/executionengine/llvm_executionengine.mli
+++ b/bindings/ocaml/executionengine/llvm_executionengine.mli
@@ -151,4 +151,6 @@ module ExecutionEngine: sig
  val data_layout : t -> Llvm_target.DataLayout.t
end
+(** [initialize_native_target ()] initializes the native target corresponding
+    to the host. Returns [true] if initialization is {b not} done. *)
val initialize_native_target : unit -> bool
diff --git a/bindings/ocaml/llvm/llvm.mli b/bindings/ocaml/llvm/llvm.mli
index 59a89db5f489..f5f5b53e84d5 100644
--- a/bindings/ocaml/llvm/llvm.mli
+++ b/bindings/ocaml/llvm/llvm.mli
@@ -157,38 +157,40 @@ end
    See the [llvm::ICmpInst::Predicate] enumeration. *)
module Icmp : sig
  type t =
-  | Eq
-  | Ne
-  | Ugt
-  | Uge
-  | Ult
-  | Ule
-  | Sgt
-  | Sge
-  | Slt
-  | Sle
+  | Eq (* Equal *)
+  | Ne (* Not equal *)
+  | Ugt (* Unsigned greater than *)
+  | Uge (* Unsigned greater or equal *)
+  | Ult (* Unsigned less than *)
+  | Ule (* Unsigned less or equal *)
+  | Sgt (* Signed greater than *)
+  | Sge (* Signed greater or equal *)
+  | Slt (* Signed less than *)
+  | Sle (* Signed less or equal *)
end
(** The predicate for a floating-point comparison ([fcmp]) instruction.
+    Ordered means that neither operand is a QNAN while unordered means
+    that either operand may be a QNAN.
    See the [llvm::FCmpInst::Predicate] enumeration. *)
module Fcmp : sig
  type t =
-  | False
-  | Oeq
-  | Ogt
-  | Oge
-  | Olt
-  | Ole
-  | One
-  | Ord
-  | Uno
-  | Ueq
-  | Ugt
-  | Uge
-  | Ult
-  | Ule
-  | Une
-  | True
+  | False (* Always false *)
+  | Oeq (* Ordered and equal *)
+  | Ogt (* Ordered and greater than *)
+  | Oge (* Ordered and greater or equal *)
+  | Olt (* Ordered and less than *)
+  | Ole (* Ordered and less or equal *)
+  | One (* Ordered and not equal *)
+  | Ord (* Ordered (no operand is NaN) *)
+  | Uno (* Unordered (one operand at least is NaN) *)
+  | Ueq (* Unordered and equal *)
+  | Ugt (* Unordered and greater than *)
+  | Uge (* Unordered and greater or equal *)
+  | Ult (* Unordered and less than *)
+  | Ule (* Unordered and less or equal *)
+  | Une (* Unordered and not equal *)
+  | True (* Always true *)
end
(** The opcodes for LLVM instructions and constant expressions. *)
@@ -1051,12 +1053,12 @@ val const_lshr : llvalue -> llvalue -> llvalue
    See the method [llvm::ConstantExpr::getAShr]. *)
val const_ashr : llvalue -> llvalue -> llvalue
-(** [const_gep pc indices] returns the constant [getElementPtr] of [p1] with the
+(** [const_gep pc indices] returns the constant [getElementPtr] of [pc] with the
    constant integers indices from the array [indices].
    See the method [llvm::ConstantExpr::getGetElementPtr]. *)
val const_gep : llvalue -> llvalue array -> llvalue
-(** [const_in_bounds_gep pc indices] returns the constant [getElementPtr] of [p1]
+(** [const_in_bounds_gep pc indices] returns the constant [getElementPtr] of [pc]
    with the constant integers indices from the array [indices].
    See the method [llvm::ConstantExpr::getInBoundsGetElementPtr].
*)
val const_in_bounds_gep : llvalue -> llvalue array -> llvalue
@@ -2360,7 +2362,7 @@ val build_insertelement : llvalue -> llvalue -> llvalue -> string ->
val build_shufflevector : llvalue -> llvalue -> llvalue -> string ->
                               llbuilder -> llvalue
-(** [build_insertvalue agg idx name b] creates a
+(** [build_extractvalue agg idx name b] creates a
    [%name = extractvalue %agg, %idx]
    instruction at the position specified by the instruction builder [b].
    See the method [llvm::LLVMBuilder::CreateExtractValue]. *)
diff --git a/bindings/ocaml/llvm/llvm_ocaml.c b/bindings/ocaml/llvm/llvm_ocaml.c
index d5ebdcd3e31a..2044856ef2da 100644
--- a/bindings/ocaml/llvm/llvm_ocaml.c
+++ b/bindings/ocaml/llvm/llvm_ocaml.c
@@ -695,7 +695,7 @@ CAMLprim value llvm_append_namedmd(LLVMModuleRef M, value Name, LLVMValueRef Val
/* lltype -> int -> llvalue */
CAMLprim LLVMValueRef llvm_const_int(LLVMTypeRef IntTy, value N) {
-  return LLVMConstInt(IntTy, (long long) Int_val(N), 1);
+  return LLVMConstInt(IntTy, (long long) Long_val(N), 1);
}
/* lltype -> Int64.t -> bool -> llvalue */
diff --git a/bindings/ocaml/transforms/scalar/scalar_opts_ocaml.c b/bindings/ocaml/transforms/scalar/scalar_opts_ocaml.c
index 0a71bd7cad24..47e17902baab 100644
--- a/bindings/ocaml/transforms/scalar/scalar_opts_ocaml.c
+++ b/bindings/ocaml/transforms/scalar/scalar_opts_ocaml.c
@@ -134,6 +134,12 @@ CAMLprim value llvm_add_tail_call_elimination(LLVMPassManagerRef PM) {
  return Val_unit;
}
+/* [<Llvm.PassManager.any] Llvm.PassManager.t -> unit */
+CAMLprim value llvm_add_merged_load_store_motion(LLVMPassManagerRef PM) {
+  LLVMAddMergedLoadStoreMotionPass(PM);
+  return Val_unit;
+}
+
/* [<Llvm.PassManager.any] Llvm.PassManager.t -> unit */
CAMLprim value llvm_add_gvn(LLVMPassManagerRef PM) {
  LLVMAddGVNPass(PM);
diff --git a/cmake/modules/AddLLVM.cmake b/cmake/modules/AddLLVM.cmake
index 69ffa5b6606a..409a5d61e65b 100644
--- a/cmake/modules/AddLLVM.cmake
+++ b/cmake/modules/AddLLVM.cmake
@@ -8,8 +8,13 @@ function(llvm_update_compile_flags name)
    set(update_src_props ON)
  endif()
-  if(LLVM_REQUIRES_EH)
-    set(LLVM_REQUIRES_RTTI ON)
+  # LLVM_REQUIRES_EH is an internal flag that individual
+  # targets can use to force EH
+  if(LLVM_REQUIRES_EH OR LLVM_ENABLE_EH)
+    if(NOT (LLVM_REQUIRES_RTTI OR LLVM_ENABLE_RTTI))
+      message(AUTHOR_WARNING "Exception handling requires RTTI. Enabling RTTI for ${name}")
+      set(LLVM_REQUIRES_RTTI ON)
+    endif()
  else()
    if(LLVM_COMPILER_IS_GCC_COMPATIBLE)
      list(APPEND LLVM_COMPILE_FLAGS "-fno-exceptions")
@@ -19,7 +24,9 @@ function(llvm_update_compile_flags name)
    endif()
  endif()
-  if(NOT LLVM_REQUIRES_RTTI)
+  # LLVM_REQUIRES_RTTI is an internal flag that individual
+  # targets can use to force RTTI
+  if(NOT (LLVM_REQUIRES_RTTI OR LLVM_ENABLE_RTTI))
    list(APPEND LLVM_COMPILE_DEFINITIONS GTEST_HAS_RTTI=0)
    if (LLVM_COMPILER_IS_GCC_COMPATIBLE)
      list(APPEND LLVM_COMPILE_FLAGS "-fno-rtti")
@@ -150,19 +157,33 @@ endfunction(add_dead_strip)
# Note: Don't set variables CMAKE_*_OUTPUT_DIRECTORY any more,
# or a certain builder, for eaxample, msbuild.exe, would be confused.
function(set_output_directory target bindir libdir)
+  # Do nothing if *_OUTPUT_INTDIR is empty.
+  if("${bindir}" STREQUAL "")
+    return()
+  endif()
+
+  # moddir -- corresponding to LIBRARY_OUTPUT_DIRECTORY.
+  # It affects output of add_library(MODULE).
+ if(WIN32 OR CYGWIN) + # DLL platform + set(moddir ${bindir}) + else() + set(moddir ${libdir}) + endif() if(NOT "${CMAKE_CFG_INTDIR}" STREQUAL ".") foreach(build_mode ${CMAKE_CONFIGURATION_TYPES}) string(TOUPPER "${build_mode}" CONFIG_SUFFIX) string(REPLACE ${CMAKE_CFG_INTDIR} ${build_mode} bi ${bindir}) string(REPLACE ${CMAKE_CFG_INTDIR} ${build_mode} li ${libdir}) + string(REPLACE ${CMAKE_CFG_INTDIR} ${build_mode} mi ${moddir}) set_target_properties(${target} PROPERTIES "RUNTIME_OUTPUT_DIRECTORY_${CONFIG_SUFFIX}" ${bi}) set_target_properties(${target} PROPERTIES "ARCHIVE_OUTPUT_DIRECTORY_${CONFIG_SUFFIX}" ${li}) - set_target_properties(${target} PROPERTIES "LIBRARY_OUTPUT_DIRECTORY_${CONFIG_SUFFIX}" ${li}) + set_target_properties(${target} PROPERTIES "LIBRARY_OUTPUT_DIRECTORY_${CONFIG_SUFFIX}" ${mi}) endforeach() else() set_target_properties(${target} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${bindir}) set_target_properties(${target} PROPERTIES ARCHIVE_OUTPUT_DIRECTORY ${libdir}) - set_target_properties(${target} PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${libdir}) + set_target_properties(${target} PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${moddir}) endif() endfunction() @@ -205,7 +226,7 @@ function(llvm_add_library name) if(ARG_SHARED OR ARG_STATIC) message(WARNING "MODULE with SHARED|STATIC doesn't make sense.") endif() - if(NOT LLVM_ON_UNIX OR CYGWIN) + if(NOT LLVM_ENABLE_PLUGINS) message(STATUS "${name} ignored -- Loadable modules not supported on this platform.") return() endif() @@ -319,6 +340,13 @@ function(llvm_add_library name) ${lib_deps} ${llvm_libs} ) + elseif((CYGWIN OR WIN32) AND ARG_SHARED) + # Win32's import library may be unaware of its dependent libs. + target_link_libraries(${name} PRIVATE + ${ARG_LINK_LIBS} + ${lib_deps} + ${llvm_libs} + ) elseif(ARG_SHARED AND BUILD_SHARED_LIBS) # FIXME: It may be PRIVATE since SO knows its dependent libs. target_link_libraries(${name} PUBLIC @@ -359,6 +387,7 @@ macro(add_llvm_library name) if (NOT LLVM_INSTALL_TOOLCHAIN_ONLY OR ${name} STREQUAL "LTO") install(TARGETS ${name} EXPORT LLVMExports + RUNTIME DESTINATION bin LIBRARY DESTINATION lib${LLVM_LIBDIR_SUFFIX} ARCHIVE DESTINATION lib${LLVM_LIBDIR_SUFFIX}) endif() @@ -377,9 +406,15 @@ macro(add_llvm_loadable_module name) set_target_properties( ${name} PROPERTIES EXCLUDE_FROM_ALL ON) else() if (NOT LLVM_INSTALL_TOOLCHAIN_ONLY) + if(WIN32 OR CYGWIN) + # DLL platform + set(dlldir "bin") + else() + set(dlldir "lib${LLVM_LIBDIR_SUFFIX}") + endif() install(TARGETS ${name} EXPORT LLVMExports - LIBRARY DESTINATION lib${LLVM_LIBDIR_SUFFIX} + LIBRARY DESTINATION ${dlldir} ARCHIVE DESTINATION lib${LLVM_LIBDIR_SUFFIX}) endif() set_property(GLOBAL APPEND PROPERTY LLVM_EXPORTS ${name}) @@ -570,12 +605,6 @@ function(configure_lit_site_cfg input output) set(SHLIBEXT "${LTDL_SHLIB_EXT}") - if(BUILD_SHARED_LIBS) - set(LLVM_SHARED_LIBS_ENABLED "1") - else() - set(LLVM_SHARED_LIBS_ENABLED "0") - endif(BUILD_SHARED_LIBS) - # Configuration-time: See Unit/lit.site.cfg.in if (CMAKE_CFG_INTDIR STREQUAL ".") set(LLVM_BUILD_MODE ".") @@ -590,10 +619,16 @@ function(configure_lit_site_cfg input output) string(REPLACE ${CMAKE_CFG_INTDIR} ${LLVM_BUILD_MODE} LLVM_LIBS_DIR ${LLVM_LIBRARY_DIR}) # SHLIBDIR points the build tree. 
- string(REPLACE ${CMAKE_CFG_INTDIR} ${LLVM_BUILD_MODE} SHLIBDIR ${LLVM_LIBRARY_OUTPUT_INTDIR}) + string(REPLACE ${CMAKE_CFG_INTDIR} ${LLVM_BUILD_MODE} SHLIBDIR "${LLVM_SHLIB_OUTPUT_INTDIR}") set(PYTHON_EXECUTABLE ${PYTHON_EXECUTABLE}) - set(ENABLE_SHARED ${LLVM_SHARED_LIBS_ENABLED}) + # FIXME: "ENABLE_SHARED" doesn't make sense, since it is used just for + # plugins. We may rename it. + if(LLVM_ENABLE_PLUGINS) + set(ENABLE_SHARED "1") + else() + set(ENABLE_SHARED "0") + endif() if(LLVM_ENABLE_ASSERTIONS AND NOT MSVC_IDE) set(ENABLE_ASSERTIONS "1") @@ -604,22 +639,6 @@ function(configure_lit_site_cfg input output) set(HOST_OS ${CMAKE_SYSTEM_NAME}) set(HOST_ARCH ${CMAKE_SYSTEM_PROCESSOR}) - if (CLANG_ENABLE_ARCMT) - set(ENABLE_CLANG_ARCMT "1") - else() - set(ENABLE_CLANG_ARCMT "0") - endif() - if (CLANG_ENABLE_REWRITER) - set(ENABLE_CLANG_REWRITER "1") - else() - set(ENABLE_CLANG_REWRITER "0") - endif() - if (CLANG_ENABLE_STATIC_ANALYZER) - set(ENABLE_CLANG_STATIC_ANALYZER "1") - else() - set(ENABLE_CLANG_STATIC_ANALYZER "0") - endif() - configure_file(${input} ${output} @ONLY) endfunction() diff --git a/cmake/modules/CMakeLists.txt b/cmake/modules/CMakeLists.txt index 08aeeb9a92e9..f7c54f2c0bd3 100644 --- a/cmake/modules/CMakeLists.txt +++ b/cmake/modules/CMakeLists.txt @@ -17,7 +17,6 @@ endforeach(lib) set(LLVM_CONFIG_CODE " # LLVM_BUILD_* values available only from LLVM build tree. set(LLVM_BUILD_BINARY_DIR \"${LLVM_BINARY_DIR}\") -set(LLVM_BUILD_ENABLE_ASSERTIONS \"${LLVM_ENABLE_ASSERTIONS}\") set(LLVM_BUILD_LIBRARY_DIR \"${LLVM_LIBRARY_DIR}\") set(LLVM_BUILD_MAIN_INCLUDE_DIR \"${LLVM_MAIN_INCLUDE_DIR}\") set(LLVM_BUILD_MAIN_SRC_DIR \"${LLVM_MAIN_SRC_DIR}\") diff --git a/cmake/modules/HandleLLVMOptions.cmake b/cmake/modules/HandleLLVMOptions.cmake index 447ba52ce097..8258512c42a0 100644 --- a/cmake/modules/HandleLLVMOptions.cmake +++ b/cmake/modules/HandleLLVMOptions.cmake @@ -67,12 +67,6 @@ if( LLVM_ENABLE_ASSERTIONS ) "${flags_var_to_scrub}" "${${flags_var_to_scrub}}") endforeach() endif() -else() - if( NOT uppercase_CMAKE_BUILD_TYPE STREQUAL "RELEASE" ) - if( NOT MSVC_IDE AND NOT XCODE ) - add_definitions( -DNDEBUG ) - endif() - endif() endif() if(WIN32) @@ -113,18 +107,6 @@ if(APPLE) set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} -Wl,-flat_namespace -Wl,-undefined -Wl,suppress") endif() -function(add_flag_or_print_warning flag) - check_c_compiler_flag(${flag} C_SUPPORTS_FLAG) - check_cxx_compiler_flag(${flag} CXX_SUPPORTS_FLAG) - if (C_SUPPORTS_FLAG AND CXX_SUPPORTS_FLAG) - message(STATUS "Building with ${flag}") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${flag}" PARENT_SCOPE) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${flag}" PARENT_SCOPE) - else() - message(WARNING "${flag} is not supported.") - endif() -endfunction() - function(append value) foreach(variable ${ARGN}) set(${variable} "${${variable}} ${value}" PARENT_SCOPE) @@ -139,13 +121,25 @@ function(append_if condition value) endif() endfunction() -macro(add_flag_if_supported flag) - check_c_compiler_flag(${flag} C_SUPPORTS_FLAG) - append_if(C_SUPPORTS_FLAG "${flag}" CMAKE_C_FLAGS) - check_cxx_compiler_flag(${flag} CXX_SUPPORTS_FLAG) - append_if(CXX_SUPPORTS_FLAG "${flag}" CMAKE_CXX_FLAGS) +macro(add_flag_if_supported flag name) + check_c_compiler_flag("-Werror ${flag}" "C_SUPPORTS_${name}") + append_if("C_SUPPORTS_${name}" "${flag}" CMAKE_C_FLAGS) + check_cxx_compiler_flag("-Werror ${flag}" "CXX_SUPPORTS_${name}") + append_if("CXX_SUPPORTS_${name}" "${flag}" CMAKE_CXX_FLAGS) endmacro() 
+function(add_flag_or_print_warning flag name) + check_c_compiler_flag("-Werror ${flag}" "C_SUPPORTS_${name}") + check_cxx_compiler_flag("-Werror ${flag}" "CXX_SUPPORTS_${name}") + if ("C_SUPPORTS_${name}" AND "CXX_SUPPORTS_${name}") + message(STATUS "Building with ${flag}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${flag}" PARENT_SCOPE) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${flag}" PARENT_SCOPE) + else() + message(WARNING "${flag} is not supported.") + endif() +endfunction() + if( LLVM_ENABLE_PIC ) if( XCODE ) # Xcode has -mdynamic-no-pic on by default, which overrides -fPIC. I don't @@ -154,7 +148,7 @@ if( LLVM_ENABLE_PIC ) elseif( WIN32 OR CYGWIN) # On Windows all code is PIC. MinGW warns if -fPIC is used. else() - add_flag_or_print_warning("-fPIC") + add_flag_or_print_warning("-fPIC" FPIC) if( WIN32 OR CYGWIN) # MinGW warns if -fvisibility-inlines-hidden is used. @@ -290,10 +284,7 @@ elseif( LLVM_COMPILER_IS_GCC_COMPATIBLE ) endif() append_if(LLVM_ENABLE_PEDANTIC "-pedantic -Wno-long-long" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) - check_cxx_compiler_flag("-Werror -Wcovered-switch-default" CXX_SUPPORTS_COVERED_SWITCH_DEFAULT_FLAG) - append_if(CXX_SUPPORTS_COVERED_SWITCH_DEFAULT_FLAG "-Wcovered-switch-default" CMAKE_CXX_FLAGS) - check_c_compiler_flag("-Werror -Wcovered-switch-default" C_SUPPORTS_COVERED_SWITCH_DEFAULT_FLAG) - append_if(C_SUPPORTS_COVERED_SWITCH_DEFAULT_FLAG "-Wcovered-switch-default" CMAKE_C_FLAGS) + add_flag_if_supported("-Wcovered-switch-default" COVERED_SWITCH_DEFAULT_FLAG) append_if(USE_NO_UNINITIALIZED "-Wno-uninitialized" CMAKE_CXX_FLAGS) append_if(USE_NO_MAYBE_UNINITIALIZED "-Wno-maybe-uninitialized" CMAKE_CXX_FLAGS) check_cxx_compiler_flag("-Werror -Wnon-virtual-dtor" CXX_SUPPORTS_NON_VIRTUAL_DTOR_FLAG) @@ -311,6 +302,9 @@ elseif( LLVM_COMPILER_IS_GCC_COMPATIBLE ) endif() endif (LLVM_ENABLE_WARNINGS) append_if(LLVM_ENABLE_WERROR "-Werror" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) + if (NOT LLVM_ENABLE_TIMESTAMPS) + add_flag_if_supported("-Werror=date-time" WERROR_DATE_TIME) + endif () if (LLVM_ENABLE_CXX1Y) check_cxx_compiler_flag("-std=c++1y" CXX_SUPPORTS_CXX1Y) append_if(CXX_SUPPORTS_CXX1Y "-std=c++1y" CMAKE_CXX_FLAGS) @@ -333,14 +327,14 @@ endif( MSVC ) macro(append_common_sanitizer_flags) # Append -fno-omit-frame-pointer and turn on debug info to get better # stack traces. - add_flag_if_supported("-fno-omit-frame-pointer") + add_flag_if_supported("-fno-omit-frame-pointer" FNO_OMIT_FRAME_POINTER) if (NOT uppercase_CMAKE_BUILD_TYPE STREQUAL "DEBUG" AND NOT uppercase_CMAKE_BUILD_TYPE STREQUAL "RELWITHDEBINFO") - add_flag_if_supported("-gline-tables-only") + add_flag_if_supported("-gline-tables-only" GLINE_TABLES_ONLY) endif() # Use -O1 even in debug mode, otherwise sanitizers slowdown is too large. 
if (uppercase_CMAKE_BUILD_TYPE STREQUAL "DEBUG") - add_flag_if_supported("-O1") + add_flag_if_supported("-O1" O1) endif() endmacro() @@ -349,12 +343,12 @@ if(LLVM_USE_SANITIZER) if (LLVM_ON_UNIX) if (LLVM_USE_SANITIZER STREQUAL "Address") append_common_sanitizer_flags() - add_flag_or_print_warning("-fsanitize=address") + append("-fsanitize=address" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) elseif (LLVM_USE_SANITIZER MATCHES "Memory(WithOrigins)?") append_common_sanitizer_flags() - add_flag_or_print_warning("-fsanitize=memory") + append("-fsanitize=memory" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) if(LLVM_USE_SANITIZER STREQUAL "MemoryWithOrigins") - add_flag_or_print_warning("-fsanitize-memory-track-origins") + append("-fsanitize-memory-track-origins" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) endif() else() message(WARNING "Unsupported value of LLVM_USE_SANITIZER: ${LLVM_USE_SANITIZER}") @@ -390,15 +384,9 @@ if(NOT CYGWIN AND NOT WIN32) if (C_SUPPORTS_FNO_FUNCTION_SECTIONS) # Don't add -ffunction-section if it can be disabled with -fno-function-sections. # Doing so will break sanitizers. - check_c_compiler_flag("-Werror -ffunction-sections" C_SUPPORTS_FFUNCTION_SECTIONS) - check_cxx_compiler_flag("-Werror -ffunction-sections" CXX_SUPPORTS_FFUNCTION_SECTIONS) - append_if(C_SUPPORTS_FFUNCTION_SECTIONS "-ffunction-sections" CMAKE_C_FLAGS) - append_if(CXX_SUPPORTS_FFUNCTION_SECTIONS "-ffunction-sections" CMAKE_CXX_FLAGS) + add_flag_if_supported("-ffunction-sections" FFUNCTION_SECTIONS) endif() - check_c_compiler_flag("-Werror -fdata-sections" C_SUPPORTS_FDATA_SECTIONS) - check_cxx_compiler_flag("-Werror -fdata-sections" CXX_SUPPORTS_FDATA_SECTIONS) - append_if(C_SUPPORTS_FDATA_SECTIONS "-fdata-sections" CMAKE_C_FLAGS) - append_if(CXX_SUPPORTS_FDATA_SECTIONS "-fdata-sections" CMAKE_CXX_FLAGS) + add_flag_if_supported("-fdata-sections" FDATA_SECTIONS) endif() endif() @@ -419,3 +407,22 @@ if(MSVC) string(REGEX REPLACE "(^| ) */EH[-cs]+ *( |$)" "\\1 \\2" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") string(REGEX REPLACE "(^| ) */GR-? *( |$)" "\\1 \\2" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") endif() + +# Provide public options to globally control RTTI and EH +option(LLVM_ENABLE_EH "Enable Exception handling" OFF) +option(LLVM_ENABLE_RTTI "Enable run time type information" OFF) +if(LLVM_ENABLE_EH AND NOT LLVM_ENABLE_RTTI) + message(FATAL_ERROR "Exception handling requires RTTI. You must set LLVM_ENABLE_RTTI to ON") +endif() + +# Plugin support +# FIXME: Make this configurable. 
+if(WIN32 OR CYGWIN) + if(BUILD_SHARED_LIBS) + set(LLVM_ENABLE_PLUGINS ON) + else() + set(LLVM_ENABLE_PLUGINS OFF) + endif() +else() + set(LLVM_ENABLE_PLUGINS ON) +endif() diff --git a/cmake/modules/LLVM-Config.cmake b/cmake/modules/LLVM-Config.cmake index 2783af807a18..faba6dfd597b 100644 --- a/cmake/modules/LLVM-Config.cmake +++ b/cmake/modules/LLVM-Config.cmake @@ -105,6 +105,9 @@ function(llvm_map_components_to_libnames out_libs) if( TARGET LLVM${c}AsmParser ) list(APPEND expanded_components "LLVM${c}AsmParser") endif() + if( TARGET LLVM${c}Desc ) + list(APPEND expanded_components "LLVM${c}Desc") + endif() if( TARGET LLVM${c}Info ) list(APPEND expanded_components "LLVM${c}Info") endif() @@ -115,6 +118,12 @@ function(llvm_map_components_to_libnames out_libs) # already processed elseif( c STREQUAL "nativecodegen" ) list(APPEND expanded_components "LLVM${LLVM_NATIVE_ARCH}CodeGen") + if( TARGET LLVM${LLVM_NATIVE_ARCH}Desc ) + list(APPEND expanded_components "LLVM${LLVM_NATIVE_ARCH}Desc") + endif() + if( TARGET LLVM${LLVM_NATIVE_ARCH}Info ) + list(APPEND expanded_components "LLVM${LLVM_NATIVE_ARCH}Info") + endif() elseif( c STREQUAL "backend" ) # same case as in `native'. elseif( c STREQUAL "engine" ) diff --git a/cmake/modules/LLVMConfig.cmake.in b/cmake/modules/LLVMConfig.cmake.in index 780001a1e8d2..7c314ac7c833 100644 --- a/cmake/modules/LLVMConfig.cmake.in +++ b/cmake/modules/LLVMConfig.cmake.in @@ -21,6 +21,12 @@ set(LLVM_TARGETS_WITH_JIT @LLVM_TARGETS_WITH_JIT@) set(TARGET_TRIPLE "@TARGET_TRIPLE@") +set(LLVM_ENABLE_ASSERTIONS @LLVM_ENABLE_ASSERTIONS@) + +set(LLVM_ENABLE_EH @LLVM_ENABLE_EH@) + +set(LLVM_ENABLE_RTTI @LLVM_ENABLE_RTTI@) + set(LLVM_ENABLE_TERMINFO @LLVM_ENABLE_TERMINFO@) set(LLVM_ENABLE_THREADS @LLVM_ENABLE_THREADS@) diff --git a/cmake/modules/Makefile b/cmake/modules/Makefile index 265c1f8ec14b..33021bcbc9ae 100644 --- a/cmake/modules/Makefile +++ b/cmake/modules/Makefile @@ -15,6 +15,24 @@ include $(LEVEL)/Makefile.common PROJ_cmake := $(DESTDIR)$(PROJ_prefix)/share/llvm/cmake +ifeq ($(DISABLE_ASSERTIONS),1) + LLVM_ENABLE_ASSERTIONS := 0 +else + LLVM_ENABLE_ASSERTIONS := 1 +endif + +ifeq ($(REQUIRES_EH),1) + LLVM_ENABLE_EH := 1 +else + LLVM_ENABLE_EH := 0 +endif + +ifeq ($(REQUIRES_RTTI),1) + LLVM_ENABLE_RTTI := 1 +else + LLVM_ENABLE_RTTI := 0 +endif + OBJMODS := LLVMConfig.cmake LLVMConfigVersion.cmake LLVMExports.cmake $(PROJ_OBJ_DIR)/LLVMConfig.cmake: LLVMConfig.cmake.in $(LLVMBuildCMakeFrag) @@ -32,6 +50,9 @@ $(PROJ_OBJ_DIR)/LLVMConfig.cmake: LLVMConfig.cmake.in $(LLVMBuildCMakeFrag) -e 's/@LLVM_TARGETS_TO_BUILD@/'"$(TARGETS_TO_BUILD)"'/' \ -e 's/@LLVM_TARGETS_WITH_JIT@/'"$(TARGETS_WITH_JIT)"'/' \ -e 's/@TARGET_TRIPLE@/'"$(TARGET_TRIPLE)"'/' \ + -e 's/@LLVM_ENABLE_ASSERTIONS@/'"$(LLVM_ENABLE_ASSERTIONS)"'/' \ + -e 's/@LLVM_ENABLE_EH@/'"$(LLVM_ENABLE_EH)"'/' \ + -e 's/@LLVM_ENABLE_RTTI@/'"$(LLVM_ENABLE_RTTI)"'/' \ -e 's/@LLVM_ENABLE_TERMINFO@/'"$(ENABLE_TERMINFO)"'/' \ -e 's/@LLVM_ENABLE_THREADS@/'"$(ENABLE_THREADS)"'/' \ -e 's/@LLVM_ENABLE_ZLIB@/'"$(ENABLE_ZLIB)"'/' \ diff --git a/cmake/modules/TableGen.cmake b/cmake/modules/TableGen.cmake index 34b03430b26e..845c986ae2e0 100644 --- a/cmake/modules/TableGen.cmake +++ b/cmake/modules/TableGen.cmake @@ -77,9 +77,18 @@ if(CMAKE_CROSSCOMPILING) COMMAND ${CMAKE_COMMAND} -E make_directory ${CX_NATIVE_TG_DIR} COMMENT "Creating ${CX_NATIVE_TG_DIR}...") + # Forward a subset of configure options to discover additional tablegen modules. 
+ get_cmake_property(_variableNames CACHE_VARIABLES) + foreach (_variableName ${_variableNames}) + if (_variableName MATCHES "^(LLVM_EXTERNAL_.*_SOURCE_DIR)$") + list(APPEND CX_CMAKE_ARGUMENTS "-D${_variableName}=\"${${_variableName}}\"") + endif () + endforeach() + add_custom_command(OUTPUT ${CX_NATIVE_TG_DIR}/CMakeCache.txt + # TODO: Clear the old CMakeCache.txt somehow without breaking restat. COMMAND ${CMAKE_COMMAND} -UMAKE_TOOLCHAIN_FILE -DCMAKE_BUILD_TYPE=Release - -DLLVM_BUILD_POLLY=OFF + -DLLVM_BUILD_POLLY=OFF ${CX_CMAKE_ARGUMENTS} -G "${CMAKE_GENERATOR}" ${CMAKE_SOURCE_DIR} WORKING_DIRECTORY ${CX_NATIVE_TG_DIR} DEPENDS ${CX_NATIVE_TG_DIR} diff --git a/docs/CMake.rst b/docs/CMake.rst index 988e12b73502..2c8323875389 100644 --- a/docs/CMake.rst +++ b/docs/CMake.rst @@ -218,10 +218,18 @@ LLVM-specific variables Enables code assertions. Defaults to OFF if and only if ``CMAKE_BUILD_TYPE`` is *Release*. +**LLVM_ENABLE_EH**:BOOL + Build LLVM with exception handling support. This is necessary if you wish to + link against LLVM libraries and make use of C++ exceptions in your own code + that need to propagate through LLVM code. Defaults to OFF. + **LLVM_ENABLE_PIC**:BOOL Add the ``-fPIC`` flag for the compiler command-line, if the compiler supports this flag. Some systems, like Windows, do not need this flag. Defaults to ON. +**LLVM_ENABLE_RTTI**:BOOL + Build LLVM with run time type information. Defaults to OFF. + **LLVM_ENABLE_WARNINGS**:BOOL Enable all compiler warnings. Defaults to ON. @@ -487,7 +495,7 @@ into LLVM source tree. You can achieve it in two easy steps: #. Adding ``add_subdirectory()`` line into ``/lib/Transform/CMakeLists.txt``. -Compiler/Platform specific topics +Compiler/Platform-specific topics ================================= Notes for specific compilers and/or platforms. diff --git a/docs/CodeGenerator.rst b/docs/CodeGenerator.rst index cc099465b8ea..5736e4378d72 100644 --- a/docs/CodeGenerator.rst +++ b/docs/CodeGenerator.rst @@ -1228,7 +1228,7 @@ used. Each virtual register can only be mapped to physical registers of a particular class. For instance, in the X86 architecture, some virtuals can only be allocated to 8 bit registers. A register class is described by ``TargetRegisterClass`` objects. To discover if a virtual register is -compatible with a given physical, this code can be used:

+compatible with a given physical, this code can be used:

.. code-block:: c++

@@ -1683,7 +1683,7 @@ ones supported by the matcher), through a Requires clause:

  def : MnemonicAlias<"pushf", "pushfq">, Requires<[In64BitMode]>;
  def : MnemonicAlias<"pushf", "pushfl">, Requires<[In32BitMode]>;

-In this example, the mnemonic gets mapped into different a new one depending on
+In this example, the mnemonic gets mapped into a different one depending on
the current instruction set.

Instruction Aliases
@@ -2027,7 +2027,7 @@ supported on x86/x86-64 and PowerPC. It is performed if:

* Option ``-tailcallopt`` is enabled.

-* Platform specific constraints are met.
+* Platform-specific constraints are met.

x86/x86-64 constraints:
diff --git a/docs/CodingStandards.rst b/docs/CodingStandards.rst
index edbef3ace53c..3cfa1f66ab4e 100644
--- a/docs/CodingStandards.rst
+++ b/docs/CodingStandards.rst
@@ -107,10 +107,7 @@ unlikely to be supported by our host compilers.

* Trailing return types: N2541_
* Lambdas: N2927_

-  * But *not* ``std::function``, until Clang implements `MSVC-compatible RTTI`_.
-    In many cases, you may be able to use ``llvm::function_ref`` instead, and it
-    is a superior choice in those cases.
-  * And *not* lambdas with default arguments.
+  * But *not* lambdas with default arguments.

* ``decltype``: N2343_
* Nested closing right angle brackets: N1757_
diff --git a/docs/CommandGuide/FileCheck.rst b/docs/CommandGuide/FileCheck.rst
index 5a60d60ae3df..af01503792e8 100644
--- a/docs/CommandGuide/FileCheck.rst
+++ b/docs/CommandGuide/FileCheck.rst
@@ -49,6 +49,17 @@ OPTIONS
  The :option:`--strict-whitespace` argument disables this behavior. End-of-line
  sequences are canonicalized to UNIX-style ``\n`` in all modes.

+.. option:: --implicit-check-not check-pattern
+
+  Adds implicit negative checks for the specified patterns between positive
+  checks. The option allows writing stricter tests without stuffing them with
+  ``CHECK-NOT``\ s.
+
+  For example, "``--implicit-check-not warning:``" can be useful when testing
+  diagnostic messages from tools that don't have an option similar to ``clang
+  -verify``. With this option FileCheck will verify that input does not contain
+  warnings not covered by any ``CHECK:`` patterns.
+
.. option:: -version

  Show the version number of this program.
diff --git a/docs/GarbageCollection.rst b/docs/GarbageCollection.rst
index 323a6ea81659..dc6dab1d336c 100644
--- a/docs/GarbageCollection.rst
+++ b/docs/GarbageCollection.rst
@@ -633,7 +633,7 @@ Threaded
  Denotes a multithreaded mutator; the collector must still stop the mutator
  ("stop the world") before beginning reachability analysis. Stopping a
  multithreaded mutator is a complicated problem. It generally requires highly
-  platform specific code in the runtime, and the production of carefully
+  platform-specific code in the runtime, and the production of carefully
  designed machine code at safe points.

Concurrent
diff --git a/docs/GettingStarted.rst b/docs/GettingStarted.rst
index 6de9b9004e0f..d409f623f868 100644
--- a/docs/GettingStarted.rst
+++ b/docs/GettingStarted.rst
@@ -713,13 +713,6 @@ The following options can be used to set or enable LLVM specific options:

  generating the documentation can take a long time and producess 100s of
  megabytes of output.

-``--with-udis86``
-
-  LLVM can use external disassembler library for various purposes (now it's used
-  only for examining code produced by JIT). This option will enable usage of
-  `udis86 <http://udis86.sourceforge.net/>`_ x86 (both 32 and 64 bits)
-  disassembler library.
- To configure LLVM, follow these steps: #. Change directory into the object root directory: diff --git a/docs/HowToReleaseLLVM.rst b/docs/HowToReleaseLLVM.rst index 61aa9e869f27..26e9f3b2ee87 100644 --- a/docs/HowToReleaseLLVM.rst +++ b/docs/HowToReleaseLLVM.rst @@ -146,25 +146,25 @@ following commands: :: - $ svn mkdir https://llvm.org/svn/llvm-project/llvm/tags/RELEASE_XY + $ svn mkdir https://llvm.org/svn/llvm-project/llvm/tags/RELEASE_XYZ $ svn copy https://llvm.org/svn/llvm-project/llvm/branches/release_XY \ - https://llvm.org/svn/llvm-project/llvm/tags/RELEASE_XY/rc1 + https://llvm.org/svn/llvm-project/llvm/tags/RELEASE_XYZ/rc1 - $ svn mkdir https://llvm.org/svn/llvm-project/cfe/tags/RELEASE_XY + $ svn mkdir https://llvm.org/svn/llvm-project/cfe/tags/RELEASE_XYZ $ svn copy https://llvm.org/svn/llvm-project/cfe/branches/release_XY \ - https://llvm.org/svn/llvm-project/cfe/tags/RELEASE_XY/rc1 + https://llvm.org/svn/llvm-project/cfe/tags/RELEASE_XYZ/rc1 - $ svn mkdir https://llvm.org/svn/llvm-project/dragonegg/tags/RELEASE_XY + $ svn mkdir https://llvm.org/svn/llvm-project/dragonegg/tags/RELEASE_XYZ $ svn copy https://llvm.org/svn/llvm-project/dragonegg/branches/release_XY \ - https://llvm.org/svn/llvm-project/dragonegg/tags/RELEASE_XY/rc1 + https://llvm.org/svn/llvm-project/dragonegg/tags/RELEASE_XYZ/rc1 - $ svn mkdir https://llvm.org/svn/llvm-project/test-suite/tags/RELEASE_XY + $ svn mkdir https://llvm.org/svn/llvm-project/test-suite/tags/RELEASE_XYZ $ svn copy https://llvm.org/svn/llvm-project/test-suite/branches/release_XY \ - https://llvm.org/svn/llvm-project/test-suite/tags/RELEASE_XY/rc1 + https://llvm.org/svn/llvm-project/test-suite/tags/RELEASE_XYZ/rc1 Similarly, **Release Candidate 2** would be named ``RC2`` and so on. This keeps a permanent copy of the release candidate around for people to export and build -as they wish. The final released sources will be tagged in the ``RELEASE_XY`` +as they wish. The final released sources will be tagged in the ``RELEASE_XYZ`` directory as ``Final`` (c.f. :ref:`tag`). The Release Manager may supply pre-packaged source tarballs for users. 
This can
@@ -172,10 +172,10 @@ be done with the following commands:

::

-  $ svn export https://llvm.org/svn/llvm-project/llvm/tags/RELEASE_XY/rc1 llvm-X.Yrc1
-  $ svn export https://llvm.org/svn/llvm-project/cfe/tags/RELEASE_XY/rc1 clang-X.Yrc1
-  $ svn export https://llvm.org/svn/llvm-project/dragonegg/tags/RELEASE_XY/rc1 dragonegg-X.Yrc1
-  $ svn export https://llvm.org/svn/llvm-project/test-suite/tags/RELEASE_XY/rc1 llvm-test-X.Yrc1
+  $ svn export https://llvm.org/svn/llvm-project/llvm/tags/RELEASE_XYZ/rc1 llvm-X.Yrc1
+  $ svn export https://llvm.org/svn/llvm-project/cfe/tags/RELEASE_XYZ/rc1 clang-X.Yrc1
+  $ svn export https://llvm.org/svn/llvm-project/dragonegg/tags/RELEASE_XYZ/rc1 dragonegg-X.Yrc1
+  $ svn export https://llvm.org/svn/llvm-project/test-suite/tags/RELEASE_XYZ/rc1 llvm-test-X.Yrc1

  $ tar -cvf - llvm-X.Yrc1 | gzip > llvm-X.Yrc1.src.tar.gz
  $ tar -cvf - clang-X.Yrc1 | gzip > clang-X.Yrc1.src.tar.gz
@@ -389,16 +389,16 @@ Tag the final release sources using the following procedure:

::

  $ svn copy https://llvm.org/svn/llvm-project/llvm/branches/release_XY \
-             https://llvm.org/svn/llvm-project/llvm/tags/RELEASE_XY/Final
+             https://llvm.org/svn/llvm-project/llvm/tags/RELEASE_XYZ/Final

  $ svn copy https://llvm.org/svn/llvm-project/cfe/branches/release_XY \
-             https://llvm.org/svn/llvm-project/cfe/tags/RELEASE_XY/Final
+             https://llvm.org/svn/llvm-project/cfe/tags/RELEASE_XYZ/Final

  $ svn copy https://llvm.org/svn/llvm-project/dragonegg/branches/release_XY \
-             https://llvm.org/svn/llvm-project/dragonegg/tags/RELEASE_XY/Final
+             https://llvm.org/svn/llvm-project/dragonegg/tags/RELEASE_XYZ/Final

  $ svn copy https://llvm.org/svn/llvm-project/test-suite/branches/release_XY \
-             https://llvm.org/svn/llvm-project/test-suite/tags/RELEASE_XY/Final
+             https://llvm.org/svn/llvm-project/test-suite/tags/RELEASE_XYZ/Final

Update the LLVM Demo Page
-------------------------
diff --git a/docs/LangRef.rst b/docs/LangRef.rst
index cb94d3967b13..867ef741da17 100644
--- a/docs/LangRef.rst
+++ b/docs/LangRef.rst
@@ -562,6 +562,8 @@ is zero. The address space qualifier must precede any other attributes.
LLVM allows an explicit section to be specified for globals. If the
target supports it, it will emit globals to the section specified.
+Additionally, the global can be placed in a comdat if the target has the necessary
+support.
By default, global initializers are optimized by assuming that global
variables defined within the module are not modified from their
@@ -580,7 +582,7 @@ to over-align the global if the global has an assigned section. In this
case, the extra alignment could be observable: for example, code could
assume that the globals are densely packed in their section and try to
iterate over them as an array, alignment padding would break this
-iteration.
+iteration. The maximum alignment is ``1 << 29``.
Globals can also have a :ref:`DLL storage class <dllstorageclass>`.
@@ -627,8 +629,9 @@ an optional ``unnamed_addr`` attribute, a return type, an optional
:ref:`parameter attribute <paramattrs>` for the return type, a function
name, a (possibly empty) argument list (each with optional :ref:`parameter
attributes <paramattrs>`), optional :ref:`function attributes <fnattrs>`,
-an optional section, an optional alignment, an optional :ref:`garbage
-collector name <gc>`, an optional :ref:`prefix <prefixdata>`, an opening
+an optional section, an optional alignment,
+an optional :ref:`comdat <langref_comdats>`,
+an optional :ref:`garbage collector name <gc>`, an optional :ref:`prefix <prefixdata>`, an opening
curly brace, a list of basic blocks, and a closing curly brace.
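As a small illustrative sketch of the definition syntax just described (the names ``$sym`` and ``@sym`` are invented for this example, not taken from the patch), a definition exercising the optional linkage, ``unnamed_addr``, comdat, and alignment pieces could look like:

.. code-block:: llvm

    $sym = comdat any

    define linkonce_odr i32 @sym() unnamed_addr comdat $sym align 16 {
      ret i32 0
    }

Note that the comdat name matches the function name here, which keeps the sketch consistent with the COFF naming restriction described in the Comdats section below.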
LLVM function declarations consist of the "``declare``" keyword, an
@@ -658,6 +661,7 @@ predecessors, it also cannot have any :ref:`PHI nodes <i_phi>`.
LLVM allows an explicit section to be specified for functions. If the
target supports it, it will emit functions to the section specified.
+Additionally, the function can be placed in a COMDAT.
An explicit alignment may be specified for a function. If not present,
or if the alignment is set to zero, the alignment of the function is set
@@ -673,8 +677,8 @@ Syntax::
    define [linkage] [visibility] [DLLStorageClass]
           [cconv] [ret attrs]
           @<FunctionName> ([argument list])
-          [unnamed_addr] [fn Attrs] [section "name"] [align N]
-          [gc] [prefix Constant] { ... }
+          [unnamed_addr] [fn Attrs] [section "name"] [comdat $<ComdatName>]
+          [align N] [gc] [prefix Constant] { ... }

.. _langref_aliases:
@@ -716,6 +720,89 @@ some can only be checked when producing an object file:
* No global value in the expression can be a declaration, since that
  would require a relocation, which is not possible.

+.. _langref_comdats:
+
+Comdats
+-------
+
+Comdat IR provides access to COFF and ELF object file COMDAT functionality.
+
+Comdats have a name which represents the COMDAT key. All global objects which
+specify this key will only end up in the final object file if the linker chooses
+that key over some other key. Aliases are placed in the same COMDAT that their
+aliasee computes to, if any.
+
+Comdats have a selection kind to provide input on how the linker should
+choose between keys in two different object files.
+
+Syntax::
+
+    $<Name> = comdat SelectionKind
+
+The selection kind must be one of the following:
+
+``any``
+  The linker may choose any COMDAT key, the choice is arbitrary.
+``exactmatch``
+  The linker may choose any COMDAT key but the sections must contain the
+  same data.
+``largest``
+  The linker will choose the section containing the largest COMDAT key.
+``noduplicates``
+  The linker requires that only one section with this COMDAT key exist.
+``samesize``
+  The linker may choose any COMDAT key but the sections must contain the
+  same amount of data.
+
+Note that the Mach-O platform doesn't support COMDATs and ELF only supports
+``any`` as a selection kind.
+
+Here is an example of a COMDAT group where a function will only be selected if
+the COMDAT key's section is the largest:
+
+.. code-block:: llvm
+
+   $foo = comdat largest
+   @foo = global i32 2, comdat $foo
+
+   define void @bar() comdat $foo {
+     ret void
+   }
+
+In a COFF object file, this will create a COMDAT section with selection kind
+``IMAGE_COMDAT_SELECT_LARGEST`` containing the contents of the ``@foo`` symbol
+and another COMDAT section with selection kind
+``IMAGE_COMDAT_SELECT_ASSOCIATIVE`` which is associated with the first COMDAT
+section and contains the contents of the ``@bar`` symbol.
+
+There are some restrictions on the properties of the global object.
+It, or an alias to it, must have the same name as the COMDAT group when
+targeting COFF.
+The contents and size of this object may be used during link-time to determine
+which COMDAT groups get selected depending on the selection kind.
+Because the name of the object must match the name of the COMDAT group, the
+linkage of the global object must not be local; local symbols can get renamed
+if a collision occurs in the symbol table.
+
+The combined use of COMDATs and section attributes may yield surprising results.
+For example:
+
+.. code-block:: llvm
+
+   $foo = comdat any
+   $bar = comdat any
+   @g1 = global i32 42, section "sec", comdat $foo
+   @g2 = global i32 42, section "sec", comdat $bar
+
+From the object file perspective, this requires the creation of two sections
+with the same name. This is necessary because both globals belong to different
+COMDAT groups and COMDATs, at the object file level, are represented by
+sections.
+
+Note that certain IR constructs like global variables and functions may create
+COMDATs in the object file in addition to any which are specified using COMDAT
+IR. This arises, for example, when a global variable has linkonce_odr linkage.
+
.. _namedmetadatastructure:

Named Metadata
@@ -804,7 +891,7 @@ Currently, only the following parameter attributes are defined:
    address of outgoing stack arguments. An ``inalloca`` argument must
    be a pointer to stack memory produced by an ``alloca`` instruction.
    The alloca, or argument allocation, must also be tagged with the
-    inalloca keyword. Only the past argument may have the ``inalloca``
+    inalloca keyword. Only the last argument may have the ``inalloca``
    attribute, and that argument is guaranteed to be passed in memory.

    An argument allocation may be used by a call at most once because
@@ -882,6 +969,17 @@ Currently, only the following parameter attributes are defined:
    passed in is non-null, or the callee must ensure that the returned
    pointer is non-null.

+``dereferenceable(<n>)``
+    This indicates that the parameter or return pointer is dereferenceable. This
+    attribute may only be applied to pointer typed parameters. A pointer that
+    is dereferenceable can be loaded from speculatively without a risk of
+    trapping. The number of bytes known to be dereferenceable must be provided
+    in parentheses. It is legal for the number of bytes to be less than the
+    size of the pointee type. The ``nonnull`` attribute does not imply
+    dereferenceability (consider a pointer to one element past the end of an
+    array), however ``dereferenceable(<n>)`` does imply ``nonnull`` in
+    ``addrspace(0)`` (which is the default address space).
+
.. _gc:

Garbage Collector Names
@@ -2749,11 +2847,12 @@ number representing the maximum relative error, for example:
'``range``' Metadata
^^^^^^^^^^^^^^^^^^^^

-``range`` metadata may be attached only to loads of integer types. It
-expresses the possible ranges the loaded value is in. The ranges are
-represented with a flattened list of integers. The loaded value is known
-to be in the union of the ranges defined by each consecutive pair. Each
-pair has the following properties:
+``range`` metadata may be attached only to ``load``, ``call`` and ``invoke`` of
+integer types. It expresses the possible ranges the loaded value or the value
+returned by the called function at this call site is in. The ranges are
+represented with a flattened list of integers. The loaded value or the value
+returned is known to be in the union of the ranges defined by each consecutive
+pair. Each pair has the following properties:

- The type must match the type loaded by the instruction.
- The pair ``a,b`` represents the range ``[a,b)``.
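A minimal sketch of the ``dereferenceable(<n>)`` attribute documented above (the function and parameter names are made up for illustration): because ``%p`` is known to point at 8 dereferenceable bytes, the load may be speculated, and in the default address space the attribute also implies ``nonnull``:

.. code-block:: llvm

    define i64 @read_counter(i64* dereferenceable(8) %p) {
      ; safe to speculate: %p is non-null and points to at least 8 valid bytes
      %v = load i64* %p, align 8
      ret i64 %v
    }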
@@ -2771,8 +2870,9 @@ Examples: %a = load i8* %x, align 1, !range !0 ; Can only be 0 or 1 %b = load i8* %y, align 1, !range !1 ; Can only be 255 (-1), 0 or 1 - %c = load i8* %z, align 1, !range !2 ; Can only be 0, 1, 3, 4 or 5 - %d = load i8* %z, align 1, !range !3 ; Can only be -2, -1, 3, 4 or 5 + %c = call i8 @foo(), !range !2 ; Can only be 0, 1, 3, 4 or 5 + %d = invoke i8 @bar() to label %cont + unwind label %lpad, !range !3 ; Can only be -2, -1, 3, 4 or 5 ... !0 = metadata !{ i8 0, i8 2 } !1 = metadata !{ i8 255, i8 2 } @@ -2800,17 +2900,121 @@ constructs: !0 = metadata !{ metadata !0 } !1 = metadata !{ metadata !1 } -The loop identifier metadata can be used to specify additional per-loop -metadata. Any operands after the first operand can be treated as user-defined -metadata. For example the ``llvm.vectorizer.unroll`` metadata is understood -by the loop vectorizer to indicate how many times to unroll the loop: +The loop identifier metadata can be used to specify additional +per-loop metadata. Any operands after the first operand can be treated +as user-defined metadata. For example the ``llvm.loop.unroll.count`` +suggests an unroll factor to the loop unroller: .. code-block:: llvm br i1 %exitcond, label %._crit_edge, label %.lr.ph, !llvm.loop !0 ... !0 = metadata !{ metadata !0, metadata !1 } - !1 = metadata !{ metadata !"llvm.vectorizer.unroll", i32 2 } + !1 = metadata !{ metadata !"llvm.loop.unroll.count", i32 4 } + +'``llvm.loop.vectorize``' and '``llvm.loop.interleave``' +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Metadata prefixed with ``llvm.loop.vectorize`` or ``llvm.loop.interleave`` are +used to control per-loop vectorization and interleaving parameters such as +vectorization width and interleave count. These metadata should be used in +conjunction with ``llvm.loop`` loop identification metadata. The +``llvm.loop.vectorize`` and ``llvm.loop.interleave`` metadata are only +optimization hints and the optimizer will only interleave and vectorize loops if +it believes it is safe to do so. The ``llvm.mem.parallel_loop_access`` metadata +which contains information about loop-carried memory dependencies can be helpful +in determining the safety of these transformations. + +'``llvm.loop.interleave.count``' Metadata +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +This metadata suggests an interleave count to the loop interleaver. +The first operand is the string ``llvm.loop.interleave.count`` and the +second operand is an integer specifying the interleave count. For +example: + +.. code-block:: llvm + + !0 = metadata !{ metadata !"llvm.loop.interleave.count", i32 4 } + +Note that setting ``llvm.loop.interleave.count`` to 1 disables interleaving +multiple iterations of the loop. If ``llvm.loop.interleave.count`` is set to 0 +then the interleave count will be determined automatically. + +'``llvm.loop.vectorize.enable``' Metadata +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +This metadata selectively enables or disables vectorization for the loop. The +first operand is the string ``llvm.loop.vectorize.enable`` and the second operand +is a bit. If the bit operand value is 1 vectorization is enabled. A value of +0 disables vectorization: + +.. code-block:: llvm + + !0 = metadata !{ metadata !"llvm.loop.vectorize.enable", i1 0 } + !1 = metadata !{ metadata !"llvm.loop.vectorize.enable", i1 1 } + +'``llvm.loop.vectorize.width``' Metadata +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +This metadata sets the target width of the vectorizer. 
The first +operand is the string ``llvm.loop.vectorize.width`` and the second +operand is an integer specifying the width. For example: + +.. code-block:: llvm + + !0 = metadata !{ metadata !"llvm.loop.vectorize.width", i32 4 } + +Note that setting ``llvm.loop.vectorize.width`` to 1 disables +vectorization of the loop. If ``llvm.loop.vectorize.width`` is set to +0 or if the loop does not have this metadata the width will be +determined automatically. + +'``llvm.loop.unroll``' +^^^^^^^^^^^^^^^^^^^^^^ + +Metadata prefixed with ``llvm.loop.unroll`` are loop unrolling +optimization hints such as the unroll factor. ``llvm.loop.unroll`` +metadata should be used in conjunction with ``llvm.loop`` loop +identification metadata. The ``llvm.loop.unroll`` metadata are only +optimization hints and the unrolling will only be performed if the +optimizer believes it is safe to do so. + +'``llvm.loop.unroll.enable``' Metadata +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +This metadata either disables loop unrolling or suggests that the loop +be unrolled fully. The first operand is the string +``llvm.loop.unroll.enable`` and the second operand is a bit. If the +bit operand value is 0 loop unrolling is disabled. A value of 1 +indicates that the loop should be fully unrolled. For example: + +.. code-block:: llvm + + !0 = metadata !{ metadata !"llvm.loop.unroll.enable", i1 0 } + !1 = metadata !{ metadata !"llvm.loop.unroll.enable", i1 1 } + +'``llvm.loop.unroll.count``' Metadata +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +This metadata suggests an unroll factor to the loop unroller. The +first operand is the string ``llvm.loop.unroll.count`` and the second +operand is a positive integer specifying the unroll factor. For +example: + +.. code-block:: llvm + + !0 = metadata !{ metadata !"llvm.loop.unroll.count", i32 4 } + +If the trip count of the loop is less than the unroll count the loop +will be partially unrolled. + +If a loop has both a ``llvm.loop.unroll.enable`` metadata and +``llvm.loop.unroll.count`` metadata the behavior depends upon the +value of the ``llvm.loop.unroll.enable`` operand. If the value is 0, +the loop will not be unrolled. If the value is 1, the loop will be +unrolled with a factor determined by the ``llvm.loop.unroll.count`` +operand effectively ignoring the ``llvm.loop.unroll.enable`` metadata. '``llvm.mem``' ^^^^^^^^^^^^^^^ @@ -2895,55 +3099,6 @@ the loop identifier metadata node directly: !1 = metadata !{ metadata !1 } ; an identifier for the inner loop !2 = metadata !{ metadata !2 } ; an identifier for the outer loop -'``llvm.vectorizer``' -^^^^^^^^^^^^^^^^^^^^^ - -Metadata prefixed with ``llvm.vectorizer`` is used to control per-loop -vectorization parameters such as vectorization factor and unroll factor. - -``llvm.vectorizer`` metadata should be used in conjunction with ``llvm.loop`` -loop identification metadata. - -'``llvm.vectorizer.unroll``' Metadata -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -This metadata instructs the loop vectorizer to unroll the specified -loop exactly ``N`` times. - -The first operand is the string ``llvm.vectorizer.unroll`` and the second -operand is an integer specifying the unroll factor. For example: - -.. code-block:: llvm - - !0 = metadata !{ metadata !"llvm.vectorizer.unroll", i32 4 } - -Note that setting ``llvm.vectorizer.unroll`` to 1 disables unrolling of the -loop. - -If ``llvm.vectorizer.unroll`` is set to 0 then the amount of unrolling will be -determined automatically. 
- -'``llvm.vectorizer.width``' Metadata -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -This metadata sets the target width of the vectorizer to ``N``. Without -this metadata, the vectorizer will choose a width automatically. -Regardless of this metadata, the vectorizer will only vectorize loops if -it believes it is valid to do so. - -The first operand is the string ``llvm.vectorizer.width`` and the second -operand is an integer specifying the width. For example: - -.. code-block:: llvm - - !0 = metadata !{ metadata !"llvm.vectorizer.width", i32 4 } - -Note that setting ``llvm.vectorizer.width`` to 1 disables vectorization of the -loop. - -If ``llvm.vectorizer.width`` is set to 0 then the width will be determined -automatically. - Module Flags Metadata ===================== @@ -3144,6 +3299,42 @@ Each individual option is required to be either a valid option for the target's linker, or an option that is reserved by the target specific assembly writer or object file emitter. No other aspect of these options is defined by the IR. +C type width Module Flags Metadata +---------------------------------- + +The ARM backend emits a section into each generated object file describing the +options that it was compiled with (in a compiler-independent way) to prevent +linking incompatible objects, and to allow automatic library selection. Some +of these options are not visible at the IR level, namely wchar_t width and enum +width. + +To pass this information to the backend, these options are encoded in module +flags metadata, using the following key-value pairs: + +.. list-table:: + :header-rows: 1 + :widths: 30 70 + + * - Key + - Value + + * - short_wchar + - * 0 --- sizeof(wchar_t) == 4 + * 1 --- sizeof(wchar_t) == 2 + + * - short_enum + - * 0 --- Enums are at least as large as an ``int``. + * 1 --- Enums are stored in the smallest integer type which can + represent all of its values. + +For example, the following metadata section specifies that the module was +compiled with a ``wchar_t`` width of 4 bytes, and the underlying type of an +enum is the smallest type which can represent all of its values:: + + !llvm.module.flags = !{!0, !1} + !0 = metadata !{i32 1, metadata !"short_wchar", i32 1} + !1 = metadata !{i32 1, metadata !"short_enum", i32 0} + .. _intrinsicglobalvariables: Intrinsic Global Variables @@ -4800,9 +4991,10 @@ bytes of memory on the runtime stack, returning a pointer of the appropriate type to the program. If "NumElements" is specified, it is the number of elements allocated, otherwise "NumElements" is defaulted to be one. If a constant alignment is specified, the value result of the -allocation is guaranteed to be aligned to at least that boundary. If not -specified, or if zero, the target can choose to align the allocation on -any convenient boundary compatible with the type. +allocation is guaranteed to be aligned to at least that boundary. The +alignment may not be greater than ``1 << 29``. If not specified, or if +zero, the target can choose to align the allocation on any convenient +boundary compatible with the type. '``type``' may be any sized type. @@ -4876,7 +5068,8 @@ or an omitted ``align`` argument means that the operation has the ABI alignment for the target. It is the responsibility of the code emitter to ensure that the alignment information is correct. Overestimating the alignment results in undefined behavior. Underestimating the alignment -may produce less efficient code. An alignment of 1 is always safe. +may produce less efficient code. 
An alignment of 1 is always safe. The
+maximum possible alignment is ``1 << 29``.

The optional ``!nontemporal`` metadata must reference a single
metadata name ``<index>`` corresponding to a metadata node with one
@@ -4962,7 +5155,7 @@ alignment for the target. It is the responsibility of the code emitter
to ensure that the alignment information is correct. Overestimating
the alignment results in undefined behavior. Underestimating the
alignment may produce less efficient code. An alignment of 1 is always
-safe.
+safe. The maximum possible alignment is ``1 << 29``.

The optional ``!nontemporal`` metadata must reference a single metadata
name ``<index>`` corresponding to a metadata node with one ``i32`` entry of
@@ -6283,7 +6476,7 @@ This instruction requires several arguments:
      uses value of call or is void).
   -  Option ``-tailcallopt`` is enabled, or ``llvm::GuaranteedTailCallOpt``
      is ``true``.
-  -  `Platform specific constraints are
+  -  `Platform-specific constraints are
      met. <CodeGenerator.html#tailcallopt>`_

#. The optional "cconv" marker indicates which :ref:`calling
@@ -8526,14 +8719,14 @@ Syntax:

::

-      declare i16 @llvm.convert.to.fp16(f32 %a)
+      declare i16 @llvm.convert.to.fp16.f32(float %a)
+      declare i16 @llvm.convert.to.fp16.f64(double %a)

Overview:
"""""""""

-The '``llvm.convert.to.fp16``' intrinsic function performs a conversion
-from single precision floating point format to half precision floating
-point format.
+The '``llvm.convert.to.fp16``' intrinsic function performs a conversion from a
+conventional floating point type to half precision floating point format.

Arguments:
""""""""""
@@ -8544,17 +8737,16 @@ converted.

Semantics:
""""""""""

-The '``llvm.convert.to.fp16``' intrinsic function performs a conversion
-from single precision floating point format to half precision floating
-point format. The return value is an ``i16`` which contains the
-converted number.
+The '``llvm.convert.to.fp16``' intrinsic function performs a conversion from a
+conventional floating point format to half precision floating point format. The
+return value is an ``i16`` which contains the converted number.

Examples:
"""""""""

.. code-block:: llvm

-      %res = call i16 @llvm.convert.to.fp16(f32 %a)
+      %res = call i16 @llvm.convert.to.fp16.f32(float %a)
      store i16 %res, i16* @x, align 2

.. _int_convert_from_fp16:
@@ -8567,7 +8759,8 @@ Syntax:

::

-      declare f32 @llvm.convert.from.fp16(i16 %a)
+      declare float @llvm.convert.from.fp16.f32(i16 %a)
+      declare double @llvm.convert.from.fp16.f64(i16 %a)

Overview:
"""""""""
@@ -8596,7 +8789,7 @@ Examples:

.. code-block:: llvm

      %a = load i16* @x, align 2
-      %res = call f32 @llvm.convert.from.fp16(i16 %a)
+      %res = call float @llvm.convert.from.fp16(i16 %a)

Debugger Intrinsics
-------------------
@@ -8717,7 +8910,7 @@ Semantics:
""""""""""

On some architectures the address of the code to be executed needs to be
-different to the address where the trampoline is actually stored. This
+different than the address where the trampoline is actually stored. This
intrinsic returns the executable address corresponding to ``tramp``
after performing the required machine specific adjustments. The pointer
returned can then be :ref:`bitcast and executed <int_trampoline>`.
@@ -8725,7 +8918,7 @@ returned can then be :ref:`bitcast and executed <int_trampoline>`.

Memory Use Markers
------------------

-This class of intrinsics exists to information about the lifetime of
+This class of intrinsics provides information about the lifetime of
memory objects and ranges where variables are immutable.

.. _int_lifestart:
 _int_lifestart:

diff --git a/docs/Lexicon.rst b/docs/Lexicon.rst
index 11f1341f5cbd..65b7a3e84cda 100644
--- a/docs/Lexicon.rst
+++ b/docs/Lexicon.rst
@@ -50,7 +50,7 @@ C
    Common Subexpression Elimination. An optimization that removes common
    subexpression computation. For example ``(a+b)*(a+b)`` has two
    subexpressions that are the same: ``(a+b)``. This optimization would
    perform the addition
-   only once and then perform the multiply (but only if it's compulationally
+   only once and then perform the multiply (but only if it's computationally
    correct/safe).

 D
@@ -159,7 +159,7 @@ R
    ``Constant::replaceUsesOfWithOnConstant()`` implement the replacement of
    one Value with another by iterating over its def/use chain and fixing up
    all of the pointers to point to the new value.  See
-   also `def/use chains `_.
+   also `def/use chains `_.

 **Reassociation**
    Rearranging associative expressions to promote better redundancy
    elimination
diff --git a/docs/Makefile.sphinx b/docs/Makefile.sphinx
index 21f66488b2b7..b12168cfed8c 100644
--- a/docs/Makefile.sphinx
+++ b/docs/Makefile.sphinx
@@ -10,7 +10,7 @@ BUILDDIR      = _build
 # Internal variables.
 PAPEROPT_a4     = -D latex_paper_size=a4
 PAPEROPT_letter = -D latex_paper_size=letter
-ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
+ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) -W .
 # the i18n builder cannot share the environment and doctrees with the others
 I18NSPHINXOPTS  = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
diff --git a/docs/Phabricator.rst b/docs/Phabricator.rst
index 18b2817fc041..8ac9afec6c39 100644
--- a/docs/Phabricator.rst
+++ b/docs/Phabricator.rst
@@ -5,18 +5,29 @@ Code Reviews with Phabricator
 .. contents::
   :local:

-If you prefer to use a web user interface for code reviews,
-you can now submit your patches for Clang and LLVM at
-`LLVM's Phabricator`_.
+If you prefer to use a web user interface for code reviews, you can now submit
+your patches for Clang and LLVM at `LLVM's Phabricator`_ instance.
+
+While Phabricator is a useful tool for some, the relevant -commits mailing list
+is the system of record for all LLVM code review. The mailing list should be
+added as a subscriber on all reviews, and Phabricator users should be prepared
+to respond to free-form comments in mail sent to the commits list.

 Sign up
 -------

+To get started with Phabricator, navigate to `http://reviews.llvm.org`_ and
+click the power icon in the top right. You can register with a GitHub account,
+a Google account, or you can create your own profile.
+
+Make *sure* that the email address registered with Phabricator is subscribed
+to the relevant -commits mailing list. If you are not subscribed to the commits
+list, all mail sent by Phabricator on your behalf will be held for moderation.
+
 Note that if you use your Subversion user name as Phabricator user name,
 Phabricator will automatically connect your submits to your Phabricator user in
 the `Code Repository Browser`_.

-
 Requesting a review via the command line
 ----------------------------------------
@@ -90,6 +101,15 @@ a change from Phabricator.
 Committing a change
 -------------------

+Arcanist can manage the commit transparently. It will retrieve the description,
+reviewers, the ``Differential Revision``, etc. from the review and commit it to
+the repository.
+ +:: + + arc patch D + arc commit --revision D + + When committing an LLVM change that has been reviewed using Phabricator, the convention is for the commit message to end with the line: @@ -113,6 +133,7 @@ Status Please let us know whether you like it and what could be improved! .. _LLVM's Phabricator: http://reviews.llvm.org +.. _`http://reviews.llvm.org`: http://reviews.llvm.org .. _Code Repository Browser: http://reviews.llvm.org/diffusion/ .. _Arcanist Quick Start: http://www.phabricator.com/docs/phabricator/article/Arcanist_Quick_Start.html .. _Arcanist User Guide: http://www.phabricator.com/docs/phabricator/article/Arcanist_User_Guide.html diff --git a/docs/ProgrammersManual.rst b/docs/ProgrammersManual.rst index 4973e5c66719..46ec15f93a32 100644 --- a/docs/ProgrammersManual.rst +++ b/docs/ProgrammersManual.rst @@ -298,7 +298,9 @@ The ``function_ref`` (`doxygen `__) class template represents a reference to a callable object, templated over the type of the callable. This is a good choice for passing a callback to a function, -if you don't need to hold onto the callback after the function returns. +if you don't need to hold onto the callback after the function returns. In this +way, ``function_ref`` is to ``std::function`` as ``StringRef`` is to +``std::string``. ``function_ref`` can be implicitly constructed from any callable object that can be called with arguments of type ``Param1``, @@ -323,17 +325,11 @@ can be called using: return false; }); -Note that a ``function_ref`` object contains pointers to external memory, so -it is not generally safe to store an instance of the class (unless you know -that the external storage will not be freed). -``function_ref`` is small enough that it should always be passed by value. - -``std::function`` -^^^^^^^^^^^^^^^^^ - -You cannot use ``std::function`` within LLVM code, because it is not supported -by all our target toolchains. - +Note that a ``function_ref`` object contains pointers to external memory, so it +is not generally safe to store an instance of the class (unless you know that +the external storage will not be freed). If you need this ability, consider +using ``std::function``. ``function_ref`` is small enough that it should always +be passed by value. .. _DEBUG: @@ -1441,8 +1437,10 @@ order, making it an easy (but somewhat expensive) solution for non-deterministic iteration over maps of pointers. It is implemented by mapping from key to an index in a vector of key,value -pairs. This provides fast lookup and iteration, but has two main drawbacks: The -key is stored twice and it doesn't support removing elements. +pairs. This provides fast lookup and iteration, but has two main drawbacks: +the key is stored twice and removing elements takes linear time. If it is +necessary to remove elements, it's best to remove them in bulk using +``remove_if()``. .. _dss_inteqclasses: @@ -2170,46 +2168,13 @@ compiler, consider compiling LLVM and LLVM-GCC in single-threaded mode, and using the resultant compiler to build a copy of LLVM with multithreading support. -.. _startmultithreaded: - -Entering and Exiting Multithreaded Mode ---------------------------------------- - -In order to properly protect its internal data structures while avoiding -excessive locking overhead in the single-threaded case, the LLVM must intialize -certain data structures necessary to provide guards around its internals. To do -so, the client program must invoke ``llvm_start_multithreaded()`` before making -any concurrent LLVM API calls. 
To subsequently tear down these structures, use -the ``llvm_stop_multithreaded()`` call. You can also use the -``llvm_is_multithreaded()`` call to check the status of multithreaded mode. - -Note that both of these calls must be made *in isolation*. That is to say that -no other LLVM API calls may be executing at any time during the execution of -``llvm_start_multithreaded()`` or ``llvm_stop_multithreaded``. It is the -client's responsibility to enforce this isolation. - -The return value of ``llvm_start_multithreaded()`` indicates the success or -failure of the initialization. Failure typically indicates that your copy of -LLVM was built without multithreading support, typically because GCC atomic -intrinsics were not found in your system compiler. In this case, the LLVM API -will not be safe for concurrent calls. However, it *will* be safe for hosting -threaded applications in the JIT, though :ref:`care must be taken -` to ensure that side exits and the like do not accidentally -result in concurrent LLVM API calls. - .. _shutdown: Ending Execution with ``llvm_shutdown()`` ----------------------------------------- When you are done using the LLVM APIs, you should call ``llvm_shutdown()`` to -deallocate memory used for internal structures. This will also invoke -``llvm_stop_multithreaded()`` if LLVM is operating in multithreaded mode. As -such, ``llvm_shutdown()`` requires the same isolation guarantees as -``llvm_stop_multithreaded()``. - -Note that, if you use scope-based shutdown, you can use the -``llvm_shutdown_obj`` class, which calls ``llvm_shutdown()`` in its destructor. +deallocate memory used for internal structures. .. _managedstatic: @@ -2217,20 +2182,11 @@ Lazy Initialization with ``ManagedStatic`` ------------------------------------------ ``ManagedStatic`` is a utility class in LLVM used to implement static -initialization of static resources, such as the global type tables. Before the -invocation of ``llvm_shutdown()``, it implements a simple lazy initialization -scheme. Once ``llvm_start_multithreaded()`` returns, however, it uses +initialization of static resources, such as the global type tables. In a +single-threaded environment, it implements a simple lazy initialization scheme. +When LLVM is compiled with support for multi-threading, however, it uses double-checked locking to implement thread-safe lazy initialization. -Note that, because no other threads are allowed to issue LLVM API calls before -``llvm_start_multithreaded()`` returns, it is possible to have -``ManagedStatic``\ s of ``llvm::sys::Mutex``\ s. - -The ``llvm_acquire_global_lock()`` and ``llvm_release_global_lock`` APIs provide -access to the global lock used to implement the double-checked locking for lazy -initialization. These should only be used internally to LLVM, and only if you -know what you're doing! - .. _llvmcontext: Achieving Isolation with ``LLVMContext`` diff --git a/docs/ReleaseNotes.rst b/docs/ReleaseNotes.rst index 8dc16813580e..c76b9d946073 100644 --- a/docs/ReleaseNotes.rst +++ b/docs/ReleaseNotes.rst @@ -50,11 +50,21 @@ Non-comprehensive list of changes in this release the ``-no-integrated-as`` option, * llvm-ar now handles IR files like regular object files. In particular, a - regular symbol table is created for symbols defined in IR files. + regular symbol table is created for symbols defined in IR files, including + those in file scope inline assembly. * LLVM now always uses cfi directives for producing most stack unwinding information. 
+* The prefix for loop vectorizer hint metadata has been changed from
+  ``llvm.vectorizer`` to ``llvm.loop.vectorize``. In addition,
+  ``llvm.vectorizer.unroll`` metadata has been renamed
+  ``llvm.loop.interleave.count``.
+
+* Some backends previously implemented Atomic NAND(x,y) as ``x & ~y``. Now
+  all backends implement it as ``~(x & y)``, matching the semantics of GCC 4.4
+  and later.
+
 .. NOTE
    For small 1-3 sentence descriptions, just add an entry at the end of
    this list. If your description won't fit comfortably in one bullet
diff --git a/docs/SourceLevelDebugging.rst b/docs/SourceLevelDebugging.rst
index a4fbd72167f0..869d3a383107 100644
--- a/docs/SourceLevelDebugging.rst
+++ b/docs/SourceLevelDebugging.rst
@@ -570,6 +570,7 @@ Local variables
   metadata,   ;; Reference to the type descriptor
   i32,        ;; flags
   metadata    ;; (optional) Reference to inline location
+  metadata    ;; (optional) Reference to a complex expression (see below)
 }

 These descriptors are used to define variables local to a sub program. The
diff --git a/docs/TableGen/LangIntro.rst b/docs/TableGen/LangIntro.rst
index 3e74dffb00e2..54d88731012b 100644
--- a/docs/TableGen/LangIntro.rst
+++ b/docs/TableGen/LangIntro.rst
@@ -208,6 +208,9 @@ supported include:
     on string, int and bit objects. Use !cast<string> to compare other types of
     objects.

+``!shl(a,b)`` ``!srl(a,b)`` ``!sra(a,b)`` ``!add(a,b)``
+    The usual logical and arithmetic operators.
+
 Note that all of the values have rules specifying how they convert to values
 for different types. These rules allow you to assign a value like "``7``" to a
 "``bits<4>``" value, for example.
diff --git a/docs/Vectorizers.rst b/docs/Vectorizers.rst
index 887ccaa8f815..2b702179bf28 100644
--- a/docs/Vectorizers.rst
+++ b/docs/Vectorizers.rst
@@ -51,6 +51,89 @@ Users can control the unroll factor using the command line flag "-force-vector-unroll"

   $ clang -mllvm -force-vector-unroll=2 ...
   $ opt -loop-vectorize -force-vector-unroll=2 ...

+Pragma loop hint directives
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The ``#pragma clang loop`` directive allows loop vectorization hints to be
+specified for the subsequent for, while, do-while, or C++11 range-based for
+loop. The directive allows vectorization and interleaving to be enabled or
+disabled. Vector width as well as interleave count can also be manually
+specified. The following example explicitly enables vectorization and
+interleaving:
+
+.. code-block:: c++
+
+  #pragma clang loop vectorize(enable) interleave(enable)
+  while(...) {
+    ...
+  }
+
+The following example implicitly enables vectorization and interleaving by
+specifying a vector width and interleaving count:
+
+.. code-block:: c++
+
+  #pragma clang loop vectorize_width(2) interleave_count(2)
+  for(...) {
+    ...
+  }
+
+See the Clang
+`language extensions
+`_
+for details.
+
+Diagnostics
+-----------
+
+Many loops cannot be vectorized, including loops with complicated control flow,
+unvectorizable types, and unvectorizable calls. The loop vectorizer generates
+optimization remarks, which can be queried using command line options, to
+identify and diagnose loops that are skipped by the loop-vectorizer.
+
+Optimization remarks are enabled using:
+
+``-Rpass=loop-vectorize`` identifies loops that were successfully vectorized.
+
+``-Rpass-missed=loop-vectorize`` identifies loops that failed vectorization and
+indicates whether vectorization was specified.
+
+``-Rpass-analysis=loop-vectorize`` identifies the statements that caused
+vectorization to fail.
+
+Consider the following loop:
+
+.. code-block:: c++
+
+  #pragma clang loop vectorize(enable)
+  for (int i = 0; i < Length; i++) {
+    switch(A[i]) {
+    case 0: A[i] = i*2; break;
+    case 1: A[i] = i;   break;
+    default: A[i] = 0;
+    }
+  }
+
+The command line ``-Rpass-missed=loop-vectorize`` prints the remark:
+
+.. code-block:: console
+
+  no_switch.cpp:4:5: remark: loop not vectorized: vectorization is explicitly enabled [-Rpass-missed=loop-vectorize]
+
+And the command line ``-Rpass-analysis=loop-vectorize`` indicates that the
+switch statement cannot be vectorized.
+
+.. code-block:: console
+
+  no_switch.cpp:4:5: remark: loop not vectorized: loop contains a switch statement [-Rpass-analysis=loop-vectorize]
+  switch(A[i]) {
+  ^
+
+To ensure line and column numbers are produced, include the command line
+options ``-gline-tables-only`` and ``-gcolumn-info``. See the Clang `user manual
+`_
+for details.
+
 Features
 --------
diff --git a/examples/BrainF/CMakeLists.txt b/examples/BrainF/CMakeLists.txt
index 025d09336405..65589d9f39f2 100644
--- a/examples/BrainF/CMakeLists.txt
+++ b/examples/BrainF/CMakeLists.txt
@@ -3,6 +3,7 @@ set(LLVM_LINK_COMPONENTS
   Core
   ExecutionEngine
   JIT
+  MC
   Support
   nativecodegen
   )
diff --git a/examples/ExceptionDemo/CMakeLists.txt b/examples/ExceptionDemo/CMakeLists.txt
index 5324acd21eab..a08a7c30bd8a 100644
--- a/examples/ExceptionDemo/CMakeLists.txt
+++ b/examples/ExceptionDemo/CMakeLists.txt
@@ -6,7 +6,9 @@ set(LLVM_LINK_COMPONENTS
   nativecodegen
   )

+# Enable EH and RTTI for this demo
 set(LLVM_REQUIRES_EH 1)
+set(LLVM_REQUIRES_RTTI 1)

 add_llvm_example(ExceptionDemo
   ExceptionDemo.cpp
diff --git a/examples/Fibonacci/CMakeLists.txt b/examples/Fibonacci/CMakeLists.txt
index 724a0f6715d3..c015e50ac350 100644
--- a/examples/Fibonacci/CMakeLists.txt
+++ b/examples/Fibonacci/CMakeLists.txt
@@ -3,6 +3,7 @@ set(LLVM_LINK_COMPONENTS
   ExecutionEngine
   Interpreter
   JIT
+  MC
   Support
   nativecodegen
   )
diff --git a/examples/HowToUseJIT/CMakeLists.txt b/examples/HowToUseJIT/CMakeLists.txt
index 88aed026bf6f..237cbea861d2 100644
--- a/examples/HowToUseJIT/CMakeLists.txt
+++ b/examples/HowToUseJIT/CMakeLists.txt
@@ -3,6 +3,7 @@ set(LLVM_LINK_COMPONENTS
   ExecutionEngine
   Interpreter
   JIT
+  MC
   Support
   nativecodegen
   )
diff --git a/examples/Kaleidoscope/Chapter4/CMakeLists.txt b/examples/Kaleidoscope/Chapter4/CMakeLists.txt
index 72a9f0512cd2..2b87e8684986 100644
--- a/examples/Kaleidoscope/Chapter4/CMakeLists.txt
+++ b/examples/Kaleidoscope/Chapter4/CMakeLists.txt
@@ -4,6 +4,7 @@ set(LLVM_LINK_COMPONENTS
   ExecutionEngine
   InstCombine
   JIT
+  MC
   ScalarOpts
   Support
   nativecodegen
diff --git a/examples/Kaleidoscope/Chapter5/CMakeLists.txt b/examples/Kaleidoscope/Chapter5/CMakeLists.txt
index c7d0276194cf..c3e7c43cb411 100644
--- a/examples/Kaleidoscope/Chapter5/CMakeLists.txt
+++ b/examples/Kaleidoscope/Chapter5/CMakeLists.txt
@@ -4,6 +4,7 @@ set(LLVM_LINK_COMPONENTS
   ExecutionEngine
   InstCombine
   JIT
+  MC
   ScalarOpts
   Support
   nativecodegen
diff --git a/examples/Kaleidoscope/Chapter6/CMakeLists.txt b/examples/Kaleidoscope/Chapter6/CMakeLists.txt
index 669c7eb171b8..cd61cec89d55 100644
--- a/examples/Kaleidoscope/Chapter6/CMakeLists.txt
+++ b/examples/Kaleidoscope/Chapter6/CMakeLists.txt
@@ -4,6 +4,7 @@ set(LLVM_LINK_COMPONENTS
   ExecutionEngine
   InstCombine
   JIT
+  MC
   ScalarOpts
   Support
   nativecodegen
diff --git a/examples/Kaleidoscope/Chapter7/CMakeLists.txt b/examples/Kaleidoscope/Chapter7/CMakeLists.txt
index 0a0c8e7cab58..cdb13c465d14 100644
--- a/examples/Kaleidoscope/Chapter7/CMakeLists.txt
+++ b/examples/Kaleidoscope/Chapter7/CMakeLists.txt
@@ -4,6 
+4,7 @@ set(LLVM_LINK_COMPONENTS ExecutionEngine InstCombine JIT + MC ScalarOpts Support TransformUtils diff --git a/include/llvm-c/Core.h b/include/llvm-c/Core.h index 0e78ed71fa9a..fdff77bc5e51 100644 --- a/include/llvm-c/Core.h +++ b/include/llvm-c/Core.h @@ -168,6 +168,7 @@ typedef enum { LLVMInAllocaAttribute = 1ULL << 36, LLVMNonNullAttribute = 1ULL << 37, LLVMJumpTableAttribute = 1ULL << 38, + LLVMDereferenceableAttribute = 1ULL << 39, */ } LLVMAttribute; @@ -2848,16 +2849,13 @@ void LLVMDisposePassManager(LLVMPassManagerRef PM); * @{ */ -/** Allocate and initialize structures needed to make LLVM safe for - multithreading. The return value indicates whether multithreaded - initialization succeeded. Must be executed in isolation from all - other LLVM api calls. - @see llvm::llvm_start_multithreaded */ +/** Deprecated: Multi-threading can only be enabled/disabled with the compile + time define LLVM_ENABLE_THREADS. This function always returns + LLVMIsMultithreaded(). */ LLVMBool LLVMStartMultithreaded(void); -/** Deallocate structures necessary to make LLVM safe for multithreading. - Must be executed in isolation from all other LLVM api calls. - @see llvm::llvm_stop_multithreaded */ +/** Deprecated: Multi-threading can only be enabled/disabled with the compile + time define LLVM_ENABLE_THREADS. */ void LLVMStopMultithreaded(void); /** Check whether LLVM is executing in thread-safe mode or not. diff --git a/include/llvm-c/Transforms/Scalar.h b/include/llvm-c/Transforms/Scalar.h index 9b820b2334ca..0ca72cec0cd2 100644 --- a/include/llvm-c/Transforms/Scalar.h +++ b/include/llvm-c/Transforms/Scalar.h @@ -44,6 +44,9 @@ void LLVMAddDeadStoreEliminationPass(LLVMPassManagerRef PM); /** See llvm::createScalarizerPass function. */ void LLVMAddScalarizerPass(LLVMPassManagerRef PM); +/** See llvm::createMergedLoadStoreMotionPass function. */ +void LLVMAddMergedLoadStoreMotionPass(LLVMPassManagerRef PM); + /** See llvm::createGVNPass function. */ void LLVMAddGVNPass(LLVMPassManagerRef PM); diff --git a/include/llvm/ADT/ArrayRef.h b/include/llvm/ADT/ArrayRef.h index 1b64fee9a597..0fff505d8d01 100644 --- a/include/llvm/ADT/ArrayRef.h +++ b/include/llvm/ADT/ArrayRef.h @@ -147,6 +147,12 @@ namespace llvm { return ArrayRef(data()+N, M); } + // \brief Drop the last \p N elements of the array. + ArrayRef drop_back(unsigned N = 1) const { + assert(size() >= N && "Dropping more elements than exist"); + return slice(0, size() - N); + } + /// @} /// @name Operator Overloads /// @{ diff --git a/include/llvm/ADT/BitVector.h b/include/llvm/ADT/BitVector.h index da2b3ad7e7c8..34e2284311b3 100644 --- a/include/llvm/ADT/BitVector.h +++ b/include/llvm/ADT/BitVector.h @@ -34,6 +34,7 @@ class BitVector { unsigned Capacity; // Size of allocated memory in BitWord. public: + typedef unsigned size_type; // Encapsulation of a single bit. class reference { friend class BitVector; @@ -111,10 +112,10 @@ class BitVector { bool empty() const { return Size == 0; } /// size - Returns the number of bits in this bitvector. - unsigned size() const { return Size; } + size_type size() const { return Size; } /// count - Returns the number of bits which are set. 
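+  /// Note: the returned size_type is a typedef for unsigned rather than
+  /// size_t, matching the width of the class's internal Size and Capacity
+  /// fields.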
- unsigned count() const { + size_type count() const { unsigned NumBits = 0; for (unsigned i = 0; i < NumBitWords(size()); ++i) if (sizeof(BitWord) == 4) diff --git a/include/llvm/ADT/DenseMap.h b/include/llvm/ADT/DenseMap.h index 826913289e99..85f37b9051b1 100644 --- a/include/llvm/ADT/DenseMap.h +++ b/include/llvm/ADT/DenseMap.h @@ -43,6 +43,7 @@ class DenseMapBase { typedef std::pair BucketT; public: + typedef unsigned size_type; typedef KeyT key_type; typedef ValueT mapped_type; typedef BucketT value_type; @@ -70,7 +71,7 @@ class DenseMapBase { unsigned size() const { return getNumEntries(); } /// Grow the densemap so that it has at least Size buckets. Does not shrink - void resize(size_t Size) { + void resize(size_type Size) { if (Size > getNumBuckets()) grow(Size); } @@ -99,10 +100,10 @@ class DenseMapBase { setNumTombstones(0); } - /// count - Return true if the specified key is in the map. - bool count(const KeyT &Val) const { + /// Return 1 if the specified key is in the map, 0 otherwise. + size_type count(const KeyT &Val) const { const BucketT *TheBucket; - return LookupBucketFor(Val, TheBucket); + return LookupBucketFor(Val, TheBucket) ? 1 : 0; } iterator find(const KeyT &Val) { diff --git a/include/llvm/ADT/DenseSet.h b/include/llvm/ADT/DenseSet.h index 1d8c39c1441b..37a81b0c7ee2 100644 --- a/include/llvm/ADT/DenseSet.h +++ b/include/llvm/ADT/DenseSet.h @@ -29,11 +29,12 @@ class DenseSet { public: typedef ValueT key_type; typedef ValueT value_type; + typedef unsigned size_type; explicit DenseSet(unsigned NumInitBuckets = 0) : TheMap(NumInitBuckets) {} bool empty() const { return TheMap.empty(); } - unsigned size() const { return TheMap.size(); } + size_type size() const { return TheMap.size(); } size_t getMemorySize() const { return TheMap.getMemorySize(); } /// Grow the DenseSet so that it has at least Size buckets. Will not shrink @@ -44,7 +45,8 @@ class DenseSet { TheMap.clear(); } - bool count(const ValueT &V) const { + /// Return 1 if the specified key is in the set, 0 otherwise. 
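+  /// This is a straight forwarding call, so its cost is one probe of the
+  /// underlying DenseMap.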
+ size_type count(const ValueT &V) const { return TheMap.count(V); } diff --git a/include/llvm/ADT/IntrusiveRefCntPtr.h b/include/llvm/ADT/IntrusiveRefCntPtr.h index 128ada0ec835..f9df3781257e 100644 --- a/include/llvm/ADT/IntrusiveRefCntPtr.h +++ b/include/llvm/ADT/IntrusiveRefCntPtr.h @@ -154,13 +154,13 @@ class ThreadSafeRefCountedBase { } template - IntrusiveRefCntPtr(IntrusiveRefCntPtr&& S) : Obj(S.getPtr()) { + IntrusiveRefCntPtr(IntrusiveRefCntPtr&& S) : Obj(S.get()) { S.Obj = 0; } template IntrusiveRefCntPtr(const IntrusiveRefCntPtr& S) - : Obj(S.getPtr()) { + : Obj(S.get()) { retain(); } @@ -175,7 +175,7 @@ class ThreadSafeRefCountedBase { T* operator->() const { return Obj; } - T* getPtr() const { return Obj; } + T* get() const { return Obj; } LLVM_EXPLICIT operator bool() const { return Obj; } @@ -203,42 +203,42 @@ class ThreadSafeRefCountedBase { inline bool operator==(const IntrusiveRefCntPtr& A, const IntrusiveRefCntPtr& B) { - return A.getPtr() == B.getPtr(); + return A.get() == B.get(); } template inline bool operator!=(const IntrusiveRefCntPtr& A, const IntrusiveRefCntPtr& B) { - return A.getPtr() != B.getPtr(); + return A.get() != B.get(); } template inline bool operator==(const IntrusiveRefCntPtr& A, U* B) { - return A.getPtr() == B; + return A.get() == B; } template inline bool operator!=(const IntrusiveRefCntPtr& A, U* B) { - return A.getPtr() != B; + return A.get() != B; } template inline bool operator==(T* A, const IntrusiveRefCntPtr& B) { - return A == B.getPtr(); + return A == B.get(); } template inline bool operator!=(T* A, const IntrusiveRefCntPtr& B) { - return A != B.getPtr(); + return A != B.get(); } template @@ -268,14 +268,14 @@ class ThreadSafeRefCountedBase { template struct simplify_type > { typedef T* SimpleType; static SimpleType getSimplifiedValue(IntrusiveRefCntPtr& Val) { - return Val.getPtr(); + return Val.get(); } }; template struct simplify_type > { typedef /*const*/ T* SimpleType; static SimpleType getSimplifiedValue(const IntrusiveRefCntPtr& Val) { - return Val.getPtr(); + return Val.get(); } }; diff --git a/include/llvm/ADT/MapVector.h b/include/llvm/ADT/MapVector.h index 7fd1570cbf12..4e1fc1527270 100644 --- a/include/llvm/ADT/MapVector.h +++ b/include/llvm/ADT/MapVector.h @@ -29,7 +29,7 @@ template, typename VectorType = std::vector > > class MapVector { - typedef typename VectorType::size_type SizeType; + typedef typename VectorType::size_type size_type; MapType Map; VectorType Vector; @@ -38,7 +38,7 @@ class MapVector { typedef typename VectorType::iterator iterator; typedef typename VectorType::const_iterator const_iterator; - SizeType size() const { + size_type size() const { return Vector.size(); } @@ -100,7 +100,7 @@ class MapVector { return std::make_pair(begin() + I, false); } - unsigned count(const KeyT &Key) const { + size_type count(const KeyT &Key) const { typename MapType::const_iterator Pos = Map.find(Key); return Pos == Map.end()? 0 : 1; } @@ -123,8 +123,59 @@ class MapVector { Map.erase(Pos); Vector.pop_back(); } + + /// \brief Remove the element given by Iterator. + /// + /// Returns an iterator to the element following the one which was removed, + /// which may be end(). + /// + /// \note This is a deceivingly expensive operation (linear time). It's + /// usually better to use \a remove_if() if possible. + typename VectorType::iterator erase(typename VectorType::iterator Iterator) { + Map.erase(Iterator->first); + auto Next = Vector.erase(Iterator); + if (Next == Vector.end()) + return Next; + + // Update indices in the map. 
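+    // Every element stored after the erased one has shifted down by a slot,
+    // so each mapped index past the erase point must be decremented. This
+    // walk over the whole map is what makes single-element erase linear time;
+    // remove_if() below amortizes the fixup over a single pass instead.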
+ size_t Index = Next - Vector.begin(); + for (auto &I : Map) { + assert(I.second != Index && "Index was already erased!"); + if (I.second > Index) + --I.second; + } + return Next; + } + + /// \brief Remove the elements that match the predicate. + /// + /// Erase all elements that match \c Pred in a single pass. Takes linear + /// time. + template void remove_if(Predicate Pred); }; +template +template +void MapVector::remove_if(Function Pred) { + auto O = Vector.begin(); + for (auto I = O, E = Vector.end(); I != E; ++I) { + if (Pred(*I)) { + // Erase from the map. + Map.erase(I->first); + continue; + } + + if (I != O) { + // Move the value and update the index in the map. + *O = std::move(*I); + Map[O->first] = O - Vector.begin(); + } + ++O; + } + // Erase trailing entries in the vector. + Vector.erase(O, Vector.end()); } +} // end namespace llvm + #endif diff --git a/include/llvm/ADT/OwningPtr.h b/include/llvm/ADT/OwningPtr.h deleted file mode 100644 index 5e83358fc071..000000000000 --- a/include/llvm/ADT/OwningPtr.h +++ /dev/null @@ -1,165 +0,0 @@ -//===- llvm/ADT/OwningPtr.h - Smart ptr that owns the pointee ---*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file defines and implements the OwningPtr class. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_ADT_OWNINGPTR_H -#define LLVM_ADT_OWNINGPTR_H - -#include "llvm/Support/Compiler.h" -#include -#include -#include - -namespace llvm { - -/// OwningPtr smart pointer - OwningPtr mimics a built-in pointer except that it -/// guarantees deletion of the object pointed to, either on destruction of the -/// OwningPtr or via an explicit reset(). Once created, ownership of the -/// pointee object can be taken away from OwningPtr by using the take method. -template -class OwningPtr { - OwningPtr(OwningPtr const &) LLVM_DELETED_FUNCTION; - OwningPtr &operator=(OwningPtr const &) LLVM_DELETED_FUNCTION; - T *Ptr; -public: - explicit OwningPtr(T *P = 0) : Ptr(P) {} - - OwningPtr(OwningPtr &&Other) : Ptr(Other.take()) {} - - OwningPtr &operator=(OwningPtr &&Other) { - reset(Other.take()); - return *this; - } - - OwningPtr(std::unique_ptr Other) : Ptr(Other.release()) {} - - OwningPtr &operator=(std::unique_ptr Other) { - reset(Other.release()); - return *this; - } - -#if LLVM_HAS_RVALUE_REFERENCE_THIS - operator std::unique_ptr() && { return std::unique_ptr(take()); } -#endif - - ~OwningPtr() { - delete Ptr; - } - - /// reset - Change the current pointee to the specified pointer. Note that - /// calling this with any pointer (including a null pointer) deletes the - /// current pointer. - void reset(T *P = 0) { - if (P == Ptr) return; - T *Tmp = Ptr; - Ptr = P; - delete Tmp; - } - - /// take - Reset the owning pointer to null and return its pointer. This does - /// not delete the pointer before returning it. 
- T *take() { - T *Tmp = Ptr; - Ptr = nullptr; - return Tmp; - } - - T *release() { return take(); } - - std::unique_ptr take_unique() { return std::unique_ptr(take()); } - - T &operator*() const { - assert(Ptr && "Cannot dereference null pointer"); - return *Ptr; - } - - T *operator->() const { return Ptr; } - T *get() const { return Ptr; } - LLVM_EXPLICIT operator bool() const { return Ptr != nullptr; } - bool operator!() const { return Ptr == nullptr; } - bool isValid() const { return Ptr != nullptr; } - - void swap(OwningPtr &RHS) { - T *Tmp = RHS.Ptr; - RHS.Ptr = Ptr; - Ptr = Tmp; - } -}; - -template -inline void swap(OwningPtr &a, OwningPtr &b) { - a.swap(b); -} - -/// OwningArrayPtr smart pointer - OwningArrayPtr provides the same -/// functionality as OwningPtr, except that it works for array types. -template -class OwningArrayPtr { - OwningArrayPtr(OwningArrayPtr const &) LLVM_DELETED_FUNCTION; - OwningArrayPtr &operator=(OwningArrayPtr const &) LLVM_DELETED_FUNCTION; - T *Ptr; -public: - explicit OwningArrayPtr(T *P = 0) : Ptr(P) {} - - OwningArrayPtr(OwningArrayPtr &&Other) : Ptr(Other.take()) {} - - OwningArrayPtr &operator=(OwningArrayPtr &&Other) { - reset(Other.take()); - return *this; - } - - ~OwningArrayPtr() { - delete [] Ptr; - } - - /// reset - Change the current pointee to the specified pointer. Note that - /// calling this with any pointer (including a null pointer) deletes the - /// current pointer. - void reset(T *P = 0) { - if (P == Ptr) return; - T *Tmp = Ptr; - Ptr = P; - delete [] Tmp; - } - - /// take - Reset the owning pointer to null and return its pointer. This does - /// not delete the pointer before returning it. - T *take() { - T *Tmp = Ptr; - Ptr = 0; - return Tmp; - } - - T &operator[](std::ptrdiff_t i) const { - assert(Ptr && "Cannot dereference null pointer"); - return Ptr[i]; - } - - T *get() const { return Ptr; } - LLVM_EXPLICIT operator bool() const { return Ptr != 0; } - bool operator!() const { return Ptr == nullptr; } - - void swap(OwningArrayPtr &RHS) { - T *Tmp = RHS.Ptr; - RHS.Ptr = Ptr; - Ptr = Tmp; - } -}; - -template -inline void swap(OwningArrayPtr &a, OwningArrayPtr &b) { - a.swap(b); -} - -} // end namespace llvm - -#endif diff --git a/include/llvm/ADT/ScopedHashTable.h b/include/llvm/ADT/ScopedHashTable.h index 3cc7738df8aa..02a6ea345834 100644 --- a/include/llvm/ADT/ScopedHashTable.h +++ b/include/llvm/ADT/ScopedHashTable.h @@ -148,6 +148,7 @@ class ScopedHashTable { /// ScopeTy - This is a helpful typedef that allows clients to get easy access /// to the name of the scope for this hash table. typedef ScopedHashTableScope ScopeTy; + typedef unsigned size_type; private: typedef ScopedHashTableVal ValTy; DenseMap TopLevelMap; @@ -170,7 +171,8 @@ class ScopedHashTable { AllocatorTy &getAllocator() { return Allocator; } const AllocatorTy &getAllocator() const { return Allocator; } - bool count(const K &Key) const { + /// Return 1 if the specified key is in the table, 0 otherwise. + size_type count(const K &Key) const { return TopLevelMap.count(Key); } diff --git a/include/llvm/ADT/SmallBitVector.h b/include/llvm/ADT/SmallBitVector.h index e965bc464326..0922017ea61a 100644 --- a/include/llvm/ADT/SmallBitVector.h +++ b/include/llvm/ADT/SmallBitVector.h @@ -54,6 +54,7 @@ class SmallBitVector { }; public: + typedef unsigned size_type; // Encapsulation of a single bit. class reference { SmallBitVector &TheVector; @@ -173,7 +174,7 @@ class SmallBitVector { } /// count - Returns the number of bits which are set. 
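+  /// In small mode the bits live inline in a single uintptr_t, so this is
+  /// one population count; in large mode it falls back to the heap-allocated
+  /// BitVector's count().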
- unsigned count() const { + size_type count() const { if (isSmall()) { uintptr_t Bits = getSmallBits(); if (NumBaseBits == 32) diff --git a/include/llvm/ADT/SmallPtrSet.h b/include/llvm/ADT/SmallPtrSet.h index 67104f3cae50..74f3fd43cec4 100644 --- a/include/llvm/ADT/SmallPtrSet.h +++ b/include/llvm/ADT/SmallPtrSet.h @@ -73,8 +73,9 @@ class SmallPtrSetImplBase { ~SmallPtrSetImplBase(); public: + typedef unsigned size_type; bool LLVM_ATTRIBUTE_UNUSED_RESULT empty() const { return size() == 0; } - unsigned size() const { return NumElements; } + size_type size() const { return NumElements; } void clear() { // If the capacity of the array is huge, and the # elements used is small, @@ -263,7 +264,7 @@ class SmallPtrSetImpl : public SmallPtrSetImplBase { } /// count - Return 1 if the specified pointer is in the set, 0 otherwise. - unsigned count(PtrType Ptr) const { + size_type count(PtrType Ptr) const { return count_imp(PtrTraits::getAsVoidPointer(Ptr)) ? 1 : 0; } diff --git a/include/llvm/ADT/SmallSet.h b/include/llvm/ADT/SmallSet.h index 6f36234cb4dd..bb1971eb7c5d 100644 --- a/include/llvm/ADT/SmallSet.h +++ b/include/llvm/ADT/SmallSet.h @@ -37,18 +37,19 @@ class SmallSet { typedef typename SmallVector::const_iterator VIterator; typedef typename SmallVector::iterator mutable_iterator; public: + typedef size_t size_type; SmallSet() {} bool LLVM_ATTRIBUTE_UNUSED_RESULT empty() const { return Vector.empty() && Set.empty(); } - unsigned size() const { + size_type size() const { return isSmall() ? Vector.size() : Set.size(); } /// count - Return 1 if the element is in the set, 0 otherwise. - unsigned count(const T &V) const { + size_type count(const T &V) const { if (isSmall()) { // Since the collection is small, just do a linear search. return vfind(V) == Vector.end() ? 0 : 1; diff --git a/include/llvm/ADT/SparseBitVector.h b/include/llvm/ADT/SparseBitVector.h index 706f24862264..36754d682355 100644 --- a/include/llvm/ADT/SparseBitVector.h +++ b/include/llvm/ADT/SparseBitVector.h @@ -45,6 +45,7 @@ struct SparseBitVectorElement : public ilist_node > { public: typedef unsigned long BitWord; + typedef unsigned size_type; enum { BITWORD_SIZE = sizeof(BitWord) * CHAR_BIT, BITWORDS_PER_ELEMENT = (ElementSize + BITWORD_SIZE - 1) / BITWORD_SIZE, @@ -120,7 +121,7 @@ struct SparseBitVectorElement return Bits[Idx / BITWORD_SIZE] & (1L << (Idx % BITWORD_SIZE)); } - unsigned count() const { + size_type count() const { unsigned NumBits = 0; for (unsigned i = 0; i < BITWORDS_PER_ELEMENT; ++i) if (sizeof(BitWord) == 4) diff --git a/include/llvm/ADT/SparseMultiSet.h b/include/llvm/ADT/SparseMultiSet.h index d2b2f8d9b6bd..dc1273eb7ff6 100644 --- a/include/llvm/ADT/SparseMultiSet.h +++ b/include/llvm/ADT/SparseMultiSet.h @@ -185,6 +185,7 @@ class SparseMultiSet { typedef const ValueT &const_reference; typedef ValueT *pointer; typedef const ValueT *const_pointer; + typedef unsigned size_type; SparseMultiSet() : Sparse(nullptr), Universe(0), FreelistIdx(SMSNode::INVALID), NumFree(0) {} @@ -327,7 +328,7 @@ class SparseMultiSet { /// This is not the same as BitVector::size() which returns the size of the /// universe. /// - unsigned size() const { + size_type size() const { assert(NumFree <= Dense.size() && "Out-of-bounds free entries"); return Dense.size() - NumFree; } @@ -378,7 +379,7 @@ class SparseMultiSet { /// Returns the number of elements identified by Key. This will be linear in /// the number of elements of that key. 
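+  /// For example, if Key has been inserted three times, count(Key) walks all
+  /// three nodes before returning 3. Prefer find(Key) != end() when only
+  /// existence matters.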
- unsigned count(const KeyT &Key) const { + size_type count(const KeyT &Key) const { unsigned Ret = 0; for (const_iterator It = find(Key); It != end(); ++It) ++Ret; diff --git a/include/llvm/ADT/SparseSet.h b/include/llvm/ADT/SparseSet.h index 899f2e4da03c..632d52ad9d82 100644 --- a/include/llvm/ADT/SparseSet.h +++ b/include/llvm/ADT/SparseSet.h @@ -124,6 +124,7 @@ class SparseSet { typedef typename KeyFunctorT::argument_type KeyT; typedef SmallVector DenseT; + typedef unsigned size_type; DenseT Dense; SparseT *Sparse; unsigned Universe; @@ -186,7 +187,7 @@ class SparseSet { /// This is not the same as BitVector::size() which returns the size of the /// universe. /// - unsigned size() const { return Dense.size(); } + size_type size() const { return Dense.size(); } /// clear - Clears the set. This is a very fast constant time operation. /// @@ -231,7 +232,7 @@ class SparseSet { /// count - Returns 1 if this set contains an element identified by Key, /// 0 otherwise. /// - unsigned count(const KeyT &Key) const { + size_type count(const KeyT &Key) const { return find(Key) == end() ? 0 : 1; } diff --git a/include/llvm/ADT/StringExtras.h b/include/llvm/ADT/StringExtras.h index a152f4d3c2ad..0992f5d4a549 100644 --- a/include/llvm/ADT/StringExtras.h +++ b/include/llvm/ADT/StringExtras.h @@ -53,7 +53,7 @@ static inline unsigned hexDigitValue(char C) { /// This should only be used with unsigned types. /// template -static inline char *utohex_buffer(IntTy X, char *BufferEnd) { +static inline char *utohex_buffer(IntTy X, char *BufferEnd, bool LowerCase = false) { char *BufPtr = BufferEnd; *--BufPtr = 0; // Null terminate buffer. if (X == 0) { @@ -63,15 +63,15 @@ static inline char *utohex_buffer(IntTy X, char *BufferEnd) { while (X) { unsigned char Mod = static_cast(X) & 15; - *--BufPtr = hexdigit(Mod); + *--BufPtr = hexdigit(Mod, LowerCase); X >>= 4; } return BufPtr; } -static inline std::string utohexstr(uint64_t X) { +static inline std::string utohexstr(uint64_t X, bool LowerCase = false) { char Buffer[17]; - return utohex_buffer(X, Buffer+17); + return utohex_buffer(X, Buffer+17, LowerCase); } static inline std::string utostr_32(uint32_t X, bool isNeg = false) { diff --git a/include/llvm/ADT/StringMap.h b/include/llvm/ADT/StringMap.h index 5b1868187986..c40e5e2b3d87 100644 --- a/include/llvm/ADT/StringMap.h +++ b/include/llvm/ADT/StringMap.h @@ -64,7 +64,7 @@ class StringMapImpl { } StringMapImpl(unsigned InitSize, unsigned ItemSize); - void RehashTable(); + unsigned RehashTable(unsigned BucketNo = 0); /// LookupBucketFor - Look up the bucket that the specified string should end /// up in. If it already exists as a key in the map, the Item pointer for the @@ -323,6 +323,28 @@ class StringMap : public StringMapImpl { return true; } + /// insert - Inserts the specified key/value pair into the map if the key + /// isn't already in the map. The bool component of the returned pair is true + /// if and only if the insertion takes place, and the iterator component of + /// the pair points to the element with key equivalent to the key of the pair. + std::pair insert(std::pair KV) { + unsigned BucketNo = LookupBucketFor(KV.first); + StringMapEntryBase *&Bucket = TheTable[BucketNo]; + if (Bucket && Bucket != getTombstoneVal()) + return std::make_pair(iterator(TheTable + BucketNo, false), + false); // Already exists in map. 
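+
+    // Below: claim the bucket (recycling a tombstone left by a prior erase if
+    // there is one), create the entry in the map's allocator, and then rehash
+    // if the table has become too dense. RehashTable() returns the entry's
+    // possibly-relocated bucket so the returned iterator stays valid.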
+ + if (Bucket == getTombstoneVal()) + --NumTombstones; + Bucket = + MapEntryTy::Create(KV.first, Allocator, std::move(KV.second)); + ++NumItems; + assert(NumItems + NumTombstones <= NumBuckets); + + BucketNo = RehashTable(BucketNo); + return std::make_pair(iterator(TheTable + BucketNo, false), true); + } + // clear - Empties out the StringMap void clear() { if (empty()) return; @@ -346,24 +368,7 @@ class StringMap : public StringMapImpl { /// return. template MapEntryTy &GetOrCreateValue(StringRef Key, InitTy Val) { - unsigned BucketNo = LookupBucketFor(Key); - StringMapEntryBase *&Bucket = TheTable[BucketNo]; - if (Bucket && Bucket != getTombstoneVal()) - return *static_cast(Bucket); - - MapEntryTy *NewItem = MapEntryTy::Create(Key, Allocator, std::move(Val)); - - if (Bucket == getTombstoneVal()) - --NumTombstones; - ++NumItems; - assert(NumItems + NumTombstones <= NumBuckets); - - // Fill in the bucket for the hash table. The FullHashValue was already - // filled in by LookupBucketFor. - Bucket = NewItem; - - RehashTable(); - return *NewItem; + return *insert(std::make_pair(Key, std::move(Val))).first; } MapEntryTy &GetOrCreateValue(StringRef Key) { diff --git a/include/llvm/ADT/Triple.h b/include/llvm/ADT/Triple.h index a10bc734da2c..b96f11435520 100644 --- a/include/llvm/ADT/Triple.h +++ b/include/llvm/ADT/Triple.h @@ -76,7 +76,23 @@ class Triple { le32, // le32: generic little-endian 32-bit CPU (PNaCl / Emscripten) amdil, // amdil: amd IL spir, // SPIR: standard portable IR for OpenCL 32-bit version - spir64 // SPIR: standard portable IR for OpenCL 64-bit version + spir64, // SPIR: standard portable IR for OpenCL 64-bit version + kalimba // Kalimba: generic kalimba + }; + enum SubArchType { + NoSubArch, + + ARMSubArch_v8, + ARMSubArch_v7, + ARMSubArch_v7em, + ARMSubArch_v7m, + ARMSubArch_v7s, + ARMSubArch_v6, + ARMSubArch_v6m, + ARMSubArch_v6t2, + ARMSubArch_v5, + ARMSubArch_v5te, + ARMSubArch_v4t }; enum VendorType { UnknownVendor, @@ -88,7 +104,10 @@ class Triple { BGQ, Freescale, IBM, - NVIDIA + ImaginationTechnologies, + MipsTechnologies, + NVIDIA, + CSR }; enum OSType { UnknownOS, @@ -148,6 +167,9 @@ class Triple { /// The parsed arch type. ArchType Arch; + /// The parsed subarchitecture type. + SubArchType SubArch; + /// The parsed vendor type. VendorType Vendor; @@ -190,6 +212,9 @@ class Triple { /// getArch - Get the parsed architecture type of this triple. ArchType getArch() const { return Arch; } + /// getSubArch - get the parsed subarchitecture type for this triple. + SubArchType getSubArch() const { return SubArch; } + /// getVendor - Get the parsed vendor type of this triple. VendorType getVendor() const { return Vendor; } @@ -474,6 +499,12 @@ class Triple { /// architecture if no such variant can be found. llvm::Triple get64BitArchVariant() const; + /// Get the (LLVM) name of the minimum ARM CPU for the arch we are targeting. + /// + /// \param Arch the architecture name (e.g., "armv7s"). If it is an empty + /// string then the triple's arch name is used. + const char* getARMCPUForArch(StringRef Arch = StringRef()) const; + /// @} /// @name Static helpers for IDs. /// @{ diff --git a/include/llvm/ADT/UniqueVector.h b/include/llvm/ADT/UniqueVector.h index 2d02d1ce166f..a9cb2f5709eb 100644 --- a/include/llvm/ADT/UniqueVector.h +++ b/include/llvm/ADT/UniqueVector.h @@ -22,13 +22,18 @@ namespace llvm { /// class should have an implementation of operator== and of operator<. /// Entries can be fetched using operator[] with the entry ID. 
template <class T> class UniqueVector {
+public:
+  typedef typename std::vector<T> VectorType;
+  typedef typename VectorType::iterator iterator;
+  typedef typename VectorType::const_iterator const_iterator;
+
 private:
   // Map - Used to handle the correspondence of entry to ID.
   std::map<T, unsigned> Map;

   // Vector - ID ordered vector of entries. Entries can be indexed by ID - 1.
   //
-  std::vector<T> Vector;
+  VectorType Vector;

 public:
   /// insert - Append entry to the vector if it doesn't already exist.  Returns
@@ -68,6 +73,18 @@ template <class T> class UniqueVector {
     return Vector[ID - 1];
   }

+  /// \brief Return an iterator to the start of the vector.
+  iterator begin() { return Vector.begin(); }
+
+  /// \brief Return an iterator to the start of the vector.
+  const_iterator begin() const { return Vector.begin(); }
+
+  /// \brief Return an iterator to the end of the vector.
+  iterator end() { return Vector.end(); }
+
+  /// \brief Return an iterator to the end of the vector.
+  const_iterator end() const { return Vector.end(); }
+
   /// size - Returns the number of entries in the vector.
   ///
   size_t size() const { return Vector.size(); }
diff --git a/include/llvm/Analysis/AliasAnalysis.h b/include/llvm/Analysis/AliasAnalysis.h
index 885286604ccb..689766446445 100644
--- a/include/llvm/Analysis/AliasAnalysis.h
+++ b/include/llvm/Analysis/AliasAnalysis.h
@@ -274,6 +274,14 @@ class AliasAnalysis {
     UnknownModRefBehavior = Anywhere | ModRef
   };

+  /// Get the location associated with a pointer argument of a callsite.
+  /// The mask bits are set to indicate the allowed aliasing ModRef kinds.
+  /// Note that these mask bits do not necessarily account for the overall
+  /// behavior of the function, but rather only provide additional
+  /// per-argument information.
+  virtual Location getArgLocation(ImmutableCallSite CS, unsigned ArgIdx,
+                                  ModRefResult &Mask);
+
   /// getModRefBehavior - Return the behavior when calling the given call site.
   virtual ModRefBehavior getModRefBehavior(ImmutableCallSite CS);
@@ -597,6 +605,13 @@ bool isNoAliasArgument(const Value *V);
 ///
 bool isIdentifiedObject(const Value *V);

+/// isIdentifiedFunctionLocal - Return true if V is unambiguously identified
+/// at the function level. Different IdentifiedFunctionLocals can't alias.
+/// Further, an IdentifiedFunctionLocal cannot alias with any function
+/// arguments other than itself, which is not necessarily true for
+/// IdentifiedObjects.
+bool isIdentifiedFunctionLocal(const Value *V);
+
 } // End llvm namespace

 #endif
diff --git a/include/llvm/Analysis/BlockFrequencyInfoImpl.h b/include/llvm/Analysis/BlockFrequencyInfoImpl.h
index bd72d3ed6d4d..bb256c7bbcc8 100644
--- a/include/llvm/Analysis/BlockFrequencyInfoImpl.h
+++ b/include/llvm/Analysis/BlockFrequencyInfoImpl.h
@@ -22,6 +22,7 @@
 #include "llvm/Support/BlockFrequency.h"
 #include "llvm/Support/BranchProbability.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/ScaledNumber.h"
 #include "llvm/Support/raw_ostream.h"
 #include
 #include
@@ -30,684 +31,25 @@

 #define DEBUG_TYPE "block-freq"

-//===----------------------------------------------------------------------===//
-//
-// UnsignedFloat definition.
-//
-// TODO: Make this private to BlockFrequencyInfoImpl or delete.
-// -//===----------------------------------------------------------------------===// namespace llvm { -class UnsignedFloatBase { -public: - static const int32_t MaxExponent = 16383; - static const int32_t MinExponent = -16382; - static const int DefaultPrecision = 10; - - static void dump(uint64_t D, int16_t E, int Width); - static raw_ostream &print(raw_ostream &OS, uint64_t D, int16_t E, int Width, - unsigned Precision); - static std::string toString(uint64_t D, int16_t E, int Width, - unsigned Precision); - static int countLeadingZeros32(uint32_t N) { return countLeadingZeros(N); } - static int countLeadingZeros64(uint64_t N) { return countLeadingZeros(N); } - static uint64_t getHalf(uint64_t N) { return (N >> 1) + (N & 1); } - - static std::pair splitSigned(int64_t N) { - if (N >= 0) - return std::make_pair(N, false); - uint64_t Unsigned = N == INT64_MIN ? UINT64_C(1) << 63 : uint64_t(-N); - return std::make_pair(Unsigned, true); - } - static int64_t joinSigned(uint64_t U, bool IsNeg) { - if (U > uint64_t(INT64_MAX)) - return IsNeg ? INT64_MIN : INT64_MAX; - return IsNeg ? -int64_t(U) : int64_t(U); - } - - static int32_t extractLg(const std::pair &Lg) { - return Lg.first; - } - static int32_t extractLgFloor(const std::pair &Lg) { - return Lg.first - (Lg.second > 0); - } - static int32_t extractLgCeiling(const std::pair &Lg) { - return Lg.first + (Lg.second < 0); - } - - static std::pair divide64(uint64_t L, uint64_t R); - static std::pair multiply64(uint64_t L, uint64_t R); - - static int compare(uint64_t L, uint64_t R, int Shift) { - assert(Shift >= 0); - assert(Shift < 64); - - uint64_t L_adjusted = L >> Shift; - if (L_adjusted < R) - return -1; - if (L_adjusted > R) - return 1; - - return L > L_adjusted << Shift ? 1 : 0; - } -}; - -/// \brief Simple representation of an unsigned floating point. -/// -/// UnsignedFloat is a unsigned floating point number. It uses simple -/// saturation arithmetic, and every operation is well-defined for every value. -/// -/// The number is split into a signed exponent and unsigned digits. The number -/// represented is \c getDigits()*2^getExponent(). In this way, the digits are -/// much like the mantissa in the x87 long double, but there is no canonical -/// form, so the same number can be represented by many bit representations -/// (it's always in "denormal" mode). -/// -/// UnsignedFloat is templated on the underlying integer type for digits, which -/// is expected to be one of uint64_t, uint32_t, uint16_t or uint8_t. -/// -/// Unlike builtin floating point types, UnsignedFloat is portable. -/// -/// Unlike APFloat, UnsignedFloat does not model architecture floating point -/// behaviour (this should make it a little faster), and implements most -/// operators (this makes it usable). -/// -/// UnsignedFloat is totally ordered. However, there is no canonical form, so -/// there are multiple representations of most scalars. E.g.: -/// -/// UnsignedFloat(8u, 0) == UnsignedFloat(4u, 1) -/// UnsignedFloat(4u, 1) == UnsignedFloat(2u, 2) -/// UnsignedFloat(2u, 2) == UnsignedFloat(1u, 3) -/// -/// UnsignedFloat implements most arithmetic operations. Precision is kept -/// where possible. Uses simple saturation arithmetic, so that operations -/// saturate to 0.0 or getLargest() rather than under or overflowing. It has -/// some extra arithmetic for unit inversion. 0.0/0.0 is defined to be 0.0. -/// Any other division by 0.0 is defined to be getLargest(). 
-/// -/// As a convenience for modifying the exponent, left and right shifting are -/// both implemented, and both interpret negative shifts as positive shifts in -/// the opposite direction. -/// -/// Exponents are limited to the range accepted by x87 long double. This makes -/// it trivial to add functionality to convert to APFloat (this is already -/// relied on for the implementation of printing). -/// -/// The current plan is to gut this and make the necessary parts of it (even -/// more) private to BlockFrequencyInfo. -template class UnsignedFloat : UnsignedFloatBase { -public: - static_assert(!std::numeric_limits::is_signed, - "only unsigned floats supported"); - - typedef DigitsT DigitsType; - -private: - typedef std::numeric_limits DigitsLimits; - - static const int Width = sizeof(DigitsType) * 8; - static_assert(Width <= 64, "invalid integer width for digits"); - -private: - DigitsType Digits; - int16_t Exponent; - -public: - UnsignedFloat() : Digits(0), Exponent(0) {} - - UnsignedFloat(DigitsType Digits, int16_t Exponent) - : Digits(Digits), Exponent(Exponent) {} - -private: - UnsignedFloat(const std::pair &X) - : Digits(X.first), Exponent(X.second) {} - -public: - static UnsignedFloat getZero() { return UnsignedFloat(0, 0); } - static UnsignedFloat getOne() { return UnsignedFloat(1, 0); } - static UnsignedFloat getLargest() { - return UnsignedFloat(DigitsLimits::max(), MaxExponent); - } - static UnsignedFloat getFloat(uint64_t N) { return adjustToWidth(N, 0); } - static UnsignedFloat getInverseFloat(uint64_t N) { - return getFloat(N).invert(); - } - static UnsignedFloat getFraction(DigitsType N, DigitsType D) { - return getQuotient(N, D); - } - - int16_t getExponent() const { return Exponent; } - DigitsType getDigits() const { return Digits; } - - /// \brief Convert to the given integer type. - /// - /// Convert to \c IntT using simple saturating arithmetic, truncating if - /// necessary. - template IntT toInt() const; - - bool isZero() const { return !Digits; } - bool isLargest() const { return *this == getLargest(); } - bool isOne() const { - if (Exponent > 0 || Exponent <= -Width) - return false; - return Digits == DigitsType(1) << -Exponent; - } - - /// \brief The log base 2, rounded. - /// - /// Get the lg of the scalar. lg 0 is defined to be INT32_MIN. - int32_t lg() const { return extractLg(lgImpl()); } - - /// \brief The log base 2, rounded towards INT32_MIN. - /// - /// Get the lg floor. lg 0 is defined to be INT32_MIN. - int32_t lgFloor() const { return extractLgFloor(lgImpl()); } - - /// \brief The log base 2, rounded towards INT32_MAX. - /// - /// Get the lg ceiling. lg 0 is defined to be INT32_MIN. - int32_t lgCeiling() const { return extractLgCeiling(lgImpl()); } - - bool operator==(const UnsignedFloat &X) const { return compare(X) == 0; } - bool operator<(const UnsignedFloat &X) const { return compare(X) < 0; } - bool operator!=(const UnsignedFloat &X) const { return compare(X) != 0; } - bool operator>(const UnsignedFloat &X) const { return compare(X) > 0; } - bool operator<=(const UnsignedFloat &X) const { return compare(X) <= 0; } - bool operator>=(const UnsignedFloat &X) const { return compare(X) >= 0; } - - bool operator!() const { return isZero(); } - - /// \brief Convert to a decimal representation in a string. - /// - /// Convert to a string. Uses scientific notation for very large/small - /// numbers. Scientific notation is used roughly for numbers outside of the - /// range 2^-64 through 2^64. 
- /// - /// \c Precision indicates the number of decimal digits of precision to use; - /// 0 requests the maximum available. - /// - /// As a special case to make debugging easier, if the number is small enough - /// to convert without scientific notation and has more than \c Precision - /// digits before the decimal place, it's printed accurately to the first - /// digit past zero. E.g., assuming 10 digits of precision: - /// - /// 98765432198.7654... => 98765432198.8 - /// 8765432198.7654... => 8765432198.8 - /// 765432198.7654... => 765432198.8 - /// 65432198.7654... => 65432198.77 - /// 5432198.7654... => 5432198.765 - std::string toString(unsigned Precision = DefaultPrecision) { - return UnsignedFloatBase::toString(Digits, Exponent, Width, Precision); - } - - /// \brief Print a decimal representation. - /// - /// Print a string. See toString for documentation. - raw_ostream &print(raw_ostream &OS, - unsigned Precision = DefaultPrecision) const { - return UnsignedFloatBase::print(OS, Digits, Exponent, Width, Precision); - } - void dump() const { return UnsignedFloatBase::dump(Digits, Exponent, Width); } - - UnsignedFloat &operator+=(const UnsignedFloat &X); - UnsignedFloat &operator-=(const UnsignedFloat &X); - UnsignedFloat &operator*=(const UnsignedFloat &X); - UnsignedFloat &operator/=(const UnsignedFloat &X); - UnsignedFloat &operator<<=(int16_t Shift) { shiftLeft(Shift); return *this; } - UnsignedFloat &operator>>=(int16_t Shift) { shiftRight(Shift); return *this; } - -private: - void shiftLeft(int32_t Shift); - void shiftRight(int32_t Shift); - - /// \brief Adjust two floats to have matching exponents. - /// - /// Adjust \c this and \c X to have matching exponents. Returns the new \c X - /// by value. Does nothing if \a isZero() for either. - /// - /// The value that compares smaller will lose precision, and possibly become - /// \a isZero(). - UnsignedFloat matchExponents(UnsignedFloat X); - - /// \brief Increase exponent to match another float. - /// - /// Increases \c this to have an exponent matching \c X. May decrease the - /// exponent of \c X in the process, and \c this may possibly become \a - /// isZero(). - void increaseExponentToMatch(UnsignedFloat &X, int32_t ExponentDiff); - -public: - /// \brief Scale a large number accurately. - /// - /// Scale N (multiply it by this). Uses full precision multiplication, even - /// if Width is smaller than 64, so information is not lost. - uint64_t scale(uint64_t N) const; - uint64_t scaleByInverse(uint64_t N) const { - // TODO: implement directly, rather than relying on inverse. Inverse is - // expensive. - return inverse().scale(N); - } - int64_t scale(int64_t N) const { - std::pair Unsigned = splitSigned(N); - return joinSigned(scale(Unsigned.first), Unsigned.second); - } - int64_t scaleByInverse(int64_t N) const { - std::pair Unsigned = splitSigned(N); - return joinSigned(scaleByInverse(Unsigned.first), Unsigned.second); - } - - int compare(const UnsignedFloat &X) const; - int compareTo(uint64_t N) const { - UnsignedFloat Float = getFloat(N); - int Compare = compare(Float); - if (Width == 64 || Compare != 0) - return Compare; - - // Check for precision loss. We know *this == RoundTrip. - uint64_t RoundTrip = Float.template toInt(); - return N == RoundTrip ? 0 : RoundTrip < N ? -1 : 1; - } - int compareTo(int64_t N) const { return N < 0 ? 
1 : compareTo(uint64_t(N)); } - - UnsignedFloat &invert() { return *this = UnsignedFloat::getFloat(1) / *this; } - UnsignedFloat inverse() const { return UnsignedFloat(*this).invert(); } - -private: - static UnsignedFloat getProduct(DigitsType L, DigitsType R); - static UnsignedFloat getQuotient(DigitsType Dividend, DigitsType Divisor); - - std::pair lgImpl() const; - static int countLeadingZerosWidth(DigitsType Digits) { - if (Width == 64) - return countLeadingZeros64(Digits); - if (Width == 32) - return countLeadingZeros32(Digits); - return countLeadingZeros32(Digits) + Width - 32; - } - - static UnsignedFloat adjustToWidth(uint64_t N, int32_t S) { - assert(S >= MinExponent); - assert(S <= MaxExponent); - if (Width == 64 || N <= DigitsLimits::max()) - return UnsignedFloat(N, S); - - // Shift right. - int Shift = 64 - Width - countLeadingZeros64(N); - DigitsType Shifted = N >> Shift; - - // Round. - assert(S + Shift <= MaxExponent); - return getRounded(UnsignedFloat(Shifted, S + Shift), - N & UINT64_C(1) << (Shift - 1)); - } - - static UnsignedFloat getRounded(UnsignedFloat P, bool Round) { - if (!Round) - return P; - if (P.Digits == DigitsLimits::max()) - // Careful of overflow in the exponent. - return UnsignedFloat(1, P.Exponent) <<= Width; - return UnsignedFloat(P.Digits + 1, P.Exponent); - } -}; - -#define UNSIGNED_FLOAT_BOP(op, base) \ - template \ - UnsignedFloat operator op(const UnsignedFloat &L, \ - const UnsignedFloat &R) { \ - return UnsignedFloat(L) base R; \ - } -UNSIGNED_FLOAT_BOP(+, += ) -UNSIGNED_FLOAT_BOP(-, -= ) -UNSIGNED_FLOAT_BOP(*, *= ) -UNSIGNED_FLOAT_BOP(/, /= ) -UNSIGNED_FLOAT_BOP(<<, <<= ) -UNSIGNED_FLOAT_BOP(>>, >>= ) -#undef UNSIGNED_FLOAT_BOP - -template -raw_ostream &operator<<(raw_ostream &OS, const UnsignedFloat &X) { - return X.print(OS, 10); -} - -#define UNSIGNED_FLOAT_COMPARE_TO_TYPE(op, T1, T2) \ - template \ - bool operator op(const UnsignedFloat &L, T1 R) { \ - return L.compareTo(T2(R)) op 0; \ - } \ - template \ - bool operator op(T1 L, const UnsignedFloat &R) { \ - return 0 op R.compareTo(T2(L)); \ - } -#define UNSIGNED_FLOAT_COMPARE_TO(op) \ - UNSIGNED_FLOAT_COMPARE_TO_TYPE(op, uint64_t, uint64_t) \ - UNSIGNED_FLOAT_COMPARE_TO_TYPE(op, uint32_t, uint64_t) \ - UNSIGNED_FLOAT_COMPARE_TO_TYPE(op, int64_t, int64_t) \ - UNSIGNED_FLOAT_COMPARE_TO_TYPE(op, int32_t, int64_t) -UNSIGNED_FLOAT_COMPARE_TO(< ) -UNSIGNED_FLOAT_COMPARE_TO(> ) -UNSIGNED_FLOAT_COMPARE_TO(== ) -UNSIGNED_FLOAT_COMPARE_TO(!= ) -UNSIGNED_FLOAT_COMPARE_TO(<= ) -UNSIGNED_FLOAT_COMPARE_TO(>= ) -#undef UNSIGNED_FLOAT_COMPARE_TO -#undef UNSIGNED_FLOAT_COMPARE_TO_TYPE - -template -uint64_t UnsignedFloat::scale(uint64_t N) const { - if (Width == 64 || N <= DigitsLimits::max()) - return (getFloat(N) * *this).template toInt(); - - // Defer to the 64-bit version. - return UnsignedFloat(Digits, Exponent).scale(N); -} - -template -UnsignedFloat UnsignedFloat::getProduct(DigitsType L, - DigitsType R) { - // Check for zero. - if (!L || !R) - return getZero(); - - // Check for numbers that we can compute with 64-bit math. - if (Width <= 32 || (L <= UINT32_MAX && R <= UINT32_MAX)) - return adjustToWidth(uint64_t(L) * uint64_t(R), 0); - - // Do the full thing. - return UnsignedFloat(multiply64(L, R)); -} -template -UnsignedFloat UnsignedFloat::getQuotient(DigitsType Dividend, - DigitsType Divisor) { - // Check for zero. 
- if (!Dividend) - return getZero(); - if (!Divisor) - return getLargest(); - - if (Width == 64) - return UnsignedFloat(divide64(Dividend, Divisor)); - - // We can compute this with 64-bit math. - int Shift = countLeadingZeros64(Dividend); - uint64_t Shifted = uint64_t(Dividend) << Shift; - uint64_t Quotient = Shifted / Divisor; - - // If Quotient needs to be shifted, then adjustToWidth will round. - if (Quotient > DigitsLimits::max()) - return adjustToWidth(Quotient, -Shift); - - // Round based on the value of the next bit. - return getRounded(UnsignedFloat(Quotient, -Shift), - Shifted % Divisor >= getHalf(Divisor)); -} - -template -template -IntT UnsignedFloat::toInt() const { - typedef std::numeric_limits Limits; - if (*this < 1) - return 0; - if (*this >= Limits::max()) - return Limits::max(); - - IntT N = Digits; - if (Exponent > 0) { - assert(size_t(Exponent) < sizeof(IntT) * 8); - return N << Exponent; - } - if (Exponent < 0) { - assert(size_t(-Exponent) < sizeof(IntT) * 8); - return N >> -Exponent; - } - return N; -} - -template -std::pair UnsignedFloat::lgImpl() const { - if (isZero()) - return std::make_pair(INT32_MIN, 0); - - // Get the floor of the lg of Digits. - int32_t LocalFloor = Width - countLeadingZerosWidth(Digits) - 1; - - // Get the floor of the lg of this. - int32_t Floor = Exponent + LocalFloor; - if (Digits == UINT64_C(1) << LocalFloor) - return std::make_pair(Floor, 0); - - // Round based on the next digit. - assert(LocalFloor >= 1); - bool Round = Digits & UINT64_C(1) << (LocalFloor - 1); - return std::make_pair(Floor + Round, Round ? 1 : -1); -} - -template -UnsignedFloat UnsignedFloat::matchExponents(UnsignedFloat X) { - if (isZero() || X.isZero() || Exponent == X.Exponent) - return X; - - int32_t Diff = int32_t(X.Exponent) - int32_t(Exponent); - if (Diff > 0) - increaseExponentToMatch(X, Diff); - else - X.increaseExponentToMatch(*this, -Diff); - return X; -} -template -void UnsignedFloat::increaseExponentToMatch(UnsignedFloat &X, - int32_t ExponentDiff) { - assert(ExponentDiff > 0); - if (ExponentDiff >= 2 * Width) { - *this = getZero(); - return; - } - - // Use up any leading zeros on X, and then shift this. - int32_t ShiftX = std::min(countLeadingZerosWidth(X.Digits), ExponentDiff); - assert(ShiftX < Width); - - int32_t ShiftThis = ExponentDiff - ShiftX; - if (ShiftThis >= Width) { - *this = getZero(); - return; - } - - X.Digits <<= ShiftX; - X.Exponent -= ShiftX; - Digits >>= ShiftThis; - Exponent += ShiftThis; - return; -} - -template -UnsignedFloat &UnsignedFloat:: -operator+=(const UnsignedFloat &X) { - if (isLargest() || X.isZero()) - return *this; - if (isZero() || X.isLargest()) - return *this = X; - - // Normalize exponents. - UnsignedFloat Scaled = matchExponents(X); - - // Check for zero again. - if (isZero()) - return *this = Scaled; - if (Scaled.isZero()) - return *this; - - // Compute sum. - DigitsType Sum = Digits + Scaled.Digits; - bool DidOverflow = Sum < Digits; - Digits = Sum; - if (!DidOverflow) - return *this; - - if (Exponent == MaxExponent) - return *this = getLargest(); - - ++Exponent; - Digits = UINT64_C(1) << (Width - 1) | Digits >> 1; - - return *this; -} -template -UnsignedFloat &UnsignedFloat:: -operator-=(const UnsignedFloat &X) { - if (X.isZero()) - return *this; - if (*this <= X) - return *this = getZero(); - - // Normalize exponents. - UnsignedFloat Scaled = matchExponents(X); - assert(Digits >= Scaled.Digits); - - // Compute difference. 
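// (Why the subtraction below re-checks Scaled for zero: matchExponents() may
// shift the smaller operand's digits entirely away, so Scaled can be zero
// even though X itself was not. Simply returning *this would then over-report
// the result by up to one unit in the last place; the special case that
// follows corrects the one value for which the error is representable.)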
- if (!Scaled.isZero()) { - Digits -= Scaled.Digits; - return *this; - } - - // Check if X just barely lost its last bit. E.g., for 32-bit: - // - // 1*2^32 - 1*2^0 == 0xffffffff != 1*2^32 - if (*this == UnsignedFloat(1, X.lgFloor() + Width)) { - Digits = DigitsType(0) - 1; - --Exponent; - } - return *this; -} -template -UnsignedFloat &UnsignedFloat:: -operator*=(const UnsignedFloat &X) { - if (isZero()) - return *this; - if (X.isZero()) - return *this = X; - - // Save the exponents. - int32_t Exponents = int32_t(Exponent) + int32_t(X.Exponent); - - // Get the raw product. - *this = getProduct(Digits, X.Digits); - - // Combine with exponents. - return *this <<= Exponents; -} -template -UnsignedFloat &UnsignedFloat:: -operator/=(const UnsignedFloat &X) { - if (isZero()) - return *this; - if (X.isZero()) - return *this = getLargest(); - - // Save the exponents. - int32_t Exponents = int32_t(Exponent) - int32_t(X.Exponent); - - // Get the raw quotient. - *this = getQuotient(Digits, X.Digits); - - // Combine with exponents. - return *this <<= Exponents; -} -template -void UnsignedFloat::shiftLeft(int32_t Shift) { - if (!Shift || isZero()) - return; - assert(Shift != INT32_MIN); - if (Shift < 0) { - shiftRight(-Shift); - return; - } - - // Shift as much as we can in the exponent. - int32_t ExponentShift = std::min(Shift, MaxExponent - Exponent); - Exponent += ExponentShift; - if (ExponentShift == Shift) - return; - - // Check this late, since it's rare. - if (isLargest()) - return; - - // Shift the digits themselves. - Shift -= ExponentShift; - if (Shift > countLeadingZerosWidth(Digits)) { - // Saturate. - *this = getLargest(); - return; - } - - Digits <<= Shift; - return; -} - -template -void UnsignedFloat::shiftRight(int32_t Shift) { - if (!Shift || isZero()) - return; - assert(Shift != INT32_MIN); - if (Shift < 0) { - shiftLeft(-Shift); - return; - } - - // Shift as much as we can in the exponent. - int32_t ExponentShift = std::min(Shift, Exponent - MinExponent); - Exponent -= ExponentShift; - if (ExponentShift == Shift) - return; - - // Shift the digits themselves. - Shift -= ExponentShift; - if (Shift >= Width) { - // Saturate. - *this = getZero(); - return; - } - - Digits >>= Shift; - return; -} +class BasicBlock; +class BranchProbabilityInfo; +class Function; +class Loop; +class LoopInfo; +class MachineBasicBlock; +class MachineBranchProbabilityInfo; +class MachineFunction; +class MachineLoop; +class MachineLoopInfo; -template -int UnsignedFloat::compare(const UnsignedFloat &X) const { - // Check for zero. - if (isZero()) - return X.isZero() ? 0 : -1; - if (X.isZero()) - return 1; - - // Check for the scale. Use lgFloor to be sure that the exponent difference - // is always lower than 64. - int32_t lgL = lgFloor(), lgR = X.lgFloor(); - if (lgL != lgR) - return lgL < lgR ? -1 : 1; - - // Compare digits. - if (Exponent < X.Exponent) - return UnsignedFloatBase::compare(Digits, X.Digits, X.Exponent - Exponent); - - return -UnsignedFloatBase::compare(X.Digits, Digits, Exponent - X.Exponent); -} +namespace bfi_detail { -template struct isPodLike> { - static const bool value = true; -}; -} +struct IrreducibleGraph; -//===----------------------------------------------------------------------===// -// -// BlockMass definition. -// -// TODO: Make this private to BlockFrequencyInfoImpl or delete. -// -//===----------------------------------------------------------------------===// -namespace llvm { +// This is part of a workaround for a GCC 4.7 crash on lambdas. 
+template struct BlockEdgesAdder; /// \brief Mass of a block. /// @@ -770,11 +112,11 @@ class BlockMass { bool operator<(const BlockMass &X) const { return Mass < X.Mass; } bool operator>(const BlockMass &X) const { return Mass > X.Mass; } - /// \brief Convert to floating point. + /// \brief Convert to scaled number. /// - /// Convert to a float. \a isFull() gives 1.0, while \a isEmpty() gives - /// slightly above 0.0. - UnsignedFloat toFloat() const; + /// Convert to \a ScaledNumber. \a isFull() gives 1.0, while \a isEmpty() + /// gives slightly above 0.0. + ScaledNumber toScaled() const; void dump() const; raw_ostream &print(raw_ostream &OS) const; @@ -797,35 +139,11 @@ inline raw_ostream &operator<<(raw_ostream &OS, const BlockMass &X) { return X.print(OS); } -template <> struct isPodLike { +} // end namespace bfi_detail + +template <> struct isPodLike { static const bool value = true; }; -} - -//===----------------------------------------------------------------------===// -// -// BlockFrequencyInfoImpl definition. -// -//===----------------------------------------------------------------------===// -namespace llvm { - -class BasicBlock; -class BranchProbabilityInfo; -class Function; -class Loop; -class LoopInfo; -class MachineBasicBlock; -class MachineBranchProbabilityInfo; -class MachineFunction; -class MachineLoop; -class MachineLoopInfo; - -namespace bfi_detail { -struct IrreducibleGraph; - -// This is part of a workaround for a GCC 4.7 crash on lambdas. -template struct BlockEdgesAdder; -} /// \brief Base class for BlockFrequencyInfoImpl /// @@ -837,7 +155,8 @@ template struct BlockEdgesAdder; /// BlockFrequencyInfoImpl. See there for details. class BlockFrequencyInfoImplBase { public: - typedef UnsignedFloat Float; + typedef ScaledNumber Scaled64; + typedef bfi_detail::BlockMass BlockMass; /// \brief Representative of a block. /// @@ -866,7 +185,7 @@ class BlockFrequencyInfoImplBase { /// \brief Stats about a block itself. struct FrequencyData { - Float Floating; + Scaled64 Scaled; uint64_t Integer; }; @@ -884,7 +203,7 @@ class BlockFrequencyInfoImplBase { NodeList Nodes; ///< Header and the members of the loop. BlockMass BackedgeMass; ///< Mass returned to loop header. BlockMass Mass; - Float Scale; + Scaled64 Scale; LoopData(LoopData *Parent, const BlockNode &Header) : Parent(Parent), IsPackaged(false), NumHeaders(1), Nodes(1, Header) {} @@ -1003,6 +322,8 @@ class BlockFrequencyInfoImplBase { BlockNode TargetNode; uint64_t Amount; Weight() : Type(Local), Amount(0) {} + Weight(DistType Type, BlockNode TargetNode, uint64_t Amount) + : Type(Type), TargetNode(TargetNode), Amount(Amount) {} }; /// \brief Distribution of unscaled probability weight. @@ -1131,7 +452,7 @@ class BlockFrequencyInfoImplBase { virtual raw_ostream &print(raw_ostream &OS) const { return OS; } void dump() const { print(dbgs()); } - Float getFloatingBlockFreq(const BlockNode &Node) const; + Scaled64 getFloatingBlockFreq(const BlockNode &Node) const; BlockFrequency getBlockFreq(const BlockNode &Node) const; @@ -1310,7 +631,7 @@ void IrreducibleGraph::addEdges(const BlockNode &Node, /// entries point to this block. Its successors are the headers, which split /// the frequency evenly. /// -/// This algorithm leverages BlockMass and UnsignedFloat to maintain precision, +/// This algorithm leverages BlockMass and ScaledNumber to maintain precision, /// separates mass distribution from loop scaling, and dithers to eliminate /// probability mass loss. 
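// The dithering mentioned above, in isolation: when an integer quantity of
// mass is split across weighted edges, per-edge rounding must not create or
// destroy mass. A minimal sketch of the idea only -- not the BlockMass API;
// the helper name, its signature, and the use of __uint128_t are assumptions
// for illustration:

#include <cstdint>
#include <vector>

// Split Total proportionally to Weights so the shares sum to exactly Total.
// Rounding each *prefix* sum instead of each share keeps the per-edge error
// bounded by one unit and lets no mass leak.
static std::vector<uint64_t> splitMass(uint64_t Total,
                                       const std::vector<uint64_t> &Weights) {
  uint64_t Sum = 0;
  for (uint64_t W : Weights)
    Sum += W;
  std::vector<uint64_t> Shares(Weights.size(), 0);
  if (Sum == 0)
    return Shares; // No weight anywhere; hand out nothing.

  uint64_t Used = 0, Prefix = 0;
  for (size_t I = 0, E = Weights.size(); I != E; ++I) {
    Prefix += Weights[I];
    uint64_t RoundedPrefix = (uint64_t)(((__uint128_t)Total * Prefix) / Sum);
    Shares[I] = RoundedPrefix - Used; // Difference of two rounded prefixes.
    Used = RoundedPrefix;
  }
  return Shares; // Used == Total on exit: the last prefix is Sum itself.
}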
/// @@ -1568,7 +889,7 @@ template class BlockFrequencyInfoImpl : BlockFrequencyInfoImplBase { BlockFrequency getBlockFreq(const BlockT *BB) const { return BlockFrequencyInfoImplBase::getBlockFreq(getNode(BB)); } - Float getFloatingBlockFreq(const BlockT *BB) const { + Scaled64 getFloatingBlockFreq(const BlockT *BB) const { return BlockFrequencyInfoImplBase::getFloatingBlockFreq(getNode(BB)); } @@ -1852,7 +1173,8 @@ raw_ostream &BlockFrequencyInfoImpl::print(raw_ostream &OS) const { OS << "\n"; return OS; } -} + +} // end namespace llvm #undef DEBUG_TYPE diff --git a/include/llvm/Analysis/CaptureTracking.h b/include/llvm/Analysis/CaptureTracking.h index eccf1f811381..8b7c7a90f7c0 100644 --- a/include/llvm/Analysis/CaptureTracking.h +++ b/include/llvm/Analysis/CaptureTracking.h @@ -18,6 +18,8 @@ namespace llvm { class Value; class Use; + class Instruction; + class DominatorTree; /// PointerMayBeCaptured - Return true if this pointer value may be captured /// by the enclosing function (which is required to exist). This routine can @@ -30,6 +32,20 @@ namespace llvm { bool ReturnCaptures, bool StoreCaptures); + /// PointerMayBeCapturedBefore - Return true if this pointer value may be + /// captured by the enclosing function (which is required to exist). If a + /// DominatorTree is provided, only captures which happen before the given + /// instruction are considered. This routine can be expensive, so consider + /// caching the results. The boolean ReturnCaptures specifies whether + /// returning the value (or part of it) from the function counts as capturing + /// it or not. The boolean StoreCaptures specified whether storing the value + /// (or part of it) into memory anywhere automatically counts as capturing it + /// or not. Captures by the provided instruction are considered if the + /// final parameter is true. + bool PointerMayBeCapturedBefore(const Value *V, bool ReturnCaptures, + bool StoreCaptures, const Instruction *I, + DominatorTree *DT, bool IncludeI = false); + /// This callback is used in conjunction with PointerMayBeCaptured. In /// addition to the interface here, you'll need to provide your own getters /// to see whether anything was captured. diff --git a/include/llvm/Analysis/DominanceFrontier.h b/include/llvm/Analysis/DominanceFrontier.h index 0fbaa13bd3ba..f42b9cbbfedd 100644 --- a/include/llvm/Analysis/DominanceFrontier.h +++ b/include/llvm/Analysis/DominanceFrontier.h @@ -23,168 +23,186 @@ #include namespace llvm { - + //===----------------------------------------------------------------------===// /// DominanceFrontierBase - Common base class for computing forward and inverse /// dominance frontiers for a function. /// -class DominanceFrontierBase : public FunctionPass { +template +class DominanceFrontierBase { public: - typedef std::set DomSetType; // Dom set for a bb - typedef std::map DomSetMapType; // Dom set map + typedef std::set DomSetType; // Dom set for a bb + typedef std::map DomSetMapType; // Dom set map + protected: + typedef GraphTraits BlockTraits; + DomSetMapType Frontiers; - std::vector Roots; + std::vector Roots; const bool IsPostDominators; public: - DominanceFrontierBase(char &ID, bool isPostDom) - : FunctionPass(ID), IsPostDominators(isPostDom) {} + DominanceFrontierBase(bool isPostDom) : IsPostDominators(isPostDom) {} /// getRoots - Return the root blocks of the current CFG. This may include /// multiple blocks if we are computing post dominators. For forward /// dominators, this will always be a single block (the entry node). 
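// How a client drives the templated interface above once this patch is in:
// build a DominatorTree, hand it to analyze(), then look blocks up. A minimal
// sketch; the free function and its use outside the pass framework are
// assumptions for illustration:

#include "llvm/Analysis/DominanceFrontier.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

static void printFrontier(Function &F, BasicBlock *BB) {
  DominatorTree DT;
  DT.recalculate(F);

  ForwardDominanceFrontierBase<BasicBlock> DF;
  DF.analyze(DT); // Computes the root and all frontiers from the tree.

  auto It = DF.find(BB);
  if (It == DF.end())
    return; // No frontier recorded for BB.
  for (BasicBlock *FrontierBB : It->second) {
    FrontierBB->printAsOperand(errs(), /*PrintType=*/false);
    errs() << ' ';
  }
  errs() << '\n';
}

// For intuition: in a diamond CFG A -> {B, C} -> D, the frontier of B and of
// C is {D} (neither dominates the join point), while DF(A) and DF(D) are
// empty.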
/// - inline const std::vector &getRoots() const { return Roots; } + inline const std::vector &getRoots() const { + return Roots; + } + + BlockT *getRoot() const { + assert(Roots.size() == 1 && "Should always have entry node!"); + return Roots[0]; + } /// isPostDominator - Returns true if analysis based of postdoms /// - bool isPostDominator() const { return IsPostDominators; } + bool isPostDominator() const { + return IsPostDominators; + } - void releaseMemory() override { Frontiers.clear(); } + void releaseMemory() { + Frontiers.clear(); + } // Accessor interface: - typedef DomSetMapType::iterator iterator; - typedef DomSetMapType::const_iterator const_iterator; - iterator begin() { return Frontiers.begin(); } + typedef typename DomSetMapType::iterator iterator; + typedef typename DomSetMapType::const_iterator const_iterator; + iterator begin() { return Frontiers.begin(); } const_iterator begin() const { return Frontiers.begin(); } - iterator end() { return Frontiers.end(); } - const_iterator end() const { return Frontiers.end(); } - iterator find(BasicBlock *B) { return Frontiers.find(B); } - const_iterator find(BasicBlock *B) const { return Frontiers.find(B); } + iterator end() { return Frontiers.end(); } + const_iterator end() const { return Frontiers.end(); } + iterator find(BlockT *B) { return Frontiers.find(B); } + const_iterator find(BlockT *B) const { return Frontiers.find(B); } - iterator addBasicBlock(BasicBlock *BB, const DomSetType &frontier) { + iterator addBasicBlock(BlockT *BB, const DomSetType &frontier) { assert(find(BB) == end() && "Block already in DominanceFrontier!"); return Frontiers.insert(std::make_pair(BB, frontier)).first; } /// removeBlock - Remove basic block BB's frontier. - void removeBlock(BasicBlock *BB) { - assert(find(BB) != end() && "Block is not in DominanceFrontier!"); - for (iterator I = begin(), E = end(); I != E; ++I) - I->second.erase(BB); - Frontiers.erase(BB); - } + void removeBlock(BlockT *BB); - void addToFrontier(iterator I, BasicBlock *Node) { - assert(I != end() && "BB is not in DominanceFrontier!"); - I->second.insert(Node); - } + void addToFrontier(iterator I, BlockT *Node); - void removeFromFrontier(iterator I, BasicBlock *Node) { - assert(I != end() && "BB is not in DominanceFrontier!"); - assert(I->second.count(Node) && "Node is not in DominanceFrontier of BB"); - I->second.erase(Node); - } + void removeFromFrontier(iterator I, BlockT *Node); /// compareDomSet - Return false if two domsets match. Otherwise /// return true; - bool compareDomSet(DomSetType &DS1, const DomSetType &DS2) const { - std::set tmpSet; - for (DomSetType::const_iterator I = DS2.begin(), - E = DS2.end(); I != E; ++I) - tmpSet.insert(*I); - - for (DomSetType::const_iterator I = DS1.begin(), - E = DS1.end(); I != E; ) { - BasicBlock *Node = *I++; - - if (tmpSet.erase(Node) == 0) - // Node is in DS1 but not in DS2. - return true; - } - - if (!tmpSet.empty()) - // There are nodes that are in DS2 but not in DS1. - return true; - - // DS1 and DS2 matches. - return false; - } + bool compareDomSet(DomSetType &DS1, const DomSetType &DS2) const; /// compare - Return true if the other dominance frontier base matches /// this dominance frontier base. Otherwise return false. 
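// (Note on the contract preserved by this refactoring: despite the wording of
// the comment above, compare() follows compareDomSet() and returns true when
// the two frontiers *differ* and false when they match -- both in the inline
// implementation removed below and in its out-of-line replacement in
// DominanceFrontierImpl.h.)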
- bool compare(DominanceFrontierBase &Other) const { - DomSetMapType tmpFrontiers; - for (DomSetMapType::const_iterator I = Other.begin(), - E = Other.end(); I != E; ++I) - tmpFrontiers.insert(std::make_pair(I->first, I->second)); - - for (DomSetMapType::iterator I = tmpFrontiers.begin(), - E = tmpFrontiers.end(); I != E; ) { - BasicBlock *Node = I->first; - const_iterator DFI = find(Node); - if (DFI == end()) - return true; - - if (compareDomSet(I->second, DFI->second)) - return true; - - ++I; - tmpFrontiers.erase(Node); - } - - if (!tmpFrontiers.empty()) - return true; - - return false; - } + bool compare(DominanceFrontierBase &Other) const; /// print - Convert to human readable form /// - void print(raw_ostream &OS, const Module* = nullptr) const override; + void print(raw_ostream &OS) const; /// dump - Dump the dominance frontier to dbgs(). void dump() const; }; - //===------------------------------------- /// DominanceFrontier Class - Concrete subclass of DominanceFrontierBase that is /// used to compute a forward dominator frontiers. /// -class DominanceFrontier : public DominanceFrontierBase { - virtual void anchor(); +template +class ForwardDominanceFrontierBase : public DominanceFrontierBase { +private: + typedef GraphTraits BlockTraits; + public: + typedef DominatorTreeBase DomTreeT; + typedef DomTreeNodeBase DomTreeNodeT; + typedef typename DominanceFrontierBase::DomSetType DomSetType; + + ForwardDominanceFrontierBase() : DominanceFrontierBase(false) {} + + void analyze(DomTreeT &DT) { + this->Roots = DT.getRoots(); + assert(this->Roots.size() == 1 && + "Only one entry block for forward domfronts!"); + calculate(DT, DT[this->Roots[0]]); + } + + const DomSetType &calculate(const DomTreeT &DT, const DomTreeNodeT *Node); +}; + +class DominanceFrontier : public FunctionPass { + ForwardDominanceFrontierBase Base; + +public: + typedef DominatorTreeBase DomTreeT; + typedef DomTreeNodeBase DomTreeNodeT; + typedef DominanceFrontierBase::DomSetType DomSetType; + typedef DominanceFrontierBase::iterator iterator; + typedef DominanceFrontierBase::const_iterator const_iterator; + static char ID; // Pass ID, replacement for typeid - DominanceFrontier() : - DominanceFrontierBase(ID, false) { - initializeDominanceFrontierPass(*PassRegistry::getPassRegistry()); - } - BasicBlock *getRoot() const { - assert(Roots.size() == 1 && "Should always have entry node!"); - return Roots[0]; + DominanceFrontier(); + + ForwardDominanceFrontierBase &getBase() { return Base; } + + inline const std::vector &getRoots() const { + return Base.getRoots(); } - bool runOnFunction(Function &) override { - Frontiers.clear(); - DominatorTree &DT = getAnalysis().getDomTree(); - Roots = DT.getRoots(); - assert(Roots.size() == 1 && "Only one entry block for forward domfronts!"); - calculate(DT, DT[Roots[0]]); - return false; + BasicBlock *getRoot() const { return Base.getRoot(); } + + bool isPostDominator() const { return Base.isPostDominator(); } + + iterator begin() { return Base.begin(); } + + const_iterator begin() const { return Base.begin(); } + + iterator end() { return Base.end(); } + + const_iterator end() const { return Base.end(); } + + iterator find(BasicBlock *B) { return Base.find(B); } + + const_iterator find(BasicBlock *B) const { return Base.find(B); } + + iterator addBasicBlock(BasicBlock *BB, const DomSetType &frontier) { + return Base.addBasicBlock(BB, frontier); + } + + void removeBlock(BasicBlock *BB) { return Base.removeBlock(BB); } + + void addToFrontier(iterator I, BasicBlock *Node) { + return 
Base.addToFrontier(I, Node);
+  }
+
+  void removeFromFrontier(iterator I, BasicBlock *Node) {
+    return Base.removeFromFrontier(I, Node);
   }
 
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.setPreservesAll();
-    AU.addRequired<DominatorTreeWrapperPass>();
+  bool compareDomSet(DomSetType &DS1, const DomSetType &DS2) const {
+    return Base.compareDomSet(DS1, DS2);
   }
 
-  const DomSetType &calculate(const DominatorTree &DT,
-                              const DomTreeNode *Node);
+  bool compare(DominanceFrontierBase<BasicBlock> &Other) const {
+    return Base.compare(Other);
+  }
+
+  void releaseMemory() override;
+
+  bool runOnFunction(Function &) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+  void print(raw_ostream &OS, const Module * = nullptr) const override;
+
+  void dump() const;
 };
 
+EXTERN_TEMPLATE_INSTANTIATION(class DominanceFrontierBase<BasicBlock>);
+EXTERN_TEMPLATE_INSTANTIATION(class ForwardDominanceFrontierBase<BasicBlock>);
+
 } // End llvm namespace
 
 #endif
diff --git a/include/llvm/Analysis/DominanceFrontierImpl.h b/include/llvm/Analysis/DominanceFrontierImpl.h
new file mode 100644
index 000000000000..04df2cc35d46
--- /dev/null
+++ b/include/llvm/Analysis/DominanceFrontierImpl.h
@@ -0,0 +1,228 @@
+//===- llvm/Analysis/DominanceFrontierImpl.h - Dominator Frontiers -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is the generic implementation of the DominanceFrontier class, which
+// calculates and holds the dominance frontier for a function.
+//
+// This should be considered deprecated; don't add any more uses of this data
+// structure.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ANALYSIS_DOMINANCEFRONTIER_IMPL_H
+#define LLVM_ANALYSIS_DOMINANCEFRONTIER_IMPL_H
+
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Support/Debug.h"
+
+namespace llvm {
+
+namespace {
+template <class BlockT>
+class DFCalculateWorkObject {
+public:
+  typedef DomTreeNodeBase<BlockT> DomTreeNodeT;
+
+  DFCalculateWorkObject(BlockT *B, BlockT *P, const DomTreeNodeT *N,
+                        const DomTreeNodeT *PN)
+      : currentBB(B), parentBB(P), Node(N), parentNode(PN) {}
+  BlockT *currentBB;
+  BlockT *parentBB;
+  const DomTreeNodeT *Node;
+  const DomTreeNodeT *parentNode;
+};
+}
+
+template <class BlockT>
+void DominanceFrontierBase<BlockT>::removeBlock(BlockT *BB) {
+  assert(find(BB) != end() && "Block is not in DominanceFrontier!");
+  for (iterator I = begin(), E = end(); I != E; ++I)
+    I->second.erase(BB);
+  Frontiers.erase(BB);
+}
+
+template <class BlockT>
+void DominanceFrontierBase<BlockT>::addToFrontier(iterator I,
+                                                  BlockT *Node) {
+  assert(I != end() && "BB is not in DominanceFrontier!");
+  I->second.insert(Node);
+}
+
+template <class BlockT>
+void DominanceFrontierBase<BlockT>::removeFromFrontier(iterator I,
+                                                       BlockT *Node) {
+  assert(I != end() && "BB is not in DominanceFrontier!");
+  assert(I->second.count(Node) && "Node is not in DominanceFrontier of BB");
+  I->second.erase(Node);
+}
+
+template <class BlockT>
+bool DominanceFrontierBase<BlockT>::compareDomSet(DomSetType &DS1,
+                                                  const DomSetType &DS2) const {
+  std::set<BlockT *> tmpSet;
+  for (BlockT *BB : DS2)
+    tmpSet.insert(BB);
+
+  for (typename DomSetType::const_iterator I = DS1.begin(), E = DS1.end();
+       I != E;) {
+    BlockT *Node = *I++;
+
+    if (tmpSet.erase(Node) == 0)
+      // Node is in DS1 but not in DS2.
+      return true;
+  }
+
+  if (!tmpSet.empty()) {
+    // There are nodes that are in DS2 but not in DS1.
+    return true;
+  }
+
+  // DS1 and DS2 match.
+  return false;
+}
+
+template <class BlockT>
+bool DominanceFrontierBase<BlockT>::compare(
+    DominanceFrontierBase<BlockT> &Other) const {
+  DomSetMapType tmpFrontiers;
+  for (typename DomSetMapType::const_iterator I = Other.begin(),
+                                              E = Other.end();
+       I != E; ++I)
+    tmpFrontiers.insert(std::make_pair(I->first, I->second));
+
+  for (typename DomSetMapType::iterator I = tmpFrontiers.begin(),
+                                        E = tmpFrontiers.end();
+       I != E;) {
+    BlockT *Node = I->first;
+    const_iterator DFI = find(Node);
+    if (DFI == end())
+      return true;
+
+    if (compareDomSet(I->second, DFI->second))
+      return true;
+
+    ++I;
+    tmpFrontiers.erase(Node);
+  }
+
+  if (!tmpFrontiers.empty())
+    return true;
+
+  return false;
+}
+
+template <class BlockT>
+void DominanceFrontierBase<BlockT>::print(raw_ostream &OS) const {
+  for (const_iterator I = begin(), E = end(); I != E; ++I) {
+    OS << "  DomFrontier for BB ";
+    if (I->first)
+      I->first->printAsOperand(OS, false);
+    else
+      OS << " <<exit node>>";
+    OS << " is:\t";
+
+    const std::set<BlockT *> &BBs = I->second;
+
+    for (const BlockT *BB : BBs) {
+      OS << ' ';
+      if (BB)
+        BB->printAsOperand(OS, false);
+      else
+        OS << "<<exit node>>";
+    }
+    OS << '\n';
+  }
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+template <class BlockT>
+void DominanceFrontierBase<BlockT>::dump() const {
+  print(dbgs());
+}
+#endif
+
+template <class BlockT>
+const typename ForwardDominanceFrontierBase<BlockT>::DomSetType &
+ForwardDominanceFrontierBase<BlockT>::calculate(const DomTreeT &DT,
+                                                const DomTreeNodeT *Node) {
+  BlockT *BB = Node->getBlock();
+  DomSetType *Result = nullptr;
+
+  std::vector<DFCalculateWorkObject<BlockT>> workList;
+  SmallPtrSet<BlockT *, 8> visited;
+
+  workList.push_back(DFCalculateWorkObject<BlockT>(BB, nullptr, Node, nullptr));
+  do {
+    DFCalculateWorkObject<BlockT> *currentW = &workList.back();
+    assert(currentW && "Missing work object.");
+
+    BlockT *currentBB = currentW->currentBB;
+    BlockT *parentBB = currentW->parentBB;
+    const DomTreeNodeT *currentNode = currentW->Node;
+    const DomTreeNodeT *parentNode = currentW->parentNode;
+    assert(currentBB && "Invalid work object. Missing current Basic Block");
+    assert(currentNode && "Invalid work object. Missing current Node");
+    DomSetType &S = this->Frontiers[currentBB];
+
+    // Visit each block only once.
+    if (visited.count(currentBB) == 0) {
+      visited.insert(currentBB);
+
+      // Loop over CFG successors to calculate DFlocal[currentNode]
+      for (auto SI = BlockTraits::child_begin(currentBB),
+                SE = BlockTraits::child_end(currentBB);
+           SI != SE; ++SI) {
+        // Does Node immediately dominate this successor?
+        if (DT[*SI]->getIDom() != currentNode)
+          S.insert(*SI);
+      }
+    }
+
+    // At this point, S is DFlocal.  Now we union in DFup's of our children...
+    // Loop through and visit the nodes that Node immediately dominates (Node's
+    // children in the IDomTree)
+    bool visitChild = false;
+    for (typename DomTreeNodeT::const_iterator NI = currentNode->begin(),
+                                               NE = currentNode->end();
+         NI != NE; ++NI) {
+      DomTreeNodeT *IDominee = *NI;
+      BlockT *childBB = IDominee->getBlock();
+      if (visited.count(childBB) == 0) {
+        workList.push_back(DFCalculateWorkObject<BlockT>(
+            childBB, currentBB, IDominee, currentNode));
+        visitChild = true;
+      }
+    }
+
+    // Pop this block only when no unvisited child remains: at that point S is
+    // the block's complete frontier (DFlocal plus the DFup of its children)
+    // and is merged into the parent's set below, or recorded as the final
+    // result at the root.
+ if (!visitChild) { + if (!parentBB) { + Result = &S; + break; + } + + typename DomSetType::const_iterator CDFI = S.begin(), CDFE = S.end(); + DomSetType &parentSet = this->Frontiers[parentBB]; + for (; CDFI != CDFE; ++CDFI) { + if (!DT.properlyDominates(parentNode, DT[*CDFI])) + parentSet.insert(*CDFI); + } + workList.pop_back(); + } + + } while (!workList.empty()); + + return *Result; +} + +} // End llvm namespace + +#endif diff --git a/include/llvm/Analysis/RegionInfo.h b/include/llvm/Analysis/RegionInfo.h index 82a788d1bb82..49c88fd5caeb 100644 --- a/include/llvm/Analysis/RegionInfo.h +++ b/include/llvm/Analysis/RegionInfo.h @@ -22,26 +22,77 @@ // itself is not, but in practice runtime seems to be in the order of magnitude // of dominance tree calculation. // +// WARNING: LLVM is generally very concerned about compile time such that +// the use of additional analysis passes in the default +// optimization sequence is avoided as much as possible. +// Specifically, if you do not need the RegionInfo, but dominance +// information could be sufficient please base your work only on +// the dominator tree. Most passes maintain it, such that using +// it has often near zero cost. In contrast RegionInfo is by +// default not available, is not maintained by existing +// transformations and there is no intention to do so. +// //===----------------------------------------------------------------------===// #ifndef LLVM_ANALYSIS_REGIONINFO_H #define LLVM_ANALYSIS_REGIONINFO_H +#include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/PointerIntPair.h" -#include "llvm/ADT/iterator_range.h" -#include "llvm/Analysis/DominanceFrontier.h" -#include "llvm/Analysis/PostDominators.h" -#include "llvm/Support/Allocator.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/Dominators.h" #include #include +#include namespace llvm { -class Region; -class RegionInfo; -class raw_ostream; +// RegionTraits - Class to be specialized for different users of RegionInfo +// (i.e. BasicBlocks or MachineBasicBlocks). This is only to avoid needing to +// pass around an unreasonable number of template parameters. +template +struct RegionTraits { + // FuncT + // BlockT + // RegionT + // RegionNodeT + // RegionInfoT + typedef typename FuncT_::UnknownRegionTypeError BrokenT; +}; + +class DominatorTree; +class DominanceFrontier; class Loop; class LoopInfo; +struct PostDominatorTree; +class raw_ostream; +class Region; +template +class RegionBase; +class RegionNode; +class RegionInfo; +template +class RegionInfoBase; + +template <> +struct RegionTraits { + typedef Function FuncT; + typedef BasicBlock BlockT; + typedef Region RegionT; + typedef RegionNode RegionNodeT; + typedef RegionInfo RegionInfoT; + typedef DominatorTree DomTreeT; + typedef DomTreeNode DomTreeNodeT; + typedef DominanceFrontier DomFrontierT; + typedef PostDominatorTree PostDomTreeT; + typedef Instruction InstT; + typedef Loop LoopT; + typedef LoopInfo LoopInfoT; + + static unsigned getNumSuccessors(BasicBlock *BB) { + return BB->getTerminator()->getNumSuccessors(); + } +}; /// @brief Marker class to iterate over the elements of a Region in flat mode. /// @@ -55,11 +106,18 @@ class FlatIt {}; /// @brief A RegionNode represents a subregion or a BasicBlock that is part of a /// Region. 
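// The RegionNode declared below packs its "is this a subregion?" flag into
// the low bit of the entry-block pointer, so a node costs no more than two
// pointers. A minimal sketch of the underlying utility; the demo function is
// an assumption for illustration:

#include "llvm/ADT/PointerIntPair.h"
#include "llvm/IR/BasicBlock.h"

static void pointerIntPairDemo(llvm::BasicBlock *Entry) {
  // One pointer-sized field carries both values; pointer alignment
  // guarantees the low bit is free for the flag.
  llvm::PointerIntPair<llvm::BasicBlock *, 1, bool> P(Entry, true);
  llvm::BasicBlock *BB = P.getPointer(); // The entry block back out.
  bool IsSubRegion = P.getInt();         // The packed flag.
  (void)BB;
  (void)IsSubRegion;
}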
-class RegionNode { - RegionNode(const RegionNode &) LLVM_DELETED_FUNCTION; - const RegionNode &operator=(const RegionNode &) LLVM_DELETED_FUNCTION; +template +class RegionNodeBase { + friend class RegionBase; + +public: + typedef typename Tr::BlockT BlockT; + typedef typename Tr::RegionT RegionT; + +private: + RegionNodeBase(const RegionNodeBase &) LLVM_DELETED_FUNCTION; + const RegionNodeBase &operator=(const RegionNodeBase &) LLVM_DELETED_FUNCTION; -protected: /// This is the entry basic block that starts this region node. If this is a /// BasicBlock RegionNode, then entry is just the basic block, that this /// RegionNode represents. Otherwise it is the entry of this (Sub)RegionNode. @@ -70,13 +128,13 @@ class RegionNode { /// The node can hold either a Region or a BasicBlock. /// Use one bit to save, if this RegionNode is a subregion or BasicBlock /// RegionNode. - PointerIntPair entry; + PointerIntPair entry; /// @brief The parent Region of this RegionNode. /// @see getParent() - Region* parent; + RegionT *parent; -public: +protected: /// @brief Create a RegionNode. /// /// @param Parent The parent of this RegionNode. @@ -85,9 +143,11 @@ class RegionNode { /// BasicBlock itself. If it represents a subregion, this /// is the entry BasicBlock of the subregion. /// @param isSubRegion If this RegionNode represents a SubRegion. - inline RegionNode(Region* Parent, BasicBlock* Entry, bool isSubRegion = 0) - : entry(Entry, isSubRegion), parent(Parent) {} + inline RegionNodeBase(RegionT *Parent, BlockT *Entry, + bool isSubRegion = false) + : entry(Entry, isSubRegion), parent(Parent) {} +public: /// @brief Get the parent Region of this RegionNode. /// /// The parent Region is the Region this RegionNode belongs to. If for @@ -96,7 +156,7 @@ class RegionNode { /// pointing to the Region this RegionNode belongs to. /// /// @return Get the parent Region of this RegionNode. - inline Region* getParent() const { return parent; } + inline RegionT *getParent() const { return parent; } /// @brief Get the entry BasicBlock of this RegionNode. /// @@ -104,7 +164,7 @@ class RegionNode { /// itself, otherwise we return the entry BasicBlock of the Subregion /// /// @return The entry BasicBlock of this RegionNode. - inline BasicBlock* getEntry() const { return entry.getPointer(); } + inline BlockT *getEntry() const { return entry.getPointer(); } /// @brief Get the content of this RegionNode. /// @@ -112,33 +172,15 @@ class RegionNode { /// check the type of the content with the isSubRegion() function call. /// /// @return The content of this RegionNode. - template - inline T* getNodeAs() const; + template inline T *getNodeAs() const; /// @brief Is this RegionNode a subregion? /// /// @return True if it contains a subregion. False if it contains a /// BasicBlock. - inline bool isSubRegion() const { - return entry.getInt(); - } + inline bool isSubRegion() const { return entry.getInt(); } }; -/// Print a RegionNode. -inline raw_ostream &operator<<(raw_ostream &OS, const RegionNode &Node); - -template<> -inline BasicBlock* RegionNode::getNodeAs() const { - assert(!isSubRegion() && "This is not a BasicBlock RegionNode!"); - return getEntry(); -} - -template<> -inline Region* RegionNode::getNodeAs() const { - assert(isSubRegion() && "This is not a subregion RegionNode!"); - return reinterpret_cast(const_cast(this)); -} - //===----------------------------------------------------------------------===// /// @brief A single entry single exit Region. 
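// What the templated interface below looks like from the outside: entry and
// exit delimit the region, contains() answers membership, and block iteration
// visits every block of the region, including those of nested subregions, but
// never the exit block itself. A minimal sketch; the counting helper is an
// assumption for illustration:

#include "llvm/Analysis/RegionInfo.h"

static unsigned countRegionBlocks(llvm::Region &R) {
  unsigned N = 0;
  for (auto I = R.block_begin(), E = R.block_end(); I != E; ++I)
    ++N; // *I is a BasicBlock of R or of one of its subregions.
  return N;
}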
/// @@ -201,37 +243,53 @@ inline Region* RegionNode::getNodeAs() const { /// /// The first call returns a textual representation of the program structure /// tree, the second one creates a graphical representation using graphviz. -class Region : public RegionNode { - friend class RegionInfo; - Region(const Region &) LLVM_DELETED_FUNCTION; - const Region &operator=(const Region &) LLVM_DELETED_FUNCTION; +template +class RegionBase : public RegionNodeBase { + typedef typename Tr::FuncT FuncT; + typedef typename Tr::BlockT BlockT; + typedef typename Tr::RegionInfoT RegionInfoT; + typedef typename Tr::RegionT RegionT; + typedef typename Tr::RegionNodeT RegionNodeT; + typedef typename Tr::DomTreeT DomTreeT; + typedef typename Tr::LoopT LoopT; + typedef typename Tr::LoopInfoT LoopInfoT; + typedef typename Tr::InstT InstT; + + typedef GraphTraits BlockTraits; + typedef GraphTraits> InvBlockTraits; + typedef typename BlockTraits::ChildIteratorType SuccIterTy; + typedef typename InvBlockTraits::ChildIteratorType PredIterTy; + + friend class RegionInfoBase; + RegionBase(const RegionBase &) LLVM_DELETED_FUNCTION; + const RegionBase &operator=(const RegionBase &) LLVM_DELETED_FUNCTION; // Information necessary to manage this Region. - RegionInfo* RI; - DominatorTree *DT; + RegionInfoT *RI; + DomTreeT *DT; // The exit BasicBlock of this region. // (The entry BasicBlock is part of RegionNode) - BasicBlock *exit; + BlockT *exit; - typedef std::vector> RegionSet; + typedef std::vector> RegionSet; // The subregions of this region. RegionSet children; - typedef std::map BBNodeMapT; + typedef std::map BBNodeMapT; // Save the BasicBlock RegionNodes that are element of this Region. mutable BBNodeMapT BBNodeMap; /// verifyBBInRegion - Check if a BB is in this Region. This check also works /// if the region is incorrectly built. (EXPENSIVE!) - void verifyBBInRegion(BasicBlock* BB) const; + void verifyBBInRegion(BlockT *BB) const; /// verifyWalk - Walk over all the BBs of the region starting from BB and /// verify that all reachable basic blocks are elements of the region. /// (EXPENSIVE!) - void verifyWalk(BasicBlock* BB, std::set* visitedBB) const; + void verifyWalk(BlockT *BB, std::set *visitedBB) const; /// verifyRegionNest - Verify if the region and its children are valid /// regions (EXPENSIVE!) @@ -246,27 +304,29 @@ class Region : public RegionNode { /// @param DT The dominator tree of the current function. /// @param Parent The surrounding region or NULL if this is a top level /// region. - Region(BasicBlock *Entry, BasicBlock *Exit, RegionInfo* RI, - DominatorTree *DT, Region *Parent = nullptr); + RegionBase(BlockT *Entry, BlockT *Exit, RegionInfoT *RI, DomTreeT *DT, + RegionT *Parent = nullptr); /// Delete the Region and all its subregions. - ~Region(); + ~RegionBase(); /// @brief Get the entry BasicBlock of the Region. /// @return The entry BasicBlock of the region. - BasicBlock *getEntry() const { return RegionNode::getEntry(); } + BlockT *getEntry() const { + return RegionNodeBase::getEntry(); + } /// @brief Replace the entry basic block of the region with the new basic /// block. /// /// @param BB The new entry basic block of the region. - void replaceEntry(BasicBlock *BB); + void replaceEntry(BlockT *BB); /// @brief Replace the exit basic block of the region with the new basic /// block. /// /// @param BB The new exit basic block of the region. - void replaceExit(BasicBlock *BB); + void replaceExit(BlockT *BB); /// @brief Recursively replace the entry basic block of the region. 
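// (Concretely, "recursively" here means the new entry is propagated down the
// region tree only into subregions whose entry is the block being replaced;
// subregions entered elsewhere are left untouched. The worklist-based
// implementation lands in RegionInfoImpl.h later in this patch.)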
/// @@ -275,7 +335,7 @@ class Region : public RegionNode { /// this region. /// /// @param NewEntry The new entry basic block. - void replaceEntryRecursive(BasicBlock *NewEntry); + void replaceEntryRecursive(BlockT *NewEntry); /// @brief Recursively replace the exit basic block of the region. /// @@ -284,22 +344,25 @@ class Region : public RegionNode { /// this region. /// /// @param NewExit The new exit basic block. - void replaceExitRecursive(BasicBlock *NewExit); + void replaceExitRecursive(BlockT *NewExit); /// @brief Get the exit BasicBlock of the Region. /// @return The exit BasicBlock of the Region, NULL if this is the TopLevel /// Region. - BasicBlock *getExit() const { return exit; } + BlockT *getExit() const { return exit; } /// @brief Get the parent of the Region. /// @return The parent of the Region or NULL if this is a top level /// Region. - Region *getParent() const { return RegionNode::getParent(); } + RegionT *getParent() const { + return RegionNodeBase::getParent(); + } /// @brief Get the RegionNode representing the current Region. /// @return The RegionNode representing the current Region. - RegionNode* getNode() const { - return const_cast(reinterpret_cast(this)); + RegionNodeT *getNode() const { + return const_cast( + reinterpret_cast(this)); } /// @brief Get the nesting level of this Region. @@ -320,21 +383,21 @@ class Region : public RegionNode { /// @return A region also starting at getEntry(), but reaching to the next /// basic block that forms with getEntry() a (non-canonical) region. /// NULL if such a basic block does not exist. - Region *getExpandedRegion() const; + RegionT *getExpandedRegion() const; /// @brief Return the first block of this region's single entry edge, /// if existing. /// /// @return The BasicBlock starting this region's single entry edge, /// else NULL. - BasicBlock *getEnteringBlock() const; + BlockT *getEnteringBlock() const; /// @brief Return the first block of this region's single exit edge, /// if existing. /// /// @return The BasicBlock starting this region's single exit edge, /// else NULL. - BasicBlock *getExitingBlock() const; + BlockT *getExitingBlock() const; /// @brief Is this a simple region? /// @@ -348,20 +411,18 @@ class Region : public RegionNode { std::string getNameStr() const; /// @brief Return the RegionInfo object, that belongs to this Region. - RegionInfo *getRegionInfo() const { - return RI; - } + RegionInfoT *getRegionInfo() const { return RI; } /// PrintStyle - Print region in difference ways. - enum PrintStyle { PrintNone, PrintBB, PrintRN }; + enum PrintStyle { PrintNone, PrintBB, PrintRN }; /// @brief Print the region. /// /// @param OS The output stream the Region is printed to. /// @param printTree Print also the tree of subregions. /// @param level The indentation level used for printing. - void print(raw_ostream& OS, bool printTree = true, unsigned level = 0, - enum PrintStyle Style = PrintNone) const; + void print(raw_ostream &OS, bool printTree = true, unsigned level = 0, + PrintStyle Style = PrintNone) const; /// @brief Print the region to stderr. void dump() const; @@ -370,28 +431,28 @@ class Region : public RegionNode { /// /// @param BB The BasicBlock that might be contained in this Region. /// @return True if the block is contained in the region otherwise false. - bool contains(const BasicBlock *BB) const; + bool contains(const BlockT *BB) const; /// @brief Check if the region contains another region. /// /// @param SubRegion The region that might be contained in this Region. 
/// @return True if SubRegion is contained in the region otherwise false. - bool contains(const Region *SubRegion) const { + bool contains(const RegionT *SubRegion) const { // Toplevel Region. if (!getExit()) return true; - return contains(SubRegion->getEntry()) - && (contains(SubRegion->getExit()) || SubRegion->getExit() == getExit()); + return contains(SubRegion->getEntry()) && + (contains(SubRegion->getExit()) || + SubRegion->getExit() == getExit()); } /// @brief Check if the region contains an Instruction. /// /// @param Inst The Instruction that might be contained in this region. - /// @return True if the Instruction is contained in the region otherwise false. - bool contains(const Instruction *Inst) const { - return contains(Inst->getParent()); - } + /// @return True if the Instruction is contained in the region otherwise + /// false. + bool contains(const InstT *Inst) const { return contains(Inst->getParent()); } /// @brief Check if the region contains a loop. /// @@ -400,7 +461,7 @@ class Region : public RegionNode { /// In case a NULL pointer is passed to this function the result /// is false, except for the region that describes the whole function. /// In that case true is returned. - bool contains(const Loop *L) const; + bool contains(const LoopT *L) const; /// @brief Get the outermost loop in the region that contains a loop. /// @@ -410,7 +471,7 @@ class Region : public RegionNode { /// @param L The loop the lookup is started. /// @return The outermost loop in the region, NULL if such a loop does not /// exist or if the region describes the whole function. - Loop *outermostLoopInRegion(Loop *L) const; + LoopT *outermostLoopInRegion(LoopT *L) const; /// @brief Get the outermost loop in the region that contains a basic block. /// @@ -421,13 +482,13 @@ class Region : public RegionNode { /// @param BB The basic block surrounded by the loop. /// @return The outermost loop in the region, NULL if such a loop does not /// exist or if the region describes the whole function. - Loop *outermostLoopInRegion(LoopInfo *LI, BasicBlock* BB) const; + LoopT *outermostLoopInRegion(LoopInfoT *LI, BlockT *BB) const; /// @brief Get the subregion that starts at a BasicBlock /// /// @param BB The BasicBlock the subregion should start. /// @return The Subregion if available, otherwise NULL. - Region* getSubRegionNode(BasicBlock *BB) const; + RegionT *getSubRegionNode(BlockT *BB) const; /// @brief Get the RegionNode for a BasicBlock /// @@ -435,32 +496,32 @@ class Region : public RegionNode { /// @return If available, the RegionNode that represents the subregion /// starting at BB. If no subregion starts at BB, the RegionNode /// representing BB. - RegionNode* getNode(BasicBlock *BB) const; + RegionNodeT *getNode(BlockT *BB) const; /// @brief Get the BasicBlock RegionNode for a BasicBlock /// /// @param BB The BasicBlock for which the RegionNode is requested. /// @return The RegionNode representing the BB. - RegionNode* getBBNode(BasicBlock *BB) const; + RegionNodeT *getBBNode(BlockT *BB) const; /// @brief Add a new subregion to this Region. /// /// @param SubRegion The new subregion that will be added. /// @param moveChildren Move the children of this region, that are also /// contained in SubRegion into SubRegion. - void addSubRegion(Region *SubRegion, bool moveChildren = false); + void addSubRegion(RegionT *SubRegion, bool moveChildren = false); /// @brief Remove a subregion from this Region. /// /// The subregion is not deleted, as it will probably be inserted into another /// region. 
/// @param SubRegion The SubRegion that will be removed. - Region *removeSubRegion(Region *SubRegion); + RegionT *removeSubRegion(RegionT *SubRegion); /// @brief Move all direct child nodes of this Region to another Region. /// /// @param To The Region the child nodes will be transferred to. - void transferChildrenTo(Region *To); + void transferChildrenTo(RegionT *To); /// @brief Verify if the region is a correct region. /// @@ -479,8 +540,8 @@ class Region : public RegionNode { /// /// These iterators iterator over all subregions of this Region. //@{ - typedef RegionSet::iterator iterator; - typedef RegionSet::const_iterator const_iterator; + typedef typename RegionSet::iterator iterator; + typedef typename RegionSet::const_iterator const_iterator; iterator begin() { return children.begin(); } iterator end() { return children.end(); } @@ -497,18 +558,18 @@ class Region : public RegionNode { //@{ template class block_iterator_wrapper - : public df_iterator::type *> { - typedef df_iterator::type *> super; + : public df_iterator< + typename std::conditional::type *> { + typedef df_iterator< + typename std::conditional::type *> super; public: typedef block_iterator_wrapper Self; typedef typename super::pointer pointer; // Construct the begin iterator. - block_iterator_wrapper(pointer Entry, pointer Exit) : super(df_begin(Entry)) - { + block_iterator_wrapper(pointer Entry, pointer Exit) + : super(df_begin(Entry)) { // Mark the exit of the region as visited, so that the children of the // exit and the exit itself, i.e. the block outside the region will never // be visited. @@ -516,35 +577,29 @@ class Region : public RegionNode { } // Construct the end iterator. - block_iterator_wrapper() : super(df_end((BasicBlock *)nullptr)) {} + block_iterator_wrapper() : super(df_end((BlockT *)nullptr)) {} /*implicit*/ block_iterator_wrapper(super I) : super(I) {} // FIXME: Even a const_iterator returns a non-const BasicBlock pointer. // This was introduced for backwards compatibility, but should // be removed as soon as all users are fixed. - BasicBlock *operator*() const { - return const_cast(super::operator*()); + BlockT *operator*() const { + return const_cast(super::operator*()); } }; typedef block_iterator_wrapper block_iterator; - typedef block_iterator_wrapper const_block_iterator; + typedef block_iterator_wrapper const_block_iterator; - block_iterator block_begin() { - return block_iterator(getEntry(), getExit()); - } + block_iterator block_begin() { return block_iterator(getEntry(), getExit()); } - block_iterator block_end() { - return block_iterator(); - } + block_iterator block_end() { return block_iterator(); } const_block_iterator block_begin() const { return const_block_iterator(getEntry(), getExit()); } - const_block_iterator block_end() const { - return const_block_iterator(); - } + const_block_iterator block_end() const { return const_block_iterator(); } typedef iterator_range block_range; typedef iterator_range const_block_range; @@ -568,12 +623,12 @@ class Region : public RegionNode { /// are direct children of this Region. It does not iterate over any /// RegionNodes that are also element of a subregion of this Region. 
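// Element iteration, unlike block iteration, stops at the first level: each
// step yields a RegionNode that is either a plain block of this region or a
// whole nested subregion, and the client dispatches on isSubRegion(). A
// minimal sketch; the visitor is an assumption for illustration:

#include "llvm/Analysis/RegionInfo.h"

static void visitDirectChildren(llvm::Region &R) {
  for (auto I = R.element_begin(), E = R.element_end(); I != E; ++I) {
    llvm::RegionNode *N = *I;
    if (N->isSubRegion())
      (void)N->getNodeAs<llvm::Region>();     // Nested region; not descended.
    else
      (void)N->getNodeAs<llvm::BasicBlock>(); // A block of this region only.
  }
}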
//@{ - typedef df_iterator, false, - GraphTraits > element_iterator; + typedef df_iterator, false, + GraphTraits> element_iterator; - typedef df_iterator, - false, GraphTraits > - const_element_iterator; + typedef df_iterator, + false, + GraphTraits> const_element_iterator; element_iterator element_begin(); element_iterator element_end(); @@ -583,132 +638,143 @@ class Region : public RegionNode { //@} }; +/// Print a RegionNode. +template +inline raw_ostream &operator<<(raw_ostream &OS, const RegionNodeBase &Node); + //===----------------------------------------------------------------------===// /// @brief Analysis that detects all canonical Regions. /// /// The RegionInfo pass detects all canonical regions in a function. The Regions /// are connected using the parent relation. This builds a Program Structure /// Tree. -class RegionInfo : public FunctionPass { - typedef DenseMap BBtoBBMap; - typedef DenseMap BBtoRegionMap; - typedef SmallPtrSet RegionSet; +template +class RegionInfoBase { + typedef typename Tr::BlockT BlockT; + typedef typename Tr::FuncT FuncT; + typedef typename Tr::RegionT RegionT; + typedef typename Tr::RegionInfoT RegionInfoT; + typedef typename Tr::DomTreeT DomTreeT; + typedef typename Tr::DomTreeNodeT DomTreeNodeT; + typedef typename Tr::PostDomTreeT PostDomTreeT; + typedef typename Tr::DomFrontierT DomFrontierT; + typedef GraphTraits BlockTraits; + typedef GraphTraits> InvBlockTraits; + typedef typename BlockTraits::ChildIteratorType SuccIterTy; + typedef typename InvBlockTraits::ChildIteratorType PredIterTy; + + friend class RegionInfo; + friend class MachineRegionInfo; + typedef DenseMap BBtoBBMap; + typedef DenseMap BBtoRegionMap; + typedef SmallPtrSet RegionSet; - RegionInfo(const RegionInfo &) LLVM_DELETED_FUNCTION; - const RegionInfo &operator=(const RegionInfo &) LLVM_DELETED_FUNCTION; + RegionInfoBase(); + virtual ~RegionInfoBase(); - DominatorTree *DT; - PostDominatorTree *PDT; - DominanceFrontier *DF; + RegionInfoBase(const RegionInfoBase &) LLVM_DELETED_FUNCTION; + const RegionInfoBase &operator=(const RegionInfoBase &) LLVM_DELETED_FUNCTION; + + DomTreeT *DT; + PostDomTreeT *PDT; + DomFrontierT *DF; /// The top level region. - Region *TopLevelRegion; + RegionT *TopLevelRegion; +private: /// Map every BB to the smallest region, that contains BB. BBtoRegionMap BBtoRegion; // isCommonDomFrontier - Returns true if BB is in the dominance frontier of // entry, because it was inherited from exit. In the other case there is an // edge going from entry to BB without passing exit. - bool isCommonDomFrontier(BasicBlock* BB, BasicBlock* entry, - BasicBlock* exit) const; + bool isCommonDomFrontier(BlockT *BB, BlockT *entry, BlockT *exit) const; // isRegion - Check if entry and exit surround a valid region, based on // dominance tree and dominance frontier. - bool isRegion(BasicBlock* entry, BasicBlock* exit) const; + bool isRegion(BlockT *entry, BlockT *exit) const; // insertShortCut - Saves a shortcut pointing from entry to exit. // This function may extend this shortcut if possible. - void insertShortCut(BasicBlock* entry, BasicBlock* exit, - BBtoBBMap* ShortCut) const; + void insertShortCut(BlockT *entry, BlockT *exit, BBtoBBMap *ShortCut) const; // getNextPostDom - Returns the next BB that postdominates N, while skipping // all post dominators that cannot finish a canonical region. 
- DomTreeNode *getNextPostDom(DomTreeNode* N, BBtoBBMap *ShortCut) const; + DomTreeNodeT *getNextPostDom(DomTreeNodeT *N, BBtoBBMap *ShortCut) const; // isTrivialRegion - A region is trivial, if it contains only one BB. - bool isTrivialRegion(BasicBlock *entry, BasicBlock *exit) const; + bool isTrivialRegion(BlockT *entry, BlockT *exit) const; // createRegion - Creates a single entry single exit region. - Region *createRegion(BasicBlock *entry, BasicBlock *exit); + RegionT *createRegion(BlockT *entry, BlockT *exit); // findRegionsWithEntry - Detect all regions starting with bb 'entry'. - void findRegionsWithEntry(BasicBlock *entry, BBtoBBMap *ShortCut); + void findRegionsWithEntry(BlockT *entry, BBtoBBMap *ShortCut); // scanForRegions - Detects regions in F. - void scanForRegions(Function &F, BBtoBBMap *ShortCut); + void scanForRegions(FuncT &F, BBtoBBMap *ShortCut); // getTopMostParent - Get the top most parent with the same entry block. - Region *getTopMostParent(Region *region); + RegionT *getTopMostParent(RegionT *region); // buildRegionsTree - build the region hierarchy after all region detected. - void buildRegionsTree(DomTreeNode *N, Region *region); - - // Calculate - detecte all regions in function and build the region tree. - void Calculate(Function& F); - - void releaseMemory() override; + void buildRegionsTree(DomTreeNodeT *N, RegionT *region); // updateStatistics - Update statistic about created regions. - void updateStatistics(Region *R); + virtual void updateStatistics(RegionT *R) = 0; - // isSimple - Check if a region is a simple region with exactly one entry - // edge and exactly one exit edge. - bool isSimple(Region* R) const; + // calculate - detect all regions in function and build the region tree. + void calculate(FuncT &F); public: - static char ID; - explicit RegionInfo(); + static bool VerifyRegionInfo; + static typename RegionT::PrintStyle printStyle; - ~RegionInfo(); + void print(raw_ostream &OS) const; + void dump() const; - /// @name FunctionPass interface - //@{ - bool runOnFunction(Function &F) override; - void getAnalysisUsage(AnalysisUsage &AU) const override; - void print(raw_ostream &OS, const Module *) const override; - void verifyAnalysis() const override; - //@} + void releaseMemory(); /// @brief Get the smallest region that contains a BasicBlock. /// /// @param BB The basic block. /// @return The smallest region, that contains BB or NULL, if there is no /// region containing BB. - Region *getRegionFor(BasicBlock *BB) const; + RegionT *getRegionFor(BlockT *BB) const; /// @brief Set the smallest region that surrounds a basic block. /// /// @param BB The basic block surrounded by a region. /// @param R The smallest region that surrounds BB. - void setRegionFor(BasicBlock *BB, Region *R); + void setRegionFor(BlockT *BB, RegionT *R); /// @brief A shortcut for getRegionFor(). /// /// @param BB The basic block. /// @return The smallest region, that contains BB or NULL, if there is no /// region containing BB. - Region *operator[](BasicBlock *BB) const; + RegionT *operator[](BlockT *BB) const; /// @brief Return the exit of the maximal refined region, that starts at a /// BasicBlock. /// /// @param BB The BasicBlock the refined region starts. - BasicBlock *getMaxRegionExit(BasicBlock *BB) const; + BlockT *getMaxRegionExit(BlockT *BB) const; /// @brief Find the smallest region that contains two regions. /// /// @param A The first region. /// @param B The second region. /// @return The smallest region containing A and B. 
- Region *getCommonRegion(Region* A, Region *B) const; + RegionT *getCommonRegion(RegionT *A, RegionT *B) const; /// @brief Find the smallest region that contains two basic blocks. /// /// @param A The first basic block. /// @param B The second basic block. /// @return The smallest region that contains A and B. - Region* getCommonRegion(BasicBlock* A, BasicBlock *B) const { + RegionT *getCommonRegion(BlockT *A, BlockT *B) const { return getCommonRegion(getRegionFor(A), getRegionFor(B)); } @@ -716,23 +782,21 @@ class RegionInfo : public FunctionPass { /// /// @param Regions A vector of regions. /// @return The smallest region that contains all regions in Regions. - Region* getCommonRegion(SmallVectorImpl &Regions) const; + RegionT *getCommonRegion(SmallVectorImpl &Regions) const; /// @brief Find the smallest region that contains a set of basic blocks. /// /// @param BBs A vector of basic blocks. /// @return The smallest region that contains all basic blocks in BBS. - Region* getCommonRegion(SmallVectorImpl &BBs) const; + RegionT *getCommonRegion(SmallVectorImpl &BBs) const; - Region *getTopLevelRegion() const { - return TopLevelRegion; - } + RegionT *getTopLevelRegion() const { return TopLevelRegion; } /// @brief Update RegionInfo after a basic block was split. /// /// @param NewBB The basic block that was created before OldBB. /// @param OldBB The old basic block. - void splitBlock(BasicBlock* NewBB, BasicBlock *OldBB); + void splitBlock(BlockT *NewBB, BlockT *OldBB); /// @brief Clear the Node Cache for all Regions. /// @@ -741,14 +805,104 @@ class RegionInfo : public FunctionPass { if (TopLevelRegion) TopLevelRegion->clearNodeCache(); } + + void verifyAnalysis() const; +}; + +class Region; + +class RegionNode : public RegionNodeBase> { +public: + inline RegionNode(Region *Parent, BasicBlock *Entry, bool isSubRegion = false) + : RegionNodeBase>(Parent, Entry, isSubRegion) {} + + ~RegionNode() {} + + bool operator==(const Region &RN) const { + return this == reinterpret_cast(&RN); + } +}; + +class Region : public RegionBase> { +public: + Region(BasicBlock *Entry, BasicBlock *Exit, RegionInfo *RI, DominatorTree *DT, + Region *Parent = nullptr); + ~Region(); + + bool operator==(const RegionNode &RN) const { + return &RN == reinterpret_cast(this); + } +}; + +class RegionInfo : public RegionInfoBase> { +public: + explicit RegionInfo(); + + virtual ~RegionInfo(); + + // updateStatistics - Update statistic about created regions. 
+ void updateStatistics(Region *R) final; + + void recalculate(Function &F, DominatorTree *DT, PostDominatorTree *PDT, + DominanceFrontier *DF); +}; + +class RegionInfoPass : public FunctionPass { + RegionInfo RI; + +public: + static char ID; + explicit RegionInfoPass(); + + ~RegionInfoPass(); + + RegionInfo &getRegionInfo() { return RI; } + + const RegionInfo &getRegionInfo() const { return RI; } + + /// @name FunctionPass interface + //@{ + bool runOnFunction(Function &F) override; + void releaseMemory() override; + void verifyAnalysis() const override; + void getAnalysisUsage(AnalysisUsage &AU) const override; + void print(raw_ostream &OS, const Module *) const override; + void dump() const; + //@} }; -inline raw_ostream &operator<<(raw_ostream &OS, const RegionNode &Node) { +template <> +template <> +inline BasicBlock * +RegionNodeBase>::getNodeAs() const { + assert(!isSubRegion() && "This is not a BasicBlock RegionNode!"); + return getEntry(); +} + +template <> +template <> +inline Region * +RegionNodeBase>::getNodeAs() const { + assert(isSubRegion() && "This is not a subregion RegionNode!"); + auto Unconst = const_cast> *>(this); + return reinterpret_cast(Unconst); +} + +template +inline raw_ostream &operator<<(raw_ostream &OS, + const RegionNodeBase &Node) { + typedef typename Tr::BlockT BlockT; + typedef typename Tr::RegionT RegionT; + if (Node.isSubRegion()) - return OS << Node.getNodeAs()->getNameStr(); + return OS << Node.template getNodeAs()->getNameStr(); else - return OS << Node.getNodeAs()->getName(); + return OS << Node.template getNodeAs()->getName(); } + +EXTERN_TEMPLATE_INSTANTIATION(class RegionBase>); +EXTERN_TEMPLATE_INSTANTIATION(class RegionNodeBase>); +EXTERN_TEMPLATE_INSTANTIATION(class RegionInfoBase>); + } // End llvm namespace #endif - diff --git a/include/llvm/Analysis/RegionInfoImpl.h b/include/llvm/Analysis/RegionInfoImpl.h new file mode 100644 index 000000000000..4266b84c32c4 --- /dev/null +++ b/include/llvm/Analysis/RegionInfoImpl.h @@ -0,0 +1,919 @@ +//===- RegionInfoImpl.h - SESE region detection analysis --------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// Detects single entry single exit regions in the control flow graph. +//===----------------------------------------------------------------------===// + +#ifndef LLVM_ANALYSIS_REGIONINFOIMPL_H +#define LLVM_ANALYSIS_REGIONINFOIMPL_H + +#include "llvm/Analysis/RegionInfo.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/Analysis/DominanceFrontier.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/PostDominators.h" +#include "llvm/Analysis/RegionIterator.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include +#include +#include + +using namespace llvm; + +#define DEBUG_TYPE "region" + +//===----------------------------------------------------------------------===// +/// RegionBase Implementation +template +RegionBase::RegionBase(BlockT *Entry, BlockT *Exit, + typename Tr::RegionInfoT *RInfo, DomTreeT *dt, + RegionT *Parent) + : RegionNodeBase(Parent, Entry, 1), RI(RInfo), DT(dt), exit(Exit) {} + +template +RegionBase::~RegionBase() { + // Free the cached nodes. 
+ for (typename BBNodeMapT::iterator it = BBNodeMap.begin(), + ie = BBNodeMap.end(); + it != ie; ++it) + delete it->second; + + // Only clean the cache for this Region. Caches of child Regions will be + // cleaned when the child Regions are deleted. + BBNodeMap.clear(); +} + +template +void RegionBase::replaceEntry(BlockT *BB) { + this->entry.setPointer(BB); +} + +template +void RegionBase::replaceExit(BlockT *BB) { + assert(exit && "No exit to replace!"); + exit = BB; +} + +template +void RegionBase::replaceEntryRecursive(BlockT *NewEntry) { + std::vector RegionQueue; + BlockT *OldEntry = getEntry(); + + RegionQueue.push_back(static_cast(this)); + while (!RegionQueue.empty()) { + RegionT *R = RegionQueue.back(); + RegionQueue.pop_back(); + + R->replaceEntry(NewEntry); + for (typename RegionT::const_iterator RI = R->begin(), RE = R->end(); + RI != RE; ++RI) { + if ((*RI)->getEntry() == OldEntry) + RegionQueue.push_back(RI->get()); + } + } +} + +template +void RegionBase::replaceExitRecursive(BlockT *NewExit) { + std::vector RegionQueue; + BlockT *OldExit = getExit(); + + RegionQueue.push_back(static_cast(this)); + while (!RegionQueue.empty()) { + RegionT *R = RegionQueue.back(); + RegionQueue.pop_back(); + + R->replaceExit(NewExit); + for (typename RegionT::const_iterator RI = R->begin(), RE = R->end(); + RI != RE; ++RI) { + if ((*RI)->getExit() == OldExit) + RegionQueue.push_back(RI->get()); + } + } +} + +template +bool RegionBase::contains(const BlockT *B) const { + BlockT *BB = const_cast(B); + + if (!DT->getNode(BB)) + return false; + + BlockT *entry = getEntry(), *exit = getExit(); + + // Toplevel region. + if (!exit) + return true; + + return (DT->dominates(entry, BB) && + !(DT->dominates(exit, BB) && DT->dominates(entry, exit))); +} + +template +bool RegionBase::contains(const LoopT *L) const { + // BBs that are not part of any loop are element of the Loop + // described by the NULL pointer. This loop is not part of any region, + // except if the region describes the whole function. 
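+  // (Only the top-level region has a null exit block, so the test below
+  // matches exactly the "region describes the whole function" case.)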
+ if (!L) + return getExit() == nullptr; + + if (!contains(L->getHeader())) + return false; + + SmallVector ExitingBlocks; + L->getExitingBlocks(ExitingBlocks); + + for (BlockT *BB : ExitingBlocks) { + if (!contains(BB)) + return false; + } + + return true; +} + +template +typename Tr::LoopT *RegionBase::outermostLoopInRegion(LoopT *L) const { + if (!contains(L)) + return nullptr; + + while (L && contains(L->getParentLoop())) { + L = L->getParentLoop(); + } + + return L; +} + +template +typename Tr::LoopT *RegionBase::outermostLoopInRegion(LoopInfoT *LI, + BlockT *BB) const { + assert(LI && BB && "LI and BB cannot be null!"); + LoopT *L = LI->getLoopFor(BB); + return outermostLoopInRegion(L); +} + +template +typename RegionBase::BlockT *RegionBase::getEnteringBlock() const { + BlockT *entry = getEntry(); + BlockT *Pred; + BlockT *enteringBlock = nullptr; + + for (PredIterTy PI = InvBlockTraits::child_begin(entry), + PE = InvBlockTraits::child_end(entry); + PI != PE; ++PI) { + Pred = *PI; + if (DT->getNode(Pred) && !contains(Pred)) { + if (enteringBlock) + return nullptr; + + enteringBlock = Pred; + } + } + + return enteringBlock; +} + +template +typename RegionBase::BlockT *RegionBase::getExitingBlock() const { + BlockT *exit = getExit(); + BlockT *Pred; + BlockT *exitingBlock = nullptr; + + if (!exit) + return nullptr; + + for (PredIterTy PI = InvBlockTraits::child_begin(exit), + PE = InvBlockTraits::child_end(exit); + PI != PE; ++PI) { + Pred = *PI; + if (contains(Pred)) { + if (exitingBlock) + return nullptr; + + exitingBlock = Pred; + } + } + + return exitingBlock; +} + +template +bool RegionBase::isSimple() const { + return !isTopLevelRegion() && getEnteringBlock() && getExitingBlock(); +} + +template +std::string RegionBase::getNameStr() const { + std::string exitName; + std::string entryName; + + if (getEntry()->getName().empty()) { + raw_string_ostream OS(entryName); + + getEntry()->printAsOperand(OS, false); + } else + entryName = getEntry()->getName(); + + if (getExit()) { + if (getExit()->getName().empty()) { + raw_string_ostream OS(exitName); + + getExit()->printAsOperand(OS, false); + } else + exitName = getExit()->getName(); + } else + exitName = ""; + + return entryName + " => " + exitName; +} + +template +void RegionBase::verifyBBInRegion(BlockT *BB) const { + if (!contains(BB)) + llvm_unreachable("Broken region found!"); + + BlockT *entry = getEntry(), *exit = getExit(); + + for (SuccIterTy SI = BlockTraits::child_begin(BB), + SE = BlockTraits::child_end(BB); + SI != SE; ++SI) { + if (!contains(*SI) && exit != *SI) + llvm_unreachable("Broken region found!"); + } + + if (entry != BB) { + for (PredIterTy SI = InvBlockTraits::child_begin(BB), + SE = InvBlockTraits::child_end(BB); + SI != SE; ++SI) { + if (!contains(*SI)) + llvm_unreachable("Broken region found!"); + } + } +} + +template +void RegionBase::verifyWalk(BlockT *BB, std::set *visited) const { + BlockT *exit = getExit(); + + visited->insert(BB); + + verifyBBInRegion(BB); + + for (SuccIterTy SI = BlockTraits::child_begin(BB), + SE = BlockTraits::child_end(BB); + SI != SE; ++SI) { + if (*SI != exit && visited->find(*SI) == visited->end()) + verifyWalk(*SI, visited); + } +} + +template +void RegionBase::verifyRegion() const { + // Only do verification when user wants to, otherwise this expensive check + // will be invoked by PMDataManager::verifyPreservedAnalysis when + // a regionpass (marked PreservedAll) finish. 
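+  // (VerifyRegionInfo defaults to true only in XDEBUG builds; see its
+  // static definition further down in this file.)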
+ if (!RegionInfoBase::VerifyRegionInfo) + return; + + std::set visited; + verifyWalk(getEntry(), &visited); +} + +template +void RegionBase::verifyRegionNest() const { + for (typename RegionT::const_iterator RI = begin(), RE = end(); RI != RE; + ++RI) + (*RI)->verifyRegionNest(); + + verifyRegion(); +} + +template +typename RegionBase::element_iterator RegionBase::element_begin() { + return GraphTraits::nodes_begin(static_cast(this)); +} + +template +typename RegionBase::element_iterator RegionBase::element_end() { + return GraphTraits::nodes_end(static_cast(this)); +} + +template +typename RegionBase::const_element_iterator +RegionBase::element_begin() const { + return GraphTraits::nodes_begin( + static_cast(this)); +} + +template +typename RegionBase::const_element_iterator +RegionBase::element_end() const { + return GraphTraits::nodes_end( + static_cast(this)); +} + +template +typename Tr::RegionT *RegionBase::getSubRegionNode(BlockT *BB) const { + typedef typename Tr::RegionT RegionT; + RegionT *R = RI->getRegionFor(BB); + + if (!R || R == this) + return nullptr; + + // If we pass the BB out of this region, that means our code is broken. + assert(contains(R) && "BB not in current region!"); + + while (contains(R->getParent()) && R->getParent() != this) + R = R->getParent(); + + if (R->getEntry() != BB) + return nullptr; + + return R; +} + +template +typename Tr::RegionNodeT *RegionBase::getBBNode(BlockT *BB) const { + assert(contains(BB) && "Can get BB node out of this region!"); + + typename BBNodeMapT::const_iterator at = BBNodeMap.find(BB); + + if (at != BBNodeMap.end()) + return at->second; + + auto Deconst = const_cast *>(this); + RegionNodeT *NewNode = new RegionNodeT(static_cast(Deconst), BB); + BBNodeMap.insert(std::make_pair(BB, NewNode)); + return NewNode; +} + +template +typename Tr::RegionNodeT *RegionBase::getNode(BlockT *BB) const { + assert(contains(BB) && "Can get BB node out of this region!"); + if (RegionT *Child = getSubRegionNode(BB)) + return Child->getNode(); + + return getBBNode(BB); +} + +template +void RegionBase::transferChildrenTo(RegionT *To) { + for (iterator I = begin(), E = end(); I != E; ++I) { + (*I)->parent = To; + To->children.push_back(std::move(*I)); + } + children.clear(); +} + +template +void RegionBase::addSubRegion(RegionT *SubRegion, bool moveChildren) { + assert(!SubRegion->parent && "SubRegion already has a parent!"); + assert(std::find_if(begin(), end(), [&](const std::unique_ptr &R) { + return R.get() == SubRegion; + }) == children.end() && + "Subregion already exists!"); + + SubRegion->parent = static_cast(this); + children.push_back(std::unique_ptr(SubRegion)); + + if (!moveChildren) + return; + + assert(SubRegion->children.empty() && + "SubRegions that contain children are not supported"); + + for (element_iterator I = element_begin(), E = element_end(); I != E; ++I) { + if (!(*I)->isSubRegion()) { + BlockT *BB = (*I)->template getNodeAs(); + + if (SubRegion->contains(BB)) + RI->setRegionFor(BB, SubRegion); + } + } + + std::vector> Keep; + for (iterator I = begin(), E = end(); I != E; ++I) { + if (SubRegion->contains(I->get()) && I->get() != SubRegion) { + (*I)->parent = SubRegion; + SubRegion->children.push_back(std::move(*I)); + } else + Keep.push_back(std::move(*I)); + } + + children.clear(); + children.insert( + children.begin(), + std::move_iterator(Keep.begin()), + std::move_iterator(Keep.end())); +} + +template +typename Tr::RegionT *RegionBase::removeSubRegion(RegionT *Child) { + assert(Child->parent == this && "Child is not a 
child of this region!");
+  Child->parent = nullptr;
+  typename RegionSet::iterator I = std::find_if(
+      children.begin(), children.end(),
+      [&](const std::unique_ptr<RegionT> &R) { return R.get() == Child; });
+  assert(I != children.end() && "Region does not exist. Unable to remove.");
+  children.erase(children.begin() + (I - begin()));
+  return Child;
+}
+
+template <class Tr>
+unsigned RegionBase<Tr>::getDepth() const {
+  unsigned Depth = 0;
+
+  for (RegionT *R = getParent(); R != nullptr; R = R->getParent())
+    ++Depth;
+
+  return Depth;
+}
+
+template <class Tr>
+typename Tr::RegionT *RegionBase<Tr>::getExpandedRegion() const {
+  unsigned NumSuccessors = Tr::getNumSuccessors(exit);
+
+  if (NumSuccessors == 0)
+    return nullptr;
+
+  for (PredIterTy PI = InvBlockTraits::child_begin(getExit()),
+                  PE = InvBlockTraits::child_end(getExit());
+       PI != PE; ++PI) {
+    if (!DT->dominates(getEntry(), *PI))
+      return nullptr;
+  }
+
+  RegionT *R = RI->getRegionFor(exit);
+
+  if (R->getEntry() != exit) {
+    if (Tr::getNumSuccessors(exit) == 1)
+      return new RegionT(getEntry(), *BlockTraits::child_begin(exit), RI, DT);
+    return nullptr;
+  }
+
+  while (R->getParent() && R->getParent()->getEntry() == exit)
+    R = R->getParent();
+
+  if (!DT->dominates(getEntry(), R->getExit())) {
+    for (PredIterTy PI = InvBlockTraits::child_begin(getExit()),
+                    PE = InvBlockTraits::child_end(getExit());
+         PI != PE; ++PI) {
+      if (!DT->dominates(R->getExit(), *PI))
+        return nullptr;
+    }
+  }
+
+  return new RegionT(getEntry(), R->getExit(), RI, DT);
+}
+
+template <class Tr>
+void RegionBase<Tr>::print(raw_ostream &OS, bool print_tree, unsigned level,
+                           PrintStyle Style) const {
+  if (print_tree)
+    OS.indent(level * 2) << '[' << level << "] " << getNameStr();
+  else
+    OS.indent(level * 2) << getNameStr();
+
+  OS << '\n';
+
+  if (Style != PrintNone) {
+    OS.indent(level * 2) << "{\n";
+    OS.indent(level * 2 + 2);
+
+    if (Style == PrintBB) {
+      for (const auto &BB : blocks())
+        OS << BB->getName() << ", "; // TODO: remove the last ","
+    } else if (Style == PrintRN) {
+      for (const_element_iterator I = element_begin(), E = element_end();
+           I != E; ++I) {
+        OS << **I << ", "; // TODO: remove the last ","
+      }
+    }
+
+    OS << '\n';
+  }
+
+  if (print_tree) {
+    for (const_iterator RI = begin(), RE = end(); RI != RE; ++RI)
+      (*RI)->print(OS, print_tree, level + 1, Style);
+  }
+
+  if (Style != PrintNone)
+    OS.indent(level * 2) << "} \n";
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+template <class Tr>
+void RegionBase<Tr>::dump() const {
+  print(dbgs(), true, getDepth(), RegionInfoBase<Tr>::printStyle);
+}
+#endif
+
+template <class Tr>
+void RegionBase<Tr>::clearNodeCache() {
+  // Free the cached nodes.
+ for (typename BBNodeMapT::iterator I = BBNodeMap.begin(), + IE = BBNodeMap.end(); + I != IE; ++I) + delete I->second; + + BBNodeMap.clear(); + for (typename RegionT::iterator RI = begin(), RE = end(); RI != RE; ++RI) + (*RI)->clearNodeCache(); +} + +//===----------------------------------------------------------------------===// +// RegionInfoBase implementation +// + +template +RegionInfoBase::RegionInfoBase() + : TopLevelRegion(nullptr) {} + +template +RegionInfoBase::~RegionInfoBase() { + releaseMemory(); +} + +template +bool RegionInfoBase::isCommonDomFrontier(BlockT *BB, BlockT *entry, + BlockT *exit) const { + for (PredIterTy PI = InvBlockTraits::child_begin(BB), + PE = InvBlockTraits::child_end(BB); + PI != PE; ++PI) { + BlockT *P = *PI; + if (DT->dominates(entry, P) && !DT->dominates(exit, P)) + return false; + } + + return true; +} + +template +bool RegionInfoBase::isRegion(BlockT *entry, BlockT *exit) const { + assert(entry && exit && "entry and exit must not be null!"); + typedef typename DomFrontierT::DomSetType DST; + + DST *entrySuccs = &DF->find(entry)->second; + + // Exit is the header of a loop that contains the entry. In this case, + // the dominance frontier must only contain the exit. + if (!DT->dominates(entry, exit)) { + for (typename DST::iterator SI = entrySuccs->begin(), + SE = entrySuccs->end(); + SI != SE; ++SI) { + if (*SI != exit && *SI != entry) + return false; + } + + return true; + } + + DST *exitSuccs = &DF->find(exit)->second; + + // Do not allow edges leaving the region. + for (typename DST::iterator SI = entrySuccs->begin(), SE = entrySuccs->end(); + SI != SE; ++SI) { + if (*SI == exit || *SI == entry) + continue; + if (exitSuccs->find(*SI) == exitSuccs->end()) + return false; + if (!isCommonDomFrontier(*SI, entry, exit)) + return false; + } + + // Do not allow edges pointing into the region. + for (typename DST::iterator SI = exitSuccs->begin(), SE = exitSuccs->end(); + SI != SE; ++SI) { + if (DT->properlyDominates(entry, *SI) && *SI != exit) + return false; + } + + return true; +} + +template +void RegionInfoBase::insertShortCut(BlockT *entry, BlockT *exit, + BBtoBBMap *ShortCut) const { + assert(entry && exit && "entry and exit must not be null!"); + + typename BBtoBBMap::iterator e = ShortCut->find(exit); + + if (e == ShortCut->end()) + // No further region at exit available. + (*ShortCut)[entry] = exit; + else { + // We found a region e that starts at exit. Therefore (entry, e->second) + // is also a region, that is larger than (entry, exit). Insert the + // larger one. 
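+    // Example: if ShortCut[B] == C is already recorded and we now insert
+    // (A, B), we store ShortCut[A] = C, so later walks jump from A straight
+    // to C.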
+ BlockT *BB = e->second; + (*ShortCut)[entry] = BB; + } +} + +template +typename Tr::DomTreeNodeT * +RegionInfoBase::getNextPostDom(DomTreeNodeT *N, BBtoBBMap *ShortCut) const { + typename BBtoBBMap::iterator e = ShortCut->find(N->getBlock()); + + if (e == ShortCut->end()) + return N->getIDom(); + + return PDT->getNode(e->second)->getIDom(); +} + +template +bool RegionInfoBase::isTrivialRegion(BlockT *entry, BlockT *exit) const { + assert(entry && exit && "entry and exit must not be null!"); + + unsigned num_successors = + BlockTraits::child_end(entry) - BlockTraits::child_begin(entry); + + if (num_successors <= 1 && exit == *(BlockTraits::child_begin(entry))) + return true; + + return false; +} + +template +typename Tr::RegionT *RegionInfoBase::createRegion(BlockT *entry, + BlockT *exit) { + assert(entry && exit && "entry and exit must not be null!"); + + if (isTrivialRegion(entry, exit)) + return nullptr; + + RegionT *region = + new RegionT(entry, exit, static_cast(this), DT); + BBtoRegion.insert(std::make_pair(entry, region)); + +#ifdef XDEBUG + region->verifyRegion(); +#else + DEBUG(region->verifyRegion()); +#endif + + updateStatistics(region); + return region; +} + +template +void RegionInfoBase::findRegionsWithEntry(BlockT *entry, + BBtoBBMap *ShortCut) { + assert(entry); + + DomTreeNodeT *N = PDT->getNode(entry); + if (!N) + return; + + RegionT *lastRegion = nullptr; + BlockT *lastExit = entry; + + // As only a BasicBlock that postdominates entry can finish a region, walk the + // post dominance tree upwards. + while ((N = getNextPostDom(N, ShortCut))) { + BlockT *exit = N->getBlock(); + + if (!exit) + break; + + if (isRegion(entry, exit)) { + RegionT *newRegion = createRegion(entry, exit); + + if (lastRegion) + newRegion->addSubRegion(lastRegion); + + lastRegion = newRegion; + lastExit = exit; + } + + // This can never be a region, so stop the search. + if (!DT->dominates(entry, exit)) + break; + } + + // Tried to create regions from entry to lastExit. Next time take a + // shortcut from entry to lastExit. + if (lastExit != entry) + insertShortCut(entry, lastExit, ShortCut); +} + +template +void RegionInfoBase::scanForRegions(FuncT &F, BBtoBBMap *ShortCut) { + typedef typename std::add_pointer::type FuncPtrT; + BlockT *entry = GraphTraits::getEntryNode(&F); + DomTreeNodeT *N = DT->getNode(entry); + + // Iterate over the dominance tree in post order to start with the small + // regions from the bottom of the dominance tree. If the small regions are + // detected first, detection of bigger regions is faster, as we can jump + // over the small regions. + for (po_iterator FI = po_begin(N), FE = po_end(N); FI != FE; + ++FI) { + findRegionsWithEntry(FI->getBlock(), ShortCut); + } +} + +template +typename Tr::RegionT *RegionInfoBase::getTopMostParent(RegionT *region) { + while (region->getParent()) + region = region->getParent(); + + return region; +} + +template +void RegionInfoBase::buildRegionsTree(DomTreeNodeT *N, RegionT *region) { + BlockT *BB = N->getBlock(); + + // Passed region exit + while (BB == region->getExit()) + region = region->getParent(); + + typename BBtoRegionMap::iterator it = BBtoRegion.find(BB); + + // This basic block is a start block of a region. It is already in the + // BBtoRegion relation. Only the child basic blocks have to be updated. 
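+  // (The lookup below finds that region; its topmost parent is hung off the
+  // current region, and the recursion continues inside the found region.)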
+ if (it != BBtoRegion.end()) { + RegionT *newRegion = it->second; + region->addSubRegion(getTopMostParent(newRegion)); + region = newRegion; + } else { + BBtoRegion[BB] = region; + } + + for (typename DomTreeNodeT::iterator CI = N->begin(), CE = N->end(); CI != CE; + ++CI) { + buildRegionsTree(*CI, region); + } +} + +#ifdef XDEBUG +template +bool RegionInfoBase::VerifyRegionInfo = true; +#else +template +bool RegionInfoBase::VerifyRegionInfo = false; +#endif + +template +typename Tr::RegionT::PrintStyle RegionInfoBase::printStyle = + RegionBase::PrintNone; + +template +void RegionInfoBase::print(raw_ostream &OS) const { + OS << "Region tree:\n"; + TopLevelRegion->print(OS, true, 0, printStyle); + OS << "End region tree\n"; +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +template +void RegionInfoBase::dump() const { print(dbgs()); } +#endif + +template +void RegionInfoBase::releaseMemory() { + BBtoRegion.clear(); + if (TopLevelRegion) + delete TopLevelRegion; + TopLevelRegion = nullptr; +} + +template +void RegionInfoBase::verifyAnalysis() const { + TopLevelRegion->verifyRegionNest(); +} + +// Region pass manager support. +template +typename Tr::RegionT *RegionInfoBase::getRegionFor(BlockT *BB) const { + typename BBtoRegionMap::const_iterator I = BBtoRegion.find(BB); + return I != BBtoRegion.end() ? I->second : nullptr; +} + +template +void RegionInfoBase::setRegionFor(BlockT *BB, RegionT *R) { + BBtoRegion[BB] = R; +} + +template +typename Tr::RegionT *RegionInfoBase::operator[](BlockT *BB) const { + return getRegionFor(BB); +} + +template +typename RegionInfoBase::BlockT * +RegionInfoBase::getMaxRegionExit(BlockT *BB) const { + BlockT *Exit = nullptr; + + while (true) { + // Get largest region that starts at BB. + RegionT *R = getRegionFor(BB); + while (R && R->getParent() && R->getParent()->getEntry() == BB) + R = R->getParent(); + + // Get the single exit of BB. + if (R && R->getEntry() == BB) + Exit = R->getExit(); + else if (++BlockTraits::child_begin(BB) == BlockTraits::child_end(BB)) + Exit = *BlockTraits::child_begin(BB); + else // No single exit exists. + return Exit; + + // Get largest region that starts at Exit. + RegionT *ExitR = getRegionFor(Exit); + while (ExitR && ExitR->getParent() && + ExitR->getParent()->getEntry() == Exit) + ExitR = ExitR->getParent(); + + for (PredIterTy PI = InvBlockTraits::child_begin(Exit), + PE = InvBlockTraits::child_end(Exit); + PI != PE; ++PI) { + if (!R->contains(*PI) && !ExitR->contains(*PI)) + break; + } + + // This stops infinite cycles. 
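+    // (Exit dominating BB means we have come back around a cycle in the CFG;
+    // extending the chain any further would never terminate.)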
+ if (DT->dominates(Exit, BB)) + break; + + BB = Exit; + } + + return Exit; +} + +template +typename Tr::RegionT *RegionInfoBase::getCommonRegion(RegionT *A, + RegionT *B) const { + assert(A && B && "One of the Regions is NULL"); + + if (A->contains(B)) + return A; + + while (!B->contains(A)) + B = B->getParent(); + + return B; +} + +template +typename Tr::RegionT * +RegionInfoBase::getCommonRegion(SmallVectorImpl &Regions) const { + RegionT *ret = Regions.back(); + Regions.pop_back(); + + for (RegionT *R : Regions) + ret = getCommonRegion(ret, R); + + return ret; +} + +template +typename Tr::RegionT * +RegionInfoBase::getCommonRegion(SmallVectorImpl &BBs) const { + RegionT *ret = getRegionFor(BBs.back()); + BBs.pop_back(); + + for (BlockT *BB : BBs) + ret = getCommonRegion(ret, getRegionFor(BB)); + + return ret; +} + +template +void RegionInfoBase::splitBlock(BlockT *NewBB, BlockT *OldBB) { + RegionT *R = getRegionFor(OldBB); + + setRegionFor(NewBB, R); + + while (R->getEntry() == OldBB && !R->isTopLevelRegion()) { + R->replaceEntry(NewBB); + R = R->getParent(); + } + + setRegionFor(OldBB, R); +} + +template +void RegionInfoBase::calculate(FuncT &F) { + typedef typename std::add_pointer::type FuncPtrT; + + // ShortCut a function where for every BB the exit of the largest region + // starting with BB is stored. These regions can be threated as single BBS. + // This improves performance on linear CFGs. + BBtoBBMap ShortCut; + + scanForRegions(F, &ShortCut); + BlockT *BB = GraphTraits::getEntryNode(&F); + buildRegionsTree(DT->getNode(BB), TopLevelRegion); +} + +#endif diff --git a/include/llvm/Analysis/RegionIterator.h b/include/llvm/Analysis/RegionIterator.h index ab4d0e0fcc01..0daff58475dd 100644 --- a/include/llvm/Analysis/RegionIterator.h +++ b/include/llvm/Analysis/RegionIterator.h @@ -30,13 +30,16 @@ namespace llvm { /// /// For a subregion RegionNode there is just one successor. The RegionNode /// representing the exit of the subregion. -template +template class RNSuccIterator : public std::iterator -{ + NodeType, ptrdiff_t> { typedef std::iterator super; + + typedef GraphTraits BlockTraits; + typedef typename BlockTraits::ChildIteratorType SuccIterTy; + // The iterator works in two modes, bb mode or region mode. - enum ItMode{ + enum ItMode { // In BB mode it returns all successors of this BasicBlock as its // successors. ItBB, @@ -47,10 +50,10 @@ class RNSuccIterator : public std::iterator Node; + PointerIntPair Node; // The block successor iterator. - succ_iterator BItor; + SuccIterTy BItor; // advanceRegionSucc - A region node has only one successor. It reaches end // once we advance it. @@ -66,37 +69,36 @@ class RNSuccIterator : public std::iteratorgetParent()->getNode(BB); assert(succ && "BB not in Region or entered subregion!"); return succ; } // getRegionSucc - Return the successor basic block of a SubRegion RegionNode. - inline BasicBlock* getRegionSucc() const { + inline BlockT* getRegionSucc() const { assert(Node.getInt() == ItRgBegin && "Cannot get the region successor!"); - return getNode()->template getNodeAs()->getExit(); + return getNode()->template getNodeAs()->getExit(); } // isExit - Is this the exit BB of the Region? - inline bool isExit(BasicBlock* BB) const { + inline bool isExit(BlockT* BB) const { return getNode()->getParent()->getExit() == BB; } public: - typedef RNSuccIterator Self; + typedef RNSuccIterator Self; typedef typename super::pointer pointer; /// @brief Create begin iterator of a RegionNode. 
inline RNSuccIterator(NodeType* node) : Node(node, node->isSubRegion() ? ItRgBegin : ItBB), - BItor(succ_begin(node->getEntry())) { - + BItor(BlockTraits::child_begin(node->getEntry())) { // Skip the exit block if (!isRegionMode()) - while (succ_end(node->getEntry()) != BItor && isExit(*BItor)) + while (BlockTraits::child_end(node->getEntry()) != BItor && isExit(*BItor)) ++BItor; if (isRegionMode() && isExit(getRegionSucc())) @@ -106,7 +108,7 @@ class RNSuccIterator : public std::iteratorisSubRegion() ? ItRgEnd : ItBB), - BItor(succ_end(node->getEntry())) {} + BItor(BlockTraits::child_end(node->getEntry())) {} inline bool operator==(const Self& x) const { assert(isRegionMode() == x.isRegionMode() && "Broken iterator!"); @@ -119,7 +121,7 @@ class RNSuccIterator : public std::iteratorgetEntry()) + while (BItor != BlockTraits::child_end(getNode()->getEntry()) && isExit(*BItor)); } return *this; @@ -162,36 +164,41 @@ class RNSuccIterator : public std::iterator -class RNSuccIterator > - : public std::iterator -{ +template +class RNSuccIterator, BlockT, RegionT> + : public std::iterator { typedef std::iterator super; + typedef GraphTraits BlockTraits; + typedef typename BlockTraits::ChildIteratorType SuccIterTy; + NodeType* Node; - succ_iterator Itor; + SuccIterTy Itor; public: - typedef RNSuccIterator > Self; + typedef RNSuccIterator, BlockT, RegionT> Self; typedef typename super::pointer pointer; /// @brief Create the iterator from a RegionNode. /// /// Note that the incoming node must be a bb node, otherwise it will trigger /// an assertion when we try to get a BasicBlock. - inline RNSuccIterator(NodeType* node) : Node(node), - Itor(succ_begin(node->getEntry())) { + inline RNSuccIterator(NodeType* node) : + Node(node), + Itor(BlockTraits::child_begin(node->getEntry())) { assert(!Node->isSubRegion() && "Subregion node not allowed in flat iterating mode!"); assert(Node->getParent() && "A BB node must have a parent!"); // Skip the exit block of the iterating region. - while (succ_end(Node->getEntry()) != Itor + while (BlockTraits::child_end(Node->getEntry()) != Itor && Node->getParent()->getExit() == *Itor) ++Itor; } + /// @brief Create an end iterator - inline RNSuccIterator(NodeType* node, bool) : Node(node), - Itor(succ_end(node->getEntry())) { + inline RNSuccIterator(NodeType* node, bool) : + Node(node), + Itor(BlockTraits::child_end(node->getEntry())) { assert(!Node->isSubRegion() && "Subregion node not allowed in flat iterating mode!"); } @@ -206,10 +213,10 @@ class RNSuccIterator > inline bool operator!=(const Self& x) const { return !operator==(x); } inline pointer operator*() const { - BasicBlock* BB = *Itor; + BlockT *BB = *Itor; // Get the iterating region. - Region* Parent = Node->getParent(); + RegionT *Parent = Node->getParent(); // The only case that the successor reaches out of the region is it reaches // the exit of the region. 
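
Since the iterators are now parameterized over the block and region types, client code has to name the instantiation it wants, most conveniently through GraphTraits. A minimal sketch, assuming the IR-level RegionNode/BasicBlock/Region instantiation wired up by the macro invocations at the end of this header; the helper function itself is illustrative, not part of the patch:

#include "llvm/Analysis/RegionInfo.h"
#include "llvm/Analysis/RegionIterator.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

// Print every successor node of a region node. The iterator already skips
// the enclosing region's exit block, so no extra filtering is needed here.
static void printRegionNodeSuccs(RegionNode *N) {
  typedef GraphTraits<RegionNode *> RNTraits;
  for (RNTraits::ChildIteratorType I = RNTraits::child_begin(N),
                                   E = RNTraits::child_end(N);
       I != E; ++I)
    errs() << **I << '\n'; // prints the block or subregion name
}
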
@@ -245,14 +252,14 @@ class RNSuccIterator > } }; -template -inline RNSuccIterator succ_begin(NodeType* Node) { - return RNSuccIterator(Node); +template +inline RNSuccIterator succ_begin(NodeType* Node) { + return RNSuccIterator(Node); } -template -inline RNSuccIterator succ_end(NodeType* Node) { - return RNSuccIterator(Node, true); +template +inline RNSuccIterator succ_end(NodeType* Node) { + return RNSuccIterator(Node, true); } //===--------------------------------------------------------------------===// @@ -262,27 +269,27 @@ inline RNSuccIterator succ_end(NodeType* Node) { // NodeT can either be region node or const region node, otherwise child_begin // and child_end fail. -#define RegionNodeGraphTraits(NodeT) \ - template<> struct GraphTraits { \ +#define RegionNodeGraphTraits(NodeT, BlockT, RegionT) \ + template<> struct GraphTraits { \ typedef NodeT NodeType; \ - typedef RNSuccIterator ChildIteratorType; \ + typedef RNSuccIterator ChildIteratorType; \ static NodeType *getEntryNode(NodeType* N) { return N; } \ static inline ChildIteratorType child_begin(NodeType *N) { \ - return RNSuccIterator(N); \ + return RNSuccIterator(N); \ } \ static inline ChildIteratorType child_end(NodeType *N) { \ - return RNSuccIterator(N, true); \ + return RNSuccIterator(N, true); \ } \ }; \ -template<> struct GraphTraits > { \ +template<> struct GraphTraits> { \ typedef NodeT NodeType; \ - typedef RNSuccIterator > ChildIteratorType; \ + typedef RNSuccIterator, BlockT, RegionT > ChildIteratorType; \ static NodeType *getEntryNode(NodeType* N) { return N; } \ static inline ChildIteratorType child_begin(NodeType *N) { \ - return RNSuccIterator >(N); \ + return RNSuccIterator, BlockT, RegionT>(N); \ } \ static inline ChildIteratorType child_end(NodeType *N) { \ - return RNSuccIterator >(N, true); \ + return RNSuccIterator, BlockT, RegionT>(N, true); \ } \ } @@ -315,8 +322,8 @@ template<> struct GraphTraits > \ } \ } -RegionNodeGraphTraits(RegionNode); -RegionNodeGraphTraits(const RegionNode); +RegionNodeGraphTraits(RegionNode, BasicBlock, Region); +RegionNodeGraphTraits(const RegionNode, BasicBlock, Region); RegionGraphTraits(Region, RegionNode); RegionGraphTraits(const Region, const RegionNode); @@ -337,6 +344,22 @@ template <> struct GraphTraits } }; +template <> struct GraphTraits + : public GraphTraits { + typedef df_iterator, false, + GraphTraits > > nodes_iterator; + + static NodeType *getEntryNode(RegionInfoPass *RI) { + return GraphTraits::getEntryNode(&RI->getRegionInfo()); + } + static nodes_iterator nodes_begin(RegionInfoPass* RI) { + return GraphTraits::nodes_begin(&RI->getRegionInfo()); + } + static nodes_iterator nodes_end(RegionInfoPass *RI) { + return GraphTraits::nodes_end(&RI->getRegionInfo()); + } +}; + } // End namespace llvm #endif diff --git a/include/llvm/Analysis/ScalarEvolution.h b/include/llvm/Analysis/ScalarEvolution.h index 057082676824..617e54541ee1 100644 --- a/include/llvm/Analysis/ScalarEvolution.h +++ b/include/llvm/Analysis/ScalarEvolution.h @@ -795,7 +795,8 @@ namespace llvm { /// forgetLoop - This method should be called by the client when it has /// changed a loop in a way that may effect ScalarEvolution's ability to - /// compute a trip count, or if the loop is deleted. + /// compute a trip count, or if the loop is deleted. This call is + /// potentially expensive for large loop bodies. 
void forgetLoop(const Loop *L); /// forgetValue - This method should be called by the client when it has diff --git a/include/llvm/Analysis/ScalarEvolutionExpressions.h b/include/llvm/Analysis/ScalarEvolutionExpressions.h index 01b034f8a011..2f1b1c3841f3 100644 --- a/include/llvm/Analysis/ScalarEvolutionExpressions.h +++ b/include/llvm/Analysis/ScalarEvolutionExpressions.h @@ -309,17 +309,17 @@ namespace llvm { getLoop(), FlagAnyWrap); } - /// isAffine - Return true if this is an affine AddRec (i.e., it represents - /// an expressions A+B*x where A and B are loop invariant values. + /// isAffine - Return true if this represents an expression + /// A + B*x where A and B are loop invariant values. bool isAffine() const { // We know that the start value is invariant. This expression is thus // affine iff the step is also invariant. return getNumOperands() == 2; } - /// isQuadratic - Return true if this is an quadratic AddRec (i.e., it - /// represents an expressions A+B*x+C*x^2 where A, B and C are loop - /// invariant values. This corresponds to an addrec of the form {L,+,M,+,N} + /// isQuadratic - Return true if this represents an expression + /// A + B*x + C*x^2 where A, B and C are loop invariant values. + /// This corresponds to an addrec of the form {L,+,M,+,N} bool isQuadratic() const { return getNumOperands() == 3; } diff --git a/include/llvm/Analysis/TargetFolder.h b/include/llvm/Analysis/TargetFolder.h index 8a7fc7caf7b2..587a7ef5410a 100644 --- a/include/llvm/Analysis/TargetFolder.h +++ b/include/llvm/Analysis/TargetFolder.h @@ -211,6 +211,13 @@ class TargetFolder { return Fold(ConstantExpr::getTruncOrBitCast(C, DestTy)); } + Constant *CreatePointerBitCastOrAddrSpaceCast(Constant *C, + Type *DestTy) const { + if (C->getType() == DestTy) + return C; // avoid calling Fold + return Fold(ConstantExpr::getPointerBitCastOrAddrSpaceCast(C, DestTy)); + } + //===--------------------------------------------------------------------===// // Compare Instructions //===--------------------------------------------------------------------===// diff --git a/include/llvm/Analysis/TargetTransformInfo.h b/include/llvm/Analysis/TargetTransformInfo.h index 79fe1dcae639..f57f3eb009a1 100644 --- a/include/llvm/Analysis/TargetTransformInfo.h +++ b/include/llvm/Analysis/TargetTransformInfo.h @@ -322,6 +322,7 @@ class TargetTransformInfo { enum ShuffleKind { SK_Broadcast, ///< Broadcast element 0 to all other elements. SK_Reverse, ///< Reverse the order of the vector. + SK_Alternate, ///< Choose alternate elements from vector. SK_InsertSubvector, ///< InsertSubvector. Index indicates start offset. SK_ExtractSubvector ///< ExtractSubvector Index indicates start offset. }; diff --git a/include/llvm/Analysis/ValueTracking.h b/include/llvm/Analysis/ValueTracking.h index ce7896738d8c..83b5408fb1c2 100644 --- a/include/llvm/Analysis/ValueTracking.h +++ b/include/llvm/Analysis/ValueTracking.h @@ -37,7 +37,10 @@ namespace llvm { /// for all of the elements in the vector. void computeKnownBits(Value *V, APInt &KnownZero, APInt &KnownOne, const DataLayout *TD = nullptr, unsigned Depth = 0); - void computeKnownBitsLoad(const MDNode &Ranges, APInt &KnownZero); + /// Compute known bits from the range metadata. + /// \p KnownZero the set of bits that are known to be zero + void computeKnownBitsFromRangeMetadata(const MDNode &Ranges, + APInt &KnownZero); /// ComputeSignBit - Determine whether the sign bit is known to be zero or /// one. Convenience wrapper around computeKnownBits. 
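
The SK_Alternate kind added to TargetTransformInfo above models shuffles whose mask picks alternating lanes from the two sources, e.g. <0, 5, 2, 7> for a pair of <4 x i32> inputs. A hedged sketch of querying its cost; the helper name is illustrative, not part of the patch:

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
using namespace llvm;

// Ask the target what an alternating two-source shuffle of <4 x i32> costs,
// i.e. a shufflevector with a mask such as <0, 5, 2, 7>.
static unsigned alternateShuffleCost(const TargetTransformInfo &TTI,
                                     LLVMContext &Ctx) {
  VectorType *VecTy = VectorType::get(Type::getInt32Ty(Ctx), 4);
  return TTI.getShuffleCost(TargetTransformInfo::SK_Alternate, VecTy);
}
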
diff --git a/include/llvm/Bitcode/LLVMBitCodes.h b/include/llvm/Bitcode/LLVMBitCodes.h index 10b0f65cbc0c..ee2efa2257b1 100644 --- a/include/llvm/Bitcode/LLVMBitCodes.h +++ b/include/llvm/Bitcode/LLVMBitCodes.h @@ -71,7 +71,8 @@ namespace bitc { // MODULE_CODE_PURGEVALS: [numvals] MODULE_CODE_PURGEVALS = 10, - MODULE_CODE_GCNAME = 11 // GCNAME: [strchr x N] + MODULE_CODE_GCNAME = 11, // GCNAME: [strchr x N] + MODULE_CODE_COMDAT = 12, // COMDAT: [selection_kind, name] }; /// PARAMATTR blocks have code for defining a parameter attribute set. @@ -289,7 +290,7 @@ namespace bitc { FUNC_CODE_INST_PHI = 16, // PHI: [ty, val0,bb0, ...] // 17 is unused. // 18 is unused. - FUNC_CODE_INST_ALLOCA = 19, // ALLOCA: [instty, op, align] + FUNC_CODE_INST_ALLOCA = 19, // ALLOCA: [instty, opty, op, align] FUNC_CODE_INST_LOAD = 20, // LOAD: [opty, op, align, vol] // 21 is unused. // 22 is unused. @@ -373,7 +374,16 @@ namespace bitc { ATTR_KIND_OPTIMIZE_NONE = 37, ATTR_KIND_IN_ALLOCA = 38, ATTR_KIND_NON_NULL = 39, - ATTR_KIND_JUMP_TABLE = 40 + ATTR_KIND_JUMP_TABLE = 40, + ATTR_KIND_DEREFERENCEABLE = 41 + }; + + enum ComdatSelectionKindCodes { + COMDAT_SELECTION_KIND_ANY = 1, + COMDAT_SELECTION_KIND_EXACT_MATCH = 2, + COMDAT_SELECTION_KIND_LARGEST = 3, + COMDAT_SELECTION_KIND_NO_DUPLICATES = 4, + COMDAT_SELECTION_KIND_SAME_SIZE = 5, }; } // End bitc namespace diff --git a/include/llvm/Bitcode/ReaderWriter.h b/include/llvm/Bitcode/ReaderWriter.h index 0d0d6a71ab14..8cf573544f8d 100644 --- a/include/llvm/Bitcode/ReaderWriter.h +++ b/include/llvm/Bitcode/ReaderWriter.h @@ -30,8 +30,7 @@ namespace llvm { /// deserialization of function bodies. If successful, this takes ownership /// of 'buffer. On error, this *does not* take ownership of Buffer. ErrorOr getLazyBitcodeModule(MemoryBuffer *Buffer, - LLVMContext &Context, - bool BufferOwned = true); + LLVMContext &Context); /// getStreamedBitcodeModule - Read the header of the specified stream /// and prepare for lazy deserialization and streaming of function bodies. @@ -42,14 +41,11 @@ namespace llvm { LLVMContext &Context, std::string *ErrMsg = nullptr); - /// getBitcodeTargetTriple - Read the header of the specified bitcode - /// buffer and extract just the triple information. If successful, - /// this returns a string and *does not* take ownership - /// of 'buffer'. On error, this returns "", and fills in *ErrMsg - /// if ErrMsg is non-null. + /// Read the header of the specified bitcode buffer and extract just the + /// triple information. If successful, this returns a string and *does not* + /// take ownership of 'buffer'. On error, this returns "". std::string getBitcodeTargetTriple(MemoryBuffer *Buffer, - LLVMContext &Context, - std::string *ErrMsg = nullptr); + LLVMContext &Context); /// Read the specified bitcode file, returning the module. /// This method *never* takes ownership of Buffer. diff --git a/include/llvm/CodeGen/Analysis.h b/include/llvm/CodeGen/Analysis.h index c5060fb5441c..b791ba09adaf 100644 --- a/include/llvm/CodeGen/Analysis.h +++ b/include/llvm/CodeGen/Analysis.h @@ -24,10 +24,11 @@ namespace llvm { class GlobalVariable; class TargetLoweringBase; +class TargetLowering; +class TargetMachine; class SDNode; class SDValue; class SelectionDAG; -class TargetLowering; struct EVT; /// ComputeLinearIndex - Given an LLVM IR aggregate type and a sequence @@ -86,7 +87,7 @@ ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred); /// between it and the return. /// /// This function only tests target-independent requirements. 
-bool isInTailCallPosition(ImmutableCallSite CS, const SelectionDAG &DAG); +bool isInTailCallPosition(ImmutableCallSite CS, const TargetMachine &TM); /// Test if given that the input instruction is in the tail call position if the /// return type or any attributes of the function will inhibit tail call diff --git a/include/llvm/CodeGen/FastISel.h b/include/llvm/CodeGen/FastISel.h index c7ec6a024b84..0d1b1dc09560 100644 --- a/include/llvm/CodeGen/FastISel.h +++ b/include/llvm/CodeGen/FastISel.h @@ -16,7 +16,10 @@ #define LLVM_CODEGEN_FASTISEL_H #include "llvm/ADT/DenseMap.h" +#include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/Target/TargetLowering.h" +#include "llvm/IR/CallingConv.h" namespace llvm { @@ -27,6 +30,7 @@ class CallInst; class DataLayout; class FunctionLoweringInfo; class Instruction; +class IntrinsicInst; class LoadInst; class MVT; class MachineConstantPool; @@ -46,9 +50,148 @@ class Value; /// This is a fast-path instruction selection class that generates poor code and /// doesn't support illegal types or non-trivial lowering, but runs quickly. class FastISel { + public: + struct ArgListEntry { + Value *Val; + Type *Ty; + bool isSExt : 1; + bool isZExt : 1; + bool isInReg : 1; + bool isSRet : 1; + bool isNest : 1; + bool isByVal : 1; + bool isInAlloca : 1; + bool isReturned : 1; + uint16_t Alignment; + + ArgListEntry() + : Val(nullptr), Ty(nullptr), isSExt(false), isZExt(false), isInReg(false), + isSRet(false), isNest(false), isByVal(false), isInAlloca(false), + isReturned(false), Alignment(0) { } + + void setAttributes(ImmutableCallSite *CS, unsigned AttrIdx); + }; + typedef std::vector ArgListTy; + + struct CallLoweringInfo { + Type *RetTy; + bool RetSExt : 1; + bool RetZExt : 1; + bool IsVarArg : 1; + bool IsInReg : 1; + bool DoesNotReturn : 1; + bool IsReturnValueUsed : 1; + + // IsTailCall should be modified by implementations of + // FastLowerCall that perform tail call conversions. 
+ bool IsTailCall; + + unsigned NumFixedArgs; + CallingConv::ID CallConv; + const Value *Callee; + const char *SymName; + ArgListTy Args; + ImmutableCallSite *CS; + MachineInstr *Call; + unsigned ResultReg; + unsigned NumResultRegs; + + SmallVector OutVals; + SmallVector OutFlags; + SmallVector OutRegs; + SmallVector Ins; + SmallVector InRegs; + + CallLoweringInfo() + : RetTy(nullptr), RetSExt(false), RetZExt(false), IsVarArg(false), + IsInReg(false), DoesNotReturn(false), IsReturnValueUsed(true), + IsTailCall(false), NumFixedArgs(-1), CallConv(CallingConv::C), + Callee(nullptr), SymName(nullptr), CS(nullptr), Call(nullptr), + ResultReg(0), NumResultRegs(0) + {} + + CallLoweringInfo &setCallee(Type *ResultTy, FunctionType *FuncTy, + const Value *Target, ArgListTy &&ArgsList, + ImmutableCallSite &Call) { + RetTy = ResultTy; + Callee = Target; + + IsInReg = Call.paramHasAttr(0, Attribute::InReg); + DoesNotReturn = Call.doesNotReturn(); + IsVarArg = FuncTy->isVarArg(); + IsReturnValueUsed = !Call.getInstruction()->use_empty(); + RetSExt = Call.paramHasAttr(0, Attribute::SExt); + RetZExt = Call.paramHasAttr(0, Attribute::ZExt); + + CallConv = Call.getCallingConv(); + NumFixedArgs = FuncTy->getNumParams(); + Args = std::move(ArgsList); + + CS = &Call; + + return *this; + } + + CallLoweringInfo &setCallee(Type *ResultTy, FunctionType *FuncTy, + const char *Target, ArgListTy &&ArgsList, + ImmutableCallSite &Call, + unsigned FixedArgs = ~0U) { + RetTy = ResultTy; + Callee = Call.getCalledValue(); + SymName = Target; + + IsInReg = Call.paramHasAttr(0, Attribute::InReg); + DoesNotReturn = Call.doesNotReturn(); + IsVarArg = FuncTy->isVarArg(); + IsReturnValueUsed = !Call.getInstruction()->use_empty(); + RetSExt = Call.paramHasAttr(0, Attribute::SExt); + RetZExt = Call.paramHasAttr(0, Attribute::ZExt); + + CallConv = Call.getCallingConv(); + NumFixedArgs = (FixedArgs == ~0U) ? FuncTy->getNumParams() : FixedArgs; + Args = std::move(ArgsList); + + CS = &Call; + + return *this; + } + + CallLoweringInfo &setCallee(CallingConv::ID CC, Type *ResultTy, + const Value *Target, ArgListTy &&ArgsList, + unsigned FixedArgs = ~0U) { + RetTy = ResultTy; + Callee = Target; + CallConv = CC; + NumFixedArgs = (FixedArgs == ~0U) ? Args.size() : FixedArgs; + Args = std::move(ArgsList); + return *this; + } + + CallLoweringInfo &setTailCall(bool Value = true) { + IsTailCall = Value; + return *this; + } + + ArgListTy &getArgs() { + return Args; + } + + void clearOuts() { + OutVals.clear(); + OutFlags.clear(); + OutRegs.clear(); + } + + void clearIns() { + Ins.clear(); + InRegs.clear(); + } + }; + protected: DenseMap LocalValueMap; FunctionLoweringInfo &FuncInfo; + MachineFunction *MF; MachineRegisterInfo &MRI; MachineFrameInfo &MFI; MachineConstantPool &MCP; @@ -171,13 +314,20 @@ class FastISel { /// process fails to select an instruction. This gives targets a chance to /// emit code for anything that doesn't fit into FastISel's framework. It /// returns true if it was successful. - virtual bool - TargetSelectInstruction(const Instruction *I) = 0; + virtual bool TargetSelectInstruction(const Instruction *I) = 0; /// This method is called by target-independent code to do target specific /// argument lowering. It returns true if it was successful. virtual bool FastLowerArguments(); + /// \brief This method is called by target-independent code to do target + /// specific call lowering. It returns true if it was successful. 
+ virtual bool FastLowerCall(CallLoweringInfo &CLI); + + /// \brief This method is called by target-independent code to do target + /// specific intrinsic lowering. It returns true if it was successful. + virtual bool FastLowerIntrinsicCall(const IntrinsicInst *II); + /// This method is called by target-independent code to request that an /// instruction with the given type and opcode be emitted. virtual unsigned FastEmit_(MVT VT, @@ -380,6 +530,9 @@ class FastISel { /// \brief Create a machine mem operand from the given instruction. MachineMemOperand *createMachineMemOperandFor(const Instruction *I) const; + bool LowerCallTo(const CallInst *CI, const char *SymName, unsigned NumArgs); + bool LowerCallTo(CallLoweringInfo &CLI); + private: bool SelectBinaryOp(const User *I, unsigned ISDOpcode); @@ -387,7 +540,11 @@ class FastISel { bool SelectGetElementPtr(const User *I); - bool SelectCall(const User *I); + bool SelectStackmap(const CallInst *I); + bool SelectPatchpoint(const CallInst *I); + bool LowerCall(const CallInst *I); + bool SelectCall(const User *Call); + bool SelectIntrinsicCall(const IntrinsicInst *II); bool SelectBitCast(const User *I); @@ -418,6 +575,9 @@ class FastISel { bool addStackMapLiveVars(SmallVectorImpl &Ops, const CallInst *CI, unsigned StartIdx); + bool lowerCallOperands(const CallInst *CI, unsigned ArgIdx, unsigned NumArgs, + const Value *Callee, bool ForceRetVoidTy, + CallLoweringInfo &CLI); }; } diff --git a/include/llvm/CodeGen/ISDOpcodes.h b/include/llvm/CodeGen/ISDOpcodes.h index 80fb8b2d3a5d..84447616c989 100644 --- a/include/llvm/CodeGen/ISDOpcodes.h +++ b/include/llvm/CodeGen/ISDOpcodes.h @@ -379,6 +379,37 @@ namespace ISD { /// operand, a ValueType node. SIGN_EXTEND_INREG, + /// ANY_EXTEND_VECTOR_INREG(Vector) - This operator represents an + /// in-register any-extension of the low lanes of an integer vector. The + /// result type must have fewer elements than the operand type, and those + /// elements must be larger integer types such that the total size of the + /// operand type and the result type match. Each of the low operand + /// elements is any-extended into the corresponding, wider result + /// elements with the high bits becoming undef. + ANY_EXTEND_VECTOR_INREG, + + /// SIGN_EXTEND_VECTOR_INREG(Vector) - This operator represents an + /// in-register sign-extension of the low lanes of an integer vector. The + /// result type must have fewer elements than the operand type, and those + /// elements must be larger integer types such that the total size of the + /// operand type and the result type match. Each of the low operand + /// elements is sign-extended into the corresponding, wider result + /// elements. + // FIXME: The SIGN_EXTEND_INREG node isn't specifically limited to + // scalars, but it also doesn't handle vectors well. Either it should be + // restricted to scalars or this node (and its handling) should be merged + // into it. + SIGN_EXTEND_VECTOR_INREG, + + /// ZERO_EXTEND_VECTOR_INREG(Vector) - This operator represents an + /// in-register zero-extension of the low lanes of an integer vector. The + /// result type must have fewer elements than the operand type, and those + /// elements must be larger integer types such that the total size of the + /// operand type and the result type match. Each of the low operand + /// elements is zero-extended into the corresponding, wider result + /// elements. + ZERO_EXTEND_VECTOR_INREG, + /// FP_TO_[US]INT - Convert a floating point value to a signed or unsigned /// integer. 
FP_TO_SINT, @@ -441,11 +472,11 @@ namespace ISD { /// 5) ISD::CvtCode indicating the type of conversion to do CONVERT_RNDSAT, - /// FP16_TO_FP32, FP32_TO_FP16 - These operators are used to perform - /// promotions and truncation for half-precision (16 bit) floating - /// numbers. We need special nodes since FP16 is a storage-only type with - /// special semantics of operations. - FP16_TO_FP32, FP32_TO_FP16, + /// FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions + /// and truncation for half-precision (16 bit) floating numbers. These nodes + /// form a semi-softened interface for dealing with f16 (as an i16), which + /// is often a storage-only type but has native conversions. + FP16_TO_FP, FP_TO_FP16, /// FNEG, FABS, FSQRT, FSIN, FCOS, FPOWI, FPOW, /// FLOG, FLOG2, FLOG10, FEXP, FEXP2, diff --git a/include/llvm/CodeGen/MachineBasicBlock.h b/include/llvm/CodeGen/MachineBasicBlock.h index 90bdeee46d26..a08cc2eb508a 100644 --- a/include/llvm/CodeGen/MachineBasicBlock.h +++ b/include/llvm/CodeGen/MachineBasicBlock.h @@ -620,7 +620,7 @@ class MachineBasicBlock : public ilist_node { /// computeRegisterLiveness - Return whether (physical) register \c Reg /// has been ined and not ed as of just before \c MI. - /// + /// /// Search is localised to a neighborhood of /// \c Neighborhood instructions before (searching for defs or kills) and /// Neighborhood instructions after (searching just for defs) MI. @@ -635,7 +635,7 @@ class MachineBasicBlock : public ilist_node { void print(raw_ostream &OS, SlotIndexes* = nullptr) const; // Printing method used by LoopInfo. - void printAsOperand(raw_ostream &OS, bool PrintType = true); + void printAsOperand(raw_ostream &OS, bool PrintType = true) const; /// getNumber - MachineBasicBlocks are uniquely numbered at the function /// level, unless they're not in a MachineFunction yet, in which case this diff --git a/include/llvm/CodeGen/MachineConstantPool.h b/include/llvm/CodeGen/MachineConstantPool.h index 912ce8966268..c619afb83333 100644 --- a/include/llvm/CodeGen/MachineConstantPool.h +++ b/include/llvm/CodeGen/MachineConstantPool.h @@ -17,6 +17,7 @@ #define LLVM_CODEGEN_MACHINECONSTANTPOOL_H #include "llvm/ADT/DenseSet.h" +#include "llvm/MC/SectionKind.h" #include #include #include @@ -119,6 +120,8 @@ class MachineConstantPoolEntry { /// them. /// 2: This entry may have arbitrary relocations. unsigned getRelocationInfo() const; + + SectionKind getSectionKind(const DataLayout *DL) const; }; /// The MachineConstantPool class keeps track of constants referenced by a diff --git a/include/llvm/CodeGen/MachineDominanceFrontier.h b/include/llvm/CodeGen/MachineDominanceFrontier.h new file mode 100644 index 000000000000..e099e716c63d --- /dev/null +++ b/include/llvm/CodeGen/MachineDominanceFrontier.h @@ -0,0 +1,109 @@ +//===- llvm/CodeGen/MachineDominanceFrontier.h ------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CODEGEN_MACHINEDOMINANCEFRONTIER_H +#define LLVM_CODEGEN_MACHINEDOMINANCEFRONTIER_H + +#include "llvm/Analysis/DominanceFrontier.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunctionPass.h" + + +namespace llvm { + +class MachineDominanceFrontier : public MachineFunctionPass { + ForwardDominanceFrontierBase Base; +public: + typedef DominatorTreeBase DomTreeT; + typedef DomTreeNodeBase DomTreeNodeT; + typedef DominanceFrontierBase::DomSetType DomSetType; + typedef DominanceFrontierBase::iterator iterator; + typedef DominanceFrontierBase::const_iterator const_iterator; + + void operator=(const MachineDominanceFrontier &) LLVM_DELETED_FUNCTION; + MachineDominanceFrontier(const MachineDominanceFrontier &) LLVM_DELETED_FUNCTION; + + static char ID; + + MachineDominanceFrontier(); + + DominanceFrontierBase &getBase() { + return Base; + } + + inline const std::vector &getRoots() const { + return Base.getRoots(); + } + + MachineBasicBlock *getRoot() const { + return Base.getRoot(); + } + + bool isPostDominator() const { + return Base.isPostDominator(); + } + + iterator begin() { + return Base.begin(); + } + + const_iterator begin() const { + return Base.begin(); + } + + iterator end() { + return Base.end(); + } + + const_iterator end() const { + return Base.end(); + } + + iterator find(MachineBasicBlock *B) { + return Base.find(B); + } + + const_iterator find(MachineBasicBlock *B) const { + return Base.find(B); + } + + iterator addBasicBlock(MachineBasicBlock *BB, const DomSetType &frontier) { + return Base.addBasicBlock(BB, frontier); + } + + void removeBlock(MachineBasicBlock *BB) { + return Base.removeBlock(BB); + } + + void addToFrontier(iterator I, MachineBasicBlock *Node) { + return Base.addToFrontier(I, Node); + } + + void removeFromFrontier(iterator I, MachineBasicBlock *Node) { + return Base.removeFromFrontier(I, Node); + } + + bool compareDomSet(DomSetType &DS1, const DomSetType &DS2) const { + return Base.compareDomSet(DS1, DS2); + } + + bool compare(DominanceFrontierBase &Other) const { + return Base.compare(Other); + } + + bool runOnMachineFunction(MachineFunction &F) override; + + void releaseMemory() override; + + void getAnalysisUsage(AnalysisUsage &AU) const override; +}; + +} + +#endif diff --git a/include/llvm/CodeGen/MachineFrameInfo.h b/include/llvm/CodeGen/MachineFrameInfo.h index bd0ea119114c..c51f8fe03bbf 100644 --- a/include/llvm/CodeGen/MachineFrameInfo.h +++ b/include/llvm/CodeGen/MachineFrameInfo.h @@ -484,6 +484,9 @@ class MachineFrameInfo { /// int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool Immutable); + /// CreateFixedSpillStackObject - Create a spill slot at a fixed location + /// on the stack. Returns an index with a negative value. + int CreateFixedSpillStackObject(uint64_t Size, int64_t SPOffset); /// isFixedObjectIndex - Returns true if the specified index corresponds to a /// fixed stack object. diff --git a/include/llvm/CodeGen/MachineFunction.h b/include/llvm/CodeGen/MachineFunction.h index f4c2542b990d..042c62b4a887 100644 --- a/include/llvm/CodeGen/MachineFunction.h +++ b/include/llvm/CodeGen/MachineFunction.h @@ -227,19 +227,14 @@ class MachineFunction { void setHasInlineAsm(bool B) { HasInlineAsm = B; } - + /// getInfo - Keep track of various per-function pieces of information for /// backends that would like to do so. 
   ///
   template<typename Ty>
   Ty *getInfo() {
-    if (!MFInfo) {
-      // This should be just `new (Allocator.Allocate<Ty>()) Ty(*this)', but
-      // that apparently breaks GCC 3.3.
-      Ty *Loc = static_cast<Ty*>(Allocator.Allocate(sizeof(Ty),
-                                                    AlignOf<Ty>::Alignment));
-      MFInfo = new (Loc) Ty(*this);
-    }
+    if (!MFInfo)
+      MFInfo = new (Allocator.Allocate<Ty>()) Ty(*this);
     return static_cast<Ty*>(MFInfo);
   }
diff --git a/include/llvm/CodeGen/MachineInstr.h b/include/llvm/CodeGen/MachineInstr.h
index b0d3e02cebf9..3c828116411e 100644
--- a/include/llvm/CodeGen/MachineInstr.h
+++ b/include/llvm/CodeGen/MachineInstr.h
@@ -727,6 +727,9 @@ class MachineInstr : public ilist_node<MachineInstr> {
   bool isFullCopy() const {
     return isCopy() && !getOperand(0).getSubReg() && !getOperand(1).getSubReg();
   }
+  bool isExtractSubreg() const {
+    return getOpcode() == TargetOpcode::EXTRACT_SUBREG;
+  }
 
   /// isCopyLike - Return true if the instruction behaves like a copy.
   /// This does not include native copy instructions.
@@ -947,7 +950,7 @@ class MachineInstr : public ilist_node<MachineInstr> {
   }
 
   /// isRegTiedToDefOperand - Return true if the use operand of the specified
-  /// index is tied to an def operand. It also returns the def operand index by
+  /// index is tied to a def operand. It also returns the def operand index by
   /// reference if DefOpIdx is not null.
   bool isRegTiedToDefOperand(unsigned UseOpIdx,
                              unsigned *DefOpIdx = nullptr) const {
diff --git a/include/llvm/CodeGen/MachineRegionInfo.h b/include/llvm/CodeGen/MachineRegionInfo.h
new file mode 100644
index 000000000000..43499dba71c0
--- /dev/null
+++ b/include/llvm/CodeGen/MachineRegionInfo.h
@@ -0,0 +1,183 @@
+//===- llvm/CodeGen/MachineRegionInfo.h -------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_MACHINEREGIONINFO_H
+#define LLVM_CODEGEN_MACHINEREGIONINFO_H
+
+#include "llvm/Analysis/RegionInfo.h"
+#include "llvm/Analysis/RegionIterator.h"
+#include "llvm/CodeGen/MachineDominanceFrontier.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+
+
+namespace llvm {
+
+class MachineDominatorTree;
+struct MachinePostDominatorTree;
+class MachineRegion;
+class MachineRegionNode;
+class MachineRegionInfo;
+
+template<>
+struct RegionTraits<MachineFunction> {
+  typedef MachineFunction FuncT;
+  typedef MachineBasicBlock BlockT;
+  typedef MachineRegion RegionT;
+  typedef MachineRegionNode RegionNodeT;
+  typedef MachineRegionInfo RegionInfoT;
+  typedef MachineDominatorTree DomTreeT;
+  typedef MachineDomTreeNode DomTreeNodeT;
+  typedef MachinePostDominatorTree PostDomTreeT;
+  typedef MachineDominanceFrontier DomFrontierT;
+  typedef MachineInstr InstT;
+  typedef MachineLoop LoopT;
+  typedef MachineLoopInfo LoopInfoT;
+
+  static unsigned getNumSuccessors(MachineBasicBlock *BB) {
+    return BB->succ_size();
+  }
+};
+
+
+class MachineRegionNode : public RegionNodeBase<RegionTraits<MachineFunction>> {
+public:
+  inline MachineRegionNode(MachineRegion *Parent,
+                           MachineBasicBlock *Entry,
+                           bool isSubRegion = false)
+    : RegionNodeBase<RegionTraits<MachineFunction>>(Parent, Entry,
+                                                    isSubRegion) {
+
+  }
+
+  ~MachineRegionNode() { }
+
+  bool operator==(const MachineRegion &RN) const {
+    return this == reinterpret_cast<const MachineRegionNode *>(&RN);
+  }
+};
+
+class MachineRegion : public RegionBase<RegionTraits<MachineFunction>> {
+public:
+  MachineRegion(MachineBasicBlock *Entry, MachineBasicBlock *Exit,
+                MachineRegionInfo* RI,
+                MachineDominatorTree *DT, MachineRegion *Parent = nullptr);
+  ~MachineRegion();
+
+  bool operator==(const MachineRegionNode &RN) const {
+    return &RN == reinterpret_cast<const MachineRegionNode *>(this);
+  }
+};
+
+class MachineRegionInfo : public RegionInfoBase<RegionTraits<MachineFunction>> {
+public:
+  explicit MachineRegionInfo();
+
+  virtual ~MachineRegionInfo();
+
+  // updateStatistics - Update statistics about created regions.
+  void updateStatistics(MachineRegion *R) final;
+
+  void recalculate(MachineFunction &F,
+                   MachineDominatorTree *DT,
+                   MachinePostDominatorTree *PDT,
+                   MachineDominanceFrontier *DF);
+};
+
+class MachineRegionInfoPass : public MachineFunctionPass {
+  MachineRegionInfo RI;
+
+public:
+  static char ID;
+  explicit MachineRegionInfoPass();
+
+  ~MachineRegionInfoPass();
+
+  MachineRegionInfo &getRegionInfo() {
+    return RI;
+  }
+
+  const MachineRegionInfo &getRegionInfo() const {
+    return RI;
+  }
+
+  /// @name MachineFunctionPass interface
+  //@{
+  bool runOnMachineFunction(MachineFunction &F) override;
+  void releaseMemory() override;
+  void verifyAnalysis() const override;
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+  void print(raw_ostream &OS, const Module *) const override;
+  void dump() const;
+  //@}
+};
+
+
+template <>
+template <>
+inline MachineBasicBlock *
+RegionNodeBase<RegionTraits<MachineFunction>>::getNodeAs<MachineBasicBlock>() const {
+  assert(!isSubRegion() && "This is not a MachineBasicBlock RegionNode!");
+  return getEntry();
+}
+
+template<>
+template<>
+inline MachineRegion *
+RegionNodeBase<RegionTraits<MachineFunction>>::getNodeAs<MachineRegion>() const {
+  assert(isSubRegion() && "This is not a subregion RegionNode!");
+  auto Unconst =
+      const_cast<RegionNodeBase<RegionTraits<MachineFunction>> *>(this);
+  return reinterpret_cast<MachineRegion *>(Unconst);
+}
+
+
+RegionNodeGraphTraits(MachineRegionNode, MachineBasicBlock, MachineRegion);
+RegionNodeGraphTraits(const MachineRegionNode, MachineBasicBlock, MachineRegion);
+
+RegionGraphTraits(MachineRegion, MachineRegionNode);
+RegionGraphTraits(const MachineRegion, const MachineRegionNode);
+
+template <> struct GraphTraits<MachineRegionInfo *>
+  : public GraphTraits<FlatIt<MachineRegionNode *>> {
+  typedef df_iterator<NodeType *, SmallPtrSet<NodeType *, 8>, false,
+                      GraphTraits<FlatIt<NodeType *>>> nodes_iterator;
+
+  static NodeType *getEntryNode(MachineRegionInfo *RI) {
+    return GraphTraits<FlatIt<MachineRegion *>>::getEntryNode(RI->getTopLevelRegion());
+  }
+  static nodes_iterator nodes_begin(MachineRegionInfo* RI) {
+    return nodes_iterator::begin(getEntryNode(RI));
+  }
+  static nodes_iterator nodes_end(MachineRegionInfo *RI) {
+    return nodes_iterator::end(getEntryNode(RI));
+  }
+};
+
+template <> struct GraphTraits<MachineRegionInfoPass *>
+  : public GraphTraits<MachineRegionInfo *> {
+  typedef df_iterator<NodeType *, SmallPtrSet<NodeType *, 8>, false,
+                      GraphTraits<FlatIt<NodeType *>>> nodes_iterator;
+
+  static NodeType *getEntryNode(MachineRegionInfoPass *RI) {
+    return GraphTraits<MachineRegionInfo *>::getEntryNode(&RI->getRegionInfo());
+  }
+  static nodes_iterator nodes_begin(MachineRegionInfoPass* RI) {
+    return GraphTraits<MachineRegionInfo *>::nodes_begin(&RI->getRegionInfo());
+  }
+  static nodes_iterator nodes_end(MachineRegionInfoPass *RI) {
+    return GraphTraits<MachineRegionInfo *>::nodes_end(&RI->getRegionInfo());
+  }
+};
+
+EXTERN_TEMPLATE_INSTANTIATION(class RegionBase<RegionTraits<MachineFunction>>);
+EXTERN_TEMPLATE_INSTANTIATION(class RegionNodeBase<RegionTraits<MachineFunction>>);
+EXTERN_TEMPLATE_INSTANTIATION(class RegionInfoBase<RegionTraits<MachineFunction>>);
+
+}
+
+#endif
diff --git a/include/llvm/CodeGen/MachineScheduler.h b/include/llvm/CodeGen/MachineScheduler.h
index b8a28e877f7d..7d85432101b5 100644
--- a/include/llvm/CodeGen/MachineScheduler.h
+++ b/include/llvm/CodeGen/MachineScheduler.h
@@ -518,9 +518,7 @@ class ReadyQueue {
     return Queue.begin() + idx;
   }
 
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   void dump();
-#endif
 };
 
 /// Summarize the unscheduled region.
diff --git a/include/llvm/CodeGen/Passes.h b/include/llvm/CodeGen/Passes.h
index 17477fe6b05d..87f55e8572fe 100644
--- a/include/llvm/CodeGen/Passes.h
+++ b/include/llvm/CodeGen/Passes.h
@@ -381,6 +381,9 @@ namespace llvm {
   /// MachineDominators - This pass is a machine dominators analysis pass.
   extern char &MachineDominatorsID;
 
+  /// MachineDominanceFrontier - This pass is a machine dominance frontier
+  /// analysis pass.
+  extern char &MachineDominanceFrontierID;
+
   /// EdgeBundles analysis - Bundle machine CFG edges.
   extern char &EdgeBundlesID;
 
diff --git a/include/llvm/CodeGen/RegAllocPBQP.h b/include/llvm/CodeGen/RegAllocPBQP.h
index 6343bb7c937b..441b0f084e69 100644
--- a/include/llvm/CodeGen/RegAllocPBQP.h
+++ b/include/llvm/CodeGen/RegAllocPBQP.h
@@ -158,7 +158,7 @@ namespace llvm {
   };
 
   FunctionPass *
-  createPBQPRegisterAllocator(std::unique_ptr<PBQPBuilder> &builder,
+  createPBQPRegisterAllocator(std::unique_ptr<PBQPBuilder> builder,
                               char *customPassID = nullptr);
 }
 
diff --git a/include/llvm/CodeGen/RegisterPressure.h b/include/llvm/CodeGen/RegisterPressure.h
index c11a6ac261c2..cc9e00055146 100644
--- a/include/llvm/CodeGen/RegisterPressure.h
+++ b/include/llvm/CodeGen/RegisterPressure.h
@@ -434,10 +434,8 @@ class RegPressureTracker {
   void bumpDownwardPressure(const MachineInstr *MI);
 };
 
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void dumpRegSetPressure(ArrayRef<unsigned> SetPressure,
                         const TargetRegisterInfo *TRI);
-#endif
 } // end namespace llvm
 
 #endif
diff --git a/include/llvm/CodeGen/RuntimeLibcalls.h b/include/llvm/CodeGen/RuntimeLibcalls.h
index 009b8a0f697a..81db8a2f79b5 100644
--- a/include/llvm/CodeGen/RuntimeLibcalls.h
+++ b/include/llvm/CodeGen/RuntimeLibcalls.h
@@ -210,6 +210,10 @@ namespace RTLIB {
     FPEXT_F32_F64,
     FPEXT_F16_F32,
     FPROUND_F32_F16,
+    FPROUND_F64_F16,
+    FPROUND_F80_F16,
+    FPROUND_F128_F16,
+    FPROUND_PPCF128_F16,
     FPROUND_F64_F32,
     FPROUND_F80_F32,
     FPROUND_F128_F32,
diff --git a/include/llvm/CodeGen/ScheduleDFS.h b/include/llvm/CodeGen/ScheduleDFS.h
index 73ce99f4713d..b2108ad3bedb 100644
--- a/include/llvm/CodeGen/ScheduleDFS.h
+++ b/include/llvm/CodeGen/ScheduleDFS.h
@@ -57,11 +57,9 @@ struct ILPValue {
     return RHS <= *this;
   }
 
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   void print(raw_ostream &OS) const;
 
   void dump() const;
-#endif
 };
 
 /// \brief Compute the values of each DAG node for various metrics during DFS.
diff --git a/include/llvm/CodeGen/SelectionDAG.h b/include/llvm/CodeGen/SelectionDAG.h
index db2e841e1720..9b8d9521ee24 100644
--- a/include/llvm/CodeGen/SelectionDAG.h
+++ b/include/llvm/CodeGen/SelectionDAG.h
@@ -546,6 +546,12 @@ class SelectionDAG {
     return getVectorShuffle(VT, dl, N1, N2, MaskElts.data());
   }
 
+  /// \brief Returns an ISD::VECTOR_SHUFFLE node semantically equivalent to
+  /// the shuffle node in input but with swapped operands.
+  ///
+  /// Example: shuffle A, B, <0,5,2,7> -> shuffle B, A, <4,1,6,3>
+  SDValue getCommutedVectorShuffle(const ShuffleVectorSDNode &SV);
+
   /// getAnyExtOrTrunc - Convert Op, which must be of integer type, to the
   /// integer type VT, by either any-extending or truncating it.
   SDValue getAnyExtOrTrunc(SDValue Op, SDLoc DL, EVT VT);
@@ -562,10 +568,28 @@ class SelectionDAG {
   /// value assuming it was the smaller SrcTy value.
   SDValue getZeroExtendInReg(SDValue Op, SDLoc DL, EVT SrcTy);
 
+  /// getAnyExtendVectorInReg - Return an operation which will any-extend the
+  /// low lanes of the operand into the specified vector type. For example,
+  /// this can convert a v16i8 into a v4i32 by any-extending the low four
+  /// lanes of the operand from i8 to i32.
+  SDValue getAnyExtendVectorInReg(SDValue Op, SDLoc DL, EVT VT);
+
+  /// getSignExtendVectorInReg - Return an operation which will sign extend the
+  /// low lanes of the operand into the specified vector type. For example,
+  /// this can convert a v16i8 into a v4i32 by sign extending the low four
+  /// lanes of the operand from i8 to i32.
+  SDValue getSignExtendVectorInReg(SDValue Op, SDLoc DL, EVT VT);
+
+  /// getZeroExtendVectorInReg - Return an operation which will zero extend the
+  /// low lanes of the operand into the specified vector type. For example,
+  /// this can convert a v16i8 into a v4i32 by zero extending the low four
+  /// lanes of the operand from i8 to i32.
+  SDValue getZeroExtendVectorInReg(SDValue Op, SDLoc DL, EVT VT);
+
   /// getBoolExtOrTrunc - Convert Op, which must be of integer type, to the
   /// integer type VT, by using an extension appropriate for the target's
-  /// BooleanContent or truncating it.
-  SDValue getBoolExtOrTrunc(SDValue Op, SDLoc SL, EVT VT);
+  /// BooleanContent for type OpVT or truncating it.
+  SDValue getBoolExtOrTrunc(SDValue Op, SDLoc SL, EVT VT, EVT OpVT);
 
   /// getNOT - Create a bitwise NOT operation as (XOR Val, -1).
   SDValue getNOT(SDLoc DL, SDValue Val, EVT VT);
@@ -1174,6 +1198,7 @@ class SelectionDAG {
   unsigned getEVTAlignment(EVT MemoryVT) const;
 
 private:
+  void InsertNode(SDNode *N);
   bool RemoveNodeFromCSEMaps(SDNode *N);
   void AddModifiedNodeToCSEMaps(SDNode *N);
   SDNode *FindModifiedNodeSlot(SDNode *N, SDValue Op, void *&InsertPos);
diff --git a/include/llvm/CodeGen/SelectionDAGNodes.h b/include/llvm/CodeGen/SelectionDAGNodes.h
index a39d35be6175..223151105b0d 100644
--- a/include/llvm/CodeGen/SelectionDAGNodes.h
+++ b/include/llvm/CodeGen/SelectionDAGNodes.h
@@ -20,6 +20,7 @@
 #define LLVM_CODEGEN_SELECTIONDAGNODES_H
 
 #include "llvm/ADT/iterator_range.h"
+#include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/FoldingSet.h"
 #include "llvm/ADT/GraphTraits.h"
 #include "llvm/ADT/STLExtras.h"
@@ -142,6 +143,9 @@ class SDValue {
   bool operator<(const SDValue &O) const {
     return std::tie(Node, ResNo) < std::tie(O.Node, O.ResNo);
   }
+  LLVM_EXPLICIT operator bool() const {
+    return Node != nullptr;
+  }
 
   SDValue getValue(unsigned R) const {
     return SDValue(Node, R);
@@ -593,6 +597,7 @@ class SDNode : public FoldingSetNode, public ilist_node<SDNode> {
   typedef SDUse* op_iterator;
   op_iterator op_begin() const { return OperandList; }
   op_iterator op_end() const { return OperandList+NumOperands; }
+  ArrayRef<SDUse> ops() const { return makeArrayRef(op_begin(), op_end()); }
 
   SDVTList getVTList() const {
     SDVTList X = { ValueList, NumValues };
@@ -1579,11 +1584,27 @@ class BuildVectorSDNode : public SDNode {
                        unsigned MinSplatBits = 0,
                        bool isBigEndian = false) const;
 
-  /// getConstantSplatValue - Check if this is a constant splat, and if so,
-  /// return the splat value only if it is a ConstantSDNode. Otherwise
-  /// return nullptr. This is a simpler form of isConstantSplat.
-  /// Get the constant splat only if you care about the splat value.
-  ConstantSDNode *getConstantSplatValue() const;
+  /// \brief Returns the splatted value or a null value if this is not a splat.
+  ///
+  /// If passed a non-null UndefElements bitvector, it will resize it to match
+  /// the vector width and set the bits where elements are undef.
+  SDValue getSplatValue(BitVector *UndefElements = nullptr) const;
+
+  /// \brief Returns the splatted constant or null if this is not a constant
+  /// splat.
+  ///
+  /// If passed a non-null UndefElements bitvector, it will resize it to match
+  /// the vector width and set the bits where elements are undef.
+  ConstantSDNode *
+  getConstantSplatNode(BitVector *UndefElements = nullptr) const;
+
+  /// \brief Returns the splatted constant FP or null if this is not a constant
+  /// FP splat.
+  ///
+  /// If passed a non-null UndefElements bitvector, it will resize it to match
+  /// the vector width and set the bits where elements are undef.
+  ConstantFPSDNode *
+  getConstantFPSplatNode(BitVector *UndefElements = nullptr) const;
 
   bool isConstant() const;
 
diff --git a/include/llvm/CodeGen/StackMapLivenessAnalysis.h b/include/llvm/CodeGen/StackMapLivenessAnalysis.h
index 6ba7256e44af..6f0754616206 100644
--- a/include/llvm/CodeGen/StackMapLivenessAnalysis.h
+++ b/include/llvm/CodeGen/StackMapLivenessAnalysis.h
@@ -8,8 +8,8 @@
 //===----------------------------------------------------------------------===//
 //
 // This pass calculates the liveness for each basic block in a function and
-// attaches the register live-out information to a stackmap or patchpoint
-// intrinsic if present.
+// attaches the register live-out information to a patchpoint intrinsic (if
+// present).
 //
 //===----------------------------------------------------------------------===//
 
@@ -23,14 +23,13 @@
 namespace llvm {
 
 /// \brief This pass calculates the liveness information for each basic block in
-/// a function and attaches the register live-out information to a stackmap or
-/// patchpoint intrinsic if present.
+/// a function and attaches the register live-out information to a patchpoint
+/// intrinsic if present.
 ///
-/// This is an optional pass that has to be explicitly enabled via the
-/// -enable-stackmap-liveness and/or -enable-patchpoint-liveness flag. The pass
-/// skips functions that don't have any stackmap or patchpoint intrinsics. The
+/// This pass can be disabled via the -enable-patchpoint-liveness=false flag.
+/// The pass skips functions that don't have any patchpoint intrinsics. The
 /// information provided by this pass is optional and not required by the
-/// aformentioned intrinsics to function.
+/// aforementioned intrinsic to function.
 class StackMapLiveness : public MachineFunctionPass {
   MachineFunction *MF;
   const TargetRegisterInfo *TRI;
diff --git a/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h b/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h
index 230d1ed51a94..87f140190a75 100644
--- a/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h
+++ b/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h
@@ -43,7 +43,8 @@ class TargetLoweringObjectFileELF : public TargetLoweringObjectFile {
 
   /// Given a constant with the SectionKind, return a section that it should be
   /// placed in.
-  const MCSection *getSectionForConstant(SectionKind Kind) const override;
+  const MCSection *getSectionForConstant(SectionKind Kind,
+                                         const Constant *C) const override;
 
   const MCSection *getExplicitSectionGlobal(const GlobalValue *GV,
                                             SectionKind Kind, Mangler &Mang,
@@ -100,7 +101,8 @@ class TargetLoweringObjectFileMachO : public TargetLoweringObjectFile {
                                       SectionKind Kind, Mangler &Mang,
                                       const TargetMachine &TM) const override;
 
-  const MCSection *getSectionForConstant(SectionKind Kind) const override;
+  const MCSection *getSectionForConstant(SectionKind Kind,
+                                         const Constant *C) const override;
 
   /// The mach-o version of this method defaults to returning a stub reference.
   const MCExpr *
diff --git a/include/llvm/Config/config.h.cmake b/include/llvm/Config/config.h.cmake
index e9f6702f8180..1dfa6edb5268 100644
--- a/include/llvm/Config/config.h.cmake
+++ b/include/llvm/Config/config.h.cmake
@@ -179,9 +179,6 @@
 /* Define to 1 if you have the `shell32' library (-lshell32). */
 #cmakedefine HAVE_LIBSHELL32 ${HAVE_LIBSHELL32}
 
-/* Define to 1 if you have the `udis86' library (-ludis86). */
-#undef HAVE_LIBUDIS86
-
 /* Define to 1 if you have the 'z' library (-lz). */
 #cmakedefine HAVE_LIBZ ${HAVE_LIBZ}
 
@@ -518,9 +515,6 @@
 /* Define to 1 if your <sys/time.h> declares `struct tm'. */
 #undef TM_IN_SYS_TIME
 
-/* Define if use udis86 library */
-#undef USE_UDIS86
-
 /* Type of 1st arg on ELM Callback */
 #cmakedefine WIN32_ELMCB_PCSTR ${WIN32_ELMCB_PCSTR}
 
diff --git a/include/llvm/Config/config.h.in b/include/llvm/Config/config.h.in
index b5f72977c22d..102b11bc0b30 100644
--- a/include/llvm/Config/config.h.in
+++ b/include/llvm/Config/config.h.in
@@ -161,9 +161,6 @@
 /* Define to 1 if you have the `shell32' library (-lshell32). */
 #undef HAVE_LIBSHELL32
 
-/* Define to 1 if you have the `udis86' library (-ludis86). */
-#undef HAVE_LIBUDIS86
-
 /* Define to 1 if you have the `z' library (-lz). */
 #undef HAVE_LIBZ
 
@@ -487,9 +484,6 @@
 /* Define to 1 if your <sys/time.h> declares `struct tm'. */
 #undef TM_IN_SYS_TIME
 
-/* Define if use udis86 library */
-#undef USE_UDIS86
-
 /* Type of 1st arg on ELM Callback */
 #undef WIN32_ELMCB_PCSTR
 
diff --git a/include/llvm/ExecutionEngine/ObjectBuffer.h b/include/llvm/ExecutionEngine/ObjectBuffer.h
index 071a42b6b768..6221d3b335df 100644
--- a/include/llvm/ExecutionEngine/ObjectBuffer.h
+++ b/include/llvm/ExecutionEngine/ObjectBuffer.h
@@ -39,7 +39,8 @@ class ObjectBuffer {
   /// returns a pointer to an object that is owned by the caller. However,
   /// the caller does not take ownership of the underlying memory.
   MemoryBuffer *getMemBuffer() const {
-    return MemoryBuffer::getMemBuffer(Buffer->getBuffer(), "", false);
+    return MemoryBuffer::getMemBuffer(Buffer->getBuffer(),
+                                      Buffer->getBufferIdentifier(), false);
   }
 
   const char *getBufferStart() const { return Buffer->getBufferStart(); }
diff --git a/include/llvm/ExecutionEngine/RuntimeDyld.h b/include/llvm/ExecutionEngine/RuntimeDyld.h
index 30c0d49ade03..f123ffb803bd 100644
--- a/include/llvm/ExecutionEngine/RuntimeDyld.h
+++ b/include/llvm/ExecutionEngine/RuntimeDyld.h
@@ -29,6 +29,8 @@ class RuntimeDyldImpl;
 class ObjectImage;
 
 class RuntimeDyld {
+  friend class RuntimeDyldChecker;
+
   RuntimeDyld(const RuntimeDyld &) LLVM_DELETED_FUNCTION;
   void operator=(const RuntimeDyld &) LLVM_DELETED_FUNCTION;
diff --git a/include/llvm/ExecutionEngine/RuntimeDyldChecker.h b/include/llvm/ExecutionEngine/RuntimeDyldChecker.h
new file mode 100644
index 000000000000..8dd891e83648
--- /dev/null
+++ b/include/llvm/ExecutionEngine/RuntimeDyldChecker.h
@@ -0,0 +1,98 @@
+//===---- RuntimeDyldChecker.h - RuntimeDyld tester framework -----*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_RUNTIMEDYLDCHECKER_H
+#define LLVM_RUNTIMEDYLDCHECKER_H
+
+#include "RuntimeDyld.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include
+
+namespace llvm {
+
+class MCDisassembler;
+class MCInstPrinter;
+
+/// \brief RuntimeDyld invariant checker for verifying that RuntimeDyld has
+///        correctly applied relocations.
+///
+/// The RuntimeDyldChecker class evaluates expressions against an attached
+/// RuntimeDyld instance to verify that relocations have been applied
+/// correctly.
+///
+/// The expression language supports basic pointer arithmetic and bit-masking,
+/// and has limited disassembler integration for accessing instruction
+/// operands and the next PC (program counter) address for each instruction.
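+///
+/// As a sketch of how this is used (the symbol names here are invented for
+/// illustration), a test file line such as
+///
+///   # rtdyld-check: decode_operand(foo_call, 0) = foo
+///
+/// asserts that operand 0 of the instruction at label foo_call decodes to the
+/// address of the symbol foo once relocations have been applied; such lines
+/// are gathered by checkAllRulesInBuffer using a RulePrefix like
+/// "rtdyld-check:".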
+///
+/// The language syntax is:
+///
+///   check = expr '=' expr
+///
+///   expr = binary_expr
+///        | sliceable_expr
+///
+///   sliceable_expr = '*{' number '}' load_addr_expr [slice]
+///                  | '(' expr ')' [slice]
+///                  | ident_expr [slice]
+///                  | number [slice]
+///
+///   slice = '[' high-bit-index ':' low-bit-index ']'
+///
+///   load_addr_expr = symbol
+///                  | '(' symbol '+' number ')'
+///                  | '(' symbol '-' number ')'
+///
+///   ident_expr = 'decode_operand' '(' symbol ',' operand-index ')'
+///              | 'next_pc' '(' symbol ')'
+///              | symbol
+///
+///   binary_expr = expr '+' expr
+///               | expr '-' expr
+///               | expr '&' expr
+///               | expr '|' expr
+///               | expr '<<' expr
+///               | expr '>>' expr
+///
+class RuntimeDyldChecker {
+  friend class RuntimeDyldCheckerExprEval;
+public:
+  RuntimeDyldChecker(RuntimeDyld &RTDyld,
+                     MCDisassembler *Disassembler,
+                     MCInstPrinter *InstPrinter,
+                     llvm::raw_ostream &ErrStream)
+    : RTDyld(*RTDyld.Dyld), Disassembler(Disassembler),
+      InstPrinter(InstPrinter), ErrStream(ErrStream) {}
+
+  /// \brief Check a single expression against the attached RuntimeDyld
+  ///        instance.
+  bool check(StringRef CheckExpr) const;
+
+  /// \brief Scan the given memory buffer for lines beginning with the string
+  ///        in RulePrefix. The remainder of the line is passed to the check
+  ///        method to be evaluated as an expression.
+  bool checkAllRulesInBuffer(StringRef RulePrefix, MemoryBuffer *MemBuf) const;
+
+private:
+
+  bool isSymbolValid(StringRef Symbol) const;
+  uint64_t getSymbolAddress(StringRef Symbol) const;
+  uint64_t readMemoryAtSymbol(StringRef Symbol, int64_t Offset,
+                              unsigned Size) const;
+  StringRef getSubsectionStartingAt(StringRef Name) const;
+
+  RuntimeDyldImpl &RTDyld;
+  MCDisassembler *Disassembler;
+  MCInstPrinter *InstPrinter;
+  llvm::raw_ostream &ErrStream;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_RUNTIMEDYLDCHECKER_H
diff --git a/include/llvm/IR/Argument.h b/include/llvm/IR/Argument.h
index 3a63e1a1eaa2..7c398a5e5530 100644
--- a/include/llvm/IR/Argument.h
+++ b/include/llvm/IR/Argument.h
@@ -56,9 +56,15 @@ class Argument : public Value, public ilist_node<Argument> {
   unsigned getArgNo() const;
 
   /// \brief Return true if this argument has the nonnull attribute on it in
-  /// its containing function.
+  /// its containing function. Also returns true if at least one byte is known
+  /// to be dereferenceable and the pointer is in addrspace(0).
   bool hasNonNullAttr() const;
 
+  /// \brief If this argument has the dereferenceable attribute on it in its
+  /// containing function, return the number of bytes known to be
+  /// dereferenceable. Otherwise, zero is returned.
+  uint64_t getDereferenceableBytes() const;
+
   /// \brief Return true if this argument has the byval attribute on it in its
   /// containing function.
   bool hasByValAttr() const;
diff --git a/include/llvm/IR/Attributes.h b/include/llvm/IR/Attributes.h
index e34dc83a5ba3..5ff48d688918 100644
--- a/include/llvm/IR/Attributes.h
+++ b/include/llvm/IR/Attributes.h
@@ -88,6 +88,7 @@ class Attribute {
     NonLazyBind,           ///< Function is called early and/or
                            ///< often, so lazy binding isn't worthwhile
     NonNull,               ///< Pointer is known to be not null
+    Dereferenceable,       ///< Pointer is known to be dereferenceable
     NoRedZone,             ///< Disable redzone
     NoReturn,              ///< Mark the function as not returning
     NoUnwind,              ///< Function doesn't unwind stack
@@ -133,6 +134,8 @@ class Attribute {
   /// alignment set.
   static Attribute getWithAlignment(LLVMContext &Context, uint64_t Align);
   static Attribute getWithStackAlignment(LLVMContext &Context, uint64_t Align);
+  static Attribute getWithDereferenceableBytes(LLVMContext &Context,
+                                               uint64_t Bytes);
 
   //===--------------------------------------------------------------------===//
   // Attribute Accessors
@@ -141,8 +144,8 @@ class Attribute {
   /// \brief Return true if the attribute is an Attribute::AttrKind type.
   bool isEnumAttribute() const;
 
-  /// \brief Return true if the attribute is an alignment attribute.
-  bool isAlignAttribute() const;
+  /// \brief Return true if the attribute is an integer attribute.
+  bool isIntAttribute() const;
 
   /// \brief Return true if the attribute is a string (target-dependent)
   /// attribute.
@@ -178,6 +181,10 @@ class Attribute {
   /// alignment value.
   unsigned getStackAlignment() const;
 
+  /// \brief Returns the number of dereferenceable bytes from the
+  /// dereferenceable attribute (or zero if unknown).
+  uint64_t getDereferenceableBytes() const;
+
   /// \brief The Attribute is converted to a string of equivalent mnemonic. This
   /// is, presumably, for writing out the mnemonics for the assembly writer.
   std::string getAsString(bool InAttrGrp = false) const;
@@ -316,6 +323,9 @@ class AttributeSet {
   /// \brief Get the stack alignment.
   unsigned getStackAlignment(unsigned Index) const;
 
+  /// \brief Get the number of dereferenceable bytes (or zero if unknown).
+  uint64_t getDereferenceableBytes(unsigned Index) const;
+
   /// \brief Return the attributes at the index as a string.
   std::string getAsString(unsigned Index, bool InAttrGrp = false) const;
 
@@ -395,13 +405,15 @@ class AttrBuilder {
   std::map<std::string, std::string> TargetDepAttrs;
   uint64_t Alignment;
   uint64_t StackAlignment;
+  uint64_t DerefBytes;
 
 public:
-  AttrBuilder() : Attrs(0), Alignment(0), StackAlignment(0) {}
+  AttrBuilder() : Attrs(0), Alignment(0), StackAlignment(0), DerefBytes(0) {}
   explicit AttrBuilder(uint64_t Val)
-    : Attrs(0), Alignment(0), StackAlignment(0) {
+    : Attrs(0), Alignment(0), StackAlignment(0), DerefBytes(0) {
     addRawValue(Val);
   }
-  AttrBuilder(const Attribute &A) : Attrs(0), Alignment(0), StackAlignment(0) {
+  AttrBuilder(const Attribute &A)
+    : Attrs(0), Alignment(0), StackAlignment(0), DerefBytes(0) {
     addAttribute(A);
   }
   AttrBuilder(AttributeSet AS, unsigned Idx);
@@ -455,6 +467,10 @@ class AttrBuilder {
   /// \brief Retrieve the stack alignment attribute, if it exists.
   uint64_t getStackAlignment() const { return StackAlignment; }
 
+  /// \brief Retrieve the number of dereferenceable bytes, if the
+  /// dereferenceable attribute exists (zero is returned otherwise).
+  uint64_t getDereferenceableBytes() const { return DerefBytes; }
+
   /// \brief This turns an int alignment (which must be a power of 2) into the
   /// form used internally in Attribute.
   AttrBuilder &addAlignmentAttr(unsigned Align);
@@ -463,6 +479,10 @@ class AttrBuilder {
   /// the form used internally in Attribute.
   AttrBuilder &addStackAlignmentAttr(unsigned Align);
 
+  /// \brief This turns the number of dereferenceable bytes into the form used
+  /// internally in Attribute.
+  AttrBuilder &addDereferenceableAttr(uint64_t Bytes);
+
   /// \brief Return true if the builder contains no target-independent
   /// attributes.
   bool empty() const { return Attrs.none(); }
diff --git a/include/llvm/IR/AutoUpgrade.h b/include/llvm/IR/AutoUpgrade.h
index 076ed4acff0a..a4b3c410c4f6 100644
--- a/include/llvm/IR/AutoUpgrade.h
+++ b/include/llvm/IR/AutoUpgrade.h
@@ -14,6 +14,8 @@
 #ifndef LLVM_IR_AUTOUPGRADE_H
 #define LLVM_IR_AUTOUPGRADE_H
 
+#include <string>
+
 namespace llvm {
   class CallInst;
   class Constant;
@@ -61,6 +63,9 @@ namespace llvm {
   /// Check the debug info version number, if it is out-dated, drop the debug
   /// info. Return true if module is modified.
   bool UpgradeDebugInfo(Module &M);
+
+  /// Upgrade a metadata string constant in place.
+  void UpgradeMDStringConstant(std::string &String);
 } // End llvm namespace
 
 #endif
diff --git a/include/llvm/IR/CallSite.h b/include/llvm/IR/CallSite.h
index deea4151dddc..df082577a0e2 100644
--- a/include/llvm/IR/CallSite.h
+++ b/include/llvm/IR/CallSite.h
@@ -217,6 +217,12 @@ class CallSiteBase {
     CALLSITE_DELEGATE_GETTER(getParamAlignment(i));
   }
 
+  /// @brief Extract the number of dereferenceable bytes for a call or
+  /// parameter (0=unknown).
+  uint64_t getDereferenceableBytes(uint16_t i) const {
+    CALLSITE_DELEGATE_GETTER(getDereferenceableBytes(i));
+  }
+
   /// \brief Return true if the call should not be treated as a call to a
   /// builtin.
   bool isNoBuiltin() const {
@@ -302,6 +308,19 @@ class CallSiteBase {
            paramHasAttr(ArgNo + 1, Attribute::ReadNone);
   }
 
+  /// @brief Return true if the return value is known to be not null.
+  /// This may be because it has the nonnull attribute, or because at least
+  /// one byte is dereferenceable and the pointer is in addrspace(0).
+  bool isReturnNonNull() const {
+    if (paramHasAttr(0, Attribute::NonNull))
+      return true;
+    else if (getDereferenceableBytes(0) > 0 &&
+             getType()->getPointerAddressSpace() == 0)
+      return true;
+
+    return false;
+  }
+
   /// hasArgument - Returns true if this CallSite passes the given Value* as an
   /// argument to the called function.
   bool hasArgument(const Value *Arg) const {
diff --git a/include/llvm/IR/Comdat.h b/include/llvm/IR/Comdat.h
new file mode 100644
index 000000000000..3e77a7709d5c
--- /dev/null
+++ b/include/llvm/IR/Comdat.h
@@ -0,0 +1,66 @@
+//===-- llvm/IR/Comdat.h - Comdat definitions -------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// @file
+/// This file contains the declaration of the Comdat class, which represents a
+/// single COMDAT in LLVM.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_IR_COMDAT_H
+#define LLVM_IR_COMDAT_H
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+
+class raw_ostream;
+template <typename ValueTy> class StringMapEntry;
+
+// This is a Name X SelectionKind pair. The reason for having this be an
+// independent object instead of just adding the name and the SelectionKind
+// to a GlobalObject is that it is invalid to have two Comdats with the same
+// name but different SelectionKind. This structure makes that unrepresentable.
+class Comdat {
+public:
+  enum SelectionKind {
+    Any,          ///< The linker may choose any COMDAT.
+    ExactMatch,   ///< The data referenced by the COMDAT must be the same.
+    Largest,      ///< The linker will choose the largest COMDAT.
+    NoDuplicates, ///< No other Module may specify this COMDAT.
+    SameSize,     ///< The data referenced by the COMDAT must be the same size.
+  };
+
+  Comdat(Comdat &&C);
+  SelectionKind getSelectionKind() const { return SK; }
+  void setSelectionKind(SelectionKind Val) { SK = Val; }
+  StringRef getName() const;
+  void print(raw_ostream &OS) const;
+  void dump() const;
+
+private:
+  friend class Module;
+  Comdat();
+  Comdat(SelectionKind SK, StringMapEntry<Comdat> *Name);
+  Comdat(const Comdat &) LLVM_DELETED_FUNCTION;
+
+  // Points to the map in Module.
+  StringMapEntry<Comdat> *Name;
+  SelectionKind SK;
+};
+
+inline raw_ostream &operator<<(raw_ostream &OS, const Comdat &C) {
+  C.print(OS);
+  return OS;
+}
+
+} // end llvm namespace
+
+#endif
diff --git a/include/llvm/IR/Constant.h b/include/llvm/IR/Constant.h
index 39c7c37dafd5..82ad9fc2f407 100644
--- a/include/llvm/IR/Constant.h
+++ b/include/llvm/IR/Constant.h
@@ -64,6 +64,9 @@ class Constant : public User {
   /// Return true if the value is negative zero or null value.
   bool isZeroValue() const;
 
+  /// \brief Return true if the value is the smallest signed value.
+  bool isMinSignedValue() const;
+
   /// canTrap - Return true if evaluation of this constant could trap. This is
   /// true for things like constant expressions that could divide by zero.
   bool canTrap() const;
@@ -71,6 +74,9 @@ class Constant : public User {
   /// isThreadDependent - Return true if the value can vary between threads.
   bool isThreadDependent() const;
 
+  /// Return true if the value is dependent on a dllimport variable.
+  bool isDLLImportDependent() const;
+
   /// isConstantUsed - Return true if the constant has users other than constant
   /// exprs and other dangling things.
   bool isConstantUsed() const;
diff --git a/include/llvm/IR/ConstantFolder.h b/include/llvm/IR/ConstantFolder.h
index 86668f7e7dfa..e271a1482117 100644
--- a/include/llvm/IR/ConstantFolder.h
+++ b/include/llvm/IR/ConstantFolder.h
@@ -159,6 +159,12 @@ class ConstantFolder {
   Constant *CreatePointerCast(Constant *C, Type *DestTy) const {
     return ConstantExpr::getPointerCast(C, DestTy);
   }
+
+  Constant *CreatePointerBitCastOrAddrSpaceCast(Constant *C,
+                                                Type *DestTy) const {
+    return ConstantExpr::getPointerBitCastOrAddrSpaceCast(C, DestTy);
+  }
+
   Constant *CreateIntCast(Constant *C, Type *DestTy,
                           bool isSigned) const {
     return ConstantExpr::getIntegerCast(C, DestTy, isSigned);
diff --git a/include/llvm/IR/DIBuilder.h b/include/llvm/IR/DIBuilder.h
index 8b05bbb4fdb7..267350409604 100644
--- a/include/llvm/IR/DIBuilder.h
+++ b/include/llvm/IR/DIBuilder.h
@@ -108,12 +108,23 @@ namespace llvm {
    ///                 Objective-C.
    /// @param SplitName The name of the file that we'll split debug info out
    ///                  into.
+    /// @param Kind     The kind of debug information to generate.
+    /// @param EmitDebugInfo   A boolean flag which indicates whether debug
+    ///                        information should be written to the final
+    ///                        output or not. When this is false, debug
+    ///                        information annotations will be present in
+    ///                        the IL but they are not written to the final
+    ///                        assembly or object file. This supports tracking
+    ///                        source location information in the back end
+    ///                        without actually changing the output (e.g.,
+    ///                        when using optimization remarks).
    DICompileUnit createCompileUnit(unsigned Lang, StringRef File,
                                    StringRef Dir, StringRef Producer,
                                    bool isOptimized, StringRef Flags,
                                    unsigned RV,
                                    StringRef SplitName = StringRef(),
-                                    DebugEmissionKind Kind = FullDebug);
+                                    DebugEmissionKind Kind = FullDebug,
+                                    bool EmitDebugInfo = true);
 
    /// createFile - Create a file descriptor to hold debugging information
    /// for a file.
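A minimal sketch of how a front end might drive the new EmitDebugInfo flag
(the file name, directory, and producer strings below are placeholders, not
part of this patch): debug info annotations stay in the IL, where they can
feed optimization remarks, but are not emitted into the final object.

    #include "llvm/IR/DIBuilder.h"
    #include "llvm/IR/Module.h"
    #include "llvm/Support/Dwarf.h"

    // Attach a compile unit whose debug info is tracked but not emitted.
    void attachCU(llvm::Module &M) {
      llvm::DIBuilder DIB(M);
      DIB.createCompileUnit(llvm::dwarf::DW_LANG_C99, "a.c", "/tmp", "producer",
                            /*isOptimized=*/true, /*Flags=*/"", /*RV=*/0,
                            /*SplitName=*/llvm::StringRef(),
                            llvm::DIBuilder::FullDebug,
                            /*EmitDebugInfo=*/false);
      DIB.finalize();
    }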
diff --git a/include/llvm/IR/DataLayout.h b/include/llvm/IR/DataLayout.h
index 3079dec7835b..877029f92f0b 100644
--- a/include/llvm/IR/DataLayout.h
+++ b/include/llvm/IR/DataLayout.h
@@ -414,8 +414,8 @@ class DataLayout {
     return (LargestSize == 0) ? nullptr : Type::getIntNTy(C, LargestSize);
   }
 
-  /// getLargestLegalIntType - Return the size of largest legal integer type
-  /// size, or 0 if none are set.
+  /// getLargestLegalIntTypeSize - Return the size of the largest legal
+  /// integer type, or 0 if none is set.
   unsigned getLargestLegalIntTypeSize() const;
 
   /// getIndexedOffset - return the offset from the beginning of the type for
diff --git a/include/llvm/IR/DebugInfo.h b/include/llvm/IR/DebugInfo.h
index 65e0a0625f70..088eb9f01049 100644
--- a/include/llvm/IR/DebugInfo.h
+++ b/include/llvm/IR/DebugInfo.h
@@ -690,12 +690,17 @@ class DIVariable : public DIDescriptor {
   /// HasComplexAddr - Return true if the variable has a complex address.
   bool hasComplexAddress() const { return getNumAddrElements() > 0; }
 
-  unsigned getNumAddrElements() const;
-
-  uint64_t getAddrElement(unsigned Idx) const {
-    return getUInt64Field(Idx + 8);
+  /// \brief Return the size of this variable's complex address or
+  /// zero if there is none.
+  unsigned getNumAddrElements() const {
+    if (DbgNode->getNumOperands() < 9)
+      return 0;
+    return getDescriptorField(8)->getNumOperands();
   }
 
+  /// \brief return the Idx'th complex address element.
+  uint64_t getAddrElement(unsigned Idx) const;
+
   /// isBlockByrefVariable - Return true if the variable was declared as
   /// a "__block" variable (Apple Blocks).
   bool isBlockByrefVariable(const DITypeIdentifierMap &Map) const {
@@ -929,6 +934,9 @@ class DebugInfoFinder {
   /// Specify if TypeIdentifierMap is initialized.
   bool TypeMapInitialized;
 };
+
+DenseMap<const Function *, DISubprogram> makeSubprogramMap(const Module &M);
+
 } // end namespace llvm
 
 #endif
diff --git a/include/llvm/IR/DiagnosticInfo.h b/include/llvm/IR/DiagnosticInfo.h
index de38d07f903c..9c9f236180ce 100644
--- a/include/llvm/IR/DiagnosticInfo.h
+++ b/include/llvm/IR/DiagnosticInfo.h
@@ -51,6 +51,7 @@ enum DiagnosticKind {
   DK_OptimizationRemark,
   DK_OptimizationRemarkMissed,
   DK_OptimizationRemarkAnalysis,
+  DK_OptimizationFailure,
   DK_FirstPluginKind
 };
 
@@ -239,7 +240,7 @@ class DiagnosticInfoSampleProfile : public DiagnosticInfo {
 };
 
 /// Common features for diagnostics dealing with optimization remarks.
-class DiagnosticInfoOptimizationRemarkBase : public DiagnosticInfo {
+class DiagnosticInfoOptimizationBase : public DiagnosticInfo {
 public:
   /// \p PassName is the name of the pass emitting this diagnostic.
   /// \p Fn is the function where the diagnostic is being emitted. \p DLoc is
   /// the location information to use in the diagnostic. If line table
   /// information is available, the diagnostic will include the source code
   /// location. \p Msg is the message to show. Note that this class does not
   /// copy this message, so this reference must be valid for the whole life time
   /// of the diagnostic.
-  DiagnosticInfoOptimizationRemarkBase(enum DiagnosticKind Kind,
-                                       const char *PassName, const Function &Fn,
-                                       const DebugLoc &DLoc, const Twine &Msg)
-      : DiagnosticInfo(Kind, DS_Remark), PassName(PassName), Fn(Fn), DLoc(DLoc),
+  DiagnosticInfoOptimizationBase(enum DiagnosticKind Kind,
+                                 enum DiagnosticSeverity Severity,
+                                 const char *PassName, const Function &Fn,
+                                 const DebugLoc &DLoc, const Twine &Msg)
+      : DiagnosticInfo(Kind, Severity), PassName(PassName), Fn(Fn), DLoc(DLoc),
         Msg(Msg) {}
 
   /// \see DiagnosticInfo::print.
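A hedged sketch of how a pass might feed this diagnostic machinery (the pass
name and message are invented; emitOptimizationRemark is the helper declared
further down in this header):

    #include "llvm/IR/DiagnosticInfo.h"
    #include "llvm/IR/Function.h"

    // Report an applied-optimization remark for F; surfaced to users who
    // compile with -Rpass=my-pass.
    void reportUnrollRemark(llvm::Function &F, const llvm::DebugLoc &DL) {
      llvm::emitOptimizationRemark(F.getContext(), "my-pass", F, DL,
                                   "loop unrolled by a factor of 4");
    }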
@@ -302,8 +304,7 @@ class DiagnosticInfoOptimizationRemarkBase : public DiagnosticInfo {
 };
 
 /// Diagnostic information for applied optimization remarks.
-class DiagnosticInfoOptimizationRemark
-    : public DiagnosticInfoOptimizationRemarkBase {
+class DiagnosticInfoOptimizationRemark : public DiagnosticInfoOptimizationBase {
 public:
   /// \p PassName is the name of the pass emitting this diagnostic. If
   /// this name matches the regular expression given in -Rpass=, then the
@@ -315,20 +316,20 @@ class DiagnosticInfoOptimizationRemark
   /// must be valid for the whole life time of the diagnostic.
   DiagnosticInfoOptimizationRemark(const char *PassName, const Function &Fn,
                                    const DebugLoc &DLoc, const Twine &Msg)
-      : DiagnosticInfoOptimizationRemarkBase(DK_OptimizationRemark, PassName,
-                                             Fn, DLoc, Msg) {}
+      : DiagnosticInfoOptimizationBase(DK_OptimizationRemark, DS_Remark,
+                                       PassName, Fn, DLoc, Msg) {}
 
   static bool classof(const DiagnosticInfo *DI) {
     return DI->getKind() == DK_OptimizationRemark;
   }
 
-  /// \see DiagnosticInfoOptimizationRemarkBase::isEnabled.
+  /// \see DiagnosticInfoOptimizationBase::isEnabled.
   virtual bool isEnabled() const override;
 };
 
 /// Diagnostic information for missed-optimization remarks.
 class DiagnosticInfoOptimizationRemarkMissed
-    : public DiagnosticInfoOptimizationRemarkBase {
+    : public DiagnosticInfoOptimizationBase {
 public:
   /// \p PassName is the name of the pass emitting this diagnostic. If
   /// this name matches the regular expression given in -Rpass-missed=, then the
@@ -341,20 +342,20 @@ class DiagnosticInfoOptimizationRemarkMissed
   DiagnosticInfoOptimizationRemarkMissed(const char *PassName,
                                          const Function &Fn,
                                          const DebugLoc &DLoc, const Twine &Msg)
-      : DiagnosticInfoOptimizationRemarkBase(DK_OptimizationRemarkMissed,
-                                             PassName, Fn, DLoc, Msg) {}
+      : DiagnosticInfoOptimizationBase(DK_OptimizationRemarkMissed, DS_Remark,
+                                       PassName, Fn, DLoc, Msg) {}
 
   static bool classof(const DiagnosticInfo *DI) {
     return DI->getKind() == DK_OptimizationRemarkMissed;
   }
 
-  /// \see DiagnosticInfoOptimizationRemarkBase::isEnabled.
+  /// \see DiagnosticInfoOptimizationBase::isEnabled.
   virtual bool isEnabled() const override;
 };
 
 /// Diagnostic information for optimization analysis remarks.
 class DiagnosticInfoOptimizationRemarkAnalysis
-    : public DiagnosticInfoOptimizationRemarkBase {
+    : public DiagnosticInfoOptimizationBase {
 public:
   /// \p PassName is the name of the pass emitting this diagnostic. If
   /// this name matches the regular expression given in -Rpass-analysis=, then
@@ -368,14 +369,14 @@ class DiagnosticInfoOptimizationRemarkAnalysis
                                            const Function &Fn,
                                            const DebugLoc &DLoc,
                                            const Twine &Msg)
-      : DiagnosticInfoOptimizationRemarkBase(DK_OptimizationRemarkAnalysis,
-                                             PassName, Fn, DLoc, Msg) {}
+      : DiagnosticInfoOptimizationBase(DK_OptimizationRemarkAnalysis, DS_Remark,
+                                       PassName, Fn, DLoc, Msg) {}
 
   static bool classof(const DiagnosticInfo *DI) {
     return DI->getKind() == DK_OptimizationRemarkAnalysis;
   }
 
-  /// \see DiagnosticInfoOptimizationRemarkBase::isEnabled.
+  /// \see DiagnosticInfoOptimizationBase::isEnabled.
   virtual bool isEnabled() const override;
 };
 
@@ -411,6 +412,41 @@ void emitOptimizationRemarkAnalysis(LLVMContext &Ctx, const char *PassName,
                                     const Function &Fn, const DebugLoc &DLoc,
                                     const Twine &Msg);
 
+/// Diagnostic information for optimization failures.
+class DiagnosticInfoOptimizationFailure
+    : public DiagnosticInfoOptimizationBase {
+public:
+  /// \p Fn is the function where the diagnostic is being emitted.
+  /// \p DLoc is the location information to use in the diagnostic. If line
+  /// table information is available, the diagnostic will include the source
+  /// code location. \p Msg is the message to show. Note that this class does
+  /// not copy this message, so this reference must be valid for the whole
+  /// lifetime of the diagnostic.
+  DiagnosticInfoOptimizationFailure(const Function &Fn, const DebugLoc &DLoc,
+                                    const Twine &Msg)
+      : DiagnosticInfoOptimizationBase(DK_OptimizationFailure, DS_Warning,
+                                       nullptr, Fn, DLoc, Msg) {}
+
+  static bool classof(const DiagnosticInfo *DI) {
+    return DI->getKind() == DK_OptimizationFailure;
+  }
+
+  /// \see DiagnosticInfoOptimizationBase::isEnabled.
+  virtual bool isEnabled() const override;
+};
+
+/// Emit a warning when loop vectorization is specified but fails. \p Fn is the
+/// function triggering the warning, \p DLoc is the debug location where the
+/// diagnostic is generated. \p Msg is the message string to use.
+void emitLoopVectorizeWarning(LLVMContext &Ctx, const Function &Fn,
+                              const DebugLoc &DLoc, const Twine &Msg);
+
+/// Emit a warning when loop interleaving is specified but fails. \p Fn is the
+/// function triggering the warning, \p DLoc is the debug location where the
+/// diagnostic is generated. \p Msg is the message string to use.
+void emitLoopInterleaveWarning(LLVMContext &Ctx, const Function &Fn,
+                               const DebugLoc &DLoc, const Twine &Msg);
+
 } // End namespace llvm
 
 #endif
diff --git a/include/llvm/IR/Dominators.h b/include/llvm/IR/Dominators.h
index 364820275dd4..e2d1ccc8a3ca 100644
--- a/include/llvm/IR/Dominators.h
+++ b/include/llvm/IR/Dominators.h
@@ -97,10 +97,6 @@ class DominatorTree : public DominatorTreeBase<BasicBlock> {
   bool dominates(const BasicBlockEdge &BBE, const Use &U) const;
   bool dominates(const BasicBlockEdge &BBE, const BasicBlock *BB) const;
 
-  inline DomTreeNode *operator[](BasicBlock *BB) const {
-    return getNode(BB);
-  }
-
   // Ensure base class overloads are visible.
   using Base::isReachableFromEntry;
 
diff --git a/include/llvm/IR/Function.h b/include/llvm/IR/Function.h
index 22444bd3000e..ad4b1395f0cb 100644
--- a/include/llvm/IR/Function.h
+++ b/include/llvm/IR/Function.h
@@ -233,6 +233,12 @@ class Function : public GlobalObject, public ilist_node<Function> {
     return AttributeSets.getParamAlignment(i);
   }
 
+  /// @brief Extract the number of dereferenceable bytes for a call or
+  /// parameter (0=unknown).
+  uint64_t getDereferenceableBytes(unsigned i) const {
+    return AttributeSets.getDereferenceableBytes(i);
+  }
+
   /// @brief Determine if the function does not access memory.
   bool doesNotAccessMemory() const {
     return AttributeSets.hasAttribute(AttributeSet::FunctionIndex,
diff --git a/include/llvm/IR/GVMaterializer.h b/include/llvm/IR/GVMaterializer.h
index 4afdbb05f854..a1216a174282 100644
--- a/include/llvm/IR/GVMaterializer.h
+++ b/include/llvm/IR/GVMaterializer.h
@@ -54,6 +54,8 @@ class GVMaterializer {
   /// Make sure the entire Module has been completely read.
   ///
   virtual std::error_code MaterializeModule(Module *M) = 0;
+
+  virtual void releaseBuffer() = 0;
 };
 
 } // End llvm namespace
diff --git a/include/llvm/IR/GlobalAlias.h b/include/llvm/IR/GlobalAlias.h
index a77d1630f428..075b5703bccf 100644
--- a/include/llvm/IR/GlobalAlias.h
+++ b/include/llvm/IR/GlobalAlias.h
@@ -87,6 +87,21 @@ class GlobalAlias : public GlobalValue, public ilist_node<GlobalAlias> {
     return getOperand(0);
   }
 
+  const GlobalObject *getBaseObject() const {
+    return const_cast<GlobalAlias *>(this)->getBaseObject();
+  }
+  GlobalObject *getBaseObject() {
+    return dyn_cast<GlobalObject>(getAliasee()->stripInBoundsOffsets());
+  }
+
+  const GlobalObject *getBaseObject(const DataLayout &DL, APInt &Offset) const {
+    return const_cast<GlobalAlias *>(this)->getBaseObject(DL, Offset);
+  }
+  GlobalObject *getBaseObject(const DataLayout &DL, APInt &Offset) {
+    return dyn_cast<GlobalObject>(
+        getAliasee()->stripAndAccumulateInBoundsConstantOffsets(DL, Offset));
+  }
+
   static bool isValidLinkage(LinkageTypes L) {
     return isExternalLinkage(L) || isLocalLinkage(L) ||
            isWeakLinkage(L) || isLinkOnceLinkage(L);
diff --git a/include/llvm/IR/GlobalObject.h b/include/llvm/IR/GlobalObject.h
index 74cc18eeb1e7..2e042f489749 100644
--- a/include/llvm/IR/GlobalObject.h
+++ b/include/llvm/IR/GlobalObject.h
@@ -20,7 +20,7 @@
 #include "llvm/IR/GlobalValue.h"
 
 namespace llvm {
-
+class Comdat;
 class Module;
 
 class GlobalObject : public GlobalValue {
@@ -29,11 +29,12 @@ class GlobalObject : public GlobalValue {
 protected:
   GlobalObject(Type *Ty, ValueTy VTy, Use *Ops, unsigned NumOps,
               LinkageTypes Linkage, const Twine &Name)
-      : GlobalValue(Ty, VTy, Ops, NumOps, Linkage, Name) {
+      : GlobalValue(Ty, VTy, Ops, NumOps, Linkage, Name), ObjComdat(nullptr) {
    setGlobalValueSubClassData(0);
  }
 
   std::string Section;     // Section to emit this into, empty means default
+  Comdat *ObjComdat;
 public:
   unsigned getAlignment() const {
     return (1u << getGlobalValueSubClassData()) >> 1;
@@ -44,6 +45,11 @@ class GlobalObject : public GlobalValue {
   const char *getSection() const { return Section.c_str(); }
   void setSection(StringRef S);
 
+  bool hasComdat() const { return getComdat() != nullptr; }
+  const Comdat *getComdat() const { return ObjComdat; }
+  Comdat *getComdat() { return ObjComdat; }
+  void setComdat(Comdat *C) { ObjComdat = C; }
+
   void copyAttributesFrom(const GlobalValue *Src) override;
 
   // Methods for support type inquiry through isa, cast, and dyn_cast:
diff --git a/include/llvm/IR/GlobalValue.h b/include/llvm/IR/GlobalValue.h
index 5e99886a5fb2..68e410ba4b8b 100644
--- a/include/llvm/IR/GlobalValue.h
+++ b/include/llvm/IR/GlobalValue.h
@@ -23,6 +23,7 @@
 
 namespace llvm {
 
+class Comdat;
 class PointerType;
 class Module;
 
@@ -110,6 +111,12 @@ class GlobalValue : public Constant {
   bool hasUnnamedAddr() const { return UnnamedAddr; }
   void setUnnamedAddr(bool Val) { UnnamedAddr = Val; }
 
+  bool hasComdat() const { return getComdat() != nullptr; }
+  Comdat *getComdat();
+  const Comdat *getComdat() const {
+    return const_cast<GlobalValue *>(this)->getComdat();
+  }
+
   VisibilityTypes getVisibility() const { return VisibilityTypes(Visibility); }
   bool hasDefaultVisibility() const { return Visibility == DefaultVisibility; }
   bool hasHiddenVisibility() const { return Visibility == HiddenVisibility; }
@@ -173,6 +180,9 @@ class GlobalValue : public Constant {
   static bool isAvailableExternallyLinkage(LinkageTypes Linkage) {
     return Linkage == AvailableExternallyLinkage;
   }
+  static bool isLinkOnceODRLinkage(LinkageTypes Linkage) {
+    return Linkage == LinkOnceODRLinkage;
+  }
   static bool isLinkOnceLinkage(LinkageTypes Linkage) {
     return Linkage == LinkOnceAnyLinkage || Linkage == LinkOnceODRLinkage;
   }
diff --git a/include/llvm/IR/IRBuilder.h b/include/llvm/IR/IRBuilder.h
index cedb87cdb7cc..aed2463d42d8 100644
--- a/include/llvm/IR/IRBuilder.h
+++ b/include/llvm/IR/IRBuilder.h
@@ -327,6 +327,11 @@ class IRBuilderBase {
     return Type::getIntNTy(Context, N);
   }
 
+  /// \brief Fetch the type representing a 16-bit floating point value.
+  Type *getHalfTy() {
+    return Type::getHalfTy(Context);
+  }
+
   /// \brief Fetch the type representing a 32-bit floating point value.
   Type *getFloatTy() {
     return Type::getFloatTy(Context);
@@ -1198,6 +1203,21 @@ class IRBuilder : public IRBuilderBase, public Inserter {
       return Insert(Folder.CreatePointerCast(VC, DestTy), Name);
     return Insert(CastInst::CreatePointerCast(V, DestTy), Name);
   }
+
+  Value *CreatePointerBitCastOrAddrSpaceCast(Value *V, Type *DestTy,
+                                             const Twine &Name = "") {
+    if (V->getType() == DestTy)
+      return V;
+
+    if (Constant *VC = dyn_cast<Constant>(V)) {
+      return Insert(Folder.CreatePointerBitCastOrAddrSpaceCast(VC, DestTy),
+                    Name);
+    }
+
+    return Insert(CastInst::CreatePointerBitCastOrAddrSpaceCast(V, DestTy),
+                  Name);
+  }
+
   Value *CreateIntCast(Value *V, Type *DestTy, bool isSigned,
                        const Twine &Name = "") {
     if (V->getType() == DestTy)
diff --git a/include/llvm/IR/InstrTypes.h b/include/llvm/IR/InstrTypes.h
index a27859edb59f..981aad852b29 100644
--- a/include/llvm/IR/InstrTypes.h
+++ b/include/llvm/IR/InstrTypes.h
@@ -459,7 +459,7 @@ class CastInst : public UnaryInstruction {
     BasicBlock *InsertAtEnd ///< The block to insert the instruction into
   );
 
-  /// @brief Create a BitCast or a PtrToInt cast instruction
+  /// @brief Create a BitCast, AddrSpaceCast, or a PtrToInt cast instruction.
   static CastInst *CreatePointerCast(
     Value *S,                ///< The pointer value to be casted (operand 0)
     Type *Ty,                ///< The type to which operand is casted
     const Twine &Name,       ///< The name for the instruction
     BasicBlock *InsertAtEnd  ///< The block to insert the instruction into
   );
 
-  /// @brief Create a BitCast or a PtrToInt cast instruction
+  /// @brief Create a BitCast, AddrSpaceCast, or a PtrToInt cast instruction.
   static CastInst *CreatePointerCast(
     Value *S,                ///< The pointer value to be casted (operand 0)
     Type *Ty,                ///< The type to which cast should be made
     const Twine &Name = "",  ///< Name for the instruction
     Instruction *InsertBefore = nullptr ///< Place to insert the instruction
   );
 
@@ -475,6 +475,22 @@ class CastInst : public UnaryInstruction {
+  /// @brief Create a BitCast or an AddrSpaceCast cast instruction.
+  static CastInst *CreatePointerBitCastOrAddrSpaceCast(
+    Value *S,                ///< The pointer value to be casted (operand 0)
+    Type *Ty,                ///< The type to which operand is casted
+    const Twine &Name,       ///< The name for the instruction
+    BasicBlock *InsertAtEnd  ///< The block to insert the instruction into
+  );
+
+  /// @brief Create a BitCast or an AddrSpaceCast cast instruction.
+  static CastInst *CreatePointerBitCastOrAddrSpaceCast(
+    Value *S,                ///< The pointer value to be casted (operand 0)
+    Type *Ty,                ///< The type to which cast should be made
+    const Twine &Name = "",  ///< Name for the instruction
+    Instruction *InsertBefore = 0 ///< Place to insert the instruction
+  );
+
   /// @brief Create a ZExt, BitCast, or Trunc for int -> int casts.
   static CastInst *CreateIntegerCast(
     Value *S,                ///< The pointer value to be casted (operand 0)
diff --git a/include/llvm/IR/Instructions.h b/include/llvm/IR/Instructions.h
index e0c829ac985a..308467f7aa17 100644
--- a/include/llvm/IR/Instructions.h
+++ b/include/llvm/IR/Instructions.h
@@ -1376,6 +1376,12 @@ class CallInst : public Instruction {
     return AttributeList.getParamAlignment(i);
   }
 
+  /// \brief Extract the number of dereferenceable bytes for a call or
+  /// parameter (0=unknown).
+  uint64_t getDereferenceableBytes(unsigned i) const {
+    return AttributeList.getDereferenceableBytes(i);
+  }
+
   /// \brief Return true if the call should not be treated as a call to a
   /// builtin.
   bool isNoBuiltin() const {
@@ -2661,6 +2667,9 @@ class SwitchInst : public TerminatorInst {
       assert(RHS.SI == SI && "Incompatible operators.");
       return RHS.Index != Index;
     }
+    Self &operator*() {
+      return *this;
+    }
   };
 
   typedef CaseIteratorT<SwitchInst, ConstantInt, BasicBlock>
@@ -2741,6 +2750,17 @@ class SwitchInst : public TerminatorInst {
   ConstCaseIt case_end() const {
     return ConstCaseIt(this, getNumCases());
   }
+
+  /// cases - iteration adapter for range-for loops.
+  iterator_range<CaseIt> cases() {
+    return iterator_range<CaseIt>(case_begin(), case_end());
+  }
+
+  /// cases - iteration adapter for range-for loops.
+  iterator_range<ConstCaseIt> cases() const {
+    return iterator_range<ConstCaseIt>(case_begin(), case_end());
+  }
+
   /// Returns an iterator that points to the default case.
   /// Note: this iterator allows to resolve successor only. Attempt
   /// to resolve case value causes an assertion.
@@ -3037,6 +3057,12 @@ class InvokeInst : public TerminatorInst {
     return AttributeList.getParamAlignment(i);
   }
 
+  /// \brief Extract the number of dereferenceable bytes for a call or
+  /// parameter (0=unknown).
+  uint64_t getDereferenceableBytes(unsigned i) const {
+    return AttributeList.getDereferenceableBytes(i);
+  }
+
   /// \brief Return true if the call should not be treated as a call to a
   /// builtin.
   bool isNoBuiltin() const {
diff --git a/include/llvm/IR/Intrinsics.h b/include/llvm/IR/Intrinsics.h
index 839bbbd8b47d..b0d746bd4127 100644
--- a/include/llvm/IR/Intrinsics.h
+++ b/include/llvm/IR/Intrinsics.h
@@ -71,6 +71,9 @@ namespace Intrinsic {
 
   /// Map a GCC builtin name to an intrinsic ID.
   ID getIntrinsicForGCCBuiltin(const char *Prefix, const char *BuiltinName);
+
+  /// Map a MS builtin name to an intrinsic ID.
+  ID getIntrinsicForMSBuiltin(const char *Prefix, const char *BuiltinName);
 
   /// IITDescriptor - This is a type descriptor which explains the type
   /// requirements of an intrinsic.
   /// This is returned by
diff --git a/include/llvm/IR/Intrinsics.td b/include/llvm/IR/Intrinsics.td
index edd1621ef25d..0b8f64fc7984 100644
--- a/include/llvm/IR/Intrinsics.td
+++ b/include/llvm/IR/Intrinsics.td
@@ -226,6 +226,10 @@ class GCCBuiltin<string name> {
   string GCCBuiltinName = name;
 }
 
+class MSBuiltin<string name> {
+  string MSBuiltinName = name;
+}
+
 //===--------------- Variable Argument Handling Intrinsics ----------------===//
 //
 
@@ -492,10 +496,8 @@ def int_donothing : Intrinsic<[], [], [IntrNoMem]>;
 
 // Intrisics to support half precision floating point format
 let Properties = [IntrNoMem] in {
-def int_convert_to_fp16   : Intrinsic<[llvm_i16_ty], [llvm_float_ty]>,
-                            GCCBuiltin<"__gnu_f2h_ieee">;
-def int_convert_from_fp16 : Intrinsic<[llvm_float_ty], [llvm_i16_ty]>,
-                            GCCBuiltin<"__gnu_h2f_ieee">;
+def int_convert_to_fp16   : Intrinsic<[llvm_i16_ty], [llvm_anyfloat_ty]>;
+def int_convert_from_fp16 : Intrinsic<[llvm_anyfloat_ty], [llvm_i16_ty]>;
 }
 
 // These convert intrinsics are to support various conversions between
diff --git a/include/llvm/IR/IntrinsicsAArch64.td b/include/llvm/IR/IntrinsicsAArch64.td
index e3c0fb359901..7d69ed52171c 100644
--- a/include/llvm/IR/IntrinsicsAArch64.td
+++ b/include/llvm/IR/IntrinsicsAArch64.td
@@ -32,12 +32,24 @@ def int_aarch64_sdiv : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>,
 def int_aarch64_udiv : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>,
                                 LLVMMatchType<0>], [IntrNoMem]>;
 
+//===----------------------------------------------------------------------===//
+// HINT
+
+def int_aarch64_hint : Intrinsic<[], [llvm_i32_ty]>;
+
 //===----------------------------------------------------------------------===//
 // RBIT
 
 def int_aarch64_rbit : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>],
                                  [IntrNoMem]>;
 
+//===----------------------------------------------------------------------===//
+// Data Barrier Instructions
+
+def int_aarch64_dmb : GCCBuiltin<"__builtin_arm_dmb">, Intrinsic<[], [llvm_i32_ty]>;
+def int_aarch64_dsb : GCCBuiltin<"__builtin_arm_dsb">, Intrinsic<[], [llvm_i32_ty]>;
+def int_aarch64_isb : GCCBuiltin<"__builtin_arm_isb">, Intrinsic<[], [llvm_i32_ty]>;
+
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/include/llvm/IR/IntrinsicsARM.td b/include/llvm/IR/IntrinsicsARM.td
index 145eeedc22b4..a02d7072d720 100644
--- a/include/llvm/IR/IntrinsicsARM.td
+++ b/include/llvm/IR/IntrinsicsARM.td
@@ -54,8 +54,12 @@ def int_arm_ldaexd : Intrinsic<[llvm_i32_ty, llvm_i32_ty], [llvm_ptr_ty]>;
 //===----------------------------------------------------------------------===//
 // Data barrier instructions
 
-def int_arm_dmb : GCCBuiltin<"__builtin_arm_dmb">, Intrinsic<[], [llvm_i32_ty]>;
-def int_arm_dsb : GCCBuiltin<"__builtin_arm_dsb">, Intrinsic<[], [llvm_i32_ty]>;
+def int_arm_dmb : GCCBuiltin<"__builtin_arm_dmb">, MSBuiltin<"__dmb">,
+                  Intrinsic<[], [llvm_i32_ty]>;
+def int_arm_dsb : GCCBuiltin<"__builtin_arm_dsb">, MSBuiltin<"__dsb">,
+                  Intrinsic<[], [llvm_i32_ty]>;
+def int_arm_isb : GCCBuiltin<"__builtin_arm_isb">, MSBuiltin<"__isb">,
+                  Intrinsic<[], [llvm_i32_ty]>;
 
 //===----------------------------------------------------------------------===//
 // VFP
@@ -74,17 +78,21 @@ def int_arm_vcvtru : Intrinsic<[llvm_float_ty], [llvm_anyfloat_ty],
 // Move to coprocessor
 def int_arm_mcr : GCCBuiltin<"__builtin_arm_mcr">,
+                  MSBuiltin<"_MoveToCoprocessor">,
                   Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
                                  llvm_i32_ty, llvm_i32_ty], []>;
 def int_arm_mcr2 : GCCBuiltin<"__builtin_arm_mcr2">,
+                   MSBuiltin<"_MoveToCoprocessor2">,
                    Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
                                   llvm_i32_ty, llvm_i32_ty], []>;
 
 // Move from coprocessor
 def int_arm_mrc : GCCBuiltin<"__builtin_arm_mrc">,
+                  MSBuiltin<"_MoveFromCoprocessor">,
                   Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
                                             llvm_i32_ty, llvm_i32_ty], []>;
 def int_arm_mrc2 : GCCBuiltin<"__builtin_arm_mrc2">,
+                   MSBuiltin<"_MoveFromCoprocessor2">,
                    Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
                                              llvm_i32_ty, llvm_i32_ty], []>;
 
diff --git a/include/llvm/IR/IntrinsicsNVVM.td b/include/llvm/IR/IntrinsicsNVVM.td
index 26dc70acfd3e..cd512841a1af 100644
--- a/include/llvm/IR/IntrinsicsNVVM.td
+++ b/include/llvm/IR/IntrinsicsNVVM.td
@@ -796,26 +796,25 @@ def llvm_anyi64ptr_ty : LLVMAnyPointerType<llvm_i64_ty>; // (space)i64*
 
 // Generated within nvvm. Use for ldu on sm_20 or later
-// @TODO: Revisit this, Changed LLVMAnyPointerType to LLVMPointerType
 def int_nvvm_ldu_global_i : Intrinsic<[llvm_anyint_ty],
-  [LLVMPointerType<LLVMMatchType<0>>], [IntrReadMem, NoCapture<0>],
+  [LLVMAnyPointerType<LLVMMatchType<0>>], [IntrReadMem, NoCapture<0>],
   "llvm.nvvm.ldu.global.i">;
 def int_nvvm_ldu_global_f : Intrinsic<[llvm_anyfloat_ty],
-  [LLVMPointerType<LLVMMatchType<0>>], [IntrReadMem, NoCapture<0>],
+  [LLVMAnyPointerType<LLVMMatchType<0>>], [IntrReadMem, NoCapture<0>],
   "llvm.nvvm.ldu.global.f">;
 def int_nvvm_ldu_global_p : Intrinsic<[llvm_anyptr_ty],
-  [LLVMPointerType<LLVMMatchType<0>>], [IntrReadMem, NoCapture<0>],
+  [LLVMAnyPointerType<LLVMMatchType<0>>], [IntrReadMem, NoCapture<0>],
   "llvm.nvvm.ldu.global.p">;
 
 // Generated within nvvm. Use for ldg on sm_35 or later
 def int_nvvm_ldg_global_i : Intrinsic<[llvm_anyint_ty],
-  [LLVMPointerType<LLVMMatchType<0>>], [IntrReadMem, NoCapture<0>],
+  [LLVMAnyPointerType<LLVMMatchType<0>>], [IntrReadMem, NoCapture<0>],
   "llvm.nvvm.ldg.global.i">;
 def int_nvvm_ldg_global_f : Intrinsic<[llvm_anyfloat_ty],
-  [LLVMPointerType<LLVMMatchType<0>>], [IntrReadMem, NoCapture<0>],
+  [LLVMAnyPointerType<LLVMMatchType<0>>], [IntrReadMem, NoCapture<0>],
   "llvm.nvvm.ldg.global.f">;
 def int_nvvm_ldg_global_p : Intrinsic<[llvm_anyptr_ty],
-  [LLVMPointerType<LLVMMatchType<0>>], [IntrReadMem, NoCapture<0>],
+  [LLVMAnyPointerType<LLVMMatchType<0>>], [IntrReadMem, NoCapture<0>],
   "llvm.nvvm.ldg.global.p">;
 
 // Use for generic pointers
@@ -889,12 +888,164 @@ def int_nvvm_compiler_error :
 def int_nvvm_compiler_warn :
     Intrinsic<[], [llvm_anyptr_ty], [], "llvm.nvvm.compiler.warn">;
 
+def int_nvvm_reflect :
+  Intrinsic<[llvm_i32_ty], [llvm_anyptr_ty], [IntrNoMem], "llvm.nvvm.reflect">;
+
+// isspacep.{const, global, local, shared}
+def int_nvvm_isspacep_const
+  : Intrinsic<[llvm_i1_ty], [llvm_ptr_ty], [IntrNoMem],
+              "llvm.nvvm.isspacep.const">,
+    GCCBuiltin<"__nvvm_isspacep_const">;
+def int_nvvm_isspacep_global
+  : Intrinsic<[llvm_i1_ty], [llvm_ptr_ty], [IntrNoMem],
+              "llvm.nvvm.isspacep.global">,
+    GCCBuiltin<"__nvvm_isspacep_global">;
+def int_nvvm_isspacep_local
+  : Intrinsic<[llvm_i1_ty], [llvm_ptr_ty], [IntrNoMem],
+              "llvm.nvvm.isspacep.local">,
+    GCCBuiltin<"__nvvm_isspacep_local">;
+def int_nvvm_isspacep_shared
+  : Intrinsic<[llvm_i1_ty], [llvm_ptr_ty], [IntrNoMem],
+              "llvm.nvvm.isspacep.shared">,
+    GCCBuiltin<"__nvvm_isspacep_shared">;
+
+// Environment register read
+def int_nvvm_read_ptx_sreg_envreg0
+  : Intrinsic<[llvm_i32_ty], [], [IntrNoMem],
+              "llvm.nvvm.read.ptx.sreg.envreg0">,
+    GCCBuiltin<"__nvvm_read_ptx_sreg_envreg0">;
+def int_nvvm_read_ptx_sreg_envreg1
+  : Intrinsic<[llvm_i32_ty], [], [IntrNoMem],
+              "llvm.nvvm.read.ptx.sreg.envreg1">,
+    GCCBuiltin<"__nvvm_read_ptx_sreg_envreg1">;
+def int_nvvm_read_ptx_sreg_envreg2
+  : Intrinsic<[llvm_i32_ty], [], [IntrNoMem],
+              "llvm.nvvm.read.ptx.sreg.envreg2">,
GCCBuiltin<"__nvvm_read_ptx_sreg_envreg2">; +def int_nvvm_read_ptx_sreg_envreg3 + : Intrinsic<[llvm_i32_ty], [], [IntrNoMem], + "llvm.nvvm.read.ptx.sreg.envreg3">, + GCCBuiltin<"__nvvm_read_ptx_sreg_envreg3">; +def int_nvvm_read_ptx_sreg_envreg4 + : Intrinsic<[llvm_i32_ty], [], [IntrNoMem], + "llvm.nvvm.read.ptx.sreg.envreg4">, + GCCBuiltin<"__nvvm_read_ptx_sreg_envreg4">; +def int_nvvm_read_ptx_sreg_envreg5 + : Intrinsic<[llvm_i32_ty], [], [IntrNoMem], + "llvm.nvvm.read.ptx.sreg.envreg5">, + GCCBuiltin<"__nvvm_read_ptx_sreg_envreg5">; +def int_nvvm_read_ptx_sreg_envreg6 + : Intrinsic<[llvm_i32_ty], [], [IntrNoMem], + "llvm.nvvm.read.ptx.sreg.envreg6">, + GCCBuiltin<"__nvvm_read_ptx_sreg_envreg6">; +def int_nvvm_read_ptx_sreg_envreg7 + : Intrinsic<[llvm_i32_ty], [], [IntrNoMem], + "llvm.nvvm.read.ptx.sreg.envreg7">, + GCCBuiltin<"__nvvm_read_ptx_sreg_envreg7">; +def int_nvvm_read_ptx_sreg_envreg8 + : Intrinsic<[llvm_i32_ty], [], [IntrNoMem], + "llvm.nvvm.read.ptx.sreg.envreg8">, + GCCBuiltin<"__nvvm_read_ptx_sreg_envreg8">; +def int_nvvm_read_ptx_sreg_envreg9 + : Intrinsic<[llvm_i32_ty], [], [IntrNoMem], + "llvm.nvvm.read.ptx.sreg.envreg9">, + GCCBuiltin<"__nvvm_read_ptx_sreg_envreg9">; +def int_nvvm_read_ptx_sreg_envreg10 + : Intrinsic<[llvm_i32_ty], [], [IntrNoMem], + "llvm.nvvm.read.ptx.sreg.envreg10">, + GCCBuiltin<"__nvvm_read_ptx_sreg_envreg10">; +def int_nvvm_read_ptx_sreg_envreg11 + : Intrinsic<[llvm_i32_ty], [], [IntrNoMem], + "llvm.nvvm.read.ptx.sreg.envreg11">, + GCCBuiltin<"__nvvm_read_ptx_sreg_envreg11">; +def int_nvvm_read_ptx_sreg_envreg12 + : Intrinsic<[llvm_i32_ty], [], [IntrNoMem], + "llvm.nvvm.read.ptx.sreg.envreg12">, + GCCBuiltin<"__nvvm_read_ptx_sreg_envreg12">; +def int_nvvm_read_ptx_sreg_envreg13 + : Intrinsic<[llvm_i32_ty], [], [IntrNoMem], + "llvm.nvvm.read.ptx.sreg.envreg13">, + GCCBuiltin<"__nvvm_read_ptx_sreg_envreg13">; +def int_nvvm_read_ptx_sreg_envreg14 + : Intrinsic<[llvm_i32_ty], [], [IntrNoMem], + "llvm.nvvm.read.ptx.sreg.envreg14">, + GCCBuiltin<"__nvvm_read_ptx_sreg_envreg14">; +def int_nvvm_read_ptx_sreg_envreg15 + : Intrinsic<[llvm_i32_ty], [], [IntrNoMem], + "llvm.nvvm.read.ptx.sreg.envreg15">, + GCCBuiltin<"__nvvm_read_ptx_sreg_envreg15">; +def int_nvvm_read_ptx_sreg_envreg16 + : Intrinsic<[llvm_i32_ty], [], [IntrNoMem], + "llvm.nvvm.read.ptx.sreg.envreg16">, + GCCBuiltin<"__nvvm_read_ptx_sreg_envreg16">; +def int_nvvm_read_ptx_sreg_envreg17 + : Intrinsic<[llvm_i32_ty], [], [IntrNoMem], + "llvm.nvvm.read.ptx.sreg.envreg17">, + GCCBuiltin<"__nvvm_read_ptx_sreg_envreg17">; +def int_nvvm_read_ptx_sreg_envreg18 + : Intrinsic<[llvm_i32_ty], [], [IntrNoMem], + "llvm.nvvm.read.ptx.sreg.envreg18">, + GCCBuiltin<"__nvvm_read_ptx_sreg_envreg18">; +def int_nvvm_read_ptx_sreg_envreg19 + : Intrinsic<[llvm_i32_ty], [], [IntrNoMem], + "llvm.nvvm.read.ptx.sreg.envreg19">, + GCCBuiltin<"__nvvm_read_ptx_sreg_envreg19">; +def int_nvvm_read_ptx_sreg_envreg20 + : Intrinsic<[llvm_i32_ty], [], [IntrNoMem], + "llvm.nvvm.read.ptx.sreg.envreg20">, + GCCBuiltin<"__nvvm_read_ptx_sreg_envreg20">; +def int_nvvm_read_ptx_sreg_envreg21 + : Intrinsic<[llvm_i32_ty], [], [IntrNoMem], + "llvm.nvvm.read.ptx.sreg.envreg21">, + GCCBuiltin<"__nvvm_read_ptx_sreg_envreg21">; +def int_nvvm_read_ptx_sreg_envreg22 + : Intrinsic<[llvm_i32_ty], [], [IntrNoMem], + "llvm.nvvm.read.ptx.sreg.envreg22">, + GCCBuiltin<"__nvvm_read_ptx_sreg_envreg22">; +def int_nvvm_read_ptx_sreg_envreg23 + : Intrinsic<[llvm_i32_ty], [], [IntrNoMem], + "llvm.nvvm.read.ptx.sreg.envreg23">, + 
GCCBuiltin<"__nvvm_read_ptx_sreg_envreg23">; +def int_nvvm_read_ptx_sreg_envreg24 + : Intrinsic<[llvm_i32_ty], [], [IntrNoMem], + "llvm.nvvm.read.ptx.sreg.envreg24">, + GCCBuiltin<"__nvvm_read_ptx_sreg_envreg24">; +def int_nvvm_read_ptx_sreg_envreg25 + : Intrinsic<[llvm_i32_ty], [], [IntrNoMem], + "llvm.nvvm.read.ptx.sreg.envreg25">, + GCCBuiltin<"__nvvm_read_ptx_sreg_envreg25">; +def int_nvvm_read_ptx_sreg_envreg26 + : Intrinsic<[llvm_i32_ty], [], [IntrNoMem], + "llvm.nvvm.read.ptx.sreg.envreg26">, + GCCBuiltin<"__nvvm_read_ptx_sreg_envreg26">; +def int_nvvm_read_ptx_sreg_envreg27 + : Intrinsic<[llvm_i32_ty], [], [IntrNoMem], + "llvm.nvvm.read.ptx.sreg.envreg27">, + GCCBuiltin<"__nvvm_read_ptx_sreg_envreg27">; +def int_nvvm_read_ptx_sreg_envreg28 + : Intrinsic<[llvm_i32_ty], [], [IntrNoMem], + "llvm.nvvm.read.ptx.sreg.envreg28">, + GCCBuiltin<"__nvvm_read_ptx_sreg_envreg28">; +def int_nvvm_read_ptx_sreg_envreg29 + : Intrinsic<[llvm_i32_ty], [], [IntrNoMem], + "llvm.nvvm.read.ptx.sreg.envreg29">, + GCCBuiltin<"__nvvm_read_ptx_sreg_envreg29">; +def int_nvvm_read_ptx_sreg_envreg30 + : Intrinsic<[llvm_i32_ty], [], [IntrNoMem], + "llvm.nvvm.read.ptx.sreg.envreg30">, + GCCBuiltin<"__nvvm_read_ptx_sreg_envreg30">; +def int_nvvm_read_ptx_sreg_envreg31 + : Intrinsic<[llvm_i32_ty], [], [IntrNoMem], + "llvm.nvvm.read.ptx.sreg.envreg31">, + GCCBuiltin<"__nvvm_read_ptx_sreg_envreg31">; + // Texture Fetch -def int_nvvm_tex_1d_v4f32_i32 +// texmode_independent +def int_nvvm_tex_1d_v4f32_s32 : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty], [], - "llvm.nvvm.tex.1d.v4f32.i32">; + "llvm.nvvm.tex.1d.v4f32.s32">; def int_nvvm_tex_1d_v4f32_f32 : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], [llvm_i64_ty, llvm_i64_ty, llvm_float_ty], [], @@ -908,28 +1059,45 @@ def int_nvvm_tex_1d_grad_v4f32_f32 [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], [], "llvm.nvvm.tex.1d.grad.v4f32.f32">; -def int_nvvm_tex_1d_v4i32_i32 +def int_nvvm_tex_1d_v4s32_s32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty], [], + "llvm.nvvm.tex.1d.v4s32.s32">; +def int_nvvm_tex_1d_v4s32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_float_ty], [], + "llvm.nvvm.tex.1d.v4s32.f32">; +def int_nvvm_tex_1d_level_v4s32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tex.1d.level.v4s32.f32">; +def int_nvvm_tex_1d_grad_v4s32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty, + llvm_float_ty], [], + "llvm.nvvm.tex.1d.grad.v4s32.f32">; +def int_nvvm_tex_1d_v4u32_s32 : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty], [], - "llvm.nvvm.tex.1d.v4i32.i32">; -def int_nvvm_tex_1d_v4i32_f32 + "llvm.nvvm.tex.1d.v4u32.s32">; +def int_nvvm_tex_1d_v4u32_f32 : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [llvm_i64_ty, llvm_i64_ty, llvm_float_ty], [], - "llvm.nvvm.tex.1d.v4i32.f32">; -def int_nvvm_tex_1d_level_v4i32_f32 + "llvm.nvvm.tex.1d.v4u32.f32">; +def int_nvvm_tex_1d_level_v4u32_f32 : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty], [], - "llvm.nvvm.tex.1d.level.v4i32.f32.level">; -def 
int_nvvm_tex_1d_grad_v4i32_f32 + "llvm.nvvm.tex.1d.level.v4u32.f32">; +def int_nvvm_tex_1d_grad_v4u32_f32 : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], [], - "llvm.nvvm.tex.1d.grad.v4i32.f32">; + "llvm.nvvm.tex.1d.grad.v4u32.f32">; -def int_nvvm_tex_1d_array_v4f32_i32 +def int_nvvm_tex_1d_array_v4f32_s32 : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], - "llvm.nvvm.tex.1d.array.v4f32.i32">; + "llvm.nvvm.tex.1d.array.v4f32.s32">; def int_nvvm_tex_1d_array_v4f32_f32 : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_float_ty], [], @@ -944,29 +1112,47 @@ def int_nvvm_tex_1d_array_grad_v4f32_f32 [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], [], "llvm.nvvm.tex.1d.array.grad.v4f32.f32">; -def int_nvvm_tex_1d_array_v4i32_i32 +def int_nvvm_tex_1d_array_v4s32_s32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.tex.1d.array.v4s32.s32">; +def int_nvvm_tex_1d_array_v4s32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_float_ty], [], + "llvm.nvvm.tex.1d.array.v4s32.f32">; +def int_nvvm_tex_1d_array_level_v4s32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_float_ty, + llvm_float_ty], [], + "llvm.nvvm.tex.1d.array.level.v4s32.f32">; +def int_nvvm_tex_1d_array_grad_v4s32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_float_ty, + llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tex.1d.array.grad.v4s32.f32">; +def int_nvvm_tex_1d_array_v4u32_s32 : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], - "llvm.nvvm.tex.1d.array.v4i32.i32">; -def int_nvvm_tex_1d_array_v4i32_f32 + "llvm.nvvm.tex.1d.array.v4u32.s32">; +def int_nvvm_tex_1d_array_v4u32_f32 : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_float_ty], [], - "llvm.nvvm.tex.1d.array.v4i32.f32">; -def int_nvvm_tex_1d_array_level_v4i32_f32 + "llvm.nvvm.tex.1d.array.v4u32.f32">; +def int_nvvm_tex_1d_array_level_v4u32_f32 : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_float_ty, llvm_float_ty], [], - "llvm.nvvm.tex.1d.array.level.v4i32.f32">; -def int_nvvm_tex_1d_array_grad_v4i32_f32 + "llvm.nvvm.tex.1d.array.level.v4u32.f32">; +def int_nvvm_tex_1d_array_grad_v4u32_f32 : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], [], - "llvm.nvvm.tex.1d.array.grad.v4i32.f32">; + "llvm.nvvm.tex.1d.array.grad.v4u32.f32">; -def int_nvvm_tex_2d_v4f32_i32 +def int_nvvm_tex_2d_v4f32_s32 : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], - "llvm.nvvm.tex.2d.v4f32.i32">; + "llvm.nvvm.tex.2d.v4f32.s32">; def int_nvvm_tex_2d_v4f32_f32 : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty], [], @@ -981,30 +1167,48 @@ def int_nvvm_tex_2d_grad_v4f32_f32 
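The rename from .i32 to .s32 (plus the new .u32 variants) splits each integer texture fetch into signed and unsigned flavors. In texmode_independent mode the fetch takes separate texture and sampler handles; a minimal sketch (@fetch_red is illustrative):

  ; 1D fetch, float4 result, signed 32-bit coordinate:
  ; (texture handle, sampler handle, x).
  declare { float, float, float, float }
    @llvm.nvvm.tex.1d.v4f32.s32(i64, i64, i32)

  define float @fetch_red(i64 %tex, i64 %samp, i32 %x) {
    %t = call { float, float, float, float }
        @llvm.nvvm.tex.1d.v4f32.s32(i64 %tex, i64 %samp, i32 %x)
    %r = extractvalue { float, float, float, float } %t, 0
    ret float %r
  }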
[llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], [], "llvm.nvvm.tex.2d.grad.v4f32.f32">; -def int_nvvm_tex_2d_v4i32_i32 +def int_nvvm_tex_2d_v4s32_s32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.tex.2d.v4s32.s32">; +def int_nvvm_tex_2d_v4s32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tex.2d.v4s32.f32">; +def int_nvvm_tex_2d_level_v4s32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty, + llvm_float_ty], [], + "llvm.nvvm.tex.2d.level.v4s32.f32">; +def int_nvvm_tex_2d_grad_v4s32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty, + llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tex.2d.grad.v4s32.f32">; +def int_nvvm_tex_2d_v4u32_s32 : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], - "llvm.nvvm.tex.2d.v4i32.i32">; -def int_nvvm_tex_2d_v4i32_f32 + "llvm.nvvm.tex.2d.v4u32.s32">; +def int_nvvm_tex_2d_v4u32_f32 : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty], [], - "llvm.nvvm.tex.2d.v4i32.f32">; -def int_nvvm_tex_2d_level_v4i32_f32 + "llvm.nvvm.tex.2d.v4u32.f32">; +def int_nvvm_tex_2d_level_v4u32_f32 : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], [], - "llvm.nvvm.tex.2d.level.v4i32.f32">; -def int_nvvm_tex_2d_grad_v4i32_f32 + "llvm.nvvm.tex.2d.level.v4u32.f32">; +def int_nvvm_tex_2d_grad_v4u32_f32 : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], [], - "llvm.nvvm.tex.2d.grad.v4i32.f32">; + "llvm.nvvm.tex.2d.grad.v4u32.f32">; -def int_nvvm_tex_2d_array_v4f32_i32 +def int_nvvm_tex_2d_array_v4f32_s32 : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], - "llvm.nvvm.tex.2d.array.v4f32.i32">; + "llvm.nvvm.tex.2d.array.v4f32.s32">; def int_nvvm_tex_2d_array_v4f32_f32 : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_float_ty, @@ -1021,32 +1225,53 @@ def int_nvvm_tex_2d_array_grad_v4f32_f32 llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], [], "llvm.nvvm.tex.2d.array.grad.v4f32.f32">; -def int_nvvm_tex_2d_array_v4i32_i32 +def int_nvvm_tex_2d_array_v4s32_s32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i32_ty], [], + "llvm.nvvm.tex.2d.array.v4s32.s32">; +def int_nvvm_tex_2d_array_v4s32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_float_ty, + llvm_float_ty], [], + "llvm.nvvm.tex.2d.array.v4s32.f32">; +def int_nvvm_tex_2d_array_level_v4s32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_float_ty, + llvm_float_ty, llvm_float_ty], [], + 
"llvm.nvvm.tex.2d.array.level.v4s32.f32">; +def int_nvvm_tex_2d_array_grad_v4s32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_float_ty, + llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, + llvm_float_ty], [], + "llvm.nvvm.tex.2d.array.grad.v4s32.f32">; +def int_nvvm_tex_2d_array_v4u32_s32 : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], - "llvm.nvvm.tex.2d.array.v4i32.i32">; -def int_nvvm_tex_2d_array_v4i32_f32 + "llvm.nvvm.tex.2d.array.v4u32.s32">; +def int_nvvm_tex_2d_array_v4u32_f32 : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_float_ty, llvm_float_ty], [], - "llvm.nvvm.tex.2d.array.v4i32.f32">; -def int_nvvm_tex_2d_array_level_v4i32_f32 + "llvm.nvvm.tex.2d.array.v4u32.f32">; +def int_nvvm_tex_2d_array_level_v4u32_f32 : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], [], - "llvm.nvvm.tex.2d.array.level.v4i32.f32">; -def int_nvvm_tex_2d_array_grad_v4i32_f32 + "llvm.nvvm.tex.2d.array.level.v4u32.f32">; +def int_nvvm_tex_2d_array_grad_v4u32_f32 : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], [], - "llvm.nvvm.tex.2d.array.grad.v4i32.f32">; + "llvm.nvvm.tex.2d.array.grad.v4u32.f32">; -def int_nvvm_tex_3d_v4f32_i32 +def int_nvvm_tex_3d_v4f32_s32 : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [], "llvm.nvvm.tex.3d.v4f32.i32">; + [], "llvm.nvvm.tex.3d.v4f32.s32">; def int_nvvm_tex_3d_v4f32_f32 : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty, @@ -1063,212 +1288,1237 @@ def int_nvvm_tex_3d_grad_v4f32_f32 llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], [], "llvm.nvvm.tex.3d.grad.v4f32.f32">; -def int_nvvm_tex_3d_v4i32_i32 +def int_nvvm_tex_3d_v4s32_s32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [], "llvm.nvvm.tex.3d.v4s32.s32">; +def int_nvvm_tex_3d_v4s32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty, + llvm_float_ty], [], + "llvm.nvvm.tex.3d.v4s32.f32">; +def int_nvvm_tex_3d_level_v4s32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty, + llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tex.3d.level.v4s32.f32">; +def int_nvvm_tex_3d_grad_v4s32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty, + llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, + llvm_float_ty, llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tex.3d.grad.v4s32.f32">; +def int_nvvm_tex_3d_v4u32_s32 : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [], "llvm.nvvm.tex.3d.v4i32.i32">; -def int_nvvm_tex_3d_v4i32_f32 + [], "llvm.nvvm.tex.3d.v4u32.s32">; +def int_nvvm_tex_3d_v4u32_f32 : 
Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], [], - "llvm.nvvm.tex.3d.v4i32.f32">; -def int_nvvm_tex_3d_level_v4i32_f32 + "llvm.nvvm.tex.3d.v4u32.f32">; +def int_nvvm_tex_3d_level_v4u32_f32 : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], [], - "llvm.nvvm.tex.3d.level.v4i32.f32">; -def int_nvvm_tex_3d_grad_v4i32_f32 + "llvm.nvvm.tex.3d.level.v4u32.f32">; +def int_nvvm_tex_3d_grad_v4u32_f32 : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], [], - "llvm.nvvm.tex.3d.grad.v4i32.f32">; + "llvm.nvvm.tex.3d.grad.v4u32.f32">; -// Surface Load -def int_nvvm_suld_1d_i8_trap - : Intrinsic<[llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty], [], - "llvm.nvvm.suld.1d.i8.trap">; -def int_nvvm_suld_1d_i16_trap - : Intrinsic<[llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty], [], - "llvm.nvvm.suld.1d.i16.trap">; -def int_nvvm_suld_1d_i32_trap - : Intrinsic<[llvm_i32_ty], - [llvm_i64_ty, llvm_i32_ty], [], - "llvm.nvvm.suld.1d.i32.trap">; -def int_nvvm_suld_1d_v2i8_trap - : Intrinsic<[llvm_i16_ty, llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty], [], - "llvm.nvvm.suld.1d.v2i8.trap">; -def int_nvvm_suld_1d_v2i16_trap - : Intrinsic<[llvm_i16_ty, llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty], [], - "llvm.nvvm.suld.1d.v2i16.trap">; -def int_nvvm_suld_1d_v2i32_trap - : Intrinsic<[llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i32_ty], [], - "llvm.nvvm.suld.1d.v2i32.trap">; -def int_nvvm_suld_1d_v4i8_trap - : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], +def int_nvvm_tex_cube_v4f32_f32 + : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], + [llvm_i64_ty, llvm_i64_ty, + llvm_float_ty, llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tex.cube.v4f32.f32">; +def int_nvvm_tex_cube_level_v4f32_f32 + : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], + [llvm_i64_ty, llvm_i64_ty, + llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tex.cube.level.v4f32.f32">; +def int_nvvm_tex_cube_v4s32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i64_ty, + llvm_float_ty, llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tex.cube.v4s32.f32">; +def int_nvvm_tex_cube_level_v4s32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i64_ty, + llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tex.cube.level.v4s32.f32">; +def int_nvvm_tex_cube_v4u32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i64_ty, + llvm_float_ty, llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tex.cube.v4u32.f32">; +def int_nvvm_tex_cube_level_v4u32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i64_ty, + llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tex.cube.level.v4u32.f32">; + +def int_nvvm_tex_cube_array_v4f32_f32 + : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, + llvm_float_ty, llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tex.cube.array.v4f32.f32">; +def int_nvvm_tex_cube_array_level_v4f32_f32 + : Intrinsic<[llvm_float_ty, 
llvm_float_ty, llvm_float_ty, llvm_float_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, + llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tex.cube.array.level.v4f32.f32">; +def int_nvvm_tex_cube_array_v4s32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, + llvm_float_ty, llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tex.cube.array.v4s32.f32">; +def int_nvvm_tex_cube_array_level_v4s32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, + llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tex.cube.array.level.v4s32.f32">; +def int_nvvm_tex_cube_array_v4u32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, + llvm_float_ty, llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tex.cube.array.v4u32.f32">; +def int_nvvm_tex_cube_array_level_v4u32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, + llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tex.cube.array.level.v4u32.f32">; + +def int_nvvm_tld4_r_2d_v4f32_f32 + : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tld4.r.2d.v4f32.f32">; +def int_nvvm_tld4_g_2d_v4f32_f32 + : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tld4.g.2d.v4f32.f32">; +def int_nvvm_tld4_b_2d_v4f32_f32 + : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tld4.b.2d.v4f32.f32">; +def int_nvvm_tld4_a_2d_v4f32_f32 + : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tld4.a.2d.v4f32.f32">; +def int_nvvm_tld4_r_2d_v4s32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tld4.r.2d.v4s32.f32">; +def int_nvvm_tld4_g_2d_v4s32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tld4.g.2d.v4s32.f32">; +def int_nvvm_tld4_b_2d_v4s32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tld4.b.2d.v4s32.f32">; +def int_nvvm_tld4_a_2d_v4s32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tld4.a.2d.v4s32.f32">; +def int_nvvm_tld4_r_2d_v4u32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tld4.r.2d.v4u32.f32">; +def int_nvvm_tld4_g_2d_v4u32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tld4.g.2d.v4u32.f32">; +def int_nvvm_tld4_b_2d_v4u32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tld4.b.2d.v4u32.f32">; +def int_nvvm_tld4_a_2d_v4u32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, 
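The tld4 intrinsics gather one chosen component (the r/g/b/a variants) from each of the four texels covering a 2D coordinate. A minimal sketch of a red-component gather (@gather_red is illustrative):

  declare { float, float, float, float }
    @llvm.nvvm.tld4.r.2d.v4f32.f32(i64, i64, float, float)

  define { float, float, float, float }
      @gather_red(i64 %tex, i64 %samp, float %x, float %y) {
    %g = call { float, float, float, float }
        @llvm.nvvm.tld4.r.2d.v4f32.f32(i64 %tex, i64 %samp, float %x, float %y)
    ret { float, float, float, float } %g
  }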
llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tld4.a.2d.v4u32.f32">; + + +// texmode_unified +def int_nvvm_tex_unified_1d_v4f32_s32 + : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], [llvm_i64_ty, llvm_i32_ty], [], - "llvm.nvvm.suld.1d.v4i8.trap">; -def int_nvvm_suld_1d_v4i16_trap - : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], + "llvm.nvvm.tex.unified.1d.v4f32.s32">; +def int_nvvm_tex_unified_1d_v4f32_f32 + : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], + [llvm_i64_ty, llvm_float_ty], [], + "llvm.nvvm.tex.unified.1d.v4f32.f32">; +def int_nvvm_tex_unified_1d_level_v4f32_f32 + : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], + [llvm_i64_ty, llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tex.unified.1d.level.v4f32.f32">; +def int_nvvm_tex_unified_1d_grad_v4f32_f32 + : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], + [llvm_i64_ty, llvm_float_ty, llvm_float_ty, + llvm_float_ty], [], + "llvm.nvvm.tex.unified.1d.grad.v4f32.f32">; +def int_nvvm_tex_unified_1d_v4s32_s32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [llvm_i64_ty, llvm_i32_ty], [], - "llvm.nvvm.suld.1d.v4i16.trap">; -def int_nvvm_suld_1d_v4i32_trap + "llvm.nvvm.tex.unified.1d.v4s32.s32">; +def int_nvvm_tex_unified_1d_v4s32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_float_ty], [], + "llvm.nvvm.tex.unified.1d.v4s32.f32">; +def int_nvvm_tex_unified_1d_level_v4s32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tex.unified.1d.level.v4s32.f32">; +def int_nvvm_tex_unified_1d_grad_v4s32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_float_ty, llvm_float_ty, + llvm_float_ty], [], + "llvm.nvvm.tex.unified.1d.grad.v4s32.f32">; +def int_nvvm_tex_unified_1d_v4u32_s32 : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [llvm_i64_ty, llvm_i32_ty], [], - "llvm.nvvm.suld.1d.v4i32.trap">; + "llvm.nvvm.tex.unified.1d.v4u32.s32">; +def int_nvvm_tex_unified_1d_v4u32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_float_ty], [], + "llvm.nvvm.tex.unified.1d.v4u32.f32">; +def int_nvvm_tex_unified_1d_level_v4u32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tex.unified.1d.level.v4u32.f32">; +def int_nvvm_tex_unified_1d_grad_v4u32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_float_ty, llvm_float_ty, + llvm_float_ty], [], + "llvm.nvvm.tex.unified.1d.grad.v4u32.f32">; -def int_nvvm_suld_1d_array_i8_trap - : Intrinsic<[llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], - "llvm.nvvm.suld.1d.array.i8.trap">; -def int_nvvm_suld_1d_array_i16_trap - : Intrinsic<[llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], - "llvm.nvvm.suld.1d.array.i16.trap">; -def int_nvvm_suld_1d_array_i32_trap - : Intrinsic<[llvm_i32_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], - "llvm.nvvm.suld.1d.array.i32.trap">; -def int_nvvm_suld_1d_array_v2i8_trap - : Intrinsic<[llvm_i16_ty, llvm_i16_ty], +def int_nvvm_tex_unified_1d_array_v4f32_s32 + : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], - 
"llvm.nvvm.suld.1d.array.v2i8.trap">; -def int_nvvm_suld_1d_array_v2i16_trap - : Intrinsic<[llvm_i16_ty, llvm_i16_ty], + "llvm.nvvm.tex.unified.1d.array.v4f32.s32">; +def int_nvvm_tex_unified_1d_array_v4f32_f32 + : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_float_ty], [], + "llvm.nvvm.tex.unified.1d.array.v4f32.f32">; +def int_nvvm_tex_unified_1d_array_level_v4f32_f32 + : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_float_ty, + llvm_float_ty], [], + "llvm.nvvm.tex.unified.1d.array.level.v4f32.f32">; +def int_nvvm_tex_unified_1d_array_grad_v4f32_f32 + : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_float_ty, + llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tex.unified.1d.array.grad.v4f32.f32">; +def int_nvvm_tex_unified_1d_array_v4s32_s32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], - "llvm.nvvm.suld.1d.array.v2i16.trap">; -def int_nvvm_suld_1d_array_v2i32_trap - : Intrinsic<[llvm_i32_ty, llvm_i32_ty], + "llvm.nvvm.tex.unified.1d.array.v4s32.s32">; +def int_nvvm_tex_unified_1d_array_v4s32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_float_ty], [], + "llvm.nvvm.tex.unified.1d.array.v4s32.f32">; +def int_nvvm_tex_unified_1d_array_level_v4s32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_float_ty, + llvm_float_ty], [], + "llvm.nvvm.tex.unified.1d.array.level.v4s32.f32">; +def int_nvvm_tex_unified_1d_array_grad_v4s32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_float_ty, + llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tex.unified.1d.array.grad.v4s32.f32">; +def int_nvvm_tex_unified_1d_array_v4u32_s32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], - "llvm.nvvm.suld.1d.array.v2i32.trap">; -def int_nvvm_suld_1d_array_v4i8_trap - : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], + "llvm.nvvm.tex.unified.1d.array.v4u32.s32">; +def int_nvvm_tex_unified_1d_array_v4u32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_float_ty], [], + "llvm.nvvm.tex.unified.1d.array.v4u32.f32">; +def int_nvvm_tex_unified_1d_array_level_v4u32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_float_ty, + llvm_float_ty], [], + "llvm.nvvm.tex.unified.1d.array.level.v4u32.f32">; +def int_nvvm_tex_unified_1d_array_grad_v4u32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_float_ty, + llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tex.unified.1d.array.grad.v4u32.f32">; + +def int_nvvm_tex_unified_2d_v4f32_s32 + : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], - "llvm.nvvm.suld.1d.array.v4i8.trap">; -def int_nvvm_suld_1d_array_v4i16_trap - : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], + "llvm.nvvm.tex.unified.2d.v4f32.s32">; +def int_nvvm_tex_unified_2d_v4f32_f32 + : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], + [llvm_i64_ty, llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tex.unified.2d.v4f32.f32">; +def 
int_nvvm_tex_unified_2d_level_v4f32_f32 + : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], + [llvm_i64_ty, llvm_float_ty, llvm_float_ty, + llvm_float_ty], [], + "llvm.nvvm.tex.unified.2d.level.v4f32.f32">; +def int_nvvm_tex_unified_2d_grad_v4f32_f32 + : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], + [llvm_i64_ty, llvm_float_ty, llvm_float_ty, + llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tex.unified.2d.grad.v4f32.f32">; +def int_nvvm_tex_unified_2d_v4s32_s32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], - "llvm.nvvm.suld.1d.array.v4i16.trap">; -def int_nvvm_suld_1d_array_v4i32_trap + "llvm.nvvm.tex.unified.2d.v4s32.s32">; +def int_nvvm_tex_unified_2d_v4s32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tex.unified.2d.v4s32.f32">; +def int_nvvm_tex_unified_2d_level_v4s32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_float_ty, llvm_float_ty, + llvm_float_ty], [], + "llvm.nvvm.tex.unified.2d.level.v4s32.f32">; +def int_nvvm_tex_unified_2d_grad_v4s32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_float_ty, llvm_float_ty, + llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tex.unified.2d.grad.v4s32.f32">; +def int_nvvm_tex_unified_2d_v4u32_s32 : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], - "llvm.nvvm.suld.1d.array.v4i32.trap">; + "llvm.nvvm.tex.unified.2d.v4u32.s32">; +def int_nvvm_tex_unified_2d_v4u32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tex.unified.2d.v4u32.f32">; +def int_nvvm_tex_unified_2d_level_v4u32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_float_ty, llvm_float_ty, + llvm_float_ty], [], + "llvm.nvvm.tex.unified.2d.level.v4u32.f32">; +def int_nvvm_tex_unified_2d_grad_v4u32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_float_ty, llvm_float_ty, + llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tex.unified.2d.grad.v4u32.f32">; -def int_nvvm_suld_2d_i8_trap - : Intrinsic<[llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], - "llvm.nvvm.suld.2d.i8.trap">; -def int_nvvm_suld_2d_i16_trap +def int_nvvm_tex_unified_2d_array_v4f32_s32 + : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i32_ty], [], + "llvm.nvvm.tex.unified.2d.array.v4f32.s32">; +def int_nvvm_tex_unified_2d_array_v4f32_f32 + : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_float_ty, + llvm_float_ty], [], + "llvm.nvvm.tex.unified.2d.array.v4f32.f32">; +def int_nvvm_tex_unified_2d_array_level_v4f32_f32 + : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_float_ty, + llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tex.unified.2d.array.level.v4f32.f32">; +def int_nvvm_tex_unified_2d_array_grad_v4f32_f32 + : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_float_ty, + llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, + llvm_float_ty], 
[], + "llvm.nvvm.tex.unified.2d.array.grad.v4f32.f32">; +def int_nvvm_tex_unified_2d_array_v4s32_s32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i32_ty], [], + "llvm.nvvm.tex.unified.2d.array.v4s32.s32">; +def int_nvvm_tex_unified_2d_array_v4s32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_float_ty, + llvm_float_ty], [], + "llvm.nvvm.tex.unified.2d.array.v4s32.f32">; +def int_nvvm_tex_unified_2d_array_level_v4s32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_float_ty, + llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tex.unified.2d.array.level.v4s32.f32">; +def int_nvvm_tex_unified_2d_array_grad_v4s32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_float_ty, + llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, + llvm_float_ty], [], + "llvm.nvvm.tex.unified.2d.array.grad.v4s32.f32">; +def int_nvvm_tex_unified_2d_array_v4u32_s32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i32_ty], [], + "llvm.nvvm.tex.unified.2d.array.v4u32.s32">; +def int_nvvm_tex_unified_2d_array_v4u32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_float_ty, + llvm_float_ty], [], + "llvm.nvvm.tex.unified.2d.array.v4u32.f32">; +def int_nvvm_tex_unified_2d_array_level_v4u32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_float_ty, + llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tex.unified.2d.array.level.v4u32.f32">; +def int_nvvm_tex_unified_2d_array_grad_v4u32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_float_ty, + llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, + llvm_float_ty], [], + "llvm.nvvm.tex.unified.2d.array.grad.v4u32.f32">; + +def int_nvvm_tex_unified_3d_v4f32_s32 + : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [], "llvm.nvvm.tex.unified.3d.v4f32.s32">; +def int_nvvm_tex_unified_3d_v4f32_f32 + : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], + [llvm_i64_ty, llvm_float_ty, llvm_float_ty, + llvm_float_ty], [], + "llvm.nvvm.tex.unified.3d.v4f32.f32">; +def int_nvvm_tex_unified_3d_level_v4f32_f32 + : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], + [llvm_i64_ty, llvm_float_ty, llvm_float_ty, + llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tex.unified.3d.level.v4f32.f32">; +def int_nvvm_tex_unified_3d_grad_v4f32_f32 + : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], + [llvm_i64_ty, llvm_float_ty, llvm_float_ty, + llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, + llvm_float_ty, llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tex.unified.3d.grad.v4f32.f32">; +def int_nvvm_tex_unified_3d_v4s32_s32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [], "llvm.nvvm.tex.unified.3d.v4s32.s32">; +def int_nvvm_tex_unified_3d_v4s32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_float_ty, llvm_float_ty, + llvm_float_ty], [], + "llvm.nvvm.tex.unified.3d.v4s32.f32">; +def int_nvvm_tex_unified_3d_level_v4s32_f32 + : 
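In texmode_unified the sampler state is implied by the texture handle, so every unified variant takes a single i64 handle where the texmode_independent forms above take two. A sketch (@sample_unified is illustrative):

  declare { float, float, float, float }
    @llvm.nvvm.tex.unified.2d.v4f32.f32(i64, float, float)

  define float @sample_unified(i64 %tex, float %x, float %y) {
    %t = call { float, float, float, float }
        @llvm.nvvm.tex.unified.2d.v4f32.f32(i64 %tex, float %x, float %y)
    %r = extractvalue { float, float, float, float } %t, 0
    ret float %r
  }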
Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_float_ty, llvm_float_ty, + llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tex.unified.3d.level.v4s32.f32">; +def int_nvvm_tex_unified_3d_grad_v4s32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_float_ty, llvm_float_ty, + llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, + llvm_float_ty, llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tex.unified.3d.grad.v4s32.f32">; +def int_nvvm_tex_unified_3d_v4u32_s32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [], "llvm.nvvm.tex.unified.3d.v4u32.s32">; +def int_nvvm_tex_unified_3d_v4u32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_float_ty, llvm_float_ty, + llvm_float_ty], [], + "llvm.nvvm.tex.unified.3d.v4u32.f32">; +def int_nvvm_tex_unified_3d_level_v4u32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_float_ty, llvm_float_ty, + llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tex.unified.3d.level.v4u32.f32">; +def int_nvvm_tex_unified_3d_grad_v4u32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_float_ty, llvm_float_ty, + llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, + llvm_float_ty, llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tex.unified.3d.grad.v4u32.f32">; + +def int_nvvm_tex_unified_cube_v4f32_f32 + : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], + [llvm_i64_ty, + llvm_float_ty, llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tex.unified.cube.v4f32.f32">; +def int_nvvm_tex_unified_cube_level_v4f32_f32 + : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], + [llvm_i64_ty, + llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tex.unified.cube.level.v4f32.f32">; +def int_nvvm_tex_unified_cube_v4s32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, + llvm_float_ty, llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tex.unified.cube.v4s32.f32">; +def int_nvvm_tex_unified_cube_level_v4s32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, + llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tex.unified.cube.level.v4s32.f32">; +def int_nvvm_tex_unified_cube_v4u32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, + llvm_float_ty, llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tex.unified.cube.v4u32.f32">; +def int_nvvm_tex_unified_cube_level_v4u32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, + llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tex.unified.cube.level.v4u32.f32">; + +def int_nvvm_tex_unified_cube_array_v4f32_f32 + : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], + [llvm_i64_ty, llvm_i32_ty, + llvm_float_ty, llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tex.unified.cube.array.v4f32.f32">; +def int_nvvm_tex_unified_cube_array_level_v4f32_f32 + : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], + [llvm_i64_ty, llvm_i32_ty, + llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tex.unified.cube.array.level.v4f32.f32">; +def int_nvvm_tex_unified_cube_array_v4s32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, 
llvm_i32_ty], + [llvm_i64_ty, llvm_i32_ty, + llvm_float_ty, llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tex.unified.cube.array.v4s32.f32">; +def int_nvvm_tex_unified_cube_array_level_v4s32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i32_ty, + llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tex.unified.cube.array.level.v4s32.f32">; +def int_nvvm_tex_unified_cube_array_v4u32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i32_ty, + llvm_float_ty, llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tex.unified.cube.array.v4u32.f32">; +def int_nvvm_tex_unified_cube_array_level_v4u32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i32_ty, + llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tex.unified.cube.array.level.v4u32.f32">; + +def int_nvvm_tld4_unified_r_2d_v4f32_f32 + : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], + [llvm_i64_ty, llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tld4.unified.r.2d.v4f32.f32">; +def int_nvvm_tld4_unified_g_2d_v4f32_f32 + : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], + [llvm_i64_ty, llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tld4.unified.g.2d.v4f32.f32">; +def int_nvvm_tld4_unified_b_2d_v4f32_f32 + : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], + [llvm_i64_ty, llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tld4.unified.b.2d.v4f32.f32">; +def int_nvvm_tld4_unified_a_2d_v4f32_f32 + : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], + [llvm_i64_ty, llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tld4.unified.a.2d.v4f32.f32">; +def int_nvvm_tld4_unified_r_2d_v4s32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tld4.unified.r.2d.v4s32.f32">; +def int_nvvm_tld4_unified_g_2d_v4s32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tld4.unified.g.2d.v4s32.f32">; +def int_nvvm_tld4_unified_b_2d_v4s32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tld4.unified.b.2d.v4s32.f32">; +def int_nvvm_tld4_unified_a_2d_v4s32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tld4.unified.a.2d.v4s32.f32">; +def int_nvvm_tld4_unified_r_2d_v4u32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tld4.unified.r.2d.v4u32.f32">; +def int_nvvm_tld4_unified_g_2d_v4u32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tld4.unified.g.2d.v4u32.f32">; +def int_nvvm_tld4_unified_b_2d_v4u32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tld4.unified.b.2d.v4u32.f32">; +def int_nvvm_tld4_unified_a_2d_v4u32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tld4.unified.a.2d.v4u32.f32">; + + +//=== Surface Load +// .clamp variants +def int_nvvm_suld_1d_i8_clamp + : Intrinsic<[llvm_i16_ty], + [llvm_i64_ty, 
llvm_i32_ty], [], + "llvm.nvvm.suld.1d.i8.clamp">; +def int_nvvm_suld_1d_i16_clamp + : Intrinsic<[llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.i16.clamp">; +def int_nvvm_suld_1d_i32_clamp + : Intrinsic<[llvm_i32_ty], + [llvm_i64_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.i32.clamp">; +def int_nvvm_suld_1d_i64_clamp + : Intrinsic<[llvm_i64_ty], + [llvm_i64_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.i64.clamp">; +def int_nvvm_suld_1d_v2i8_clamp + : Intrinsic<[llvm_i16_ty, llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.v2i8.clamp">; +def int_nvvm_suld_1d_v2i16_clamp + : Intrinsic<[llvm_i16_ty, llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.v2i16.clamp">; +def int_nvvm_suld_1d_v2i32_clamp + : Intrinsic<[llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.v2i32.clamp">; +def int_nvvm_suld_1d_v2i64_clamp + : Intrinsic<[llvm_i64_ty, llvm_i64_ty], + [llvm_i64_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.v2i64.clamp">; +def int_nvvm_suld_1d_v4i8_clamp + : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.v4i8.clamp">; +def int_nvvm_suld_1d_v4i16_clamp + : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.v4i16.clamp">; +def int_nvvm_suld_1d_v4i32_clamp + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.v4i32.clamp">; + +def int_nvvm_suld_1d_array_i8_clamp + : Intrinsic<[llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.array.i8.clamp">; +def int_nvvm_suld_1d_array_i16_clamp + : Intrinsic<[llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.array.i16.clamp">; +def int_nvvm_suld_1d_array_i32_clamp + : Intrinsic<[llvm_i32_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.array.i32.clamp">; +def int_nvvm_suld_1d_array_i64_clamp + : Intrinsic<[llvm_i64_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.array.i64.clamp">; +def int_nvvm_suld_1d_array_v2i8_clamp + : Intrinsic<[llvm_i16_ty, llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.array.v2i8.clamp">; +def int_nvvm_suld_1d_array_v2i16_clamp + : Intrinsic<[llvm_i16_ty, llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.array.v2i16.clamp">; +def int_nvvm_suld_1d_array_v2i32_clamp + : Intrinsic<[llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.array.v2i32.clamp">; +def int_nvvm_suld_1d_array_v2i64_clamp + : Intrinsic<[llvm_i64_ty, llvm_i64_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.array.v2i64.clamp">; +def int_nvvm_suld_1d_array_v4i8_clamp + : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.array.v4i8.clamp">; +def int_nvvm_suld_1d_array_v4i16_clamp + : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.array.v4i16.clamp">; +def int_nvvm_suld_1d_array_v4i32_clamp + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.array.v4i32.clamp">; + +def int_nvvm_suld_2d_i8_clamp + : Intrinsic<[llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + 
"llvm.nvvm.suld.2d.i8.clamp">; +def int_nvvm_suld_2d_i16_clamp + : Intrinsic<[llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.2d.i16.clamp">; +def int_nvvm_suld_2d_i32_clamp + : Intrinsic<[llvm_i32_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.2d.i32.clamp">; +def int_nvvm_suld_2d_i64_clamp + : Intrinsic<[llvm_i64_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.2d.i64.clamp">; +def int_nvvm_suld_2d_v2i8_clamp + : Intrinsic<[llvm_i16_ty, llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.2d.v2i8.clamp">; +def int_nvvm_suld_2d_v2i16_clamp + : Intrinsic<[llvm_i16_ty, llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.2d.v2i16.clamp">; +def int_nvvm_suld_2d_v2i32_clamp + : Intrinsic<[llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.2d.v2i32.clamp">; +def int_nvvm_suld_2d_v2i64_clamp + : Intrinsic<[llvm_i64_ty, llvm_i64_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.2d.v2i64.clamp">; +def int_nvvm_suld_2d_v4i8_clamp + : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.2d.v4i8.clamp">; +def int_nvvm_suld_2d_v4i16_clamp + : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.2d.v4i16.clamp">; +def int_nvvm_suld_2d_v4i32_clamp + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.2d.v4i32.clamp">; + +def int_nvvm_suld_2d_array_i8_clamp + : Intrinsic<[llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.2d.array.i8.clamp">; +def int_nvvm_suld_2d_array_i16_clamp + : Intrinsic<[llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.2d.array.i16.clamp">; +def int_nvvm_suld_2d_array_i32_clamp + : Intrinsic<[llvm_i32_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.2d.array.i32.clamp">; +def int_nvvm_suld_2d_array_i64_clamp + : Intrinsic<[llvm_i64_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.2d.array.i64.clamp">; +def int_nvvm_suld_2d_array_v2i8_clamp + : Intrinsic<[llvm_i16_ty, llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.2d.array.v2i8.clamp">; +def int_nvvm_suld_2d_array_v2i16_clamp + : Intrinsic<[llvm_i16_ty, llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.2d.array.v2i16.clamp">; +def int_nvvm_suld_2d_array_v2i32_clamp + : Intrinsic<[llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.2d.array.v2i32.clamp">; +def int_nvvm_suld_2d_array_v2i64_clamp + : Intrinsic<[llvm_i64_ty, llvm_i64_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.2d.array.v2i64.clamp">; +def int_nvvm_suld_2d_array_v4i8_clamp + : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.2d.array.v4i8.clamp">; +def int_nvvm_suld_2d_array_v4i16_clamp + : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.2d.array.v4i16.clamp">; +def int_nvvm_suld_2d_array_v4i32_clamp + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, 
llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.2d.array.v4i32.clamp">; + +def int_nvvm_suld_3d_i8_clamp + : Intrinsic<[llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.3d.i8.clamp">; +def int_nvvm_suld_3d_i16_clamp + : Intrinsic<[llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.3d.i16.clamp">; +def int_nvvm_suld_3d_i32_clamp + : Intrinsic<[llvm_i32_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.3d.i32.clamp">; +def int_nvvm_suld_3d_i64_clamp + : Intrinsic<[llvm_i64_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.3d.i64.clamp">; +def int_nvvm_suld_3d_v2i8_clamp + : Intrinsic<[llvm_i16_ty, llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.3d.v2i8.clamp">; +def int_nvvm_suld_3d_v2i16_clamp + : Intrinsic<[llvm_i16_ty, llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.3d.v2i16.clamp">; +def int_nvvm_suld_3d_v2i32_clamp + : Intrinsic<[llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.3d.v2i32.clamp">; +def int_nvvm_suld_3d_v2i64_clamp + : Intrinsic<[llvm_i64_ty, llvm_i64_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.3d.v2i64.clamp">; +def int_nvvm_suld_3d_v4i8_clamp + : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.3d.v4i8.clamp">; +def int_nvvm_suld_3d_v4i16_clamp + : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.3d.v4i16.clamp">; +def int_nvvm_suld_3d_v4i32_clamp + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.3d.v4i32.clamp">; + +// .trap variants +def int_nvvm_suld_1d_i8_trap + : Intrinsic<[llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.i8.trap">; +def int_nvvm_suld_1d_i16_trap + : Intrinsic<[llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.i16.trap">; +def int_nvvm_suld_1d_i32_trap + : Intrinsic<[llvm_i32_ty], + [llvm_i64_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.i32.trap">; +def int_nvvm_suld_1d_i64_trap + : Intrinsic<[llvm_i64_ty], + [llvm_i64_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.i64.trap">; +def int_nvvm_suld_1d_v2i8_trap + : Intrinsic<[llvm_i16_ty, llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.v2i8.trap">; +def int_nvvm_suld_1d_v2i16_trap + : Intrinsic<[llvm_i16_ty, llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.v2i16.trap">; +def int_nvvm_suld_1d_v2i32_trap + : Intrinsic<[llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.v2i32.trap">; +def int_nvvm_suld_1d_v2i64_trap + : Intrinsic<[llvm_i64_ty, llvm_i64_ty], + [llvm_i64_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.v2i64.trap">; +def int_nvvm_suld_1d_v4i8_trap + : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.v4i8.trap">; +def int_nvvm_suld_1d_v4i16_trap + : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.v4i16.trap">; +def int_nvvm_suld_1d_v4i32_trap + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, 
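Every surface load now comes in .clamp, .trap, and .zero flavors, matching the PTX out-of-bounds handling modes: clamp the coordinate to the surface, trap, or return zero. A minimal sketch contrasting the three 1D i32 forms (@load_clamped is illustrative):

  declare i32 @llvm.nvvm.suld.1d.i32.clamp(i64, i32)
  declare i32 @llvm.nvvm.suld.1d.i32.trap(i64, i32)
  declare i32 @llvm.nvvm.suld.1d.i32.zero(i64, i32)

  define i32 @load_clamped(i64 %surf, i32 %x) {
    %v = call i32 @llvm.nvvm.suld.1d.i32.clamp(i64 %surf, i32 %x)
    ret i32 %v
  }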
llvm_i32_ty], + [llvm_i64_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.v4i32.trap">; + +def int_nvvm_suld_1d_array_i8_trap + : Intrinsic<[llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.array.i8.trap">; +def int_nvvm_suld_1d_array_i16_trap + : Intrinsic<[llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.array.i16.trap">; +def int_nvvm_suld_1d_array_i32_trap + : Intrinsic<[llvm_i32_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.array.i32.trap">; +def int_nvvm_suld_1d_array_i64_trap + : Intrinsic<[llvm_i64_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.array.i64.trap">; +def int_nvvm_suld_1d_array_v2i8_trap + : Intrinsic<[llvm_i16_ty, llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.array.v2i8.trap">; +def int_nvvm_suld_1d_array_v2i16_trap + : Intrinsic<[llvm_i16_ty, llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.array.v2i16.trap">; +def int_nvvm_suld_1d_array_v2i32_trap + : Intrinsic<[llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.array.v2i32.trap">; +def int_nvvm_suld_1d_array_v2i64_trap + : Intrinsic<[llvm_i64_ty, llvm_i64_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.array.v2i64.trap">; +def int_nvvm_suld_1d_array_v4i8_trap + : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.array.v4i8.trap">; +def int_nvvm_suld_1d_array_v4i16_trap + : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.array.v4i16.trap">; +def int_nvvm_suld_1d_array_v4i32_trap + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.array.v4i32.trap">; + +def int_nvvm_suld_2d_i8_trap + : Intrinsic<[llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.2d.i8.trap">; +def int_nvvm_suld_2d_i16_trap + : Intrinsic<[llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.2d.i16.trap">; +def int_nvvm_suld_2d_i32_trap + : Intrinsic<[llvm_i32_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.2d.i32.trap">; +def int_nvvm_suld_2d_i64_trap + : Intrinsic<[llvm_i64_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.2d.i64.trap">; +def int_nvvm_suld_2d_v2i8_trap + : Intrinsic<[llvm_i16_ty, llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.2d.v2i8.trap">; +def int_nvvm_suld_2d_v2i16_trap + : Intrinsic<[llvm_i16_ty, llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.2d.v2i16.trap">; +def int_nvvm_suld_2d_v2i32_trap + : Intrinsic<[llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.2d.v2i32.trap">; +def int_nvvm_suld_2d_v2i64_trap + : Intrinsic<[llvm_i64_ty, llvm_i64_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.2d.v2i64.trap">; +def int_nvvm_suld_2d_v4i8_trap + : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.2d.v4i8.trap">; +def int_nvvm_suld_2d_v4i16_trap + : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.2d.v4i16.trap">; +def int_nvvm_suld_2d_v4i32_trap + : 
Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.2d.v4i32.trap">; + +def int_nvvm_suld_2d_array_i8_trap + : Intrinsic<[llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.2d.array.i8.trap">; +def int_nvvm_suld_2d_array_i16_trap + : Intrinsic<[llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.2d.array.i16.trap">; +def int_nvvm_suld_2d_array_i32_trap + : Intrinsic<[llvm_i32_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.2d.array.i32.trap">; +def int_nvvm_suld_2d_array_i64_trap + : Intrinsic<[llvm_i64_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.2d.array.i64.trap">; +def int_nvvm_suld_2d_array_v2i8_trap + : Intrinsic<[llvm_i16_ty, llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.2d.array.v2i8.trap">; +def int_nvvm_suld_2d_array_v2i16_trap + : Intrinsic<[llvm_i16_ty, llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.2d.array.v2i16.trap">; +def int_nvvm_suld_2d_array_v2i32_trap + : Intrinsic<[llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.2d.array.v2i32.trap">; +def int_nvvm_suld_2d_array_v2i64_trap + : Intrinsic<[llvm_i64_ty, llvm_i64_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.2d.array.v2i64.trap">; +def int_nvvm_suld_2d_array_v4i8_trap + : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.2d.array.v4i8.trap">; +def int_nvvm_suld_2d_array_v4i16_trap + : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.2d.array.v4i16.trap">; +def int_nvvm_suld_2d_array_v4i32_trap + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.2d.array.v4i32.trap">; + +def int_nvvm_suld_3d_i8_trap + : Intrinsic<[llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.3d.i8.trap">; +def int_nvvm_suld_3d_i16_trap + : Intrinsic<[llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.3d.i16.trap">; +def int_nvvm_suld_3d_i32_trap + : Intrinsic<[llvm_i32_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.3d.i32.trap">; +def int_nvvm_suld_3d_i64_trap + : Intrinsic<[llvm_i64_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.3d.i64.trap">; +def int_nvvm_suld_3d_v2i8_trap + : Intrinsic<[llvm_i16_ty, llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.3d.v2i8.trap">; +def int_nvvm_suld_3d_v2i16_trap + : Intrinsic<[llvm_i16_ty, llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.3d.v2i16.trap">; +def int_nvvm_suld_3d_v2i32_trap + : Intrinsic<[llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.3d.v2i32.trap">; +def int_nvvm_suld_3d_v2i64_trap + : Intrinsic<[llvm_i64_ty, llvm_i64_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.3d.v2i64.trap">; +def int_nvvm_suld_3d_v4i8_trap + : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], + 
[llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.3d.v4i8.trap">; +def int_nvvm_suld_3d_v4i16_trap + : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.3d.v4i16.trap">; +def int_nvvm_suld_3d_v4i32_trap + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.3d.v4i32.trap">; + +// .zero variants +def int_nvvm_suld_1d_i8_zero + : Intrinsic<[llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.i8.zero">; +def int_nvvm_suld_1d_i16_zero + : Intrinsic<[llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.i16.zero">; +def int_nvvm_suld_1d_i32_zero + : Intrinsic<[llvm_i32_ty], + [llvm_i64_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.i32.zero">; +def int_nvvm_suld_1d_i64_zero + : Intrinsic<[llvm_i64_ty], + [llvm_i64_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.i64.zero">; +def int_nvvm_suld_1d_v2i8_zero + : Intrinsic<[llvm_i16_ty, llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.v2i8.zero">; +def int_nvvm_suld_1d_v2i16_zero + : Intrinsic<[llvm_i16_ty, llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.v2i16.zero">; +def int_nvvm_suld_1d_v2i32_zero + : Intrinsic<[llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.v2i32.zero">; +def int_nvvm_suld_1d_v2i64_zero + : Intrinsic<[llvm_i64_ty, llvm_i64_ty], + [llvm_i64_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.v2i64.zero">; +def int_nvvm_suld_1d_v4i8_zero + : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.v4i8.zero">; +def int_nvvm_suld_1d_v4i16_zero + : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.v4i16.zero">; +def int_nvvm_suld_1d_v4i32_zero + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.v4i32.zero">; + +def int_nvvm_suld_1d_array_i8_zero + : Intrinsic<[llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.array.i8.zero">; +def int_nvvm_suld_1d_array_i16_zero + : Intrinsic<[llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.array.i16.zero">; +def int_nvvm_suld_1d_array_i32_zero + : Intrinsic<[llvm_i32_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.array.i32.zero">; +def int_nvvm_suld_1d_array_i64_zero + : Intrinsic<[llvm_i64_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.array.i64.zero">; +def int_nvvm_suld_1d_array_v2i8_zero + : Intrinsic<[llvm_i16_ty, llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.array.v2i8.zero">; +def int_nvvm_suld_1d_array_v2i16_zero + : Intrinsic<[llvm_i16_ty, llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.array.v2i16.zero">; +def int_nvvm_suld_1d_array_v2i32_zero + : Intrinsic<[llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.array.v2i32.zero">; +def int_nvvm_suld_1d_array_v2i64_zero + : Intrinsic<[llvm_i64_ty, llvm_i64_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.array.v2i64.zero">; +def int_nvvm_suld_1d_array_v4i8_zero + : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + 
"llvm.nvvm.suld.1d.array.v4i8.zero">; +def int_nvvm_suld_1d_array_v4i16_zero + : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.array.v4i16.zero">; +def int_nvvm_suld_1d_array_v4i32_zero + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.array.v4i32.zero">; + +def int_nvvm_suld_2d_i8_zero + : Intrinsic<[llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.2d.i8.zero">; +def int_nvvm_suld_2d_i16_zero : Intrinsic<[llvm_i16_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], - "llvm.nvvm.suld.2d.i16.trap">; -def int_nvvm_suld_2d_i32_trap + "llvm.nvvm.suld.2d.i16.zero">; +def int_nvvm_suld_2d_i32_zero : Intrinsic<[llvm_i32_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], - "llvm.nvvm.suld.2d.i32.trap">; -def int_nvvm_suld_2d_v2i8_trap + "llvm.nvvm.suld.2d.i32.zero">; +def int_nvvm_suld_2d_i64_zero + : Intrinsic<[llvm_i64_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.2d.i64.zero">; +def int_nvvm_suld_2d_v2i8_zero : Intrinsic<[llvm_i16_ty, llvm_i16_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], - "llvm.nvvm.suld.2d.v2i8.trap">; -def int_nvvm_suld_2d_v2i16_trap + "llvm.nvvm.suld.2d.v2i8.zero">; +def int_nvvm_suld_2d_v2i16_zero : Intrinsic<[llvm_i16_ty, llvm_i16_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], - "llvm.nvvm.suld.2d.v2i16.trap">; -def int_nvvm_suld_2d_v2i32_trap + "llvm.nvvm.suld.2d.v2i16.zero">; +def int_nvvm_suld_2d_v2i32_zero : Intrinsic<[llvm_i32_ty, llvm_i32_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], - "llvm.nvvm.suld.2d.v2i32.trap">; -def int_nvvm_suld_2d_v4i8_trap + "llvm.nvvm.suld.2d.v2i32.zero">; +def int_nvvm_suld_2d_v2i64_zero + : Intrinsic<[llvm_i64_ty, llvm_i64_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.2d.v2i64.zero">; +def int_nvvm_suld_2d_v4i8_zero : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], - "llvm.nvvm.suld.2d.v4i8.trap">; -def int_nvvm_suld_2d_v4i16_trap + "llvm.nvvm.suld.2d.v4i8.zero">; +def int_nvvm_suld_2d_v4i16_zero : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], - "llvm.nvvm.suld.2d.v4i16.trap">; -def int_nvvm_suld_2d_v4i32_trap + "llvm.nvvm.suld.2d.v4i16.zero">; +def int_nvvm_suld_2d_v4i32_zero : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], - "llvm.nvvm.suld.2d.v4i32.trap">; + "llvm.nvvm.suld.2d.v4i32.zero">; -def int_nvvm_suld_2d_array_i8_trap +def int_nvvm_suld_2d_array_i8_zero : Intrinsic<[llvm_i16_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], - "llvm.nvvm.suld.2d.array.i8.trap">; -def int_nvvm_suld_2d_array_i16_trap + "llvm.nvvm.suld.2d.array.i8.zero">; +def int_nvvm_suld_2d_array_i16_zero : Intrinsic<[llvm_i16_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], - "llvm.nvvm.suld.2d.array.i16.trap">; -def int_nvvm_suld_2d_array_i32_trap + "llvm.nvvm.suld.2d.array.i16.zero">; +def int_nvvm_suld_2d_array_i32_zero : Intrinsic<[llvm_i32_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], - "llvm.nvvm.suld.2d.array.i32.trap">; -def int_nvvm_suld_2d_array_v2i8_trap + "llvm.nvvm.suld.2d.array.i32.zero">; +def int_nvvm_suld_2d_array_i64_zero + : Intrinsic<[llvm_i64_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.2d.array.i64.zero">; +def 
int_nvvm_suld_2d_array_v2i8_zero : Intrinsic<[llvm_i16_ty, llvm_i16_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], - "llvm.nvvm.suld.2d.array.v2i8.trap">; -def int_nvvm_suld_2d_array_v2i16_trap + "llvm.nvvm.suld.2d.array.v2i8.zero">; +def int_nvvm_suld_2d_array_v2i16_zero : Intrinsic<[llvm_i16_ty, llvm_i16_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], - "llvm.nvvm.suld.2d.array.v2i16.trap">; -def int_nvvm_suld_2d_array_v2i32_trap + "llvm.nvvm.suld.2d.array.v2i16.zero">; +def int_nvvm_suld_2d_array_v2i32_zero : Intrinsic<[llvm_i32_ty, llvm_i32_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], - "llvm.nvvm.suld.2d.array.v2i32.trap">; -def int_nvvm_suld_2d_array_v4i8_trap + "llvm.nvvm.suld.2d.array.v2i32.zero">; +def int_nvvm_suld_2d_array_v2i64_zero + : Intrinsic<[llvm_i64_ty, llvm_i64_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.2d.array.v2i64.zero">; +def int_nvvm_suld_2d_array_v4i8_zero : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], - "llvm.nvvm.suld.2d.array.v4i8.trap">; -def int_nvvm_suld_2d_array_v4i16_trap + "llvm.nvvm.suld.2d.array.v4i8.zero">; +def int_nvvm_suld_2d_array_v4i16_zero : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], - "llvm.nvvm.suld.2d.array.v4i16.trap">; -def int_nvvm_suld_2d_array_v4i32_trap + "llvm.nvvm.suld.2d.array.v4i16.zero">; +def int_nvvm_suld_2d_array_v4i32_zero : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], - "llvm.nvvm.suld.2d.array.v4i32.trap">; + "llvm.nvvm.suld.2d.array.v4i32.zero">; -def int_nvvm_suld_3d_i8_trap +def int_nvvm_suld_3d_i8_zero : Intrinsic<[llvm_i16_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], - "llvm.nvvm.suld.3d.i8.trap">; -def int_nvvm_suld_3d_i16_trap + "llvm.nvvm.suld.3d.i8.zero">; +def int_nvvm_suld_3d_i16_zero : Intrinsic<[llvm_i16_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], - "llvm.nvvm.suld.3d.i16.trap">; -def int_nvvm_suld_3d_i32_trap + "llvm.nvvm.suld.3d.i16.zero">; +def int_nvvm_suld_3d_i32_zero : Intrinsic<[llvm_i32_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], - "llvm.nvvm.suld.3d.i32.trap">; -def int_nvvm_suld_3d_v2i8_trap + "llvm.nvvm.suld.3d.i32.zero">; +def int_nvvm_suld_3d_i64_zero + : Intrinsic<[llvm_i64_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.3d.i64.zero">; +def int_nvvm_suld_3d_v2i8_zero : Intrinsic<[llvm_i16_ty, llvm_i16_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], - "llvm.nvvm.suld.3d.v2i8.trap">; -def int_nvvm_suld_3d_v2i16_trap + "llvm.nvvm.suld.3d.v2i8.zero">; +def int_nvvm_suld_3d_v2i16_zero : Intrinsic<[llvm_i16_ty, llvm_i16_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], - "llvm.nvvm.suld.3d.v2i16.trap">; -def int_nvvm_suld_3d_v2i32_trap + "llvm.nvvm.suld.3d.v2i16.zero">; +def int_nvvm_suld_3d_v2i32_zero : Intrinsic<[llvm_i32_ty, llvm_i32_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], - "llvm.nvvm.suld.3d.v2i32.trap">; -def int_nvvm_suld_3d_v4i8_trap + "llvm.nvvm.suld.3d.v2i32.zero">; +def int_nvvm_suld_3d_v2i64_zero + : Intrinsic<[llvm_i64_ty, llvm_i64_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.3d.v2i64.zero">; +def int_nvvm_suld_3d_v4i8_zero : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], 
[llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], - "llvm.nvvm.suld.3d.v4i8.trap">; -def int_nvvm_suld_3d_v4i16_trap + "llvm.nvvm.suld.3d.v4i8.zero">; +def int_nvvm_suld_3d_v4i16_zero : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], - "llvm.nvvm.suld.3d.v4i16.trap">; -def int_nvvm_suld_3d_v4i32_trap + "llvm.nvvm.suld.3d.v4i16.zero">; +def int_nvvm_suld_3d_v4i32_zero : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], - "llvm.nvvm.suld.3d.v4i32.trap">; + "llvm.nvvm.suld.3d.v4i32.zero">; //===- Texture Query ------------------------------------------------------===// @@ -1353,7 +2603,277 @@ def int_nvvm_istypep_texture //===- Surface Stores -----------------------------------------------------===// // Unformatted +// .clamp variant +def int_nvvm_sust_b_1d_i8_clamp + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.1d.i8.clamp">, + GCCBuiltin<"__nvvm_sust_b_1d_i8_clamp">; +def int_nvvm_sust_b_1d_i16_clamp + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.1d.i16.clamp">, + GCCBuiltin<"__nvvm_sust_b_1d_i16_clamp">; +def int_nvvm_sust_b_1d_i32_clamp + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.sust.b.1d.i32.clamp">, + GCCBuiltin<"__nvvm_sust_b_1d_i32_clamp">; +def int_nvvm_sust_b_1d_i64_clamp + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i64_ty], [], + "llvm.nvvm.sust.b.1d.i64.clamp">, + GCCBuiltin<"__nvvm_sust_b_1d_i64_clamp">; +def int_nvvm_sust_b_1d_v2i8_clamp + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.1d.v2i8.clamp">, + GCCBuiltin<"__nvvm_sust_b_1d_v2i8_clamp">; +def int_nvvm_sust_b_1d_v2i16_clamp + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.1d.v2i16.clamp">, + GCCBuiltin<"__nvvm_sust_b_1d_v2i16_clamp">; +def int_nvvm_sust_b_1d_v2i32_clamp + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.sust.b.1d.v2i32.clamp">, + GCCBuiltin<"__nvvm_sust_b_1d_v2i32_clamp">; +def int_nvvm_sust_b_1d_v2i64_clamp + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i64_ty, llvm_i64_ty], [], + "llvm.nvvm.sust.b.1d.v2i64.clamp">, + GCCBuiltin<"__nvvm_sust_b_1d_v2i64_clamp">; +def int_nvvm_sust_b_1d_v4i8_clamp + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty, + llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.1d.v4i8.clamp">, + GCCBuiltin<"__nvvm_sust_b_1d_v4i8_clamp">; +def int_nvvm_sust_b_1d_v4i16_clamp + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty, + llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.1d.v4i16.clamp">, + GCCBuiltin<"__nvvm_sust_b_1d_v4i16_clamp">; +def int_nvvm_sust_b_1d_v4i32_clamp + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.sust.b.1d.v4i32.clamp">, + GCCBuiltin<"__nvvm_sust_b_1d_v4i32_clamp">; + + +def int_nvvm_sust_b_1d_array_i8_clamp + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.1d.array.i8.clamp">, + GCCBuiltin<"__nvvm_sust_b_1d_array_i8_clamp">; +def int_nvvm_sust_b_1d_array_i16_clamp + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.1d.array.i16.clamp">, + GCCBuiltin<"__nvvm_sust_b_1d_array_i16_clamp">; +def int_nvvm_sust_b_1d_array_i32_clamp + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, 
llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.sust.b.1d.array.i32.clamp">, + GCCBuiltin<"__nvvm_sust_b_1d_array_i32_clamp">; +def int_nvvm_sust_b_1d_array_i64_clamp + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i64_ty], [], + "llvm.nvvm.sust.b.1d.array.i64.clamp">, + GCCBuiltin<"__nvvm_sust_b_1d_array_i64_clamp">; +def int_nvvm_sust_b_1d_array_v2i8_clamp + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i16_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.1d.array.v2i8.clamp">, + GCCBuiltin<"__nvvm_sust_b_1d_array_v2i8_clamp">; +def int_nvvm_sust_b_1d_array_v2i16_clamp + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i16_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.1d.array.v2i16.clamp">, + GCCBuiltin<"__nvvm_sust_b_1d_array_v2i16_clamp">; +def int_nvvm_sust_b_1d_array_v2i32_clamp + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.sust.b.1d.array.v2i32.clamp">, + GCCBuiltin<"__nvvm_sust_b_1d_array_v2i32_clamp">; +def int_nvvm_sust_b_1d_array_v2i64_clamp + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i64_ty, llvm_i64_ty], [], + "llvm.nvvm.sust.b.1d.array.v2i64.clamp">, + GCCBuiltin<"__nvvm_sust_b_1d_array_v2i64_clamp">; +def int_nvvm_sust_b_1d_array_v4i8_clamp + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, + llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.1d.array.v4i8.clamp">, + GCCBuiltin<"__nvvm_sust_b_1d_array_v4i8_clamp">; +def int_nvvm_sust_b_1d_array_v4i16_clamp + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, + llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.1d.array.v4i16.clamp">, + GCCBuiltin<"__nvvm_sust_b_1d_array_v4i16_clamp">; +def int_nvvm_sust_b_1d_array_v4i32_clamp + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.sust.b.1d.array.v4i32.clamp">, + GCCBuiltin<"__nvvm_sust_b_1d_array_v4i32_clamp">; + + +def int_nvvm_sust_b_2d_i8_clamp + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.2d.i8.clamp">, + GCCBuiltin<"__nvvm_sust_b_2d_i8_clamp">; +def int_nvvm_sust_b_2d_i16_clamp + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.2d.i16.clamp">, + GCCBuiltin<"__nvvm_sust_b_2d_i16_clamp">; +def int_nvvm_sust_b_2d_i32_clamp + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.sust.b.2d.i32.clamp">, + GCCBuiltin<"__nvvm_sust_b_2d_i32_clamp">; +def int_nvvm_sust_b_2d_i64_clamp + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i64_ty], [], + "llvm.nvvm.sust.b.2d.i64.clamp">, + GCCBuiltin<"__nvvm_sust_b_2d_i64_clamp">; +def int_nvvm_sust_b_2d_v2i8_clamp + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i16_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.2d.v2i8.clamp">, + GCCBuiltin<"__nvvm_sust_b_2d_v2i8_clamp">; +def int_nvvm_sust_b_2d_v2i16_clamp + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i16_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.2d.v2i16.clamp">, + GCCBuiltin<"__nvvm_sust_b_2d_v2i16_clamp">; +def int_nvvm_sust_b_2d_v2i32_clamp + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.sust.b.2d.v2i32.clamp">, + GCCBuiltin<"__nvvm_sust_b_2d_v2i32_clamp">; +def int_nvvm_sust_b_2d_v2i64_clamp + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i64_ty, llvm_i64_ty], [], + 
"llvm.nvvm.sust.b.2d.v2i64.clamp">, + GCCBuiltin<"__nvvm_sust_b_2d_v2i64_clamp">; +def int_nvvm_sust_b_2d_v4i8_clamp + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, + llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.2d.v4i8.clamp">, + GCCBuiltin<"__nvvm_sust_b_2d_v4i8_clamp">; +def int_nvvm_sust_b_2d_v4i16_clamp + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, + llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.2d.v4i16.clamp">, + GCCBuiltin<"__nvvm_sust_b_2d_v4i16_clamp">; +def int_nvvm_sust_b_2d_v4i32_clamp + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.sust.b.2d.v4i32.clamp">, + GCCBuiltin<"__nvvm_sust_b_2d_v4i32_clamp">; + + +def int_nvvm_sust_b_2d_array_i8_clamp + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i32_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.2d.array.i8.clamp">, + GCCBuiltin<"__nvvm_sust_b_2d_array_i8_clamp">; +def int_nvvm_sust_b_2d_array_i16_clamp + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i32_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.2d.array.i16.clamp">, + GCCBuiltin<"__nvvm_sust_b_2d_array_i16_clamp">; +def int_nvvm_sust_b_2d_array_i32_clamp + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.sust.b.2d.array.i32.clamp">, + GCCBuiltin<"__nvvm_sust_b_2d_array_i32_clamp">; +def int_nvvm_sust_b_2d_array_i64_clamp + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i32_ty, llvm_i64_ty], [], + "llvm.nvvm.sust.b.2d.array.i64.clamp">, + GCCBuiltin<"__nvvm_sust_b_2d_array_i64_clamp">; +def int_nvvm_sust_b_2d_array_v2i8_clamp + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i16_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.2d.array.v2i8.clamp">, + GCCBuiltin<"__nvvm_sust_b_2d_array_v2i8_clamp">; +def int_nvvm_sust_b_2d_array_v2i16_clamp + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i16_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.2d.array.v2i16.clamp">, + GCCBuiltin<"__nvvm_sust_b_2d_array_v2i16_clamp">; +def int_nvvm_sust_b_2d_array_v2i32_clamp + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.sust.b.2d.array.v2i32.clamp">, + GCCBuiltin<"__nvvm_sust_b_2d_array_v2i32_clamp">; +def int_nvvm_sust_b_2d_array_v2i64_clamp + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i64_ty, llvm_i64_ty], [], + "llvm.nvvm.sust.b.2d.array.v2i64.clamp">, + GCCBuiltin<"__nvvm_sust_b_2d_array_v2i64_clamp">; +def int_nvvm_sust_b_2d_array_v4i8_clamp + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.2d.array.v4i8.clamp">, + GCCBuiltin<"__nvvm_sust_b_2d_array_v4i8_clamp">; +def int_nvvm_sust_b_2d_array_v4i16_clamp + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.2d.array.v4i16.clamp">, + GCCBuiltin<"__nvvm_sust_b_2d_array_v4i16_clamp">; +def int_nvvm_sust_b_2d_array_v4i32_clamp + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.sust.b.2d.array.v4i32.clamp">, + GCCBuiltin<"__nvvm_sust_b_2d_array_v4i32_clamp">; + +def int_nvvm_sust_b_3d_i8_clamp + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, + 
llvm_i32_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.3d.i8.clamp">, + GCCBuiltin<"__nvvm_sust_b_3d_i8_clamp">; +def int_nvvm_sust_b_3d_i16_clamp + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i32_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.3d.i16.clamp">, + GCCBuiltin<"__nvvm_sust_b_3d_i16_clamp">; +def int_nvvm_sust_b_3d_i32_clamp + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.sust.b.3d.i32.clamp">, + GCCBuiltin<"__nvvm_sust_b_3d_i32_clamp">; +def int_nvvm_sust_b_3d_i64_clamp + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i32_ty, llvm_i64_ty], [], + "llvm.nvvm.sust.b.3d.i64.clamp">, + GCCBuiltin<"__nvvm_sust_b_3d_i64_clamp">; +def int_nvvm_sust_b_3d_v2i8_clamp + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i16_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.3d.v2i8.clamp">, + GCCBuiltin<"__nvvm_sust_b_3d_v2i8_clamp">; +def int_nvvm_sust_b_3d_v2i16_clamp + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i16_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.3d.v2i16.clamp">, + GCCBuiltin<"__nvvm_sust_b_3d_v2i16_clamp">; +def int_nvvm_sust_b_3d_v2i32_clamp + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.sust.b.3d.v2i32.clamp">, + GCCBuiltin<"__nvvm_sust_b_3d_v2i32_clamp">; +def int_nvvm_sust_b_3d_v2i64_clamp + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i64_ty, llvm_i64_ty], [], + "llvm.nvvm.sust.b.3d.v2i64.clamp">, + GCCBuiltin<"__nvvm_sust_b_3d_v2i64_clamp">; +def int_nvvm_sust_b_3d_v4i8_clamp + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.3d.v4i8.clamp">, + GCCBuiltin<"__nvvm_sust_b_3d_v4i8_clamp">; +def int_nvvm_sust_b_3d_v4i16_clamp + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.3d.v4i16.clamp">, + GCCBuiltin<"__nvvm_sust_b_3d_v4i16_clamp">; +def int_nvvm_sust_b_3d_v4i32_clamp + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.sust.b.3d.v4i32.clamp">, + GCCBuiltin<"__nvvm_sust_b_3d_v4i32_clamp">; + + +// .trap variant def int_nvvm_sust_b_1d_i8_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.1d.i8.trap">, @@ -1366,6 +2886,10 @@ def int_nvvm_sust_b_1d_i32_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], "llvm.nvvm.sust.b.1d.i32.trap">, GCCBuiltin<"__nvvm_sust_b_1d_i32_trap">; +def int_nvvm_sust_b_1d_i64_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i64_ty], [], + "llvm.nvvm.sust.b.1d.i64.trap">, + GCCBuiltin<"__nvvm_sust_b_1d_i64_trap">; def int_nvvm_sust_b_1d_v2i8_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.1d.v2i8.trap">, @@ -1378,6 +2902,10 @@ def int_nvvm_sust_b_1d_v2i32_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], "llvm.nvvm.sust.b.1d.v2i32.trap">, GCCBuiltin<"__nvvm_sust_b_1d_v2i32_trap">; +def int_nvvm_sust_b_1d_v2i64_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i64_ty, llvm_i64_ty], [], + "llvm.nvvm.sust.b.1d.v2i64.trap">, + GCCBuiltin<"__nvvm_sust_b_1d_v2i64_trap">; def int_nvvm_sust_b_1d_v4i8_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], @@ 
-1407,6 +2935,10 @@ def int_nvvm_sust_b_1d_array_i32_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], "llvm.nvvm.sust.b.1d.array.i32.trap">, GCCBuiltin<"__nvvm_sust_b_1d_array_i32_trap">; +def int_nvvm_sust_b_1d_array_i64_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i64_ty], [], + "llvm.nvvm.sust.b.1d.array.i64.trap">, + GCCBuiltin<"__nvvm_sust_b_1d_array_i64_trap">; def int_nvvm_sust_b_1d_array_v2i8_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty], [], @@ -1422,6 +2954,11 @@ def int_nvvm_sust_b_1d_array_v2i32_trap llvm_i32_ty, llvm_i32_ty], [], "llvm.nvvm.sust.b.1d.array.v2i32.trap">, GCCBuiltin<"__nvvm_sust_b_1d_array_v2i32_trap">; +def int_nvvm_sust_b_1d_array_v2i64_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i64_ty, llvm_i64_ty], [], + "llvm.nvvm.sust.b.1d.array.v2i64.trap">, + GCCBuiltin<"__nvvm_sust_b_1d_array_v2i64_trap">; def int_nvvm_sust_b_1d_array_v4i8_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], @@ -1451,6 +2988,10 @@ def int_nvvm_sust_b_2d_i32_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], "llvm.nvvm.sust.b.2d.i32.trap">, GCCBuiltin<"__nvvm_sust_b_2d_i32_trap">; +def int_nvvm_sust_b_2d_i64_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i64_ty], [], + "llvm.nvvm.sust.b.2d.i64.trap">, + GCCBuiltin<"__nvvm_sust_b_2d_i64_trap">; def int_nvvm_sust_b_2d_v2i8_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty], [], @@ -1466,6 +3007,11 @@ def int_nvvm_sust_b_2d_v2i32_trap llvm_i32_ty, llvm_i32_ty], [], "llvm.nvvm.sust.b.2d.v2i32.trap">, GCCBuiltin<"__nvvm_sust_b_2d_v2i32_trap">; +def int_nvvm_sust_b_2d_v2i64_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i64_ty, llvm_i64_ty], [], + "llvm.nvvm.sust.b.2d.v2i64.trap">, + GCCBuiltin<"__nvvm_sust_b_2d_v2i64_trap">; def int_nvvm_sust_b_2d_v4i8_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], @@ -1498,6 +3044,11 @@ def int_nvvm_sust_b_2d_array_i32_trap llvm_i32_ty, llvm_i32_ty], [], "llvm.nvvm.sust.b.2d.array.i32.trap">, GCCBuiltin<"__nvvm_sust_b_2d_array_i32_trap">; +def int_nvvm_sust_b_2d_array_i64_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i32_ty, llvm_i64_ty], [], + "llvm.nvvm.sust.b.2d.array.i64.trap">, + GCCBuiltin<"__nvvm_sust_b_2d_array_i64_trap">; def int_nvvm_sust_b_2d_array_v2i8_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty], [], @@ -1513,6 +3064,11 @@ def int_nvvm_sust_b_2d_array_v2i32_trap llvm_i32_ty, llvm_i32_ty], [], "llvm.nvvm.sust.b.2d.array.v2i32.trap">, GCCBuiltin<"__nvvm_sust_b_2d_array_v2i32_trap">; +def int_nvvm_sust_b_2d_array_v2i64_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i64_ty, llvm_i64_ty], [], + "llvm.nvvm.sust.b.2d.array.v2i64.trap">, + GCCBuiltin<"__nvvm_sust_b_2d_array_v2i64_trap">; def int_nvvm_sust_b_2d_array_v4i8_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], @@ -1545,6 +3101,11 @@ def int_nvvm_sust_b_3d_i32_trap llvm_i32_ty, llvm_i32_ty], [], "llvm.nvvm.sust.b.3d.i32.trap">, GCCBuiltin<"__nvvm_sust_b_3d_i32_trap">; +def int_nvvm_sust_b_3d_i64_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i32_ty, llvm_i64_ty], [], + 
"llvm.nvvm.sust.b.3d.i64.trap">, + GCCBuiltin<"__nvvm_sust_b_3d_i64_trap">; def int_nvvm_sust_b_3d_v2i8_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty], [], @@ -1560,6 +3121,11 @@ def int_nvvm_sust_b_3d_v2i32_trap llvm_i32_ty, llvm_i32_ty], [], "llvm.nvvm.sust.b.3d.v2i32.trap">, GCCBuiltin<"__nvvm_sust_b_3d_v2i32_trap">; +def int_nvvm_sust_b_3d_v2i64_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i64_ty, llvm_i64_ty], [], + "llvm.nvvm.sust.b.3d.v2i64.trap">, + GCCBuiltin<"__nvvm_sust_b_3d_v2i64_trap">; def int_nvvm_sust_b_3d_v4i8_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], @@ -1576,6 +3142,278 @@ def int_nvvm_sust_b_3d_v4i32_trap "llvm.nvvm.sust.b.3d.v4i32.trap">, GCCBuiltin<"__nvvm_sust_b_3d_v4i32_trap">; + +// .zero variant +def int_nvvm_sust_b_1d_i8_zero + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.1d.i8.zero">, + GCCBuiltin<"__nvvm_sust_b_1d_i8_zero">; +def int_nvvm_sust_b_1d_i16_zero + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.1d.i16.zero">, + GCCBuiltin<"__nvvm_sust_b_1d_i16_zero">; +def int_nvvm_sust_b_1d_i32_zero + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.sust.b.1d.i32.zero">, + GCCBuiltin<"__nvvm_sust_b_1d_i32_zero">; +def int_nvvm_sust_b_1d_i64_zero + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i64_ty], [], + "llvm.nvvm.sust.b.1d.i64.zero">, + GCCBuiltin<"__nvvm_sust_b_1d_i64_zero">; +def int_nvvm_sust_b_1d_v2i8_zero + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.1d.v2i8.zero">, + GCCBuiltin<"__nvvm_sust_b_1d_v2i8_zero">; +def int_nvvm_sust_b_1d_v2i16_zero + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.1d.v2i16.zero">, + GCCBuiltin<"__nvvm_sust_b_1d_v2i16_zero">; +def int_nvvm_sust_b_1d_v2i32_zero + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.sust.b.1d.v2i32.zero">, + GCCBuiltin<"__nvvm_sust_b_1d_v2i32_zero">; +def int_nvvm_sust_b_1d_v2i64_zero + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i64_ty, llvm_i64_ty], [], + "llvm.nvvm.sust.b.1d.v2i64.zero">, + GCCBuiltin<"__nvvm_sust_b_1d_v2i64_zero">; +def int_nvvm_sust_b_1d_v4i8_zero + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty, + llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.1d.v4i8.zero">, + GCCBuiltin<"__nvvm_sust_b_1d_v4i8_zero">; +def int_nvvm_sust_b_1d_v4i16_zero + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty, + llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.1d.v4i16.zero">, + GCCBuiltin<"__nvvm_sust_b_1d_v4i16_zero">; +def int_nvvm_sust_b_1d_v4i32_zero + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.sust.b.1d.v4i32.zero">, + GCCBuiltin<"__nvvm_sust_b_1d_v4i32_zero">; + + +def int_nvvm_sust_b_1d_array_i8_zero + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.1d.array.i8.zero">, + GCCBuiltin<"__nvvm_sust_b_1d_array_i8_zero">; +def int_nvvm_sust_b_1d_array_i16_zero + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.1d.array.i16.zero">, + GCCBuiltin<"__nvvm_sust_b_1d_array_i16_zero">; +def int_nvvm_sust_b_1d_array_i32_zero + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], 
[], + "llvm.nvvm.sust.b.1d.array.i32.zero">, + GCCBuiltin<"__nvvm_sust_b_1d_array_i32_zero">; +def int_nvvm_sust_b_1d_array_i64_zero + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i64_ty], [], + "llvm.nvvm.sust.b.1d.array.i64.zero">, + GCCBuiltin<"__nvvm_sust_b_1d_array_i64_zero">; +def int_nvvm_sust_b_1d_array_v2i8_zero + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i16_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.1d.array.v2i8.zero">, + GCCBuiltin<"__nvvm_sust_b_1d_array_v2i8_zero">; +def int_nvvm_sust_b_1d_array_v2i16_zero + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i16_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.1d.array.v2i16.zero">, + GCCBuiltin<"__nvvm_sust_b_1d_array_v2i16_zero">; +def int_nvvm_sust_b_1d_array_v2i32_zero + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.sust.b.1d.array.v2i32.zero">, + GCCBuiltin<"__nvvm_sust_b_1d_array_v2i32_zero">; +def int_nvvm_sust_b_1d_array_v2i64_zero + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i64_ty, llvm_i64_ty], [], + "llvm.nvvm.sust.b.1d.array.v2i64.zero">, + GCCBuiltin<"__nvvm_sust_b_1d_array_v2i64_zero">; +def int_nvvm_sust_b_1d_array_v4i8_zero + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, + llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.1d.array.v4i8.zero">, + GCCBuiltin<"__nvvm_sust_b_1d_array_v4i8_zero">; +def int_nvvm_sust_b_1d_array_v4i16_zero + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, + llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.1d.array.v4i16.zero">, + GCCBuiltin<"__nvvm_sust_b_1d_array_v4i16_zero">; +def int_nvvm_sust_b_1d_array_v4i32_zero + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.sust.b.1d.array.v4i32.zero">, + GCCBuiltin<"__nvvm_sust_b_1d_array_v4i32_zero">; + + +def int_nvvm_sust_b_2d_i8_zero + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.2d.i8.zero">, + GCCBuiltin<"__nvvm_sust_b_2d_i8_zero">; +def int_nvvm_sust_b_2d_i16_zero + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.2d.i16.zero">, + GCCBuiltin<"__nvvm_sust_b_2d_i16_zero">; +def int_nvvm_sust_b_2d_i32_zero + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.sust.b.2d.i32.zero">, + GCCBuiltin<"__nvvm_sust_b_2d_i32_zero">; +def int_nvvm_sust_b_2d_i64_zero + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i64_ty], [], + "llvm.nvvm.sust.b.2d.i64.zero">, + GCCBuiltin<"__nvvm_sust_b_2d_i64_zero">; +def int_nvvm_sust_b_2d_v2i8_zero + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i16_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.2d.v2i8.zero">, + GCCBuiltin<"__nvvm_sust_b_2d_v2i8_zero">; +def int_nvvm_sust_b_2d_v2i16_zero + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i16_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.2d.v2i16.zero">, + GCCBuiltin<"__nvvm_sust_b_2d_v2i16_zero">; +def int_nvvm_sust_b_2d_v2i32_zero + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.sust.b.2d.v2i32.zero">, + GCCBuiltin<"__nvvm_sust_b_2d_v2i32_zero">; +def int_nvvm_sust_b_2d_v2i64_zero + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i64_ty, llvm_i64_ty], [], + "llvm.nvvm.sust.b.2d.v2i64.zero">, + GCCBuiltin<"__nvvm_sust_b_2d_v2i64_zero">; +def 
int_nvvm_sust_b_2d_v4i8_zero + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, + llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.2d.v4i8.zero">, + GCCBuiltin<"__nvvm_sust_b_2d_v4i8_zero">; +def int_nvvm_sust_b_2d_v4i16_zero + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, + llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.2d.v4i16.zero">, + GCCBuiltin<"__nvvm_sust_b_2d_v4i16_zero">; +def int_nvvm_sust_b_2d_v4i32_zero + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.sust.b.2d.v4i32.zero">, + GCCBuiltin<"__nvvm_sust_b_2d_v4i32_zero">; + + +def int_nvvm_sust_b_2d_array_i8_zero + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i32_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.2d.array.i8.zero">, + GCCBuiltin<"__nvvm_sust_b_2d_array_i8_zero">; +def int_nvvm_sust_b_2d_array_i16_zero + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i32_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.2d.array.i16.zero">, + GCCBuiltin<"__nvvm_sust_b_2d_array_i16_zero">; +def int_nvvm_sust_b_2d_array_i32_zero + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.sust.b.2d.array.i32.zero">, + GCCBuiltin<"__nvvm_sust_b_2d_array_i32_zero">; +def int_nvvm_sust_b_2d_array_i64_zero + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i32_ty, llvm_i64_ty], [], + "llvm.nvvm.sust.b.2d.array.i64.zero">, + GCCBuiltin<"__nvvm_sust_b_2d_array_i64_zero">; +def int_nvvm_sust_b_2d_array_v2i8_zero + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i16_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.2d.array.v2i8.zero">, + GCCBuiltin<"__nvvm_sust_b_2d_array_v2i8_zero">; +def int_nvvm_sust_b_2d_array_v2i16_zero + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i16_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.2d.array.v2i16.zero">, + GCCBuiltin<"__nvvm_sust_b_2d_array_v2i16_zero">; +def int_nvvm_sust_b_2d_array_v2i32_zero + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.sust.b.2d.array.v2i32.zero">, + GCCBuiltin<"__nvvm_sust_b_2d_array_v2i32_zero">; +def int_nvvm_sust_b_2d_array_v2i64_zero + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i64_ty, llvm_i64_ty], [], + "llvm.nvvm.sust.b.2d.array.v2i64.zero">, + GCCBuiltin<"__nvvm_sust_b_2d_array_v2i64_zero">; +def int_nvvm_sust_b_2d_array_v4i8_zero + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.2d.array.v4i8.zero">, + GCCBuiltin<"__nvvm_sust_b_2d_array_v4i8_zero">; +def int_nvvm_sust_b_2d_array_v4i16_zero + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.2d.array.v4i16.zero">, + GCCBuiltin<"__nvvm_sust_b_2d_array_v4i16_zero">; +def int_nvvm_sust_b_2d_array_v4i32_zero + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.sust.b.2d.array.v4i32.zero">, + GCCBuiltin<"__nvvm_sust_b_2d_array_v4i32_zero">; + + +def int_nvvm_sust_b_3d_i8_zero + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i32_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.3d.i8.zero">, + GCCBuiltin<"__nvvm_sust_b_3d_i8_zero">; +def 
int_nvvm_sust_b_3d_i16_zero + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i32_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.3d.i16.zero">, + GCCBuiltin<"__nvvm_sust_b_3d_i16_zero">; +def int_nvvm_sust_b_3d_i32_zero + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.sust.b.3d.i32.zero">, + GCCBuiltin<"__nvvm_sust_b_3d_i32_zero">; +def int_nvvm_sust_b_3d_i64_zero + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i32_ty, llvm_i64_ty], [], + "llvm.nvvm.sust.b.3d.i64.zero">, + GCCBuiltin<"__nvvm_sust_b_3d_i64_zero">; +def int_nvvm_sust_b_3d_v2i8_zero + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i16_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.3d.v2i8.zero">, + GCCBuiltin<"__nvvm_sust_b_3d_v2i8_zero">; +def int_nvvm_sust_b_3d_v2i16_zero + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i16_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.3d.v2i16.zero">, + GCCBuiltin<"__nvvm_sust_b_3d_v2i16_zero">; +def int_nvvm_sust_b_3d_v2i32_zero + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.sust.b.3d.v2i32.zero">, + GCCBuiltin<"__nvvm_sust_b_3d_v2i32_zero">; +def int_nvvm_sust_b_3d_v2i64_zero + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i64_ty, llvm_i64_ty], [], + "llvm.nvvm.sust.b.3d.v2i64.zero">, + GCCBuiltin<"__nvvm_sust_b_3d_v2i64_zero">; +def int_nvvm_sust_b_3d_v4i8_zero + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.3d.v4i8.zero">, + GCCBuiltin<"__nvvm_sust_b_3d_v4i8_zero">; +def int_nvvm_sust_b_3d_v4i16_zero + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.3d.v4i16.zero">, + GCCBuiltin<"__nvvm_sust_b_3d_v4i16_zero">; +def int_nvvm_sust_b_3d_v4i32_zero + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.sust.b.3d.v4i32.zero">, + GCCBuiltin<"__nvvm_sust_b_3d_v4i32_zero">; + + + // Formatted def int_nvvm_sust_p_1d_i8_trap @@ -1801,6 +3639,26 @@ def int_nvvm_sust_p_3d_v4i32_trap GCCBuiltin<"__nvvm_sust_p_3d_v4i32_trap">; +def int_nvvm_rotate_b32 + : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], + [IntrNoMem], "llvm.nvvm.rotate.b32">, + GCCBuiltin<"__nvvm_rotate_b32">; + +def int_nvvm_rotate_b64 + :Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty], + [IntrNoMem], "llvm.nvvm.rotate.b64">, + GCCBuiltin<"__nvvm_rotate_b64">; + +def int_nvvm_rotate_right_b64 + : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty], + [IntrNoMem], "llvm.nvvm.rotate.right.b64">, + GCCBuiltin<"__nvvm_rotate_right_b64">; + +def int_nvvm_swap_lo_hi_b64 + : Intrinsic<[llvm_i64_ty], [llvm_i64_ty], + [IntrNoMem], "llvm.nvvm.swap.lo.hi.b64">, + GCCBuiltin<"__nvvm_swap_lo_hi_b64">; + // Old PTX back-end intrinsics retained here for backwards-compatibility diff --git a/include/llvm/IR/IntrinsicsR600.td b/include/llvm/IR/IntrinsicsR600.td index ecb5668d8e95..ba69eaae089f 100644 --- a/include/llvm/IR/IntrinsicsR600.td +++ b/include/llvm/IR/IntrinsicsR600.td @@ -33,4 +33,40 @@ defm int_r600_read_tgid : R600ReadPreloadRegisterIntrinsic_xyz < "__builtin_r600_read_tgid">; defm int_r600_read_tidig : R600ReadPreloadRegisterIntrinsic_xyz < "__builtin_r600_read_tidig">; + } // End TargetPrefix = "r600" + +let 
TargetPrefix = "AMDGPU" in { +def int_AMDGPU_div_scale : GCCBuiltin<"__builtin_amdgpu_div_scale">, + // 1st parameter: Numerator + // 2nd parameter: Denominator + // 3rd parameter: Constant to select select between first and + // second. (0 = first, 1 = second). + Intrinsic<[llvm_anyfloat_ty, llvm_i1_ty], + [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i1_ty], + [IntrNoMem]>; + +def int_AMDGPU_div_fmas : GCCBuiltin<"__builtin_amdgpu_div_fmas">, + Intrinsic<[llvm_anyfloat_ty], + [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], + [IntrNoMem]>; + +def int_AMDGPU_div_fixup : GCCBuiltin<"__builtin_amdgpu_div_fixup">, + Intrinsic<[llvm_anyfloat_ty], + [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], + [IntrNoMem]>; + +def int_AMDGPU_trig_preop : GCCBuiltin<"__builtin_amdgpu_trig_preop">, + Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, llvm_i32_ty], + [IntrNoMem]>; + +def int_AMDGPU_rcp : GCCBuiltin<"__builtin_amdgpu_rcp">, + Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; + +def int_AMDGPU_rsq : GCCBuiltin<"__builtin_amdgpu_rsq">, + Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; + +def int_AMDGPU_rsq_clamped : GCCBuiltin<"__builtin_amdgpu_rsq_clamped">, + Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; + +} // End TargetPrefix = "AMDGPU" diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td index 7c81223b091f..018aa9a87d07 100644 --- a/include/llvm/IR/IntrinsicsX86.td +++ b/include/llvm/IR/IntrinsicsX86.td @@ -26,6 +26,12 @@ let TargetPrefix = "x86" in { Intrinsic<[llvm_i64_ty], [llvm_ptr_ty], [IntrReadWriteArgMem]>; } +// Read Performance-Monitoring Counter. +let TargetPrefix = "x86" in { + def int_x86_rdpmc : GCCBuiltin<"__builtin_ia32_rdpmc">, + Intrinsic<[llvm_i64_ty], [llvm_i32_ty], []>; +} + //===----------------------------------------------------------------------===// // 3DNow! @@ -667,6 +673,15 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_ssse3_pshuf_b_128 : GCCBuiltin<"__builtin_ia32_pshufb128">, Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>; + def int_x86_sse2_pshuf_d : GCCBuiltin<"__builtin_ia32_pshufd">, + Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_sse2_pshufl_w : GCCBuiltin<"__builtin_ia32_pshuflw">, + Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_sse2_pshufh_w : GCCBuiltin<"__builtin_ia32_pshufhw">, + Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i8_ty], + [IntrNoMem]>; def int_x86_sse_pshuf_w : GCCBuiltin<"__builtin_ia32_pshufw">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_i8_ty], [IntrNoMem]>; @@ -2940,10 +2955,12 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". 
Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>; - def int_x86_avx512_sqrt_pd_512 : GCCBuiltin<"__builtin_ia32_sqrtpd512">, - Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty], [IntrNoMem]>; - def int_x86_avx512_sqrt_ps_512 : GCCBuiltin<"__builtin_ia32_sqrtps512">, - Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty], [IntrNoMem]>; + def int_x86_avx512_sqrt_pd_512 : GCCBuiltin<"__builtin_ia32_sqrtpd512_mask">, + Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty, + llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; + def int_x86_avx512_sqrt_ps_512 : GCCBuiltin<"__builtin_ia32_sqrtps512_mask">, + Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty, + llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>; def int_x86_avx512_rsqrt14_ss : GCCBuiltin<"__builtin_ia32_rsqrt14ss_mask">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, diff --git a/include/llvm/IR/Module.h b/include/llvm/IR/Module.h index e2b86f62fb8d..26f62db9db53 100644 --- a/include/llvm/IR/Module.h +++ b/include/llvm/IR/Module.h @@ -16,6 +16,7 @@ #define LLVM_IR_MODULE_H #include "llvm/ADT/iterator_range.h" +#include "llvm/IR/Comdat.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalAlias.h" @@ -29,6 +30,7 @@ namespace llvm { class FunctionType; class GVMaterializer; class LLVMContext; +class RandomNumberGenerator; class StructType; template<typename T> struct DenseMapInfo; template<typename KeyT, typename ValueT, typename KeyInfoT> class DenseMap; @@ -122,6 +124,8 @@ class Module { typedef iplist<GlobalAlias> AliasListType; /// The type for the list of named metadata. typedef ilist<NamedMDNode> NamedMDListType; + /// The type of the comdat "symbol" table. + typedef StringMap<Comdat> ComdatSymTabType; /// The Global Variable iterator. typedef GlobalListType::iterator global_iterator; @@ -196,11 +200,14 @@ class Module { NamedMDListType NamedMDList; ///< The named metadata in the module std::string GlobalScopeAsm; ///< Inline Asm at global scope. ValueSymbolTable *ValSymTab; ///< Symbol table for values + ComdatSymTabType ComdatSymTab; ///< Symbol table for COMDATs std::unique_ptr<GVMaterializer> Materializer; ///< Used to materialize GlobalValues std::string ModuleID; ///< Human readable identifier for the module std::string TargetTriple; ///< Platform target triple Module compiled on void *NamedMDSymTab; ///< NamedMDNode names. + // Allow lazy initialization in const method. + mutable RandomNumberGenerator *RNG; ///< The random number generator for this module. // We need to keep the string because the C API expects us to own the string // representation. @@ -249,6 +256,11 @@ class Module { /// @returns a string containing the module-scope inline assembly blocks. const std::string &getModuleInlineAsm() const { return GlobalScopeAsm; } + /// Get the RandomNumberGenerator for this module. The RNG can be + /// seeded via -rng-seed=<uint64> and is salted with the ModuleID. + /// The returned RNG should not be shared across threads. + RandomNumberGenerator &getRNG() const; + /// @} /// @name Module Level Mutators /// @{ @@ -395,6 +407,14 @@ class Module { /// Remove the given NamedMDNode from this module and delete it. void eraseNamedMetadata(NamedMDNode *NMD); +/// @} +/// @name Comdat Accessors +/// @{ + + /// Return the Comdat in the module with the specified name. It is created + /// if it didn't already exist. + Comdat *getOrInsertComdat(StringRef Name); + /// @} /// @name Module Flags Accessors /// @{ @@ -458,7 +478,7 @@ class Module { /// Make sure all GlobalValues in this Module are fully read and clear the /// Materializer.
If the module is corrupt, this DOES NOT clear the old /// Materializer. - std::error_code materializeAllPermanently(); + std::error_code materializeAllPermanently(bool ReleaseBuffer = false); /// @} /// @name Direct access to the globals list, functions list, and symbol table @@ -496,6 +516,10 @@ class Module { const ValueSymbolTable &getValueSymbolTable() const { return *ValSymTab; } /// Get the Module's symbol table of global variable and function identifiers. ValueSymbolTable &getValueSymbolTable() { return *ValSymTab; } + /// Get the Module's symbol table for COMDATs (constant). + const ComdatSymTabType &getComdatSymbolTable() const { return ComdatSymTab; } + /// Get the Module's symbol table for COMDATs. + ComdatSymTabType &getComdatSymbolTable() { return ComdatSymTab; } /// @} /// @name Global Variable Iteration diff --git a/include/llvm/IR/Value.h b/include/llvm/IR/Value.h index aae39ccbbfb1..b5bbc96eac2a 100644 --- a/include/llvm/IR/Value.h +++ b/include/llvm/IR/Value.h @@ -430,7 +430,7 @@ class Value { /// isDereferenceablePointer - Test if this value is always a pointer to /// allocated and suitably aligned memory for a simple load or store. - bool isDereferenceablePointer() const; + bool isDereferenceablePointer(const DataLayout *DL = nullptr) const; /// DoPHITranslation - If this value is a PHI node with CurBB as its parent, /// return the value in the PHI node corresponding to PredBB. If not, return diff --git a/include/llvm/IR/ValueMap.h b/include/llvm/IR/ValueMap.h index f196f334b605..43a79c7db2b9 100644 --- a/include/llvm/IR/ValueMap.h +++ b/include/llvm/IR/ValueMap.h @@ -87,6 +87,7 @@ class ValueMap { typedef KeyT key_type; typedef ValueT mapped_type; typedef std::pair<KeyT, ValueT> value_type; + typedef unsigned size_type; explicit ValueMap(unsigned NumInitBuckets = 64) : Map(NumInitBuckets), Data() {} @@ -103,16 +104,16 @@ class ValueMap { inline const_iterator end() const { return const_iterator(Map.end()); } bool empty() const { return Map.empty(); } - unsigned size() const { return Map.size(); } + size_type size() const { return Map.size(); } /// Grow the map so that it has at least Size buckets. Does not shrink void resize(size_t Size) { Map.resize(Size); } void clear() { Map.clear(); } - /// count - Return true if the specified key is in the map. - bool count(const KeyT &Val) const { - return Map.find_as(Val) != Map.end(); + /// Return 1 if the specified key is in the map, 0 otherwise. + size_type count(const KeyT &Val) const { + return Map.find_as(Val) == Map.end() ? 0 : 1; } iterator find(const KeyT &Val) { diff --git a/include/llvm/IRReader/IRReader.h b/include/llvm/IRReader/IRReader.h index e2ae5f7164b2..59ffc095f47f 100644 --- a/include/llvm/IRReader/IRReader.h +++ b/include/llvm/IRReader/IRReader.h @@ -24,13 +24,6 @@ class MemoryBuffer; class SMDiagnostic; class LLVMContext; -/// If the given MemoryBuffer holds a bitcode image, return a Module for it -/// which does lazy deserialization of function bodies. Otherwise, attempt to -/// parse it as LLVM Assembly and return a fully populated Module. This -/// function *always* takes ownership of the given MemoryBuffer. -Module *getLazyIRModule(MemoryBuffer *Buffer, SMDiagnostic &Err, - LLVMContext &Context); - /// If the given file holds a bitcode image, return a Module /// for it which does lazy deserialization of function bodies.
Otherwise, /// attempt to parse it as LLVM Assembly and return a fully populated @@ -40,8 +33,7 @@ Module *getLazyIRFileModule(const std::string &Filename, SMDiagnostic &Err, /// If the given MemoryBuffer holds a bitcode image, return a Module /// for it. Otherwise, attempt to parse it as LLVM Assembly and return -/// a Module for it. This function *always* takes ownership of the given -/// MemoryBuffer. +/// a Module for it. This function *never* takes ownership of Buffer. Module *ParseIR(MemoryBuffer *Buffer, SMDiagnostic &Err, LLVMContext &Context); /// If the given file holds a bitcode image, return a Module for it. diff --git a/include/llvm/InitializePasses.h b/include/llvm/InitializePasses.h index 0c840f39f522..6171d7c8c41e 100644 --- a/include/llvm/InitializePasses.h +++ b/include/llvm/InitializePasses.h @@ -184,10 +184,12 @@ void initializeMachineBlockPlacementStatsPass(PassRegistry&); void initializeMachineBranchProbabilityInfoPass(PassRegistry&); void initializeMachineCSEPass(PassRegistry&); void initializeMachineDominatorTreePass(PassRegistry&); +void initializeMachineDominanceFrontierPass(PassRegistry&); void initializeMachinePostDominatorTreePass(PassRegistry&); void initializeMachineLICMPass(PassRegistry&); void initializeMachineLoopInfoPass(PassRegistry&); void initializeMachineModuleInfoPass(PassRegistry&); +void initializeMachineRegionInfoPassPass(PassRegistry&); void initializeMachineSchedulerPass(PassRegistry&); void initializeMachineSinkingPass(PassRegistry&); void initializeMachineTraceMetricsPass(PassRegistry&); @@ -195,6 +197,7 @@ void initializeMachineVerifierPassPass(PassRegistry&); void initializeMemCpyOptPass(PassRegistry&); void initializeMemDepPrinterPass(PassRegistry&); void initializeMemoryDependenceAnalysisPass(PassRegistry&); +void initializeMergedLoadStoreMotionPass(PassRegistry &); void initializeMetaRenamerPass(PassRegistry&); void initializeMergeFunctionsPass(PassRegistry&); void initializeModuleDebugInfoPrinterPass(PassRegistry&); @@ -225,7 +228,7 @@ void initializePromotePassPass(PassRegistry&); void initializePruneEHPass(PassRegistry&); void initializeReassociatePass(PassRegistry&); void initializeRegToMemPass(PassRegistry&); -void initializeRegionInfoPass(PassRegistry&); +void initializeRegionInfoPassPass(PassRegistry&); void initializeRegionOnlyPrinterPass(PassRegistry&); void initializeRegionOnlyViewerPass(PassRegistry&); void initializeRegionPrinterPass(PassRegistry&); @@ -275,6 +278,9 @@ void initializeBBVectorizePass(PassRegistry&); void initializeMachineFunctionPrinterPassPass(PassRegistry&); void initializeStackMapLivenessPass(PassRegistry&); void initializeLoadCombinePass(PassRegistry&); + +// Specific to the rust-lang llvm branch: +void initializeNullCheckEliminationPass(PassRegistry&); } #endif diff --git a/include/llvm/LTO/LTOModule.h b/include/llvm/LTO/LTOModule.h index f1b1480ad664..c43846a64050 100644 --- a/include/llvm/LTO/LTOModule.h +++ b/include/llvm/LTO/LTOModule.h @@ -16,10 +16,10 @@ #include "llvm-c/lto.h" #include "llvm/ADT/StringMap.h" -#include "llvm/IR/Mangler.h" #include "llvm/IR/Module.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCObjectFileInfo.h" +#include "llvm/Object/IRObjectFile.h" #include "llvm/Target/TargetMachine.h" #include <string> #include <vector> @@ -46,9 +46,8 @@ struct LTOModule { const GlobalValue *symbol; }; - std::unique_ptr<Module> _module; + std::unique_ptr<object::IRObjectFile> IRFile; std::unique_ptr<TargetMachine> _target; - MCObjectFileInfo ObjFileInfo; StringSet _linkeropt_strings; std::vector<const char*> _deplibs; std::vector<const char*> _linkeropts; @@ -58,25 +57,22 @@
struct LTOModule { StringSet _defines; StringMap _undefines; std::vector _asm_undefines; - MCContext _context; - // Use mangler to add GlobalPrefix to names to match linker names. - Mangler _mangler; - - LTOModule(Module *m, TargetMachine *t); + LTOModule(std::unique_ptr Obj, TargetMachine *TM); public: /// Returns 'true' if the file or memory contents is LLVM bitcode. static bool isBitcodeFile(const void *mem, size_t length); static bool isBitcodeFile(const char *path); - /// Returns 'true' if the file or memory contents is LLVM bitcode for the - /// specified triple. - static bool isBitcodeFileForTarget(const void *mem, - size_t length, - const char *triplePrefix); - static bool isBitcodeFileForTarget(const char *path, - const char *triplePrefix); + /// Returns 'true' if the memory buffer is LLVM bitcode for the specified + /// triple. + static bool isBitcodeForTarget(MemoryBuffer *memBuffer, + StringRef triplePrefix); + + /// Create a MemoryBuffer from a memory range with an optional name. + static MemoryBuffer *makeBuffer(const void *mem, size_t length, + StringRef name = ""); /// Create an LTOModule. N.B. These methods take ownership of the buffer. The /// caller must have initialized the Targets, the TargetMCs, the AsmPrinters, @@ -86,25 +82,34 @@ struct LTOModule { /// InitializeAllTargetMCs(); /// InitializeAllAsmPrinters(); /// InitializeAllAsmParsers(); - static LTOModule *makeLTOModule(const char *path, TargetOptions options, - std::string &errMsg); - static LTOModule *makeLTOModule(int fd, const char *path, size_t size, - TargetOptions options, std::string &errMsg); - static LTOModule *makeLTOModule(int fd, const char *path, size_t map_size, - off_t offset, TargetOptions options, - std::string &errMsg); - static LTOModule *makeLTOModule(const void *mem, size_t length, - TargetOptions options, std::string &errMsg, - StringRef path = ""); + static LTOModule *createFromFile(const char *path, TargetOptions options, + std::string &errMsg); + static LTOModule *createFromOpenFile(int fd, const char *path, size_t size, + TargetOptions options, + std::string &errMsg); + static LTOModule *createFromOpenFileSlice(int fd, const char *path, + size_t map_size, off_t offset, + TargetOptions options, + std::string &errMsg); + static LTOModule *createFromBuffer(const void *mem, size_t length, + TargetOptions options, std::string &errMsg, + StringRef path = ""); + + const Module &getModule() const { + return const_cast(this)->getModule(); + } + Module &getModule() { + return IRFile->getModule(); + } /// Return the Module's target triple. - const char *getTargetTriple() { - return _module->getTargetTriple().c_str(); + const std::string &getTargetTriple() { + return getModule().getTargetTriple(); } /// Set the Module's target triple. - void setTargetTriple(const char *triple) { - _module->setTargetTriple(triple); + void setTargetTriple(StringRef Triple) { + getModule().setTargetTriple(Triple); } /// Get the number of symbols @@ -150,9 +155,6 @@ struct LTOModule { return nullptr; } - /// Return the Module. - Module *getLLVVMModule() { return _module.get(); } - const std::vector &getAsmUndefinedRefs() { return _asm_undefines; } @@ -167,20 +169,20 @@ struct LTOModule { bool parseSymbols(std::string &errMsg); /// Add a symbol which isn't defined just yet to a list to be resolved later. - void addPotentialUndefinedSymbol(const GlobalValue *dcl, bool isFunc); + void addPotentialUndefinedSymbol(const object::BasicSymbolRef &Sym, + bool isFunc); /// Add a defined symbol to the list. 
- void addDefinedSymbol(const GlobalValue *def, bool isFunction); - - /// Add a function symbol as defined to the list. - void addDefinedFunctionSymbol(const Function *f); + void addDefinedSymbol(const char *Name, const GlobalValue *def, + bool isFunction); /// Add a data symbol as defined to the list. - void addDefinedDataSymbol(const GlobalValue *v); + void addDefinedDataSymbol(const object::BasicSymbolRef &Sym); + void addDefinedDataSymbol(const char*Name, const GlobalValue *v); - /// Add global symbols from module-level ASM to the defined or undefined - /// lists. - bool addAsmGlobalSymbols(std::string &errMsg); + /// Add a function symbol as defined to the list. + void addDefinedFunctionSymbol(const object::BasicSymbolRef &Sym); + void addDefinedFunctionSymbol(const char *Name, const Function *F); /// Add a global symbol from module-level ASM to the defined list. void addAsmGlobalSymbol(const char *, lto_symbol_attributes scope); @@ -200,17 +202,10 @@ struct LTOModule { /// Get string that the data pointer points to. bool objcClassNameFromExpression(const Constant *c, std::string &name); - /// Returns 'true' if the memory buffer is for the specified target triple. - static bool isTargetMatch(MemoryBuffer *memBuffer, const char *triplePrefix); - /// Create an LTOModule (private version). N.B. This method takes ownership of /// the buffer. - static LTOModule *makeLTOModule(MemoryBuffer *buffer, TargetOptions options, - std::string &errMsg); - - /// Create a MemoryBuffer from a memory range with an optional name. - static MemoryBuffer *makeBuffer(const void *mem, size_t length, - StringRef name = ""); + static LTOModule *makeLTOModule(std::unique_ptr Buffer, + TargetOptions options, std::string &errMsg); }; } #endif // LTO_MODULE_H diff --git a/include/llvm/LinkAllPasses.h b/include/llvm/LinkAllPasses.h index b2309ffc2140..ddda0b76edff 100644 --- a/include/llvm/LinkAllPasses.h +++ b/include/llvm/LinkAllPasses.h @@ -134,6 +134,7 @@ namespace { (void) llvm::createConstantHoistingPass(); (void) llvm::createCodeGenPreparePass(); (void) llvm::createEarlyCSEPass(); + (void)llvm::createMergedLoadStoreMotionPass(); (void) llvm::createGVNPass(); (void) llvm::createMemCpyOptPass(); (void) llvm::createLoopDeletionPass(); @@ -160,6 +161,9 @@ namespace { (void) llvm::createScalarizerPass(); (void) llvm::createSeparateConstOffsetFromGEPPass(); + // Specific to the rust-lang llvm branch: + (void) llvm::createNullCheckEliminationPass(); + (void)new llvm::IntervalPartition(); (void)new llvm::FindUsedTypes(); (void)new llvm::ScalarEvolution(); diff --git a/include/llvm/Linker/Linker.h b/include/llvm/Linker/Linker.h index 42b2cb37b309..6254bbb6d6d5 100644 --- a/include/llvm/Linker/Linker.h +++ b/include/llvm/Linker/Linker.h @@ -15,6 +15,8 @@ namespace llvm { +class Comdat; +class GlobalValue; class Module; class StringRef; class StructType; diff --git a/include/llvm/MC/ConstantPools.h b/include/llvm/MC/ConstantPools.h index 2819b757b8bf..2e76074db774 100644 --- a/include/llvm/MC/ConstantPools.h +++ b/include/llvm/MC/ConstantPools.h @@ -22,10 +22,19 @@ class MCExpr; class MCSection; class MCStreamer; class MCSymbol; + +struct ConstantPoolEntry { + ConstantPoolEntry(MCSymbol *L, const MCExpr *Val, unsigned Sz) + : Label(L), Value(Val), Size(Sz) {} + MCSymbol *Label; + const MCExpr *Value; + unsigned Size; +}; + // A class to keep track of assembler-generated constant pools that are use to // implement the ldr-pseudo. 
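[Editor's note] A sketch of a call site updated for the renamed LTOModule factory functions above (loadForLTO is a hypothetical wrapper; the signatures come from the declarations in this patch):

    #include "llvm/LTO/LTOModule.h"
    #include "llvm/Target/TargetOptions.h"
    #include <string>

    llvm::LTOModule *loadForLTO(const char *Path, std::string &ErrMsg) {
      llvm::TargetOptions Opts;
      // makeLTOModule(path, ...) is now createFromFile(path, ...).
      llvm::LTOModule *M = llvm::LTOModule::createFromFile(Path, Opts, ErrMsg);
      if (M)
        // getModule() supersedes the removed getLLVVMModule() accessor.
        (void)M->getModule().getTargetTriple();
      return M;
    }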
class ConstantPool { - typedef SmallVector, 4> EntryVecTy; + typedef SmallVector EntryVecTy; EntryVecTy Entries; public: @@ -34,9 +43,11 @@ class ConstantPool { // Add a new entry to the constant pool in the next slot. // \param Value is the new entry to put in the constant pool. + // \param Size is the size in bytes of the entry // // \returns a MCExpr that references the newly inserted value - const MCExpr *addEntry(const MCExpr *Value, MCContext &Context); + const MCExpr *addEntry(const MCExpr *Value, MCContext &Context, + unsigned Size); // Emit the contents of the constant pool using the provided streamer. void emitEntries(MCStreamer &Streamer); @@ -69,7 +80,8 @@ class AssemblerConstantPools { void emitAll(MCStreamer &Streamer); void emitForCurrentSection(MCStreamer &Streamer); - const MCExpr *addEntry(MCStreamer &Streamer, const MCExpr *Expr); + const MCExpr *addEntry(MCStreamer &Streamer, const MCExpr *Expr, + unsigned Size); private: ConstantPool *getConstantPool(const MCSection *Section); diff --git a/include/llvm/MC/MCAtom.h b/include/llvm/MC/MCAnalysis/MCAtom.h similarity index 97% rename from include/llvm/MC/MCAtom.h rename to include/llvm/MC/MCAnalysis/MCAtom.h index e9d0fbacc1e2..33f3431a59a9 100644 --- a/include/llvm/MC/MCAtom.h +++ b/include/llvm/MC/MCAnalysis/MCAtom.h @@ -1,4 +1,4 @@ -//===-- llvm/MC/MCAtom.h ----------------------------------------*- C++ -*-===// +//===-- MCAtom.h ------------------------------------------------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -13,8 +13,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_MC_MCATOM_H -#define LLVM_MC_MCATOM_H +#ifndef LLVM_MC_MCANALYSIS_MCATOM_H +#define LLVM_MC_MCANALYSIS_MCATOM_H #include "llvm/ADT/ArrayRef.h" #include "llvm/MC/MCInst.h" diff --git a/include/llvm/MC/MCFunction.h b/include/llvm/MC/MCAnalysis/MCFunction.h similarity index 97% rename from include/llvm/MC/MCFunction.h rename to include/llvm/MC/MCAnalysis/MCFunction.h index bfa470b5f423..44fa4503b8e0 100644 --- a/include/llvm/MC/MCFunction.h +++ b/include/llvm/MC/MCAnalysis/MCFunction.h @@ -1,4 +1,4 @@ -//===-- llvm/MC/MCFunction.h ------------------------------------*- C++ -*-===// +//===-- MCFunction.h --------------------------------------------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_MC_MCFUNCTION_H -#define LLVM_MC_MCFUNCTION_H +#ifndef LLVM_MC_MCANALYSIS_MCFUNCTION_H +#define LLVM_MC_MCANALYSIS_MCFUNCTION_H #include "llvm/ADT/StringRef.h" #include "llvm/MC/MCInst.h" diff --git a/include/llvm/MC/MCModule.h b/include/llvm/MC/MCAnalysis/MCModule.h similarity index 96% rename from include/llvm/MC/MCModule.h rename to include/llvm/MC/MCAnalysis/MCModule.h index aa389cbb0b2b..cf7e2c0a645e 100644 --- a/include/llvm/MC/MCModule.h +++ b/include/llvm/MC/MCAnalysis/MCModule.h @@ -1,4 +1,4 @@ -//===-- llvm/MC/MCModule.h - MCModule class ---------------------*- C++ -*-===// +//===-- MCModule.h - MCModule class -----------------------------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_MC_MCMODULE_H -#define LLVM_MC_MCMODULE_H +#ifndef LLVM_MC_MCANALYSIS_MCMODULE_H +#define LLVM_MC_MCANALYSIS_MCMODULE_H #include "llvm/ADT/StringRef.h" #include "llvm/Support/Compiler.h" diff --git a/include/llvm/MC/MCModuleYAML.h 
b/include/llvm/MC/MCAnalysis/MCModuleYAML.h similarity index 90% rename from include/llvm/MC/MCModuleYAML.h rename to include/llvm/MC/MCAnalysis/MCModuleYAML.h index c4ae829535c2..48562777677b 100644 --- a/include/llvm/MC/MCModuleYAML.h +++ b/include/llvm/MC/MCAnalysis/MCModuleYAML.h @@ -13,11 +13,11 @@ /// //===----------------------------------------------------------------------===// -#ifndef LLVM_MC_MCMODULEYAML_H -#define LLVM_MC_MCMODULEYAML_H +#ifndef LLVM_MC_MCANALYSIS_MCMODULEYAML_H +#define LLVM_MC_MCANALYSIS_MCMODULEYAML_H #include "llvm/ADT/StringRef.h" -#include "llvm/MC/MCModule.h" +#include "llvm/MC/MCAnalysis/MCModule.h" #include "llvm/Support/raw_ostream.h" namespace llvm { diff --git a/include/llvm/MC/MCAsmInfo.h b/include/llvm/MC/MCAsmInfo.h index 55dc40afe5ab..06e473d6b625 100644 --- a/include/llvm/MC/MCAsmInfo.h +++ b/include/llvm/MC/MCAsmInfo.h @@ -41,9 +41,13 @@ enum class EncodingType { }; } -namespace ExceptionHandling { -enum ExceptionsType { None, DwarfCFI, SjLj, ARM, Win64 }; -} +enum class ExceptionHandling { + None, /// No exception support + DwarfCFI, /// DWARF-like instruction based exceptions + SjLj, /// setjmp/longjmp based exceptions + ARM, /// ARM EHABI + WinEH, /// Windows Exception Handling +}; namespace LCOMM { enum LCOMMType { NoAlignment, ByteAlignment, Log2Alignment }; @@ -116,8 +120,8 @@ class MCAsmInfo { /// This is appended to emitted labels. Defaults to ":" const char *LabelSuffix; - /// This is appended to emitted labels. Defaults to ":" - const char *DebugLabelSuffix; + // Print the EH begin symbol with an assignment. Defaults to false. + bool UseAssignmentForEHBegin; /// This prefix is used for globals like constant pool entries that are /// completely private to the .s file and should not have names in the .o @@ -299,7 +303,7 @@ class MCAsmInfo { bool SupportsDebugInformation; /// Exception handling format for the target. Defaults to None. - ExceptionHandling::ExceptionsType ExceptionsType; + ExceptionHandling ExceptionsType; /// Windows exception handling data (.pdata) encoding. Defaults to Invalid. WinEH::EncodingType WinEHEncodingType; @@ -415,7 +419,7 @@ class MCAsmInfo { const char *getCommentString() const { return CommentString; } const char *getLabelSuffix() const { return LabelSuffix; } - const char *getDebugLabelSuffix() const { return DebugLabelSuffix; } + bool useAssignmentForEHBegin() const { return UseAssignmentForEHBegin; } const char *getPrivateGlobalPrefix() const { return PrivateGlobalPrefix; } bool hasLinkerPrivateGlobalPrefix() const { return LinkerPrivateGlobalPrefix[0] != '\0'; @@ -472,16 +476,13 @@ class MCAsmInfo { bool doesSupportExceptionHandling() const { return ExceptionsType != ExceptionHandling::None; } - ExceptionHandling::ExceptionsType getExceptionHandlingType() const { - return ExceptionsType; - } - WinEH::EncodingType getWinEHEncodingType() const { - return WinEHEncodingType; - } + ExceptionHandling getExceptionHandlingType() const { return ExceptionsType; } + WinEH::EncodingType getWinEHEncodingType() const { return WinEHEncodingType; } bool isExceptionHandlingDwarf() const { return (ExceptionsType == ExceptionHandling::DwarfCFI || ExceptionsType == ExceptionHandling::ARM || - ExceptionsType == ExceptionHandling::Win64); + // Windows handler data still uses DWARF LSDA encoding. 
+ ExceptionsType == ExceptionHandling::WinEH); } bool doesDwarfUseRelocationsAcrossSections() const { return DwarfUsesRelocationsAcrossSections; diff --git a/include/llvm/MC/MCAssembler.h b/include/llvm/MC/MCAssembler.h index be13b3681ee2..1cb34c2fe33b 100644 --- a/include/llvm/MC/MCAssembler.h +++ b/include/llvm/MC/MCAssembler.h @@ -802,7 +802,7 @@ class MCSymbolData : public ilist_node { /// @} - void dump(); + void dump() const; }; // FIXME: This really doesn't belong here. See comments below. diff --git a/include/llvm/MC/MCContext.h b/include/llvm/MC/MCContext.h index 2f9b32b984ec..eb0340f7421a 100644 --- a/include/llvm/MC/MCContext.h +++ b/include/llvm/MC/MCContext.h @@ -11,15 +11,18 @@ #define LLVM_MC_MCCONTEXT_H #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringMap.h" #include "llvm/MC/MCDwarf.h" +#include "llvm/MC/MCStreamer.h" #include "llvm/MC/SectionKind.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/raw_ostream.h" #include +#include #include // FIXME: Shouldn't be needed. namespace llvm { @@ -129,11 +132,10 @@ namespace llvm { /// assembly source files. unsigned GenDwarfFileNumber; - /// The default initial text section that we generate dwarf debugging line - /// info for when generating dwarf assembly source files. - const MCSection *GenDwarfSection; - /// Symbols created for the start and end of this section. - MCSymbol *GenDwarfSectionStartSym, *GenDwarfSectionEndSym; + /// Symbols created for the start and end of each section, used for + /// generating the .debug_ranges and .debug_aranges sections. + MapVector > + SectionStartEndSyms; /// The information gathered from labels that will have dwarf label /// entries when generating dwarf assembly source files. 
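[Editor's note] Since ExceptionHandling is now a scoped enum (and Win64 has become WinEH), consumers must qualify every enumerator; a minimal sketch of an updated switch (usesWindowsUnwindInfo is a hypothetical helper):

    #include "llvm/MC/MCAsmInfo.h"
    #include "llvm/Support/ErrorHandling.h"

    static bool usesWindowsUnwindInfo(const llvm::MCAsmInfo &MAI) {
      // With enum class, the values no longer leak out of their scope, so the
      // old unqualified spellings (None, DwarfCFI, ...) stop compiling.
      switch (MAI.getExceptionHandlingType()) {
      case llvm::ExceptionHandling::WinEH:
        return true;
      case llvm::ExceptionHandling::None:
      case llvm::ExceptionHandling::DwarfCFI:
      case llvm::ExceptionHandling::SjLj:
      case llvm::ExceptionHandling::ARM:
        return false;
      }
      llvm_unreachable("unknown exception handling type");
    }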
@@ -159,10 +161,11 @@ namespace llvm { unsigned DwarfCompileUnitID; typedef std::pair SectionGroupPair; + typedef std::tuple SectionGroupTriple; StringMap MachOUniquingMap; std::map ELFUniquingMap; - std::map COFFUniquingMap; + std::map COFFUniquingMap; /// Do automatic reset in destructor bool AutoReset; @@ -374,16 +377,18 @@ namespace llvm { void setGenDwarfFileNumber(unsigned FileNumber) { GenDwarfFileNumber = FileNumber; } - const MCSection *getGenDwarfSection() { return GenDwarfSection; } - void setGenDwarfSection(const MCSection *Sec) { GenDwarfSection = Sec; } - MCSymbol *getGenDwarfSectionStartSym() { return GenDwarfSectionStartSym; } - void setGenDwarfSectionStartSym(MCSymbol *Sym) { - GenDwarfSectionStartSym = Sym; + MapVector > & + getGenDwarfSectionSyms() { + return SectionStartEndSyms; } - MCSymbol *getGenDwarfSectionEndSym() { return GenDwarfSectionEndSym; } - void setGenDwarfSectionEndSym(MCSymbol *Sym) { - GenDwarfSectionEndSym = Sym; + std::pair >::iterator, + bool> + addGenDwarfSection(const MCSection *Sec) { + return SectionStartEndSyms.insert( + std::make_pair(Sec, std::make_pair(nullptr, nullptr))); } + void finalizeDwarfSections(MCStreamer &MCOS); const std::vector &getMCGenDwarfLabelEntries() const { return MCGenDwarfLabelEntries; } diff --git a/include/llvm/MC/MCDwarf.h b/include/llvm/MC/MCDwarf.h index 6df8a1985fd6..6cd9a9a21e21 100644 --- a/include/llvm/MC/MCDwarf.h +++ b/include/llvm/MC/MCDwarf.h @@ -465,14 +465,13 @@ class MCCFIInstruction { struct MCDwarfFrameInfo { MCDwarfFrameInfo() - : Begin(nullptr), End(nullptr), Personality(nullptr), Lsda(nullptr), - Function(nullptr), Instructions(), PersonalityEncoding(), LsdaEncoding(0), - CompactUnwindEncoding(0), IsSignalFrame(false), IsSimple(false) {} + : Begin(nullptr), End(nullptr), Personality(nullptr), Lsda(nullptr), + Instructions(), PersonalityEncoding(), LsdaEncoding(0), + CompactUnwindEncoding(0), IsSignalFrame(false), IsSimple(false) {} MCSymbol *Begin; MCSymbol *End; const MCSymbol *Personality; const MCSymbol *Lsda; - const MCSymbol *Function; std::vector Instructions; unsigned PersonalityEncoding; unsigned LsdaEncoding; diff --git a/include/llvm/MC/MCELF.h b/include/llvm/MC/MCELF.h index 7e59911a89c3..294a51bf7c76 100644 --- a/include/llvm/MC/MCELF.h +++ b/include/llvm/MC/MCELF.h @@ -27,9 +27,9 @@ class MCELF { static void SetType(MCSymbolData &SD, unsigned Type); static unsigned GetType(const MCSymbolData &SD); static void SetVisibility(MCSymbolData &SD, unsigned Visibility); - static unsigned GetVisibility(MCSymbolData &SD); + static unsigned GetVisibility(const MCSymbolData &SD); static void setOther(MCSymbolData &SD, unsigned Other); - static unsigned getOther(MCSymbolData &SD); + static unsigned getOther(const MCSymbolData &SD); }; } diff --git a/include/llvm/MC/MCELFObjectWriter.h b/include/llvm/MC/MCELFObjectWriter.h index 127f1624878b..421e7a0b2c19 100644 --- a/include/llvm/MC/MCELFObjectWriter.h +++ b/include/llvm/MC/MCELFObjectWriter.h @@ -22,6 +22,7 @@ class MCFragment; class MCObjectWriter; class MCSectionData; class MCSymbol; +class MCSymbolData; class MCValue; class MCELFObjectTargetWriter { @@ -54,7 +55,8 @@ class MCELFObjectTargetWriter { virtual unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup, bool IsPCRel) const = 0; - virtual bool needsRelocateWithSymbol(unsigned Type) const; + virtual bool needsRelocateWithSymbol(const MCSymbolData &SD, + unsigned Type) const; /// @name Accessors /// @{ diff --git a/include/llvm/MC/MCELFStreamer.h b/include/llvm/MC/MCELFStreamer.h 
index be39128dbda1..66729fe0147e 100644 --- a/include/llvm/MC/MCELFStreamer.h +++ b/include/llvm/MC/MCELFStreamer.h @@ -48,7 +48,6 @@ class MCELFStreamer : public MCObjectStreamer { void ChangeSection(const MCSection *Section, const MCExpr *Subsection) override; void EmitLabel(MCSymbol *Symbol) override; - void EmitDebugLabel(MCSymbol *Symbol) override; void EmitAssemblerFlag(MCAssemblerFlag Flag) override; void EmitThumbFunc(MCSymbol *Func) override; void EmitWeakReference(MCSymbol *Alias, const MCSymbol *Symbol) override; diff --git a/include/llvm/MC/MCELFSymbolFlags.h b/include/llvm/MC/MCELFSymbolFlags.h index 2f1f5612212b..297c44269a8f 100644 --- a/include/llvm/MC/MCELFSymbolFlags.h +++ b/include/llvm/MC/MCELFSymbolFlags.h @@ -41,6 +41,7 @@ namespace llvm { ELF_STT_File = (ELF::STT_FILE << ELF_STT_Shift), ELF_STT_Common = (ELF::STT_COMMON << ELF_STT_Shift), ELF_STT_Tls = (ELF::STT_TLS << ELF_STT_Shift), + ELF_STT_GnuIFunc = (ELF::STT_GNU_IFUNC << ELF_STT_Shift), ELF_STT_Loproc = (ELF::STT_LOPROC << ELF_STT_Shift), ELF_STT_Hiproc = (ELF::STT_HIPROC << ELF_STT_Shift), diff --git a/include/llvm/MC/MCExpr.h b/include/llvm/MC/MCExpr.h index ca5cecbef0a2..e96ecb4be175 100644 --- a/include/llvm/MC/MCExpr.h +++ b/include/llvm/MC/MCExpr.h @@ -21,6 +21,7 @@ class MCAssembler; class MCContext; class MCSection; class MCSectionData; +class MCStreamer; class MCSymbol; class MCValue; class raw_ostream; @@ -524,7 +525,7 @@ class MCTargetExpr : public MCExpr { virtual void PrintImpl(raw_ostream &OS) const = 0; virtual bool EvaluateAsRelocatableImpl(MCValue &Res, const MCAsmLayout *Layout) const = 0; - virtual void AddValueSymbols(MCAssembler *) const = 0; + virtual void visitUsedExpr(MCStreamer& Streamer) const = 0; virtual const MCSection *FindAssociatedSection() const = 0; virtual void fixELFSymbolsInTLSFixups(MCAssembler &) const = 0; diff --git a/include/llvm/MC/MCObjectFileInfo.h b/include/llvm/MC/MCObjectFileInfo.h index 1a56040e4667..4d1715eccf5a 100644 --- a/include/llvm/MC/MCObjectFileInfo.h +++ b/include/llvm/MC/MCObjectFileInfo.h @@ -14,13 +14,13 @@ #ifndef LLVM_MC_MCBJECTFILEINFO_H #define LLVM_MC_MCBJECTFILEINFO_H +#include "llvm/ADT/Triple.h" #include "llvm/Support/CodeGen.h" namespace llvm { class MCContext; class MCSection; class StringRef; - class Triple; class MCObjectFileInfo { protected: @@ -33,12 +33,6 @@ class MCObjectFileInfo { /// weak_definition of constant 0 for an omitted EH frame. bool SupportsWeakOmittedEHFrame; - /// IsFunctionEHFrameSymbolPrivate - This flag is set to true if the - /// "EH_frame" symbol for EH information should be an assembler temporary (aka - /// private linkage, aka an L or .L label) or false if it should be a normal - /// non-.globl label. This defaults to true. - bool IsFunctionEHFrameSymbolPrivate; - /// SupportsCompactUnwindWithoutEHFrame - True if the target object file /// supports emitting a compact unwind section without an associated EH frame /// section. 
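[Editor's note] The single GenDwarfSection in MCContext is replaced by a per-section map of start/end symbols. A sketch of how debug-info generation might register a section under the new API (trackDebugLineSection is hypothetical; the call is taken from the declaration in this patch):

    #include "llvm/MC/MCContext.h"
    #include "llvm/MC/MCSection.h"

    static void trackDebugLineSection(llvm::MCContext &Ctx,
                                      const llvm::MCSection *Sec) {
      // Returns {iterator, inserted}: the start/end symbols are filled in
      // lazily, so a fresh entry begins life as {nullptr, nullptr}.
      Ctx.addGenDwarfSection(Sec);
    }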
@@ -201,9 +195,6 @@ class MCObjectFileInfo { void InitMCObjectFileInfo(StringRef TT, Reloc::Model RM, CodeModel::Model CM, MCContext &ctx); - bool isFunctionEHFrameSymbolPrivate() const { - return IsFunctionEHFrameSymbolPrivate; - } bool getSupportsWeakOmittedEHFrame() const { return SupportsWeakOmittedEHFrame; } @@ -380,6 +371,7 @@ class MCObjectFileInfo { Reloc::Model RelocM; CodeModel::Model CMModel; MCContext *Ctx; + Triple TT; void InitMachOMCObjectFileInfo(Triple T); void InitELFMCObjectFileInfo(Triple T); @@ -388,6 +380,9 @@ class MCObjectFileInfo { /// InitEHFrameSection - Initialize EHFrameSection on demand. /// void InitEHFrameSection(); + +public: + const Triple &getTargetTriple() const { return TT; } }; } // end namespace llvm diff --git a/include/llvm/MC/MCObjectStreamer.h b/include/llvm/MC/MCObjectStreamer.h index e41a8ba63725..8d37c85b0585 100644 --- a/include/llvm/MC/MCObjectStreamer.h +++ b/include/llvm/MC/MCObjectStreamer.h @@ -78,16 +78,15 @@ class MCObjectStreamer : public MCStreamer { /// fragment is not a data fragment. MCDataFragment *getOrCreateDataFragment() const; - const MCExpr *AddValueSymbols(const MCExpr *Value); - public: + void visitUsedSymbol(const MCSymbol &Sym) override; + MCAssembler &getAssembler() { return *Assembler; } /// @name MCStreamer Interface /// @{ void EmitLabel(MCSymbol *Symbol) override; - void EmitDebugLabel(MCSymbol *Symbol) override; void EmitAssignment(MCSymbol *Symbol, const MCExpr *Value) override; void EmitValueImpl(const MCExpr *Value, unsigned Size, const SMLoc &Loc = SMLoc()) override; @@ -126,6 +125,10 @@ class MCObjectStreamer : public MCStreamer { void EmitFill(uint64_t NumBytes, uint8_t FillValue) override; void EmitZeros(uint64_t NumBytes) override; void FinishImpl() override; + + virtual bool mayHaveInstructions() const { + return getCurrentSectionData()->hasInstructions(); + } }; } // end namespace llvm diff --git a/include/llvm/MC/MCParser/AsmLexer.h b/include/llvm/MC/MCParser/AsmLexer.h index 59b5c09b4028..0b550ba627e9 100644 --- a/include/llvm/MC/MCParser/AsmLexer.h +++ b/include/llvm/MC/MCParser/AsmLexer.h @@ -28,7 +28,7 @@ class AsmLexer : public MCAsmLexer { const MCAsmInfo &MAI; const char *CurPtr; - const MemoryBuffer *CurBuf; + StringRef CurBuf; bool isAtStartOfLine; void operator=(const AsmLexer&) LLVM_DELETED_FUNCTION; @@ -42,7 +42,7 @@ class AsmLexer : public MCAsmLexer { AsmLexer(const MCAsmInfo &MAI); ~AsmLexer(); - void setBuffer(const MemoryBuffer *buf, const char *ptr = nullptr); + void setBuffer(StringRef Buf, const char *ptr = nullptr); StringRef LexUntilEndOfStatement() override; StringRef LexUntilEndOfLine(); diff --git a/include/llvm/MC/MCSchedule.h b/include/llvm/MC/MCSchedule.h index 862a0fd7addc..43b8672265c6 100644 --- a/include/llvm/MC/MCSchedule.h +++ b/include/llvm/MC/MCSchedule.h @@ -186,6 +186,8 @@ class MCSchedModel { // takes to recover from a branch misprediction. 
unsigned MispredictPenalty; static const unsigned DefaultMispredictPenalty = 10; + + bool PostRAScheduler; // default value is false bool CompleteModel; @@ -210,7 +212,8 @@ class MCSchedModel { LoadLatency(DefaultLoadLatency), HighLatency(DefaultHighLatency), MispredictPenalty(DefaultMispredictPenalty), - CompleteModel(true), ProcID(0), ProcResourceTable(nullptr), + PostRAScheduler(false), CompleteModel(true), + ProcID(0), ProcResourceTable(nullptr), SchedClassTable(nullptr), NumProcResourceKinds(0), NumSchedClasses(0), InstrItineraries(nullptr) { (void)NumProcResourceKinds; @@ -219,12 +222,13 @@ class MCSchedModel { // Table-gen driven ctor. MCSchedModel(unsigned iw, int mbs, int lmbs, unsigned ll, unsigned hl, - unsigned mp, bool cm, unsigned pi, const MCProcResourceDesc *pr, - const MCSchedClassDesc *sc, unsigned npr, unsigned nsc, - const InstrItinerary *ii): + unsigned mp, bool postRASched, bool cm, unsigned pi, + const MCProcResourceDesc *pr, const MCSchedClassDesc *sc, + unsigned npr, unsigned nsc, const InstrItinerary *ii): IssueWidth(iw), MicroOpBufferSize(mbs), LoopMicroOpBufferSize(lmbs), LoadLatency(ll), HighLatency(hl), - MispredictPenalty(mp), CompleteModel(cm), ProcID(pi), + MispredictPenalty(mp), PostRAScheduler(postRASched), + CompleteModel(cm), ProcID(pi), ProcResourceTable(pr), SchedClassTable(sc), NumProcResourceKinds(npr), NumSchedClasses(nsc), InstrItineraries(ii) {} diff --git a/include/llvm/MC/MCSectionCOFF.h b/include/llvm/MC/MCSectionCOFF.h index d205e2aebfb0..0bbf3696686e 100644 --- a/include/llvm/MC/MCSectionCOFF.h +++ b/include/llvm/MC/MCSectionCOFF.h @@ -36,7 +36,7 @@ class MCSymbol; /// The COMDAT symbol of this section. Only valid if this is a COMDAT /// section. Two COMDAT sections are merged if they have the same /// COMDAT symbol. - const MCSymbol *COMDATSymbol; + MCSymbol *COMDATSymbol; /// Selection - This is the Selection field for the section symbol, if /// it is a COMDAT section (Characteristics & IMAGE_SCN_LNK_COMDAT) != 0 @@ -45,7 +45,7 @@ private: friend class MCContext; MCSectionCOFF(StringRef Section, unsigned Characteristics, - const MCSymbol *COMDATSymbol, int Selection, SectionKind K) + MCSymbol *COMDATSymbol, int Selection, SectionKind K) : MCSection(SV_COFF, K), SectionName(Section), Characteristics(Characteristics), COMDATSymbol(COMDATSymbol), Selection(Selection) { @@ -67,7 +67,7 @@ class MCSymbol; return SectionName.str() + "_end"; } unsigned getCharacteristics() const { return Characteristics; } - const MCSymbol *getCOMDATSymbol() const { return COMDATSymbol; } + MCSymbol *getCOMDATSymbol() const { return COMDATSymbol; } int getSelection() const { return Selection; } void setSelection(int Selection) const; diff --git a/include/llvm/MC/MCStreamer.h b/include/llvm/MC/MCStreamer.h index 2a8367afeb8f..63a43d08c3f2 100644 --- a/include/llvm/MC/MCStreamer.h +++ b/include/llvm/MC/MCStreamer.h @@ -86,6 +86,27 @@ class MCTargetStreamer { virtual void finish(); }; +class AArch64TargetStreamer : public MCTargetStreamer { +public: + AArch64TargetStreamer(MCStreamer &S); + ~AArch64TargetStreamer(); + + + void finish() override; + + /// Callback used to implement the ldr= pseudo. + /// Add a new entry to the constant pool for the current section and return an + /// MCExpr that can be used to refer to the constant pool location. + const MCExpr *addConstantPoolEntry(const MCExpr *, unsigned Size); + + /// Callback used to implement the .ltorg directive. + /// Emit contents of constant pool for the current section.
+ void emitCurrentConstantPool(); + +private: + std::unique_ptr ConstantPools; +}; + // FIXME: declared here because it is used from // lib/CodeGen/AsmPrinter/ARMException.cpp. class ARMTargetStreamer : public MCTargetStreamer { @@ -154,17 +175,15 @@ class MCStreamer { MCStreamer(const MCStreamer &) LLVM_DELETED_FUNCTION; MCStreamer &operator=(const MCStreamer &) LLVM_DELETED_FUNCTION; - std::vector FrameInfos; - MCDwarfFrameInfo *getCurrentFrameInfo(); - MCSymbol *EmitCFICommon(); - void EnsureValidFrame(); + std::vector DwarfFrameInfos; + MCDwarfFrameInfo *getCurrentDwarfFrameInfo(); + void EnsureValidDwarfFrame(); - std::vector W64UnwindInfos; - MCWin64EHUnwindInfo *CurrentW64UnwindInfo; - void setCurrentW64UnwindInfo(MCWin64EHUnwindInfo *Frame); - void EnsureValidW64UnwindInfo(); + MCSymbol *EmitCFICommon(); - MCSymbol *LastSymbol; + std::vector WinFrameInfos; + MCWinFrameInfo *CurrentWinFrameInfo; + void EnsureValidWinFrameInfo(); // SymbolOrdering - Tracks an index to represent the order // a symbol was emitted in. Zero means we did not emit that symbol. @@ -182,21 +201,23 @@ class MCStreamer { const MCExpr *ForceExpAbs(const MCExpr *Expr); - void RecordProcStart(MCDwarfFrameInfo &Frame); virtual void EmitCFIStartProcImpl(MCDwarfFrameInfo &Frame); - void RecordProcEnd(MCDwarfFrameInfo &Frame); virtual void EmitCFIEndProcImpl(MCDwarfFrameInfo &CurFrame); - MCWin64EHUnwindInfo *getCurrentW64UnwindInfo() { - return CurrentW64UnwindInfo; + MCWinFrameInfo *getCurrentWinFrameInfo() { + return CurrentWinFrameInfo; } - void EmitW64Tables(); + + void EmitWindowsUnwindTables(); virtual void EmitRawTextImpl(StringRef String); public: virtual ~MCStreamer(); + void visitUsedExpr(const MCExpr &Expr); + virtual void visitUsedSymbol(const MCSymbol &Sym); + void setTargetStreamer(MCTargetStreamer *TS) { TargetStreamer.reset(TS); } @@ -211,16 +232,14 @@ class MCStreamer { return TargetStreamer.get(); } - unsigned getNumFrameInfos() { return FrameInfos.size(); } - - const MCDwarfFrameInfo &getFrameInfo(unsigned i) { return FrameInfos[i]; } - - ArrayRef getFrameInfos() const { return FrameInfos; } - - unsigned getNumW64UnwindInfos() { return W64UnwindInfos.size(); } + unsigned getNumFrameInfos() { return DwarfFrameInfos.size(); } + ArrayRef getDwarfFrameInfos() const { + return DwarfFrameInfos; + } - MCWin64EHUnwindInfo &getW64UnwindInfo(unsigned i) { - return *W64UnwindInfos[i]; + unsigned getNumWinFrameInfos() { return WinFrameInfos.size(); } + ArrayRef getWinFrameInfos() const { + return WinFrameInfos; } void generateCompactUnwindEncodings(MCAsmBackend *MAB); @@ -294,7 +313,7 @@ class MCStreamer { /// /// This is called by PopSection and SwitchSection, if the current /// section changes. - virtual void ChangeSection(const MCSection *, const MCExpr *) = 0; + virtual void ChangeSection(const MCSection *, const MCExpr *); /// pushSection - Save the current and previous section on the /// section stack. @@ -374,12 +393,10 @@ class MCStreamer { // add the section we're emitting it to later. virtual void EmitLabel(MCSymbol *Symbol); - virtual void EmitDebugLabel(MCSymbol *Symbol); - virtual void EmitEHSymAttributes(const MCSymbol *Symbol, MCSymbol *EHSymbol); /// EmitAssemblerFlag - Note in the output the specified @p Flag. - virtual void EmitAssemblerFlag(MCAssemblerFlag Flag) = 0; + virtual void EmitAssemblerFlag(MCAssemblerFlag Flag); /// EmitLinkerOptions - Emit the given list @p Options of strings as linker /// options into the output. 
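[Editor's note] A sketch of how the ldr= pseudo can route through the new size-aware constant pool hook shown above (lowerLdrPseudo is hypothetical; in a real parser the entry size would come from the instruction being matched):

    #include "llvm/MC/MCExpr.h"
    #include "llvm/MC/MCStreamer.h"

    static const llvm::MCExpr *lowerLdrPseudo(llvm::AArch64TargetStreamer &TS,
                                              const llvm::MCExpr *Literal) {
      // An 8-byte slot for a 64-bit literal load; 4 would be used for 32-bit.
      return TS.addConstantPoolEntry(Literal, /*Size=*/8);
    }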
@@ -394,7 +411,7 @@ class MCStreamer { /// EmitThumbFunc - Note in the output that the specified @p Func is /// a Thumb mode function (ARM target only). - virtual void EmitThumbFunc(MCSymbol *Func) = 0; + virtual void EmitThumbFunc(MCSymbol *Func); /// EmitAssignment - Emit an assignment of @p Value to @p Symbol. /// @@ -416,7 +433,7 @@ class MCStreamer { /// /// @param Alias - The alias that is being created. /// @param Symbol - The symbol being aliased. - virtual void EmitWeakReference(MCSymbol *Alias, const MCSymbol *Symbol) = 0; + virtual void EmitWeakReference(MCSymbol *Alias, const MCSymbol *Symbol); /// EmitSymbolAttribute - Add the given @p Attribute to @p Symbol. virtual bool EmitSymbolAttribute(MCSymbol *Symbol, @@ -426,25 +443,25 @@ class MCStreamer { /// /// @param Symbol - The symbol to have its n_desc field set. /// @param DescValue - The value to set into the n_desc field. - virtual void EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) = 0; + virtual void EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue); /// BeginCOFFSymbolDef - Start emitting COFF symbol definition /// /// @param Symbol - The symbol to have its External & Type fields set. - virtual void BeginCOFFSymbolDef(const MCSymbol *Symbol) = 0; + virtual void BeginCOFFSymbolDef(const MCSymbol *Symbol); /// EmitCOFFSymbolStorageClass - Emit the storage class of the symbol. /// /// @param StorageClass - The storage class the symbol should have. - virtual void EmitCOFFSymbolStorageClass(int StorageClass) = 0; + virtual void EmitCOFFSymbolStorageClass(int StorageClass); /// EmitCOFFSymbolType - Emit the type of the symbol. /// /// @param Type - A COFF type identifier (see COFF::SymbolType in X86COFF.h) - virtual void EmitCOFFSymbolType(int Type) = 0; + virtual void EmitCOFFSymbolType(int Type); /// EndCOFFSymbolDef - Marks the end of the symbol definition. - virtual void EndCOFFSymbolDef() = 0; + virtual void EndCOFFSymbolDef(); /// EmitCOFFSectionIndex - Emits a COFF section index. /// @@ -461,7 +478,7 @@ class MCStreamer { /// This corresponds to an assembler statement such as: /// .size symbol, expression /// - virtual void EmitELFSize(MCSymbol *Symbol, const MCExpr *Value) = 0; + virtual void EmitELFSize(MCSymbol *Symbol, const MCExpr *Value); /// \brief Emit a Linker Optimization Hint (LOH) directive. /// \param Args - Arguments of the LOH. @@ -482,7 +499,7 @@ class MCStreamer { /// @param Size - The size of the common symbol. /// @param ByteAlignment - The alignment of the common symbol in bytes. virtual void EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size, - unsigned ByteAlignment) = 0; + unsigned ByteAlignment); /// EmitZerofill - Emit the zerofill section and an optional symbol. /// @@ -503,7 +520,7 @@ class MCStreamer { /// @param ByteAlignment - The alignment of the thread local common symbol /// if non-zero. This must be a power of 2 on some targets. virtual void EmitTBSSSymbol(const MCSection *Section, MCSymbol *Symbol, - uint64_t Size, unsigned ByteAlignment = 0) = 0; + uint64_t Size, unsigned ByteAlignment = 0); /// @} /// @name Generating Data @@ -513,7 +530,7 @@ class MCStreamer { /// /// This is used to implement assembler directives such as .byte, .ascii, /// etc. - virtual void EmitBytes(StringRef Data) = 0; + virtual void EmitBytes(StringRef Data); /// EmitValue - Emit the expression @p Value into the output as a native /// integer of the given @p Size bytes. @@ -526,7 +543,7 @@ class MCStreamer { /// match a native machine width. 
/// @param Loc - The location of the expression for error reporting. virtual void EmitValueImpl(const MCExpr *Value, unsigned Size, - const SMLoc &Loc = SMLoc()) = 0; + const SMLoc &Loc = SMLoc()); void EmitValue(const MCExpr *Value, unsigned Size, const SMLoc &Loc = SMLoc()); @@ -541,9 +558,9 @@ class MCStreamer { /// .long foo void EmitAbsValue(const MCExpr *Value, unsigned Size); - virtual void EmitULEB128Value(const MCExpr *Value) = 0; + virtual void EmitULEB128Value(const MCExpr *Value); - virtual void EmitSLEB128Value(const MCExpr *Value) = 0; + virtual void EmitSLEB128Value(const MCExpr *Value); /// EmitULEB128Value - Special case of EmitULEB128Value that avoids the /// client having to pass in a MCExpr for constant integers. @@ -555,7 +572,8 @@ class MCStreamer { /// EmitSymbolValue - Special case of EmitValue that avoids the client /// having to pass in a MCExpr for MCSymbols. - void EmitSymbolValue(const MCSymbol *Sym, unsigned Size); + void EmitSymbolValue(const MCSymbol *Sym, unsigned Size, + bool IsSectionRelative = false); /// EmitGPRel64Value - Emit the expression @p Value into the output as a /// gprel64 (64-bit GP relative) value. @@ -598,7 +616,7 @@ class MCStreamer { /// emitted. virtual void EmitValueToAlignment(unsigned ByteAlignment, int64_t Value = 0, unsigned ValueSize = 1, - unsigned MaxBytesToEmit = 0) = 0; + unsigned MaxBytesToEmit = 0); /// EmitCodeAlignment - Emit nops until the byte alignment @p ByteAlignment /// is reached. @@ -612,7 +630,7 @@ class MCStreamer { /// the alignment cannot be reached in this many bytes, no bytes are /// emitted. virtual void EmitCodeAlignment(unsigned ByteAlignment, - unsigned MaxBytesToEmit = 0) = 0; + unsigned MaxBytesToEmit = 0); /// EmitValueToOffset - Emit some number of copies of @p Value until the /// byte offset @p Offset is reached. @@ -624,13 +642,13 @@ class MCStreamer { /// @param Value - The value to use when filling bytes. /// @return false on success, true if the offset was invalid. virtual bool EmitValueToOffset(const MCExpr *Offset, - unsigned char Value = 0) = 0; + unsigned char Value = 0); /// @} /// EmitFileDirective - Switch to a new logical file. This is used to /// implement the '.file "foo.c"' assembler directive. - virtual void EmitFileDirective(StringRef Filename) = 0; + virtual void EmitFileDirective(StringRef Filename); /// Emit the "identifiers" directive. This implements the /// '.ident "version foo"' assembler directive. 
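[Editor's note] With most of the previously pure-virtual emitters given default implementations, a streamer subclass only needs to override what it actually cares about. An illustrative skeleton (TinyStreamer is hypothetical, and it assumes the common-symbol and zerofill hooks, which this patch does not touch, are still pure):

    #include "llvm/MC/MCStreamer.h"

    namespace {
    class TinyStreamer : public llvm::MCStreamer {
    public:
      explicit TinyStreamer(llvm::MCContext &Ctx) : MCStreamer(Ctx) {}
      // EmitBytes, EmitValueImpl, EmitInstruction, ... now have defaults,
      // so only the emitters left pure need stub overrides here.
      void EmitCommonSymbol(llvm::MCSymbol *, uint64_t, unsigned) override {}
      void EmitZerofill(const llvm::MCSection *, llvm::MCSymbol *, uint64_t,
                        unsigned) override {}
    };
    } // end anonymous namespace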
@@ -677,38 +695,38 @@ class MCStreamer { virtual void EmitCFIRegister(int64_t Register1, int64_t Register2); virtual void EmitCFIWindowSave(); - virtual void EmitWin64EHStartProc(const MCSymbol *Symbol); - virtual void EmitWin64EHEndProc(); - virtual void EmitWin64EHStartChained(); - virtual void EmitWin64EHEndChained(); - virtual void EmitWin64EHHandler(const MCSymbol *Sym, bool Unwind, - bool Except); - virtual void EmitWin64EHHandlerData(); - virtual void EmitWin64EHPushReg(unsigned Register); - virtual void EmitWin64EHSetFrame(unsigned Register, unsigned Offset); - virtual void EmitWin64EHAllocStack(unsigned Size); - virtual void EmitWin64EHSaveReg(unsigned Register, unsigned Offset); - virtual void EmitWin64EHSaveXMM(unsigned Register, unsigned Offset); - virtual void EmitWin64EHPushFrame(bool Code); - virtual void EmitWin64EHEndProlog(); + virtual void EmitWinCFIStartProc(const MCSymbol *Symbol); + virtual void EmitWinCFIEndProc(); + virtual void EmitWinCFIStartChained(); + virtual void EmitWinCFIEndChained(); + virtual void EmitWinCFIPushReg(unsigned Register); + virtual void EmitWinCFISetFrame(unsigned Register, unsigned Offset); + virtual void EmitWinCFIAllocStack(unsigned Size); + virtual void EmitWinCFISaveReg(unsigned Register, unsigned Offset); + virtual void EmitWinCFISaveXMM(unsigned Register, unsigned Offset); + virtual void EmitWinCFIPushFrame(bool Code); + virtual void EmitWinCFIEndProlog(); + + virtual void EmitWinEHHandler(const MCSymbol *Sym, bool Unwind, bool Except); + virtual void EmitWinEHHandlerData(); /// EmitInstruction - Emit the given @p Instruction into the current /// section. - virtual void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI) = 0; + virtual void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI); /// \brief Set the bundle alignment mode from now on in the section. /// The argument is the power of 2 to which the alignment is set. The /// value 0 means turn the bundle alignment off. - virtual void EmitBundleAlignMode(unsigned AlignPow2) = 0; + virtual void EmitBundleAlignMode(unsigned AlignPow2); /// \brief The following instructions are a bundle-locked group. /// /// \param AlignToEnd - If true, the bundle-locked group will be aligned to /// the end of a bundle. - virtual void EmitBundleLock(bool AlignToEnd) = 0; + virtual void EmitBundleLock(bool AlignToEnd); /// \brief Ends a bundle-locked group. - virtual void EmitBundleUnlock() = 0; + virtual void EmitBundleUnlock(); /// EmitRawText - If this file is backed by a assembly streamer, this dumps /// the specified string in the output .s file. This capability is @@ -719,9 +737,11 @@ class MCStreamer { virtual void Flush() {} /// FinishImpl - Streamer specific finalization. - virtual void FinishImpl() = 0; + virtual void FinishImpl(); /// Finish - Finish emission of machine code. void Finish(); + + virtual bool mayHaveInstructions() const { return true; } }; /// createNullStreamer - Create a dummy machine code streamer, which does diff --git a/include/llvm/MC/MCTargetAsmParser.h b/include/llvm/MC/MCTargetAsmParser.h index 384cc1b880ca..9a5881b14b44 100644 --- a/include/llvm/MC/MCTargetAsmParser.h +++ b/include/llvm/MC/MCTargetAsmParser.h @@ -164,6 +164,9 @@ class MCTargetAsmParser : public MCAsmParserExtension { unsigned &ErrorInfo, bool MatchingInlineAsm) = 0; + /// Allows targets to let registers opt out of clobber lists. 
+ virtual bool OmitRegisterFromClobberLists(unsigned RegNo) { return false; } + /// Allow a target to add special case operand matching for things that /// tblgen doesn't/can't handle effectively. For example, literal /// immediates on ARM. TableGen expects a token operand, but the parser diff --git a/include/llvm/MC/MCTargetOptions.h b/include/llvm/MC/MCTargetOptions.h index 80cc8befb7a7..eb4348ed3ec2 100644 --- a/include/llvm/MC/MCTargetOptions.h +++ b/include/llvm/MC/MCTargetOptions.h @@ -29,6 +29,7 @@ class MCTargetOptions { bool ShowMCEncoding : 1; bool ShowMCInst : 1; bool AsmVerbose : 1; + int DwarfVersion; MCTargetOptions(); }; @@ -41,7 +42,8 @@ inline bool operator==(const MCTargetOptions &LHS, const MCTargetOptions &RHS) { ARE_EQUAL(MCUseDwarfDirectory) && ARE_EQUAL(ShowMCEncoding) && ARE_EQUAL(ShowMCInst) && - ARE_EQUAL(AsmVerbose)); + ARE_EQUAL(AsmVerbose) && + ARE_EQUAL(DwarfVersion)); #undef ARE_EQUAL } diff --git a/include/llvm/MC/MCTargetOptionsCommandFlags.h b/include/llvm/MC/MCTargetOptionsCommandFlags.h index 17a117a2a3bd..6d4eb0ef5911 100644 --- a/include/llvm/MC/MCTargetOptionsCommandFlags.h +++ b/include/llvm/MC/MCTargetOptionsCommandFlags.h @@ -33,11 +33,20 @@ cl::opt RelaxAll("mc-relax-all", cl::desc("When used with filetype=obj, " "relax all fixups in the emitted object file")); +cl::opt DwarfVersion("dwarf-version", cl::desc("Dwarf version"), + cl::init(0)); + +cl::opt ShowMCInst("asm-show-inst", + cl::desc("Emit internal instruction representation to " + "assembly file")); + static inline MCTargetOptions InitMCTargetOptionsFromFlags() { MCTargetOptions Options; Options.SanitizeAddress = (AsmInstrumentation == MCTargetOptions::AsmInstrumentationAddress); Options.MCRelaxAll = RelaxAll; + Options.DwarfVersion = DwarfVersion; + Options.ShowMCInst = ShowMCInst; return Options; } diff --git a/include/llvm/MC/MCWin64EH.h b/include/llvm/MC/MCWin64EH.h index d21e7620263f..3df0d0ab8c46 100644 --- a/include/llvm/MC/MCWin64EH.h +++ b/include/llvm/MC/MCWin64EH.h @@ -15,8 +15,8 @@ #ifndef LLVM_MC_MCWIN64EH_H #define LLVM_MC_MCWIN64EH_H +#include "llvm/MC/MCWinEH.h" #include "llvm/Support/Win64EH.h" -#include #include namespace llvm { @@ -24,44 +24,38 @@ namespace llvm { class MCStreamer; class MCSymbol; - class MCWin64EHInstruction { - public: - typedef Win64EH::UnwindOpcodes OpType; - private: - OpType Operation; - MCSymbol *Label; - unsigned Offset; - unsigned Register; - public: - MCWin64EHInstruction(OpType Op, MCSymbol *L, unsigned Reg) - : Operation(Op), Label(L), Offset(0), Register(Reg) { - assert(Op == Win64EH::UOP_PushNonVol); - } - MCWin64EHInstruction(MCSymbol *L, unsigned Size) - : Operation(Size>128 ? Win64EH::UOP_AllocLarge : Win64EH::UOP_AllocSmall), - Label(L), Offset(Size) { } - MCWin64EHInstruction(OpType Op, MCSymbol *L, unsigned Reg, unsigned Off) - : Operation(Op), Label(L), Offset(Off), Register(Reg) { - assert(Op == Win64EH::UOP_SetFPReg || - Op == Win64EH::UOP_SaveNonVol || - Op == Win64EH::UOP_SaveNonVolBig || - Op == Win64EH::UOP_SaveXMM128 || - Op == Win64EH::UOP_SaveXMM128Big); - } - MCWin64EHInstruction(OpType Op, MCSymbol *L, bool Code) - : Operation(Op), Label(L), Offset(Code ? 
1 : 0) { - assert(Op == Win64EH::UOP_PushMachFrame); - } - OpType getOperation() const { return Operation; } - MCSymbol *getLabel() const { return Label; } - unsigned getOffset() const { return Offset; } - unsigned getSize() const { return Offset; } - unsigned getRegister() const { return Register; } - bool isPushCodeFrame() const { return Offset == 1; } - }; +namespace Win64EH { +struct Instruction { + static WinEH::Instruction PushNonVol(MCSymbol *L, unsigned Reg) { + return WinEH::Instruction(Win64EH::UOP_PushNonVol, L, Reg, -1); + } + static WinEH::Instruction Alloc(MCSymbol *L, unsigned Size) { + return WinEH::Instruction(Size > 128 ? UOP_AllocLarge : UOP_AllocSmall, L, + -1, Size); + } + static WinEH::Instruction PushMachFrame(MCSymbol *L, bool Code) { + return WinEH::Instruction(UOP_PushMachFrame, L, -1, Code ? 1 : 0); + } + static WinEH::Instruction SaveNonVol(MCSymbol *L, unsigned Reg, + unsigned Offset) { + return WinEH::Instruction(Offset > 512 * 1024 - 8 ? UOP_SaveNonVolBig + : UOP_SaveNonVol, + L, Reg, Offset); + } + static WinEH::Instruction SaveXMM(MCSymbol *L, unsigned Reg, + unsigned Offset) { + return WinEH::Instruction(Offset > 512 * 1024 - 8 ? UOP_SaveXMM128Big + : UOP_SaveXMM128, + L, Reg, Offset); + } + static WinEH::Instruction SetFPReg(MCSymbol *L, unsigned Reg, unsigned Off) { + return WinEH::Instruction(UOP_SetFPReg, L, Reg, Off); + } +}; +} - struct MCWin64EHUnwindInfo { - MCWin64EHUnwindInfo() + struct MCWinFrameInfo { + MCWinFrameInfo() : Begin(nullptr), End(nullptr),ExceptionHandler(nullptr), Function(nullptr), PrologEnd(nullptr), Symbol(nullptr), HandlesUnwind(false), HandlesExceptions(false), LastFrameInst(-1), @@ -75,8 +69,8 @@ namespace llvm { bool HandlesUnwind; bool HandlesExceptions; int LastFrameInst; - MCWin64EHUnwindInfo *ChainedParent; - std::vector Instructions; + MCWinFrameInfo *ChainedParent; + std::vector Instructions; }; class MCWin64EHUnwindEmitter { @@ -86,7 +80,7 @@ namespace llvm { // This emits the unwind info sections (.pdata and .xdata in PE/COFF). 
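[Editor's note] A sketch of recording prologue unwind operations with the new static builders that replace the MCWin64EHInstruction constructors (the register number and allocation size are illustrative):

    #include "llvm/MC/MCWin64EH.h"

    static void recordPrologue(llvm::MCWinFrameInfo &Frame,
                               llvm::MCSymbol *Label) {
      using llvm::Win64EH::Instruction;
      // Each old constructor form maps onto a named factory; Alloc picks
      // UOP_AllocLarge vs. UOP_AllocSmall from the size automatically.
      Frame.Instructions.push_back(Instruction::PushNonVol(Label, /*Reg=*/5));
      Frame.Instructions.push_back(Instruction::Alloc(Label, /*Size=*/64));
    }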
// static void Emit(MCStreamer &streamer); - static void EmitUnwindInfo(MCStreamer &streamer, MCWin64EHUnwindInfo *info); + static void EmitUnwindInfo(MCStreamer &streamer, MCWinFrameInfo *info); }; } // end namespace llvm diff --git a/include/llvm/MC/MCWinCOFFStreamer.h b/include/llvm/MC/MCWinCOFFStreamer.h index 34e39bb0a636..7d2d0e4f5560 100644 --- a/include/llvm/MC/MCWinCOFFStreamer.h +++ b/include/llvm/MC/MCWinCOFFStreamer.h @@ -35,7 +35,6 @@ class MCWinCOFFStreamer : public MCObjectStreamer { void InitSections() override; void EmitLabel(MCSymbol *Symbol) override; - void EmitDebugLabel(MCSymbol *Symbol) override; void EmitAssemblerFlag(MCAssemblerFlag Flag) override; void EmitThumbFunc(MCSymbol *Func) override; bool EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute) override; @@ -57,7 +56,7 @@ class MCWinCOFFStreamer : public MCObjectStreamer { unsigned ByteAlignment) override; void EmitFileDirective(StringRef Filename) override; void EmitIdent(StringRef IdentString) override; - void EmitWin64EHHandlerData() override; + void EmitWinEHHandlerData() override; void FinishImpl() override; /// \} diff --git a/include/llvm/MC/MCWinEH.h b/include/llvm/MC/MCWinEH.h new file mode 100644 index 000000000000..1cd1b0f1b8ca --- /dev/null +++ b/include/llvm/MC/MCWinEH.h @@ -0,0 +1,29 @@ +//===- MCWinEH.h - Windows Unwinding Support --------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_MC_MCWINEH_H +#define LLVM_MC_MCWINEH_H + +namespace llvm { +class MCSymbol; + +namespace WinEH { +struct Instruction { + const MCSymbol *Label; + const unsigned Offset; + const unsigned Register; + const unsigned Operation; + + Instruction(unsigned Op, MCSymbol *L, unsigned Reg, unsigned Off) + : Label(L), Offset(Off), Register(Reg), Operation(Op) {} +}; +} +} + +#endif diff --git a/include/llvm/Object/StringTableBuilder.h b/include/llvm/MC/StringTableBuilder.h similarity index 94% rename from include/llvm/Object/StringTableBuilder.h rename to include/llvm/MC/StringTableBuilder.h index c61e216bdf9b..065e9e06e2c9 100644 --- a/include/llvm/Object/StringTableBuilder.h +++ b/include/llvm/MC/StringTableBuilder.h @@ -7,8 +7,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_OBJECT_STRINGTABLE_BUILDER_H -#define LLVM_OBJECT_STRINGTABLE_BUILDER_H +#ifndef LLVM_MC_STRINGTABLE_BUILDER_H +#define LLVM_MC_STRINGTABLE_BUILDER_H #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringMap.h" diff --git a/include/llvm/Object/YAML.h b/include/llvm/MC/YAML.h similarity index 77% rename from include/llvm/Object/YAML.h rename to include/llvm/MC/YAML.h index 1792e8b62677..383cdc6785fa 100644 --- a/include/llvm/Object/YAML.h +++ b/include/llvm/MC/YAML.h @@ -1,26 +1,10 @@ -//===- YAML.h - YAMLIO utilities for object files ---------------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file declares utility classes for handling the YAML representation of -// object files. 
-// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_OBJECT_YAML_H -#define LLVM_OBJECT_YAML_H +#ifndef LLVM_MC_YAML_H +#define LLVM_MC_YAML_H #include "llvm/Support/YAMLTraits.h" namespace llvm { -namespace object { namespace yaml { - /// \brief Specialized YAMLIO scalar type for representing a binary blob. /// /// A typical use case would be to represent the content of a section in a @@ -100,18 +84,11 @@ inline bool operator==(const BinaryRef &LHS, const BinaryRef &RHS) { return LHS.DataIsHexString == RHS.DataIsHexString && LHS.Data == RHS.Data; } -} -} - -namespace yaml { -template <> struct ScalarTraits { - static void output(const object::yaml::BinaryRef &, void *, - llvm::raw_ostream &); - static StringRef input(StringRef, void *, object::yaml::BinaryRef &); +template <> struct ScalarTraits { + static void output(const BinaryRef &, void *, llvm::raw_ostream &); + static StringRef input(StringRef, void *, BinaryRef &); static bool mustQuote(StringRef S) { return needsQuotes(S); } }; } - } - #endif diff --git a/include/llvm/Object/Archive.h b/include/llvm/Object/Archive.h index 4fe44a7769ae..af6c995c1d05 100644 --- a/include/llvm/Object/Archive.h +++ b/include/llvm/Object/Archive.h @@ -164,8 +164,8 @@ class Archive : public Binary { } }; - Archive(MemoryBuffer *source, std::error_code &ec); - static ErrorOr create(MemoryBuffer *Source); + Archive(std::unique_ptr Source, std::error_code &EC); + static ErrorOr create(std::unique_ptr Source); enum Kind { K_GNU, diff --git a/include/llvm/Object/Binary.h b/include/llvm/Object/Binary.h index 8ac84e78d484..91984cb52715 100644 --- a/include/llvm/Object/Binary.h +++ b/include/llvm/Object/Binary.h @@ -32,12 +32,11 @@ class Binary { Binary(const Binary &other) LLVM_DELETED_FUNCTION; unsigned int TypeID; - bool BufferOwned; protected: - MemoryBuffer *Data; + std::unique_ptr Data; - Binary(unsigned int Type, MemoryBuffer *Source, bool BufferOwned = true); + Binary(unsigned int Type, std::unique_ptr Source); enum { ID_Archive, @@ -79,6 +78,7 @@ class Binary { virtual ~Binary(); StringRef getData() const; + MemoryBuffer *releaseBuffer() { return Data.release(); } StringRef getFileName() const; // Cast methods. @@ -125,10 +125,8 @@ class Binary { /// @brief Create a Binary from Source, autodetecting the file type. /// -/// @param Source The data to create the Binary from. Ownership is transferred -/// to the Binary if successful. If an error is returned, -/// Source is destroyed by createBinary before returning. -ErrorOr createBinary(MemoryBuffer *Source, +/// @param Source The data to create the Binary from. 
+ErrorOr createBinary(std::unique_ptr Source, LLVMContext *Context = nullptr); ErrorOr createBinary(StringRef Path); diff --git a/include/llvm/Object/COFF.h b/include/llvm/Object/COFF.h index f0f2793a928c..e2da070d47bd 100644 --- a/include/llvm/Object/COFF.h +++ b/include/llvm/Object/COFF.h @@ -420,8 +420,7 @@ class COFFObjectFile : public ObjectFile { StringRef &Result) const override; public: - COFFObjectFile(MemoryBuffer *Object, std::error_code &EC, - bool BufferOwned = true); + COFFObjectFile(std::unique_ptr Object, std::error_code &EC); basic_symbol_iterator symbol_begin_impl() const override; basic_symbol_iterator symbol_end_impl() const override; library_iterator needed_library_begin() const override; diff --git a/include/llvm/Object/COFFYAML.h b/include/llvm/Object/COFFYAML.h index 3f48e07f575a..4aba08f75ddc 100644 --- a/include/llvm/Object/COFFYAML.h +++ b/include/llvm/Object/COFFYAML.h @@ -15,7 +15,7 @@ #define LLVM_OBJECT_COFFYAML_H #include "llvm/ADT/Optional.h" -#include "llvm/Object/YAML.h" +#include "llvm/MC/YAML.h" #include "llvm/Support/COFF.h" namespace llvm { @@ -49,7 +49,7 @@ namespace COFFYAML { struct Section { COFF::section Header; unsigned Alignment; - object::yaml::BinaryRef SectionData; + yaml::BinaryRef SectionData; std::vector Relocations; StringRef Name; Section(); diff --git a/include/llvm/Object/ELF.h b/include/llvm/Object/ELF.h index 716b3b93c0ea..fbc48e6d7218 100644 --- a/include/llvm/Object/ELF.h +++ b/include/llvm/Object/ELF.h @@ -40,11 +40,12 @@ StringRef getELFRelocationTypeName(uint32_t Machine, uint32_t Type); // Subclasses of ELFFile may need this for template instantiation inline std::pair -getElfArchType(MemoryBuffer *Object) { - if (Object->getBufferSize() < ELF::EI_NIDENT) - return std::make_pair((uint8_t)ELF::ELFCLASSNONE,(uint8_t)ELF::ELFDATANONE); - return std::make_pair((uint8_t) Object->getBufferStart()[ELF::EI_CLASS], - (uint8_t) Object->getBufferStart()[ELF::EI_DATA]); +getElfArchType(StringRef Object) { + if (Object.size() < ELF::EI_NIDENT) + return std::make_pair((uint8_t)ELF::ELFCLASSNONE, + (uint8_t)ELF::ELFDATANONE); + return std::make_pair((uint8_t)Object[ELF::EI_CLASS], + (uint8_t)Object[ELF::EI_DATA]); } template @@ -230,10 +231,10 @@ class ELFFile { typedef SmallVector Sections_t; typedef DenseMap IndexMap_t; - MemoryBuffer *Buf; + StringRef Buf; const uint8_t *base() const { - return reinterpret_cast(Buf->getBufferStart()); + return reinterpret_cast(Buf.data()); } const Elf_Ehdr *Header; @@ -317,7 +318,7 @@ class ELFFile { std::pair getRelocationSymbol(const Elf_Shdr *RelSec, const RelT *Rel) const; - ELFFile(MemoryBuffer *Object, std::error_code &ec); + ELFFile(StringRef Object, std::error_code &ec); bool isMipsELF64() const { return Header->e_machine == ELF::EM_MIPS && @@ -536,7 +537,7 @@ ELFFile::getSymbol(uint32_t Index) const { template ErrorOr > ELFFile::getSectionContents(const Elf_Shdr *Sec) const { - if (Sec->sh_offset + Sec->sh_size > Buf->getBufferSize()) + if (Sec->sh_offset + Sec->sh_size > Buf.size()) return object_error::parse_failed; const uint8_t *Start = base() + Sec->sh_offset; return ArrayRef(Start, Sec->sh_size); @@ -602,7 +603,7 @@ void ELFFile::VerifyStrTab(const Elf_Shdr *sh) const { template uint64_t ELFFile::getNumSections() const { assert(Header && "Header not initialized!"); - if (Header->e_shnum == ELF::SHN_UNDEF) { + if (Header->e_shnum == ELF::SHN_UNDEF && Header->e_shoff > 0) { assert(SectionHeaderTable && "SectionHeaderTable not initialized!"); return SectionHeaderTable->sh_size; } @@ -621,13 
+622,13 @@ typename ELFFile::uintX_t ELFFile::getStringTableIndex() const { } template -ELFFile::ELFFile(MemoryBuffer *Object, std::error_code &ec) +ELFFile::ELFFile(StringRef Object, std::error_code &ec) : Buf(Object), SectionHeaderTable(nullptr), dot_shstrtab_sec(nullptr), dot_strtab_sec(nullptr), dot_symtab_sec(nullptr), SymbolTableSectionHeaderIndex(nullptr), dot_gnu_version_sec(nullptr), dot_gnu_version_r_sec(nullptr), dot_gnu_version_d_sec(nullptr), dt_soname(nullptr) { - const uint64_t FileSize = Buf->getBufferSize(); + const uint64_t FileSize = Buf.size(); if (sizeof(Elf_Ehdr) > FileSize) // FIXME: Proper error handling. diff --git a/include/llvm/Object/ELFObjectFile.h b/include/llvm/Object/ELFObjectFile.h index 876206b38a1e..069f38112def 100644 --- a/include/llvm/Object/ELFObjectFile.h +++ b/include/llvm/Object/ELFObjectFile.h @@ -65,6 +65,7 @@ class ELFObjectFile : public ObjectFile { uint32_t &Res) const override; std::error_code getSymbolSize(DataRefImpl Symb, uint64_t &Res) const override; uint32_t getSymbolFlags(DataRefImpl Symb) const override; + std::error_code getSymbolOther(DataRefImpl Symb, uint8_t &Res) const override; std::error_code getSymbolType(DataRefImpl Symb, SymbolRef::Type &Res) const override; std::error_code getSymbolSection(DataRefImpl Symb, @@ -177,8 +178,7 @@ class ELFObjectFile : public ObjectFile { bool isDyldELFObject; public: - ELFObjectFile(MemoryBuffer *Object, std::error_code &EC, - bool BufferOwned = true); + ELFObjectFile(std::unique_ptr Object, std::error_code &EC); const Elf_Sym *getSymbol(DataRefImpl Symb) const; @@ -203,6 +203,11 @@ class ELFObjectFile : public ObjectFile { unsigned getArch() const override; StringRef getLoadName() const override; + std::error_code getPlatformFlags(unsigned &Result) const override { + Result = EF.getHeader()->e_flags; + return object_error::success; + } + const ELFFile *getELFFile() const { return &EF; } bool isDyldType() const { return isDyldELFObject; } @@ -295,6 +300,13 @@ std::error_code ELFObjectFile::getSymbolSize(DataRefImpl Symb, return object_error::success; } +template +std::error_code ELFObjectFile::getSymbolOther(DataRefImpl Symb, + uint8_t &Result) const { + Result = toELFSymIter(Symb)->st_other; + return object_error::success; +} + template std::error_code ELFObjectFile::getSymbolType(DataRefImpl Symb, @@ -774,13 +786,13 @@ ELFObjectFile::getRela(DataRefImpl Rela) const { } template -ELFObjectFile::ELFObjectFile(MemoryBuffer *Object, std::error_code &ec, - bool BufferOwned) +ELFObjectFile::ELFObjectFile(std::unique_ptr Object, + std::error_code &EC) : ObjectFile(getELFType(static_cast(ELFT::TargetEndianness) == support::little, ELFT::Is64Bits), - Object, BufferOwned), - EF(Object, ec) {} + std::move(Object)), + EF(Data->getBuffer(), EC) {} template basic_symbol_iterator ELFObjectFile::symbol_begin_impl() const { @@ -918,6 +930,7 @@ StringRef ELFObjectFile::getFileFormatName() const { template unsigned ELFObjectFile::getArch() const { + bool IsLittleEndian = ELFT::TargetEndianness == support::little; switch (EF.getHeader()->e_machine) { case ELF::EM_386: return Triple::x86; @@ -930,11 +943,16 @@ unsigned ELFObjectFile::getArch() const { case ELF::EM_HEXAGON: return Triple::hexagon; case ELF::EM_MIPS: - return (ELFT::TargetEndianness == support::little) ? Triple::mipsel - : Triple::mips; + switch (EF.getHeader()->e_ident[ELF::EI_CLASS]) { + case ELF::ELFCLASS32: + return IsLittleEndian ? Triple::mipsel : Triple::mips; + case ELF::ELFCLASS64: + return IsLittleEndian ? 
Triple::mips64el : Triple::mips64; + default: + report_fatal_error("Invalid ELFCLASS!"); + } case ELF::EM_PPC64: - return (ELFT::TargetEndianness == support::little) ? Triple::ppc64le - : Triple::ppc64; + return IsLittleEndian ? Triple::ppc64le : Triple::ppc64; case ELF::EM_S390: return Triple::systemz; diff --git a/include/llvm/Object/ELFYAML.h b/include/llvm/Object/ELFYAML.h index 42eeb0ef752c..fc8cc9581655 100644 --- a/include/llvm/Object/ELFYAML.h +++ b/include/llvm/Object/ELFYAML.h @@ -16,7 +16,7 @@ #ifndef LLVM_OBJECT_ELFYAML_H #define LLVM_OBJECT_ELFYAML_H -#include "llvm/Object/YAML.h" +#include "llvm/MC/YAML.h" #include "llvm/Support/ELF.h" namespace llvm { @@ -83,7 +83,7 @@ struct Section { virtual ~Section(); }; struct RawContentSection : Section { - object::yaml::BinaryRef Content; + yaml::BinaryRef Content; llvm::yaml::Hex64 Size; RawContentSection() : Section(SectionKind::RawContent) {} static bool classof(const Section *S) { diff --git a/include/llvm/Object/IRObjectFile.h b/include/llvm/Object/IRObjectFile.h index c87fe1519acc..b33cc263b9bd 100644 --- a/include/llvm/Object/IRObjectFile.h +++ b/include/llvm/Object/IRObjectFile.h @@ -25,21 +25,33 @@ namespace object { class IRObjectFile : public SymbolicFile { std::unique_ptr M; std::unique_ptr Mang; + std::vector> AsmSymbols; public: - IRObjectFile(MemoryBuffer *Object, std::error_code &EC, LLVMContext &Context, - bool BufferOwned); + IRObjectFile(std::unique_ptr Object, std::unique_ptr M); + ~IRObjectFile(); void moveSymbolNext(DataRefImpl &Symb) const override; std::error_code printSymbolName(raw_ostream &OS, DataRefImpl Symb) const override; uint32_t getSymbolFlags(DataRefImpl Symb) const override; - const GlobalValue &getSymbolGV(DataRefImpl Symb) const; + const GlobalValue *getSymbolGV(DataRefImpl Symb) const; basic_symbol_iterator symbol_begin_impl() const override; basic_symbol_iterator symbol_end_impl() const override; + const Module &getModule() const { + return const_cast(this)->getModule(); + } + Module &getModule() { + return *M; + } + static inline bool classof(const Binary *v) { return v->isIR(); } + + static ErrorOr + createIRObjectFile(std::unique_ptr Object, + LLVMContext &Context); }; } } diff --git a/include/llvm/Object/MachO.h b/include/llvm/Object/MachO.h index 6e1ab253ec9b..4835eb80bd65 100644 --- a/include/llvm/Object/MachO.h +++ b/include/llvm/Object/MachO.h @@ -56,8 +56,8 @@ class MachOObjectFile : public ObjectFile { MachO::load_command C; // The command itself. 
}; - MachOObjectFile(MemoryBuffer *Object, bool IsLittleEndian, bool Is64Bits, - std::error_code &EC, bool BufferOwned = true); + MachOObjectFile(std::unique_ptr Object, bool IsLittleEndian, + bool Is64Bits, std::error_code &EC); void moveSymbolNext(DataRefImpl &Symb) const override; std::error_code getSymbolName(DataRefImpl Symb, @@ -201,6 +201,8 @@ class MachOObjectFile : public ObjectFile { getLinkerOptionsLoadCommand(const LoadCommandInfo &L) const; MachO::version_min_command getVersionMinLoadCommand(const LoadCommandInfo &L) const; + MachO::dylib_command + getDylibIDLoadCommand(const LoadCommandInfo &L) const; MachO::any_relocation_info getRelocation(DataRefImpl Rel) const; MachO::data_in_code_entry getDice(DataRefImpl Rel) const; @@ -223,6 +225,9 @@ class MachOObjectFile : public ObjectFile { StringRef &Suffix); static Triple::ArchType getArch(uint32_t CPUType); + static Triple getArch(uint32_t CPUType, uint32_t CPUSubType); + static Triple getArch(StringRef ArchFlag); + static Triple getHostArch(); static bool classof(const Binary *v) { return v->isMachO(); diff --git a/include/llvm/Object/MachOUniversal.h b/include/llvm/Object/MachOUniversal.h index 47e93c26b46a..e6677f5bf28b 100644 --- a/include/llvm/Object/MachOUniversal.h +++ b/include/llvm/Object/MachOUniversal.h @@ -18,6 +18,7 @@ #include "llvm/ADT/Triple.h" #include "llvm/Object/Binary.h" #include "llvm/Object/Archive.h" +#include "llvm/Object/MachO.h" #include "llvm/Support/ErrorOr.h" #include "llvm/Support/MachO.h" @@ -52,8 +53,12 @@ class MachOUniversalBinary : public Binary { ObjectForArch getNext() const { return ObjectForArch(Parent, Index + 1); } uint32_t getCPUType() const { return Header.cputype; } + std::string getArchTypeName() const { + Triple T = MachOObjectFile::getArch(Header.cputype, Header.cpusubtype); + return T.getArchName(); + } - std::error_code getAsObjectFile(std::unique_ptr &Result) const; + ErrorOr> getAsObjectFile() const; std::error_code getAsArchive(std::unique_ptr &Result) const; }; @@ -79,8 +84,10 @@ class MachOUniversalBinary : public Binary { } }; - MachOUniversalBinary(MemoryBuffer *Source, std::error_code &ec); - static ErrorOr create(MemoryBuffer *Source); + MachOUniversalBinary(std::unique_ptr Source, + std::error_code &ec); + static ErrorOr + create(std::unique_ptr Source); object_iterator begin_objects() const { return ObjectForArch(this, 0); @@ -96,8 +103,8 @@ class MachOUniversalBinary : public Binary { return V->isMachOUniversalBinary(); } - std::error_code getObjectForArch(Triple::ArchType Arch, - std::unique_ptr &Result) const; + ErrorOr> + getObjectForArch(Triple::ArchType Arch) const; }; } diff --git a/include/llvm/Object/ObjectFile.h b/include/llvm/Object/ObjectFile.h index 152bb7e10c1f..a4370a3f30d0 100644 --- a/include/llvm/Object/ObjectFile.h +++ b/include/llvm/Object/ObjectFile.h @@ -149,6 +149,7 @@ class SymbolRef : public BasicSymbolRef { std::error_code getAlignment(uint32_t &Result) const; std::error_code getSize(uint64_t &Result) const; std::error_code getType(SymbolRef::Type &Result) const; + std::error_code getOther(uint8_t &Result) const; /// @brief Get section this symbol is defined in reference to. Result is /// end_sections() if it is undefined or is an absolute symbol. 
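The Object-library hunks above and below all follow one ownership convention: constructors and factory functions now take the input as std::unique_ptr<MemoryBuffer> and report failure through ErrorOr, replacing the old raw MemoryBuffer* plus BufferOwned flag plus out-parameter style. Below is a minimal caller sketch under the new convention; it is my illustration, not part of the patch, the helper name is hypothetical, and the angle-bracketed template arguments that the flattened diff text has dropped are spelled out here.

```cpp
#include "llvm/Object/ObjectFile.h"
#include "llvm/Support/ErrorOr.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/raw_ostream.h"
#include <memory>
#include <system_error>

using namespace llvm;
using namespace llvm::object;

// Hypothetical helper: print st_other for every symbol in an object file.
static std::error_code printSymbolOthers(StringRef Path) {
  // MemoryBuffer::getFile now returns ErrorOr<std::unique_ptr<MemoryBuffer>>
  // (see the MemoryBuffer.h hunks later in this patch).
  ErrorOr<std::unique_ptr<MemoryBuffer>> BufOrErr = MemoryBuffer::getFile(Path);
  if (std::error_code EC = BufOrErr.getError())
    return EC;

  // createObjectFile takes the buffer by unique_ptr reference: it assumes
  // ownership only on success, so the caller keeps the buffer on failure.
  // There is no BufferOwned flag any more.
  ErrorOr<ObjectFile *> ObjOrErr = ObjectFile::createObjectFile(BufOrErr.get());
  if (std::error_code EC = ObjOrErr.getError())
    return EC;
  std::unique_ptr<ObjectFile> Obj(ObjOrErr.get());

  for (symbol_iterator I = Obj->symbol_begin(), E = Obj->symbol_end(); I != E;
       ++I) {
    uint8_t Other;
    // getOther surfaces the ELF st_other field (used later in this patch for
    // the STO_PPC64_LOCAL_* local-entry encoding); other formats report
    // object_error::invalid_file_type from the default implementation.
    if (!I->getOther(Other))
      outs() << "st_other: " << unsigned(Other) << "\n";
  }
  return std::error_code();
}
```
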
@@ -208,7 +209,7 @@ class ObjectFile : public SymbolicFile { ObjectFile(const ObjectFile &other) LLVM_DELETED_FUNCTION; protected: - ObjectFile(unsigned int Type, MemoryBuffer *Source, bool BufferOwned = true); + ObjectFile(unsigned int Type, std::unique_ptr Source); const uint8_t *base() const { return reinterpret_cast(Data->getBufferStart()); @@ -237,6 +238,10 @@ class ObjectFile : public SymbolicFile { SymbolRef::Type &Res) const = 0; virtual std::error_code getSymbolSection(DataRefImpl Symb, section_iterator &Res) const = 0; + virtual std::error_code getSymbolOther(DataRefImpl Symb, + uint8_t &Res) const { + return object_error::invalid_file_type; + } // Same as above for SectionRef. friend class SectionRef; @@ -328,16 +333,23 @@ class ObjectFile : public SymbolicFile { /// LC_ID_DYLIB (install name) on MachO. virtual StringRef getLoadName() const = 0; + /// Returns platform-specific object flags, if any. + virtual std::error_code getPlatformFlags(unsigned &Result) const { + Result = 0; + return object_error::invalid_file_type; + } + /// @returns Pointer to ObjectFile subclass to handle this type of object. /// @param ObjectPath The path to the object file. ObjectPath.isObject must /// return true. /// @brief Create ObjectFile from path. static ErrorOr createObjectFile(StringRef ObjectPath); - static ErrorOr createObjectFile(MemoryBuffer *Object, - bool BufferOwned, - sys::fs::file_magic Type); - static ErrorOr createObjectFile(MemoryBuffer *Object) { - return createObjectFile(Object, true, sys::fs::file_magic::unknown); + static ErrorOr + createObjectFile(std::unique_ptr &Object, + sys::fs::file_magic Type); + static ErrorOr + createObjectFile(std::unique_ptr &Object) { + return createObjectFile(Object, sys::fs::file_magic::unknown); } @@ -346,12 +358,12 @@ class ObjectFile : public SymbolicFile { } public: - static ErrorOr createCOFFObjectFile(MemoryBuffer *Object, - bool BufferOwned = true); - static ErrorOr createELFObjectFile(MemoryBuffer *Object, - bool BufferOwned = true); - static ErrorOr createMachOObjectFile(MemoryBuffer *Object, - bool BufferOwned = true); + static ErrorOr + createCOFFObjectFile(std::unique_ptr Object); + static ErrorOr + createELFObjectFile(std::unique_ptr &Object); + static ErrorOr + createMachOObjectFile(std::unique_ptr &Object); }; // Inline function definitions. @@ -382,6 +394,10 @@ inline std::error_code SymbolRef::getType(SymbolRef::Type &Result) const { return getObject()->getSymbolType(getRawDataRefImpl(), Result); } +inline std::error_code SymbolRef::getOther(uint8_t &Result) const { + return getObject()->getSymbolOther(getRawDataRefImpl(), Result); +} + inline const ObjectFile *SymbolRef::getObject() const { const SymbolicFile *O = BasicSymbolRef::getObject(); return cast(O); diff --git a/include/llvm/Object/SymbolicFile.h b/include/llvm/Object/SymbolicFile.h index 113373694556..77eef4a546aa 100644 --- a/include/llvm/Object/SymbolicFile.h +++ b/include/llvm/Object/SymbolicFile.h @@ -115,7 +115,7 @@ const uint64_t UnknownAddressOrSize = ~0ULL; class SymbolicFile : public Binary { public: virtual ~SymbolicFile(); - SymbolicFile(unsigned int Type, MemoryBuffer *Source, bool BufferOwned); + SymbolicFile(unsigned int Type, std::unique_ptr Source); // virtual interface. 
virtual void moveSymbolNext(DataRefImpl &Symb) const = 0; @@ -136,20 +136,19 @@ class SymbolicFile : public Binary { basic_symbol_iterator symbol_end() const { return symbol_end_impl(); } + typedef iterator_range basic_symbol_iterator_range; + basic_symbol_iterator_range symbols() const { + return basic_symbol_iterator_range(symbol_begin(), symbol_end()); + } // construction aux. - static ErrorOr createIRObjectFile(MemoryBuffer *Object, - LLVMContext &Context, - bool BufferOwned = true); - - static ErrorOr createSymbolicFile(MemoryBuffer *Object, - bool BufferOwned, - sys::fs::file_magic Type, - LLVMContext *Context); - - static ErrorOr createSymbolicFile(MemoryBuffer *Object) { - return createSymbolicFile(Object, true, sys::fs::file_magic::unknown, - nullptr); + static ErrorOr + createSymbolicFile(std::unique_ptr &Object, + sys::fs::file_magic Type, LLVMContext *Context); + + static ErrorOr + createSymbolicFile(std::unique_ptr &Object) { + return createSymbolicFile(Object, sys::fs::file_magic::unknown, nullptr); } static ErrorOr createSymbolicFile(StringRef ObjectPath); diff --git a/include/llvm/Option/ArgList.h b/include/llvm/Option/ArgList.h index ab40a1a0d40a..d46b0e892faf 100644 --- a/include/llvm/Option/ArgList.h +++ b/include/llvm/Option/ArgList.h @@ -150,6 +150,12 @@ class ArgList { return arg_iterator(Args.end(), *this); } + iterator_range filtered(OptSpecifier Id0 = 0U, + OptSpecifier Id1 = 0U, + OptSpecifier Id2 = 0U) const { + return make_range(filtered_begin(Id0, Id1, Id2), filtered_end()); + } + /// @} /// @name Arg Removal /// @{ @@ -328,6 +334,7 @@ class InputArgList : public ArgList { unsigned MakeIndex(StringRef String0) const; unsigned MakeIndex(StringRef String0, StringRef String1) const; + using ArgList::MakeArgString; const char *MakeArgString(StringRef Str) const override; /// @} @@ -365,6 +372,7 @@ class DerivedArgList : public ArgList { /// (to be freed). void AddSynthesizedArg(Arg *A); + using ArgList::MakeArgString; const char *MakeArgString(StringRef Str) const override; /// AddFlagArg - Construct a new FlagArg for the given option \p Id and diff --git a/include/llvm/Option/OptParser.td b/include/llvm/Option/OptParser.td index 963389f0bc6f..dbf240d74805 100644 --- a/include/llvm/Option/OptParser.td +++ b/include/llvm/Option/OptParser.td @@ -75,6 +75,7 @@ class OptionGroup { string Name = name; string HelpText = ?; OptionGroup Group = ?; + list Flags = []; } // Define the option class. diff --git a/include/llvm/Pass.h b/include/llvm/Pass.h index c2b9f95956e8..22d57edb74e4 100644 --- a/include/llvm/Pass.h +++ b/include/llvm/Pass.h @@ -29,6 +29,7 @@ #ifndef LLVM_PASS_H #define LLVM_PASS_H +#include "llvm/PassRegistry.h" #include "llvm/Support/Compiler.h" #include @@ -82,13 +83,14 @@ enum PassKind { class Pass { AnalysisResolver *Resolver; // Used to resolve analysis const void *PassID; + mutable const PassInfo *PI; PassKind Kind; void operator=(const Pass&) LLVM_DELETED_FUNCTION; Pass(const Pass &) LLVM_DELETED_FUNCTION; public: explicit Pass(PassKind K, char &pid) - : Resolver(nullptr), PassID(&pid), Kind(K) { } + : Resolver(nullptr), PassID(&pid), PI(nullptr), Kind(K) { } virtual ~Pass(); @@ -105,6 +107,13 @@ class Pass { return PassID; } + /// getPassInfo - Return the PassInfo associated with this pass. + const PassInfo *getPassInfo() const { + if (!PI) + PI = PassRegistry::getPassRegistry()->getPassInfo(PassID); + return PI; + } + /// doInitialization - Virtual method overridden by subclasses to do /// any necessary initialization before any pass is run. 
/// diff --git a/include/llvm/Support/ARMBuildAttributes.h b/include/llvm/Support/ARMBuildAttributes.h index 16312004c871..f63e0a61f639 100644 --- a/include/llvm/Support/ARMBuildAttributes.h +++ b/include/llvm/Support/ARMBuildAttributes.h @@ -159,6 +159,11 @@ enum { AddressDirect = 1, // Address imported data directly AddressGOT = 2, // Address imported data indirectly (via GOT) + // Tag_ABI_PCS_wchar_t, (=18), uleb128 + WCharProhibited = 0, // wchar_t is not used + WCharWidth2Bytes = 2, // sizeof(wchar_t) == 2 + WCharWidth4Bytes = 4, // sizeof(wchar_t) == 4 + // Tag_ABI_FP_denormal, (=20), uleb128 PreserveFPSign = 2, // sign when flushed-to-zero is preserved @@ -166,6 +171,16 @@ enum { AllowRTABI = 2, // numbers, infinities, and one quiet NaN (see [RTABI]) AllowIEE754 = 3, // this code to use all the IEEE 754-defined FP encodings + // Tag_ABI_enum_size, (=26), uleb128 + EnumProhibited = 0, // The user prohibited the use of enums when building + // this entity. + EnumSmallest = 1, // Enum is smallest container big enough to hold all + // values. + Enum32Bit = 2, // Enum is at least 32 bits. + Enum32BitABI = 3, // Every enumeration visible across an ABI-complying + // interface contains a value needing 32 bits to encode + // it; other enums can be containerized. + // Tag_ABI_HardFP_use, (=27), uleb128 HardFPImplied = 0, // FP use should be implied by Tag_FP_arch HardFPSinglePrecision = 1, // Single-precision only diff --git a/include/llvm/Support/ARMWinEH.h b/include/llvm/Support/ARMWinEH.h index 31efbdc4ed59..78deb8d36a98 100644 --- a/include/llvm/Support/ARMWinEH.h +++ b/include/llvm/Support/ARMWinEH.h @@ -193,7 +193,7 @@ inline bool EpilogueFolding(const RuntimeFunction &RF) { } /// StackAdjustment - calculated stack adjustment in words. The stack /// adjustment should be determined via this function to account for the special -/// handling the special encoding when the value is ≥ 0x3f4. +/// handling the special encoding when the value is >= 0x3f4. inline uint16_t StackAdjustment(const RuntimeFunction &RF) { uint16_t Adjustment = RF.StackAdjust(); if (Adjustment >= 0x3f4) diff --git a/include/llvm/Support/COFF.h b/include/llvm/Support/COFF.h index f0e5c7dc6287..e09ef07d81db 100644 --- a/include/llvm/Support/COFF.h +++ b/include/llvm/Support/COFF.h @@ -553,7 +553,8 @@ namespace COFF { IMAGE_DLL_CHARACTERISTICS_DYNAMIC_BASE = 0x0040, /// Code integrity checks are enforced. IMAGE_DLL_CHARACTERISTICS_FORCE_INTEGRITY = 0x0080, - IMAGE_DLL_CHARACTERISTICS_NX_COMPAT = 0x0100, ///< Image is NX compatible. + ///< Image is NX compatible. + IMAGE_DLL_CHARACTERISTICS_NX_COMPAT = 0x0100, /// Isolation aware, but do not isolate the image. IMAGE_DLL_CHARACTERISTICS_NO_ISOLATION = 0x0200, /// Does not use structured exception handling (SEH). No SEH handler may be @@ -561,7 +562,12 @@ namespace COFF { IMAGE_DLL_CHARACTERISTICS_NO_SEH = 0x0400, /// Do not bind the image. IMAGE_DLL_CHARACTERISTICS_NO_BIND = 0x0800, - IMAGE_DLL_CHARACTERISTICS_WDM_DRIVER = 0x2000, ///< A WDM driver. + ///< Image should execute in an AppContainer. + IMAGE_DLL_CHARACTERISTICS_APPCONTAINER = 0x1000, + ///< A WDM driver. + IMAGE_DLL_CHARACTERISTICS_WDM_DRIVER = 0x2000, + ///< Image supports Control Flow Guard. + IMAGE_DLL_CHARACTERISTICS_GUARD_CF = 0x4000, /// Terminal Server aware. 
IMAGE_DLL_CHARACTERISTICS_TERMINAL_SERVER_AWARE = 0x8000 }; diff --git a/include/llvm/Support/CommandLine.h b/include/llvm/Support/CommandLine.h index 5cb550139734..fdd901200fe9 100644 --- a/include/llvm/Support/CommandLine.h +++ b/include/llvm/Support/CommandLine.h @@ -270,8 +270,8 @@ class Option { // addOccurrence - Wrapper around handleOccurrence that enforces Flags. // - bool addOccurrence(unsigned pos, StringRef ArgName, - StringRef Value, bool MultiArg = false); + virtual bool addOccurrence(unsigned pos, StringRef ArgName, + StringRef Value, bool MultiArg = false); // Prints option name followed by message. Always returns true. bool error(const Twine &Message, StringRef ArgName = StringRef()); @@ -1649,6 +1649,10 @@ class alias : public Option { StringRef Arg) override { return AliasFor->handleOccurrence(pos, AliasFor->ArgStr, Arg); } + bool addOccurrence(unsigned pos, StringRef /*ArgName*/, + StringRef Value, bool MultiArg = false) override { + return AliasFor->addOccurrence(pos, AliasFor->ArgStr, Value, MultiArg); + } // Handle printing stuff... size_t getOptionWidth() const override; void printOptionInfo(size_t GlobalWidth) const override; diff --git a/include/llvm/Support/CrashRecoveryContext.h b/include/llvm/Support/CrashRecoveryContext.h index c132373e91d2..3869ebdc4a93 100644 --- a/include/llvm/Support/CrashRecoveryContext.h +++ b/include/llvm/Support/CrashRecoveryContext.h @@ -87,6 +87,9 @@ class CrashRecoveryContext { /// requested stack size). /// /// See RunSafely() and llvm_execute_on_thread(). + /// + /// On Darwin, if PRIO_DARWIN_BG is set on the calling thread, it will be + /// propagated to the new thread as well. bool RunSafelyOnThread(function_ref, unsigned RequestedStackSize = 0); bool RunSafelyOnThread(void (*Fn)(void*), void *UserData, unsigned RequestedStackSize = 0) { diff --git a/include/llvm/Support/Disassembler.h b/include/llvm/Support/Disassembler.h deleted file mode 100644 index 6d1cc0fdcb50..000000000000 --- a/include/llvm/Support/Disassembler.h +++ /dev/null @@ -1,35 +0,0 @@ -//===- llvm/Support/Disassembler.h ------------------------------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements the necessary glue to call external disassembler -// libraries. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_SYSTEM_DISASSEMBLER_H -#define LLVM_SYSTEM_DISASSEMBLER_H - -#include "llvm/Support/DataTypes.h" -#include - -namespace llvm { -namespace sys { - -/// This function returns true, if there is possible to use some external -/// disassembler library. False otherwise. -bool hasDisassembler(); - -/// This function provides some "glue" code to call external disassembler -/// libraries. -std::string disassembleBuffer(uint8_t* start, size_t length, uint64_t pc = 0); - -} -} - -#endif // LLVM_SYSTEM_DISASSEMBLER_H diff --git a/include/llvm/Support/Dwarf.h b/include/llvm/Support/Dwarf.h index ca316441ea7e..cd9f75600cbc 100644 --- a/include/llvm/Support/Dwarf.h +++ b/include/llvm/Support/Dwarf.h @@ -57,7 +57,6 @@ enum LLVMConstants : uint32_t { DW_TAG_user_base = 0x1000, // Recommended base for user tags. DWARF_VERSION = 4, // Default dwarf version we output. - DW_CIE_VERSION = 1, // Common frame information version. 
DW_PUBTYPES_VERSION = 2, // Section version number for .debug_pubtypes. DW_PUBNAMES_VERSION = 2, // Section version number for .debug_pubnames. DW_ARANGES_VERSION = 2 // Section version number for .debug_aranges. diff --git a/include/llvm/Support/ELF.h b/include/llvm/Support/ELF.h index 31b34ffa703e..42abe8906ea3 100644 --- a/include/llvm/Support/ELF.h +++ b/include/llvm/Support/ELF.h @@ -458,6 +458,7 @@ enum { R_PPC_GOT16_LO = 15, R_PPC_GOT16_HI = 16, R_PPC_GOT16_HA = 17, + R_PPC_PLTREL24 = 18, R_PPC_REL32 = 26, R_PPC_TLS = 67, R_PPC_DTPMOD32 = 68, @@ -495,6 +496,37 @@ enum { R_PPC_REL16_HA = 252 }; +// Specific e_flags for PPC64 +enum { + // e_flags bits specifying ABI: + // 1 for original ABI using function descriptors, + // 2 for revised ABI without function descriptors, + // 0 for unspecified or not using any features affected by the differences. + EF_PPC64_ABI = 3 +}; + +// Special values for the st_other field in the symbol table entry for PPC64. +enum { + STO_PPC64_LOCAL_BIT = 5, + STO_PPC64_LOCAL_MASK = (7 << STO_PPC64_LOCAL_BIT) +}; +static inline int64_t +decodePPC64LocalEntryOffset(unsigned Other) { + unsigned Val = (Other & STO_PPC64_LOCAL_MASK) >> STO_PPC64_LOCAL_BIT; + return ((1 << Val) >> 2) << 2; +} +static inline unsigned +encodePPC64LocalEntryOffset(int64_t Offset) { + unsigned Val = (Offset >= 4 * 4 + ? (Offset >= 8 * 4 + ? (Offset >= 16 * 4 ? 6 : 5) + : 4) + : (Offset >= 2 * 4 + ? 3 + : (Offset >= 1 * 4 ? 2 : 0))); + return Val << STO_PPC64_LOCAL_BIT; +} + // ELF Relocation types for PPC64 enum { R_PPC64_NONE = 0, @@ -1299,6 +1331,7 @@ enum : unsigned { SHT_MIPS_REGINFO = 0x70000006, // Register usage information SHT_MIPS_OPTIONS = 0x7000000d, // General options + SHT_MIPS_ABIFLAGS = 0x7000002a, // ABI information. SHT_HIPROC = 0x7fffffff, // Highest processor arch-specific type. SHT_LOUSER = 0x80000000, // Lowest type reserved for applications. @@ -1616,7 +1649,8 @@ enum { // MIPS program header types. PT_MIPS_REGINFO = 0x70000000, // Register usage information. PT_MIPS_RTPROC = 0x70000001, // Runtime procedure table. - PT_MIPS_OPTIONS = 0x70000002 // Options segment. + PT_MIPS_OPTIONS = 0x70000002, // Options segment. + PT_MIPS_ABIFLAGS = 0x70000003 // Abiflags segment. }; // Segment flag bits. diff --git a/include/llvm/Support/FileSystem.h b/include/llvm/Support/FileSystem.h index 6abe90446b47..556701c3ba34 100644 --- a/include/llvm/Support/FileSystem.h +++ b/include/llvm/Support/FileSystem.h @@ -134,7 +134,7 @@ class UniqueID { }; /// file_status - Represents the result of a call to stat and friends. It has -/// a platform specific member to store the result. +/// a platform-specific member to store the result. class file_status { #if defined(LLVM_ON_UNIX) @@ -273,7 +273,7 @@ struct file_magic { /// /// @param path A path that is modified to be an absolute path. /// @returns errc::success if \a path has been made absolute, otherwise a -/// platform specific error_code. +/// platform-specific error_code. std::error_code make_absolute(SmallVectorImpl &path); /// @brief Normalize path separators in \a Path @@ -318,14 +318,14 @@ std::error_code create_link(const Twine &to, const Twine &from); /// /// @param result Holds the current path on return. /// @returns errc::success if the current path has been stored in result, -/// otherwise a platform specific error_code. +/// otherwise a platform-specific error_code. std::error_code current_path(SmallVectorImpl &result); /// @brief Remove path. Equivalent to POSIX remove(). /// /// @param path Input path. 
/// @returns errc::success if path has been removed or didn't exist, otherwise a -/// platform specific error code. If IgnoreNonExisting is false, also +/// platform-specific error code. If IgnoreNonExisting is false, also /// returns error if the file didn't exist. std::error_code remove(const Twine &path, bool IgnoreNonExisting = true); @@ -335,12 +335,18 @@ std::error_code remove(const Twine &path, bool IgnoreNonExisting = true); /// @param to The path to rename to. This is created. std::error_code rename(const Twine &from, const Twine &to); +/// @brief Copy the contents of \a From to \a To. +/// +/// @param From The path to copy from. +/// @param To The path to copy to. This is created. +std::error_code copy_file(const Twine &From, const Twine &To); + /// @brief Resize path to size. File is resized as if by POSIX truncate(). /// /// @param path Input path. /// @param size Size to resize to. /// @returns errc::success if \a path has been resized to \a size, otherwise a -/// platform specific error_code. +/// platform-specific error_code. std::error_code resize_file(const Twine &path, uint64_t size); /// @} @@ -360,7 +366,7 @@ bool exists(file_status status); /// @param result Set to true if the file represented by status exists, false if /// it does not. Undefined otherwise. /// @returns errc::success if result has been successfully set, otherwise a -/// platform specific error_code. +/// platform-specific error_code. std::error_code exists(const Twine &path, bool &result); /// @brief Simpler version of exists for clients that don't need to @@ -402,7 +408,7 @@ bool equivalent(file_status A, file_status B); /// @param result Set to true if stat(A) and stat(B) have the same device and /// inode (or equivalent). /// @returns errc::success if result has been successfully set, otherwise a -/// platform specific error_code. +/// platform-specific error_code. std::error_code equivalent(const Twine &A, const Twine &B, bool &result); /// @brief Simpler version of equivalent for clients that don't need to @@ -424,7 +430,7 @@ bool is_directory(file_status status); /// @param result Set to true if \a path is a directory, false if it is not. /// Undefined otherwise. /// @returns errc::success if result has been successfully set, otherwise a -/// platform specific error_code. +/// platform-specific error_code. std::error_code is_directory(const Twine &path, bool &result); /// @brief Simpler version of is_directory for clients that don't need to @@ -446,7 +452,7 @@ bool is_regular_file(file_status status); /// @param result Set to true if \a path is a regular file, false if it is not. /// Undefined otherwise. /// @returns errc::success if result has been successfully set, otherwise a -/// platform specific error_code. +/// platform-specific error_code. std::error_code is_regular_file(const Twine &path, bool &result); /// @brief Simpler version of is_regular_file for clients that don't need to @@ -472,7 +478,7 @@ bool is_other(file_status status); /// @param result Set to true if \a path exists, but is not a directory, regular /// file, or a symlink, false if it does not. Undefined otherwise. /// @returns errc::success if result has been successfully set, otherwise a -/// platform specific error_code. +/// platform-specific error_code. std::error_code is_other(const Twine &path, bool &result); /// @brief Get file status as if by POSIX stat(). @@ -480,7 +486,7 @@ std::error_code is_other(const Twine &path, bool &result); /// @param path Input path. /// @param result Set to the file status. 
/// @returns errc::success if result has been successfully set, otherwise a -/// platform specific error_code. +/// platform-specific error_code. std::error_code status(const Twine &path, file_status &result); /// @brief A version for when a file descriptor is already available. @@ -491,7 +497,7 @@ std::error_code status(int FD, file_status &Result); /// @param Path Input path. /// @param Result Set to the size of the file in \a Path. /// @returns errc::success if result has been successfully set, otherwise a -/// platform specific error_code. +/// platform-specific error_code. inline std::error_code file_size(const Twine &Path, uint64_t &Result) { file_status Status; std::error_code EC = status(Path, Status); @@ -504,8 +510,8 @@ inline std::error_code file_size(const Twine &Path, uint64_t &Result) { /// @brief Set the file modification and access time. /// /// @returns errc::success if the file times were successfully set, otherwise a -/// platform specific error_code or errc::not_supported on platforms -/// where the functionality isn't available. +/// platform-specific error_code or errc::function_not_supported on +/// platforms where the functionality isn't available. std::error_code setLastModificationAndAccessTime(int FD, TimeValue Time); /// @brief Is status available? @@ -519,7 +525,7 @@ bool status_known(file_status s); /// @param path Input path. /// @param result Set to true if status() != status_error. /// @returns errc::success if result has been successfully set, otherwise a -/// platform specific error_code. +/// platform-specific error_code. std::error_code status_known(const Twine &path, bool &result); /// @brief Create a uniquely named file. @@ -542,7 +548,7 @@ std::error_code status_known(const Twine &path, bool &result); /// @param ResultFD Set to the opened file's file descriptor. /// @param ResultPath Set to the opened file's absolute path. /// @returns errc::success if Result{FD,Path} have been successfully set, -/// otherwise a platform specific error_code. +/// otherwise a platform-specific error_code. std::error_code createUniqueFile(const Twine &Model, int &ResultFD, SmallVectorImpl &ResultPath, unsigned Mode = all_read | all_write); @@ -612,7 +618,7 @@ file_magic identify_magic(StringRef magic); /// @param path Input path. /// @param result Set to the type of file, or file_magic::unknown. /// @returns errc::success if result has been successfully set, otherwise a -/// platform specific error_code. +/// platform-specific error_code. std::error_code identify_magic(const Twine &path, file_magic &result); std::error_code getUniqueID(const Twine Path, UniqueID &Result); @@ -632,7 +638,7 @@ class mapped_file_region { }; private: - /// Platform specific mapping state. + /// Platform-specific mapping state. mapmode Mode; uint64_t Size; void *Mapping; diff --git a/include/llvm/Support/GenericDomTree.h b/include/llvm/Support/GenericDomTree.h index e344220d953a..876ab6ec71a5 100644 --- a/include/llvm/Support/GenericDomTree.h +++ b/include/llvm/Support/GenericDomTree.h @@ -330,6 +330,10 @@ class DominatorTreeBase : public DominatorBase { return DomTreeNodes.lookup(BB); } + inline DomTreeNodeBase *operator[](NodeT *BB) const { + return getNode(BB); + } + /// getRootNode - This returns the entry node for the CFG of the function. If /// this tree represents the post-dominance relations for a function, however, /// this root may be a node with the block == NULL. 
This is the case when diff --git a/include/llvm/Support/MachO.h b/include/llvm/Support/MachO.h index 60f7918ae72e..90df1f4f140f 100644 --- a/include/llvm/Support/MachO.h +++ b/include/llvm/Support/MachO.h @@ -386,13 +386,15 @@ namespace llvm { enum StabType { // Constant values for the "n_type" field in llvm::MachO::nlist and - // llvm::MachO::nlist_64 when "(n_type & NlistMaskStab) != 0" + // llvm::MachO::nlist_64 when "(n_type & N_STAB) != 0" N_GSYM = 0x20u, N_FNAME = 0x22u, N_FUN = 0x24u, N_STSYM = 0x26u, N_LCSYM = 0x28u, N_BNSYM = 0x2Eu, + N_PC = 0x30u, + N_AST = 0x32u, N_OPT = 0x3Cu, N_RSYM = 0x40u, N_SLINE = 0x44u, @@ -957,6 +959,13 @@ namespace llvm { }; // Structs from + struct nlist_base { + uint32_t n_strx; + uint8_t n_type; + uint8_t n_sect; + uint16_t n_desc; + }; + struct nlist { uint32_t n_strx; uint8_t n_type; @@ -973,6 +982,206 @@ namespace llvm { uint64_t n_value; }; + + // Byte order swapping functions for MachO structs + + inline void swapStruct(mach_header &mh) { + sys::swapByteOrder(mh.magic); + sys::swapByteOrder(mh.cputype); + sys::swapByteOrder(mh.cpusubtype); + sys::swapByteOrder(mh.filetype); + sys::swapByteOrder(mh.ncmds); + sys::swapByteOrder(mh.sizeofcmds); + sys::swapByteOrder(mh.flags); + } + + inline void swapStruct(mach_header_64 &H) { + sys::swapByteOrder(H.magic); + sys::swapByteOrder(H.cputype); + sys::swapByteOrder(H.cpusubtype); + sys::swapByteOrder(H.filetype); + sys::swapByteOrder(H.ncmds); + sys::swapByteOrder(H.sizeofcmds); + sys::swapByteOrder(H.flags); + sys::swapByteOrder(H.reserved); + } + + inline void swapStruct(load_command &lc) { + sys::swapByteOrder(lc.cmd); + sys::swapByteOrder(lc.cmdsize); + } + + inline void swapStruct(symtab_command &lc) { + sys::swapByteOrder(lc.cmd); + sys::swapByteOrder(lc.cmdsize); + sys::swapByteOrder(lc.symoff); + sys::swapByteOrder(lc.nsyms); + sys::swapByteOrder(lc.stroff); + sys::swapByteOrder(lc.strsize); + } + + inline void swapStruct(segment_command_64 &seg) { + sys::swapByteOrder(seg.cmd); + sys::swapByteOrder(seg.cmdsize); + sys::swapByteOrder(seg.vmaddr); + sys::swapByteOrder(seg.vmsize); + sys::swapByteOrder(seg.fileoff); + sys::swapByteOrder(seg.filesize); + sys::swapByteOrder(seg.maxprot); + sys::swapByteOrder(seg.initprot); + sys::swapByteOrder(seg.nsects); + sys::swapByteOrder(seg.flags); + } + + inline void swapStruct(segment_command &seg) { + sys::swapByteOrder(seg.cmd); + sys::swapByteOrder(seg.cmdsize); + sys::swapByteOrder(seg.vmaddr); + sys::swapByteOrder(seg.vmsize); + sys::swapByteOrder(seg.fileoff); + sys::swapByteOrder(seg.filesize); + sys::swapByteOrder(seg.maxprot); + sys::swapByteOrder(seg.initprot); + sys::swapByteOrder(seg.nsects); + sys::swapByteOrder(seg.flags); + } + + inline void swapStruct(section_64 §) { + sys::swapByteOrder(sect.addr); + sys::swapByteOrder(sect.size); + sys::swapByteOrder(sect.offset); + sys::swapByteOrder(sect.align); + sys::swapByteOrder(sect.reloff); + sys::swapByteOrder(sect.nreloc); + sys::swapByteOrder(sect.flags); + sys::swapByteOrder(sect.reserved1); + sys::swapByteOrder(sect.reserved2); + } + + inline void swapStruct(section §) { + sys::swapByteOrder(sect.addr); + sys::swapByteOrder(sect.size); + sys::swapByteOrder(sect.offset); + sys::swapByteOrder(sect.align); + sys::swapByteOrder(sect.reloff); + sys::swapByteOrder(sect.nreloc); + sys::swapByteOrder(sect.flags); + sys::swapByteOrder(sect.reserved1); + sys::swapByteOrder(sect.reserved2); + } + + inline void swapStruct(dyld_info_command &info) { + sys::swapByteOrder(info.cmd); + 
sys::swapByteOrder(info.cmdsize); + sys::swapByteOrder(info.rebase_off); + sys::swapByteOrder(info.rebase_size); + sys::swapByteOrder(info.bind_off); + sys::swapByteOrder(info.bind_size); + sys::swapByteOrder(info.weak_bind_off); + sys::swapByteOrder(info.weak_bind_size); + sys::swapByteOrder(info.lazy_bind_off); + sys::swapByteOrder(info.lazy_bind_size); + sys::swapByteOrder(info.export_off); + sys::swapByteOrder(info.export_size); + } + + inline void swapStruct(dylib_command &d) { + sys::swapByteOrder(d.cmd); + sys::swapByteOrder(d.cmdsize); + sys::swapByteOrder(d.dylib.name); + sys::swapByteOrder(d.dylib.timestamp); + sys::swapByteOrder(d.dylib.current_version); + sys::swapByteOrder(d.dylib.compatibility_version); + } + + inline void swapStruct(dylinker_command &d) { + sys::swapByteOrder(d.cmd); + sys::swapByteOrder(d.cmdsize); + sys::swapByteOrder(d.name); + } + + inline void swapStruct(entry_point_command &e) { + sys::swapByteOrder(e.cmd); + sys::swapByteOrder(e.cmdsize); + sys::swapByteOrder(e.entryoff); + sys::swapByteOrder(e.stacksize); + } + + inline void swapStruct(dysymtab_command &dst) { + sys::swapByteOrder(dst.cmd); + sys::swapByteOrder(dst.cmdsize); + sys::swapByteOrder(dst.ilocalsym); + sys::swapByteOrder(dst.nlocalsym); + sys::swapByteOrder(dst.iextdefsym); + sys::swapByteOrder(dst.nextdefsym); + sys::swapByteOrder(dst.iundefsym); + sys::swapByteOrder(dst.nundefsym); + sys::swapByteOrder(dst.tocoff); + sys::swapByteOrder(dst.ntoc); + sys::swapByteOrder(dst.modtaboff); + sys::swapByteOrder(dst.nmodtab); + sys::swapByteOrder(dst.extrefsymoff); + sys::swapByteOrder(dst.nextrefsyms); + sys::swapByteOrder(dst.indirectsymoff); + sys::swapByteOrder(dst.nindirectsyms); + sys::swapByteOrder(dst.extreloff); + sys::swapByteOrder(dst.nextrel); + sys::swapByteOrder(dst.locreloff); + sys::swapByteOrder(dst.nlocrel); + } + + inline void swapStruct(any_relocation_info &reloc) { + sys::swapByteOrder(reloc.r_word0); + sys::swapByteOrder(reloc.r_word1); + } + + inline void swapStruct(nlist_base &S) { + sys::swapByteOrder(S.n_strx); + sys::swapByteOrder(S.n_desc); + } + + inline void swapStruct(nlist &sym) { + sys::swapByteOrder(sym.n_strx); + sys::swapByteOrder(sym.n_desc); + sys::swapByteOrder(sym.n_value); + } + + inline void swapStruct(nlist_64 &sym) { + sys::swapByteOrder(sym.n_strx); + sys::swapByteOrder(sym.n_desc); + sys::swapByteOrder(sym.n_value); + } + + inline void swapStruct(linkedit_data_command &C) { + sys::swapByteOrder(C.cmd); + sys::swapByteOrder(C.cmdsize); + sys::swapByteOrder(C.dataoff); + sys::swapByteOrder(C.datasize); + } + + inline void swapStruct(linker_options_command &C) { + sys::swapByteOrder(C.cmd); + sys::swapByteOrder(C.cmdsize); + sys::swapByteOrder(C.count); + } + + inline void swapStruct(version_min_command&C) { + sys::swapByteOrder(C.cmd); + sys::swapByteOrder(C.cmdsize); + sys::swapByteOrder(C.version); + sys::swapByteOrder(C.reserved); + } + + inline void swapStruct(data_in_code_entry &C) { + sys::swapByteOrder(C.offset); + sys::swapByteOrder(C.length); + sys::swapByteOrder(C.kind); + } + + inline void swapStruct(uint32_t &C) { + sys::swapByteOrder(C); + } + // Get/Set functions from static inline uint16_t GET_LIBRARY_ORDINAL(uint16_t n_desc) { @@ -1015,8 +1224,8 @@ namespace llvm { enum : uint32_t { // Capability bits used in the definition of cpusubtype. 
- CPU_SUB_TYPE_MASK = 0xff000000, // Mask for architecture bits - CPU_SUB_TYPE_LIB64 = 0x80000000, // 64 bit libraries + CPU_SUBTYPE_MASK = 0xff000000, // Mask for architecture bits + CPU_SUBTYPE_LIB64 = 0x80000000, // 64 bit libraries // Special CPU subtype constants. CPU_SUBTYPE_MULTIPLE = ~0u diff --git a/include/llvm/Support/ManagedStatic.h b/include/llvm/Support/ManagedStatic.h index 1bb8cea092f9..d8fbfeb8e20c 100644 --- a/include/llvm/Support/ManagedStatic.h +++ b/include/llvm/Support/ManagedStatic.h @@ -103,9 +103,6 @@ void llvm_shutdown(); /// llvm_shutdown() when it is destroyed. struct llvm_shutdown_obj { llvm_shutdown_obj() { } - explicit llvm_shutdown_obj(bool multithreaded) { - if (multithreaded) llvm_start_multithreaded(); - } ~llvm_shutdown_obj() { llvm_shutdown(); } }; diff --git a/include/llvm/Support/MathExtras.h b/include/llvm/Support/MathExtras.h index 6965faf8df8b..0abba62a2c23 100644 --- a/include/llvm/Support/MathExtras.h +++ b/include/llvm/Support/MathExtras.h @@ -230,6 +230,9 @@ static const unsigned char BitReverseTable256[256] = { #define R4(n) R2(n), R2(n + 2 * 16), R2(n + 1 * 16), R2(n + 3 * 16) #define R6(n) R4(n), R4(n + 2 * 4), R4(n + 1 * 4), R4(n + 3 * 4) R6(0), R6(2), R6(1), R6(3) +#undef R2 +#undef R4 +#undef R6 }; /// \brief Reverse the bits in \p Val. diff --git a/include/llvm/Support/MemoryBuffer.h b/include/llvm/Support/MemoryBuffer.h index 8c742c686b15..147be47e1c8f 100644 --- a/include/llvm/Support/MemoryBuffer.h +++ b/include/llvm/Support/MemoryBuffer.h @@ -19,6 +19,7 @@ #include "llvm/Support/CBindingWrapping.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/DataTypes.h" +#include "llvm/Support/ErrorOr.h" #include #include @@ -60,19 +61,17 @@ class MemoryBuffer { return "Unknown buffer"; } - /// getFile - Open the specified file as a MemoryBuffer, returning a new - /// MemoryBuffer if successful, otherwise returning null. If FileSize is - /// specified, this means that the client knows that the file exists and that - /// it has the specified size. + /// Open the specified file as a MemoryBuffer, returning a new MemoryBuffer + /// if successful, otherwise returning null. If FileSize is specified, this + /// means that the client knows that the file exists and that it has the + /// specified size. /// /// \param IsVolatileSize Set to true to indicate that the file size may be /// changing, e.g. when libclang tries to parse while the user is /// editing/updating the file. - static std::error_code getFile(Twine Filename, - std::unique_ptr &Result, - int64_t FileSize = -1, - bool RequiresNullTerminator = true, - bool IsVolatileSize = false); + static ErrorOr> + getFile(Twine Filename, int64_t FileSize = -1, + bool RequiresNullTerminator = true, bool IsVolatileSize = false); /// Given an already-open file descriptor, map some slice of it into a /// MemoryBuffer. The slice is specified by an \p Offset and \p MapSize. @@ -81,10 +80,9 @@ class MemoryBuffer { /// \param IsVolatileSize Set to true to indicate that the file size may be /// changing, e.g. when libclang tries to parse while the user is /// editing/updating the file. - static std::error_code getOpenFileSlice(int FD, const char *Filename, - std::unique_ptr &Result, - uint64_t MapSize, int64_t Offset, - bool IsVolatileSize = false); + static ErrorOr> + getOpenFileSlice(int FD, const char *Filename, uint64_t MapSize, + int64_t Offset, bool IsVolatileSize = false); /// Given an already-open file descriptor, read the file and return a /// MemoryBuffer. 
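The MemoryBuffer hunks above make the same change at the source: every factory that used to fill a std::unique_ptr<MemoryBuffer> out-parameter and return std::error_code now returns ErrorOr<std::unique_ptr<MemoryBuffer>> directly, which is why the ErrorOr.h include is added. A before/after sketch of a typical caller, with hypothetical names and not taken from the patch:

```cpp
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/ErrorOr.h"
#include "llvm/Support/MemoryBuffer.h"
#include <memory>
#include <system_error>
#include <utility>

using namespace llvm;

// Old style (before this patch):
//   std::unique_ptr<MemoryBuffer> Buf;
//   if (std::error_code EC = MemoryBuffer::getFileOrSTDIN(Name, Buf))
//     return EC;
//
// New style: one ErrorOr return value carries either the buffer or the error.
static ErrorOr<std::unique_ptr<MemoryBuffer>> openInput(StringRef Name) {
  ErrorOr<std::unique_ptr<MemoryBuffer>> BufOrErr =
      MemoryBuffer::getFileOrSTDIN(Name);
  if (std::error_code EC = BufOrErr.getError())
    return EC; // the error_code converts back into an ErrorOr
  return std::move(*BufOrErr); // hand ownership of the buffer to the caller
}
```
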
@@ -92,11 +90,9 @@ class MemoryBuffer { /// \param IsVolatileSize Set to true to indicate that the file size may be /// changing, e.g. when libclang tries to parse while the user is /// editing/updating the file. - static std::error_code getOpenFile(int FD, const char *Filename, - std::unique_ptr &Result, - uint64_t FileSize, - bool RequiresNullTerminator = true, - bool IsVolatileSize = false); + static ErrorOr> + getOpenFile(int FD, const char *Filename, uint64_t FileSize, + bool RequiresNullTerminator = true, bool IsVolatileSize = false); /// getMemBuffer - Open the specified memory range as a MemoryBuffer. Note /// that InputData must be null terminated if RequiresNullTerminator is true. @@ -123,16 +119,13 @@ class MemoryBuffer { static MemoryBuffer *getNewUninitMemBuffer(size_t Size, StringRef BufferName = ""); - /// getSTDIN - Read all of stdin into a file buffer, and return it. - /// If an error occurs, this returns null and sets ec. - static std::error_code getSTDIN(std::unique_ptr &Result); + /// Read all of stdin into a file buffer, and return it. + static ErrorOr> getSTDIN(); - /// getFileOrSTDIN - Open the specified file as a MemoryBuffer, or open stdin - /// if the Filename is "-". If an error occurs, this returns null and sets - /// ec. - static std::error_code getFileOrSTDIN(StringRef Filename, - std::unique_ptr &Result, - int64_t FileSize = -1); + /// Open the specified file as a MemoryBuffer, or open stdin if the Filename + /// is "-". + static ErrorOr> + getFileOrSTDIN(StringRef Filename, int64_t FileSize = -1); //===--------------------------------------------------------------------===// // Provided for performance analysis. diff --git a/include/llvm/Support/Process.h b/include/llvm/Support/Process.h index 4f98e4de8125..30973de3aac4 100644 --- a/include/llvm/Support/Process.h +++ b/include/llvm/Support/Process.h @@ -171,6 +171,13 @@ class Process { // string. \arg Name is assumed to be in UTF-8 encoding too. static Optional GetEnv(StringRef name); + /// This function searches for an existing file in the list of directories + /// in a PATH like environment variable, and returns the first file found, + /// according to the order of the entries in the PATH like environment + /// variable. + static Optional FindInEnvPath(const std::string& EnvName, + const std::string& FileName); + /// This function returns a SmallVector containing the arguments passed from /// the operating system to the program. This function expects to be handed /// the vector passed in from main. diff --git a/include/llvm/Support/RandomNumberGenerator.h b/include/llvm/Support/RandomNumberGenerator.h new file mode 100644 index 000000000000..cadc713659d2 --- /dev/null +++ b/include/llvm/Support/RandomNumberGenerator.h @@ -0,0 +1,57 @@ +//==- llvm/Support/RandomNumberGenerator.h - RNG for diversity ---*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines an abstraction for random number generation (RNG). +// Note that the current implementation is not cryptographically secure +// as it uses the C++11 facilities. 
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_RANDOMNUMBERGENERATOR_H_
+#define LLVM_SUPPORT_RANDOMNUMBERGENERATOR_H_
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/DataTypes.h" // Needed for uint64_t on Windows.
+#include
+
+namespace llvm {
+
+/// A random number generator.
+/// Instances of this class should not be shared across threads.
+class RandomNumberGenerator {
+public:
+  /// Seeds and salts the underlying RNG engine. The salt of type StringRef
+  /// is passed into the constructor. The seed can be set on the command
+  /// line via -rng-seed=.
+  /// The reason for the salt is to ensure different random streams even if
+  /// the same seed is used for multiple invocations of the compiler.
+  /// A good salt value should add additional entropy and be constant across
+  /// different machines (i.e., no paths) to allow for reproducible builds.
+  /// An instance of this class can be retrieved from the current Module.
+  /// \see Module::getRNG
+  RandomNumberGenerator(StringRef Salt);
+
+  /// Returns a random number in the range [0, Max).
+  uint64_t next(uint64_t Max);
+
+private:
+  // 64-bit Mersenne Twister by Matsumoto and Nishimura, 2000
+  // http://en.cppreference.com/w/cpp/numeric/random/mersenne_twister_engine
+  std::mt19937_64 Generator;
+
+  // Noncopyable.
+  RandomNumberGenerator(const RandomNumberGenerator &other)
+      LLVM_DELETED_FUNCTION;
+  RandomNumberGenerator &
+  operator=(const RandomNumberGenerator &other) LLVM_DELETED_FUNCTION;
+};
+}
+
+#endif
diff --git a/include/llvm/Support/ScaledNumber.h b/include/llvm/Support/ScaledNumber.h
new file mode 100644
index 000000000000..2bd7e741dd28
--- /dev/null
+++ b/include/llvm/Support/ScaledNumber.h
@@ -0,0 +1,897 @@
+//===- llvm/Support/ScaledNumber.h - Support for scaled numbers -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains functions (and a class) useful for working with scaled
+// numbers -- in particular, pairs of integers where one represents digits and
+// another represents a scale. The functions are helpers and live in the
+// namespace ScaledNumbers. The class ScaledNumber is useful for modelling
+// certain cost metrics that need simple, integer-like semantics that are easy
+// to reason about.
+//
+// These might remind you of soft-floats. If you want one of those, you're in
+// the wrong place. Look at include/llvm/ADT/APFloat.h instead.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_SCALEDNUMBER_H
+#define LLVM_SUPPORT_SCALEDNUMBER_H
+
+#include "llvm/Support/MathExtras.h"
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace llvm {
+namespace ScaledNumbers {
+
+/// \brief Maximum scale; same as APFloat for easy debug printing.
+const int32_t MaxScale = 16383;
+
+/// \brief Minimum scale; same as APFloat for easy debug printing.
+const int32_t MinScale = -16382;
+
+/// \brief Get the width of a number.
+template inline int getWidth() { return sizeof(DigitsT) * 8; }
+
+/// \brief Conditionally round up a scaled number.
+///
+/// Given \c Digits and \c Scale, round up iff \c ShouldRound is \c true.
+/// Always returns \c Scale unless there's an overflow, in which case it +/// returns \c 1+Scale. +/// +/// \pre adding 1 to \c Scale will not overflow INT16_MAX. +template +inline std::pair getRounded(DigitsT Digits, int16_t Scale, + bool ShouldRound) { + static_assert(!std::numeric_limits::is_signed, "expected unsigned"); + + if (ShouldRound) + if (!++Digits) + // Overflow. + return std::make_pair(DigitsT(1) << (getWidth() - 1), Scale + 1); + return std::make_pair(Digits, Scale); +} + +/// \brief Convenience helper for 32-bit rounding. +inline std::pair getRounded32(uint32_t Digits, int16_t Scale, + bool ShouldRound) { + return getRounded(Digits, Scale, ShouldRound); +} + +/// \brief Convenience helper for 64-bit rounding. +inline std::pair getRounded64(uint64_t Digits, int16_t Scale, + bool ShouldRound) { + return getRounded(Digits, Scale, ShouldRound); +} + +/// \brief Adjust a 64-bit scaled number down to the appropriate width. +/// +/// \pre Adding 64 to \c Scale will not overflow INT16_MAX. +template +inline std::pair getAdjusted(uint64_t Digits, + int16_t Scale = 0) { + static_assert(!std::numeric_limits::is_signed, "expected unsigned"); + + const int Width = getWidth(); + if (Width == 64 || Digits <= std::numeric_limits::max()) + return std::make_pair(Digits, Scale); + + // Shift right and round. + int Shift = 64 - Width - countLeadingZeros(Digits); + return getRounded(Digits >> Shift, Scale + Shift, + Digits & (UINT64_C(1) << (Shift - 1))); +} + +/// \brief Convenience helper for adjusting to 32 bits. +inline std::pair getAdjusted32(uint64_t Digits, + int16_t Scale = 0) { + return getAdjusted(Digits, Scale); +} + +/// \brief Convenience helper for adjusting to 64 bits. +inline std::pair getAdjusted64(uint64_t Digits, + int16_t Scale = 0) { + return getAdjusted(Digits, Scale); +} + +/// \brief Multiply two 64-bit integers to create a 64-bit scaled number. +/// +/// Implemented with four 64-bit integer multiplies. +std::pair multiply64(uint64_t LHS, uint64_t RHS); + +/// \brief Multiply two 32-bit integers to create a 32-bit scaled number. +/// +/// Implemented with one 64-bit integer multiply. +template +inline std::pair getProduct(DigitsT LHS, DigitsT RHS) { + static_assert(!std::numeric_limits::is_signed, "expected unsigned"); + + if (getWidth() <= 32 || (LHS <= UINT32_MAX && RHS <= UINT32_MAX)) + return getAdjusted(uint64_t(LHS) * RHS); + + return multiply64(LHS, RHS); +} + +/// \brief Convenience helper for 32-bit product. +inline std::pair getProduct32(uint32_t LHS, uint32_t RHS) { + return getProduct(LHS, RHS); +} + +/// \brief Convenience helper for 64-bit product. +inline std::pair getProduct64(uint64_t LHS, uint64_t RHS) { + return getProduct(LHS, RHS); +} + +/// \brief Divide two 64-bit integers to create a 64-bit scaled number. +/// +/// Implemented with long division. +/// +/// \pre \c Dividend and \c Divisor are non-zero. +std::pair divide64(uint64_t Dividend, uint64_t Divisor); + +/// \brief Divide two 32-bit integers to create a 32-bit scaled number. +/// +/// Implemented with one 64-bit integer divide/remainder pair. +/// +/// \pre \c Dividend and \c Divisor are non-zero. +std::pair divide32(uint32_t Dividend, uint32_t Divisor); + +/// \brief Divide two 32-bit numbers to create a 32-bit scaled number. +/// +/// Implemented with one 64-bit integer divide/remainder pair. +/// +/// Returns \c (DigitsT_MAX, MaxScale) for divide-by-zero (0 for 0/0). 
+template +std::pair getQuotient(DigitsT Dividend, DigitsT Divisor) { + static_assert(!std::numeric_limits::is_signed, "expected unsigned"); + static_assert(sizeof(DigitsT) == 4 || sizeof(DigitsT) == 8, + "expected 32-bit or 64-bit digits"); + + // Check for zero. + if (!Dividend) + return std::make_pair(0, 0); + if (!Divisor) + return std::make_pair(std::numeric_limits::max(), MaxScale); + + if (getWidth() == 64) + return divide64(Dividend, Divisor); + return divide32(Dividend, Divisor); +} + +/// \brief Convenience helper for 32-bit quotient. +inline std::pair getQuotient32(uint32_t Dividend, + uint32_t Divisor) { + return getQuotient(Dividend, Divisor); +} + +/// \brief Convenience helper for 64-bit quotient. +inline std::pair getQuotient64(uint64_t Dividend, + uint64_t Divisor) { + return getQuotient(Dividend, Divisor); +} + +/// \brief Implementation of getLg() and friends. +/// +/// Returns the rounded lg of \c Digits*2^Scale and an int specifying whether +/// this was rounded up (1), down (-1), or exact (0). +/// +/// Returns \c INT32_MIN when \c Digits is zero. +template +inline std::pair getLgImpl(DigitsT Digits, int16_t Scale) { + static_assert(!std::numeric_limits::is_signed, "expected unsigned"); + + if (!Digits) + return std::make_pair(INT32_MIN, 0); + + // Get the floor of the lg of Digits. + int32_t LocalFloor = sizeof(Digits) * 8 - countLeadingZeros(Digits) - 1; + + // Get the actual floor. + int32_t Floor = Scale + LocalFloor; + if (Digits == UINT64_C(1) << LocalFloor) + return std::make_pair(Floor, 0); + + // Round based on the next digit. + assert(LocalFloor >= 1); + bool Round = Digits & UINT64_C(1) << (LocalFloor - 1); + return std::make_pair(Floor + Round, Round ? 1 : -1); +} + +/// \brief Get the lg (rounded) of a scaled number. +/// +/// Get the lg of \c Digits*2^Scale. +/// +/// Returns \c INT32_MIN when \c Digits is zero. +template int32_t getLg(DigitsT Digits, int16_t Scale) { + return getLgImpl(Digits, Scale).first; +} + +/// \brief Get the lg floor of a scaled number. +/// +/// Get the floor of the lg of \c Digits*2^Scale. +/// +/// Returns \c INT32_MIN when \c Digits is zero. +template int32_t getLgFloor(DigitsT Digits, int16_t Scale) { + auto Lg = getLgImpl(Digits, Scale); + return Lg.first - (Lg.second > 0); +} + +/// \brief Get the lg ceiling of a scaled number. +/// +/// Get the ceiling of the lg of \c Digits*2^Scale. +/// +/// Returns \c INT32_MIN when \c Digits is zero. +template int32_t getLgCeiling(DigitsT Digits, int16_t Scale) { + auto Lg = getLgImpl(Digits, Scale); + return Lg.first + (Lg.second < 0); +} + +/// \brief Implementation for comparing scaled numbers. +/// +/// Compare two 64-bit numbers with different scales. Given that the scale of +/// \c L is higher than that of \c R by \c ScaleDiff, compare them. Return -1, +/// 1, and 0 for less than, greater than, and equal, respectively. +/// +/// \pre 0 <= ScaleDiff < 64. +int compareImpl(uint64_t L, uint64_t R, int ScaleDiff); + +/// \brief Compare two scaled numbers. +/// +/// Compare two scaled numbers. Returns 0 for equal, -1 for less than, and 1 +/// for greater than. +template +int compare(DigitsT LDigits, int16_t LScale, DigitsT RDigits, int16_t RScale) { + static_assert(!std::numeric_limits::is_signed, "expected unsigned"); + + // Check for zero. + if (!LDigits) + return RDigits ? -1 : 0; + if (!RDigits) + return 1; + + // Check for the scale. Use getLgFloor to be sure that the scale difference + // is always lower than 64. 
+  int32_t lgL = getLgFloor(LDigits, LScale), lgR = getLgFloor(RDigits, RScale);
+  if (lgL != lgR)
+    return lgL < lgR ? -1 : 1;
+
+  // Compare digits.
+  if (LScale < RScale)
+    return compareImpl(LDigits, RDigits, RScale - LScale);
+
+  return -compareImpl(RDigits, LDigits, LScale - RScale);
+}
+
+/// \brief Match scales of two numbers.
+///
+/// Given two scaled numbers, match up their scales. Change the digits and
+/// scales in place. Shift the digits as necessary to form equivalent numbers,
+/// losing precision only when necessary.
+///
+/// If the output value of \c LDigits (\c RDigits) is \c 0, the output value of
+/// \c LScale (\c RScale) is unspecified.
+///
+/// As a convenience, returns the matching scale. If the output value of one
+/// number is zero, returns the scale of the other. If both are zero, which
+/// scale is returned is unspecified.
+template
+int16_t matchScales(DigitsT &LDigits, int16_t &LScale, DigitsT &RDigits,
+                    int16_t &RScale) {
+  static_assert(!std::numeric_limits::is_signed, "expected unsigned");
+
+  if (LScale < RScale)
+    // Swap arguments.
+    return matchScales(RDigits, RScale, LDigits, LScale);
+  if (!LDigits)
+    return RScale;
+  if (!RDigits || LScale == RScale)
+    return LScale;
+
+  // Now LScale > RScale. Get the difference.
+  int32_t ScaleDiff = int32_t(LScale) - RScale;
+  if (ScaleDiff >= 2 * getWidth()) {
+    // Don't bother shifting. RDigits will get zeroed out anyway.
+    RDigits = 0;
+    return LScale;
+  }
+
+  // Shift LDigits left as much as possible, then shift RDigits right.
+  int32_t ShiftL = std::min(countLeadingZeros(LDigits), ScaleDiff);
+  assert(ShiftL < getWidth() && "can't shift more than width");
+
+  int32_t ShiftR = ScaleDiff - ShiftL;
+  if (ShiftR >= getWidth()) {
+    // Don't bother shifting. RDigits will get zeroed out anyway.
+    RDigits = 0;
+    return LScale;
+  }
+
+  LDigits <<= ShiftL;
+  RDigits >>= ShiftR;
+
+  LScale -= ShiftL;
+  RScale += ShiftR;
+  assert(LScale == RScale && "scales should match");
+  return LScale;
+}
+
+/// \brief Get the sum of two scaled numbers.
+///
+/// Get the sum of two scaled numbers with as much precision as possible.
+///
+/// \pre Adding 1 to \c LScale (or \c RScale) will not overflow INT16_MAX.
+template
+std::pair getSum(DigitsT LDigits, int16_t LScale,
+                 DigitsT RDigits, int16_t RScale) {
+  static_assert(!std::numeric_limits::is_signed, "expected unsigned");
+
+  // Check inputs up front. This is only relevant if addition overflows, but
+  // testing here should catch more bugs.
+  assert(LScale < INT16_MAX && "scale too large");
+  assert(RScale < INT16_MAX && "scale too large");
+
+  // Normalize digits to match scales.
+  int16_t Scale = matchScales(LDigits, LScale, RDigits, RScale);
+
+  // Compute sum.
+  DigitsT Sum = LDigits + RDigits;
+  if (Sum >= RDigits)
+    return std::make_pair(Sum, Scale);
+
+  // Adjust sum after arithmetic overflow.
+  DigitsT HighBit = DigitsT(1) << (getWidth() - 1);
+  return std::make_pair(HighBit | Sum >> 1, Scale + 1);
+}
+
+/// \brief Convenience helper for 32-bit sum.
+inline std::pair getSum32(uint32_t LDigits, int16_t LScale,
+                          uint32_t RDigits, int16_t RScale) {
+  return getSum(LDigits, LScale, RDigits, RScale);
+}
+
+/// \brief Convenience helper for 64-bit sum.
+inline std::pair getSum64(uint64_t LDigits, int16_t LScale,
+                          uint64_t RDigits, int16_t RScale) {
+  return getSum(LDigits, LScale, RDigits, RScale);
+}
+
+/// \brief Get the difference of two scaled numbers.
+///
+/// Get LHS minus RHS with as much precision as possible.
+///
+/// Returns \c (0, 0) if the RHS is larger than the LHS.
+template <class DigitsT>
+std::pair<DigitsT, int16_t> getDifference(DigitsT LDigits, int16_t LScale,
+                                          DigitsT RDigits, int16_t RScale) {
+  static_assert(!std::numeric_limits<DigitsT>::is_signed, "expected unsigned");
+
+  // Normalize digits to match scales.
+  const DigitsT SavedRDigits = RDigits;
+  const int16_t SavedRScale = RScale;
+  matchScales(LDigits, LScale, RDigits, RScale);
+
+  // Compute difference.
+  if (LDigits <= RDigits)
+    return std::make_pair(0, 0);
+  if (RDigits || !SavedRDigits)
+    return std::make_pair(LDigits - RDigits, LScale);
+
+  // Check if RDigits just barely lost its last bit. E.g., for 32-bit:
+  //
+  //   1*2^32 - 1*2^0 == 0xffffffff != 1*2^32
+  const auto RLgFloor = getLgFloor(SavedRDigits, SavedRScale);
+  if (!compare(LDigits, LScale, DigitsT(1), RLgFloor + getWidth<DigitsT>()))
+    return std::make_pair(std::numeric_limits<DigitsT>::max(), RLgFloor);
+
+  return std::make_pair(LDigits, LScale);
+}
+
+/// \brief Convenience helper for 32-bit difference.
+inline std::pair<uint32_t, int16_t> getDifference32(uint32_t LDigits,
+                                                    int16_t LScale,
+                                                    uint32_t RDigits,
+                                                    int16_t RScale) {
+  return getDifference(LDigits, LScale, RDigits, RScale);
+}
+
+/// \brief Convenience helper for 64-bit difference.
+inline std::pair<uint64_t, int16_t> getDifference64(uint64_t LDigits,
+                                                    int16_t LScale,
+                                                    uint64_t RDigits,
+                                                    int16_t RScale) {
+  return getDifference(LDigits, LScale, RDigits, RScale);
+}
+
+} // end namespace ScaledNumbers
+} // end namespace llvm
+
+namespace llvm {
+
+class raw_ostream;
+class ScaledNumberBase {
+public:
+  static const int DefaultPrecision = 10;
+
+  static void dump(uint64_t D, int16_t E, int Width);
+  static raw_ostream &print(raw_ostream &OS, uint64_t D, int16_t E, int Width,
+                            unsigned Precision);
+  static std::string toString(uint64_t D, int16_t E, int Width,
+                              unsigned Precision);
+  static int countLeadingZeros32(uint32_t N) { return countLeadingZeros(N); }
+  static int countLeadingZeros64(uint64_t N) { return countLeadingZeros(N); }
+  static uint64_t getHalf(uint64_t N) { return (N >> 1) + (N & 1); }
+
+  static std::pair<uint64_t, bool> splitSigned(int64_t N) {
+    if (N >= 0)
+      return std::make_pair(N, false);
+    uint64_t Unsigned = N == INT64_MIN ? UINT64_C(1) << 63 : uint64_t(-N);
+    return std::make_pair(Unsigned, true);
+  }
+  static int64_t joinSigned(uint64_t U, bool IsNeg) {
+    if (U > uint64_t(INT64_MAX))
+      return IsNeg ? INT64_MIN : INT64_MAX;
+    return IsNeg ? -int64_t(U) : int64_t(U);
+  }
+};
+
+/// \brief Simple representation of a scaled number.
+///
+/// ScaledNumber is a number represented by digits and a scale. It uses simple
+/// saturation arithmetic and every operation is well-defined for every value.
+/// It's somewhat similar in behaviour to a soft-float, but is *not* a
+/// replacement for one. If you're doing numerics, look at \a APFloat instead.
+/// Nevertheless, we've found these semantics useful for modelling certain cost
+/// metrics.
+///
+/// The number is split into a signed scale and unsigned digits. The number
+/// represented is \c getDigits()*2^getScale(). In this way, the digits are
+/// much like the mantissa in the x87 long double, but there is no canonical
+/// form so the same number can be represented by many bit representations.
+///
+/// ScaledNumber is templated on the underlying integer type for digits, which
+/// is expected to be unsigned.
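+///
+/// As a concrete (illustrative) example, with 8-bit digits the value 40 can
+/// be written as ScaledNumber<uint8_t>(40, 0), ScaledNumber<uint8_t>(20, 1),
+/// ScaledNumber<uint8_t>(10, 2), or ScaledNumber<uint8_t>(5, 3); all of these
+/// compare equal.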
+///
+/// Unlike APFloat, ScaledNumber does not model architecture floating point
+/// behaviour -- while this might make it a little faster and easier to reason
+/// about, it certainly makes it more dangerous for general numerics.
+///
+/// ScaledNumber is totally ordered. However, there is no canonical form, so
+/// there are multiple representations of most scalars. E.g.:
+///
+///     ScaledNumber(8u, 0) == ScaledNumber(4u, 1)
+///     ScaledNumber(4u, 1) == ScaledNumber(2u, 2)
+///     ScaledNumber(2u, 2) == ScaledNumber(1u, 3)
+///
+/// ScaledNumber implements most arithmetic operations. Precision is kept
+/// where possible. Uses simple saturation arithmetic, so that operations
+/// saturate to 0.0 or getLargest() rather than under or overflowing. It has
+/// some extra arithmetic for unit inversion. 0.0/0.0 is defined to be 0.0.
+/// Any other division by 0.0 is defined to be getLargest().
+///
+/// As a convenience for modifying the exponent, left and right shifting are
+/// both implemented, and both interpret negative shifts as positive shifts in
+/// the opposite direction.
+///
+/// Scales are limited to the range accepted by x87 long double. This makes
+/// it trivial to add functionality to convert to APFloat (this is already
+/// relied on for the implementation of printing).
+///
+/// Possible (and conflicting) future directions:
+///
+///  1. Turn this into a wrapper around \a APFloat.
+///  2. Share the algorithm implementations with \a APFloat.
+///  3. Allow \a ScaledNumber to represent a signed number.
+template <class DigitsT> class ScaledNumber : ScaledNumberBase {
+public:
+  static_assert(!std::numeric_limits<DigitsT>::is_signed,
+                "only unsigned floats supported");
+
+  typedef DigitsT DigitsType;
+
+private:
+  typedef std::numeric_limits<DigitsType> DigitsLimits;
+
+  static const int Width = sizeof(DigitsType) * 8;
+  static_assert(Width <= 64, "invalid integer width for digits");
+
+private:
+  DigitsType Digits;
+  int16_t Scale;
+
+public:
+  ScaledNumber() : Digits(0), Scale(0) {}
+
+  ScaledNumber(DigitsType Digits, int16_t Scale)
+      : Digits(Digits), Scale(Scale) {}
+
+private:
+  ScaledNumber(const std::pair<DigitsT, int16_t> &X)
+      : Digits(X.first), Scale(X.second) {}
+
+public:
+  static ScaledNumber getZero() { return ScaledNumber(0, 0); }
+  static ScaledNumber getOne() { return ScaledNumber(1, 0); }
+  static ScaledNumber getLargest() {
+    return ScaledNumber(DigitsLimits::max(), ScaledNumbers::MaxScale);
+  }
+  static ScaledNumber get(uint64_t N) { return adjustToWidth(N, 0); }
+  static ScaledNumber getInverse(uint64_t N) {
+    return get(N).invert();
+  }
+  static ScaledNumber getFraction(DigitsType N, DigitsType D) {
+    return getQuotient(N, D);
+  }
+
+  int16_t getScale() const { return Scale; }
+  DigitsType getDigits() const { return Digits; }
+
+  /// \brief Convert to the given integer type.
+  ///
+  /// Convert to \c IntT using simple saturating arithmetic, truncating if
+  /// necessary.
+  template <class IntT> IntT toInt() const;
+
+  bool isZero() const { return !Digits; }
+  bool isLargest() const { return *this == getLargest(); }
+  bool isOne() const {
+    if (Scale > 0 || Scale <= -Width)
+      return false;
+    return Digits == DigitsType(1) << -Scale;
+  }
+
+  /// \brief The log base 2, rounded.
+  ///
+  /// Get the lg of the scalar. lg 0 is defined to be INT32_MIN.
+  int32_t lg() const { return ScaledNumbers::getLg(Digits, Scale); }
+
+  /// \brief The log base 2, rounded towards INT32_MIN.
+  ///
+  /// Get the lg floor. lg 0 is defined to be INT32_MIN.
+ int32_t lgFloor() const { return ScaledNumbers::getLgFloor(Digits, Scale); } + + /// \brief The log base 2, rounded towards INT32_MAX. + /// + /// Get the lg ceiling. lg 0 is defined to be INT32_MIN. + int32_t lgCeiling() const { + return ScaledNumbers::getLgCeiling(Digits, Scale); + } + + bool operator==(const ScaledNumber &X) const { return compare(X) == 0; } + bool operator<(const ScaledNumber &X) const { return compare(X) < 0; } + bool operator!=(const ScaledNumber &X) const { return compare(X) != 0; } + bool operator>(const ScaledNumber &X) const { return compare(X) > 0; } + bool operator<=(const ScaledNumber &X) const { return compare(X) <= 0; } + bool operator>=(const ScaledNumber &X) const { return compare(X) >= 0; } + + bool operator!() const { return isZero(); } + + /// \brief Convert to a decimal representation in a string. + /// + /// Convert to a string. Uses scientific notation for very large/small + /// numbers. Scientific notation is used roughly for numbers outside of the + /// range 2^-64 through 2^64. + /// + /// \c Precision indicates the number of decimal digits of precision to use; + /// 0 requests the maximum available. + /// + /// As a special case to make debugging easier, if the number is small enough + /// to convert without scientific notation and has more than \c Precision + /// digits before the decimal place, it's printed accurately to the first + /// digit past zero. E.g., assuming 10 digits of precision: + /// + /// 98765432198.7654... => 98765432198.8 + /// 8765432198.7654... => 8765432198.8 + /// 765432198.7654... => 765432198.8 + /// 65432198.7654... => 65432198.77 + /// 5432198.7654... => 5432198.765 + std::string toString(unsigned Precision = DefaultPrecision) { + return ScaledNumberBase::toString(Digits, Scale, Width, Precision); + } + + /// \brief Print a decimal representation. + /// + /// Print a string. See toString for documentation. + raw_ostream &print(raw_ostream &OS, + unsigned Precision = DefaultPrecision) const { + return ScaledNumberBase::print(OS, Digits, Scale, Width, Precision); + } + void dump() const { return ScaledNumberBase::dump(Digits, Scale, Width); } + + ScaledNumber &operator+=(const ScaledNumber &X) { + std::tie(Digits, Scale) = + ScaledNumbers::getSum(Digits, Scale, X.Digits, X.Scale); + // Check for exponent past MaxScale. + if (Scale > ScaledNumbers::MaxScale) + *this = getLargest(); + return *this; + } + ScaledNumber &operator-=(const ScaledNumber &X) { + std::tie(Digits, Scale) = + ScaledNumbers::getDifference(Digits, Scale, X.Digits, X.Scale); + return *this; + } + ScaledNumber &operator*=(const ScaledNumber &X); + ScaledNumber &operator/=(const ScaledNumber &X); + ScaledNumber &operator<<=(int16_t Shift) { + shiftLeft(Shift); + return *this; + } + ScaledNumber &operator>>=(int16_t Shift) { + shiftRight(Shift); + return *this; + } + +private: + void shiftLeft(int32_t Shift); + void shiftRight(int32_t Shift); + + /// \brief Adjust two floats to have matching exponents. + /// + /// Adjust \c this and \c X to have matching exponents. Returns the new \c X + /// by value. Does nothing if \a isZero() for either. + /// + /// The value that compares smaller will lose precision, and possibly become + /// \a isZero(). + ScaledNumber matchScales(ScaledNumber X) { + ScaledNumbers::matchScales(Digits, Scale, X.Digits, X.Scale); + return X; + } + +public: + /// \brief Scale a large number accurately. + /// + /// Scale N (multiply it by this). 
Uses full precision multiplication, even
+  /// if Width is smaller than 64, so information is not lost.
+  uint64_t scale(uint64_t N) const;
+  uint64_t scaleByInverse(uint64_t N) const {
+    // TODO: implement directly, rather than relying on inverse. Inverse is
+    // expensive.
+    return inverse().scale(N);
+  }
+  int64_t scale(int64_t N) const {
+    std::pair<uint64_t, bool> Unsigned = splitSigned(N);
+    return joinSigned(scale(Unsigned.first), Unsigned.second);
+  }
+  int64_t scaleByInverse(int64_t N) const {
+    std::pair<uint64_t, bool> Unsigned = splitSigned(N);
+    return joinSigned(scaleByInverse(Unsigned.first), Unsigned.second);
+  }
+
+  int compare(const ScaledNumber &X) const {
+    return ScaledNumbers::compare(Digits, Scale, X.Digits, X.Scale);
+  }
+  int compareTo(uint64_t N) const {
+    ScaledNumber Scaled = get(N);
+    int Compare = compare(Scaled);
+    if (Width == 64 || Compare != 0)
+      return Compare;
+
+    // Check for precision loss. We know *this == RoundTrip.
+    uint64_t RoundTrip = Scaled.template toInt<uint64_t>();
+    return N == RoundTrip ? 0 : RoundTrip < N ? -1 : 1;
+  }
+  int compareTo(int64_t N) const { return N < 0 ? 1 : compareTo(uint64_t(N)); }
+
+  ScaledNumber &invert() { return *this = ScaledNumber::get(1) / *this; }
+  ScaledNumber inverse() const { return ScaledNumber(*this).invert(); }
+
+private:
+  static ScaledNumber getProduct(DigitsType LHS, DigitsType RHS) {
+    return ScaledNumbers::getProduct(LHS, RHS);
+  }
+  static ScaledNumber getQuotient(DigitsType Dividend, DigitsType Divisor) {
+    return ScaledNumbers::getQuotient(Dividend, Divisor);
+  }
+
+  static int countLeadingZerosWidth(DigitsType Digits) {
+    if (Width == 64)
+      return countLeadingZeros64(Digits);
+    if (Width == 32)
+      return countLeadingZeros32(Digits);
+    return countLeadingZeros32(Digits) + Width - 32;
+  }
+
+  /// \brief Adjust a number to width, rounding up if necessary.
+  ///
+  /// Should only be called for \c Shift close to zero.
+  ///
+  /// \pre Shift >= MinScale && Shift + 64 <= MaxScale.
+  static ScaledNumber adjustToWidth(uint64_t N, int32_t Shift) {
+    assert(Shift >= ScaledNumbers::MinScale && "Shift should be close to 0");
+    assert(Shift <= ScaledNumbers::MaxScale - 64 &&
+           "Shift should be close to 0");
+    auto Adjusted = ScaledNumbers::getAdjusted<DigitsT>(N, Shift);
+    return Adjusted;
+  }
+
+  static ScaledNumber getRounded(ScaledNumber P, bool Round) {
+    // Saturate.
+    if (P.isLargest())
+      return P;
+
+    return ScaledNumbers::getRounded(P.Digits, P.Scale, Round);
+  }
+};
+
+#define SCALED_NUMBER_BOP(op, base)                                           \
+  template <class DigitsT>                                                    \
+  ScaledNumber<DigitsT> operator op(const ScaledNumber<DigitsT> &L,           \
+                                    const ScaledNumber<DigitsT> &R) {         \
+    return ScaledNumber<DigitsT>(L) base R;                                   \
+  }
+SCALED_NUMBER_BOP(+, += )
+SCALED_NUMBER_BOP(-, -= )
+SCALED_NUMBER_BOP(*, *= )
+SCALED_NUMBER_BOP(/, /= )
+SCALED_NUMBER_BOP(<<, <<= )
+SCALED_NUMBER_BOP(>>, >>= )
+#undef SCALED_NUMBER_BOP
+
+template <class DigitsT>
+raw_ostream &operator<<(raw_ostream &OS, const ScaledNumber<DigitsT> &X) {
+  return X.print(OS, 10);
+}
+
+#define SCALED_NUMBER_COMPARE_TO_TYPE(op, T1, T2)                             \
+  template <class DigitsT>                                                    \
+  bool operator op(const ScaledNumber<DigitsT> &L, T1 R) {                    \
+    return L.compareTo(T2(R)) op 0;                                           \
+  }                                                                           \
+  template <class DigitsT>                                                    \
+  bool operator op(T1 L, const ScaledNumber<DigitsT> &R) {                    \
+    return 0 op R.compareTo(T2(L));                                           \
+  }
+#define SCALED_NUMBER_COMPARE_TO(op)                                          \
+  SCALED_NUMBER_COMPARE_TO_TYPE(op, uint64_t, uint64_t)                       \
+  SCALED_NUMBER_COMPARE_TO_TYPE(op, uint32_t, uint64_t)                       \
+  SCALED_NUMBER_COMPARE_TO_TYPE(op, int64_t, int64_t)                         \
+  SCALED_NUMBER_COMPARE_TO_TYPE(op, int32_t, int64_t)
+SCALED_NUMBER_COMPARE_TO(< )
+SCALED_NUMBER_COMPARE_TO(> )
+SCALED_NUMBER_COMPARE_TO(== )
+SCALED_NUMBER_COMPARE_TO(!= )
+SCALED_NUMBER_COMPARE_TO(<= )
+SCALED_NUMBER_COMPARE_TO(>= )
+#undef SCALED_NUMBER_COMPARE_TO
+#undef SCALED_NUMBER_COMPARE_TO_TYPE
+
+template <class DigitsT>
+uint64_t ScaledNumber<DigitsT>::scale(uint64_t N) const {
+  if (Width == 64 || N <= DigitsLimits::max())
+    return (get(N) * *this).template toInt<uint64_t>();
+
+  // Defer to the 64-bit version.
+  return ScaledNumber<uint64_t>(Digits, Scale).scale(N);
+}
+
+template <class DigitsT>
+template <class IntT>
+IntT ScaledNumber<DigitsT>::toInt() const {
+  typedef std::numeric_limits<IntT> Limits;
+  if (*this < 1)
+    return 0;
+  if (*this >= Limits::max())
+    return Limits::max();
+
+  IntT N = Digits;
+  if (Scale > 0) {
+    assert(size_t(Scale) < sizeof(IntT) * 8);
+    return N << Scale;
+  }
+  if (Scale < 0) {
+    assert(size_t(-Scale) < sizeof(IntT) * 8);
+    return N >> -Scale;
+  }
+  return N;
+}
+
+template <class DigitsT>
+ScaledNumber<DigitsT> &ScaledNumber<DigitsT>::
+operator*=(const ScaledNumber &X) {
+  if (isZero())
+    return *this;
+  if (X.isZero())
+    return *this = X;
+
+  // Save the exponents.
+  int32_t Scales = int32_t(Scale) + int32_t(X.Scale);
+
+  // Get the raw product.
+  *this = getProduct(Digits, X.Digits);
+
+  // Combine with exponents.
+  return *this <<= Scales;
+}
+template <class DigitsT>
+ScaledNumber<DigitsT> &ScaledNumber<DigitsT>::
+operator/=(const ScaledNumber &X) {
+  if (isZero())
+    return *this;
+  if (X.isZero())
+    return *this = getLargest();
+
+  // Save the exponents.
+  int32_t Scales = int32_t(Scale) - int32_t(X.Scale);
+
+  // Get the raw quotient.
+  *this = getQuotient(Digits, X.Digits);
+
+  // Combine with exponents.
+  return *this <<= Scales;
+}
+template <class DigitsT> void ScaledNumber<DigitsT>::shiftLeft(int32_t Shift) {
+  if (!Shift || isZero())
+    return;
+  assert(Shift != INT32_MIN);
+  if (Shift < 0) {
+    shiftRight(-Shift);
+    return;
+  }
+
+  // Shift as much as we can in the exponent.
+  int32_t ScaleShift = std::min(Shift, ScaledNumbers::MaxScale - Scale);
+  Scale += ScaleShift;
+  if (ScaleShift == Shift)
+    return;
+
+  // Check this late, since it's rare.
+  if (isLargest())
+    return;
+
+  // Shift the digits themselves.
+  Shift -= ScaleShift;
+  if (Shift > countLeadingZerosWidth(Digits)) {
+    // Saturate.
+    *this = getLargest();
+    return;
+  }
+
+  Digits <<= Shift;
+  return;
+}
+
+template <class DigitsT> void ScaledNumber<DigitsT>::shiftRight(int32_t Shift) {
+  if (!Shift || isZero())
+    return;
+  assert(Shift != INT32_MIN);
+  if (Shift < 0) {
+    shiftLeft(-Shift);
+    return;
+  }
+
+  // Shift as much as we can in the exponent.
+  int32_t ScaleShift = std::min(Shift, Scale - ScaledNumbers::MinScale);
+  Scale -= ScaleShift;
+  if (ScaleShift == Shift)
+    return;
+
+  // Shift the digits themselves.
+  Shift -= ScaleShift;
+  if (Shift >= Width) {
+    // Saturate.
+    *this = getZero();
+    return;
+  }
+
+  Digits >>= Shift;
+  return;
+}
+
+template <class T> struct isPodLike;
+template <class T> struct isPodLike<ScaledNumber<T>> {
+  static const bool value = true;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/include/llvm/Support/SourceMgr.h b/include/llvm/Support/SourceMgr.h
index 30eccd99acbc..4717553bd0de 100644
--- a/include/llvm/Support/SourceMgr.h
+++ b/include/llvm/Support/SourceMgr.h
@@ -66,6 +66,8 @@ class SourceMgr {
   DiagHandlerTy DiagHandler;
   void *DiagContext;
 
+  bool isValidBufferID(unsigned i) const { return i && i <= Buffers.size(); }
+
   SourceMgr(const SourceMgr&) LLVM_DELETED_FUNCTION;
   void operator=(const SourceMgr&) LLVM_DELETED_FUNCTION;
 public:
@@ -88,58 +90,63 @@ class SourceMgr {
   void *getDiagContext() const { return DiagContext; }
 
   const SrcBuffer &getBufferInfo(unsigned i) const {
-    assert(i < Buffers.size() && "Invalid Buffer ID!");
-    return Buffers[i];
+    assert(isValidBufferID(i));
+    return Buffers[i - 1];
   }
 
   const MemoryBuffer *getMemoryBuffer(unsigned i) const {
-    assert(i < Buffers.size() && "Invalid Buffer ID!");
-    return Buffers[i].Buffer;
+    assert(isValidBufferID(i));
+    return Buffers[i - 1].Buffer;
   }
 
-  size_t getNumBuffers() const {
+  unsigned getNumBuffers() const {
     return Buffers.size();
   }
 
+  unsigned getMainFileID() const {
+    assert(getNumBuffers());
+    return 1;
+  }
+
   SMLoc getParentIncludeLoc(unsigned i) const {
-    assert(i < Buffers.size() && "Invalid Buffer ID!");
-    return Buffers[i].IncludeLoc;
+    assert(isValidBufferID(i));
+    return Buffers[i - 1].IncludeLoc;
   }
 
   /// Add a new source buffer to this source manager. This takes ownership of
   /// the memory buffer.
-  size_t AddNewSourceBuffer(MemoryBuffer *F, SMLoc IncludeLoc) {
+  unsigned AddNewSourceBuffer(MemoryBuffer *F, SMLoc IncludeLoc) {
     SrcBuffer NB;
     NB.Buffer = F;
     NB.IncludeLoc = IncludeLoc;
     Buffers.push_back(NB);
-    return Buffers.size() - 1;
+    return Buffers.size();
   }
 
   /// Search for a file with the specified name in the current directory or in
   /// one of the IncludeDirs.
   ///
-  /// If no file is found, this returns ~0, otherwise it returns the buffer ID
+  /// If no file is found, this returns 0, otherwise it returns the buffer ID
  /// of the stacked file. The full path to the included file can be found in
   /// \p IncludedFile.
-  size_t AddIncludeFile(const std::string &Filename, SMLoc IncludeLoc,
-                        std::string &IncludedFile);
+  unsigned AddIncludeFile(const std::string &Filename, SMLoc IncludeLoc,
+                          std::string &IncludedFile);
 
   /// Return the ID of the buffer containing the specified location.
   ///
-  /// -1 is returned if the buffer is not found.
-  int FindBufferContainingLoc(SMLoc Loc) const;
+  /// 0 is returned if the buffer is not found.
+  unsigned FindBufferContainingLoc(SMLoc Loc) const;
 
   /// Find the line number for the specified location in the specified file.
   /// This is not a fast method.
-  unsigned FindLineNumber(SMLoc Loc, int BufferID = -1) const {
+  unsigned FindLineNumber(SMLoc Loc, unsigned BufferID = 0) const {
     return getLineAndColumn(Loc, BufferID).first;
   }
 
   /// Find the line and column number for the specified location in the
   /// specified file. This is not a fast method.
-  std::pair<unsigned, unsigned>
-    getLineAndColumn(SMLoc Loc, int BufferID = -1) const;
+  std::pair<unsigned, unsigned> getLineAndColumn(SMLoc Loc,
+                                                 unsigned BufferID = 0) const;
 
   /// Emit a message about the specified location with the specified string.
   ///
diff --git a/include/llvm/Support/SpecialCaseList.h b/include/llvm/Support/SpecialCaseList.h
new file mode 100644
index 000000000000..098b9c7a17b7
--- /dev/null
+++ b/include/llvm/Support/SpecialCaseList.h
@@ -0,0 +1,96 @@
+//===-- SpecialCaseList.h - special case list for sanitizers ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//===----------------------------------------------------------------------===//
+//
+// This is a utility class used to parse user-provided text files with
+// "special case lists" for code sanitizers. Such files are used to
+// define an "ABI list" for DataFlowSanitizer and blacklists for other
+// sanitizers like AddressSanitizer or UndefinedBehaviorSanitizer.
+//
+// Empty lines and lines starting with "#" are ignored. All other lines
+// should have the form:
+//   section:wildcard_expression[=category]
+// If category is not specified, it is assumed to be the empty string.
+// Definitions of "section" and "category" are sanitizer-specific. For example,
+// sanitizer blacklists support sections "src", "fun" and "global".
+// Wildcard expressions define, respectively, source files, functions or
+// globals which shouldn't be instrumented.
+// Examples of categories:
+//   "functional": used in DFSan to list functions with pure functional
+//                 semantics.
+//   "init": used in ASan blacklist to disable initialization-order bugs
+//           detection for certain globals or source files.
+// Full special case list file example:
+// ---
+// # Blacklisted items:
+// fun:*_ZN4base6subtle*
+// global:*global_with_bad_access_or_initialization*
+// global:*global_with_initialization_issues*=init
+// type:*Namespace::ClassName*=init
+// src:file_with_tricky_code.cc
+// src:ignore-global-initializers-issues.cc=init
+//
+// # Functions with pure functional semantics:
+// fun:cos=functional
+// fun:sin=functional
+// ---
+// Note that the wild card is in fact an llvm::Regex, but * is automatically
+// replaced with .*
+// This is similar to the "ignore" feature of ThreadSanitizer.
+// http://code.google.com/p/data-race-test/wiki/ThreadSanitizerIgnores
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_SPECIALCASELIST_H
+#define LLVM_SUPPORT_SPECIALCASELIST_H
+
+#include "llvm/ADT/StringMap.h"
+
+namespace llvm {
+class MemoryBuffer;
+class Regex;
+class StringRef;
+
+class SpecialCaseList {
+ public:
+  /// Parses the special case list from a file. If Path is empty, returns
+  /// an empty special case list. On failure, returns 0 and writes an error
+  /// message to string.
+  static SpecialCaseList *create(const StringRef Path, std::string &Error);
+  /// Parses the special case list from a memory buffer. On failure, returns
+  /// 0 and writes an error message to string.
+  static SpecialCaseList *create(const MemoryBuffer *MB, std::string &Error);
+  /// Parses the special case list from a file.
On failure, reports a fatal
+  /// error.
+  static SpecialCaseList *createOrDie(const StringRef Path);
+
+  ~SpecialCaseList();
+
+  /// Returns true if the special case list contains a line
+  /// \code
+  ///   @Section:<E>=@Category
+  /// \endcode
+  /// and @Query satisfies the wildcard expression <E>.
+  bool inSection(const StringRef Section, const StringRef Query,
+                 const StringRef Category = StringRef()) const;
+
+ private:
+  SpecialCaseList(SpecialCaseList const &) LLVM_DELETED_FUNCTION;
+  SpecialCaseList &operator=(SpecialCaseList const &) LLVM_DELETED_FUNCTION;
+
+  struct Entry;
+  StringMap<StringMap<Entry> > Entries;
+
+  SpecialCaseList();
+  /// Parses just-constructed SpecialCaseList entries from a memory buffer.
+  bool parse(const MemoryBuffer *MB, std::string &Error);
+};
+
+} // namespace llvm
+
+#endif // LLVM_SUPPORT_SPECIALCASELIST_H
+
diff --git a/include/llvm/Support/StreamableMemoryObject.h b/include/llvm/Support/StreamableMemoryObject.h
index 9c9e55c0a75a..6e71ad47c8dd 100644
--- a/include/llvm/Support/StreamableMemoryObject.h
+++ b/include/llvm/Support/StreamableMemoryObject.h
@@ -13,6 +13,7 @@
 
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/DataStream.h"
+#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MemoryObject.h"
 #include
 #include
@@ -115,7 +116,7 @@ class StreamingMemoryObject : public StreamableMemoryObject {
     // requiring that the bitcode size be known, or otherwise ensuring that
     // the memory doesn't go away/get reallocated, but it's
     // not currently necessary. Users that need the pointer don't stream.
-    assert(0 && "getPointer in streaming memory objects not allowed");
+    llvm_unreachable("getPointer in streaming memory objects not allowed");
     return nullptr;
   }
   bool isValidAddress(uint64_t address) const override;
@@ -154,8 +155,8 @@ class StreamingMemoryObject : public StreamableMemoryObject {
                                                kChunkSize);
     BytesRead += bytes;
     if (bytes < kChunkSize) {
-      if (ObjectSize && BytesRead < Pos)
-        assert(0 && "Unexpected short read fetching bitcode");
+      assert((!ObjectSize || BytesRead >= Pos) &&
+             "Unexpected short read fetching bitcode");
       if (BytesRead <= Pos) { // reached EOF/ran out of bytes
         ObjectSize = BytesRead;
         EOFReached = true;
diff --git a/include/llvm/Support/StringPool.h b/include/llvm/Support/StringPool.h
index 7e1394cb2335..3e0465340c3b 100644
--- a/include/llvm/Support/StringPool.h
+++ b/include/llvm/Support/StringPool.h
@@ -29,6 +29,7 @@
 #ifndef LLVM_SUPPORT_STRINGPOOL_H
 #define LLVM_SUPPORT_STRINGPOOL_H
 
+#include "llvm/Support/Compiler.h"
 #include "llvm/ADT/StringMap.h"
 #include
 #include
@@ -128,10 +129,10 @@ namespace llvm {
     }
 
     inline const char *operator*() const { return begin(); }
-    inline operator bool() const { return S != nullptr; }
+    inline LLVM_EXPLICIT operator bool() const { return S != nullptr; }
 
-    inline bool operator==(const PooledStringPtr &That) { return S == That.S; }
-    inline bool operator!=(const PooledStringPtr &That) { return S != That.S; }
+    inline bool operator==(const PooledStringPtr &That) const { return S == That.S; }
+    inline bool operator!=(const PooledStringPtr &That) const { return S != That.S; }
   };
 
 } // End llvm namespace
diff --git a/include/llvm/Support/TargetRegistry.h b/include/llvm/Support/TargetRegistry.h
index fcdc60490caa..5d5b86a6a2ae 100644
--- a/include/llvm/Support/TargetRegistry.h
+++ b/include/llvm/Support/TargetRegistry.h
@@ -51,6 +51,7 @@ namespace llvm {
   class raw_ostream;
   class formatted_raw_ostream;
 
+  MCStreamer *createNullStreamer(MCContext &Ctx);
   MCStreamer *createAsmStreamer(MCContext &Ctx,
formatted_raw_ostream &OS, bool isVerboseAsm, bool useDwarfDirectory, MCInstPrinter *InstPrint, MCCodeEmitter *CE, @@ -139,6 +140,7 @@ namespace llvm { MCCodeEmitter *CE, MCAsmBackend *TAB, bool ShowInst); + typedef MCStreamer *(*NullStreamerCtorTy)(MCContext &Ctx); typedef MCRelocationInfo *(*MCRelocationInfoCtorTy)(StringRef TT, MCContext &Ctx); typedef MCSymbolizer *(*MCSymbolizerCtorTy)(StringRef TT, @@ -225,6 +227,10 @@ namespace llvm { /// AsmStreamer, if registered (default = llvm::createAsmStreamer). AsmStreamerCtorTy AsmStreamerCtorFn; + /// Construction function for this target's NullStreamer, if registered + /// (default = llvm::createNullStreamer). + NullStreamerCtorTy NullStreamerCtorFn; + /// MCRelocationInfoCtorFn - Construction function for this target's /// MCRelocationInfo, if registered (default = llvm::createMCRelocationInfo) MCRelocationInfoCtorTy MCRelocationInfoCtorFn; @@ -235,8 +241,8 @@ namespace llvm { public: Target() - : AsmStreamerCtorFn(nullptr), MCRelocationInfoCtorFn(nullptr), - MCSymbolizerCtorFn(nullptr) {} + : AsmStreamerCtorFn(nullptr), NullStreamerCtorFn(nullptr), + MCRelocationInfoCtorFn(nullptr), MCSymbolizerCtorFn(nullptr) {} /// @name Target Information /// @{ @@ -447,6 +453,12 @@ namespace llvm { InstPrint, CE, TAB, ShowInst); } + MCStreamer *createNullStreamer(MCContext &Ctx) const { + if (NullStreamerCtorFn) + return NullStreamerCtorFn(Ctx); + return llvm::createNullStreamer(Ctx); + } + /// createMCRelocationInfo - Create a target specific MCRelocationInfo. /// /// \param TT The target triple. @@ -553,13 +565,6 @@ namespace llvm { Triple &TheTriple, std::string &Error); - /// getClosestTargetForJIT - Pick the best target that is compatible with - /// the current host. If no close target can be found, this returns null - /// and sets the Error string to a reason. - /// - /// Maintained for compatibility through 2.6. - static const Target *getClosestTargetForJIT(std::string &Error); - /// @} /// @name Target Registration /// @{ @@ -780,6 +785,10 @@ namespace llvm { T.AsmStreamerCtorFn = Fn; } + static void RegisterNullStreamer(Target &T, Target::NullStreamerCtorTy Fn) { + T.NullStreamerCtorFn = Fn; + } + /// RegisterMCRelocationInfo - Register an MCRelocationInfo /// implementation for the given target. /// diff --git a/include/llvm/Support/Threading.h b/include/llvm/Support/Threading.h index a7e8774558d5..7e8758407c7c 100644 --- a/include/llvm/Support/Threading.h +++ b/include/llvm/Support/Threading.h @@ -7,7 +7,8 @@ // //===----------------------------------------------------------------------===// // -// TThis file defines llvm_start_multithreaded() and friends. +// This file declares helper functions for running LLVM in a multi-threaded +// environment. // //===----------------------------------------------------------------------===// @@ -15,32 +16,10 @@ #define LLVM_SUPPORT_THREADING_H namespace llvm { - /// llvm_start_multithreaded - Allocate and initialize structures needed to - /// make LLVM safe for multithreading. The return value indicates whether - /// multithreaded initialization succeeded. LLVM will still be operational - /// on "failed" return, and will still be safe for hosting threading - /// applications in the JIT, but will not be safe for concurrent calls to the - /// LLVM APIs. - /// THIS MUST EXECUTE IN ISOLATION FROM ALL OTHER LLVM API CALLS. - bool llvm_start_multithreaded(); - - /// llvm_stop_multithreaded - Deallocate structures necessary to make LLVM - /// safe for multithreading. 
-  /// THIS MUST EXECUTE IN ISOLATION FROM ALL OTHER LLVM API CALLS.
-  void llvm_stop_multithreaded();
-
-  /// llvm_is_multithreaded - Check whether LLVM is executing in thread-safe
-  /// mode or not.
+  /// Returns true if LLVM is compiled with support for multi-threading, and
+  /// false otherwise.
   bool llvm_is_multithreaded();
 
-  /// acquire_global_lock - Acquire the global lock. This is a no-op if called
-  /// before llvm_start_multithreaded().
-  void llvm_acquire_global_lock();
-
-  /// release_global_lock - Release the global lock. This is a no-op if called
-  /// before llvm_start_multithreaded().
-  void llvm_release_global_lock();
-
   /// llvm_execute_on_thread - Execute the given \p UserFn on a separate
   /// thread, passing it the provided \p UserData.
   ///
diff --git a/include/llvm/Target/TargetFrameLowering.h b/include/llvm/Target/TargetFrameLowering.h
index 7c42e23a11d1..bfddd0601794 100644
--- a/include/llvm/Target/TargetFrameLowering.h
+++ b/include/llvm/Target/TargetFrameLowering.h
@@ -93,6 +93,19 @@ class TargetFrameLowering {
   /// stack pointer.
   virtual bool isFPCloseToIncomingSP() const { return true; }
 
+  /// assignCalleeSavedSpillSlots - Allows the target to override spill-slot
+  /// assignment logic. If implemented, assignCalleeSavedSpillSlots() should
+  /// assign frame slots to all CSI entries and return true. If this method
+  /// returns false, spill slots will be assigned using the generic
+  /// implementation. assignCalleeSavedSpillSlots() may add, delete or
+  /// rearrange elements of CSI.
+  virtual bool
+  assignCalleeSavedSpillSlots(MachineFunction &MF,
+                              const TargetRegisterInfo *TRI,
+                              std::vector<CalleeSavedInfo> &CSI) const {
+    return false;
+  }
+
   /// getCalleeSavedSpillSlots - This method returns a pointer to an array of
   /// pairs, that contains an entry for each callee saved register that must be
   /// spilled to a particular stack location if it is spilled.
diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h
index 60a4079babd5..ea9a48e2db8a 100644
--- a/include/llvm/Target/TargetLowering.h
+++ b/include/llvm/Target/TargetLowering.h
@@ -185,10 +185,15 @@ class TargetLoweringBase {
   /// Return true if the target has BitExtract instructions.
   bool hasExtractBitsInsn() const { return HasExtractBitsInsn; }
 
-  /// Return true if a vector of the given type should be split
-  /// (TypeSplitVector) instead of promoted (TypePromoteInteger) during type
-  /// legalization.
-  virtual bool shouldSplitVectorType(EVT /*VT*/) const { return false; }
+  /// Return the preferred vector type legalization action.
+  virtual TargetLoweringBase::LegalizeTypeAction
+  getPreferredVectorAction(EVT VT) const {
+    // The default action for one-element vectors is to scalarize.
+    if (VT.getVectorNumElements() == 1)
+      return TypeScalarizeVector;
+    // The default action for other vectors is to promote.
+    return TypePromoteInteger;
+  }
 
   // There are two general methods for expanding a BUILD_VECTOR node:
   //  1. Use SCALAR_TO_VECTOR on the defined scalar values and then shuffle
@@ -279,8 +284,17 @@ class TargetLoweringBase {
   /// selects between the two kinds. For example on X86 a scalar boolean should
   /// be zero extended from i1, while the elements of a vector of booleans
   /// should be sign extended from i1.
-  BooleanContent getBooleanContents(bool isVec) const {
-    return isVec ? BooleanVectorContents : BooleanContents;
+  ///
+  /// Some CPUs also treat floating point types the same way as they treat
+  /// vectors instead of the way they treat scalars.
+ BooleanContent getBooleanContents(bool isVec, bool isFloat) const { + if (isVec) + return BooleanVectorContents; + return isFloat ? BooleanFloatContents : BooleanContents; + } + + BooleanContent getBooleanContents(EVT Type) const { + return getBooleanContents(Type.isVector(), Type.isFloatingPoint()); } /// Return target scheduling preference. @@ -711,6 +725,13 @@ class TargetLoweringBase { /// reduce runtime. virtual bool ShouldShrinkFPConstant(EVT) const { return true; } + /// When splitting a value of the specified type into parts, does the Lo + /// or Hi part come first? This usually follows the endianness, except + /// for ppcf128, where the Hi part always comes first. + bool hasBigEndianPartOrdering(EVT VT) const { + return isBigEndian() || VT == MVT::ppcf128; + } + /// If true, the target has custom DAG combine transformations that it can /// perform for the specified node. bool hasTargetDAGCombine(ISD::NodeType NT) const { @@ -938,9 +959,19 @@ class TargetLoweringBase { virtual void resetOperationActions() {} protected: - /// Specify how the target extends the result of a boolean value from i1 to a - /// wider type. See getBooleanContents. - void setBooleanContents(BooleanContent Ty) { BooleanContents = Ty; } + /// Specify how the target extends the result of integer and floating point + /// boolean values from i1 to a wider type. See getBooleanContents. + void setBooleanContents(BooleanContent Ty) { + BooleanContents = Ty; + BooleanFloatContents = Ty; + } + + /// Specify how the target extends the result of integer and floating point + /// boolean values from i1 to a wider type. See getBooleanContents. + void setBooleanContents(BooleanContent IntTy, BooleanContent FloatTy) { + BooleanContents = IntTy; + BooleanFloatContents = FloatTy; + } /// Specify how the target extends the result of a vector boolean value from a /// vector of i1 to a wider type. See getBooleanContents. @@ -1484,6 +1515,10 @@ class TargetLoweringBase { /// a type wider than i1. See getBooleanContents. BooleanContent BooleanContents; + /// Information about the contents of the high-bits in boolean values held in + /// a type wider than i1. See getBooleanContents. + BooleanContent BooleanFloatContents; + /// Information about the contents of the high-bits in boolean vector values /// when the element type is wider than i1. See getBooleanContents. 
   BooleanContent BooleanVectorContents;
@@ -1600,7 +1635,7 @@
       LegalizeTypeAction LA = ValueTypeActions.getTypeAction(SVT);
 
       assert(
-        (LA == TypeLegal ||
+        (LA == TypeLegal || LA == TypeSoftenFloat ||
          ValueTypeActions.getTypeAction(NVT) != TypePromoteInteger)
          && "Promote may not follow Expand or Promote");
 
@@ -2111,7 +2146,7 @@ class TargetLowering : public TargetLoweringBase {
     unsigned NumFixedArgs;
     CallingConv::ID CallConv;
     SDValue Callee;
-    ArgListTy *Args;
+    ArgListTy Args;
     SelectionDAG &DAG;
     SDLoc DL;
     ImmutableCallSite *CS;
@@ -2123,7 +2158,7 @@ class TargetLowering : public TargetLoweringBase {
         : RetTy(nullptr), RetSExt(false), RetZExt(false), IsVarArg(false),
           IsInReg(false), DoesNotReturn(false), IsReturnValueUsed(true),
          IsTailCall(false), NumFixedArgs(-1), CallConv(CallingConv::C),
-          Args(nullptr), DAG(DAG), CS(nullptr) {}
+          DAG(DAG), CS(nullptr) {}
 
     CallLoweringInfo &setDebugLoc(SDLoc dl) {
       DL = dl;
@@ -2136,19 +2171,19 @@ class TargetLowering : public TargetLoweringBase {
     }
 
     CallLoweringInfo &setCallee(CallingConv::ID CC, Type *ResultType,
-                                SDValue Target, ArgListTy *ArgsList,
+                                SDValue Target, ArgListTy &&ArgsList,
                                 unsigned FixedArgs = -1) {
       RetTy = ResultType;
       Callee = Target;
      CallConv = CC;
       NumFixedArgs =
-        (FixedArgs == static_cast<unsigned>(-1) ? Args->size() : FixedArgs);
-      Args = ArgsList;
+        (FixedArgs == static_cast<unsigned>(-1) ? Args.size() : FixedArgs);
+      Args = std::move(ArgsList);
       return *this;
     }
 
     CallLoweringInfo &setCallee(Type *ResultType, FunctionType *FTy,
-                                SDValue Target, ArgListTy *ArgsList,
+                                SDValue Target, ArgListTy &&ArgsList,
                                 ImmutableCallSite &Call) {
       RetTy = ResultType;
 
@@ -2163,7 +2198,7 @@ class TargetLowering : public TargetLoweringBase {
       CallConv = Call.getCallingConv();
       NumFixedArgs = FTy->getNumParams();
-      Args = ArgsList;
+      Args = std::move(ArgsList);
 
       CS = &Call;
 
@@ -2206,8 +2241,7 @@ class TargetLowering : public TargetLoweringBase {
     }
 
     ArgListTy &getArgs() {
-      assert(Args && "Arguments must be set before accessing them");
-      return *Args;
+      return Args;
     }
   };
 
@@ -2530,6 +2564,12 @@ class TargetLowering : public TargetLoweringBase {
                       SDValue LH = SDValue(), SDValue RL = SDValue(),
                       SDValue RH = SDValue()) const;
 
+  /// Expand a float(f32) to SINT(i64) conversion.
+  /// \param N Node to expand
+  /// \param Result output after conversion
+  /// \returns True if the expansion was successful, false otherwise
+  bool expandFP_TO_SINT(SDNode *N, SDValue &Result, SelectionDAG &DAG) const;
+
   //===--------------------------------------------------------------------===//
   // Instruction Emitting Hooks
   //
diff --git a/include/llvm/Target/TargetLoweringObjectFile.h b/include/llvm/Target/TargetLoweringObjectFile.h
index 419eced0a0be..7c32a5e3d0ca 100644
--- a/include/llvm/Target/TargetLoweringObjectFile.h
+++ b/include/llvm/Target/TargetLoweringObjectFile.h
@@ -70,7 +70,8 @@ class TargetLoweringObjectFile : public MCObjectFileInfo {
 
   /// Given a constant with the SectionKind, return a section that it should be
   /// placed in.
-  virtual const MCSection *getSectionForConstant(SectionKind Kind) const;
+  virtual const MCSection *getSectionForConstant(SectionKind Kind,
+                                                 const Constant *C) const;
 
   /// Classify the specified global variable into a set of target independent
   /// categories embodied in SectionKind.
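The CallLoweringInfo hunk above drops the borrowed ArgListTy pointer in
favour of an argument list owned by value and filled through an
rvalue-reference setter. A minimal self-contained sketch of that ownership
pattern follows; the types here are simplified stand-ins invented for
illustration, not LLVM's real ArgListEntry or CallLoweringInfo:

    // Build with: c++ -std=c++11 args_by_value.cpp
    #include <iostream>
    #include <string>
    #include <utility>
    #include <vector>

    // Simplified stand-ins for LLVM's types (hypothetical, illustration only).
    struct ArgEntry { std::string Name; };
    typedef std::vector<ArgEntry> ArgListTy;

    struct CallInfo {
      ArgListTy Args; // owned by value, like the patched CallLoweringInfo::Args

      // Taking ArgListTy&& lets a caller hand its vector over without a copy
      // and removes the dangling-pointer hazard of storing an ArgListTy*.
      CallInfo &setArgs(ArgListTy &&ArgsList) {
        Args = std::move(ArgsList);
        return *this; // returning *this allows setter chaining, as in the patch
      }
    };

    int main() {
      ArgListTy Args;
      Args.push_back(ArgEntry{"x"});
      Args.push_back(ArgEntry{"y"});

      CallInfo CLI;
      CLI.setArgs(std::move(Args)); // CLI now owns the storage

      std::cout << "CLI has " << CLI.Args.size() << " args\n"; // prints 2
      return 0;
    }

Passing the list by rvalue reference makes the transfer of ownership explicit
at the call site, which is presumably why the patch can delete the old
"Arguments must be set before accessing them" assertion.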
diff --git a/include/llvm/Target/TargetRegisterInfo.h b/include/llvm/Target/TargetRegisterInfo.h index a162297afc72..5dda8bd4b938 100644 --- a/include/llvm/Target/TargetRegisterInfo.h +++ b/include/llvm/Target/TargetRegisterInfo.h @@ -807,18 +807,24 @@ class TargetRegisterInfo : public MCRegisterInfo { int SPAdj, unsigned FIOperandNum, RegScavenger *RS = nullptr) const = 0; + //===--------------------------------------------------------------------===// + /// Subtarget Hooks + + /// \brief SrcRC and DstRC will be morphed into NewRC if this returns true. + virtual bool shouldCoalesce(MachineInstr *MI, + const TargetRegisterClass *SrcRC, + unsigned SubReg, + const TargetRegisterClass *DstRC, + unsigned DstSubReg, + const TargetRegisterClass *NewRC) const + { return true; } + //===--------------------------------------------------------------------===// /// Debug information queries. /// getFrameRegister - This method should return the register used as a base /// for values allocated in the current stack frame. virtual unsigned getFrameRegister(const MachineFunction &MF) const = 0; - - /// getCompactUnwindRegNum - This function maps the register to the number for - /// compact unwind encoding. Return -1 if the register isn't valid. - virtual int getCompactUnwindRegNum(unsigned, bool) const { - return -1; - } }; diff --git a/include/llvm/Target/TargetSchedule.td b/include/llvm/Target/TargetSchedule.td index e6eeb885c0b1..89db37ca859b 100644 --- a/include/llvm/Target/TargetSchedule.td +++ b/include/llvm/Target/TargetSchedule.td @@ -88,6 +88,8 @@ class SchedMachineModel { // Per-cycle resources tables. ProcessorItineraries Itineraries = NoItineraries; + bit PostRAScheduler = 0; // Enable Post RegAlloc Scheduler pass. + // Subtargets that define a model for only a subset of instructions // that have a scheduling class (itinerary class or SchedRW list) // and may actually be generated for that subtarget must clear this diff --git a/include/llvm/Target/TargetSelectionDAG.td b/include/llvm/Target/TargetSelectionDAG.td index 16cfff179c77..2d822de4ad69 100644 --- a/include/llvm/Target/TargetSelectionDAG.td +++ b/include/llvm/Target/TargetSelectionDAG.td @@ -205,7 +205,7 @@ def SDTPrefetch : SDTypeProfile<0, 4, [ // prefetch SDTCisPtrTy<0>, SDTCisSameAs<1, 2>, SDTCisSameAs<1, 3>, SDTCisInt<1> ]>; -def SDTMemBarrier : SDTypeProfile<0, 5, [ // memory barier +def SDTMemBarrier : SDTypeProfile<0, 5, [ // memory barrier SDTCisSameAs<0,1>, SDTCisSameAs<0,2>, SDTCisSameAs<0,3>, SDTCisSameAs<0,4>, SDTCisInt<0> ]>; @@ -392,8 +392,8 @@ def sint_to_fp : SDNode<"ISD::SINT_TO_FP" , SDTIntToFPOp>; def uint_to_fp : SDNode<"ISD::UINT_TO_FP" , SDTIntToFPOp>; def fp_to_sint : SDNode<"ISD::FP_TO_SINT" , SDTFPToIntOp>; def fp_to_uint : SDNode<"ISD::FP_TO_UINT" , SDTFPToIntOp>; -def f16_to_f32 : SDNode<"ISD::FP16_TO_FP32", SDTIntToFPOp>; -def f32_to_f16 : SDNode<"ISD::FP32_TO_FP16", SDTFPToIntOp>; +def f16_to_fp : SDNode<"ISD::FP16_TO_FP" , SDTIntToFPOp>; +def fp_to_f16 : SDNode<"ISD::FP_TO_FP16" , SDTFPToIntOp>; def setcc : SDNode<"ISD::SETCC" , SDTSetCC>; def select : SDNode<"ISD::SELECT" , SDTSelect>; diff --git a/include/llvm/Target/TargetSubtargetInfo.h b/include/llvm/Target/TargetSubtargetInfo.h index bb164288b013..86e303e18348 100644 --- a/include/llvm/Target/TargetSubtargetInfo.h +++ b/include/llvm/Target/TargetSubtargetInfo.h @@ -73,6 +73,9 @@ class TargetSubtargetInfo : public MCSubtargetInfo { /// MISchedPostRA, is set. 
virtual bool enablePostMachineScheduler() const; + /// \brief True if the subtarget should run the atomic expansion pass. + virtual bool enableAtomicExpandLoadLinked() const; + /// \brief Override generic scheduling policy within a region. /// /// This is a convenient way for targets that don't provide any custom @@ -87,15 +90,31 @@ class TargetSubtargetInfo : public MCSubtargetInfo { // dependency. virtual void adjustSchedDependency(SUnit *def, SUnit *use, SDep& dep) const { } - - // enablePostRAScheduler - If the target can benefit from post-regalloc - // scheduling and the specified optimization level meets the requirement - // return true to enable post-register-allocation scheduling. In - // CriticalPathRCs return any register classes that should only be broken - // if on the critical path. - virtual bool enablePostRAScheduler(CodeGenOpt::Level OptLevel, - AntiDepBreakMode& Mode, - RegClassVector& CriticalPathRCs) const; + + // For use with PostRAScheduling: get the anti-dependence breaking that should + // be performed before post-RA scheduling. + virtual AntiDepBreakMode getAntiDepBreakMode() const { + return ANTIDEP_NONE; + } + + // For use with PostRAScheduling: in CriticalPathRCs, return any register + // classes that should only be considered for anti-dependence breaking if they + // are on the critical path. + virtual void getCriticalPathRCs(RegClassVector &CriticalPathRCs) const { + return CriticalPathRCs.clear(); + } + + // For use with PostRAScheduling: get the minimum optimization level needed + // to enable post-RA scheduling. + virtual CodeGenOpt::Level getOptLevelToEnablePostRAScheduler() const { + return CodeGenOpt::Default; + } + + /// \brief True if the subtarget should run the local reassignment + /// heuristic of the register allocator. + /// This heuristic may be compile time intensive, \p OptLevel provides + /// a finer grain to tune the register allocator. + virtual bool enableRALocalReassignment(CodeGenOpt::Level OptLevel) const; /// \brief Enable use of alias analysis during code generation (during MI /// scheduling, DAGCombine, etc.). @@ -106,6 +125,7 @@ class TargetSubtargetInfo : public MCSubtargetInfo { /// \brief Reset the features for the subtarget. virtual void resetSubtargetFeatures(const MachineFunction *MF) { } + }; } // End llvm namespace diff --git a/include/llvm/Transforms/Instrumentation.h b/include/llvm/Transforms/Instrumentation.h index e2139efe472f..c6a339b0fd22 100644 --- a/include/llvm/Transforms/Instrumentation.h +++ b/include/llvm/Transforms/Instrumentation.h @@ -65,8 +65,7 @@ ModulePass *createGCOVProfilerPass(const GCOVOptions &Options = // Insert AddressSanitizer (address sanity checking) instrumentation FunctionPass *createAddressSanitizerFunctionPass(); -ModulePass * -createAddressSanitizerModulePass(StringRef BlacklistFile = StringRef()); +ModulePass *createAddressSanitizerModulePass(); // Insert MemorySanitizer instrumentation (detection of uninitialized reads) FunctionPass *createMemorySanitizerPass(int TrackOrigins = 0); diff --git a/include/llvm/Transforms/Scalar.h b/include/llvm/Transforms/Scalar.h index 8ecfd801d0d8..46c8bc21d434 100644 --- a/include/llvm/Transforms/Scalar.h +++ b/include/llvm/Transforms/Scalar.h @@ -286,6 +286,13 @@ extern char &LCSSAID; // FunctionPass *createEarlyCSEPass(); +//===----------------------------------------------------------------------===// +// +// MergedLoadStoreMotion - This pass merges loads and stores in diamonds. Loads +// are hoisted into the header, while stores sink into the footer. 
+// +FunctionPass *createMergedLoadStoreMotionPass(); + //===----------------------------------------------------------------------===// // // GVN - This pass performs global value numbering and redundant load @@ -388,6 +395,13 @@ FunctionPass *createSeparateConstOffsetFromGEPPass(); // BasicBlockPass *createLoadCombinePass(); +// Specific to the rust-lang llvm branch: +//===----------------------------------------------------------------------===// +// +// NullCheckElimination - Eliminate null checks. +// +FunctionPass *createNullCheckEliminationPass(); + } // End llvm namespace #endif diff --git a/include/llvm/Transforms/Utils/BasicBlockUtils.h b/include/llvm/Transforms/Utils/BasicBlockUtils.h index 7309f6960a77..bcafda657c2b 100644 --- a/include/llvm/Transforms/Utils/BasicBlockUtils.h +++ b/include/llvm/Transforms/Utils/BasicBlockUtils.h @@ -23,6 +23,7 @@ namespace llvm { class AliasAnalysis; +class DominatorTree; class Instruction; class MDNode; class Pass; @@ -202,9 +203,12 @@ ReturnInst *FoldReturnIntoUncondBranch(ReturnInst *RI, BasicBlock *BB, /// If Unreachable is true, then ThenBlock ends with /// UnreachableInst, otherwise it branches to Tail. /// Returns the NewBasicBlock's terminator. +/// +/// Updates DT if given. TerminatorInst *SplitBlockAndInsertIfThen(Value *Cond, Instruction *SplitBefore, bool Unreachable, - MDNode *BranchWeights = nullptr); + MDNode *BranchWeights = nullptr, + DominatorTree *DT = nullptr); /// SplitBlockAndInsertIfThenElse is similar to SplitBlockAndInsertIfThen, /// but also creates the ElseBlock. diff --git a/include/llvm/Transforms/Utils/Local.h b/include/llvm/Transforms/Utils/Local.h index 6f642692c7d4..c0c690664a9c 100644 --- a/include/llvm/Transforms/Utils/Local.h +++ b/include/llvm/Transforms/Utils/Local.h @@ -148,7 +148,7 @@ bool FlattenCFG(BasicBlock *BB, AliasAnalysis *AA = nullptr); /// and if a predecessor branches to us and one of our successors, fold the /// setcc into the predecessor and use logical operations to pick the right /// destination. -bool FoldBranchToCommonDest(BranchInst *BI); +bool FoldBranchToCommonDest(BranchInst *BI, const DataLayout *DL = nullptr); /// DemoteRegToStack - This function takes a virtual register computed by an /// Instruction and replaces it with a slot in the stack frame, allocated via diff --git a/include/llvm/Transforms/Utils/LoopUtils.h b/include/llvm/Transforms/Utils/LoopUtils.h index ee26d834c2a0..7e3a74aae93c 100644 --- a/include/llvm/Transforms/Utils/LoopUtils.h +++ b/include/llvm/Transforms/Utils/LoopUtils.h @@ -17,6 +17,7 @@ namespace llvm { class AliasAnalysis; class BasicBlock; +class DataLayout; class DominatorTree; class Loop; class LoopInfo; @@ -32,7 +33,8 @@ BasicBlock *InsertPreheaderForLoop(Loop *L, Pass *P); /// will optionally update \c AliasAnalysis and \c ScalarEvolution analyses if /// passed into it. bool simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, Pass *PP, - AliasAnalysis *AA = nullptr, ScalarEvolution *SE = nullptr); + AliasAnalysis *AA = nullptr, ScalarEvolution *SE = nullptr, + const DataLayout *DL = nullptr); /// \brief Put loop into LCSSA form. 
/// diff --git a/include/llvm/Transforms/Utils/SpecialCaseList.h b/include/llvm/Transforms/Utils/SpecialCaseList.h deleted file mode 100644 index 508a6df5dce4..000000000000 --- a/include/llvm/Transforms/Utils/SpecialCaseList.h +++ /dev/null @@ -1,114 +0,0 @@ -//===-- SpecialCaseList.h - special case list for sanitizers ----*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -//===----------------------------------------------------------------------===// -// -// This is a utility class for instrumentation passes (like AddressSanitizer -// or ThreadSanitizer) to avoid instrumenting some functions or global -// variables based on a user-supplied list. -// -// The list can also specify categories for specific globals, which can be used -// to instruct an instrumentation pass to treat certain functions or global -// variables in a specific way, such as by omitting certain aspects of -// instrumentation while keeping others, or informing the instrumentation pass -// that a specific uninstrumentable function has certain semantics, thus -// allowing the pass to instrument callers according to those semantics. -// -// For example, AddressSanitizer uses the "init" category for globals whose -// initializers should not be instrumented, but which in all other respects -// should be instrumented. -// -// Each line contains a prefix, followed by a colon and a wild card expression, -// followed optionally by an equals sign and an instrumentation-specific -// category. Empty lines and lines starting with "#" are ignored. -// --- -// # Blacklisted items: -// fun:*_ZN4base6subtle* -// global:*global_with_bad_access_or_initialization* -// global:*global_with_initialization_issues*=init -// type:*Namespace::ClassName*=init -// src:file_with_tricky_code.cc -// src:ignore-global-initializers-issues.cc=init -// -// # Functions with pure functional semantics: -// fun:cos=functional -// fun:sin=functional -// --- -// Note that the wild card is in fact an llvm::Regex, but * is automatically -// replaced with .* -// This is similar to the "ignore" feature of ThreadSanitizer. -// http://code.google.com/p/data-race-test/wiki/ThreadSanitizerIgnores -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_TRANSFORMS_UTILS_SPECIALCASELIST_H -#define LLVM_TRANSFORMS_UTILS_SPECIALCASELIST_H - -#include "llvm/ADT/StringMap.h" - -namespace llvm { -class Function; -class GlobalAlias; -class GlobalVariable; -class MemoryBuffer; -class Module; -class Regex; -class StringRef; - -class SpecialCaseList { - public: - /// Parses the special case list from a file. If Path is empty, returns - /// an empty special case list. On failure, returns 0 and writes an error - /// message to string. - static SpecialCaseList *create(const StringRef Path, std::string &Error); - /// Parses the special case list from a memory buffer. On failure, returns - /// 0 and writes an error message to string. - static SpecialCaseList *create(const MemoryBuffer *MB, std::string &Error); - /// Parses the special case list from a file. On failure, reports a fatal - /// error. - static SpecialCaseList *createOrDie(const StringRef Path); - - ~SpecialCaseList(); - - /// Returns whether either this function or its source file are listed in the - /// given category, which may be omitted to search the empty category. 
- bool isIn(const Function &F, const StringRef Category = StringRef()) const; - - /// Returns whether this global, its type or its source file are listed in the - /// given category, which may be omitted to search the empty category. - bool isIn(const GlobalVariable &G, - const StringRef Category = StringRef()) const; - - /// Returns whether this global alias is listed in the given category, which - /// may be omitted to search the empty category. - /// - /// If GA aliases a function, the alias's name is matched as a function name - /// would be. Similarly, aliases of globals are matched like globals. - bool isIn(const GlobalAlias &GA, - const StringRef Category = StringRef()) const; - - /// Returns whether this module is listed in the given category, which may be - /// omitted to search the empty category. - bool isIn(const Module &M, const StringRef Category = StringRef()) const; - - private: - SpecialCaseList(SpecialCaseList const &) LLVM_DELETED_FUNCTION; - SpecialCaseList &operator=(SpecialCaseList const &) LLVM_DELETED_FUNCTION; - - struct Entry; - StringMap > Entries; - - SpecialCaseList(); - /// Parses just-constructed SpecialCaseList entries from a memory buffer. - bool parse(const MemoryBuffer *MB, std::string &Error); - - bool inSectionCategory(const StringRef Section, const StringRef Query, - const StringRef Category) const; -}; - -} // namespace llvm - -#endif // LLVM_TRANSFORMS_UTILS_SPECIALCASELIST_H diff --git a/lib/Analysis/AliasAnalysis.cpp b/lib/Analysis/AliasAnalysis.cpp index 57237e59e82b..8b8106b950a0 100644 --- a/lib/Analysis/AliasAnalysis.cpp +++ b/lib/Analysis/AliasAnalysis.cpp @@ -60,6 +60,13 @@ bool AliasAnalysis::pointsToConstantMemory(const Location &Loc, return AA->pointsToConstantMemory(Loc, OrLocal); } +AliasAnalysis::Location +AliasAnalysis::getArgLocation(ImmutableCallSite CS, unsigned ArgIdx, + AliasAnalysis::ModRefResult &Mask) { + assert(AA && "AA didn't call InitializeAliasAnalysis in its run method!"); + return AA->getArgLocation(CS, ArgIdx, Mask); +} + void AliasAnalysis::deleteValue(Value *V) { assert(AA && "AA didn't call InitializeAliasAnalysis in its run method!"); AA->deleteValue(V); @@ -91,22 +98,26 @@ AliasAnalysis::getModRefInfo(ImmutableCallSite CS, if (onlyAccessesArgPointees(MRB)) { bool doesAlias = false; + ModRefResult AllArgsMask = NoModRef; if (doesAccessArgPointees(MRB)) { - MDNode *CSTag = CS.getInstruction()->getMetadata(LLVMContext::MD_tbaa); for (ImmutableCallSite::arg_iterator AI = CS.arg_begin(), AE = CS.arg_end(); AI != AE; ++AI) { const Value *Arg = *AI; if (!Arg->getType()->isPointerTy()) continue; - Location CSLoc(Arg, UnknownSize, CSTag); + ModRefResult ArgMask; + Location CSLoc = + getArgLocation(CS, (unsigned) std::distance(CS.arg_begin(), AI), + ArgMask); if (!isNoAlias(CSLoc, Loc)) { doesAlias = true; - break; + AllArgsMask = ModRefResult(AllArgsMask | ArgMask); } } } if (!doesAlias) return NoModRef; + Mask = ModRefResult(Mask & AllArgsMask); } // If Loc is a constant memory location, the call definitely could not @@ -150,14 +161,23 @@ AliasAnalysis::getModRefInfo(ImmutableCallSite CS1, ImmutableCallSite CS2) { if (onlyAccessesArgPointees(CS2B)) { AliasAnalysis::ModRefResult R = NoModRef; if (doesAccessArgPointees(CS2B)) { - MDNode *CS2Tag = CS2.getInstruction()->getMetadata(LLVMContext::MD_tbaa); for (ImmutableCallSite::arg_iterator I = CS2.arg_begin(), E = CS2.arg_end(); I != E; ++I) { const Value *Arg = *I; if (!Arg->getType()->isPointerTy()) continue; - Location CS2Loc(Arg, UnknownSize, CS2Tag); - R = 
ModRefResult((R | getModRefInfo(CS1, CS2Loc)) & Mask); + ModRefResult ArgMask; + Location CS2Loc = + getArgLocation(CS2, (unsigned) std::distance(CS2.arg_begin(), I), + ArgMask); + // ArgMask indicates what CS2 might do to CS2Loc, and the dependence of + // CS1 on that location is the inverse. + if (ArgMask == Mod) + ArgMask = ModRef; + else if (ArgMask == Ref) + ArgMask = Mod; + + R = ModRefResult((R | (getModRefInfo(CS1, CS2Loc) & ArgMask)) & Mask); if (R == Mask) break; } @@ -170,21 +190,28 @@ AliasAnalysis::getModRefInfo(ImmutableCallSite CS1, ImmutableCallSite CS2) { if (onlyAccessesArgPointees(CS1B)) { AliasAnalysis::ModRefResult R = NoModRef; if (doesAccessArgPointees(CS1B)) { - MDNode *CS1Tag = CS1.getInstruction()->getMetadata(LLVMContext::MD_tbaa); for (ImmutableCallSite::arg_iterator I = CS1.arg_begin(), E = CS1.arg_end(); I != E; ++I) { const Value *Arg = *I; if (!Arg->getType()->isPointerTy()) continue; - Location CS1Loc(Arg, UnknownSize, CS1Tag); - if (getModRefInfo(CS2, CS1Loc) != NoModRef) { - R = Mask; + ModRefResult ArgMask; + Location CS1Loc = + getArgLocation(CS1, (unsigned) std::distance(CS1.arg_begin(), I), + ArgMask); + // ArgMask indicates what CS1 might do to CS1Loc; if CS1 might Mod + // CS1Loc, then we care about either a Mod or a Ref by CS2. If CS1 + // might Ref, then we care only about a Mod by CS2. + ModRefResult ArgR = getModRefInfo(CS2, CS1Loc); + if (((ArgMask & Mod) != NoModRef && (ArgR & ModRef) != NoModRef) || + ((ArgMask & Ref) != NoModRef && (ArgR & Mod) != NoModRef)) + R = ModRefResult((R | ArgMask) & Mask); + + if (R == Mask) break; - } } } - if (R == NoModRef) - return R; + return R; } // If this is the end of the chain, don't forward. @@ -361,53 +388,6 @@ AliasAnalysis::getModRefInfo(const AtomicRMWInst *RMW, const Location &Loc) { return ModRef; } -namespace { - /// Only find pointer captures which happen before the given instruction. Uses - /// the dominator tree to determine whether one instruction is before another. - /// Only support the case where the Value is defined in the same basic block - /// as the given instruction and the use. - struct CapturesBefore : public CaptureTracker { - CapturesBefore(const Instruction *I, DominatorTree *DT) - : BeforeHere(I), DT(DT), Captured(false) {} - - void tooManyUses() override { Captured = true; } - - bool shouldExplore(const Use *U) override { - Instruction *I = cast(U->getUser()); - BasicBlock *BB = I->getParent(); - // We explore this usage only if the usage can reach "BeforeHere". - // If use is not reachable from entry, there is no need to explore. - if (BeforeHere != I && !DT->isReachableFromEntry(BB)) - return false; - // If the value is defined in the same basic block as use and BeforeHere, - // there is no need to explore the use if BeforeHere dominates use. - // Check whether there is a path from I to BeforeHere. - if (BeforeHere != I && DT->dominates(BeforeHere, I) && - !isPotentiallyReachable(I, BeforeHere, DT)) - return false; - return true; - } - - bool captured(const Use *U) override { - Instruction *I = cast(U->getUser()); - BasicBlock *BB = I->getParent(); - // Same logic as in shouldExplore. - if (BeforeHere != I && !DT->isReachableFromEntry(BB)) - return false; - if (BeforeHere != I && DT->dominates(BeforeHere, I) && - !isPotentiallyReachable(I, BeforeHere, DT)) - return false; - Captured = true; - return true; - } - - const Instruction *BeforeHere; - DominatorTree *DT; - - bool Captured; - }; -} - // FIXME: this is really just shoring-up a deficiency in alias analysis. 
 // BasicAA isn't willing to spend linear time determining whether an alloca
 // was captured before or after this particular call, while we are. However,
@@ -427,9 +407,9 @@ AliasAnalysis::callCapturesBefore(const Instruction *I,
   if (!CS.getInstruction() || CS.getInstruction() == Object)
     return AliasAnalysis::ModRef;
 
-  CapturesBefore CB(I, DT);
-  llvm::PointerMayBeCaptured(Object, &CB);
-  if (CB.Captured)
+  if (llvm::PointerMayBeCapturedBefore(Object, /* ReturnCaptures */ true,
+                                       /* StoreCaptures */ true, I, DT,
+                                       /* include Object */ true))
     return AliasAnalysis::ModRef;
 
   unsigned ArgNo = 0;
@@ -555,3 +535,14 @@ bool llvm::isIdentifiedObject(const Value *V) {
     return A->hasNoAliasAttr() || A->hasByValAttr();
   return false;
 }
+
+/// isIdentifiedFunctionLocal - Return true if V is unambiguously identified
+/// at the function-level. Different IdentifiedFunctionLocals can't alias.
+/// Further, an IdentifiedFunctionLocal can not alias with any function
+/// arguments other than itself, which is not necessarily true for
+/// IdentifiedObjects.
+bool llvm::isIdentifiedFunctionLocal(const Value *V)
+{
+  return isa<AllocaInst>(V) || isNoAliasCall(V) || isNoAliasArgument(V);
+}
+
diff --git a/lib/Analysis/Analysis.cpp b/lib/Analysis/Analysis.cpp
index ade940a7d300..c58012f3a8c3 100644
--- a/lib/Analysis/Analysis.cpp
+++ b/lib/Analysis/Analysis.cpp
@@ -57,7 +57,7 @@ void llvm::initializeAnalysis(PassRegistry &Registry) {
   initializeMemoryDependenceAnalysisPass(Registry);
   initializeModuleDebugInfoPrinterPass(Registry);
   initializePostDominatorTreePass(Registry);
-  initializeRegionInfoPass(Registry);
+  initializeRegionInfoPassPass(Registry);
   initializeRegionViewerPass(Registry);
   initializeRegionPrinterPass(Registry);
   initializeRegionOnlyViewerPass(Registry);
diff --git a/lib/Analysis/BasicAliasAnalysis.cpp b/lib/Analysis/BasicAliasAnalysis.cpp
index fe90b84533dc..38ec52d6b9ac 100644
--- a/lib/Analysis/BasicAliasAnalysis.cpp
+++ b/lib/Analysis/BasicAliasAnalysis.cpp
@@ -156,17 +156,6 @@ static bool isObjectSize(const Value *V, uint64_t Size,
   return ObjectSize != AliasAnalysis::UnknownSize && ObjectSize == Size;
 }
 
-/// isIdentifiedFunctionLocal - Return true if V is umabigously identified
-/// at the function-level. Different IdentifiedFunctionLocals can't alias.
-/// Further, an IdentifiedFunctionLocal can not alias with any function
-/// arguments other than itself, which is not necessarily true for
-/// IdentifiedObjects.
-static bool isIdentifiedFunctionLocal(const Value *V)
-{
-  return isa<AllocaInst>(V) || isNoAliasCall(V) || isNoAliasArgument(V);
-}
-
-
 //===----------------------------------------------------------------------===//
 // GetElementPtr Instruction Decomposition and Analysis
 //===----------------------------------------------------------------------===//
@@ -309,7 +298,8 @@ DecomposeGEPExpression(const Value *V, int64_t &BaseOffs,
       return V;
     }
 
-    if (Op->getOpcode() == Instruction::BitCast) {
+    if (Op->getOpcode() == Instruction::BitCast ||
+        Op->getOpcode() == Instruction::AddrSpaceCast) {
       V = Op->getOperand(0);
       continue;
    }
@@ -490,6 +480,10 @@ namespace {
     /// global) or not.
     bool pointsToConstantMemory(const Location &Loc, bool OrLocal) override;
 
+    /// Get the location associated with a pointer argument of a callsite.
+    Location getArgLocation(ImmutableCallSite CS, unsigned ArgIdx,
+                            ModRefResult &Mask) override;
+
     /// getModRefBehavior - Return the behavior when calling the given
     /// call site.
     ModRefBehavior getModRefBehavior(ImmutableCallSite CS) override;
@@ -653,6 +647,21 @@ BasicAliasAnalysis::pointsToConstantMemory(const Location &Loc, bool OrLocal) {
   return Worklist.empty();
 }
 
+static bool isMemsetPattern16(const Function *MS,
+                              const TargetLibraryInfo &TLI) {
+  if (TLI.has(LibFunc::memset_pattern16) &&
+      MS->getName() == "memset_pattern16") {
+    FunctionType *MemsetType = MS->getFunctionType();
+    if (!MemsetType->isVarArg() && MemsetType->getNumParams() == 3 &&
+        isa<PointerType>(MemsetType->getParamType(0)) &&
+        isa<PointerType>(MemsetType->getParamType(1)) &&
+        isa<IntegerType>(MemsetType->getParamType(2)))
+      return true;
+  }
+
+  return false;
+}
+
 /// getModRefBehavior - Return the behavior when calling the given call site.
 AliasAnalysis::ModRefBehavior
 BasicAliasAnalysis::getModRefBehavior(ImmutableCallSite CS) {
@@ -692,10 +701,93 @@ BasicAliasAnalysis::getModRefBehavior(const Function *F) {
   if (F->onlyReadsMemory())
     Min = OnlyReadsMemory;
 
+  const TargetLibraryInfo &TLI = getAnalysis<TargetLibraryInfo>();
+  if (isMemsetPattern16(F, TLI))
+    Min = OnlyAccessesArgumentPointees;
+
   // Otherwise be conservative.
   return ModRefBehavior(AliasAnalysis::getModRefBehavior(F) & Min);
 }
 
+AliasAnalysis::Location
+BasicAliasAnalysis::getArgLocation(ImmutableCallSite CS, unsigned ArgIdx,
+                                   ModRefResult &Mask) {
+  Location Loc = AliasAnalysis::getArgLocation(CS, ArgIdx, Mask);
+  const TargetLibraryInfo &TLI = getAnalysis<TargetLibraryInfo>();
+  const IntrinsicInst *II = dyn_cast<IntrinsicInst>(CS.getInstruction());
+  if (II != nullptr)
+    switch (II->getIntrinsicID()) {
+    default: break;
+    case Intrinsic::memset:
+    case Intrinsic::memcpy:
+    case Intrinsic::memmove: {
+      assert((ArgIdx == 0 || ArgIdx == 1) &&
+             "Invalid argument index for memory intrinsic");
+      if (ConstantInt *LenCI = dyn_cast<ConstantInt>(II->getArgOperand(2)))
+        Loc.Size = LenCI->getZExtValue();
+      assert(Loc.Ptr == II->getArgOperand(ArgIdx) &&
+             "Memory intrinsic location pointer not argument?");
+      Mask = ArgIdx ? Ref : Mod;
+      break;
+    }
+    case Intrinsic::lifetime_start:
+    case Intrinsic::lifetime_end:
+    case Intrinsic::invariant_start: {
+      assert(ArgIdx == 1 && "Invalid argument index");
+      assert(Loc.Ptr == II->getArgOperand(ArgIdx) &&
+             "Intrinsic location pointer not argument?");
+      Loc.Size = cast<ConstantInt>(II->getArgOperand(0))->getZExtValue();
+      break;
+    }
+    case Intrinsic::invariant_end: {
+      assert(ArgIdx == 2 && "Invalid argument index");
+      assert(Loc.Ptr == II->getArgOperand(ArgIdx) &&
+             "Intrinsic location pointer not argument?");
+      Loc.Size = cast<ConstantInt>(II->getArgOperand(1))->getZExtValue();
+      break;
+    }
+    case Intrinsic::arm_neon_vld1: {
+      assert(ArgIdx == 0 && "Invalid argument index");
+      assert(Loc.Ptr == II->getArgOperand(ArgIdx) &&
+             "Intrinsic location pointer not argument?");
+      // LLVM's vld1 and vst1 intrinsics currently only support a single
+      // vector register.
+      if (DL)
+        Loc.Size = DL->getTypeStoreSize(II->getType());
+      break;
+    }
+    case Intrinsic::arm_neon_vst1: {
+      assert(ArgIdx == 0 && "Invalid argument index");
+      assert(Loc.Ptr == II->getArgOperand(ArgIdx) &&
+             "Intrinsic location pointer not argument?");
+      if (DL)
+        Loc.Size = DL->getTypeStoreSize(II->getArgOperand(1)->getType());
+      break;
+    }
+    }
+
+  // We can bound the aliasing properties of memset_pattern16 just as we can
+  // for memcpy/memset.  This is particularly important because the
+  // LoopIdiomRecognizer likes to turn loops into calls to memset_pattern16
+  // whenever possible.
+  else if (CS.getCalledFunction() &&
+           isMemsetPattern16(CS.getCalledFunction(), TLI)) {
+    assert((ArgIdx == 0 || ArgIdx == 1) &&
+           "Invalid argument index for memset_pattern16");
+    if (ArgIdx == 1)
+      Loc.Size = 16;
+    else if (const ConstantInt *LenCI =
+             dyn_cast<ConstantInt>(CS.getArgument(2)))
+      Loc.Size = LenCI->getZExtValue();
+    assert(Loc.Ptr == CS.getArgument(ArgIdx) &&
+           "memset_pattern16 location pointer not argument?");
+    Mask = ArgIdx ? Ref : Mod;
+  }
+  // FIXME: Handle memset_pattern4 and memset_pattern8 also.
+
+  return Loc;
+}
+
 /// getModRefInfo - Check to see if the specified callsite can clobber the
 /// specified memory object.  Since we only look at local properties of this
 /// function, we really can't say much about this query.  We do, however, use
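The net effect of getArgLocation for memset_pattern16(dest, pattern16, len) is that the destination is written for len bytes while the 16-byte pattern is only ever read. A hypothetical, self-contained model of those argument locations (the struct and helper names are invented, not LLVM's):

#include <cstdint>
#include <cstdio>

enum ModRefResult { NoModRef = 0, Ref = 1, Mod = 2, ModRef = Ref | Mod };

struct ArgLocation {
  unsigned ArgIdx;
  uint64_t Size;       // UINT64_MAX would stand in for UnknownSize.
  ModRefResult Mask;
};

static ArgLocation getMemsetPattern16ArgLocation(unsigned ArgIdx,
                                                 uint64_t Len) {
  // Argument 1 is the pattern: always exactly 16 bytes, read-only.
  // Argument 0 is the destination: Len bytes, write-only.
  if (ArgIdx == 1)
    return {ArgIdx, 16, Ref};
  return {ArgIdx, Len, Mod};
}

int main() {
  ArgLocation Dst = getMemsetPattern16ArgLocation(0, 64);
  ArgLocation Pat = getMemsetPattern16ArgLocation(1, 64);
  std::printf("dest: size=%llu mask=%d, pattern: size=%llu mask=%d\n",
              (unsigned long long)Dst.Size, Dst.Mask,
              (unsigned long long)Pat.Size, Pat.Mask);
}

Centralizing this knowledge in getArgLocation is what allows the long, hand-rolled intrinsic handling in getModRefInfo below to be deleted.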
@@ -748,124 +840,8 @@ BasicAliasAnalysis::getModRefInfo(ImmutableCallSite CS,
     return NoModRef;
   }
 
-  const TargetLibraryInfo &TLI = getAnalysis<TargetLibraryInfo>();
-  ModRefResult Min = ModRef;
-
-  // Finally, handle specific knowledge of intrinsics.
-  const IntrinsicInst *II = dyn_cast<IntrinsicInst>(CS.getInstruction());
-  if (II != nullptr)
-    switch (II->getIntrinsicID()) {
-    default: break;
-    case Intrinsic::memcpy:
-    case Intrinsic::memmove: {
-      uint64_t Len = UnknownSize;
-      if (ConstantInt *LenCI = dyn_cast<ConstantInt>(II->getArgOperand(2)))
-        Len = LenCI->getZExtValue();
-      Value *Dest = II->getArgOperand(0);
-      Value *Src = II->getArgOperand(1);
-      // If it can't overlap the source dest, then it doesn't modref the loc.
-      if (isNoAlias(Location(Dest, Len), Loc)) {
-        if (isNoAlias(Location(Src, Len), Loc))
-          return NoModRef;
-        // If it can't overlap the dest, then worst case it reads the loc.
-        Min = Ref;
-      } else if (isNoAlias(Location(Src, Len), Loc)) {
-        // If it can't overlap the source, then worst case it mutates the loc.
-        Min = Mod;
-      }
-      break;
-    }
-    case Intrinsic::memset:
-      // Since memset is 'accesses arguments' only, the AliasAnalysis base class
-      // will handle it for the variable length case.
-      if (ConstantInt *LenCI = dyn_cast<ConstantInt>(II->getArgOperand(2))) {
-        uint64_t Len = LenCI->getZExtValue();
-        Value *Dest = II->getArgOperand(0);
-        if (isNoAlias(Location(Dest, Len), Loc))
-          return NoModRef;
-      }
-      // We know that memset doesn't load anything.
-      Min = Mod;
-      break;
-    case Intrinsic::lifetime_start:
-    case Intrinsic::lifetime_end:
-    case Intrinsic::invariant_start: {
-      uint64_t PtrSize =
-        cast<ConstantInt>(II->getArgOperand(0))->getZExtValue();
-      if (isNoAlias(Location(II->getArgOperand(1),
-                             PtrSize,
-                             II->getMetadata(LLVMContext::MD_tbaa)),
-                    Loc))
-        return NoModRef;
-      break;
-    }
-    case Intrinsic::invariant_end: {
-      uint64_t PtrSize =
-        cast<ConstantInt>(II->getArgOperand(1))->getZExtValue();
-      if (isNoAlias(Location(II->getArgOperand(2),
-                             PtrSize,
-                             II->getMetadata(LLVMContext::MD_tbaa)),
-                    Loc))
-        return NoModRef;
-      break;
-    }
-    case Intrinsic::arm_neon_vld1: {
-      // LLVM's vld1 and vst1 intrinsics currently only support a single
-      // vector register.
-      uint64_t Size =
-        DL ? DL->getTypeStoreSize(II->getType()) : UnknownSize;
-      if (isNoAlias(Location(II->getArgOperand(0), Size,
-                             II->getMetadata(LLVMContext::MD_tbaa)),
-                    Loc))
-        return NoModRef;
-      break;
-    }
-    case Intrinsic::arm_neon_vst1: {
-      uint64_t Size =
-        DL ? DL->getTypeStoreSize(II->getArgOperand(1)->getType()) : UnknownSize;
-      if (isNoAlias(Location(II->getArgOperand(0), Size,
-                             II->getMetadata(LLVMContext::MD_tbaa)),
-                    Loc))
-        return NoModRef;
-      break;
-    }
-    }
-
-  // We can bound the aliasing properties of memset_pattern16 just as we can
-  // for memcpy/memset.  This is particularly important because the
-  // LoopIdiomRecognizer likes to turn loops into calls to memset_pattern16
-  // whenever possible.
-  else if (TLI.has(LibFunc::memset_pattern16) &&
-           CS.getCalledFunction() &&
-           CS.getCalledFunction()->getName() == "memset_pattern16") {
-    const Function *MS = CS.getCalledFunction();
-    FunctionType *MemsetType = MS->getFunctionType();
-    if (!MemsetType->isVarArg() && MemsetType->getNumParams() == 3 &&
-        isa<PointerType>(MemsetType->getParamType(0)) &&
-        isa<PointerType>(MemsetType->getParamType(1)) &&
-        isa<IntegerType>(MemsetType->getParamType(2))) {
-      uint64_t Len = UnknownSize;
-      if (const ConstantInt *LenCI = dyn_cast<ConstantInt>(CS.getArgument(2)))
-        Len = LenCI->getZExtValue();
-      const Value *Dest = CS.getArgument(0);
-      const Value *Src = CS.getArgument(1);
-      // If it can't overlap the source dest, then it doesn't modref the loc.
-      if (isNoAlias(Location(Dest, Len), Loc)) {
-        // Always reads 16 bytes of the source.
-        if (isNoAlias(Location(Src, 16), Loc))
-          return NoModRef;
-        // If it can't overlap the dest, then worst case it reads the loc.
-        Min = Ref;
-        // Always reads 16 bytes of the source.
-      } else if (isNoAlias(Location(Src, 16), Loc)) {
-        // If it can't overlap the source, then worst case it mutates the loc.
-        Min = Mod;
-      }
-    }
-  }
-
   // The AliasAnalysis base class has some smarts, lets use them.
-  return ModRefResult(AliasAnalysis::getModRefInfo(CS, Loc) & Min);
+  return AliasAnalysis::getModRefInfo(CS, Loc);
 }
 
 /// aliasGEP - Provide a bunch of ad-hoc rules to disambiguate a GEP instruction
diff --git a/lib/Analysis/BlockFrequencyInfoImpl.cpp b/lib/Analysis/BlockFrequencyInfoImpl.cpp
index 87d93a4bd5f9..3203c371648d 100644
--- a/lib/Analysis/BlockFrequencyInfoImpl.cpp
+++ b/lib/Analysis/BlockFrequencyInfoImpl.cpp
@@ -12,7 +12,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Analysis/BlockFrequencyInfoImpl.h"
-#include "llvm/ADT/APFloat.h"
 #include "llvm/ADT/SCCIterator.h"
 #include "llvm/Support/raw_ostream.h"
 #include <deque>
@@ -22,300 +21,10 @@ using namespace llvm::bfi_detail;
 
 #define DEBUG_TYPE "block-freq"
 
-//===----------------------------------------------------------------------===//
-//
-// UnsignedFloat implementation.
-//
-//===----------------------------------------------------------------------===//
-#ifndef _MSC_VER
-const int32_t UnsignedFloatBase::MaxExponent;
-const int32_t UnsignedFloatBase::MinExponent;
-#endif
-
-static void appendDigit(std::string &Str, unsigned D) {
-  assert(D < 10);
-  Str += '0' + D % 10;
-}
-
-static void appendNumber(std::string &Str, uint64_t N) {
-  while (N) {
-    appendDigit(Str, N % 10);
-    N /= 10;
-  }
-}
-
-static bool doesRoundUp(char Digit) {
-  switch (Digit) {
-  case '5':
-  case '6':
-  case '7':
-  case '8':
-  case '9':
-    return true;
-  default:
-    return false;
-  }
-}
-
-static std::string toStringAPFloat(uint64_t D, int E, unsigned Precision) {
-  assert(E >= UnsignedFloatBase::MinExponent);
-  assert(E <= UnsignedFloatBase::MaxExponent);
-
-  // Find a new E, but don't let it increase past MaxExponent.
-  int LeadingZeros = UnsignedFloatBase::countLeadingZeros64(D);
-  int NewE = std::min(UnsignedFloatBase::MaxExponent, E + 63 - LeadingZeros);
-  int Shift = 63 - (NewE - E);
-  assert(Shift <= LeadingZeros);
-  assert(Shift == LeadingZeros || NewE == UnsignedFloatBase::MaxExponent);
-  D <<= Shift;
-  E = NewE;
-
-  // Check for a denormal.
-  unsigned AdjustedE = E + 16383;
-  if (!(D >> 63)) {
-    assert(E == UnsignedFloatBase::MaxExponent);
-    AdjustedE = 0;
-  }
-
-  // Build the float and print it.
-  uint64_t RawBits[2] = {D, AdjustedE};
-  APFloat Float(APFloat::x87DoubleExtended, APInt(80, RawBits));
-  SmallVector<char, 16> Chars;
-  Float.toString(Chars, Precision, 0);
-  return std::string(Chars.begin(), Chars.end());
-}
-
-static std::string stripTrailingZeros(const std::string &Float) {
-  size_t NonZero = Float.find_last_not_of('0');
-  assert(NonZero != std::string::npos && "no . in floating point string");
-
-  if (Float[NonZero] == '.')
-    ++NonZero;
-
-  return Float.substr(0, NonZero + 1);
-}
-
-std::string UnsignedFloatBase::toString(uint64_t D, int16_t E, int Width,
-                                        unsigned Precision) {
-  if (!D)
-    return "0.0";
-
-  // Canonicalize exponent and digits.
-  uint64_t Above0 = 0;
-  uint64_t Below0 = 0;
-  uint64_t Extra = 0;
-  int ExtraShift = 0;
-  if (E == 0) {
-    Above0 = D;
-  } else if (E > 0) {
-    if (int Shift = std::min(int16_t(countLeadingZeros64(D)), E)) {
-      D <<= Shift;
-      E -= Shift;
-
-      if (!E)
-        Above0 = D;
-    }
-  } else if (E > -64) {
-    Above0 = D >> -E;
-    Below0 = D << (64 + E);
-  } else if (E > -120) {
-    Below0 = D >> (-E - 64);
-    Extra = D << (128 + E);
-    ExtraShift = -64 - E;
-  }
-
-  // Fall back on APFloat for very small and very large numbers.
-  if (!Above0 && !Below0)
-    return toStringAPFloat(D, E, Precision);
-
-  // Append the digits before the decimal.
-  std::string Str;
-  size_t DigitsOut = 0;
-  if (Above0) {
-    appendNumber(Str, Above0);
-    DigitsOut = Str.size();
-  } else
-    appendDigit(Str, 0);
-  std::reverse(Str.begin(), Str.end());
-
-  // Return early if there's nothing after the decimal.
-  if (!Below0)
-    return Str + ".0";
-
-  // Append the decimal and beyond.
-  Str += '.';
-  uint64_t Error = UINT64_C(1) << (64 - Width);
-
-  // We need to shift Below0 to the right to make space for calculating
-  // digits.  Save the precision we're losing in Extra.
-  Extra = (Below0 & 0xf) << 56 | (Extra >> 8);
-  Below0 >>= 4;
-  size_t SinceDot = 0;
-  size_t AfterDot = Str.size();
-  do {
-    if (ExtraShift) {
-      --ExtraShift;
-      Error *= 5;
-    } else
-      Error *= 10;
-
-    Below0 *= 10;
-    Extra *= 10;
-    Below0 += (Extra >> 60);
-    Extra = Extra & (UINT64_MAX >> 4);
-    appendDigit(Str, Below0 >> 60);
-    Below0 = Below0 & (UINT64_MAX >> 4);
-    if (DigitsOut || Str.back() != '0')
-      ++DigitsOut;
-    ++SinceDot;
-  } while (Error && (Below0 << 4 | Extra >> 60) >= Error / 2 &&
-           (!Precision || DigitsOut <= Precision || SinceDot < 2));
-
-  // Return early for maximum precision.
-  if (!Precision || DigitsOut <= Precision)
-    return stripTrailingZeros(Str);
-
-  // Find where to truncate.
-  size_t Truncate =
-      std::max(Str.size() - (DigitsOut - Precision), AfterDot + 1);
-
-  // Check if there's anything to truncate.
-  if (Truncate >= Str.size())
-    return stripTrailingZeros(Str);
-
-  bool Carry = doesRoundUp(Str[Truncate]);
-  if (!Carry)
-    return stripTrailingZeros(Str.substr(0, Truncate));
-
-  // Round with the first truncated digit.
-  for (std::string::reverse_iterator I(Str.begin() + Truncate), E = Str.rend();
-       I != E; ++I) {
-    if (*I == '.')
-      continue;
-    if (*I == '9') {
-      *I = '0';
-      continue;
-    }
-
-    ++*I;
-    Carry = false;
-    break;
-  }
-
-  // Add "1" in front if we still need to carry.
-  return stripTrailingZeros(std::string(Carry, '1') + Str.substr(0, Truncate));
-}
-
-raw_ostream &UnsignedFloatBase::print(raw_ostream &OS, uint64_t D, int16_t E,
-                                      int Width, unsigned Precision) {
-  return OS << toString(D, E, Width, Precision);
-}
-
-void UnsignedFloatBase::dump(uint64_t D, int16_t E, int Width) {
-  print(dbgs(), D, E, Width, 0) << "[" << Width << ":" << D << "*2^" << E
-                                << "]";
-}
-
-static std::pair<uint64_t, int16_t>
-getRoundedFloat(uint64_t N, bool ShouldRound, int64_t Shift) {
-  if (ShouldRound)
-    if (!++N)
-      // Rounding caused an overflow.
-      return std::make_pair(UINT64_C(1), Shift + 64);
-  return std::make_pair(N, Shift);
-}
-
-std::pair<uint64_t, int16_t> UnsignedFloatBase::divide64(uint64_t Dividend,
                                                          uint64_t Divisor) {
-  // Input should be sanitized.
-  assert(Divisor);
-  assert(Dividend);
-
-  // Minimize size of divisor.
-  int16_t Shift = 0;
-  if (int Zeros = countTrailingZeros(Divisor)) {
-    Shift -= Zeros;
-    Divisor >>= Zeros;
-  }
-
-  // Check for powers of two.
-  if (Divisor == 1)
-    return std::make_pair(Dividend, Shift);
-
-  // Maximize size of dividend.
-  if (int Zeros = countLeadingZeros64(Dividend)) {
-    Shift -= Zeros;
-    Dividend <<= Zeros;
-  }
-
-  // Start with the result of a divide.
-  uint64_t Quotient = Dividend / Divisor;
-  Dividend %= Divisor;
-
-  // Continue building the quotient with long division.
-  //
-  // TODO: continue with larger digits.
-  while (!(Quotient >> 63) && Dividend) {
-    // Shift Dividend, and check for overflow.
-    bool IsOverflow = Dividend >> 63;
-    Dividend <<= 1;
-    --Shift;
-
-    // Divide.
-    bool DoesDivide = IsOverflow || Divisor <= Dividend;
-    Quotient = (Quotient << 1) | uint64_t(DoesDivide);
-    Dividend -= DoesDivide ? Divisor : 0;
-  }
-
-  // Round.
-  if (Dividend >= getHalf(Divisor))
-    if (!++Quotient)
-      // Rounding caused an overflow in Quotient.
-      return std::make_pair(UINT64_C(1), Shift + 64);
-
-  return getRoundedFloat(Quotient, Dividend >= getHalf(Divisor), Shift);
-}
-
-std::pair<uint64_t, int16_t> UnsignedFloatBase::multiply64(uint64_t L,
                                                            uint64_t R) {
-  // Separate into two 32-bit digits (U.L).
-  uint64_t UL = L >> 32, LL = L & UINT32_MAX, UR = R >> 32, LR = R & UINT32_MAX;
-
-  // Compute cross products.
-  uint64_t P1 = UL * UR, P2 = UL * LR, P3 = LL * UR, P4 = LL * LR;
-
-  // Sum into two 64-bit digits.
-  uint64_t Upper = P1, Lower = P4;
-  auto addWithCarry = [&](uint64_t N) {
-    uint64_t NewLower = Lower + (N << 32);
-    Upper += (N >> 32) + (NewLower < Lower);
-    Lower = NewLower;
-  };
-  addWithCarry(P2);
-  addWithCarry(P3);
-
-  // Check whether the upper digit is empty.
-  if (!Upper)
-    return std::make_pair(Lower, 0);
-
-  // Shift as little as possible to maximize precision.
-  unsigned LeadingZeros = countLeadingZeros64(Upper);
-  int16_t Shift = 64 - LeadingZeros;
-  if (LeadingZeros)
-    Upper = Upper << LeadingZeros | Lower >> Shift;
-  bool ShouldRound = Shift && (Lower & UINT64_C(1) << (Shift - 1));
-  return getRoundedFloat(Upper, ShouldRound, Shift);
-}
-
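The divide64 routine deleted above is ordinary shift-and-subtract long division producing a normalized (digits, exponent) pair. A simplified re-statement of the core loop (hypothetical standalone code; the pre-normalization and rounding steps of the original are omitted):

#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <utility>

static std::pair<uint64_t, int16_t> softDivide64(uint64_t Dividend,
                                                 uint64_t Divisor) {
  assert(Dividend && Divisor && "expects sanitized non-zero inputs");
  int16_t Shift = 0;

  uint64_t Quotient = Dividend / Divisor;
  Dividend %= Divisor;

  // Pull one more quotient bit per iteration until the mantissa is full:
  // shift a remainder bit in, then subtract the divisor if it fits.
  while (!(Quotient >> 63) && Dividend) {
    bool Overflow = Dividend >> 63;
    Dividend <<= 1;
    --Shift;
    bool DoesDivide = Overflow || Divisor <= Dividend;
    Quotient = (Quotient << 1) | uint64_t(DoesDivide);
    if (DoesDivide)
      Dividend -= Divisor;
  }
  return {Quotient, Shift};  // Quotient * 2^Shift ~= Dividend / Divisor
}

int main() {
  auto R = softDivide64(1, 3);
  std::printf("%llu * 2^%d ~= %g\n", (unsigned long long)R.first,
              (int)R.second, std::ldexp((double)R.first, R.second));
}

Note the overflow trick in the loop: if the remainder's top bit is set before the shift, the doubled remainder is certainly at least the divisor, so the subtraction still produces the right value under wrapping arithmetic.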
-//===----------------------------------------------------------------------===//
-//
-// BlockMass implementation.
-//
-//===----------------------------------------------------------------------===//
-UnsignedFloat BlockMass::toFloat() const {
+ScaledNumber<uint64_t> BlockMass::toScaled() const {
   if (isFull())
-    return UnsignedFloat(1, 0);
-  return UnsignedFloat(getMass() + 1, -64);
+    return ScaledNumber<uint64_t>(1, 0);
+  return ScaledNumber<uint64_t>(getMass() + 1, -64);
 }
 
 void BlockMass::dump() const { print(dbgs()); }
@@ -332,17 +41,12 @@ raw_ostream &BlockMass::print(raw_ostream &OS) const {
   return OS;
 }
 
-//===----------------------------------------------------------------------===//
-//
-// BlockFrequencyInfoImpl implementation.
-//
-//===----------------------------------------------------------------------===//
 namespace {
 
 typedef BlockFrequencyInfoImplBase::BlockNode BlockNode;
 typedef BlockFrequencyInfoImplBase::Distribution Distribution;
 typedef BlockFrequencyInfoImplBase::Distribution::WeightList WeightList;
-typedef BlockFrequencyInfoImplBase::Float Float;
+typedef BlockFrequencyInfoImplBase::Scaled64 Scaled64;
 typedef BlockFrequencyInfoImplBase::LoopData LoopData;
 typedef BlockFrequencyInfoImplBase::Weight Weight;
 typedef BlockFrequencyInfoImplBase::FrequencyData FrequencyData;
@@ -373,7 +77,8 @@ struct DitheringDistributer {
 
   BlockMass takeMass(uint32_t Weight);
 };
-}
+
+} // end namespace
 
 DitheringDistributer::DitheringDistributer(Distribution &Dist,
                                            const BlockMass &Mass) {
@@ -407,11 +112,7 @@ void Distribution::add(const BlockNode &Node, uint64_t Amount,
   Total = NewTotal;
 
   // Save the weight.
-  Weight W;
-  W.TargetNode = Node;
-  W.Amount = Amount;
-  W.Type = Type;
-  Weights.push_back(W);
+  Weights.push_back(Weight(Type, Node, Amount));
 }
 
 static void combineWeight(Weight &W, const Weight &OtherW) {
@@ -622,7 +323,7 @@ bool BlockFrequencyInfoImplBase::addLoopSuccessorsToDist(
 ///
 /// Gives the maximum number of estimated iterations allowed for a loop.  Very
 /// large numbers cause problems downstream (even within 64-bits).
-static Float getMaxLoopScale() { return Float(1, 12); }
+static Scaled64 getMaxLoopScale() { return Scaled64(1, 12); }
 
 /// \brief Compute the loop scale for a loop.
 void BlockFrequencyInfoImplBase::computeLoopScale(LoopData &Loop) {
@@ -634,7 +335,7 @@ void BlockFrequencyInfoImplBase::computeLoopScale(LoopData &Loop) {
   BlockMass ExitMass = BlockMass::getFull() - Loop.BackedgeMass;
 
   // Block scale stores the inverse of the scale.
-  Loop.Scale = ExitMass.toFloat().inverse();
+  Loop.Scale = ExitMass.toScaled().inverse();
 
   DEBUG(dbgs() << " - exit-mass = " << ExitMass << " ("
                << BlockMass::getFull() << " - " << Loop.BackedgeMass << ")\n"
@@ -708,15 +409,16 @@ void BlockFrequencyInfoImplBase::distributeMass(const BlockNode &Source,
 }
 
 static void convertFloatingToInteger(BlockFrequencyInfoImplBase &BFI,
-                                     const Float &Min, const Float &Max) {
+                                     const Scaled64 &Min, const Scaled64 &Max) {
   // Scale the Factor to a size that creates integers.  Ideally, integers would
   // be scaled so that Max == UINT64_MAX so that they can be best
   // differentiated.  However, the register allocator currently deals poorly
   // with large numbers.  Instead, push Min up a little from 1 to give some
   // room to differentiate small, unequal numbers.
   //
-  // TODO: fix issues downstream so that ScalingFactor can be Float(1,64)/Max.
-  Float ScalingFactor = Min.inverse();
+  // TODO: fix issues downstream so that ScalingFactor can be
+  // Scaled64(1,64)/Max.
+  Scaled64 ScalingFactor = Min.inverse();
   if ((Max / Min).lg() < 60)
     ScalingFactor <<= 3;
 
   // Apply the scale and convert to integers.
   DEBUG(dbgs() << "float-to-int: min = " << Min << ", max = " << Max
                << ", factor = " << ScalingFactor << "\n");
   for (size_t Index = 0; Index < BFI.Freqs.size(); ++Index) {
-    Float Scaled = BFI.Freqs[Index].Floating * ScalingFactor;
+    Scaled64 Scaled = BFI.Freqs[Index].Scaled * ScalingFactor;
     BFI.Freqs[Index].Integer = std::max(UINT64_C(1), Scaled.toInt<uint64_t>());
     DEBUG(dbgs() << " - " << BFI.getBlockName(Index) << ": float = "
-                 << BFI.Freqs[Index].Floating << ", scaled = " << Scaled
+                 << BFI.Freqs[Index].Scaled << ", scaled = " << Scaled
                  << ", int = " << BFI.Freqs[Index].Integer << "\n");
   }
 }
@@ -740,7 +442,7 @@ static void unwrapLoop(BlockFrequencyInfoImplBase &BFI, LoopData &Loop) {
   DEBUG(dbgs() << "unwrap-loop-package: " << BFI.getLoopName(Loop)
                << ": mass = " << Loop.Mass << ", scale = " << Loop.Scale
                << "\n");
-  Loop.Scale *= Loop.Mass.toFloat();
+  Loop.Scale *= Loop.Mass.toScaled();
   Loop.IsPackaged = false;
   DEBUG(dbgs() << "  => combined-scale = " << Loop.Scale << "\n");
 
@@ -749,9 +451,9 @@ static void unwrapLoop(BlockFrequencyInfoImplBase &BFI, LoopData &Loop) {
   // final head scale will be used for updated the rest of the members.
   for (const BlockNode &N : Loop.Nodes) {
     const auto &Working = BFI.Working[N.Index];
-    Float &F = Working.isAPackage() ? Working.getPackagedLoop()->Scale
-                                    : BFI.Freqs[N.Index].Floating;
-    Float New = Loop.Scale * F;
+    Scaled64 &F = Working.isAPackage() ? Working.getPackagedLoop()->Scale
+                                       : BFI.Freqs[N.Index].Scaled;
+    Scaled64 New = Loop.Scale * F;
     DEBUG(dbgs() << " - " << BFI.getBlockName(N) << ": " << F << " => " << New
                  << "\n");
     F = New;
@@ -761,7 +463,7 @@ static void unwrapLoop(BlockFrequencyInfoImplBase &BFI, LoopData &Loop) {
 void BlockFrequencyInfoImplBase::unwrapLoops() {
   // Set initial frequencies from loop-local masses.
   for (size_t Index = 0; Index < Working.size(); ++Index)
-    Freqs[Index].Floating = Working[Index].Mass.toFloat();
+    Freqs[Index].Scaled = Working[Index].Mass.toScaled();
 
   for (LoopData &Loop : Loops)
     unwrapLoop(*this, Loop);
@@ -770,12 +472,12 @@ void BlockFrequencyInfoImplBase::unwrapLoops() {
 void BlockFrequencyInfoImplBase::finalizeMetrics() {
   // Unwrap loop packages in reverse post-order, tracking min and max
   // frequencies.
-  auto Min = Float::getLargest();
-  auto Max = Float::getZero();
+  auto Min = Scaled64::getLargest();
+  auto Max = Scaled64::getZero();
   for (size_t Index = 0; Index < Working.size(); ++Index) {
     // Update min/max scale.
-    Min = std::min(Min, Freqs[Index].Floating);
-    Max = std::max(Max, Freqs[Index].Floating);
+    Min = std::min(Min, Freqs[Index].Scaled);
+    Max = std::max(Max, Freqs[Index].Scaled);
   }
 
   // Convert to integers.
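The conversion policy in convertFloatingToInteger is easy to lose in the rename churn: scale so the smallest frequency maps to at least 1, and add three extra bits of resolution when the dynamic range leaves room. A rough illustration with plain doubles standing in for Scaled64 (hypothetical driver, not the class used above):

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  std::vector<double> Freqs = {0.125, 1.0, 37.5};  // per-block frequencies
  double Min = *std::min_element(Freqs.begin(), Freqs.end());
  double Max = *std::max_element(Freqs.begin(), Freqs.end());

  double ScalingFactor = 1.0 / Min;   // push Min up to ~1
  if (std::log2(Max / Min) < 60)      // room left? add 3 bits of headroom
    ScalingFactor *= 8.0;

  for (double F : Freqs) {
    uint64_t Integer = std::max<uint64_t>(1, (uint64_t)(F * ScalingFactor));
    std::printf("%g -> %llu\n", F, (unsigned long long)Integer);
  }
}

Clamping at 1 matters: a block frequency of 0 downstream would make every relative comparison against that block meaningless.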
@@ -794,11 +496,11 @@ BlockFrequencyInfoImplBase::getBlockFreq(const BlockNode &Node) const {
     return 0;
   return Freqs[Node.Index].Integer;
 }
-Float
+Scaled64
 BlockFrequencyInfoImplBase::getFloatingBlockFreq(const BlockNode &Node) const {
   if (!Node.isValid())
-    return Float::getZero();
-  return Freqs[Node.Index].Floating;
+    return Scaled64::getZero();
+  return Freqs[Node.Index].Scaled;
 }
 
 std::string
@@ -819,8 +521,8 @@ BlockFrequencyInfoImplBase::printBlockFreq(raw_ostream &OS,
 raw_ostream &
 BlockFrequencyInfoImplBase::printBlockFreq(raw_ostream &OS,
                                            const BlockFrequency &Freq) const {
-  Float Block(Freq.getFrequency(), 0);
-  Float Entry(getEntryFreq(), 0);
+  Scaled64 Block(Freq.getFrequency(), 0);
+  Scaled64 Entry(getEntryFreq(), 0);
 
   return OS << Block / Entry;
 }
diff --git a/lib/Analysis/CaptureTracking.cpp b/lib/Analysis/CaptureTracking.cpp
index 3708e6080f32..f2f8877af1a2 100644
--- a/lib/Analysis/CaptureTracking.cpp
+++ b/lib/Analysis/CaptureTracking.cpp
@@ -20,8 +20,10 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/CaptureTracking.h"
+#include "llvm/Analysis/CFG.h"
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
 #include "llvm/IR/Instructions.h"
 
 using namespace llvm;
@@ -49,6 +51,65 @@ namespace {
 
     bool Captured;
   };
+
+  /// Only find pointer captures which happen before the given instruction. Uses
+  /// the dominator tree to determine whether one instruction is before another.
+  /// Only support the case where the Value is defined in the same basic block
+  /// as the given instruction and the use.
+  struct CapturesBefore : public CaptureTracker {
+    CapturesBefore(bool ReturnCaptures, const Instruction *I, DominatorTree *DT,
+                   bool IncludeI)
+      : BeforeHere(I), DT(DT), ReturnCaptures(ReturnCaptures),
+        IncludeI(IncludeI), Captured(false) {}
+
+    void tooManyUses() override { Captured = true; }
+
+    bool shouldExplore(const Use *U) override {
+      Instruction *I = cast<Instruction>(U->getUser());
+      if (BeforeHere == I && !IncludeI)
+        return false;
+
+      BasicBlock *BB = I->getParent();
+      // We explore this usage only if the usage can reach "BeforeHere".
+      // If use is not reachable from entry, there is no need to explore.
+      if (BeforeHere != I && !DT->isReachableFromEntry(BB))
+        return false;
+      // If the value is defined in the same basic block as use and BeforeHere,
+      // there is no need to explore the use if BeforeHere dominates use.
+      // Check whether there is a path from I to BeforeHere.
+      if (BeforeHere != I && DT->dominates(BeforeHere, I) &&
+          !isPotentiallyReachable(I, BeforeHere, DT))
+        return false;
+      return true;
+    }
+
+    bool captured(const Use *U) override {
+      if (isa<ReturnInst>(U->getUser()) && !ReturnCaptures)
+        return false;
+
+      Instruction *I = cast<Instruction>(U->getUser());
+      if (BeforeHere == I && !IncludeI)
+        return false;
+
+      BasicBlock *BB = I->getParent();
+      // Same logic as in shouldExplore.
+      if (BeforeHere != I && !DT->isReachableFromEntry(BB))
+        return false;
+      if (BeforeHere != I && DT->dominates(BeforeHere, I) &&
+          !isPotentiallyReachable(I, BeforeHere, DT))
+        return false;
+      Captured = true;
+      return true;
+    }
+
+    const Instruction *BeforeHere;
+    DominatorTree *DT;
+
+    bool ReturnCaptures;
+    bool IncludeI;
+
+    bool Captured;
+  };
 }
 
 /// PointerMayBeCaptured - Return true if this pointer value may be captured
@@ -74,6 +135,32 @@ bool llvm::PointerMayBeCaptured(const Value *V,
   return SCT.Captured;
 }
 
+/// PointerMayBeCapturedBefore - Return true if this pointer value may be
+/// captured by the enclosing function (which is required to exist). If a
+/// DominatorTree is provided, only captures which happen before the given
+/// instruction are considered. This routine can be expensive, so consider
+/// caching the results.  The boolean ReturnCaptures specifies whether
+/// returning the value (or part of it) from the function counts as capturing
+/// it or not.  The boolean StoreCaptures specified whether storing the value
+/// (or part of it) into memory anywhere automatically counts as capturing it
+/// or not.
+bool llvm::PointerMayBeCapturedBefore(const Value *V, bool ReturnCaptures,
+                                      bool StoreCaptures, const Instruction *I,
+                                      DominatorTree *DT, bool IncludeI) {
+  assert(!isa<GlobalValue>(V) &&
+         "It doesn't make sense to ask whether a global is captured.");
+
+  if (!DT)
+    return PointerMayBeCaptured(V, ReturnCaptures, StoreCaptures);
+
+  // TODO: See comment in PointerMayBeCaptured regarding what could be done
+  // with StoreCaptures.
+
+  CapturesBefore CB(ReturnCaptures, I, DT, IncludeI);
+  PointerMayBeCaptured(V, &CB);
+  return CB.Captured;
+}
+
 /// TODO: Write a new FunctionPass AliasAnalysis so that it can keep
 /// a cache. Then we can move the code from BasicAliasAnalysis into
 /// that path, and remove this threshold.
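With CapturesBefore now living in CaptureTracking.cpp, callCapturesBefore (in the AliasAnalysis hunk earlier) becomes the canonical client of this entry point. A minimal caller might look roughly like this; the wrapper function is invented for illustration, and the header for isIdentifiedFunctionLocal is assumed to be AliasAnalysis.h as of this patch:

#include "llvm/Analysis/AliasAnalysis.h"     // isIdentifiedFunctionLocal (new)
#include "llvm/Analysis/CaptureTracking.h"   // PointerMayBeCapturedBefore (new)
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instruction.h"

using namespace llvm;

// Hypothetical helper: conservatively decide whether Object may have escaped
// the function before instruction I executes.
static bool mayHaveEscapedBefore(const Value *Object, const Instruction *I,
                                 DominatorTree *DT) {
  // "Not captured" is only a meaningful question for allocas, noalias calls,
  // and noalias arguments; anything else must be assumed escaped.
  if (!isIdentifiedFunctionLocal(Object))
    return true;
  return PointerMayBeCapturedBefore(Object, /* ReturnCaptures */ true,
                                    /* StoreCaptures */ true, I, DT,
                                    /* IncludeI */ true);
}

The IncludeI flag is the subtle part: when true, a capturing use at I itself counts, which is what a caller asking "could this call observe the pointer?" usually wants.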
diff --git a/lib/Analysis/ConstantFolding.cpp b/lib/Analysis/ConstantFolding.cpp
index eb3e2c636909..8dc94219027f 100644
--- a/lib/Analysis/ConstantFolding.cpp
+++ b/lib/Analysis/ConstantFolding.cpp
@@ -240,7 +240,8 @@ static bool IsConstantOffsetFromGlobal(Constant *C, GlobalValue *&GV,
 
   // Look through ptr->int and ptr->ptr casts.
   if (CE->getOpcode() == Instruction::PtrToInt ||
-      CE->getOpcode() == Instruction::BitCast)
+      CE->getOpcode() == Instruction::BitCast ||
+      CE->getOpcode() == Instruction::AddrSpaceCast)
     return IsConstantOffsetFromGlobal(CE->getOperand(0), GV, Offset, TD);
 
   // i32* getelementptr ([5 x i32]* @a, i32 0, i32 5)
diff --git a/lib/Analysis/CostModel.cpp b/lib/Analysis/CostModel.cpp
index 780b1aaa820e..1b74f8c19c51 100644
--- a/lib/Analysis/CostModel.cpp
+++ b/lib/Analysis/CostModel.cpp
@@ -95,6 +95,31 @@ static bool isReverseVectorMask(SmallVectorImpl<int> &Mask) {
   return true;
 }
 
+static bool isAlternateVectorMask(SmallVectorImpl<int> &Mask) {
+  bool isAlternate = true;
+  unsigned MaskSize = Mask.size();
+
+  // Example: shufflevector A, B, <0,5,2,7>
+  for (unsigned i = 0; i < MaskSize && isAlternate; ++i) {
+    if (Mask[i] < 0)
+      continue;
+    isAlternate = Mask[i] == (int)((i & 1) ? MaskSize + i : i);
+  }
+
+  if (isAlternate)
+    return true;
+
+  isAlternate = true;
+  // Example: shufflevector A, B, <4,1,6,3>
+  for (unsigned i = 0; i < MaskSize && isAlternate; ++i) {
+    if (Mask[i] < 0)
+      continue;
+    isAlternate = Mask[i] == (int)((i & 1) ? i : MaskSize + i);
+  }
+
+  return isAlternate;
+}
+
 static TargetTransformInfo::OperandValueKind getOperandInfo(Value *V) {
   TargetTransformInfo::OperandValueKind OpInfo =
     TargetTransformInfo::OK_AnyValue;
@@ -466,9 +491,15 @@ unsigned CostModelAnalysis::getInstructionCost(const Instruction *I) const {
       unsigned NumVecElems = VecTypOp0->getVectorNumElements();
       SmallVector<int, 16> Mask = Shuffle->getShuffleMask();
 
-      if (NumVecElems == Mask.size() && isReverseVectorMask(Mask))
-        return TTI->getShuffleCost(TargetTransformInfo::SK_Reverse, VecTypOp0, 0,
-                                   nullptr);
+      if (NumVecElems == Mask.size()) {
+        if (isReverseVectorMask(Mask))
+          return TTI->getShuffleCost(TargetTransformInfo::SK_Reverse, VecTypOp0,
+                                     0, nullptr);
+        if (isAlternateVectorMask(Mask))
+          return TTI->getShuffleCost(TargetTransformInfo::SK_Alternate,
+                                     VecTypOp0, 0, nullptr);
+      }
+
       return -1;
     }
   case Instruction::Call:
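A quick standalone check (hypothetical driver) of the two "alternate" mask shapes recognized above, for 4-element vectors: <0,5,2,7> takes even lanes from the first operand and odd lanes from the second, <4,1,6,3> is the mirror image, and negative (undef) lanes match anything:

#include <cstdio>
#include <vector>

static bool isAlternate(const std::vector<int> &Mask, bool EvenFromFirst) {
  int N = (int)Mask.size();
  for (int i = 0; i < N; ++i) {
    if (Mask[i] < 0)
      continue;  // undef lane
    bool FromFirst = (i % 2 == 0) == EvenFromFirst;
    if (Mask[i] != (FromFirst ? i : N + i))
      return false;
  }
  return true;
}

int main() {
  std::vector<int> A = {0, 5, 2, 7}, B = {4, 1, 6, 3}, C = {0, 1, 2, 3};
  std::printf("%d %d %d\n",
              isAlternate(A, true),                           // 1
              isAlternate(B, false),                          // 1
              isAlternate(C, true) || isAlternate(C, false)); // 0
}

Recognizing these masks lets the cost model charge the cheaper SK_Alternate shuffle kind instead of falling back to the unknown-shuffle cost of -1.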
diff --git a/lib/Analysis/DominanceFrontier.cpp b/lib/Analysis/DominanceFrontier.cpp
index 74594f8b5f77..7ba91bc90dfc 100644
--- a/lib/Analysis/DominanceFrontier.cpp
+++ b/lib/Analysis/DominanceFrontier.cpp
@@ -8,133 +8,50 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Analysis/DominanceFrontier.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
+#include "llvm/Analysis/DominanceFrontierImpl.h"
+
 using namespace llvm;
 
+namespace llvm {
+template class DominanceFrontierBase<BasicBlock>;
+template class ForwardDominanceFrontierBase<BasicBlock>;
+}
+
 char DominanceFrontier::ID = 0;
+
 INITIALIZE_PASS_BEGIN(DominanceFrontier, "domfrontier",
                 "Dominance Frontier Construction", true, true)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_END(DominanceFrontier, "domfrontier",
                 "Dominance Frontier Construction", true, true)
 
-namespace {
-  class DFCalculateWorkObject {
-  public:
-    DFCalculateWorkObject(BasicBlock *B, BasicBlock *P,
-                          const DomTreeNode *N,
-                          const DomTreeNode *PN)
-      : currentBB(B), parentBB(P), Node(N), parentNode(PN) {}
-    BasicBlock *currentBB;
-    BasicBlock *parentBB;
-    const DomTreeNode *Node;
-    const DomTreeNode *parentNode;
-  };
+DominanceFrontier::DominanceFrontier()
+  : FunctionPass(ID),
+    Base() {
+  initializeDominanceFrontierPass(*PassRegistry::getPassRegistry());
 }
 
-void DominanceFrontier::anchor() { }
-
-const DominanceFrontier::DomSetType &
-DominanceFrontier::calculate(const DominatorTree &DT,
-                             const DomTreeNode *Node) {
-  BasicBlock *BB = Node->getBlock();
-  DomSetType *Result = nullptr;
-
-  std::vector<DFCalculateWorkObject> workList;
-  SmallPtrSet<BasicBlock *, 32> visited;
-
-  workList.push_back(DFCalculateWorkObject(BB, nullptr, Node, nullptr));
-  do {
-    DFCalculateWorkObject *currentW = &workList.back();
-    assert (currentW && "Missing work object.");
-
-    BasicBlock *currentBB = currentW->currentBB;
-    BasicBlock *parentBB = currentW->parentBB;
-    const DomTreeNode *currentNode = currentW->Node;
-    const DomTreeNode *parentNode = currentW->parentNode;
-    assert (currentBB && "Invalid work object. Missing current Basic Block");
-    assert (currentNode && "Invalid work object. Missing current Node");
-    DomSetType &S = Frontiers[currentBB];
-
-    // Visit each block only once.
-    if (visited.count(currentBB) == 0) {
-      visited.insert(currentBB);
-
-      // Loop over CFG successors to calculate DFlocal[currentNode]
-      for (succ_iterator SI = succ_begin(currentBB), SE = succ_end(currentBB);
-           SI != SE; ++SI) {
-        // Does Node immediately dominate this successor?
-        if (DT[*SI]->getIDom() != currentNode)
-          S.insert(*SI);
-      }
-    }
-
-    // At this point, S is DFlocal.  Now we union in DFup's of our children...
-    // Loop through and visit the nodes that Node immediately dominates (Node's
-    // children in the IDomTree)
-    bool visitChild = false;
-    for (DomTreeNode::const_iterator NI = currentNode->begin(),
-         NE = currentNode->end(); NI != NE; ++NI) {
-      DomTreeNode *IDominee = *NI;
-      BasicBlock *childBB = IDominee->getBlock();
-      if (visited.count(childBB) == 0) {
-        workList.push_back(DFCalculateWorkObject(childBB, currentBB,
-                                                 IDominee, currentNode));
-        visitChild = true;
-      }
-    }
-
-    // If all children are visited or there is any child then pop this block
-    // from the workList.
-    if (!visitChild) {
-
-      if (!parentBB) {
-        Result = &S;
-        break;
-      }
-
-      DomSetType::const_iterator CDFI = S.begin(), CDFE = S.end();
-      DomSetType &parentSet = Frontiers[parentBB];
-      for (; CDFI != CDFE; ++CDFI) {
-        if (!DT.properlyDominates(parentNode, DT[*CDFI]))
-          parentSet.insert(*CDFI);
-      }
-      workList.pop_back();
-    }
+void DominanceFrontier::releaseMemory() {
+  Base.releaseMemory();
+}
 
-  } while (!workList.empty());
+bool DominanceFrontier::runOnFunction(Function &) {
+  releaseMemory();
+  Base.analyze(getAnalysis<DominatorTreeWrapperPass>().getDomTree());
+  return false;
+}
 
-  return *Result;
+void DominanceFrontier::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesAll();
+  AU.addRequired<DominatorTreeWrapperPass>();
 }
 
-void DominanceFrontierBase::print(raw_ostream &OS, const Module* ) const {
-  for (const_iterator I = begin(), E = end(); I != E; ++I) {
-    OS << "  DomFrontier for BB ";
-    if (I->first)
-      I->first->printAsOperand(OS, false);
-    else
-      OS << " <<exit node>>";
-    OS << " is:\t";
-
-    const std::set<BasicBlock*> &BBs = I->second;
-
-    for (std::set<BasicBlock*>::const_iterator I = BBs.begin(), E = BBs.end();
-         I != E; ++I) {
-      OS << ' ';
-      if (*I)
-        (*I)->printAsOperand(OS, false);
-      else
-        OS << "<<exit node>>";
-    }
-    OS << "\n";
-  }
+void DominanceFrontier::print(raw_ostream &OS, const Module *) const {
+  Base.print(OS);
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void DominanceFrontierBase::dump() const {
+void DominanceFrontier::dump() const {
   print(dbgs());
 }
 #endif
-
diff --git a/lib/Analysis/IPA/CallGraph.cpp b/lib/Analysis/IPA/CallGraph.cpp
index caec2534691a..dfabb0ad063d 100644
--- a/lib/Analysis/IPA/CallGraph.cpp
+++ b/lib/Analysis/IPA/CallGraph.cpp
@@ -267,7 +267,7 @@ INITIALIZE_PASS(CallGraphWrapperPass, "basiccg", "CallGraph Construction",
 
 char CallGraphWrapperPass::ID = 0;
 
-void CallGraphWrapperPass::releaseMemory() { G.reset(nullptr); }
+void CallGraphWrapperPass::releaseMemory() { G.reset(); }
 
 void CallGraphWrapperPass::print(raw_ostream &OS, const Module *) const {
   if (!G) {
diff --git a/lib/Analysis/IPA/CallGraphSCCPass.cpp b/lib/Analysis/IPA/CallGraphSCCPass.cpp
index 0d9d0ef842c6..c27edbfa2ff5 100644
--- a/lib/Analysis/IPA/CallGraphSCCPass.cpp
+++ b/lib/Analysis/IPA/CallGraphSCCPass.cpp
@@ -603,8 +603,10 @@ namespace {
     bool runOnSCC(CallGraphSCC &SCC) override {
       Out << Banner;
       for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) {
-        assert((*I)->getFunction() && "Expecting non-null Function");
-        (*I)->getFunction()->print(Out);
+        if ((*I)->getFunction())
+          (*I)->getFunction()->print(Out);
+        else
+          Out << "\nPrinting <null> Function\n";
       }
       return false;
     }
diff --git a/lib/Analysis/IPA/InlineCost.cpp b/lib/Analysis/IPA/InlineCost.cpp
index 66f3f8e02528..8807529cabac 100644
--- a/lib/Analysis/IPA/InlineCost.cpp
+++ b/lib/Analysis/IPA/InlineCost.cpp
@@ -841,10 +841,7 @@ bool
 CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) {
   // original function which is extremely undefined behavior.
   // FIXME: This logic isn't really right; we can safely inline functions with
   // indirectbr's as long as no other function or global references the
-  // blockaddress of a block within the current function.  And as a QOI issue,
-  // if someone is using a blockaddress without an indirectbr, and that
-  // reference somehow ends up in another function or global, we probably don't
-  // want to inline this function.
+  // blockaddress of a block within the current function.
   HasIndirectBr = true;
   return false;
 }
@@ -1121,6 +1118,15 @@ bool CallAnalyzer::analyzeCall(CallSite CS) {
     if (BB->empty())
       continue;
 
+    // Disallow inlining a blockaddress. A blockaddress only has defined
+    // behavior for an indirect branch in the same function, and we do not
+    // currently support inlining indirect branches. But, the inliner may not
+    // see an indirect branch that ends up being dead code at a particular call
+    // site. If the blockaddress escapes the function, e.g., via a global
+    // variable, inlining may lead to an invalid cross-function reference.
+    if (BB->hasAddressTaken())
+      return false;
+
     // Analyze the cost of this block. If we blow through the threshold, this
     // returns false, and we can bail on out.
     if (!analyzeBlock(BB)) {
@@ -1303,8 +1309,9 @@ bool InlineCostAnalysis::isInlineViable(Function &F) {
     F.getAttributes().hasAttribute(AttributeSet::FunctionIndex,
                                    Attribute::ReturnsTwice);
   for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
-    // Disallow inlining of functions which contain an indirect branch.
-    if (isa<IndirectBrInst>(BI->getTerminator()))
+    // Disallow inlining of functions which contain indirect branches or
+    // blockaddresses.
+    if (isa<IndirectBrInst>(BI->getTerminator()) || BI->hasAddressTaken())
       return false;
 
     for (BasicBlock::iterator II = BI->begin(), IE = BI->end(); II != IE;
diff --git a/lib/Analysis/IVUsers.cpp b/lib/Analysis/IVUsers.cpp
index 0b94238e6631..24655aa002c8 100644
--- a/lib/Analysis/IVUsers.cpp
+++ b/lib/Analysis/IVUsers.cpp
@@ -287,8 +287,10 @@ void IVUsers::print(raw_ostream &OS, const Module *M) const {
       OS << ")";
     }
     OS << " in  ";
-    assert(UI->getUser() != nullptr && "Expected non-null User");
-    UI->getUser()->print(OS);
+    if (UI->getUser())
+      UI->getUser()->print(OS);
+    else
+      OS << "Printing <null> User";
    OS << '\n';
  }
 }
diff --git a/lib/Analysis/InstructionSimplify.cpp b/lib/Analysis/InstructionSimplify.cpp
index 3684fda854fc..7a820a58f6f0 100644
--- a/lib/Analysis/InstructionSimplify.cpp
+++ b/lib/Analysis/InstructionSimplify.cpp
@@ -39,7 +39,6 @@ using namespace llvm::PatternMatch;
 enum { RecursionLimit = 3 };
 
 STATISTIC(NumExpand,  "Number of expansions");
-STATISTIC(NumFactor , "Number of factorizations");
 STATISTIC(NumReassoc, "Number of reassociations");
 
 struct Query {
@@ -183,78 +182,6 @@ static Value *ExpandBinOp(unsigned Opcode, Value *LHS, Value *RHS,
   return nullptr;
 }
 
-/// FactorizeBinOp - Simplify "LHS Opcode RHS" by factorizing out a common term
-/// using the operation OpCodeToExtract. For example, when Opcode is Add and
-/// OpCodeToExtract is Mul then this tries to turn "(A*B)+(A*C)" into "A*(B+C)".
-/// Returns the simplified value, or null if no simplification was performed.
-static Value *FactorizeBinOp(unsigned Opcode, Value *LHS, Value *RHS,
-                             unsigned OpcToExtract, const Query &Q,
-                             unsigned MaxRecurse) {
-  Instruction::BinaryOps OpcodeToExtract = (Instruction::BinaryOps)OpcToExtract;
-  // Recursion is always used, so bail out at once if we already hit the limit.
-  if (!MaxRecurse--)
-    return nullptr;
-
-  BinaryOperator *Op0 = dyn_cast<BinaryOperator>(LHS);
-  BinaryOperator *Op1 = dyn_cast<BinaryOperator>(RHS);
-
-  if (!Op0 || Op0->getOpcode() != OpcodeToExtract ||
-      !Op1 || Op1->getOpcode() != OpcodeToExtract)
-    return nullptr;
-
-  // The expression has the form "(A op' B) op (C op' D)".
-  Value *A = Op0->getOperand(0), *B = Op0->getOperand(1);
-  Value *C = Op1->getOperand(0), *D = Op1->getOperand(1);
-
-  // Use left distributivity, i.e. "X op' (Y op Z) = (X op' Y) op (X op' Z)".
-  // Does the instruction have the form "(A op' B) op (A op' D)" or, in the
-  // commutative case, "(A op' B) op (C op' A)"?
-  if (A == C || (Instruction::isCommutative(OpcodeToExtract) && A == D)) {
-    Value *DD = A == C ? D : C;
-    // Form "A op' (B op DD)" if it simplifies completely.
-    // Does "B op DD" simplify?
-    if (Value *V = SimplifyBinOp(Opcode, B, DD, Q, MaxRecurse)) {
-      // It does!  Return "A op' V" if it simplifies or is already available.
-      // If V equals B then "A op' V" is just the LHS.  If V equals DD then
-      // "A op' V" is just the RHS.
-      if (V == B || V == DD) {
-        ++NumFactor;
-        return V == B ? LHS : RHS;
-      }
-      // Otherwise return "A op' V" if it simplifies.
-      if (Value *W = SimplifyBinOp(OpcodeToExtract, A, V, Q, MaxRecurse)) {
-        ++NumFactor;
-        return W;
-      }
-    }
-  }
-
-  // Use right distributivity, i.e. "(X op Y) op' Z = (X op' Z) op (Y op' Z)".
-  // Does the instruction have the form "(A op' B) op (C op' B)" or, in the
-  // commutative case, "(A op' B) op (B op' D)"?
-  if (B == D || (Instruction::isCommutative(OpcodeToExtract) && B == C)) {
-    Value *CC = B == D ? C : D;
-    // Form "(A op CC) op' B" if it simplifies completely..
-    // Does "A op CC" simplify?
-    if (Value *V = SimplifyBinOp(Opcode, A, CC, Q, MaxRecurse)) {
-      // It does!  Return "V op' B" if it simplifies or is already available.
-      // If V equals A then "V op' B" is just the LHS.  If V equals CC then
-      // "V op' B" is just the RHS.
-      if (V == A || V == CC) {
-        ++NumFactor;
-        return V == A ? LHS : RHS;
-      }
-      // Otherwise return "V op' B" if it simplifies.
-      if (Value *W = SimplifyBinOp(OpcodeToExtract, V, B, Q, MaxRecurse)) {
-        ++NumFactor;
-        return W;
-      }
-    }
-  }
-
-  return nullptr;
-}
-
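The deleted FactorizeBinOp applied plain distributivity: (A*B)+(A*C) could only be replaced by A*(B+C) when the factored form actually simplified. A tiny arithmetic check of the underlying identities (hypothetical standalone code, nothing to do with LLVM's Value graph):

#include <cassert>
#include <cstdint>

int main() {
  // Wrapping unsigned arithmetic, like LLVM integers without nsw/nuw flags.
  uint32_t A = 0xDEADBEEF, B = 12345, C = 67890;
  assert(A * B + A * C == A * (B + C));  // left distributivity
  assert(B * A + C * A == (B + C) * A);  // right distributivity
  return 0;
}

The identities are sound, which is why the call sites removed in the hunks below could simply be deleted: the transform was rarely profitable relative to its recursive cost, not wrong.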
 /// SimplifyAssociativeBinOp - Generic simplifications for associative binary
 /// operations.  Returns the simpler value, or null if none was found.
 static Value *SimplifyAssociativeBinOp(unsigned Opc, Value *LHS, Value *RHS,
@@ -634,11 +561,6 @@ static Value *SimplifyAddInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
                                           MaxRecurse))
       return V;
 
-  // Mul distributes over Add.  Try some generic simplifications based on this.
-  if (Value *V = FactorizeBinOp(Instruction::Add, Op0, Op1, Instruction::Mul,
-                                Q, MaxRecurse))
-    return V;
-
   // Threading Add over selects and phi nodes is pointless, so don't bother.
   // Threading over the select in "A + select(cond, B, C)" means evaluating
   // "A+B" and "A+C" and seeing if they are equal; but they are equal if and
@@ -754,16 +676,9 @@ static Value *SimplifySubInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
   if (Op0 == Op1)
     return Constant::getNullValue(Op0->getType());
 
-  // (X*2) - X -> X
-  // (X<<1) - X -> X
-  Value *X = nullptr;
-  if (match(Op0, m_Mul(m_Specific(Op1), m_ConstantInt<2>())) ||
-      match(Op0, m_Shl(m_Specific(Op1), m_One())))
-    return Op1;
-
   // (X + Y) - Z -> X + (Y - Z) or Y + (X - Z) if everything simplifies.
   // For example, (X + Y) - Y -> X; (Y + X) - Y -> X
-  Value *Y = nullptr, *Z = Op1;
+  Value *X = nullptr, *Y = nullptr, *Z = Op1;
   if (MaxRecurse && match(Op0, m_Add(m_Value(X), m_Value(Y)))) { // (X + Y) - Z
     // See if "V === Y - Z" simplifies.
     if (Value *V = SimplifyBinOp(Instruction::Sub, Y, Z, Q, MaxRecurse-1))
@@ -835,11 +750,6 @@ static Value *SimplifySubInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
     if (Constant *Result = computePointerDifference(Q.DL, X, Y))
       return ConstantExpr::getIntegerCast(Result, Op0->getType(), true);
 
-  // Mul distributes over Sub.  Try some generic simplifications based on this.
-  if (Value *V = FactorizeBinOp(Instruction::Sub, Op0, Op1, Instruction::Mul,
-                                Q, MaxRecurse))
-    return V;
-
   // i1 sub -> xor.
   if (MaxRecurse && Op0->getType()->isIntegerTy(1))
     if (Value *V = SimplifyXorInst(Op0, Op1, Q, MaxRecurse-1))
@@ -1436,6 +1346,11 @@ static Value *SimplifyAShrInst(Value *Op0, Value *Op1, bool isExact,
       cast<PossiblyExactOperator>(Op0)->hasNoSignedWrap())
     return X;
 
+  // Arithmetic shifting an all-sign-bit value is a no-op.
+  unsigned NumSignBits = ComputeNumSignBits(Op0, Q.DL);
+  if (NumSignBits == Op0->getType()->getScalarSizeInBits())
+    return Op0;
+
   return nullptr;
 }
 
@@ -1518,11 +1433,6 @@ static Value *SimplifyAndInst(Value *Op0, Value *Op1, const Query &Q,
                                           Q, MaxRecurse))
     return V;
 
-  // Or distributes over And.  Try some generic simplifications based on this.
-  if (Value *V = FactorizeBinOp(Instruction::And, Op0, Op1, Instruction::Or,
-                                Q, MaxRecurse))
-    return V;
-
   // If the operation is with the result of a select instruction, check whether
   // operating on either branch of the select always yields the same value.
   if (isa<SelectInst>(Op0) || isa<SelectInst>(Op1))
@@ -1613,11 +1523,6 @@ static Value *SimplifyOrInst(Value *Op0, Value *Op1, const Query &Q,
                                           MaxRecurse))
     return V;
 
-  // And distributes over Or.  Try some generic simplifications based on this.
-  if (Value *V = FactorizeBinOp(Instruction::Or, Op0, Op1, Instruction::And,
-                                Q, MaxRecurse))
-    return V;
-
   // If the operation is with the result of a select instruction, check whether
   // operating on either branch of the select always yields the same value.
   if (isa<SelectInst>(Op0) || isa<SelectInst>(Op1))
@@ -1625,6 +1530,38 @@ static Value *SimplifyOrInst(Value *Op0, Value *Op1, const Query &Q,
                                           MaxRecurse))
       return V;
 
+  // (A & C)|(B & D)
+  Value *C = nullptr, *D = nullptr;
+  if (match(Op0, m_And(m_Value(A), m_Value(C))) &&
+      match(Op1, m_And(m_Value(B), m_Value(D)))) {
+    ConstantInt *C1 = dyn_cast<ConstantInt>(C);
+    ConstantInt *C2 = dyn_cast<ConstantInt>(D);
+    if (C1 && C2 && (C1->getValue() == ~C2->getValue())) {
+      // (A & C1)|(B & C2)
+      // If we have: ((V + N) & C1) | (V & C2)
+      // .. and C2 = ~C1 and C2 is 0+1+ and (N & C2) == 0
+      // replace with V+N.
+      Value *V1, *V2;
+      if ((C2->getValue() & (C2->getValue() + 1)) == 0 && // C2 == 0+1+
+          match(A, m_Add(m_Value(V1), m_Value(V2)))) {
+        // Add commutes, try both ways.
+        if (V1 == B && MaskedValueIsZero(V2, C2->getValue()))
+          return A;
+        if (V2 == B && MaskedValueIsZero(V1, C2->getValue()))
+          return A;
+      }
+      // Or commutes, try both ways.
+      if ((C1->getValue() & (C1->getValue() + 1)) == 0 &&
+          match(B, m_Add(m_Value(V1), m_Value(V2)))) {
+        // Add commutes, try both ways.
+        if (V1 == A && MaskedValueIsZero(V2, C1->getValue()))
+          return B;
+        if (V2 == A && MaskedValueIsZero(V1, C1->getValue()))
+          return B;
+      }
+    }
+  }
+
   // If the operation is with the result of a phi instruction, check whether
   // operating on all incoming values of the phi always yields the same value.
   if (isa<PHINode>(Op0) || isa<PHINode>(Op1))
@@ -1677,11 +1614,6 @@ static Value *SimplifyXorInst(Value *Op0, Value *Op1, const Query &Q,
                                           MaxRecurse))
     return V;
 
-  // And distributes over Xor.  Try some generic simplifications based on this.
-  if (Value *V = FactorizeBinOp(Instruction::Xor, Op0, Op1, Instruction::And,
-                                Q, MaxRecurse))
-    return V;
-
   // Threading Xor over selects and phi nodes is pointless, so don't bother.
   // Threading over the select in "A ^ select(cond, B, C)" means evaluating
   // "A^B" and "A^C" and seeing if they are equal; but they are equal if and
@@ -2021,17 +1953,33 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
       if (!CI2->isZero())
         Upper = NegOne.udiv(CI2->getValue()) + 1;
     } else if (match(LHS, m_SDiv(m_ConstantInt(CI2), m_Value()))) {
-      // 'sdiv CI2, x' produces [-|CI2|, |CI2|].
-      Upper = CI2->getValue().abs() + 1;
-      Lower = (-Upper) + 1;
+      if (CI2->isMinSignedValue()) {
+        // 'sdiv INT_MIN, x' produces [INT_MIN, INT_MIN / -2].
+        Lower = CI2->getValue();
+        Upper = Lower.lshr(1) + 1;
+      } else {
+        // 'sdiv CI2, x' produces [-|CI2|, |CI2|].
+        Upper = CI2->getValue().abs() + 1;
+        Lower = (-Upper) + 1;
+      }
     } else if (match(LHS, m_SDiv(m_Value(), m_ConstantInt(CI2)))) {
-      // 'sdiv x, CI2' produces [INT_MIN / CI2, INT_MAX / CI2].
       APInt IntMin = APInt::getSignedMinValue(Width);
       APInt IntMax = APInt::getSignedMaxValue(Width);
-      APInt Val = CI2->getValue().abs();
-      if (!Val.isMinValue()) {
+      APInt Val = CI2->getValue();
+      if (Val.isAllOnesValue()) {
+        // 'sdiv x, -1' produces [INT_MIN + 1, INT_MAX]
+        //    where CI2 != -1 and CI2 != 0 and CI2 != 1
+        Lower = IntMin + 1;
+        Upper = IntMax + 1;
+      } else if (Val.countLeadingZeros() < Width - 1) {
+        // 'sdiv x, CI2' produces [INT_MIN / CI2, INT_MAX / CI2]
+        //    where CI2 != -1 and CI2 != 0 and CI2 != 1
         Lower = IntMin.sdiv(Val);
-        Upper = IntMax.sdiv(Val) + 1;
+        Upper = IntMax.sdiv(Val);
+        if (Lower.sgt(Upper))
+          std::swap(Lower, Upper);
+        Upper = Upper + 1;
+        assert(Upper != Lower && "Upper part of range has wrapped!");
       }
     } else if (match(LHS, m_LShr(m_Value(), m_ConstantInt(CI2)))) {
       // 'lshr x, CI2' produces [0, UINT_MAX >> CI2].
@@ -2241,6 +2189,25 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
     }
   }
 
+  // If a bit is known to be zero for A and known to be one for B,
+  // then A and B cannot be equal.
+  if (ICmpInst::isEquality(Pred)) {
+    if (ConstantInt *CI = dyn_cast<ConstantInt>(RHS)) {
+      uint32_t BitWidth = CI->getBitWidth();
+      APInt LHSKnownZero(BitWidth, 0);
+      APInt LHSKnownOne(BitWidth, 0);
+      computeKnownBits(LHS, LHSKnownZero, LHSKnownOne);
+      APInt RHSKnownZero(BitWidth, 0);
+      APInt RHSKnownOne(BitWidth, 0);
+      computeKnownBits(RHS, RHSKnownZero, RHSKnownOne);
+      if (((LHSKnownOne & RHSKnownZero) != 0) ||
+          ((LHSKnownZero & RHSKnownOne) != 0))
+        return (Pred == ICmpInst::ICMP_EQ)
+                   ? ConstantInt::getFalse(CI->getContext())
+                   : ConstantInt::getTrue(CI->getContext());
+    }
+  }
+
   // Special logic for binary operators.
   BinaryOperator *LBO = dyn_cast<BinaryOperator>(LHS);
   BinaryOperator *RBO = dyn_cast<BinaryOperator>(RHS);
diff --git a/lib/Analysis/LoopPass.cpp b/lib/Analysis/LoopPass.cpp
index 2c6e6e3ffff3..7bd866e73e10 100644
--- a/lib/Analysis/LoopPass.cpp
+++ b/lib/Analysis/LoopPass.cpp
@@ -45,8 +45,10 @@ class PrintLoopPass : public LoopPass {
     for (Loop::block_iterator b = L->block_begin(), be = L->block_end();
          b != be;
          ++b) {
-      assert((*b) != nullptr && "Expecting non-null block");
-      (*b)->print(Out);
+      if (*b)
+        (*b)->print(Out);
+      else
+        Out << "Printing <null> block";
     }
     return false;
   }
diff --git a/lib/Analysis/NoAliasAnalysis.cpp b/lib/Analysis/NoAliasAnalysis.cpp
index 4e11e50e2899..139fa38b8a94 100644
--- a/lib/Analysis/NoAliasAnalysis.cpp
+++ b/lib/Analysis/NoAliasAnalysis.cpp
@@ -15,6 +15,7 @@
 #include "llvm/Analysis/Passes.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/IR/DataLayout.h"
+#include "llvm/IR/LLVMContext.h"
 #include "llvm/Pass.h"
 using namespace llvm;
 
@@ -53,6 +54,13 @@ namespace {
     bool pointsToConstantMemory(const Location &Loc, bool OrLocal) override {
       return false;
     }
+    Location getArgLocation(ImmutableCallSite CS, unsigned ArgIdx,
+                            ModRefResult &Mask) override {
+      Mask = ModRef;
+      return Location(CS.getArgument(ArgIdx), UnknownSize,
+                      CS.getInstruction()->getMetadata(LLVMContext::MD_tbaa));
+    }
+
     ModRefResult getModRefInfo(ImmutableCallSite CS,
                                const Location &Loc) override {
       return ModRef;
diff --git a/lib/Analysis/RegionInfo.cpp b/lib/Analysis/RegionInfo.cpp
index 7f88ae125019..08ebf0d85723 100644
--- a/lib/Analysis/RegionInfo.cpp
+++ b/lib/Analysis/RegionInfo.cpp
@@ -10,6 +10,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Analysis/RegionInfo.h"
+#include "llvm/Analysis/RegionInfoImpl.h"
 #include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/LoopInfo.h"
@@ -25,21 +26,26 @@ using namespace llvm;
 
 #define DEBUG_TYPE "region"
 
+namespace llvm {
+template class RegionBase<RegionTraits<Function>>;
+template class RegionNodeBase<RegionTraits<Function>>;
+template class RegionInfoBase<RegionTraits<Function>>;
+}
+
+STATISTIC(numRegions,       "The # of regions");
+STATISTIC(numSimpleRegions, "The # of simple regions");
+
 // Always verify if expensive checking is enabled.
-#ifdef XDEBUG
-static bool VerifyRegionInfo = true;
-#else
-static bool VerifyRegionInfo = false;
-#endif
 
 static cl::opt<bool, true>
-VerifyRegionInfoX("verify-region-info", cl::location(VerifyRegionInfo),
-                  cl::desc("Verify region info (time consuming)"));
+VerifyRegionInfoX(
+    "verify-region-info",
+    cl::location(RegionInfoBase<RegionTraits<Function>>::VerifyRegionInfo),
+    cl::desc("Verify region info (time consuming)"));
 
-STATISTIC(numRegions, "The # of regions");
-STATISTIC(numSimpleRegions, "The # of simple regions");
-static cl::opt<Region::PrintStyle> printStyle("print-region-style",
+static cl::opt<Region::PrintStyle, true> printStyleX("print-region-style",
+  cl::location(RegionInfo::printStyle),
   cl::Hidden,
   cl::desc("style of printing regions"),
   cl::values(
@@ -49,812 +55,110 @@ static cl::opt<Region::PrintStyle> printStyle("print-region-style",
     clEnumValN(Region::PrintRN, "rn",
                "print regions in detail with element_iterator"),
     clEnumValEnd));
 
-//===----------------------------------------------------------------------===//
-/// Region Implementation
-Region::Region(BasicBlock *Entry, BasicBlock *Exit, RegionInfo* RInfo,
-               DominatorTree *dt, Region *Parent)
-  : RegionNode(Parent, Entry, 1), RI(RInfo), DT(dt), exit(Exit) {}
-
-Region::~Region() {
-  // Free the cached nodes.
-  for (BBNodeMapT::iterator it = BBNodeMap.begin(),
-       ie = BBNodeMap.end(); it != ie; ++it)
-    delete it->second;
-
-  // Only clean the cache for this Region. Caches of child Regions will be
-  // cleaned when the child Regions are deleted.
-  BBNodeMap.clear();
-}
-
-void Region::replaceEntry(BasicBlock *BB) {
-  entry.setPointer(BB);
-}
-
-void Region::replaceExit(BasicBlock *BB) {
-  assert(exit && "No exit to replace!");
-  exit = BB;
-}
-
-void Region::replaceEntryRecursive(BasicBlock *NewEntry) {
-  std::vector<Region *> RegionQueue;
-  BasicBlock *OldEntry = getEntry();
-
-  RegionQueue.push_back(this);
-  while (!RegionQueue.empty()) {
-    Region *R = RegionQueue.back();
-    RegionQueue.pop_back();
-
-    R->replaceEntry(NewEntry);
-    for (Region::const_iterator RI = R->begin(), RE = R->end(); RI != RE; ++RI)
-      if ((*RI)->getEntry() == OldEntry)
-        RegionQueue.push_back(RI->get());
-  }
-}
-
-void Region::replaceExitRecursive(BasicBlock *NewExit) {
-  std::vector<Region *> RegionQueue;
-  BasicBlock *OldExit = getExit();
-
-  RegionQueue.push_back(this);
-  while (!RegionQueue.empty()) {
-    Region *R = RegionQueue.back();
-    RegionQueue.pop_back();
-
-    R->replaceExit(NewExit);
-    for (Region::const_iterator RI = R->begin(), RE = R->end(); RI != RE; ++RI)
-      if ((*RI)->getExit() == OldExit)
-        RegionQueue.push_back(RI->get());
-  }
-}
-
-bool Region::contains(const BasicBlock *B) const {
-  BasicBlock *BB = const_cast<BasicBlock *>(B);
-
-  if (!DT->getNode(BB))
-    return false;
-
-  BasicBlock *entry = getEntry(), *exit = getExit();
-
-  // Toplevel region.
-  if (!exit)
-    return true;
-
-  return (DT->dominates(entry, BB)
-          && !(DT->dominates(exit, BB) && DT->dominates(entry, exit)));
-}
-
-bool Region::contains(const Loop *L) const {
-  // BBs that are not part of any loop are element of the Loop
-  // described by the NULL pointer. This loop is not part of any region,
-  // except if the region describes the whole function.
- if (!L) - return getExit() == nullptr; - - if (!contains(L->getHeader())) - return false; - - SmallVector ExitingBlocks; - L->getExitingBlocks(ExitingBlocks); - - for (SmallVectorImpl::iterator BI = ExitingBlocks.begin(), - BE = ExitingBlocks.end(); BI != BE; ++BI) - if (!contains(*BI)) - return false; - - return true; -} - -Loop *Region::outermostLoopInRegion(Loop *L) const { - if (!contains(L)) - return nullptr; - - while (L && contains(L->getParentLoop())) { - L = L->getParentLoop(); - } - - return L; -} - -Loop *Region::outermostLoopInRegion(LoopInfo *LI, BasicBlock* BB) const { - assert(LI && BB && "LI and BB cannot be null!"); - Loop *L = LI->getLoopFor(BB); - return outermostLoopInRegion(L); -} - -BasicBlock *Region::getEnteringBlock() const { - BasicBlock *entry = getEntry(); - BasicBlock *Pred; - BasicBlock *enteringBlock = nullptr; - - for (pred_iterator PI = pred_begin(entry), PE = pred_end(entry); PI != PE; - ++PI) { - Pred = *PI; - if (DT->getNode(Pred) && !contains(Pred)) { - if (enteringBlock) - return nullptr; - - enteringBlock = Pred; - } - } - - return enteringBlock; -} - -BasicBlock *Region::getExitingBlock() const { - BasicBlock *exit = getExit(); - BasicBlock *Pred; - BasicBlock *exitingBlock = nullptr; - - if (!exit) - return nullptr; - - for (pred_iterator PI = pred_begin(exit), PE = pred_end(exit); PI != PE; - ++PI) { - Pred = *PI; - if (contains(Pred)) { - if (exitingBlock) - return nullptr; - - exitingBlock = Pred; - } - } - - return exitingBlock; -} - -bool Region::isSimple() const { - return !isTopLevelRegion() && getEnteringBlock() && getExitingBlock(); -} - -std::string Region::getNameStr() const { - std::string exitName; - std::string entryName; - - if (getEntry()->getName().empty()) { - raw_string_ostream OS(entryName); - - getEntry()->printAsOperand(OS, false); - } else - entryName = getEntry()->getName(); - - if (getExit()) { - if (getExit()->getName().empty()) { - raw_string_ostream OS(exitName); - - getExit()->printAsOperand(OS, false); - } else - exitName = getExit()->getName(); - } else - exitName = ""; - - return entryName + " => " + exitName; -} - -void Region::verifyBBInRegion(BasicBlock *BB) const { - if (!contains(BB)) - llvm_unreachable("Broken region found!"); - - BasicBlock *entry = getEntry(), *exit = getExit(); - - for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI) - if (!contains(*SI) && exit != *SI) - llvm_unreachable("Broken region found!"); - - if (entry != BB) - for (pred_iterator SI = pred_begin(BB), SE = pred_end(BB); SI != SE; ++SI) - if (!contains(*SI)) - llvm_unreachable("Broken region found!"); -} - -void Region::verifyWalk(BasicBlock *BB, std::set *visited) const { - BasicBlock *exit = getExit(); - - visited->insert(BB); - - verifyBBInRegion(BB); - - for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI) - if (*SI != exit && visited->find(*SI) == visited->end()) - verifyWalk(*SI, visited); -} - -void Region::verifyRegion() const { - // Only do verification when user wants to, otherwise this expensive - // check will be invoked by PassManager. 
- if (!VerifyRegionInfo) return; - - std::set visited; - verifyWalk(getEntry(), &visited); -} - -void Region::verifyRegionNest() const { - for (Region::const_iterator RI = begin(), RE = end(); RI != RE; ++RI) - (*RI)->verifyRegionNest(); - - verifyRegion(); -} - -Region::element_iterator Region::element_begin() { - return GraphTraits::nodes_begin(this); -} - -Region::element_iterator Region::element_end() { - return GraphTraits::nodes_end(this); -} - -Region::const_element_iterator Region::element_begin() const { - return GraphTraits::nodes_begin(this); -} - -Region::const_element_iterator Region::element_end() const { - return GraphTraits::nodes_end(this); -} - -Region* Region::getSubRegionNode(BasicBlock *BB) const { - Region *R = RI->getRegionFor(BB); - - if (!R || R == this) - return nullptr; - - // If we pass the BB out of this region, that means our code is broken. - assert(contains(R) && "BB not in current region!"); - - while (contains(R->getParent()) && R->getParent() != this) - R = R->getParent(); - - if (R->getEntry() != BB) - return nullptr; - - return R; -} - -RegionNode* Region::getBBNode(BasicBlock *BB) const { - assert(contains(BB) && "Can get BB node out of this region!"); - - BBNodeMapT::const_iterator at = BBNodeMap.find(BB); - - if (at != BBNodeMap.end()) - return at->second; - - RegionNode *NewNode = new RegionNode(const_cast(this), BB); - BBNodeMap.insert(std::make_pair(BB, NewNode)); - return NewNode; -} - -RegionNode* Region::getNode(BasicBlock *BB) const { - assert(contains(BB) && "Can get BB node out of this region!"); - if (Region* Child = getSubRegionNode(BB)) - return Child->getNode(); - - return getBBNode(BB); -} - -void Region::transferChildrenTo(Region *To) { - for (iterator I = begin(), E = end(); I != E; ++I) { - (*I)->parent = To; - To->children.push_back(std::move(*I)); - } - children.clear(); -} - -void Region::addSubRegion(Region *SubRegion, bool moveChildren) { - assert(!SubRegion->parent && "SubRegion already has a parent!"); - assert(std::find_if(begin(), end(), [&](const std::unique_ptr &R) { - return R.get() == SubRegion; - }) == children.end() && - "Subregion already exists!"); - - SubRegion->parent = this; - children.push_back(std::unique_ptr(SubRegion)); - - if (!moveChildren) - return; - - assert(SubRegion->children.size() == 0 - && "SubRegions that contain children are not supported"); - - for (element_iterator I = element_begin(), E = element_end(); I != E; ++I) - if (!(*I)->isSubRegion()) { - BasicBlock *BB = (*I)->getNodeAs(); - - if (SubRegion->contains(BB)) - RI->setRegionFor(BB, SubRegion); - } - - std::vector> Keep; - for (iterator I = begin(), E = end(); I != E; ++I) - if (SubRegion->contains(I->get()) && I->get() != SubRegion) { - (*I)->parent = SubRegion; - SubRegion->children.push_back(std::move(*I)); - } else - Keep.push_back(std::move(*I)); - - children.clear(); - children.insert(children.begin(), - std::move_iterator(Keep.begin()), - std::move_iterator(Keep.end())); -} - - -Region *Region::removeSubRegion(Region *Child) { - assert(Child->parent == this && "Child is not a child of this region!"); - Child->parent = nullptr; - RegionSet::iterator I = std::find_if( - children.begin(), children.end(), - [&](const std::unique_ptr &R) { return R.get() == Child; }); - assert(I != children.end() && "Region does not exit. 
Unable to remove."); - children.erase(children.begin()+(I-begin())); - return Child; -} - -unsigned Region::getDepth() const { - unsigned Depth = 0; - - for (Region *R = parent; R != nullptr; R = R->parent) - ++Depth; - - return Depth; -} -Region *Region::getExpandedRegion() const { - unsigned NumSuccessors = exit->getTerminator()->getNumSuccessors(); - if (NumSuccessors == 0) - return nullptr; - - for (pred_iterator PI = pred_begin(getExit()), PE = pred_end(getExit()); - PI != PE; ++PI) - if (!DT->dominates(getEntry(), *PI)) - return nullptr; - - Region *R = RI->getRegionFor(exit); - - if (R->getEntry() != exit) { - if (exit->getTerminator()->getNumSuccessors() == 1) - return new Region(getEntry(), *succ_begin(exit), RI, DT); - else - return nullptr; - } - - while (R->getParent() && R->getParent()->getEntry() == exit) - R = R->getParent(); - - if (!DT->dominates(getEntry(), R->getExit())) - for (pred_iterator PI = pred_begin(getExit()), PE = pred_end(getExit()); - PI != PE; ++PI) - if (!DT->dominates(R->getExit(), *PI)) - return nullptr; - - return new Region(getEntry(), R->getExit(), RI, DT); -} - -void Region::print(raw_ostream &OS, bool print_tree, unsigned level, - enum PrintStyle Style) const { - if (print_tree) - OS.indent(level*2) << "[" << level << "] " << getNameStr(); - else - OS.indent(level*2) << getNameStr(); - - OS << "\n"; - - - if (Style != PrintNone) { - OS.indent(level*2) << "{\n"; - OS.indent(level*2 + 2); - - if (Style == PrintBB) { - for (const auto &BB : blocks()) - OS << BB->getName() << ", "; // TODO: remove the last "," - } else if (Style == PrintRN) { - for (const_element_iterator I = element_begin(), E = element_end(); I!=E; ++I) - OS << **I << ", "; // TODO: remove the last ", - } - - OS << "\n"; - } +//===----------------------------------------------------------------------===// +// Region implementation +// - if (print_tree) - for (const_iterator RI = begin(), RE = end(); RI != RE; ++RI) - (*RI)->print(OS, print_tree, level+1, Style); +Region::Region(BasicBlock *Entry, BasicBlock *Exit, + RegionInfo* RI, + DominatorTree *DT, Region *Parent) : + RegionBase>(Entry, Exit, RI, DT, Parent) { - if (Style != PrintNone) - OS.indent(level*2) << "} \n"; } -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void Region::dump() const { - print(dbgs(), true, getDepth(), printStyle.getValue()); -} -#endif - -void Region::clearNodeCache() { - // Free the cached nodes. - for (BBNodeMapT::iterator I = BBNodeMap.begin(), - IE = BBNodeMap.end(); I != IE; ++I) - delete I->second; - - BBNodeMap.clear(); - for (Region::iterator RI = begin(), RE = end(); RI != RE; ++RI) - (*RI)->clearNodeCache(); -} +Region::~Region() { } //===----------------------------------------------------------------------===// // RegionInfo implementation // -bool RegionInfo::isCommonDomFrontier(BasicBlock *BB, BasicBlock *entry, - BasicBlock *exit) const { - for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE; ++PI) { - BasicBlock *P = *PI; - if (DT->dominates(entry, P) && !DT->dominates(exit, P)) - return false; - } - return true; -} - -bool RegionInfo::isRegion(BasicBlock *entry, BasicBlock *exit) const { - assert(entry && exit && "entry and exit must not be null!"); - typedef DominanceFrontier::DomSetType DST; - - DST *entrySuccs = &DF->find(entry)->second; - - // Exit is the header of a loop that contains the entry. In this case, - // the dominance frontier must only contain the exit. 
- if (!DT->dominates(entry, exit)) { - for (DST::iterator SI = entrySuccs->begin(), SE = entrySuccs->end(); - SI != SE; ++SI) - if (*SI != exit && *SI != entry) - return false; - - return true; - } - - DST *exitSuccs = &DF->find(exit)->second; - - // Do not allow edges leaving the region. - for (DST::iterator SI = entrySuccs->begin(), SE = entrySuccs->end(); - SI != SE; ++SI) { - if (*SI == exit || *SI == entry) - continue; - if (exitSuccs->find(*SI) == exitSuccs->end()) - return false; - if (!isCommonDomFrontier(*SI, entry, exit)) - return false; - } - - // Do not allow edges pointing into the region. - for (DST::iterator SI = exitSuccs->begin(), SE = exitSuccs->end(); - SI != SE; ++SI) - if (DT->properlyDominates(entry, *SI) && *SI != exit) - return false; +RegionInfo::RegionInfo() : + RegionInfoBase>() { - - return true; -} - -void RegionInfo::insertShortCut(BasicBlock *entry, BasicBlock *exit, - BBtoBBMap *ShortCut) const { - assert(entry && exit && "entry and exit must not be null!"); - - BBtoBBMap::iterator e = ShortCut->find(exit); - - if (e == ShortCut->end()) - // No further region at exit available. - (*ShortCut)[entry] = exit; - else { - // We found a region e that starts at exit. Therefore (entry, e->second) - // is also a region, that is larger than (entry, exit). Insert the - // larger one. - BasicBlock *BB = e->second; - (*ShortCut)[entry] = BB; - } } -DomTreeNode* RegionInfo::getNextPostDom(DomTreeNode* N, - BBtoBBMap *ShortCut) const { - BBtoBBMap::iterator e = ShortCut->find(N->getBlock()); - - if (e == ShortCut->end()) - return N->getIDom(); +RegionInfo::~RegionInfo() { - return PDT->getNode(e->second)->getIDom(); -} - -bool RegionInfo::isTrivialRegion(BasicBlock *entry, BasicBlock *exit) const { - assert(entry && exit && "entry and exit must not be null!"); - - unsigned num_successors = succ_end(entry) - succ_begin(entry); - - if (num_successors <= 1 && exit == *(succ_begin(entry))) - return true; - - return false; } void RegionInfo::updateStatistics(Region *R) { ++numRegions; // TODO: Slow. Should only be enabled if -stats is used. - if (R->isSimple()) ++numSimpleRegions; -} - -Region *RegionInfo::createRegion(BasicBlock *entry, BasicBlock *exit) { - assert(entry && exit && "entry and exit must not be null!"); - - if (isTrivialRegion(entry, exit)) - return nullptr; - - Region *region = new Region(entry, exit, this, DT); - BBtoRegion.insert(std::make_pair(entry, region)); - - #ifdef XDEBUG - region->verifyRegion(); - #else - DEBUG(region->verifyRegion()); - #endif - - updateStatistics(region); - return region; -} - -void RegionInfo::findRegionsWithEntry(BasicBlock *entry, BBtoBBMap *ShortCut) { - assert(entry); - - DomTreeNode *N = PDT->getNode(entry); - - if (!N) - return; - - Region *lastRegion= nullptr; - BasicBlock *lastExit = entry; - - // As only a BasicBlock that postdominates entry can finish a region, walk the - // post dominance tree upwards. - while ((N = getNextPostDom(N, ShortCut))) { - BasicBlock *exit = N->getBlock(); - - if (!exit) - break; - - if (isRegion(entry, exit)) { - Region *newRegion = createRegion(entry, exit); - - if (lastRegion) - newRegion->addSubRegion(lastRegion); - - lastRegion = newRegion; - lastExit = exit; - } - - // This can never be a region, so stop the search. - if (!DT->dominates(entry, exit)) - break; - } - - // Tried to create regions from entry to lastExit. Next time take a - // shortcut from entry to lastExit. 
- if (lastExit != entry) - insertShortCut(entry, lastExit, ShortCut); + if (R->isSimple()) + ++numSimpleRegions; } -void RegionInfo::scanForRegions(Function &F, BBtoBBMap *ShortCut) { - BasicBlock *entry = &(F.getEntryBlock()); - DomTreeNode *N = DT->getNode(entry); - - // Iterate over the dominance tree in post order to start with the small - // regions from the bottom of the dominance tree. If the small regions are - // detected first, detection of bigger regions is faster, as we can jump - // over the small regions. - for (po_iterator FI = po_begin(N), FE = po_end(N); FI != FE; - ++FI) { - findRegionsWithEntry(FI->getBlock(), ShortCut); - } -} +void RegionInfo::recalculate(Function &F, DominatorTree *DT_, + PostDominatorTree *PDT_, DominanceFrontier *DF_) { + DT = DT_; + PDT = PDT_; + DF = DF_; -Region *RegionInfo::getTopMostParent(Region *region) { - while (region->parent) - region = region->getParent(); - - return region; + TopLevelRegion = new Region(&F.getEntryBlock(), nullptr, + this, DT, nullptr); + updateStatistics(TopLevelRegion); + calculate(F); } -void RegionInfo::buildRegionsTree(DomTreeNode *N, Region *region) { - BasicBlock *BB = N->getBlock(); - - // Passed region exit - while (BB == region->getExit()) - region = region->getParent(); - - BBtoRegionMap::iterator it = BBtoRegion.find(BB); - - // This basic block is a start block of a region. It is already in the - // BBtoRegion relation. Only the child basic blocks have to be updated. - if (it != BBtoRegion.end()) { - Region *newRegion = it->second; - region->addSubRegion(getTopMostParent(newRegion)); - region = newRegion; - } else { - BBtoRegion[BB] = region; - } +//===----------------------------------------------------------------------===// +// RegionInfoPass implementation +// - for (DomTreeNode::iterator CI = N->begin(), CE = N->end(); CI != CE; ++CI) - buildRegionsTree(*CI, region); +RegionInfoPass::RegionInfoPass() : FunctionPass(ID) { + initializeRegionInfoPassPass(*PassRegistry::getPassRegistry()); } -void RegionInfo::releaseMemory() { - BBtoRegion.clear(); - if (TopLevelRegion) - delete TopLevelRegion; - TopLevelRegion = nullptr; -} +RegionInfoPass::~RegionInfoPass() { -RegionInfo::RegionInfo() : FunctionPass(ID) { - initializeRegionInfoPass(*PassRegistry::getPassRegistry()); - TopLevelRegion = nullptr; } -RegionInfo::~RegionInfo() { +bool RegionInfoPass::runOnFunction(Function &F) { releaseMemory(); -} -void RegionInfo::Calculate(Function &F) { - // ShortCut a function where for every BB the exit of the largest region - // starting with BB is stored. These regions can be threated as single BBS. - // This improves performance on linear CFGs. 
-  BBtoBBMap ShortCut;
+  auto DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  auto PDT = &getAnalysis<PostDominatorTree>();
+  auto DF = &getAnalysis<DominanceFrontier>();
-  scanForRegions(F, &ShortCut);
-  BasicBlock *BB = &F.getEntryBlock();
-  buildRegionsTree(DT->getNode(BB), TopLevelRegion);
+  RI.recalculate(F, DT, PDT, DF);
+  return false;
 }
-bool RegionInfo::runOnFunction(Function &F) {
-  releaseMemory();
-
-  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
-  PDT = &getAnalysis<PostDominatorTree>();
-  DF = &getAnalysis<DominanceFrontier>();
-
-  TopLevelRegion = new Region(&F.getEntryBlock(), nullptr, this, DT, nullptr);
-  updateStatistics(TopLevelRegion);
-
-  Calculate(F);
+void RegionInfoPass::releaseMemory() {
+  RI.releaseMemory();
+}
-  return false;
+void RegionInfoPass::verifyAnalysis() const {
+  RI.verifyAnalysis();
 }
-void RegionInfo::getAnalysisUsage(AnalysisUsage &AU) const {
+void RegionInfoPass::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.setPreservesAll();
   AU.addRequiredTransitive<DominatorTreeWrapperPass>();
   AU.addRequired<PostDominatorTree>();
   AU.addRequired<DominanceFrontier>();
 }
-void RegionInfo::print(raw_ostream &OS, const Module *) const {
-  OS << "Region tree:\n";
-  TopLevelRegion->print(OS, true, 0, printStyle.getValue());
-  OS << "End region tree\n";
-}
-
-void RegionInfo::verifyAnalysis() const {
-  // Only do verification when user wants to, otherwise this expensive check
-  // will be invoked by PMDataManager::verifyPreservedAnalysis when
-  // a regionpass (marked PreservedAll) finish.
-  if (!VerifyRegionInfo) return;
-
-  TopLevelRegion->verifyRegionNest();
-}
-
-// Region pass manager support.
-Region *RegionInfo::getRegionFor(BasicBlock *BB) const {
-  BBtoRegionMap::const_iterator I=
-    BBtoRegion.find(BB);
-  return I != BBtoRegion.end() ? I->second : nullptr;
-}
-
-void RegionInfo::setRegionFor(BasicBlock *BB, Region *R) {
-  BBtoRegion[BB] = R;
-}
-
-Region *RegionInfo::operator[](BasicBlock *BB) const {
-  return getRegionFor(BB);
+void RegionInfoPass::print(raw_ostream &OS, const Module *) const {
+  RI.print(OS);
 }
-BasicBlock *RegionInfo::getMaxRegionExit(BasicBlock *BB) const {
-  BasicBlock *Exit = nullptr;
-
-  while (true) {
-    // Get largest region that starts at BB.
-    Region *R = getRegionFor(BB);
-    while (R && R->getParent() && R->getParent()->getEntry() == BB)
-      R = R->getParent();
-
-    // Get the single exit of BB.
-    if (R && R->getEntry() == BB)
-      Exit = R->getExit();
-    else if (++succ_begin(BB) == succ_end(BB))
-      Exit = *succ_begin(BB);
-    else // No single exit exists.
-      return Exit;
-
-    // Get largest region that starts at Exit.
-    Region *ExitR = getRegionFor(Exit);
-    while (ExitR && ExitR->getParent()
-           && ExitR->getParent()->getEntry() == Exit)
-      ExitR = ExitR->getParent();
-
-    for (pred_iterator PI = pred_begin(Exit), PE = pred_end(Exit); PI != PE;
-         ++PI)
-      if (!R->contains(*PI) && !ExitR->contains(*PI))
-        break;
-
-    // This stops infinite cycles.
-    if (DT->dominates(Exit, BB))
-      break;
-
-    BB = Exit;
-  }
-
-  return Exit;
-}
-
-Region*
-RegionInfo::getCommonRegion(Region *A, Region *B) const {
-  assert (A && B && "One of the Regions is NULL");
-
-  if (A->contains(B)) return A;
-
-  while (!B->contains(A))
-    B = B->getParent();
-
-  return B;
-}
-
-Region*
-RegionInfo::getCommonRegion(SmallVectorImpl<Region *> &Regions) const {
-  Region* ret = Regions.back();
-  Regions.pop_back();
-
-  for (SmallVectorImpl<Region *>::const_iterator I = Regions.begin(),
-       E = Regions.end(); I != E; ++I)
-    ret = getCommonRegion(ret, *I);
-
-  return ret;
-}
-
-Region*
-RegionInfo::getCommonRegion(SmallVectorImpl<BasicBlock *> &BBs) const {
-  Region* ret = getRegionFor(BBs.back());
-  BBs.pop_back();
-
-  for (SmallVectorImpl<BasicBlock *>::const_iterator I = BBs.begin(),
-       E = BBs.end(); I != E; ++I)
-    ret = getCommonRegion(ret, getRegionFor(*I));
-
-  return ret;
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void RegionInfoPass::dump() const {
+  RI.dump();
 }
+#endif
-void RegionInfo::splitBlock(BasicBlock* NewBB, BasicBlock *OldBB)
-{
-  Region *R = getRegionFor(OldBB);
-
-  setRegionFor(NewBB, R);
-
-  while (R->getEntry() == OldBB && !R->isTopLevelRegion()) {
-    R->replaceEntry(NewBB);
-    R = R->getParent();
-  }
-
-  setRegionFor(OldBB, R);
-}
+char RegionInfoPass::ID = 0;
-char RegionInfo::ID = 0;
-INITIALIZE_PASS_BEGIN(RegionInfo, "regions",
+INITIALIZE_PASS_BEGIN(RegionInfoPass, "regions",
                 "Detect single entry single exit regions", true, true)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(PostDominatorTree)
INITIALIZE_PASS_DEPENDENCY(DominanceFrontier)
-INITIALIZE_PASS_END(RegionInfo, "regions",
+INITIALIZE_PASS_END(RegionInfoPass, "regions",
                 "Detect single entry single exit regions", true, true)
 // Create methods available outside of this file, to use them
@@ -863,7 +167,7 @@ INITIALIZE_PASS_END(RegionInfo, "regions",
 namespace llvm {
   FunctionPass *createRegionInfoPass() {
-    return new RegionInfo();
+    return new RegionInfoPass();
   }
 }
diff --git a/lib/Analysis/RegionPass.cpp b/lib/Analysis/RegionPass.cpp
index d11b3323cac6..de34b727a5a0 100644
--- a/lib/Analysis/RegionPass.cpp
+++ b/lib/Analysis/RegionPass.cpp
@@ -45,14 +45,14 @@ static void addRegionIntoQueue(Region &R, std::deque<Region *> &RQ) {
 /// Pass Manager itself does not invalidate any analysis info.
 void RGPassManager::getAnalysisUsage(AnalysisUsage &Info) const {
-  Info.addRequired<RegionInfo>();
+  Info.addRequired<RegionInfoPass>();
   Info.setPreservesAll();
 }
 /// run - Execute all of the passes scheduled for execution. Keep track of
 /// whether any of the passes modifies the function, and if so, return true.
 bool RGPassManager::runOnFunction(Function &F) {
-  RI = &getAnalysis<RegionInfo>();
+  RI = &getAnalysis<RegionInfoPass>().getRegionInfo();
   bool Changed = false;
   // Collect inherited analysis from Module level pass manager.
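The two RegionPass.cpp hunks above show the client-side consequence of this refactor: RegionInfo is no longer itself the FunctionPass, so clients now request the RegionInfoPass wrapper and unwrap the analysis with getRegionInfo(), exactly as RGPassManager does here. A minimal sketch of a downstream pass written against the new interface; the pass name and body are illustrative only, not part of this patch:

    #include "llvm/Analysis/RegionInfo.h"
    #include "llvm/Pass.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    namespace {
    // Hypothetical client pass; the RegionInfoPass plumbing is the point.
    struct RegionNamePrinter : public FunctionPass {
      static char ID;
      RegionNamePrinter() : FunctionPass(ID) {}

      void getAnalysisUsage(AnalysisUsage &AU) const override {
        AU.addRequired<RegionInfoPass>(); // depend on the wrapper pass
        AU.setPreservesAll();
      }

      bool runOnFunction(Function &F) override {
        // Unwrap the analysis result owned by the wrapper, as the
        // RGPassManager hunk above now does.
        RegionInfo &RI = getAnalysis<RegionInfoPass>().getRegionInfo();
        errs() << F.getName() << ": "
               << RI.getTopLevelRegion()->getNameStr() << "\n";
        return false;
      }
    };
    }
    char RegionNamePrinter::ID = 0;

Decoupling the RegionInfo object from the pass that computes it is presumably what allows the RegionTraits-templated base classes instantiated earlier in this patch to be reused for other function-like units without duplicating pass boilerplate.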
@@ -196,8 +196,10 @@ class PrintRegionPass : public RegionPass {
   bool runOnRegion(Region *R, RGPassManager &RGM) override {
     Out << Banner;
     for (const auto &BB : R->blocks()) {
-      assert(BB != nullptr && "Expecting non-null Block");
-      BB->print(Out);
+      if (BB)
+        BB->print(Out);
+      else
+        Out << "Printing <null> Block";
     }
     return false;
diff --git a/lib/Analysis/RegionPrinter.cpp b/lib/Analysis/RegionPrinter.cpp
index 893210a5d705..ad83113ec930 100644
--- a/lib/Analysis/RegionPrinter.cpp
+++ b/lib/Analysis/RegionPrinter.cpp
@@ -56,23 +56,24 @@ struct DOTGraphTraits<RegionNode*> : public DefaultDOTGraphTraits {
 };
 template<>
-struct DOTGraphTraits<RegionInfo*> : public DOTGraphTraits<RegionNode*> {
+struct DOTGraphTraits<RegionInfoPass*> : public DOTGraphTraits<RegionNode*> {
-  DOTGraphTraits (bool isSimple=false)
+  DOTGraphTraits (bool isSimple = false)
     : DOTGraphTraits<RegionNode*>(isSimple) {}
-  static std::string getGraphName(RegionInfo *DT) {
+  static std::string getGraphName(RegionInfoPass *DT) {
     return "Region Graph";
   }
-  std::string getNodeLabel(RegionNode *Node, RegionInfo *G) {
+  std::string getNodeLabel(RegionNode *Node, RegionInfoPass *G) {
+    RegionInfo &RI = G->getRegionInfo();
     return DOTGraphTraits<RegionNode*>::getNodeLabel(Node,
-                                                     G->getTopLevelRegion());
+        reinterpret_cast<RegionNode*>(RI.getTopLevelRegion()));
   }
   std::string getEdgeAttributes(RegionNode *srcNode,
-    GraphTraits<RegionInfo*>::ChildIteratorType CI, RegionInfo *RI) {
-
+    GraphTraits<RegionInfo*>::ChildIteratorType CI, RegionInfoPass *G) {
+    RegionInfo &RI = G->getRegionInfo();
     RegionNode *destNode = *CI;
     if (srcNode->isSubRegion() || destNode->isSubRegion())
@@ -82,7 +83,7 @@ struct DOTGraphTraits<RegionInfo*> : public DOTGraphTraits<RegionNode*> {
     BasicBlock *srcBB = srcNode->getNodeAs<BasicBlock>();
     BasicBlock *destBB = destNode->getNodeAs<BasicBlock>();
-    Region *R = RI->getRegionFor(destBB);
+    Region *R = RI.getRegionFor(destBB);
     while (R && R->getParent())
       if (R->getParent()->getEntry() == destBB)
@@ -98,7 +99,8 @@ struct DOTGraphTraits<RegionInfo*> : public DOTGraphTraits<RegionNode*> {
   // Print the cluster of the subregions. This groups the single basic blocks
   // and adds a different background color for each group.
- static void printRegionCluster(const Region &R, GraphWriter &GW, + static void printRegionCluster(const Region &R, + GraphWriter &GW, unsigned depth = 0) { raw_ostream &O = GW.getOStream(); O.indent(2 * depth) << "subgraph cluster_" << static_cast(&R) @@ -119,22 +121,23 @@ struct DOTGraphTraits : public DOTGraphTraits { for (Region::const_iterator RI = R.begin(), RE = R.end(); RI != RE; ++RI) printRegionCluster(**RI, GW, depth + 1); - RegionInfo *RI = R.getRegionInfo(); + const RegionInfo &RI = *static_cast(R.getRegionInfo()); for (const auto &BB : R.blocks()) - if (RI->getRegionFor(BB) == &R) + if (RI.getRegionFor(BB) == &R) O.indent(2 * (depth + 1)) << "Node" - << static_cast(RI->getTopLevelRegion()->getBBNode(BB)) + << static_cast(RI.getTopLevelRegion()->getBBNode(BB)) << ";\n"; O.indent(2 * depth) << "}\n"; } - static void addCustomGraphFeatures(const RegionInfo* RI, - GraphWriter &GW) { + static void addCustomGraphFeatures(const RegionInfoPass* RIP, + GraphWriter &GW) { + const RegionInfo &RI = RIP->getRegionInfo(); raw_ostream &O = GW.getOStream(); O << "\tcolorscheme = \"paired12\"\n"; - printRegionCluster(*RI->getTopLevelRegion(), GW, 4); + printRegionCluster(*RI.getTopLevelRegion(), GW, 4); } }; } //end namespace llvm @@ -142,28 +145,28 @@ struct DOTGraphTraits : public DOTGraphTraits { namespace { struct RegionViewer - : public DOTGraphTraitsViewer { + : public DOTGraphTraitsViewer { static char ID; - RegionViewer() : DOTGraphTraitsViewer("reg", ID){ + RegionViewer() : DOTGraphTraitsViewer("reg", ID){ initializeRegionViewerPass(*PassRegistry::getPassRegistry()); } }; char RegionViewer::ID = 0; struct RegionOnlyViewer - : public DOTGraphTraitsViewer { + : public DOTGraphTraitsViewer { static char ID; - RegionOnlyViewer() : DOTGraphTraitsViewer("regonly", ID) { + RegionOnlyViewer() : DOTGraphTraitsViewer("regonly", ID) { initializeRegionOnlyViewerPass(*PassRegistry::getPassRegistry()); } }; char RegionOnlyViewer::ID = 0; struct RegionPrinter - : public DOTGraphTraitsPrinter { + : public DOTGraphTraitsPrinter { static char ID; RegionPrinter() : - DOTGraphTraitsPrinter("reg", ID) { + DOTGraphTraitsPrinter("reg", ID) { initializeRegionPrinterPass(*PassRegistry::getPassRegistry()); } }; @@ -175,7 +178,7 @@ INITIALIZE_PASS(RegionPrinter, "dot-regions", INITIALIZE_PASS(RegionViewer, "view-regions", "View regions of function", true, true) - + INITIALIZE_PASS(RegionOnlyViewer, "view-regions-only", "View regions of function (with no function bodies)", true, true) @@ -183,10 +186,10 @@ INITIALIZE_PASS(RegionOnlyViewer, "view-regions-only", namespace { struct RegionOnlyPrinter - : public DOTGraphTraitsPrinter { + : public DOTGraphTraitsPrinter { static char ID; RegionOnlyPrinter() : - DOTGraphTraitsPrinter("reg", ID) { + DOTGraphTraitsPrinter("reg", ID) { initializeRegionOnlyPrinterPass(*PassRegistry::getPassRegistry()); } }; diff --git a/lib/Analysis/ScalarEvolutionExpander.cpp b/lib/Analysis/ScalarEvolutionExpander.cpp index b5070434d14d..8c75b0db70f2 100644 --- a/lib/Analysis/ScalarEvolutionExpander.cpp +++ b/lib/Analysis/ScalarEvolutionExpander.cpp @@ -16,6 +16,7 @@ #include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallSet.h" +#include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/DataLayout.h" @@ -1706,7 +1707,7 @@ unsigned SCEVExpander::replaceCongruentIVs(Loop *L, const DominatorTree *DT, // Fold constant phis. 
They may be congruent to other constant phis and // would confuse the logic below that expects proper IVs. - if (Value *V = Phi->hasConstantValue()) { + if (Value *V = SimplifyInstruction(Phi, SE.DL, SE.TLI, SE.DT)) { Phi->replaceAllUsesWith(V); DeadInsts.push_back(Phi); ++NumElim; diff --git a/lib/Analysis/ValueTracking.cpp b/lib/Analysis/ValueTracking.cpp index 4f4875357828..e6d09f4e31f6 100644 --- a/lib/Analysis/ValueTracking.cpp +++ b/lib/Analysis/ValueTracking.cpp @@ -29,6 +29,7 @@ #include "llvm/IR/Metadata.h" #include "llvm/IR/Operator.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/Support/Debug.h" #include "llvm/Support/MathExtras.h" #include using namespace llvm; @@ -188,7 +189,8 @@ static void computeKnownBitsMul(Value *Op0, Value *Op1, bool NSW, KnownOne.setBit(BitWidth - 1); } -void llvm::computeKnownBitsLoad(const MDNode &Ranges, APInt &KnownZero) { +void llvm::computeKnownBitsFromRangeMetadata(const MDNode &Ranges, + APInt &KnownZero) { unsigned BitWidth = KnownZero.getBitWidth(); unsigned NumRanges = Ranges.getNumOperands() / 2; assert(NumRanges >= 1); @@ -338,7 +340,7 @@ void llvm::computeKnownBits(Value *V, APInt &KnownZero, APInt &KnownOne, default: break; case Instruction::Load: if (MDNode *MD = cast(I)->getMetadata(LLVMContext::MD_range)) - computeKnownBitsLoad(*MD, KnownZero); + computeKnownBitsFromRangeMetadata(*MD, KnownZero); break; case Instruction::And: { // If either the LHS or the RHS are Zero, the result is zero. @@ -414,6 +416,7 @@ void llvm::computeKnownBits(Value *V, APInt &KnownZero, APInt &KnownOne, break; // Can't work with floating point. case Instruction::PtrToInt: case Instruction::IntToPtr: + case Instruction::AddrSpaceCast: // Pointers could be different sizes. // We can't handle these if we don't know the pointer size. if (!TD) break; // FALL THROUGH and handle them the same as zext/trunc. @@ -733,6 +736,12 @@ void llvm::computeKnownBits(Value *V, APInt &KnownZero, APInt &KnownOne, break; } case Instruction::Call: + case Instruction::Invoke: + if (MDNode *MD = cast(I)->getMetadata(LLVMContext::MD_range)) + computeKnownBitsFromRangeMetadata(*MD, KnownZero); + // If a range metadata is attached to this IntrinsicInst, intersect the + // explicit range specified by the metadata and the implicit range of + // the intrinsic. if (IntrinsicInst *II = dyn_cast(I)) { switch (II->getIntrinsicID()) { default: break; @@ -742,16 +751,16 @@ void llvm::computeKnownBits(Value *V, APInt &KnownZero, APInt &KnownOne, // If this call is undefined for 0, the result will be less than 2^n. 
if (II->getArgOperand(1) == ConstantInt::getTrue(II->getContext())) LowBits -= 1; - KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - LowBits); + KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - LowBits); break; } case Intrinsic::ctpop: { unsigned LowBits = Log2_32(BitWidth)+1; - KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - LowBits); + KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - LowBits); break; } case Intrinsic::x86_sse42_crc32_64_64: - KnownZero = APInt::getHighBitsSet(64, 32); + KnownZero |= APInt::getHighBitsSet(64, 32); break; } } @@ -1724,7 +1733,8 @@ Value *llvm::GetPointerBaseWithConstantOffset(Value *Ptr, int64_t &Offset, } Ptr = GEP->getPointerOperand(); - } else if (Operator::getOpcode(Ptr) == Instruction::BitCast) { + } else if (Operator::getOpcode(Ptr) == Instruction::BitCast || + Operator::getOpcode(Ptr) == Instruction::AddrSpaceCast) { Ptr = cast(Ptr)->getOperand(0); } else if (GlobalAlias *GA = dyn_cast(Ptr)) { if (GA->mayBeOverridden()) @@ -1893,7 +1903,8 @@ llvm::GetUnderlyingObject(Value *V, const DataLayout *TD, unsigned MaxLookup) { for (unsigned Count = 0; MaxLookup == 0 || Count < MaxLookup; ++Count) { if (GEPOperator *GEP = dyn_cast(V)) { V = GEP->getPointerOperand(); - } else if (Operator::getOpcode(V) == Instruction::BitCast) { + } else if (Operator::getOpcode(V) == Instruction::BitCast || + Operator::getOpcode(V) == Instruction::AddrSpaceCast) { V = cast(V)->getOperand(0); } else if (GlobalAlias *GA = dyn_cast(V)) { if (GA->mayBeOverridden()) @@ -1977,7 +1988,7 @@ bool llvm::isSafeToSpeculativelyExecute(const Value *V, return true; case Instruction::UDiv: case Instruction::URem: - // x / y is undefined if y == 0, but calcuations like x / 3 are safe. + // x / y is undefined if y == 0, but calculations like x / 3 are safe. return isKnownNonZero(Inst->getOperand(1), TD); case Instruction::SDiv: case Instruction::SRem: { @@ -2000,12 +2011,12 @@ bool llvm::isSafeToSpeculativelyExecute(const Value *V, // Speculative load may create a race that did not exist in the source. LI->getParent()->getParent()->hasFnAttribute(Attribute::SanitizeThread)) return false; - return LI->getPointerOperand()->isDereferenceablePointer(); + return LI->getPointerOperand()->isDereferenceablePointer(TD); } case Instruction::Call: { if (const IntrinsicInst *II = dyn_cast(Inst)) { switch (II->getIntrinsicID()) { - // These synthetic intrinsics have no side-effects, and just mark + // These synthetic intrinsics have no side-effects and just mark // information about their operands. // FIXME: There are other no-op synthetic instructions that potentially // should be considered at least *safe* to speculate... @@ -2075,7 +2086,7 @@ bool llvm::isKnownNonNull(const Value *V, const TargetLibraryInfo *TLI) { return !GV->hasExternalWeakLinkage(); if (ImmutableCallSite CS = V) - if (CS.paramHasAttr(0, Attribute::NonNull)) + if (CS.isReturnNonNull()) return true; // operator new never returns null. 
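One subtlety in the ValueTracking.cpp changes above deserves a note: the switch from plain assignment to KnownZero |= ... matters because call-site range metadata is now folded into KnownZero first, and known-zero facts from independent sources must be combined rather than overwritten. Since each source proves certain bits are zero, the masks compose by bitwise OR. A small self-contained illustration using APInt follows; the width and the ranges are invented for the example, not taken from the patch.

    #include "llvm/ADT/APInt.h"
    #include <cassert>
    using namespace llvm;

    int main() {
      const unsigned BitWidth = 32;
      // Suppose !range metadata proves a call result is < 8, so every bit
      // above the low 3 is known zero.
      APInt KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - 3);
      // ctpop of an i32 is at most 32, which fits in Log2_32(32) + 1 = 6
      // bits, so the intrinsic alone proves the top 26 bits are zero.
      const unsigned LowBits = 6;
      KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - LowBits);
      // OR-ing keeps the stronger fact (top 29 bits zero); the old plain
      // assignment would have discarded the metadata-derived bits.
      assert(KnownZero.countLeadingOnes() == 29);
      return 0;
    }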
diff --git a/lib/AsmParser/LLLexer.cpp b/lib/AsmParser/LLLexer.cpp index 1334825a7d56..962298fcd542 100644 --- a/lib/AsmParser/LLLexer.cpp +++ b/lib/AsmParser/LLLexer.cpp @@ -209,6 +209,7 @@ lltok::Kind LLLexer::LexToken() { return LexToken(); case '+': return LexPositive(); case '@': return LexAt(); + case '$': return LexDollar(); case '%': return LexPercent(); case '"': return LexQuote(); case '.': @@ -222,13 +223,6 @@ lltok::Kind LLLexer::LexToken() { return lltok::dotdotdot; } return lltok::Error; - case '$': - if (const char *Ptr = isLabelTail(CurPtr)) { - CurPtr = Ptr; - StrVal.assign(TokStart, CurPtr-1); - return lltok::LabelStr; - } - return lltok::Error; case ';': SkipLineComment(); return LexToken(); @@ -307,6 +301,43 @@ lltok::Kind LLLexer::LexAt() { return lltok::Error; } +lltok::Kind LLLexer::LexDollar() { + if (const char *Ptr = isLabelTail(TokStart)) { + CurPtr = Ptr; + StrVal.assign(TokStart, CurPtr - 1); + return lltok::LabelStr; + } + + // Handle DollarStringConstant: $\"[^\"]*\" + if (CurPtr[0] == '"') { + ++CurPtr; + + while (1) { + int CurChar = getNextChar(); + + if (CurChar == EOF) { + Error("end of file in COMDAT variable name"); + return lltok::Error; + } + if (CurChar == '"') { + StrVal.assign(TokStart + 2, CurPtr - 1); + UnEscapeLexed(StrVal); + if (StringRef(StrVal).find_first_of(0) != StringRef::npos) { + Error("Null bytes are not allowed in names"); + return lltok::Error; + } + return lltok::ComdatVar; + } + } + } + + // Handle ComdatVarName: $[-a-zA-Z$._][-a-zA-Z$._0-9]* + if (ReadVarName()) + return lltok::ComdatVar; + + return lltok::Error; +} + /// ReadString - Read a string until the closing quote. lltok::Kind LLLexer::ReadString(lltok::Kind kind) { const char *Start = CurPtr; @@ -581,6 +612,7 @@ lltok::Kind LLLexer::LexIdentifier() { KEYWORD(byval); KEYWORD(inalloca); KEYWORD(cold); + KEYWORD(dereferenceable); KEYWORD(inlinehint); KEYWORD(inreg); KEYWORD(jumptable); @@ -618,6 +650,15 @@ lltok::Kind LLLexer::LexIdentifier() { KEYWORD(type); KEYWORD(opaque); + KEYWORD(comdat); + + // Comdat types + KEYWORD(any); + KEYWORD(exactmatch); + KEYWORD(largest); + KEYWORD(noduplicates); + KEYWORD(samesize); + KEYWORD(eq); KEYWORD(ne); KEYWORD(slt); KEYWORD(sgt); KEYWORD(sle); KEYWORD(sge); KEYWORD(ult); KEYWORD(ugt); KEYWORD(ule); KEYWORD(uge); KEYWORD(oeq); KEYWORD(one); KEYWORD(olt); KEYWORD(ogt); KEYWORD(ole); diff --git a/lib/AsmParser/LLLexer.h b/lib/AsmParser/LLLexer.h index ad11d49b2594..d42de57a3d6e 100644 --- a/lib/AsmParser/LLLexer.h +++ b/lib/AsmParser/LLLexer.h @@ -81,6 +81,7 @@ namespace llvm { lltok::Kind LexDigitOrNegative(); lltok::Kind LexPositive(); lltok::Kind LexAt(); + lltok::Kind LexDollar(); lltok::Kind LexExclaim(); lltok::Kind LexPercent(); lltok::Kind LexQuote(); diff --git a/lib/AsmParser/LLParser.cpp b/lib/AsmParser/LLParser.cpp index 0c188f983ff5..ac6e0e512fea 100644 --- a/lib/AsmParser/LLParser.cpp +++ b/lib/AsmParser/LLParser.cpp @@ -163,6 +163,11 @@ bool LLParser::ValidateEndOfModule() { return Error(I->second.second, "use of undefined type named '" + I->getKey() + "'"); + if (!ForwardRefComdats.empty()) + return Error(ForwardRefComdats.begin()->second, + "use of undefined comdat '$" + + ForwardRefComdats.begin()->first + "'"); + if (!ForwardRefVals.empty()) return Error(ForwardRefVals.begin()->second.second, "use of undefined value '@" + ForwardRefVals.begin()->first + @@ -238,6 +243,7 @@ bool LLParser::ParseTopLevelEntities() { case lltok::LocalVar: if (ParseNamedType()) return true; break; case lltok::GlobalID: if 
(ParseUnnamedGlobal()) return true; break; case lltok::GlobalVar: if (ParseNamedGlobal()) return true; break; + case lltok::ComdatVar: if (parseComdat()) return true; break; case lltok::exclaim: if (ParseStandaloneMetadata()) return true; break; case lltok::MetadataVar:if (ParseNamedMetadata()) return true; break; @@ -513,11 +519,62 @@ bool LLParser::ParseNamedGlobal() { UnnamedAddr); } +bool LLParser::parseComdat() { + assert(Lex.getKind() == lltok::ComdatVar); + std::string Name = Lex.getStrVal(); + LocTy NameLoc = Lex.getLoc(); + Lex.Lex(); + + if (ParseToken(lltok::equal, "expected '=' here")) + return true; + + if (ParseToken(lltok::kw_comdat, "expected comdat keyword")) + return TokError("expected comdat type"); + + Comdat::SelectionKind SK; + switch (Lex.getKind()) { + default: + return TokError("unknown selection kind"); + case lltok::kw_any: + SK = Comdat::Any; + break; + case lltok::kw_exactmatch: + SK = Comdat::ExactMatch; + break; + case lltok::kw_largest: + SK = Comdat::Largest; + break; + case lltok::kw_noduplicates: + SK = Comdat::NoDuplicates; + break; + case lltok::kw_samesize: + SK = Comdat::SameSize; + break; + } + Lex.Lex(); + + // See if the comdat was forward referenced, if so, use the comdat. + Module::ComdatSymTabType &ComdatSymTab = M->getComdatSymbolTable(); + Module::ComdatSymTabType::iterator I = ComdatSymTab.find(Name); + if (I != ComdatSymTab.end() && !ForwardRefComdats.erase(Name)) + return Error(NameLoc, "redefinition of comdat '$" + Name + "'"); + + Comdat *C; + if (I != ComdatSymTab.end()) + C = &I->second; + else + C = M->getOrInsertComdat(Name); + C->setSelectionKind(SK); + + return false; +} + // MDString: // ::= '!' STRINGCONSTANT bool LLParser::ParseMDString(MDString *&Result) { std::string Str; if (ParseStringConstant(Str)) return true; + llvm::UpgradeMDStringConstant(Str); Result = MDString::get(Context, Str); return false; } @@ -837,7 +894,13 @@ bool LLParser::ParseGlobal(const std::string &Name, LocTy NameLoc, if (ParseOptionalAlignment(Alignment)) return true; GV->setAlignment(Alignment); } else { - TokError("unknown global variable property!"); + Comdat *C; + if (parseOptionalComdat(C)) + return true; + if (C) + GV->setComdat(C); + else + return TokError("unknown global variable property!"); } } @@ -989,6 +1052,7 @@ bool LLParser::ParseFnAttributeValuePairs(AttrBuilder &B, "invalid use of attribute on a function"); break; case lltok::kw_byval: + case lltok::kw_dereferenceable: case lltok::kw_inalloca: case lltok::kw_nest: case lltok::kw_noalias: @@ -1095,6 +1159,24 @@ GlobalValue *LLParser::GetGlobalVal(unsigned ID, Type *Ty, LocTy Loc) { } +//===----------------------------------------------------------------------===// +// Comdat Reference/Resolution Routines. +//===----------------------------------------------------------------------===// + +Comdat *LLParser::getComdat(const std::string &Name, LocTy Loc) { + // Look this name up in the comdat symbol table. + Module::ComdatSymTabType &ComdatSymTab = M->getComdatSymbolTable(); + Module::ComdatSymTabType::iterator I = ComdatSymTab.find(Name); + if (I != ComdatSymTab.end()) + return &I->second; + + // Otherwise, create a new forward reference for this value and remember it. + Comdat *C = M->getOrInsertComdat(Name); + ForwardRefComdats[Name] = Loc; + return C; +} + + //===----------------------------------------------------------------------===// // Helper Routines. 
//===----------------------------------------------------------------------===// @@ -1131,6 +1213,16 @@ bool LLParser::ParseUInt32(unsigned &Val) { return false; } +/// ParseUInt64 +/// ::= uint64 +bool LLParser::ParseUInt64(uint64_t &Val) { + if (Lex.getKind() != lltok::APSInt || Lex.getAPSIntVal().isSigned()) + return TokError("expected integer"); + Val = Lex.getAPSIntVal().getLimitedValue(); + Lex.Lex(); + return false; +} + /// ParseTLSModel /// := 'localdynamic' /// := 'initialexec' @@ -1203,6 +1295,13 @@ bool LLParser::ParseOptionalParamAttrs(AttrBuilder &B) { continue; } case lltok::kw_byval: B.addAttribute(Attribute::ByVal); break; + case lltok::kw_dereferenceable: { + uint64_t Bytes; + if (ParseOptionalDereferenceableBytes(Bytes)) + return true; + B.addDereferenceableAttr(Bytes); + continue; + } case lltok::kw_inalloca: B.addAttribute(Attribute::InAlloca); break; case lltok::kw_inreg: B.addAttribute(Attribute::InReg); break; case lltok::kw_nest: B.addAttribute(Attribute::Nest); break; @@ -1260,6 +1359,13 @@ bool LLParser::ParseOptionalReturnAttrs(AttrBuilder &B) { switch (Token) { default: // End of attributes. return HaveError; + case lltok::kw_dereferenceable: { + uint64_t Bytes; + if (ParseOptionalDereferenceableBytes(Bytes)) + return true; + B.addDereferenceableAttr(Bytes); + continue; + } case lltok::kw_inreg: B.addAttribute(Attribute::InReg); break; case lltok::kw_noalias: B.addAttribute(Attribute::NoAlias); break; case lltok::kw_nonnull: B.addAttribute(Attribute::NonNull); break; @@ -1525,6 +1631,26 @@ bool LLParser::ParseOptionalAlignment(unsigned &Alignment) { return false; } +/// ParseOptionalDereferenceableBytes +/// ::= /* empty */ +/// ::= 'dereferenceable' '(' 4 ')' +bool LLParser::ParseOptionalDereferenceableBytes(uint64_t &Bytes) { + Bytes = 0; + if (!EatIfPresent(lltok::kw_dereferenceable)) + return false; + LocTy ParenLoc = Lex.getLoc(); + if (!EatIfPresent(lltok::lparen)) + return Error(ParenLoc, "expected '('"); + LocTy DerefLoc = Lex.getLoc(); + if (ParseUInt64(Bytes)) return true; + ParenLoc = Lex.getLoc(); + if (!EatIfPresent(lltok::rparen)) + return Error(ParenLoc, "expected ')'"); + if (!Bytes) + return Error(DerefLoc, "dereferenceable bytes must be non-zero"); + return false; +} + /// ParseOptionalCommaAlign /// ::= /// ::= ',' align 4 @@ -2789,6 +2915,19 @@ bool LLParser::ParseGlobalTypeAndValue(Constant *&V) { ParseGlobalValue(Ty, V); } +bool LLParser::parseOptionalComdat(Comdat *&C) { + C = nullptr; + if (!EatIfPresent(lltok::kw_comdat)) + return false; + if (Lex.getKind() != lltok::ComdatVar) + return TokError("expected comdat variable"); + LocTy Loc = Lex.getLoc(); + StringRef Name = Lex.getStrVal(); + C = getComdat(Name, Loc); + Lex.Lex(); + return false; +} + /// ParseGlobalValueVector /// ::= /*empty*/ /// ::= TypeAndValue (',' TypeAndValue)* @@ -3089,6 +3228,7 @@ bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) { bool UnnamedAddr; LocTy UnnamedAddrLoc; Constant *Prefix = nullptr; + Comdat *C; if (ParseArgumentList(ArgList, isVarArg) || ParseOptionalToken(lltok::kw_unnamed_addr, UnnamedAddr, @@ -3097,6 +3237,7 @@ bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) { BuiltinLoc) || (EatIfPresent(lltok::kw_section) && ParseStringConstant(Section)) || + parseOptionalComdat(C) || ParseOptionalAlignment(Alignment) || (EatIfPresent(lltok::kw_gc) && ParseStringConstant(GC)) || @@ -3199,6 +3340,7 @@ bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) { Fn->setUnnamedAddr(UnnamedAddr); Fn->setAlignment(Alignment); 
Fn->setSection(Section); + Fn->setComdat(C); if (!GC.empty()) Fn->setGC(GC.c_str()); Fn->setPrefixData(Prefix); ForwardRefAttrGroups[Fn] = FwdRefAttrGrps; diff --git a/lib/AsmParser/LLParser.h b/lib/AsmParser/LLParser.h index f7d69d267d5d..7203bb245d0f 100644 --- a/lib/AsmParser/LLParser.h +++ b/lib/AsmParser/LLParser.h @@ -34,6 +34,7 @@ namespace llvm { class Instruction; class Constant; class GlobalValue; + class Comdat; class MDString; class MDNode; class StructType; @@ -122,6 +123,9 @@ namespace llvm { std::map > ForwardRefValIDs; std::vector NumberedVals; + // Comdat forward reference information. + std::map ForwardRefComdats; + // References to blockaddress. The key is the function ValID, the value is // a list of references to blocks in that function. std::map > > @@ -154,6 +158,10 @@ namespace llvm { GlobalValue *GetGlobalVal(const std::string &N, Type *Ty, LocTy Loc); GlobalValue *GetGlobalVal(unsigned ID, Type *Ty, LocTy Loc); + /// Get a Comdat with the specified name, creating a forward reference + /// record if needed. + Comdat *getComdat(const std::string &N, LocTy Loc); + // Helper Routines. bool ParseToken(lltok::Kind T, const char *ErrMsg); bool EatIfPresent(lltok::Kind T) { @@ -194,6 +202,11 @@ namespace llvm { Loc = Lex.getLoc(); return ParseUInt32(Val); } + bool ParseUInt64(uint64_t &Val); + bool ParseUInt64(uint64_t &Val, LocTy &Loc) { + Loc = Lex.getLoc(); + return ParseUInt64(Val); + } bool ParseTLSModel(GlobalVariable::ThreadLocalMode &TLM); bool ParseOptionalThreadLocal(GlobalVariable::ThreadLocalMode &TLM); @@ -211,6 +224,7 @@ namespace llvm { bool ParseOptionalDLLStorageClass(unsigned &DLLStorageClass); bool ParseOptionalCallingConv(CallingConv::ID &CC); bool ParseOptionalAlignment(unsigned &Alignment); + bool ParseOptionalDereferenceableBytes(uint64_t &Bytes); bool ParseScopeAndOrdering(bool isAtomic, SynchronizationScope &Scope, AtomicOrdering &Ordering); bool ParseOrdering(AtomicOrdering &Ordering); @@ -247,6 +261,7 @@ namespace llvm { bool ParseAlias(const std::string &Name, LocTy Loc, unsigned Visibility, unsigned DLLStorageClass, GlobalVariable::ThreadLocalMode TLM, bool UnnamedAddr); + bool parseComdat(); bool ParseStandaloneMetadata(); bool ParseNamedMetadata(); bool ParseMDString(MDString *&Result); @@ -358,6 +373,7 @@ namespace llvm { bool ParseGlobalValue(Type *Ty, Constant *&V); bool ParseGlobalTypeAndValue(Constant *&V); bool ParseGlobalValueVector(SmallVectorImpl &Elts); + bool parseOptionalComdat(Comdat *&C); bool ParseMetadataListValue(ValID &ID, PerFunctionState *PFS); bool ParseMetadataValue(ValID &ID, PerFunctionState *PFS); bool ParseMDNodeVector(SmallVectorImpl &, PerFunctionState *PFS); diff --git a/lib/AsmParser/LLToken.h b/lib/AsmParser/LLToken.h index af8b0da78bf8..2f02606f93e7 100644 --- a/lib/AsmParser/LLToken.h +++ b/lib/AsmParser/LLToken.h @@ -106,6 +106,7 @@ namespace lltok { kw_byval, kw_inalloca, kw_cold, + kw_dereferenceable, kw_inlinehint, kw_inreg, kw_jumptable, @@ -142,6 +143,15 @@ namespace lltok { kw_type, kw_opaque, + kw_comdat, + + // Comdat types + kw_any, + kw_exactmatch, + kw_largest, + kw_noduplicates, + kw_samesize, + kw_eq, kw_ne, kw_slt, kw_sgt, kw_sle, kw_sge, kw_ult, kw_ugt, kw_ule, kw_uge, kw_oeq, kw_one, kw_olt, kw_ogt, kw_ole, kw_oge, kw_ord, kw_uno, kw_ueq, kw_une, @@ -180,6 +190,7 @@ namespace lltok { // String valued tokens (StrVal). 
LabelStr,       // foo:
 GlobalVar,      // @foo @"foo"
+ComdatVar,      // $foo
 LocalVar,       // %foo %"foo"
 MetadataVar,    // !foo
 StringConstant, // "foo"
diff --git a/lib/AsmParser/Parser.cpp b/lib/AsmParser/Parser.cpp
index 067cac8578cd..91bb51c24224 100644
--- a/lib/AsmParser/Parser.cpp
+++ b/lib/AsmParser/Parser.cpp
@@ -41,21 +41,21 @@ Module *llvm::ParseAssembly(MemoryBuffer *F,
 Module *llvm::ParseAssemblyFile(const std::string &Filename, SMDiagnostic &Err,
                                 LLVMContext &Context) {
-  std::unique_ptr<MemoryBuffer> File;
-  if (std::error_code ec = MemoryBuffer::getFileOrSTDIN(Filename, File)) {
+  ErrorOr<std::unique_ptr<MemoryBuffer>> FileOrErr =
+      MemoryBuffer::getFileOrSTDIN(Filename);
+  if (std::error_code EC = FileOrErr.getError()) {
     Err = SMDiagnostic(Filename, SourceMgr::DK_Error,
-                       "Could not open input file: " + ec.message());
+                       "Could not open input file: " + EC.message());
     return nullptr;
   }
-  return ParseAssembly(File.release(), nullptr, Err, Context);
+  return ParseAssembly(FileOrErr.get().release(), nullptr, Err, Context);
 }
 Module *llvm::ParseAssemblyString(const char *AsmString, Module *M,
                                   SMDiagnostic &Err, LLVMContext &Context) {
   MemoryBuffer *F =
-      MemoryBuffer::getMemBuffer(StringRef(AsmString, strlen(AsmString)),
-                                 "<string>");
+      MemoryBuffer::getMemBuffer(StringRef(AsmString), "<string>");
   return ParseAssembly(F, M, Err, Context);
 }
diff --git a/lib/Bitcode/Reader/BitcodeReader.cpp b/lib/Bitcode/Reader/BitcodeReader.cpp
index 9c398277d42d..47a39539e20f 100644
--- a/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -39,12 +39,11 @@ void BitcodeReader::materializeForwardReferencedFunctions() {
 }
 void BitcodeReader::FreeState() {
-  if (BufferOwned)
-    delete Buffer;
   Buffer = nullptr;
   std::vector<Type*>().swap(TypeList);
   ValueList.clear();
   MDValueList.clear();
+  std::vector<Comdat *>().swap(ComdatList);
   std::vector<AttributeSet>().swap(MAttributes);
   std::vector<BasicBlock*>().swap(FunctionBBs);
@@ -205,6 +204,22 @@ static SynchronizationScope GetDecodedSynchScope(unsigned Val) {
   }
 }
+static Comdat::SelectionKind getDecodedComdatSelectionKind(unsigned Val) {
+  switch (Val) {
+  default: // Map unknown selection kinds to any.
+ case bitc::COMDAT_SELECTION_KIND_ANY: + return Comdat::Any; + case bitc::COMDAT_SELECTION_KIND_EXACT_MATCH: + return Comdat::ExactMatch; + case bitc::COMDAT_SELECTION_KIND_LARGEST: + return Comdat::Largest; + case bitc::COMDAT_SELECTION_KIND_NO_DUPLICATES: + return Comdat::NoDuplicates; + case bitc::COMDAT_SELECTION_KIND_SAME_SIZE: + return Comdat::SameSize; + } +} + static void UpgradeDLLImportExportLinkage(llvm::GlobalValue *GV, unsigned Val) { switch (Val) { case 5: GV->setDLLStorageClass(GlobalValue::DLLImportStorageClass); break; @@ -573,6 +588,8 @@ static Attribute::AttrKind GetAttrFromCode(uint64_t Code) { return Attribute::NonLazyBind; case bitc::ATTR_KIND_NON_NULL: return Attribute::NonNull; + case bitc::ATTR_KIND_DEREFERENCEABLE: + return Attribute::Dereferenceable; case bitc::ATTR_KIND_NO_RED_ZONE: return Attribute::NoRedZone; case bitc::ATTR_KIND_NO_RETURN: @@ -668,14 +685,16 @@ std::error_code BitcodeReader::ParseAttributeGroupBlock() { return EC; B.addAttribute(Kind); - } else if (Record[i] == 1) { // Align attribute + } else if (Record[i] == 1) { // Integer attribute Attribute::AttrKind Kind; if (std::error_code EC = ParseAttrKind(Record[++i], &Kind)) return EC; if (Kind == Attribute::Alignment) B.addAlignmentAttr(Record[++i]); - else + else if (Kind == Attribute::StackAlignment) B.addStackAlignmentAttr(Record[++i]); + else if (Kind == Attribute::Dereferenceable) + B.addDereferenceableAttr(Record[++i]); } else { // String attribute assert((Record[i] == 3 || Record[i] == 4) && "Invalid attribute group entry"); @@ -1064,7 +1083,8 @@ std::error_code BitcodeReader::ParseMetadata() { break; } case bitc::METADATA_STRING: { - SmallString<8> String(Record.begin(), Record.end()); + std::string String(Record.begin(), Record.end()); + llvm::UpgradeMDStringConstant(String); Value *V = MDString::get(Context, String); MDValueList.AssignValue(V, NextMDValueNo++); break; @@ -1839,6 +1859,20 @@ std::error_code BitcodeReader::ParseModule(bool Resume) { GCTable.push_back(S); break; } + case bitc::MODULE_CODE_COMDAT: { // COMDAT: [selection_kind, name] + if (Record.size() < 2) + return Error(InvalidRecord); + Comdat::SelectionKind SK = getDecodedComdatSelectionKind(Record[0]); + unsigned ComdatNameSize = Record[1]; + std::string ComdatName; + ComdatName.reserve(ComdatNameSize); + for (unsigned i = 0; i != ComdatNameSize; ++i) + ComdatName += (char)Record[2 + i]; + Comdat *C = TheModule->getOrInsertComdat(ComdatName); + C->setSelectionKind(SK); + ComdatList.push_back(C); + break; + } // GLOBALVAR: [pointer type, isconst, initid, // linkage, alignment, section, visibility, threadlocal, // unnamed_addr, dllstorageclass] @@ -1899,6 +1933,12 @@ std::error_code BitcodeReader::ParseModule(bool Resume) { // Remember which value to use for the global initializer. 
if (unsigned InitID = Record[2]) GlobalInits.push_back(std::make_pair(NewGV, InitID-1)); + + if (Record.size() > 11) + if (unsigned ComdatID = Record[11]) { + assert(ComdatID <= ComdatList.size()); + NewGV->setComdat(ComdatList[ComdatID - 1]); + } break; } // FUNCTION: [type, callingconv, isproto, linkage, paramattr, @@ -1952,6 +1992,12 @@ std::error_code BitcodeReader::ParseModule(bool Resume) { else UpgradeDLLImportExportLinkage(Func, Record[3]); + if (Record.size() > 12) + if (unsigned ComdatID = Record[12]) { + assert(ComdatID <= ComdatList.size()); + Func->setComdat(ComdatList[ComdatID - 1]); + } + ValueList.push_back(Func); // If this is a function with a body, remember the prototype we are @@ -2074,12 +2120,13 @@ std::error_code BitcodeReader::ParseBitcodeInto(Module *M) { } } -std::error_code BitcodeReader::ParseModuleTriple(std::string &Triple) { +ErrorOr BitcodeReader::parseModuleTriple() { if (Stream.EnterSubBlock(bitc::MODULE_BLOCK_ID)) return Error(InvalidRecord); SmallVector Record; + std::string Triple; // Read all the records for this module. while (1) { BitstreamEntry Entry = Stream.advanceSkippingSubblocks(); @@ -2089,7 +2136,7 @@ std::error_code BitcodeReader::ParseModuleTriple(std::string &Triple) { case BitstreamEntry::Error: return Error(MalformedBlock); case BitstreamEntry::EndBlock: - return std::error_code(); + return Triple; case BitstreamEntry::Record: // The interesting case. break; @@ -2108,9 +2155,10 @@ std::error_code BitcodeReader::ParseModuleTriple(std::string &Triple) { } Record.clear(); } + llvm_unreachable("Exit infinite loop"); } -std::error_code BitcodeReader::ParseTriple(std::string &Triple) { +ErrorOr BitcodeReader::parseTriple() { if (std::error_code EC = InitStream()) return EC; @@ -2136,7 +2184,7 @@ std::error_code BitcodeReader::ParseTriple(std::string &Triple) { case BitstreamEntry::SubBlock: if (Entry.ID == bitc::MODULE_BLOCK_ID) - return ParseModuleTriple(Triple); + return parseModuleTriple(); // Ignore other sub-blocks. if (Stream.SkipBlock()) @@ -2845,10 +2893,14 @@ std::error_code BitcodeReader::ParseFunctionBody(Function *F) { dyn_cast_or_null(getTypeByID(Record[0])); Type *OpTy = getTypeByID(Record[1]); Value *Size = getFnValueByID(Record[2], OpTy); - unsigned Align = Record[3]; + unsigned AlignRecord = Record[3]; + bool InAlloca = AlignRecord & (1 << 5); + unsigned Align = AlignRecord & ((1 << 5) - 1); if (!Ty || !Size) return Error(InvalidRecord); - I = new AllocaInst(Ty->getElementType(), Size, (1 << Align) >> 1); + AllocaInst *AI = new AllocaInst(Ty->getElementType(), Size, (1 << Align) >> 1); + AI->setUsedWithInAlloca(InAlloca); + I = AI; InstructionList.push_back(I); break; } @@ -3150,6 +3202,7 @@ std::error_code BitcodeReader::FindFunctionInStream( // GVMaterializer implementation //===----------------------------------------------------------------------===// +void BitcodeReader::releaseBuffer() { Buffer.release(); } bool BitcodeReader::isMaterializable(const GlobalValue *GV) const { if (const Function *F = dyn_cast(GV)) { @@ -3374,10 +3427,9 @@ const std::error_category &BitcodeReader::BitcodeErrorCategory() { /// getLazyBitcodeModule - lazy function-at-a-time loading from a file. 
/// ErrorOr llvm::getLazyBitcodeModule(MemoryBuffer *Buffer, - LLVMContext &Context, - bool BufferOwned) { + LLVMContext &Context) { Module *M = new Module(Buffer->getBufferIdentifier(), Context); - BitcodeReader *R = new BitcodeReader(Buffer, Context, BufferOwned); + BitcodeReader *R = new BitcodeReader(Buffer, Context); M->setMaterializer(R); if (std::error_code EC = R->ParseBitcodeInto(M)) { R->releaseBuffer(); // Never take ownership on error. @@ -3409,13 +3461,12 @@ Module *llvm::getStreamedBitcodeModule(const std::string &name, ErrorOr llvm::parseBitcodeFile(MemoryBuffer *Buffer, LLVMContext &Context) { - ErrorOr ModuleOrErr = getLazyBitcodeModule(Buffer, Context, false); + ErrorOr ModuleOrErr = getLazyBitcodeModule(Buffer, Context); if (!ModuleOrErr) return ModuleOrErr; Module *M = ModuleOrErr.get(); - // Read in the entire module, and destroy the BitcodeReader. - if (std::error_code EC = M->materializeAllPermanently()) { + if (std::error_code EC = M->materializeAllPermanently(true)) { delete M; return EC; } @@ -3427,15 +3478,12 @@ ErrorOr llvm::parseBitcodeFile(MemoryBuffer *Buffer, } std::string llvm::getBitcodeTargetTriple(MemoryBuffer *Buffer, - LLVMContext& Context, - std::string *ErrMsg) { - BitcodeReader *R = new BitcodeReader(Buffer, Context, /*BufferOwned*/ false); - - std::string Triple(""); - if (std::error_code EC = R->ParseTriple(Triple)) - if (ErrMsg) - *ErrMsg = EC.message(); - + LLVMContext &Context) { + BitcodeReader *R = new BitcodeReader(Buffer, Context); + ErrorOr Triple = R->parseTriple(); + R->releaseBuffer(); delete R; - return Triple; + if (Triple.getError()) + return ""; + return Triple.get(); } diff --git a/lib/Bitcode/Reader/BitcodeReader.h b/lib/Bitcode/Reader/BitcodeReader.h index 6aa3e0e5adf7..1d4869a2d5a7 100644 --- a/lib/Bitcode/Reader/BitcodeReader.h +++ b/lib/Bitcode/Reader/BitcodeReader.h @@ -26,6 +26,7 @@ #include namespace llvm { + class Comdat; class MemoryBuffer; class LLVMContext; @@ -125,8 +126,7 @@ class BitcodeReaderMDValueList { class BitcodeReader : public GVMaterializer { LLVMContext &Context; Module *TheModule; - MemoryBuffer *Buffer; - bool BufferOwned; + std::unique_ptr Buffer; std::unique_ptr StreamFile; BitstreamCursor Stream; DataStreamer *LazyStreamer; @@ -136,6 +136,7 @@ class BitcodeReader : public GVMaterializer { std::vector TypeList; BitcodeReaderValueList ValueList; BitcodeReaderMDValueList MDValueList; + std::vector ComdatList; SmallVector InstructionList; SmallVector, 64> UseListRecords; @@ -223,25 +224,21 @@ class BitcodeReader : public GVMaterializer { return std::error_code(E, BitcodeErrorCategory()); } - explicit BitcodeReader(MemoryBuffer *buffer, LLVMContext &C, bool BufferOwned) - : Context(C), TheModule(nullptr), Buffer(buffer), - BufferOwned(BufferOwned), LazyStreamer(nullptr), NextUnreadBit(0), - SeenValueSymbolTable(false), ValueList(C), MDValueList(C), - SeenFirstFunctionBody(false), UseRelativeIDs(false) {} + explicit BitcodeReader(MemoryBuffer *buffer, LLVMContext &C) + : Context(C), TheModule(nullptr), Buffer(buffer), LazyStreamer(nullptr), + NextUnreadBit(0), SeenValueSymbolTable(false), ValueList(C), + MDValueList(C), SeenFirstFunctionBody(false), UseRelativeIDs(false) {} explicit BitcodeReader(DataStreamer *streamer, LLVMContext &C) - : Context(C), TheModule(nullptr), Buffer(nullptr), BufferOwned(false), - LazyStreamer(streamer), NextUnreadBit(0), SeenValueSymbolTable(false), - ValueList(C), MDValueList(C), SeenFirstFunctionBody(false), - UseRelativeIDs(false) {} + : Context(C), TheModule(nullptr), 
Buffer(nullptr), LazyStreamer(streamer), + NextUnreadBit(0), SeenValueSymbolTable(false), ValueList(C), + MDValueList(C), SeenFirstFunctionBody(false), UseRelativeIDs(false) {} ~BitcodeReader() { FreeState(); } void materializeForwardReferencedFunctions(); void FreeState(); - void releaseBuffer() { - Buffer = nullptr; - } + void releaseBuffer() override; bool isMaterializable(const GlobalValue *GV) const override; bool isDematerializable(const GlobalValue *GV) const override; @@ -255,7 +252,7 @@ class BitcodeReader : public GVMaterializer { /// @brief Cheap mechanism to just extract module triple /// @returns true if an error occurred. - std::error_code ParseTriple(std::string &Triple); + ErrorOr parseTriple(); static uint64_t decodeSignRotatedValue(uint64_t V); @@ -357,7 +354,7 @@ class BitcodeReader : public GVMaterializer { std::error_code ResolveGlobalAndAliasInits(); std::error_code ParseMetadata(); std::error_code ParseMetadataAttachment(); - std::error_code ParseModuleTriple(std::string &Triple); + ErrorOr parseModuleTriple(); std::error_code ParseUseLists(); std::error_code InitStream(); std::error_code InitStreamFromBuffer(); diff --git a/lib/Bitcode/Writer/BitcodeWriter.cpp b/lib/Bitcode/Writer/BitcodeWriter.cpp index 3ba7358ae5b9..b2e49486d70f 100644 --- a/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -201,6 +201,8 @@ static uint64_t getAttrKindEncoding(Attribute::AttrKind Kind) { return bitc::ATTR_KIND_NON_LAZY_BIND; case Attribute::NonNull: return bitc::ATTR_KIND_NON_NULL; + case Attribute::Dereferenceable: + return bitc::ATTR_KIND_DEREFERENCEABLE; case Attribute::NoRedZone: return bitc::ATTR_KIND_NO_RED_ZONE; case Attribute::NoReturn: @@ -272,7 +274,7 @@ static void WriteAttributeGroupTable(const ValueEnumerator &VE, if (Attr.isEnumAttribute()) { Record.push_back(0); Record.push_back(getAttrKindEncoding(Attr.getKindAsEnum())); - } else if (Attr.isAlignAttribute()) { + } else if (Attr.isIntAttribute()) { Record.push_back(1); Record.push_back(getAttrKindEncoding(Attr.getKindAsEnum())); Record.push_back(Attr.getValueAsInt()); @@ -524,6 +526,35 @@ static unsigned getEncodedThreadLocalMode(const GlobalValue &GV) { llvm_unreachable("Invalid TLS model"); } +static unsigned getEncodedComdatSelectionKind(const Comdat &C) { + switch (C.getSelectionKind()) { + case Comdat::Any: + return bitc::COMDAT_SELECTION_KIND_ANY; + case Comdat::ExactMatch: + return bitc::COMDAT_SELECTION_KIND_EXACT_MATCH; + case Comdat::Largest: + return bitc::COMDAT_SELECTION_KIND_LARGEST; + case Comdat::NoDuplicates: + return bitc::COMDAT_SELECTION_KIND_NO_DUPLICATES; + case Comdat::SameSize: + return bitc::COMDAT_SELECTION_KIND_SAME_SIZE; + } + llvm_unreachable("Invalid selection kind"); +} + +static void writeComdats(const ValueEnumerator &VE, BitstreamWriter &Stream) { + SmallVector Vals; + for (const Comdat *C : VE.getComdats()) { + // COMDAT: [selection_kind, name] + Vals.push_back(getEncodedComdatSelectionKind(*C)); + Vals.push_back(C->getName().size()); + for (char Chr : C->getName()) + Vals.push_back((unsigned char)Chr); + Stream.EmitRecord(bitc::MODULE_CODE_COMDAT, Vals, /*AbbrevToUse=*/0); + Vals.clear(); + } +} + // Emit top-level description of module, including target triple, inline asm, // descriptors for global variables, and function prototype info. 
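
For reference, writeComdats above produces records of the form [selection_kind, name-size, name bytes...]; a matching decoder, sketched standalone (names are mine, not LLVM API):

    #include <cstdint>
    #include <string>
    #include <utility>
    #include <vector>

    // Inverse of the writer loop above: recover the selection kind and the
    // comdat name from one MODULE_CODE_COMDAT record.
    std::pair<unsigned, std::string>
    decodeComdatRecord(const std::vector<uint64_t> &Record) {
      unsigned SelectionKind = static_cast<unsigned>(Record[0]);
      uint64_t NameSize = Record[1];
      std::string Name;
      for (uint64_t I = 0; I != NameSize; ++I)
        Name += static_cast<char>(Record[2 + I]);
      return std::make_pair(SelectionKind, Name);
    }
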
static void WriteModuleInfo(const Module *M, const ValueEnumerator &VE, @@ -625,12 +656,14 @@ static void WriteModuleInfo(const Module *M, const ValueEnumerator &VE, if (GV.isThreadLocal() || GV.getVisibility() != GlobalValue::DefaultVisibility || GV.hasUnnamedAddr() || GV.isExternallyInitialized() || - GV.getDLLStorageClass() != GlobalValue::DefaultStorageClass) { + GV.getDLLStorageClass() != GlobalValue::DefaultStorageClass || + GV.hasComdat()) { Vals.push_back(getEncodedVisibility(GV)); Vals.push_back(getEncodedThreadLocalMode(GV)); Vals.push_back(GV.hasUnnamedAddr()); Vals.push_back(GV.isExternallyInitialized()); Vals.push_back(getEncodedDLLStorageClass(GV)); + Vals.push_back(GV.hasComdat() ? VE.getComdatID(GV.getComdat()) : 0); } else { AbbrevToUse = SimpleGVarAbbrev; } @@ -656,6 +689,7 @@ static void WriteModuleInfo(const Module *M, const ValueEnumerator &VE, Vals.push_back(F.hasPrefixData() ? (VE.getValueID(F.getPrefixData()) + 1) : 0); Vals.push_back(getEncodedDLLStorageClass(F)); + Vals.push_back(F.hasComdat() ? VE.getComdatID(F.getComdat()) : 0); unsigned AbbrevToUse = 0; Stream.EmitRecord(bitc::MODULE_CODE_FUNCTION, Vals, AbbrevToUse); @@ -1399,13 +1433,20 @@ static void WriteInstruction(const Instruction &I, unsigned InstID, break; } - case Instruction::Alloca: + case Instruction::Alloca: { Code = bitc::FUNC_CODE_INST_ALLOCA; Vals.push_back(VE.getTypeID(I.getType())); Vals.push_back(VE.getTypeID(I.getOperand(0)->getType())); Vals.push_back(VE.getValueID(I.getOperand(0))); // size. - Vals.push_back(Log2_32(cast(I).getAlignment())+1); + const AllocaInst &AI = cast(I); + unsigned AlignRecord = Log2_32(AI.getAlignment()) + 1; + assert(Log2_32(Value::MaximumAlignment) + 1 < 1 << 5 && + "not enough bits for maximum alignment"); + assert(AlignRecord < 1 << 5 && "alignment greater than 1 << 64"); + AlignRecord |= AI.isUsedWithInAlloca() << 5; + Vals.push_back(AlignRecord); break; + } case Instruction::Load: if (cast(I).isAtomic()) { @@ -1915,6 +1956,8 @@ static void WriteModule(const Module *M, BitstreamWriter &Stream) { // Emit information describing all of the types in the module. WriteTypeTable(VE, Stream); + writeComdats(VE, Stream); + // Emit top-level description of module, including target triple, inline asm, // descriptors for global variables, and function prototype info. WriteModuleInfo(M, VE, Stream); diff --git a/lib/Bitcode/Writer/ValueEnumerator.cpp b/lib/Bitcode/Writer/ValueEnumerator.cpp index befe15bb4587..15f8034a3691 100644 --- a/lib/Bitcode/Writer/ValueEnumerator.cpp +++ b/lib/Bitcode/Writer/ValueEnumerator.cpp @@ -117,6 +117,12 @@ unsigned ValueEnumerator::getInstructionID(const Instruction *Inst) const { return I->second; } +unsigned ValueEnumerator::getComdatID(const Comdat *C) const { + unsigned ComdatID = Comdats.idFor(C); + assert(ComdatID && "Comdat not found!"); + return ComdatID; +} + void ValueEnumerator::setInstructionID(const Instruction *I) { InstructionMap[I] = InstructionCount++; } @@ -307,6 +313,10 @@ void ValueEnumerator::EnumerateValue(const Value *V) { return; } + if (auto *GO = dyn_cast(V)) + if (const Comdat *C = GO->getComdat()) + Comdats.insert(C); + // Enumerate the type of this value. 
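
The alloca record packing above (writer side) and its reader counterpart earlier can be summarized by this standalone round-trip sketch (helper names are mine):

    #include <cassert>
    #include <cstdint>

    // Bits 0-4 hold Log2(alignment)+1 (0 encodes "no alignment");
    // bit 5 holds the inalloca flag.
    uint64_t encodeAllocaRecord(unsigned Alignment, bool InAlloca) {
      unsigned Log2Align = 0;
      for (unsigned A = Alignment; A > 1; A >>= 1) // Log2_32, spelled out
        ++Log2Align;
      uint64_t Record = Alignment ? Log2Align + 1 : 0;
      assert(Record < (1 << 5) && "alignment field overflows 5 bits");
      Record |= uint64_t(InAlloca) << 5;
      return Record;
    }

    void decodeAllocaRecord(uint64_t Record, unsigned &Alignment,
                            bool &InAlloca) {
      InAlloca = (Record & (1 << 5)) != 0;
      unsigned AlignField = Record & ((1 << 5) - 1);
      Alignment = (1u << AlignField) >> 1; // inverse of Log2(align)+1
    }

Note that the decode expression (1 << Align) >> 1 maps field value 0 back to alignment 0, which is why the +1 bias is used when encoding.
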
EnumerateType(V->getType()); diff --git a/lib/Bitcode/Writer/ValueEnumerator.h b/lib/Bitcode/Writer/ValueEnumerator.h index d1ca15f45d02..1c9f38e07b44 100644 --- a/lib/Bitcode/Writer/ValueEnumerator.h +++ b/lib/Bitcode/Writer/ValueEnumerator.h @@ -16,6 +16,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/UniqueVector.h" #include "llvm/IR/Attributes.h" #include @@ -25,6 +26,7 @@ class Type; class Value; class Instruction; class BasicBlock; +class Comdat; class Function; class Module; class MDNode; @@ -48,6 +50,10 @@ class ValueEnumerator { typedef DenseMap ValueMapType; ValueMapType ValueMap; ValueList Values; + + typedef UniqueVector ComdatSetType; + ComdatSetType Comdats; + ValueList MDValues; SmallVector FunctionLocalMDs; ValueMapType MDValueMap; @@ -139,6 +145,9 @@ class ValueEnumerator { return AttributeGroups; } + const ComdatSetType &getComdats() const { return Comdats; } + unsigned getComdatID(const Comdat *C) const; + /// getGlobalBasicBlockID - This returns the function-specific ID for the /// specified basic block. This is relatively expensive information, so it /// should only be used by rare constructs such as address-of-label. diff --git a/lib/CodeGen/Analysis.cpp b/lib/CodeGen/Analysis.cpp index a3e65284e803..0eabee30e964 100644 --- a/lib/CodeGen/Analysis.cpp +++ b/lib/CodeGen/Analysis.cpp @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/Analysis.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/IR/DataLayout.h" @@ -475,7 +475,7 @@ static bool nextRealType(SmallVectorImpl &SubTypes, /// between it and the return. /// /// This function only tests target-independent requirements. -bool llvm::isInTailCallPosition(ImmutableCallSite CS, const SelectionDAG &DAG) { +bool llvm::isInTailCallPosition(ImmutableCallSite CS, const TargetMachine &TM) { const Instruction *I = CS.getInstruction(); const BasicBlock *ExitBB = I->getParent(); const TerminatorInst *Term = ExitBB->getTerminator(); @@ -490,8 +490,7 @@ bool llvm::isInTailCallPosition(ImmutableCallSite CS, const SelectionDAG &DAG) { // longjmp on x86), it can end up causing miscompilation that has not // been fully understood. 
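
The UniqueVector used for Comdats above is what makes the 0-as-no-comdat encoding work: it hands out IDs starting at 1 and reports 0 for unknown entries. A small illustration:

    #include "llvm/ADT/UniqueVector.h"
    #include <cassert>
    #include <string>

    void uniqueVectorIdsAreOneBased() {
      llvm::UniqueVector<std::string> V;
      unsigned A = V.insert("a"); // first entry gets ID 1
      unsigned B = V.insert("b"); // second gets ID 2
      assert(A == 1 && B == 2);
      assert(V.insert("a") == A); // re-inserting returns the existing ID
      assert(V.idFor("c") == 0);  // 0 means "not present"
      assert(V[A] == "a");        // operator[] is likewise 1-based
    }
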
if (!Ret && - (!DAG.getTarget().Options.GuaranteedTailCallOpt || - !isa(Term))) + (!TM.Options.GuaranteedTailCallOpt || !isa(Term))) return false; // If I will have a chain, make sure no other instruction that will have a @@ -510,7 +509,7 @@ bool llvm::isInTailCallPosition(ImmutableCallSite CS, const SelectionDAG &DAG) { } return returnTypeIsEligibleForTailCall(ExitBB->getParent(), I, Ret, - *DAG.getTarget().getTargetLowering()); + *TM.getTargetLowering()); } bool llvm::returnTypeIsEligibleForTailCall(const Function *F, diff --git a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index 996dc2122f49..424e759caa8c 100644 --- a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -47,7 +47,6 @@ #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetLoweringObjectFile.h" -#include "llvm/Target/TargetOptions.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" #include "llvm/Transforms/Utils/GlobalStatus.h" @@ -244,7 +243,7 @@ bool AsmPrinter::doInitialization(Module &M) { case ExceptionHandling::ARM: ES = new ARMException(this); break; - case ExceptionHandling::Win64: + case ExceptionHandling::WinEH: ES = new Win64Exception(this); break; } @@ -710,13 +709,12 @@ AsmPrinter::CFIMoveType AsmPrinter::needsCFIMoves() { } bool AsmPrinter::needsSEHMoves() { - return MAI->getExceptionHandlingType() == ExceptionHandling::Win64 && + return MAI->getExceptionHandlingType() == ExceptionHandling::WinEH && MF->getFunction()->needsUnwindTableEntry(); } void AsmPrinter::emitCFIInstruction(const MachineInstr &MI) { - ExceptionHandling::ExceptionsType ExceptionHandlingType = - MAI->getExceptionHandlingType(); + ExceptionHandling ExceptionHandlingType = MAI->getExceptionHandlingType(); if (ExceptionHandlingType != ExceptionHandling::DwarfCFI && ExceptionHandlingType != ExceptionHandling::ARM) return; @@ -1064,23 +1062,13 @@ void AsmPrinter::EmitConstantPool() { const MachineConstantPoolEntry &CPE = CP[i]; unsigned Align = CPE.getAlignment(); - SectionKind Kind; - switch (CPE.getRelocationInfo()) { - default: llvm_unreachable("Unknown section kind"); - case 2: Kind = SectionKind::getReadOnlyWithRel(); break; - case 1: - Kind = SectionKind::getReadOnlyWithRelLocal(); - break; - case 0: - switch (TM.getDataLayout()->getTypeAllocSize(CPE.getType())) { - case 4: Kind = SectionKind::getMergeableConst4(); break; - case 8: Kind = SectionKind::getMergeableConst8(); break; - case 16: Kind = SectionKind::getMergeableConst16();break; - default: Kind = SectionKind::getMergeableConst(); break; - } - } + SectionKind Kind = CPE.getSectionKind(TM.getDataLayout()); + + const Constant *C = nullptr; + if (!CPE.isMachineConstantPoolEntry()) + C = CPE.Val.ConstVal; - const MCSection *S = getObjFileLowering().getSectionForConstant(Kind); + const MCSection *S = getObjFileLowering().getSectionForConstant(Kind, C); // The number of sections are small, just do a linear search from the // last section to the first. @@ -1103,13 +1091,22 @@ void AsmPrinter::EmitConstantPool() { } // Now print stuff into the calculated sections. 
+ const MCSection *CurSection = nullptr; + unsigned Offset = 0; for (unsigned i = 0, e = CPSections.size(); i != e; ++i) { - OutStreamer.SwitchSection(CPSections[i].S); - EmitAlignment(Log2_32(CPSections[i].Alignment)); - - unsigned Offset = 0; for (unsigned j = 0, ee = CPSections[i].CPEs.size(); j != ee; ++j) { unsigned CPI = CPSections[i].CPEs[j]; + MCSymbol *Sym = GetCPISymbol(CPI); + if (!Sym->isUndefined()) + continue; + + if (CurSection != CPSections[i].S) { + OutStreamer.SwitchSection(CPSections[i].S); + EmitAlignment(Log2_32(CPSections[i].Alignment)); + CurSection = CPSections[i].S; + Offset = 0; + } + MachineConstantPoolEntry CPE = CP[CPI]; // Emit inter-object padding for alignment. @@ -1119,8 +1116,8 @@ void AsmPrinter::EmitConstantPool() { Type *Ty = CPE.getType(); Offset = NewOffset + TM.getDataLayout()->getTypeAllocSize(Ty); - OutStreamer.EmitLabel(GetCPISymbol(CPI)); + OutStreamer.EmitLabel(Sym); if (CPE.isMachineConstantPoolEntry()) EmitMachineConstantPoolValue(CPE.Val.MachineCPVal); else @@ -1158,7 +1155,8 @@ void AsmPrinter::EmitJumpTableInfo() { } else { // Otherwise, drop it in the readonly section. const MCSection *ReadOnlySection = - getObjFileLowering().getSectionForConstant(SectionKind::getReadOnly()); + getObjFileLowering().getSectionForConstant(SectionKind::getReadOnly(), + /*C=*/nullptr); OutStreamer.SwitchSection(ReadOnlySection); JTInDiffSection = true; } @@ -1867,8 +1865,10 @@ static void emitGlobalConstantFP(const ConstantFP *CFP, AsmPrinter &AP) { SmallString<8> StrVal; CFP->getValueAPF().toString(StrVal); - assert(CFP->getType() != nullptr && "Expecting non-null Type"); - CFP->getType()->print(AP.OutStreamer.GetCommentOS()); + if (CFP->getType()) + CFP->getType()->print(AP.OutStreamer.GetCommentOS()); + else + AP.OutStreamer.GetCommentOS() << "Printing Type"; AP.OutStreamer.GetCommentOS() << ' ' << StrVal << '\n'; } @@ -1881,7 +1881,8 @@ static void emitGlobalConstantFP(const ConstantFP *CFP, AsmPrinter &AP) { // PPC's long double has odd notions of endianness compared to how LLVM // handles it: p[0] goes first for *big* endian on PPC. - if (AP.TM.getDataLayout()->isBigEndian() != CFP->getType()->isPPC_FP128Ty()) { + if (AP.TM.getDataLayout()->isBigEndian() && + !CFP->getType()->isPPC_FP128Ty()) { int Chunk = API.getNumWords() - 1; if (TrailingBytes) diff --git a/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp b/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp index e2d95272c2c5..74215aa695dd 100644 --- a/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp @@ -62,22 +62,12 @@ void DwarfCFIException::endModule() { return; // Emit references to all used personality functions - bool AtLeastOne = false; const std::vector &Personalities = MMI->getPersonalities(); for (size_t i = 0, e = Personalities.size(); i != e; ++i) { if (!Personalities[i]) continue; MCSymbol *Sym = Asm->getSymbol(Personalities[i]); TLOF.emitPersonalityValue(Asm->OutStreamer, Asm->TM, Sym); - AtLeastOne = true; - } - - if (AtLeastOne && !TLOF.isFunctionEHFrameSymbolPrivate()) { - // This is a temporary hack to keep sections in the same order they - // were before. This lets us produce bit identical outputs while - // transitioning to CFI. 
- Asm->OutStreamer.SwitchSection( - const_cast(TLOF).getEHFrameSection()); } } @@ -122,9 +112,17 @@ void DwarfCFIException::beginFunction(const MachineFunction *MF) { TLOF.getCFIPersonalitySymbol(Per, *Asm->Mang, Asm->TM, MMI); Asm->OutStreamer.EmitCFIPersonality(Sym, PerEncoding); - Asm->OutStreamer.EmitDebugLabel - (Asm->GetTempSymbol("eh_func_begin", - Asm->getFunctionNumber())); + MCSymbol *EHBegin = + Asm->GetTempSymbol("eh_func_begin", Asm->getFunctionNumber()); + if (Asm->MAI->useAssignmentForEHBegin()) { + MCContext &Ctx = Asm->OutContext; + MCSymbol *CurPos = Ctx.CreateTempSymbol(); + Asm->OutStreamer.EmitLabel(CurPos); + Asm->OutStreamer.EmitAssignment(EHBegin, + MCSymbolRefExpr::Create(CurPos, Ctx)); + } else { + Asm->OutStreamer.EmitLabel(EHBegin); + } // Provide LSDA information. if (!shouldEmitLSDA) diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index a88aebd2d22f..ac1c0ffb3cc0 100644 --- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -36,6 +36,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/Dwarf.h" +#include "llvm/Support/Endian.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FormattedStream.h" #include "llvm/Support/LEB128.h" @@ -98,10 +99,6 @@ DwarfPubSections("generate-dwarf-pub-sections", cl::Hidden, clEnumVal(Disable, "Disabled"), clEnumValEnd), cl::init(Default)); -static cl::opt -DwarfVersionNumber("dwarf-version", cl::Hidden, - cl::desc("Generate DWARF for dwarf version."), cl::init(0)); - static const char *const DWARFGroupName = "DWARF Emission"; static const char *const DbgTimerName = "DWARF Debug Writer"; @@ -209,9 +206,12 @@ DwarfDebug::DwarfDebug(AsmPrinter *A, Module *M) else HasDwarfPubSections = DwarfPubSections == Enable; + unsigned DwarfVersionNumber = Asm->TM.Options.MCOptions.DwarfVersion; DwarfVersion = DwarfVersionNumber ? DwarfVersionNumber : MMI->getModule()->getDwarfVersion(); + Asm->OutStreamer.getContext().setDwarfVersion(DwarfVersion); + { NamedRegionTimer T(DbgTimerName, DWARFGroupName, TimePassesIsEnabled); beginModule(); @@ -1039,9 +1039,9 @@ void DwarfDebug::endModule() { emitDebugInfoDWO(); emitDebugAbbrevDWO(); emitDebugLineDWO(); + emitDebugLocDWO(); // Emit DWO addresses. AddrPool.emit(*Asm, Asm->getObjFileLowering().getDwarfAddrSection()); - emitDebugLocDWO(); } else // Emit info into a debug loc section. emitDebugLoc(); @@ -1555,8 +1555,7 @@ void DwarfDebug::endFunction(const MachineFunction *MF) { // Construct abstract scopes. for (LexicalScope *AScope : LScopes.getAbstractScopesList()) { DISubprogram SP(AScope->getScopeNode()); - if (!SP.isSubprogram()) - continue; + assert(SP.isSubprogram()); // Collect info for variables that were optimized out. 
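
The eh_func_begin change above swaps a plain label for an assignment to a fresh temporary when the target's MCAsmInfo asks for it (useAssignmentForEHBegin). Isolated, the emission logic looks like this (the function name is mine; the MC calls are the same ones used above):

    #include "llvm/MC/MCContext.h"
    #include "llvm/MC/MCExpr.h"
    #include "llvm/MC/MCStreamer.h"

    // Define EHBegin either directly, or as "EHBegin = ." spelled via a
    // temporary label, mirroring the branch above.
    void defineEHBegin(llvm::MCStreamer &OS, llvm::MCContext &Ctx,
                       llvm::MCSymbol *EHBegin, bool UseAssignment) {
      if (UseAssignment) {
        llvm::MCSymbol *CurPos = Ctx.CreateTempSymbol();
        OS.EmitLabel(CurPos);
        OS.EmitAssignment(EHBegin,
                          llvm::MCSymbolRefExpr::Create(CurPos, Ctx));
      } else {
        OS.EmitLabel(EHBegin);
      }
    }
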
DIArray Variables = SP.getVariables(); for (unsigned i = 0, e = Variables.getNumElements(); i != e; ++i) { diff --git a/lib/CodeGen/AsmPrinter/Win64Exception.cpp b/lib/CodeGen/AsmPrinter/Win64Exception.cpp index 4768a43e9a6f..81285d55d636 100644 --- a/lib/CodeGen/AsmPrinter/Win64Exception.cpp +++ b/lib/CodeGen/AsmPrinter/Win64Exception.cpp @@ -72,14 +72,14 @@ void Win64Exception::beginFunction(const MachineFunction *MF) { if (!shouldEmitPersonality && !shouldEmitMoves) return; - Asm->OutStreamer.EmitWin64EHStartProc(Asm->CurrentFnSym); + Asm->OutStreamer.EmitWinCFIStartProc(Asm->CurrentFnSym); if (!shouldEmitPersonality) return; - MCSymbol *GCCHandlerSym = - Asm->GetExternalSymbolSymbol("_GCC_specific_handler"); - Asm->OutStreamer.EmitWin64EHHandler(GCCHandlerSym, true, true); + const MCSymbol *PersHandlerSym = + TLOF.getCFIPersonalitySymbol(Per, *Asm->Mang, Asm->TM, MMI); + Asm->OutStreamer.EmitWinEHHandler(PersHandlerSym, true, true); Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("eh_func_begin", Asm->getFunctionNumber())); @@ -98,17 +98,10 @@ void Win64Exception::endFunction(const MachineFunction *) { MMI->TidyLandingPads(); if (shouldEmitPersonality) { - const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering(); - const Function *Per = MMI->getPersonalities()[MMI->getPersonalityIndex()]; - const MCSymbol *Sym = - TLOF.getCFIPersonalitySymbol(Per, *Asm->Mang, Asm->TM, MMI); - Asm->OutStreamer.PushSection(); - Asm->OutStreamer.EmitWin64EHHandlerData(); - Asm->OutStreamer.EmitValue(MCSymbolRefExpr::Create(Sym, Asm->OutContext), - 4); + Asm->OutStreamer.EmitWinEHHandlerData(); emitExceptionTable(); Asm->OutStreamer.PopSection(); } - Asm->OutStreamer.EmitWin64EHEndProc(); + Asm->OutStreamer.EmitWinCFIEndProc(); } diff --git a/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.cpp b/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.cpp index 2212941861b8..6a5c431d4fd1 100644 --- a/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.cpp +++ b/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.cpp @@ -308,7 +308,7 @@ void WinCodeViewLineTables::endFunction(const MachineFunction *MF) { return; const Function *GV = MF->getFunction(); - assert(FnDebugInfo.count(GV) == true); + assert(FnDebugInfo.count(GV)); assert(CurFn == &FnDebugInfo[GV]); if (CurFn->Instrs.empty()) { diff --git a/lib/CodeGen/AtomicExpandLoadLinkedPass.cpp b/lib/CodeGen/AtomicExpandLoadLinkedPass.cpp index 4c4150bfec90..421946ded40b 100644 --- a/lib/CodeGen/AtomicExpandLoadLinkedPass.cpp +++ b/lib/CodeGen/AtomicExpandLoadLinkedPass.cpp @@ -21,17 +21,19 @@ #include "llvm/Support/Debug.h" #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetSubtargetInfo.h" + using namespace llvm; #define DEBUG_TYPE "arm-atomic-expand" namespace { class AtomicExpandLoadLinked : public FunctionPass { - const TargetLowering *TLI; + const TargetMachine *TM; public: static char ID; // Pass identification, replacement for typeid explicit AtomicExpandLoadLinked(const TargetMachine *TM = nullptr) - : FunctionPass(ID), TLI(TM ? 
TM->getTargetLowering() : nullptr) { + : FunctionPass(ID), TM(TM) { initializeAtomicExpandLoadLinkedPass(*PassRegistry::getPassRegistry()); } @@ -59,7 +61,7 @@ FunctionPass *llvm::createAtomicExpandLoadLinkedPass(const TargetMachine *TM) { } bool AtomicExpandLoadLinked::runOnFunction(Function &F) { - if (!TLI) + if (!TM || !TM->getSubtargetImpl()->enableAtomicExpandLoadLinked()) return false; SmallVector AtomicInsts; @@ -76,7 +78,7 @@ bool AtomicExpandLoadLinked::runOnFunction(Function &F) { bool MadeChange = false; for (Instruction *Inst : AtomicInsts) { - if (!TLI->shouldExpandAtomicInIR(Inst)) + if (!TM->getTargetLowering()->shouldExpandAtomicInIR(Inst)) continue; if (AtomicRMWInst *AI = dyn_cast(Inst)) @@ -98,13 +100,14 @@ bool AtomicExpandLoadLinked::expandAtomicLoad(LoadInst *LI) { // Load instructions don't actually need a leading fence, even in the // SequentiallyConsistent case. AtomicOrdering MemOpOrder = - TLI->getInsertFencesForAtomic() ? Monotonic : LI->getOrdering(); + TM->getTargetLowering()->getInsertFencesForAtomic() ? Monotonic + : LI->getOrdering(); // The only 64-bit load guaranteed to be single-copy atomic by the ARM ARM is // an ldrexd (A3.5.3). IRBuilder<> Builder(LI); - Value *Val = - TLI->emitLoadLinked(Builder, LI->getPointerOperand(), MemOpOrder); + Value *Val = TM->getTargetLowering()->emitLoadLinked( + Builder, LI->getPointerOperand(), MemOpOrder); insertTrailingFence(Builder, LI->getOrdering()); @@ -165,7 +168,8 @@ bool AtomicExpandLoadLinked::expandAtomicRMW(AtomicRMWInst *AI) { // Start the main loop block now that we've taken care of the preliminaries. Builder.SetInsertPoint(LoopBB); - Value *Loaded = TLI->emitLoadLinked(Builder, Addr, MemOpOrder); + Value *Loaded = + TM->getTargetLowering()->emitLoadLinked(Builder, Addr, MemOpOrder); Value *NewVal; switch (AI->getOperation()) { @@ -182,7 +186,7 @@ bool AtomicExpandLoadLinked::expandAtomicRMW(AtomicRMWInst *AI) { NewVal = Builder.CreateAnd(Loaded, AI->getValOperand(), "new"); break; case AtomicRMWInst::Nand: - NewVal = Builder.CreateAnd(Loaded, Builder.CreateNot(AI->getValOperand()), + NewVal = Builder.CreateNot(Builder.CreateAnd(Loaded, AI->getValOperand()), "new"); break; case AtomicRMWInst::Or: @@ -211,8 +215,8 @@ bool AtomicExpandLoadLinked::expandAtomicRMW(AtomicRMWInst *AI) { llvm_unreachable("Unknown atomic op"); } - Value *StoreSuccess = - TLI->emitStoreConditional(Builder, NewVal, Addr, MemOpOrder); + Value *StoreSuccess = TM->getTargetLowering()->emitStoreConditional( + Builder, NewVal, Addr, MemOpOrder); Value *TryAgain = Builder.CreateICmpNE( StoreSuccess, ConstantInt::get(IntegerType::get(Ctx, 32), 0), "tryagain"); Builder.CreateCondBr(TryAgain, LoopBB, ExitBB); @@ -278,7 +282,8 @@ bool AtomicExpandLoadLinked::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) { // Start the main loop block now that we've taken care of the preliminaries. 
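
The Nand change above is a semantics fix, not a refactor: atomicrmw nand is defined in the LangRef as ~(old & val), while the removed code computed old & ~val. A scalar illustration of the difference:

    #include <cassert>
    #include <cstdint>

    uint32_t nandOldExpansion(uint32_t Loaded, uint32_t Val) {
      return Loaded & ~Val; // what the removed code computed
    }

    uint32_t nandPerLangRef(uint32_t Loaded, uint32_t Val) {
      return ~(Loaded & Val); // what atomicrmw nand actually means
    }

    void nandExample() {
      assert(nandOldExpansion(0xC, 0xA) == 0x4);
      assert(nandPerLangRef(0xC, 0xA) == ~uint32_t(0x8));
      // 0x4 != 0xFFFFFFF7: the old expansion produced the wrong value.
    }
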
Builder.SetInsertPoint(LoopBB); - Value *Loaded = TLI->emitLoadLinked(Builder, Addr, MemOpOrder); + Value *Loaded = + TM->getTargetLowering()->emitLoadLinked(Builder, Addr, MemOpOrder); Value *ShouldStore = Builder.CreateICmpEQ(Loaded, CI->getCompareOperand(), "should_store"); @@ -287,7 +292,7 @@ bool AtomicExpandLoadLinked::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) { Builder.CreateCondBr(ShouldStore, TryStoreBB, FailureBB); Builder.SetInsertPoint(TryStoreBB); - Value *StoreSuccess = TLI->emitStoreConditional( + Value *StoreSuccess = TM->getTargetLowering()->emitStoreConditional( Builder, CI->getNewValOperand(), Addr, MemOpOrder); StoreSuccess = Builder.CreateICmpEQ( StoreSuccess, ConstantInt::get(Type::getInt32Ty(Ctx), 0), "success"); @@ -352,7 +357,7 @@ bool AtomicExpandLoadLinked::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) { AtomicOrdering AtomicExpandLoadLinked::insertLeadingFence(IRBuilder<> &Builder, AtomicOrdering Ord) { - if (!TLI->getInsertFencesForAtomic()) + if (!TM->getTargetLowering()->getInsertFencesForAtomic()) return Ord; if (Ord == Release || Ord == AcquireRelease || Ord == SequentiallyConsistent) @@ -365,7 +370,7 @@ AtomicOrdering AtomicExpandLoadLinked::insertLeadingFence(IRBuilder<> &Builder, void AtomicExpandLoadLinked::insertTrailingFence(IRBuilder<> &Builder, AtomicOrdering Ord) { - if (!TLI->getInsertFencesForAtomic()) + if (!TM->getTargetLowering()->getInsertFencesForAtomic()) return; if (Ord == Acquire || Ord == AcquireRelease) diff --git a/lib/CodeGen/BasicTargetTransformInfo.cpp b/lib/CodeGen/BasicTargetTransformInfo.cpp index 7f31b1a982fc..b2737bf754f9 100644 --- a/lib/CodeGen/BasicTargetTransformInfo.cpp +++ b/lib/CodeGen/BasicTargetTransformInfo.cpp @@ -39,6 +39,9 @@ class BasicTTI final : public ImmutablePass, public TargetTransformInfo { /// are set if the result needs to be inserted and/or extracted from vectors. unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const; + /// Estimate the cost overhead of SK_Alternate shuffle. + unsigned getAltShuffleOverhead(Type *Ty) const; + const TargetLoweringBase *getTLI() const { return TM->getTargetLowering(); } public: @@ -327,8 +330,28 @@ unsigned BasicTTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty, return OpCost; } +unsigned BasicTTI::getAltShuffleOverhead(Type *Ty) const { + assert(Ty->isVectorTy() && "Can only shuffle vectors"); + unsigned Cost = 0; + // Shuffle cost is equal to the cost of extracting element from its argument + // plus the cost of inserting them onto the result vector. + + // e.g. <4 x float> has a mask of <0,5,2,7> i.e we need to extract from index + // 0 of first vector, index 1 of second vector,index 2 of first vector and + // finally index 3 of second vector and insert them at index <0,1,2,3> of + // result vector. 
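
For concreteness, the costing described in the comment above amounts to the following standalone computation (the per-element costs are hypothetical placeholders for getVectorInstrCost):

    #include <cassert>

    unsigned altShuffleCost(unsigned NumElts, unsigned ExtractCost,
                            unsigned InsertCost) {
      unsigned Cost = 0;
      for (unsigned I = 0; I != NumElts; ++I)
        Cost += ExtractCost + InsertCost; // one extract + one insert per lane
      return Cost;
    }

    void altShuffleCostExample() {
      // <4 x float> with mask <0,5,2,7>: 4 extracts plus 4 inserts.
      assert(altShuffleCost(4, 1, 1) == 8);
    }
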
+  for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {
+    Cost += TopTTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
+    Cost += TopTTI->getVectorInstrCost(Instruction::ExtractElement, Ty, i);
+  }
+  return Cost;
+}
+
 unsigned BasicTTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
                                   Type *SubTp) const {
+  if (Kind == SK_Alternate) {
+    return getAltShuffleOverhead(Tp);
+  }
   return 1;
 }
diff --git a/lib/CodeGen/CMakeLists.txt b/lib/CodeGen/CMakeLists.txt
index 57c24e823c1d..b71b30cc1cca 100644
--- a/lib/CodeGen/CMakeLists.txt
+++ b/lib/CodeGen/CMakeLists.txt
@@ -51,6 +51,7 @@ add_llvm_library(LLVMCodeGen
   MachineCodeEmitter.cpp
   MachineCopyPropagation.cpp
   MachineDominators.cpp
+  MachineDominanceFrontier.cpp
   MachineFunction.cpp
   MachineFunctionAnalysis.cpp
   MachineFunctionPass.cpp
@@ -64,6 +65,7 @@ add_llvm_library(LLVMCodeGen
   MachinePassRegistry.cpp
   MachinePostDominators.cpp
   MachineRegisterInfo.cpp
+  MachineRegionInfo.cpp
   MachineSSAUpdater.cpp
   MachineScheduler.cpp
   MachineSink.cpp
diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp
index ccac40c66961..d5039b2e8517 100644
--- a/lib/CodeGen/CodeGenPrepare.cpp
+++ b/lib/CodeGen/CodeGenPrepare.cpp
@@ -2036,7 +2036,8 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode,
   case Instruction::Shl: {
     // Can only handle X*C and X << C.
     ConstantInt *RHS = dyn_cast<ConstantInt>(AddrInst->getOperand(1));
-    if (!RHS) return false;
+    if (!RHS)
+      return false;
     int64_t Scale = RHS->getSExtValue();
     if (Opcode == Instruction::Shl)
       Scale = 1LL << Scale;
@@ -2130,8 +2131,11 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode,
     return true;
   }
   case Instruction::SExt: {
+    Instruction *SExt = dyn_cast<Instruction>(AddrInst);
+    if (!SExt)
+      return false;
+
     // Try to move this sext out of the way of the addressing mode.
-    Instruction *SExt = cast<Instruction>(AddrInst);
     // Ask for a method for doing so.
     TypePromotionHelper::Action TPH = TypePromotionHelper::getAction(
         SExt, InsertedTruncs, TLI, PromotedInsts);
diff --git a/lib/CodeGen/CriticalAntiDepBreaker.cpp b/lib/CodeGen/CriticalAntiDepBreaker.cpp
index 822636fcf133..d3ffcc78471b 100644
--- a/lib/CodeGen/CriticalAntiDepBreaker.cpp
+++ b/lib/CodeGen/CriticalAntiDepBreaker.cpp
@@ -146,8 +146,8 @@ static const SDep *CriticalPathStep(const SUnit *SU) {
 void CriticalAntiDepBreaker::PrescanInstruction(MachineInstr *MI) {
   // It's not safe to change register allocation for source operands of
-  // that have special allocation requirements. Also assume all registers
-  // used in a call must not be changed (ABI).
+  // instructions that have special allocation requirements. Also assume all
+  // registers used in a call must not be changed (ABI).
   // FIXME: The issue with predicated instruction is more complex. We are being
   // conservative here because the kill markers cannot be trusted after
   // if-conversion:
@@ -200,6 +200,28 @@ void CriticalAntiDepBreaker::PrescanInstruction(MachineInstr *MI) {
     if (Classes[Reg] != reinterpret_cast<TargetRegisterClass *>(-1))
       RegRefs.insert(std::make_pair(Reg, &MO));
 
+    // If this reg is tied and live (Classes[Reg] is set to -1), we can't change
+    // it or any of its sub or super regs. We need to use KeepRegs to mark the
+    // reg because not all uses of the same reg within an instruction are
+    // necessarily tagged as tied.
+    // Example: an x86 "xor %eax, %eax" will have one source operand tied to the
+    // def register but not the second (see PR20020 for details).
+    // FIXME: can this check be relaxed to account for undef uses
+    // of a register?
In the above 'xor' example, the uses of %eax are undef, so + // earlier instructions could still replace %eax even though the 'xor' + // itself can't be changed. + if (MI->isRegTiedToUseOperand(i) && + Classes[Reg] == reinterpret_cast(-1)) { + for (MCSubRegIterator SubRegs(Reg, TRI, /*IncludeSelf=*/true); + SubRegs.isValid(); ++SubRegs) { + KeepRegs.set(*SubRegs); + } + for (MCSuperRegIterator SuperRegs(Reg, TRI); + SuperRegs.isValid(); ++SuperRegs) { + KeepRegs.set(*SuperRegs); + } + } + if (MO.isUse() && Special) { if (!KeepRegs.test(Reg)) { for (MCSubRegIterator SubRegs(Reg, TRI, /*IncludeSelf=*/true); @@ -236,9 +258,15 @@ void CriticalAntiDepBreaker::ScanInstruction(MachineInstr *MI, unsigned Reg = MO.getReg(); if (Reg == 0) continue; if (!MO.isDef()) continue; + + // If we've already marked this reg as unchangeable, carry on. + if (KeepRegs.test(Reg)) continue; + // Ignore two-addr defs. if (MI->isRegTiedToUseOperand(i)) continue; + // FIXME: we should use a SubRegIterator that includes self (as above), so + // we don't have to repeat all this code for the reg itself. DefIndices[Reg] = Count; KillIndices[Reg] = ~0u; assert(((KillIndices[Reg] == ~0u) != @@ -281,6 +309,9 @@ void CriticalAntiDepBreaker::ScanInstruction(MachineInstr *MI, RegRefs.insert(std::make_pair(Reg, &MO)); + // FIXME: we should use an MCRegAliasIterator that includes self so we don't + // have to repeat all this code for the reg itself. + // It wasn't previously live but now it is, this is a kill. if (KillIndices[Reg] == ~0u) { KillIndices[Reg] = Count; @@ -309,7 +340,7 @@ void CriticalAntiDepBreaker::ScanInstruction(MachineInstr *MI, // the two-address instruction also defines NewReg, as may happen with // pre/postincrement loads. In this case, both the use and def operands are in // RegRefs because the def is inserted by PrescanInstruction and not erased -// during ScanInstruction. So checking for an instructions with definitions of +// during ScanInstruction. So checking for an instruction with definitions of // both NewReg and AntiDepReg covers it. bool CriticalAntiDepBreaker::isNewRegClobberedByRefs(RegRefIter RegRefBegin, @@ -325,7 +356,7 @@ CriticalAntiDepBreaker::isNewRegClobberedByRefs(RegRefIter RegRefBegin, if (RefOper->isDef() && RefOper->isEarlyClobber()) return true; - // Handle cases in which this instructions defines NewReg. + // Handle cases in which this instruction defines NewReg. MachineInstr *MI = RefOper->getParent(); for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { const MachineOperand &CheckOper = MI->getOperand(i); @@ -343,11 +374,11 @@ CriticalAntiDepBreaker::isNewRegClobberedByRefs(RegRefIter RegRefBegin, return true; // Don't allow an instruction using AntiDepReg to be earlyclobbered by - // NewReg + // NewReg. if (CheckOper.isEarlyClobber()) return true; - // Don't allow inline asm to define NewReg at all. Who know what it's + // Don't allow inline asm to define NewReg at all. Who knows what it's // doing with it. if (MI->isInlineAsm()) return true; @@ -494,8 +525,7 @@ BreakAntiDependencies(const std::vector& SUnits, // as we go to help determine which registers are available. unsigned Broken = 0; unsigned Count = InsertPosIndex - 1; - for (MachineBasicBlock::iterator I = End, E = Begin; - I != E; --Count) { + for (MachineBasicBlock::iterator I = End, E = Begin; I != E; --Count) { MachineInstr *MI = --I; if (MI->isDebugValue()) continue; @@ -526,7 +556,7 @@ BreakAntiDependencies(const std::vector& SUnits, // Don't break anti-dependencies on non-allocatable registers. 
AntiDepReg = 0; else if (KeepRegs.test(AntiDepReg)) - // Don't break anti-dependencies if an use down below requires + // Don't break anti-dependencies if a use down below requires // this exact register. AntiDepReg = 0; else { @@ -564,8 +594,7 @@ BreakAntiDependencies(const std::vector& SUnits, // If MI's defs have a special allocation requirement, don't allow // any def registers to be changed. Also assume all registers // defined in a call must not be changed (ABI). - if (MI->isCall() || MI->hasExtraDefRegAllocReq() || - TII->isPredicated(MI)) + if (MI->isCall() || MI->hasExtraDefRegAllocReq() || TII->isPredicated(MI)) // If this instruction's defs have special allocation requirement, don't // break this anti-dependency. AntiDepReg = 0; diff --git a/lib/CodeGen/CriticalAntiDepBreaker.h b/lib/CodeGen/CriticalAntiDepBreaker.h index 1949a48d98f3..45e4ff5c78b9 100644 --- a/lib/CodeGen/CriticalAntiDepBreaker.h +++ b/lib/CodeGen/CriticalAntiDepBreaker.h @@ -55,12 +55,12 @@ class TargetRegisterInfo; typedef std::multimap::const_iterator RegRefIter; - /// KillIndices - The index of the most recent kill (proceding bottom-up), + /// KillIndices - The index of the most recent kill (proceeding bottom-up), /// or ~0u if the register is not live. std::vector KillIndices; - /// DefIndices - The index of the most recent complete def (proceding bottom - /// up), or ~0u if the register is live. + /// DefIndices - The index of the most recent complete def (proceeding + /// bottom up), or ~0u if the register is live. std::vector DefIndices; /// KeepRegs - A set of registers which are live and cannot be changed to diff --git a/lib/CodeGen/GlobalMerge.cpp b/lib/CodeGen/GlobalMerge.cpp index d52fcbfa41ed..5572a062912f 100644 --- a/lib/CodeGen/GlobalMerge.cpp +++ b/lib/CodeGen/GlobalMerge.cpp @@ -199,19 +199,17 @@ bool GlobalMerge::doMerge(SmallVectorImpl &Globals, ? GlobalValue::ExternalLinkage : GlobalValue::InternalLinkage; - // If merged variables have external linkage, we use symbol name of the - // first variable merged as the suffix of global symbol name. This would - // be able to avoid the link-time naming conflict for globalm symbols. - Twine MergedGVName = HasExternal - ? "_MergedGlobals_" + TheFirstExternal->getName() - : "_MergedGlobals"; - StructType *MergedTy = StructType::get(M.getContext(), Tys); Constant *MergedInit = ConstantStruct::get(MergedTy, Inits); + // If merged variables have external linkage, we use symbol name of the + // first variable merged as the suffix of global symbol name. This would + // be able to avoid the link-time naming conflict for globalm symbols. GlobalVariable *MergedGV = new GlobalVariable( - M, MergedTy, isConst, Linkage, MergedInit, MergedGVName, nullptr, - GlobalVariable::NotThreadLocal, AddrSpace); + M, MergedTy, isConst, Linkage, MergedInit, + HasExternal ? "_MergedGlobals_" + TheFirstExternal->getName() + : "_MergedGlobals", + nullptr, GlobalVariable::NotThreadLocal, AddrSpace); for (size_t k = i; k < j; ++k) { GlobalValue::LinkageTypes Linkage = Globals[k]->getLinkage(); diff --git a/lib/CodeGen/LLVMTargetMachine.cpp b/lib/CodeGen/LLVMTargetMachine.cpp index 29062434f00e..df96b945a8d9 100644 --- a/lib/CodeGen/LLVMTargetMachine.cpp +++ b/lib/CodeGen/LLVMTargetMachine.cpp @@ -209,7 +209,7 @@ bool LLVMTargetMachine::addPassesToEmitFile(PassManagerBase &PM, case CGFT_Null: // The Null output is intended for use for performance analysis and testing, // not real users. 
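
The merged-global construction in the GlobalMerge change above, reduced to its essentials for two members (a sketch under the naming scheme described there; linkage selection and address-space handling omitted):

    #include "llvm/ADT/StringRef.h"
    #include "llvm/IR/Constants.h"
    #include "llvm/IR/DerivedTypes.h"
    #include "llvm/IR/GlobalVariable.h"
    #include "llvm/IR/Module.h"
    using namespace llvm;

    GlobalVariable *mergeTwoGlobals(Module &M, Constant *InitA, Constant *InitB,
                                    StringRef FirstExternalName) {
      Type *Tys[] = {InitA->getType(), InitB->getType()};
      StructType *MergedTy = StructType::get(M.getContext(), Tys);
      Constant *Inits[] = {InitA, InitB};
      Constant *MergedInit = ConstantStruct::get(MergedTy, Inits);
      // Suffixing with the first external member's name keeps the merged
      // symbol unique across translation units.
      return new GlobalVariable(M, MergedTy, /*isConstant=*/false,
                                GlobalValue::ExternalLinkage, MergedInit,
                                "_MergedGlobals_" + FirstExternalName);
    }
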
- AsmStreamer.reset(createNullStreamer(*Context)); + AsmStreamer.reset(getTarget().createNullStreamer(*Context)); break; } diff --git a/lib/CodeGen/MachineBasicBlock.cpp b/lib/CodeGen/MachineBasicBlock.cpp index 0ec5c338a248..08fef5ffaf44 100644 --- a/lib/CodeGen/MachineBasicBlock.cpp +++ b/lib/CodeGen/MachineBasicBlock.cpp @@ -332,7 +332,7 @@ void MachineBasicBlock::print(raw_ostream &OS, SlotIndexes *Indexes) const { } } -void MachineBasicBlock::printAsOperand(raw_ostream &OS, bool /*PrintType*/) { +void MachineBasicBlock::printAsOperand(raw_ostream &OS, bool /*PrintType*/) const { OS << "BB#" << getNumber(); } diff --git a/lib/CodeGen/MachineDominanceFrontier.cpp b/lib/CodeGen/MachineDominanceFrontier.cpp new file mode 100644 index 000000000000..0bee84668f01 --- /dev/null +++ b/lib/CodeGen/MachineDominanceFrontier.cpp @@ -0,0 +1,54 @@ +//===- MachineDominanceFrontier.cpp ---------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/MachineDominanceFrontier.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/Analysis/DominanceFrontierImpl.h" +#include "llvm/CodeGen/Passes.h" + + +using namespace llvm; + +namespace llvm { +template class DominanceFrontierBase; +template class ForwardDominanceFrontierBase; +} + + +char MachineDominanceFrontier::ID = 0; + +INITIALIZE_PASS_BEGIN(MachineDominanceFrontier, "machine-domfrontier", + "Machine Dominance Frontier Construction", true, true) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_END(MachineDominanceFrontier, "machine-domfrontier", + "Machine Dominance Frontier Construction", true, true) + +MachineDominanceFrontier::MachineDominanceFrontier() + : MachineFunctionPass(ID), + Base() { + initializeMachineDominanceFrontierPass(*PassRegistry::getPassRegistry()); +} + +char &llvm::MachineDominanceFrontierID = MachineDominanceFrontier::ID; + +bool MachineDominanceFrontier::runOnMachineFunction(MachineFunction &) { + releaseMemory(); + Base.analyze(getAnalysis().getBase()); + return false; +} + +void MachineDominanceFrontier::releaseMemory() { + Base.releaseMemory(); +} + +void MachineDominanceFrontier::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); + AU.addRequired(); + MachineFunctionPass::getAnalysisUsage(AU); +} diff --git a/lib/CodeGen/MachineFunction.cpp b/lib/CodeGen/MachineFunction.cpp index eb3d71fef5c7..7e9b7559517d 100644 --- a/lib/CodeGen/MachineFunction.cpp +++ b/lib/CodeGen/MachineFunction.cpp @@ -457,7 +457,7 @@ unsigned MachineFunction::addLiveIn(unsigned PReg, /// getJTISymbol - Return the MCSymbol for the specified non-empty jump table. /// If isLinkerPrivate is specified, an 'l' label is returned, otherwise a /// normal 'L' label is returned. 
-MCSymbol *MachineFunction::getJTISymbol(unsigned JTI, MCContext &Ctx, +MCSymbol *MachineFunction::getJTISymbol(unsigned JTI, MCContext &Ctx, bool isLinkerPrivate) const { const DataLayout *DL = getTarget().getDataLayout(); assert(JumpTableInfo && "No jump tables"); @@ -530,10 +530,9 @@ int MachineFrameInfo::CreateStackObject(uint64_t Size, unsigned Alignment, /// int MachineFrameInfo::CreateSpillStackObject(uint64_t Size, unsigned Alignment) { - Alignment = - clampStackAlignment(!getFrameLowering()->isStackRealignable() || - !RealignOption, - Alignment, getFrameLowering()->getStackAlignment()); + Alignment = clampStackAlignment( + !getFrameLowering()->isStackRealignable() || !RealignOption, Alignment, + getFrameLowering()->getStackAlignment()); CreateStackObject(Size, Alignment, true); int Index = (int)Objects.size() - NumFixedObjects - 1; ensureMaxAlignment(Alignment); @@ -548,10 +547,9 @@ int MachineFrameInfo::CreateSpillStackObject(uint64_t Size, int MachineFrameInfo::CreateVariableSizedObject(unsigned Alignment, const AllocaInst *Alloca) { HasVarSizedObjects = true; - Alignment = - clampStackAlignment(!getFrameLowering()->isStackRealignable() || - !RealignOption, - Alignment, getFrameLowering()->getStackAlignment()); + Alignment = clampStackAlignment( + !getFrameLowering()->isStackRealignable() || !RealignOption, Alignment, + getFrameLowering()->getStackAlignment()); Objects.push_back(StackObject(0, Alignment, 0, false, false, Alloca)); ensureMaxAlignment(Alignment); return (int)Objects.size()-NumFixedObjects-1; @@ -571,16 +569,30 @@ int MachineFrameInfo::CreateFixedObject(uint64_t Size, int64_t SPOffset, // object is 16-byte aligned. unsigned StackAlign = getFrameLowering()->getStackAlignment(); unsigned Align = MinAlign(SPOffset, StackAlign); - Align = - clampStackAlignment(!getFrameLowering()->isStackRealignable() || - !RealignOption, - Align, getFrameLowering()->getStackAlignment()); + Align = clampStackAlignment(!getFrameLowering()->isStackRealignable() || + !RealignOption, + Align, getFrameLowering()->getStackAlignment()); Objects.insert(Objects.begin(), StackObject(Size, Align, SPOffset, Immutable, /*isSS*/ false, /*Alloca*/ nullptr)); return -++NumFixedObjects; } +/// CreateFixedSpillStackObject - Create a spill slot at a fixed location +/// on the stack. Returns an index with a negative value. 
+int MachineFrameInfo::CreateFixedSpillStackObject(uint64_t Size, + int64_t SPOffset) { + unsigned StackAlign = getFrameLowering()->getStackAlignment(); + unsigned Align = MinAlign(SPOffset, StackAlign); + Align = clampStackAlignment(!getFrameLowering()->isStackRealignable() || + !RealignOption, + Align, getFrameLowering()->getStackAlignment()); + Objects.insert(Objects.begin(), StackObject(Size, Align, SPOffset, + /*Immutable*/ true, + /*isSS*/ true, + /*Alloca*/ nullptr)); + return -++NumFixedObjects; +} BitVector MachineFrameInfo::getPristineRegs(const MachineBasicBlock *MBB) const { @@ -824,6 +836,37 @@ unsigned MachineConstantPoolEntry::getRelocationInfo() const { return Val.ConstVal->getRelocationInfo(); } +SectionKind +MachineConstantPoolEntry::getSectionKind(const DataLayout *DL) const { + SectionKind Kind; + switch (getRelocationInfo()) { + default: + llvm_unreachable("Unknown section kind"); + case 2: + Kind = SectionKind::getReadOnlyWithRel(); + break; + case 1: + Kind = SectionKind::getReadOnlyWithRelLocal(); + break; + case 0: + switch (DL->getTypeAllocSize(getType())) { + case 4: + Kind = SectionKind::getMergeableConst4(); + break; + case 8: + Kind = SectionKind::getMergeableConst8(); + break; + case 16: + Kind = SectionKind::getMergeableConst16(); + break; + default: + Kind = SectionKind::getMergeableConst(); + break; + } + } + return Kind; +} + MachineConstantPool::~MachineConstantPool() { for (unsigned i = 0, e = Constants.size(); i != e; ++i) if (Constants[i].isMachineConstantPoolEntry()) @@ -849,11 +892,10 @@ static bool CanShareConstantPoolEntry(const Constant *A, const Constant *B, if (isa(A->getType()) || isa(A->getType()) || isa(B->getType()) || isa(B->getType())) return false; - + // For now, only support constants with the same size. uint64_t StoreSize = TD->getTypeStoreSize(A->getType()); - if (StoreSize != TD->getTypeStoreSize(B->getType()) || - StoreSize > 128) + if (StoreSize != TD->getTypeStoreSize(B->getType()) || StoreSize > 128) return false; Type *IntTy = IntegerType::get(A->getContext(), StoreSize*8); @@ -882,7 +924,7 @@ static bool CanShareConstantPoolEntry(const Constant *A, const Constant *B, /// an existing one. User must specify the log2 of the minimum required /// alignment for the object. 
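
Both fixed-object creators above derive the object's known alignment with MinAlign(SPOffset, StackAlign): an object at a given offset from an aligned base is only guaranteed the largest power of two dividing both. A standalone check (minAlign here reimplements llvm::MinAlign):

    #include <cassert>
    #include <cstdint>

    uint64_t minAlign(uint64_t A, uint64_t B) {
      // Lowest set bit of (A | B), the same formula as llvm::MinAlign.
      return (A | B) & (1 + ~(A | B));
    }

    void fixedObjectAlignmentExample() {
      assert(minAlign(8, 16) == 8);  // offset 8 in a 16-aligned frame
      assert(minAlign(4, 16) == 4);  // offset 4 only guarantees 4
      assert(minAlign(0, 16) == 16); // offset 0 keeps the full alignment
    }
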
///
-unsigned MachineConstantPool::getConstantPoolIndex(const Constant *C, 
+unsigned MachineConstantPool::getConstantPoolIndex(const Constant *C,
                                                    unsigned Alignment) {
   assert(Alignment && "Alignment must be specified!");
   if (Alignment > PoolAlignment) PoolAlignment = Alignment;
diff --git a/lib/CodeGen/MachineRegionInfo.cpp b/lib/CodeGen/MachineRegionInfo.cpp
new file mode 100644
index 000000000000..c6b6802b0eb1
--- /dev/null
+++ b/lib/CodeGen/MachineRegionInfo.cpp
@@ -0,0 +1,138 @@
+
+#include "llvm/CodeGen/MachineRegionInfo.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/RegionInfoImpl.h"
+
+using namespace llvm;
+
+STATISTIC(numMachineRegions, "The # of machine regions");
+STATISTIC(numMachineSimpleRegions, "The # of simple machine regions");
+
+namespace llvm {
+template class RegionBase<RegionTraits<MachineFunction>>;
+template class RegionNodeBase<RegionTraits<MachineFunction>>;
+template class RegionInfoBase<RegionTraits<MachineFunction>>;
+}
+
+//===----------------------------------------------------------------------===//
+// MachineRegion implementation
+//
+
+MachineRegion::MachineRegion(MachineBasicBlock *Entry, MachineBasicBlock *Exit,
+                             MachineRegionInfo* RI,
+                             MachineDominatorTree *DT, MachineRegion *Parent) :
+  RegionBase<RegionTraits<MachineFunction>>(Entry, Exit, RI, DT, Parent) {
+
+}
+
+MachineRegion::~MachineRegion() { }
+
+//===----------------------------------------------------------------------===//
+// MachineRegionInfo implementation
+//
+
+MachineRegionInfo::MachineRegionInfo() :
+  RegionInfoBase<RegionTraits<MachineFunction>>() {
+
+}
+
+MachineRegionInfo::~MachineRegionInfo() {
+
+}
+
+void MachineRegionInfo::updateStatistics(MachineRegion *R) {
+  ++numMachineRegions;
+
+  // TODO: Slow. Should only be enabled if -stats is used.
+  if (R->isSimple())
+    ++numMachineSimpleRegions;
+}
+
+void MachineRegionInfo::recalculate(MachineFunction &F,
+                                    MachineDominatorTree *DT_,
+                                    MachinePostDominatorTree *PDT_,
+                                    MachineDominanceFrontier *DF_) {
+  DT = DT_;
+  PDT = PDT_;
+  DF = DF_;
+
+  MachineBasicBlock *Entry = GraphTraits<MachineFunction*>::getEntryNode(&F);
+
+  TopLevelRegion = new MachineRegion(Entry, nullptr, this, DT, nullptr);
+  updateStatistics(TopLevelRegion);
+  calculate(F);
+}
+
+//===----------------------------------------------------------------------===//
+// MachineRegionInfoPass implementation
+//
+
+MachineRegionInfoPass::MachineRegionInfoPass() : MachineFunctionPass(ID) {
+  initializeMachineRegionInfoPassPass(*PassRegistry::getPassRegistry());
+}
+
+MachineRegionInfoPass::~MachineRegionInfoPass() {
+
+}
+
+bool MachineRegionInfoPass::runOnMachineFunction(MachineFunction &F) {
+  releaseMemory();
+
+  auto DT = &getAnalysis<MachineDominatorTree>();
+  auto PDT = &getAnalysis<MachinePostDominatorTree>();
+  auto DF = &getAnalysis<MachineDominanceFrontier>();
+
+  RI.recalculate(F, DT, PDT, DF);
+  return false;
+}
+
+void MachineRegionInfoPass::releaseMemory() {
+  RI.releaseMemory();
+}
+
+void MachineRegionInfoPass::verifyAnalysis() const {
+  // Only do verification when user wants to, otherwise this expensive check
+  // will be invoked by PMDataManager::verifyPreservedAnalysis when
+  // a region pass (marked PreservedAll) finishes.
+  if (MachineRegionInfo::VerifyRegionInfo)
+    RI.verifyAnalysis();
+}
+
+void MachineRegionInfoPass::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesAll();
+  AU.addRequiredTransitive<MachineDominatorTree>();
+  AU.addRequired<MachinePostDominatorTree>();
+  AU.addRequired<MachineDominanceFrontier>();
+}
+
+void MachineRegionInfoPass::print(raw_ostream &OS, const Module *) const {
+  RI.print(OS);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void MachineRegionInfoPass::dump() const {
+  RI.dump();
+}
+#endif
+
+char MachineRegionInfoPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(MachineRegionInfoPass, "regions",
+                      "Detect single entry single exit regions", true, true)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominanceFrontier)
+INITIALIZE_PASS_END(MachineRegionInfoPass, "regions",
+                    "Detect single entry single exit regions", true, true)
+
+// Create methods available outside of this file, to use them from
+// "include/llvm/LinkAllPasses.h". Otherwise the pass would be deleted by
+// the link-time optimization.
+
+namespace llvm {
+  FunctionPass *createMachineRegionInfoPass() {
+    return new MachineRegionInfoPass();
+  }
+}
+
diff --git a/lib/CodeGen/MachineScheduler.cpp b/lib/CodeGen/MachineScheduler.cpp
index 0baf2a6c1c21..44191f785386 100644
--- a/lib/CodeGen/MachineScheduler.cpp
+++ b/lib/CodeGen/MachineScheduler.cpp
@@ -478,14 +478,13 @@ void MachineSchedulerBase::print(raw_ostream &O, const Module* m) const {
   // unimplemented
 }
 
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD
 void ReadyQueue::dump() {
   dbgs() << Name << ": ";
   for (unsigned i = 0, e = Queue.size(); i < e; ++i)
     dbgs() << Queue[i]->NodeNum << " ";
   dbgs() << "\n";
 }
-#endif
 
 //===----------------------------------------------------------------------===//
 // ScheduleDAGMI - Basic machine instruction scheduling. This is
@@ -1687,8 +1686,16 @@ bool SchedBoundary::checkHazard(SUnit *SU) {
     for (TargetSchedModel::ProcResIter
            PI = SchedModel->getWriteProcResBegin(SC),
            PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) {
-      if (getNextResourceCycle(PI->ProcResourceIdx, PI->Cycles) > CurrCycle)
+      unsigned NRCycle = getNextResourceCycle(PI->ProcResourceIdx, PI->Cycles);
+      if (NRCycle > CurrCycle) {
+#ifndef NDEBUG
+        MaxObservedStall = std::max(PI->Cycles, MaxObservedStall);
+#endif
+        DEBUG(dbgs() << "  SU(" << SU->NodeNum << ") "
+              << SchedModel->getResourceName(PI->ProcResourceIdx)
+              << "=" << NRCycle << "c\n");
         return true;
+      }
     }
   }
   return false;
@@ -1946,10 +1953,12 @@ void SchedBoundary::bumpNode(SUnit *SU) {
            PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) {
       unsigned PIdx = PI->ProcResourceIdx;
       if (SchedModel->getProcResource(PIdx)->BufferSize == 0) {
-        ReservedCycles[PIdx] = isTop() ? NextCycle + PI->Cycles : NextCycle;
-#ifndef NDEBUG
-        MaxObservedStall = std::max(PI->Cycles, MaxObservedStall);
-#endif
+        if (isTop()) {
+          ReservedCycles[PIdx] =
+            std::max(getNextResourceCycle(PIdx, 0), NextCycle + PI->Cycles);
+        }
+        else
+          ReservedCycles[PIdx] = NextCycle;
       }
     }
   }
@@ -2052,8 +2061,10 @@ SUnit *SchedBoundary::pickOnlyChoice() {
     }
   }
   for (unsigned i = 0; Available.empty(); ++i) {
-    assert(i <= (HazardRec->getMaxLookAhead() + MaxObservedStall) &&
-           "permanent hazard"); (void)i;
+// FIXME: Re-enable assert once PR20057 is resolved.
+//    assert(i <= (HazardRec->getMaxLookAhead() + MaxObservedStall) &&
+//           "permanent hazard");
+    (void)i;
     bumpCycle(CurrCycle + 1);
     releasePending();
   }
 }
diff --git a/lib/CodeGen/Passes.cpp b/lib/CodeGen/Passes.cpp
index 9568e238a2ca..249b2d0f6bb9 100644
--- a/lib/CodeGen/Passes.cpp
+++ b/lib/CodeGen/Passes.cpp
@@ -30,11 +30,6 @@
 using namespace llvm;
 
-namespace llvm {
-extern cl::opt<bool> EnableStackMapLiveness;
-extern cl::opt<bool> EnablePatchPointLiveness;
-}
-
 static cl::opt<bool> DisablePostRA("disable-post-ra", cl::Hidden,
     cl::desc("Disable Post Regalloc"));
 static cl::opt<bool> DisableBranchFold("disable-branch-fold", cl::Hidden,
@@ -421,7 +416,7 @@ void TargetPassConfig::addPassesToHandleExceptions() {
     // FALLTHROUGH
   case ExceptionHandling::DwarfCFI:
   case ExceptionHandling::ARM:
-  case ExceptionHandling::Win64:
+  case ExceptionHandling::WinEH:
     addPass(createDwarfEHPass(TM));
     break;
   case ExceptionHandling::None:
@@ -566,8 +561,7 @@ void TargetPassConfig::addMachinePasses() {
   if (addPreEmitPass())
     printAndVerify("After PreEmit passes");
 
-  if (EnableStackMapLiveness || EnablePatchPointLiveness)
-    addPass(&StackMapLivenessID);
+  addPass(&StackMapLivenessID);
 }
 
 /// Add passes that optimize machine instructions in SSA form.
diff --git a/lib/CodeGen/PeepholeOptimizer.cpp b/lib/CodeGen/PeepholeOptimizer.cpp
index eeee93a8895e..716cb1f46f15 100644
--- a/lib/CodeGen/PeepholeOptimizer.cpp
+++ b/lib/CodeGen/PeepholeOptimizer.cpp
@@ -91,6 +91,10 @@
 static cl::opt<bool>
 DisablePeephole("disable-peephole", cl::Hidden, cl::init(false),
                 cl::desc("Disable the peephole optimizer"));
 
+static cl::opt<bool>
+DisableAdvCopyOpt("disable-adv-copy-opt", cl::Hidden, cl::init(true),
+                  cl::desc("Disable advanced copy optimization"));
+
 STATISTIC(NumReuse, "Number of extension results reused");
 STATISTIC(NumCmps, "Number of compares eliminated");
 STATISTIC(NumImmFold, "Number of move immediate folded");
@@ -137,6 +141,105 @@ namespace {
     bool isLoadFoldable(MachineInstr *MI,
                         SmallSet<unsigned, 16> &FoldAsLoadDefCandidates);
   };
+
+  /// \brief Helper class to track the possible sources of a value defined by
+  /// a (chain of) copy-related instructions.
+  /// Given a definition (instruction and definition index), this class
+  /// follows the use-def chain to find successive suitable sources.
+  /// The given source can be used to rewrite the definition into
+  /// def = COPY src.
+  ///
+  /// For instance, let us consider the following snippet:
+  /// v0 =
+  /// v2 = INSERT_SUBREG v1, v0, sub0
+  /// def = COPY v2.sub0
+  ///
+  /// Using a ValueTracker for def = COPY v2.sub0 will give the following
+  /// suitable sources:
+  /// v2.sub0 and v0.
+  /// Then, def can be rewritten into def = COPY v0.
+  class ValueTracker {
+  private:
+    /// The current point into the use-def chain.
+    const MachineInstr *Def;
+    /// The index of the definition in Def.
+    unsigned DefIdx;
+    /// The sub register index of the definition.
+    unsigned DefSubReg;
+    /// The register where the value can be found.
+    unsigned Reg;
+    /// Specify whether or not the value tracking looks through
+    /// complex instructions. When this is false, the value tracker
+    /// bails on everything that is not a copy or a bitcast.
+    ///
+    /// Note: This could have been implemented as a specialized version of
+    /// the ValueTracker class but that would have complicated the code of
+    /// the users of this class.
+    bool UseAdvancedTracking;
+    /// Optional MachineRegisterInfo used to perform some complex
+    /// tracking.
+    const MachineRegisterInfo *MRI;
+
+    /// \brief Dispatcher to the right underlying implementation of
+    /// getNextSource.
+    bool getNextSourceImpl(unsigned &SrcIdx, unsigned &SrcSubReg);
+    /// \brief Specialized version of getNextSource for Copy instructions.
+    bool getNextSourceFromCopy(unsigned &SrcIdx, unsigned &SrcSubReg);
+    /// \brief Specialized version of getNextSource for Bitcast instructions.
+    bool getNextSourceFromBitcast(unsigned &SrcIdx, unsigned &SrcSubReg);
+    /// \brief Specialized version of getNextSource for RegSequence
+    /// instructions.
+    bool getNextSourceFromRegSequence(unsigned &SrcIdx, unsigned &SrcSubReg);
+    /// \brief Specialized version of getNextSource for InsertSubreg
+    /// instructions.
+    bool getNextSourceFromInsertSubreg(unsigned &SrcIdx, unsigned &SrcSubReg);
+    /// \brief Specialized version of getNextSource for ExtractSubreg
+    /// instructions.
+    bool getNextSourceFromExtractSubreg(unsigned &SrcIdx, unsigned &SrcSubReg);
+    /// \brief Specialized version of getNextSource for SubregToReg
+    /// instructions.
+    bool getNextSourceFromSubregToReg(unsigned &SrcIdx, unsigned &SrcSubReg);
+
+  public:
+    /// \brief Create a ValueTracker instance for the value defined by \p MI
+    /// at the operand index \p DefIdx.
+    /// \p DefSubReg represents the sub register index the value tracker will
+    /// track. It does not need to match the sub register index used in \p MI.
+    /// \p UseAdvancedTracking specifies whether or not the value tracker looks
+    /// through complex instructions. By default (false), it handles only copy
+    /// and bitcast instructions.
+    /// \p MRI is useful to perform some complex checks.
+    ValueTracker(const MachineInstr &MI, unsigned DefIdx, unsigned DefSubReg,
+                 bool UseAdvancedTracking = false,
+                 const MachineRegisterInfo *MRI = nullptr)
+        : Def(&MI), DefIdx(DefIdx), DefSubReg(DefSubReg),
+          UseAdvancedTracking(UseAdvancedTracking), MRI(MRI) {
+      assert(Def->getOperand(DefIdx).isDef() &&
+             Def->getOperand(DefIdx).isReg() &&
+             "Definition does not match machine instruction");
+      // Initially the value is in the defined register.
+      Reg = Def->getOperand(DefIdx).getReg();
+    }
+
+    /// \brief Following the use-def chain, get the next available source
+    /// for the tracked value.
+    /// When the returned value is not nullptr, getReg() gives the register
+    /// that contains the tracked value.
+    /// \note The sub register index returned in \p SrcSubReg must be used
+    /// on that getReg() to access the actual value.
+    /// \return Unless the returned value is nullptr (i.e., no source found),
+    /// \p SrcIdx gives the index of the next source in the returned
+    /// instruction and \p SrcSubReg the index to be used on that source to
+    /// get the tracked value. When nullptr is returned, no alternative source
+    /// has been found.
+    const MachineInstr *getNextSource(unsigned &SrcIdx, unsigned &SrcSubReg);
+
+    /// \brief Get the last register where the initial value can be found.
+    /// Initially this is the register of the definition.
+    /// Then, after each successful call to getNextSource, this is the
+    /// register of the last source.
+    unsigned getReg() const { return Reg; }
+  };
 }
 
 char PeepholeOptimizer::ID = 0;
@@ -443,31 +546,32 @@ bool PeepholeOptimizer::optimizeCopyOrBitcast(MachineInstr *MI) {
   unsigned Src;
   unsigned SrcSubReg;
   bool ShouldRewrite = false;
-  MachineInstr *Copy = MI;
   const TargetRegisterInfo &TRI = *TM->getRegisterInfo();
 
-  // Follow the chain of copies until we reach the top or find a
-  // more suitable source.
+ // Follow the chain of copies until we reach the top of the use-def chain + // or find a more suitable source. + ValueTracker ValTracker(*MI, DefIdx, DefSubReg, !DisableAdvCopyOpt, MRI); do { - unsigned CopyDefIdx, CopySrcIdx; - if (!getCopyOrBitcastDefUseIdx(*Copy, CopyDefIdx, CopySrcIdx)) + unsigned CopySrcIdx, CopySrcSubReg; + if (!ValTracker.getNextSource(CopySrcIdx, CopySrcSubReg)) break; - const MachineOperand &MO = Copy->getOperand(CopySrcIdx); - assert(MO.isReg() && "Copies must be between registers."); - Src = MO.getReg(); - + Src = ValTracker.getReg(); + SrcSubReg = CopySrcSubReg; + + // Do not extend the live-ranges of physical registers as they add + // constraints to the register allocator. + // Moreover, if we want to extend the live-range of a physical register, + // unlike SSA virtual registers, we will have to check that they are not + // redefined before the related use. if (TargetRegisterInfo::isPhysicalRegister(Src)) break; const TargetRegisterClass *SrcRC = MRI->getRegClass(Src); - SrcSubReg = MO.getSubReg(); // If this source does not incur a cross register bank copy, use it. ShouldRewrite = shareSameRegisterFile(TRI, DefRC, DefSubReg, SrcRC, SrcSubReg); - // Follow the chain of copies: get the definition of Src. - Copy = MRI->getVRegDef(Src); - } while (!ShouldRewrite && Copy && (Copy->isCopy() || Copy->isBitcast())); + } while (!ShouldRewrite); // If we did not find a more suitable source, there is nothing to optimize. if (!ShouldRewrite || Src == MI->getOperand(SrcIdx).getReg()) @@ -483,6 +587,9 @@ bool PeepholeOptimizer::optimizeCopyOrBitcast(MachineInstr *MI) { MRI->replaceRegWith(Def, NewVR); MRI->clearKillFlags(NewVR); + // We extended the lifetime of Src. + // Clear the kill flags to account for that. + MRI->clearKillFlags(Src); MI->eraseFromParent(); ++NumCopiesBitcasts; return true; @@ -673,3 +780,251 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) { return Changed; } + +bool ValueTracker::getNextSourceFromCopy(unsigned &SrcIdx, + unsigned &SrcSubReg) { + assert(Def->isCopy() && "Invalid definition"); + // Copy instructions are supposed to be: Def = Src. + // If someone breaks this assumption, bad things will happen everywhere. + assert(Def->getDesc().getNumOperands() == 2 && "Invalid number of operands"); + + if (Def->getOperand(DefIdx).getSubReg() != DefSubReg) + // If we look for a different subreg, it means we want a subreg of src. + // Bail as we do not support composing subregs yet. + return false; + // Otherwise, we want the whole source. + SrcIdx = 1; + SrcSubReg = Def->getOperand(SrcIdx).getSubReg(); + return true; +} + +bool ValueTracker::getNextSourceFromBitcast(unsigned &SrcIdx, + unsigned &SrcSubReg) { + assert(Def->isBitcast() && "Invalid definition"); + + // Bail if there are effects that a plain copy will not expose. + if (Def->hasUnmodeledSideEffects()) + return false; + + // Bitcasts with more than one def are not supported. + if (Def->getDesc().getNumDefs() != 1) + return false; + if (Def->getOperand(DefIdx).getSubReg() != DefSubReg) + // If we look for a different subreg, it means we want a subreg of the src. + // Bail as we do not support composing subregs yet.
+ return false; + + SrcIdx = Def->getDesc().getNumOperands(); + for (unsigned OpIdx = DefIdx + 1, EndOpIdx = SrcIdx; OpIdx != EndOpIdx; + ++OpIdx) { + const MachineOperand &MO = Def->getOperand(OpIdx); + if (!MO.isReg() || !MO.getReg()) + continue; + assert(!MO.isDef() && "We should have skipped all the definitions by now"); + if (SrcIdx != EndOpIdx) + // Multiple sources? + return false; + SrcIdx = OpIdx; + } + SrcSubReg = Def->getOperand(SrcIdx).getSubReg(); + return true; +} + +bool ValueTracker::getNextSourceFromRegSequence(unsigned &SrcIdx, + unsigned &SrcSubReg) { + assert(Def->isRegSequence() && "Invalid definition"); + + if (Def->getOperand(DefIdx).getSubReg()) + // If we are composing subregs, bail out. + // The case we are checking is Def. = REG_SEQUENCE. + // This should almost never happen as the SSA property is tracked at + // the register level (as opposed to the subreg level). + // I.e., + // Def.sub0 = + // Def.sub1 = + // is a valid SSA representation for Def.sub0 and Def.sub1, but not for + // Def. Thus, it must not be generated. + // However, some code could theoretically generate a single + // Def.sub0 (i.e., not defining the other subregs) and we would + // have this case. + // If we can ascertain (or force) that this never happens, we could + // turn that into an assertion. + return false; + + // We are looking at: + // Def = REG_SEQUENCE v0, sub0, v1, sub1, ... + // Check if one of the operands defines the subreg we are interested in. + for (unsigned OpIdx = DefIdx + 1, EndOpIdx = Def->getNumOperands(); + OpIdx != EndOpIdx; OpIdx += 2) { + const MachineOperand &MOSubIdx = Def->getOperand(OpIdx + 1); + assert(MOSubIdx.isImm() && + "One of the subindices of the reg_sequence is not an immediate"); + if (MOSubIdx.getImm() == DefSubReg) { + assert(Def->getOperand(OpIdx).isReg() && + "One of the sources of the reg_sequence is not a register"); + SrcIdx = OpIdx; + SrcSubReg = Def->getOperand(SrcIdx).getSubReg(); + return true; + } + } + + // If the subreg we are tracking is super-defined by another subreg, + // we could follow this value. However, this would require composing + // the subregs and we do not do that for now. + return false; +} + +bool ValueTracker::getNextSourceFromInsertSubreg(unsigned &SrcIdx, + unsigned &SrcSubReg) { + assert(Def->isInsertSubreg() && "Invalid definition"); + if (Def->getOperand(DefIdx).getSubReg()) + // If we are composing subregs, bail out. + // Same remark as getNextSourceFromRegSequence. + // I.e., this may be turned into an assert. + return false; + + // We are looking at: + // Def = INSERT_SUBREG v0, v1, sub1 + // There are two cases: + // 1. DefSubReg == sub1, get v1. + // 2. DefSubReg != sub1, the value may be available through v0. + + // #1 Check if the inserted register matches the required sub index. + unsigned InsertedSubReg = Def->getOperand(3).getImm(); + if (InsertedSubReg == DefSubReg) { + SrcIdx = 2; + SrcSubReg = Def->getOperand(SrcIdx).getSubReg(); + return true; + } + // #2 Otherwise, if the sub register we are looking for is not partially + // defined by the inserted element, we can look through the main + // register (v0). + // To check for overlap we need an MRI and a TRI. + if (!MRI) + return false; + + const MachineOperand &MODef = Def->getOperand(DefIdx); + const MachineOperand &MOBase = Def->getOperand(1); + // If the result register (Def) and the base register (v0) do not + // have the same register class or if we have to compose + // subregisters, bail out.
+ if (MRI->getRegClass(MODef.getReg()) != MRI->getRegClass(MOBase.getReg()) || + MOBase.getSubReg()) + return false; + + // Get the TRI and check if inserted sub register overlaps with the + // sub register we are tracking. + const TargetRegisterInfo *TRI = MRI->getTargetRegisterInfo(); + if (!TRI || + (TRI->getSubRegIndexLaneMask(DefSubReg) & + TRI->getSubRegIndexLaneMask(InsertedSubReg)) != 0) + return false; + // At this point, the value is available in v0 via the same subreg + // we used for Def. + SrcIdx = 1; + SrcSubReg = DefSubReg; + return true; +} + +bool ValueTracker::getNextSourceFromExtractSubreg(unsigned &SrcIdx, + unsigned &SrcSubReg) { + assert(Def->isExtractSubreg() && "Invalid definition"); + // We are looking at: + // Def = EXTRACT_SUBREG v0, sub0 + + // Bail if we have to compose sub registers. + // Indeed, if DefSubReg != 0, we would have to compose it with sub0. + if (DefSubReg) + return false; + + // Bail if we have to compose sub registers. + // Likewise, if v0.subreg != 0, we would have to compose v0.subreg with sub0. + if (Def->getOperand(1).getSubReg()) + return false; + // Otherwise, the value is available in v0.sub0. + SrcIdx = 1; + SrcSubReg = Def->getOperand(2).getImm(); + return true; +} + +bool ValueTracker::getNextSourceFromSubregToReg(unsigned &SrcIdx, + unsigned &SrcSubReg) { + assert(Def->isSubregToReg() && "Invalid definition"); + // We are looking at: + // Def = SUBREG_TO_REG Imm, v0, sub0 + + // Bail if we have to compose sub registers. + // If DefSubReg != sub0, we would have to check that all the bits + // we track are included in sub0 and if yes, we would have to + // determine the right subreg in v0. + if (DefSubReg != Def->getOperand(3).getImm()) + return false; + // Bail if we have to compose sub registers. + // Likewise, if v0.subreg != 0, we would have to compose it with sub0. + if (Def->getOperand(2).getSubReg()) + return false; + + SrcIdx = 2; + SrcSubReg = Def->getOperand(3).getImm(); + return true; +} + +bool ValueTracker::getNextSourceImpl(unsigned &SrcIdx, unsigned &SrcSubReg) { + assert(Def && "This method needs a valid definition"); + + assert( + (DefIdx < Def->getDesc().getNumDefs() || Def->getDesc().isVariadic()) && + Def->getOperand(DefIdx).isDef() && "Invalid DefIdx"); + if (Def->isCopy()) + return getNextSourceFromCopy(SrcIdx, SrcSubReg); + if (Def->isBitcast()) + return getNextSourceFromBitcast(SrcIdx, SrcSubReg); + // All the remaining cases involve "complex" instructions. + // Bail if we did not ask for the advanced tracking. + if (!UseAdvancedTracking) + return false; + if (Def->isRegSequence()) + return getNextSourceFromRegSequence(SrcIdx, SrcSubReg); + if (Def->isInsertSubreg()) + return getNextSourceFromInsertSubreg(SrcIdx, SrcSubReg); + if (Def->isExtractSubreg()) + return getNextSourceFromExtractSubreg(SrcIdx, SrcSubReg); + if (Def->isSubregToReg()) + return getNextSourceFromSubregToReg(SrcIdx, SrcSubReg); + return false; +} + +const MachineInstr *ValueTracker::getNextSource(unsigned &SrcIdx, + unsigned &SrcSubReg) { + // If we reach a point where we cannot move up in the use-def chain, + // there is nothing we can get. + if (!Def) + return nullptr; + + const MachineInstr *PrevDef = nullptr; + // Try to find the next source. + if (getNextSourceImpl(SrcIdx, SrcSubReg)) { + // Update definition, definition index, and subregister for the + // next call of getNextSource. + const MachineOperand &MO = Def->getOperand(SrcIdx); + assert(MO.isReg() && !MO.isDef() && "Source is invalid"); + // Update the current register.
+ Reg = MO.getReg(); + // Update the return value before moving up in the use-def chain. + PrevDef = Def; + // If we can still move up in the use-def chain, move to the next + // definition. + if (!TargetRegisterInfo::isPhysicalRegister(Reg)) { + Def = MRI->getVRegDef(Reg); + DefIdx = MRI->def_begin(Reg).getOperandNo(); + DefSubReg = SrcSubReg; + return PrevDef; + } + } + // If we end up here, this means we will not be able to find another source + // for the next iteration. + // Make sure any new call to getNextSource bails out early by cutting the + // use-def chain. + Def = nullptr; + return PrevDef; +} diff --git a/lib/CodeGen/PostRASchedulerList.cpp b/lib/CodeGen/PostRASchedulerList.cpp index db3933e5eedf..a1ab3445fa4f 100644 --- a/lib/CodeGen/PostRASchedulerList.cpp +++ b/lib/CodeGen/PostRASchedulerList.cpp @@ -98,6 +98,11 @@ namespace { } bool runOnMachineFunction(MachineFunction &Fn) override; + + bool enablePostRAScheduler( + const TargetSubtargetInfo &ST, CodeGenOpt::Level OptLevel, + TargetSubtargetInfo::AntiDepBreakMode &Mode, + TargetSubtargetInfo::RegClassVector &CriticalPathRCs) const; }; char PostRAScheduler::ID = 0; @@ -245,6 +250,17 @@ void SchedulePostRATDList::dumpSchedule() const { } #endif +bool PostRAScheduler::enablePostRAScheduler( + const TargetSubtargetInfo &ST, + CodeGenOpt::Level OptLevel, + TargetSubtargetInfo::AntiDepBreakMode &Mode, + TargetSubtargetInfo::RegClassVector &CriticalPathRCs) const { + Mode = ST.getAntiDepBreakMode(); + ST.getCriticalPathRCs(CriticalPathRCs); + return ST.enablePostMachineScheduler() && + OptLevel >= ST.getOptLevelToEnablePostRAScheduler(); +} + bool PostRAScheduler::runOnMachineFunction(MachineFunction &Fn) { if (skipOptnoneFunction(*Fn.getFunction())) return false; @@ -267,9 +283,10 @@ bool PostRAScheduler::runOnMachineFunction(MachineFunction &Fn) { } else { // Check that post-RA scheduling is enabled for this target. // This may upgrade the AntiDepMode. - const TargetSubtargetInfo &ST = Fn.getTarget().getSubtarget(); - if (!ST.enablePostRAScheduler(PassConfig->getOptLevel(), AntiDepMode, - CriticalPathRCs)) + const TargetSubtargetInfo &ST = + Fn.getTarget().getSubtarget(); + if (!enablePostRAScheduler(ST, PassConfig->getOptLevel(), + AntiDepMode, CriticalPathRCs)) return false; } diff --git a/lib/CodeGen/PrologEpilogInserter.cpp b/lib/CodeGen/PrologEpilogInserter.cpp index 1ba2c7418f21..b98d210e9d57 100644 --- a/lib/CodeGen/PrologEpilogInserter.cpp +++ b/lib/CodeGen/PrologEpilogInserter.cpp @@ -268,51 +268,56 @@ void PEI::calculateCalleeSavedRegisters(MachineFunction &F) { } } - if (CSI.empty()) - return; // Early exit if no callee saved registers are modified! - - unsigned NumFixedSpillSlots; - const TargetFrameLowering::SpillSlot *FixedSpillSlots = - TFI->getCalleeSavedSpillSlots(NumFixedSpillSlots); + if (!TFI->assignCalleeSavedSpillSlots(F, RegInfo, CSI)) { + // If the target doesn't implement this, use generic code. + + if (CSI.empty()) + return; // Early exit if no callee saved registers are modified! + + unsigned NumFixedSpillSlots; + const TargetFrameLowering::SpillSlot *FixedSpillSlots = + TFI->getCalleeSavedSpillSlots(NumFixedSpillSlots); + + // Now that we know which registers need to be saved and restored, allocate + // stack slots for them.
+ for (std::vector::iterator I = CSI.begin(), E = CSI.end(); + I != E; ++I) { + unsigned Reg = I->getReg(); + const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg); + + int FrameIdx; + if (RegInfo->hasReservedSpillSlot(F, Reg, FrameIdx)) { + I->setFrameIdx(FrameIdx); + continue; + } - // Now that we know which registers need to be saved and restored, allocate - // stack slots for them. - for (std::vector::iterator - I = CSI.begin(), E = CSI.end(); I != E; ++I) { - unsigned Reg = I->getReg(); - const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg); + // Check to see if this physreg must be spilled to a particular stack slot + // on this target. + const TargetFrameLowering::SpillSlot *FixedSlot = FixedSpillSlots; + while (FixedSlot != FixedSpillSlots + NumFixedSpillSlots && + FixedSlot->Reg != Reg) + ++FixedSlot; + + if (FixedSlot == FixedSpillSlots + NumFixedSpillSlots) { + // Nope, just spill it anywhere convenient. + unsigned Align = RC->getAlignment(); + unsigned StackAlign = TFI->getStackAlignment(); + + // We may not be able to satisfy the desired alignment specification of + // the TargetRegisterClass if the stack alignment is smaller. Use the + // min. + Align = std::min(Align, StackAlign); + FrameIdx = MFI->CreateStackObject(RC->getSize(), Align, true); + if ((unsigned)FrameIdx < MinCSFrameIndex) MinCSFrameIndex = FrameIdx; + if ((unsigned)FrameIdx > MaxCSFrameIndex) MaxCSFrameIndex = FrameIdx; + } else { + // Spill it to the stack where we must. + FrameIdx = + MFI->CreateFixedSpillStackObject(RC->getSize(), FixedSlot->Offset); + } - int FrameIdx; - if (RegInfo->hasReservedSpillSlot(F, Reg, FrameIdx)) { I->setFrameIdx(FrameIdx); - continue; } - - // Check to see if this physreg must be spilled to a particular stack slot - // on this target. - const TargetFrameLowering::SpillSlot *FixedSlot = FixedSpillSlots; - while (FixedSlot != FixedSpillSlots+NumFixedSpillSlots && - FixedSlot->Reg != Reg) - ++FixedSlot; - - if (FixedSlot == FixedSpillSlots + NumFixedSpillSlots) { - // Nope, just spill it anywhere convenient. - unsigned Align = RC->getAlignment(); - unsigned StackAlign = TFI->getStackAlignment(); - - // We may not be able to satisfy the desired alignment specification of - // the TargetRegisterClass if the stack alignment is smaller. Use the - // min. - Align = std::min(Align, StackAlign); - FrameIdx = MFI->CreateStackObject(RC->getSize(), Align, true); - if ((unsigned)FrameIdx < MinCSFrameIndex) MinCSFrameIndex = FrameIdx; - if ((unsigned)FrameIdx > MaxCSFrameIndex) MaxCSFrameIndex = FrameIdx; - } else { - // Spill it to the stack where we must. 
- FrameIdx = MFI->CreateFixedObject(RC->getSize(), FixedSlot->Offset, true); - } - - I->setFrameIdx(FrameIdx); } MFI->setCalleeSavedInfo(CSI); diff --git a/lib/CodeGen/RegAllocBasic.cpp b/lib/CodeGen/RegAllocBasic.cpp index b722098c5c75..6bc678e85211 100644 --- a/lib/CodeGen/RegAllocBasic.cpp +++ b/lib/CodeGen/RegAllocBasic.cpp @@ -157,7 +157,7 @@ void RABasic::getAnalysisUsage(AnalysisUsage &AU) const { } void RABasic::releaseMemory() { - SpillerInstance.reset(nullptr); + SpillerInstance.reset(); } diff --git a/lib/CodeGen/RegAllocGreedy.cpp b/lib/CodeGen/RegAllocGreedy.cpp index aa7c1785035b..dee990c2eb5f 100644 --- a/lib/CodeGen/RegAllocGreedy.cpp +++ b/lib/CodeGen/RegAllocGreedy.cpp @@ -44,6 +44,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Timer.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetSubtargetInfo.h" #include using namespace llvm; @@ -79,6 +80,12 @@ ExhaustiveSearch("exhaustive-register-search", cl::NotHidden, cl::desc("Exhaustive Search for registers bypassing the depth " "and interference cutoffs of last chance recoloring")); +static cl::opt EnableLocalReassignment( + "enable-local-reassign", cl::Hidden, + cl::desc("Local reassignment can yield better allocation decisions, but " + "may be compile time intensive"), + cl::init(false)); + // FIXME: Find a good default for this flag and remove the flag. static cl::opt CSRFirstTimeCost("regalloc-csr-first-time-cost", @@ -285,6 +292,10 @@ class RAGreedy : public MachineFunctionPass, /// Callee-save register cost, calculated once per machine function. BlockFrequency CSRCost; + /// Whether or not to run the local reassignment heuristic. This information + /// is obtained from the TargetSubtargetInfo. + bool EnableLocalReassign; + public: RAGreedy(); @@ -475,7 +486,7 @@ void RAGreedy::LRE_DidCloneVirtReg(unsigned New, unsigned Old) { } void RAGreedy::releaseMemory() { - SpillerInstance.reset(nullptr); + SpillerInstance.reset(); ExtraRegInfo.clear(); GlobalCand.clear(); } @@ -731,7 +742,7 @@ bool RAGreedy::canEvictInterference(LiveInterval &VirtReg, unsigned PhysReg, // Evicting another local live range in this case could lead to suboptimal // coloring. if (!MaxCost.isMax() && IsLocal && LIS->intervalIsInOneMBB(*Intf) && - !canReassign(*Intf, PhysReg)) { + (!EnableLocalReassign || !canReassign(*Intf, PhysReg))) { return false; } } @@ -2308,9 +2319,14 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) { << "********** Function: " << mf.getName() << '\n'); MF = &mf; - TRI = MF->getTarget().getRegisterInfo(); - TII = MF->getTarget().getInstrInfo(); + const TargetMachine &TM = MF->getTarget(); + TRI = TM.getRegisterInfo(); + TII = TM.getInstrInfo(); RCI.runOnMachineFunction(mf); + + EnableLocalReassign = EnableLocalReassignment || + TM.getSubtargetImpl()->enableRALocalReassignment(TM.getOptLevel()); + if (VerifyEnabled) MF->verify(this, "Before greedy register allocator"); diff --git a/lib/CodeGen/RegAllocPBQP.cpp b/lib/CodeGen/RegAllocPBQP.cpp index b8d23250064b..8a3b53fd08e5 100644 --- a/lib/CodeGen/RegAllocPBQP.cpp +++ b/lib/CodeGen/RegAllocPBQP.cpp @@ -88,8 +88,8 @@ class RegAllocPBQP : public MachineFunctionPass { static char ID; /// Construct a PBQP register allocator.
- RegAllocPBQP(std::unique_ptr &b, char *cPassID=nullptr) - : MachineFunctionPass(ID), builder(b.release()), customPassID(cPassID) { + RegAllocPBQP(std::unique_ptr b, char *cPassID = nullptr) + : MachineFunctionPass(ID), builder(std::move(b)), customPassID(cPassID) { initializeSlotIndexesPass(*PassRegistry::getPassRegistry()); initializeLiveIntervalsPass(*PassRegistry::getPassRegistry()); initializeLiveStacksPass(*PassRegistry::getPassRegistry()); @@ -614,18 +614,18 @@ bool RegAllocPBQP::runOnMachineFunction(MachineFunction &MF) { } FunctionPass * -llvm::createPBQPRegisterAllocator(std::unique_ptr &builder, +llvm::createPBQPRegisterAllocator(std::unique_ptr builder, char *customPassID) { - return new RegAllocPBQP(builder, customPassID); + return new RegAllocPBQP(std::move(builder), customPassID); } FunctionPass* llvm::createDefaultPBQPRegisterAllocator() { std::unique_ptr Builder; if (pbqpCoalescing) - Builder.reset(new PBQPBuilderWithCoalescing()); + Builder = llvm::make_unique(); else - Builder.reset(new PBQPBuilder()); - return createPBQPRegisterAllocator(Builder); + Builder = llvm::make_unique(); + return createPBQPRegisterAllocator(std::move(Builder)); } #undef DEBUG_TYPE diff --git a/lib/CodeGen/RegisterCoalescer.cpp b/lib/CodeGen/RegisterCoalescer.cpp index 5aaeb874d68c..e04a3cf077ff 100644 --- a/lib/CodeGen/RegisterCoalescer.cpp +++ b/lib/CodeGen/RegisterCoalescer.cpp @@ -1037,6 +1037,22 @@ bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) { return false; } + if (CP.getNewRC()) { + auto SrcRC = MRI->getRegClass(CP.getSrcReg()); + auto DstRC = MRI->getRegClass(CP.getDstReg()); + unsigned SrcIdx = CP.getSrcIdx(); + unsigned DstIdx = CP.getDstIdx(); + if (CP.isFlipped()) { + std::swap(SrcIdx, DstIdx); + std::swap(SrcRC, DstRC); + } + if (!TRI->shouldCoalesce(CopyMI, SrcRC, SrcIdx, DstRC, DstIdx, + CP.getNewRC())) { + DEBUG(dbgs() << "\tSubtarget bailed on coalescing.\n"); + return false; + } + } + // Dead code elimination. This really should be handled by MachineDCE, but // sometimes dead copies slip through, and we can't generate invalid live // ranges. diff --git a/lib/CodeGen/RegisterPressure.cpp b/lib/CodeGen/RegisterPressure.cpp index b2909e0a226d..617e45989c66 100644 --- a/lib/CodeGen/RegisterPressure.cpp +++ b/lib/CodeGen/RegisterPressure.cpp @@ -41,7 +41,7 @@ static void decreaseSetPressure(std::vector &CurrSetPressure, } } -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void llvm::dumpRegSetPressure(ArrayRef SetPressure, const TargetRegisterInfo *TRI) { bool Empty = true; @@ -55,6 +55,7 @@ void llvm::dumpRegSetPressure(ArrayRef SetPressure, dbgs() << "\n"; } +LLVM_DUMP_METHOD void RegisterPressure::dump(const TargetRegisterInfo *TRI) const { dbgs() << "Max Pressure: "; dumpRegSetPressure(MaxSetPressure, TRI); @@ -68,6 +69,7 @@ void RegisterPressure::dump(const TargetRegisterInfo *TRI) const { dbgs() << '\n'; } +LLVM_DUMP_METHOD void RegPressureTracker::dump() const { if (!isTopClosed() || !isBottomClosed()) { dbgs() << "Curr Pressure: "; @@ -75,7 +77,6 @@ void RegPressureTracker::dump() const { } P.dump(TRI); } -#endif /// Increase the current pressure as impacted by these registers and bump /// the high water mark if needed. 
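The RegAllocPBQP hunks above swap pass-by-reference-plus-release() for pass-by-value-plus-std::move on the builder's unique_ptr, making the ownership transfer visible at every call site. A minimal sketch of that sink-argument idiom, using hypothetical Builder/Allocator stand-ins rather than LLVM's actual classes (and C++14 std::make_unique where the diff uses llvm::make_unique, since LLVM then targeted C++11):

```cpp
#include <memory>
#include <utility>

// Hypothetical stand-ins to illustrate the ownership idiom only.
struct Builder {
  virtual ~Builder() = default;
};

class Allocator {
  std::unique_ptr<Builder> builder;

public:
  // Taking the unique_ptr by value forces callers to std::move their
  // pointer in, and the member is move-initialized instead of the old
  // pattern of calling release() on a unique_ptr reference.
  explicit Allocator(std::unique_ptr<Builder> b) : builder(std::move(b)) {}
};

int main() {
  auto b = std::make_unique<Builder>();
  Allocator a(std::move(b)); // b is now null; Allocator owns the Builder.
}
```

The old signature compiled even when the caller did not intend to give up ownership; the new one makes accidental transfers a compile error.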
diff --git a/lib/CodeGen/ScheduleDAGInstrs.cpp b/lib/CodeGen/ScheduleDAGInstrs.cpp index 92a9a30f24c2..0f8b21c1c1f3 100644 --- a/lib/CodeGen/ScheduleDAGInstrs.cpp +++ b/lib/CodeGen/ScheduleDAGInstrs.cpp @@ -1508,7 +1508,7 @@ void SchedDFSResult::scheduleTree(unsigned SubtreeID) { } } -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void ILPValue::print(raw_ostream &OS) const { OS << InstrCount << " / " << Length << " = "; if (!Length) @@ -1517,16 +1517,17 @@ void ILPValue::print(raw_ostream &OS) const { OS << format("%g", ((double)InstrCount / Length)); } +LLVM_DUMP_METHOD void ILPValue::dump() const { dbgs() << *this << '\n'; } namespace llvm { +LLVM_DUMP_METHOD raw_ostream &operator<<(raw_ostream &OS, const ILPValue &Val) { Val.print(OS); return OS; } } // namespace llvm -#endif // !NDEBUG || LLVM_ENABLE_DUMP diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 0f5018484d93..c9a5919aaa87 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -104,19 +104,19 @@ namespace { // contain duplicate or removed nodes. When choosing a node to // visit, we pop off the order stack until we find an item that is // also in the contents set. All operations are O(log N). - SmallPtrSet WorkListContents; - SmallVector WorkListOrder; + SmallPtrSet WorklistContents; + SmallVector WorklistOrder; // AA - Used for DAG load/store alias analysis. AliasAnalysis &AA; - /// AddUsersToWorkList - When an instruction is simplified, add all users of + /// AddUsersToWorklist - When an instruction is simplified, add all users of /// the instruction to the work lists because they might get more simplified /// now. /// - void AddUsersToWorkList(SDNode *N) { + void AddUsersToWorklist(SDNode *N) { for (SDNode *Node : N->uses()) - AddToWorkList(Node); + AddToWorklist(Node); } /// visit - call the node-specific routine that knows how to fold each @@ -124,17 +124,22 @@ namespace { SDValue visit(SDNode *N); public: - /// AddToWorkList - Add to the work list making sure its instance is at the + /// AddToWorklist - Add to the work list making sure its instance is at the /// back (next to be processed.) - void AddToWorkList(SDNode *N) { - WorkListContents.insert(N); - WorkListOrder.push_back(N); + void AddToWorklist(SDNode *N) { + // Skip handle nodes as they can't usefully be combined and confuse the + // zero-use deletion strategy. + if (N->getOpcode() == ISD::HANDLENODE) + return; + + WorklistContents.insert(N); + WorklistOrder.push_back(N); } - /// removeFromWorkList - remove all instances of N from the worklist. + /// removeFromWorklist - remove all instances of N from the worklist. /// - void removeFromWorkList(SDNode *N) { - WorkListContents.erase(N); + void removeFromWorklist(SDNode *N) { + WorklistContents.erase(N); } SDValue CombineTo(SDNode *N, const SDValue *To, unsigned NumTo, @@ -169,16 +174,6 @@ namespace { bool CombineToPostIndexedLoadStore(SDNode *N); bool SliceUpLoad(SDNode *N); - /// \brief Replace an ISD::EXTRACT_VECTOR_ELT of a load with a narrowed - /// load. - /// - /// \param EVE ISD::EXTRACT_VECTOR_ELT to be replaced. - /// \param InVecVT type of the input vector to EVE with bitcasts resolved. - /// \param EltNo index of the vector element to load. - /// \param OriginalLoad load that EVE came from to be replaced. - /// \returns EVE on success SDValue() on failure. 
- SDValue ReplaceExtractVectorEltOfLoadWithNarrowedLoad( - SDNode *EVE, EVT InVecVT, SDValue EltNo, LoadSDNode *OriginalLoad); void ReplaceLoadWithPromotedLoad(SDNode *Load, SDNode *ExtLoad); SDValue PromoteOperand(SDValue Op, EVT PVT, bool &Replace); SDValue SExtPromoteOperand(SDValue Op, EVT PVT); @@ -391,16 +386,16 @@ namespace { namespace { -/// WorkListRemover - This class is a DAGUpdateListener that removes any deleted +/// WorklistRemover - This class is a DAGUpdateListener that removes any deleted /// nodes from the worklist. -class WorkListRemover : public SelectionDAG::DAGUpdateListener { +class WorklistRemover : public SelectionDAG::DAGUpdateListener { DAGCombiner &DC; public: - explicit WorkListRemover(DAGCombiner &dc) + explicit WorklistRemover(DAGCombiner &dc) : SelectionDAG::DAGUpdateListener(dc.getDAG()), DC(dc) {} void NodeDeleted(SDNode *N, SDNode *E) override { - DC.removeFromWorkList(N); + DC.removeFromWorklist(N); } }; } @@ -410,11 +405,11 @@ class WorkListRemover : public SelectionDAG::DAGUpdateListener { //===----------------------------------------------------------------------===// void TargetLowering::DAGCombinerInfo::AddToWorklist(SDNode *N) { - ((DAGCombiner*)DC)->AddToWorkList(N); + ((DAGCombiner*)DC)->AddToWorklist(N); } void TargetLowering::DAGCombinerInfo::RemoveFromWorklist(SDNode *N) { - ((DAGCombiner*)DC)->removeFromWorkList(N); + ((DAGCombiner*)DC)->removeFromWorklist(N); } SDValue TargetLowering::DAGCombinerInfo:: @@ -655,10 +650,14 @@ static ConstantSDNode *isConstOrConstSplat(SDValue N) { return CN; if (BuildVectorSDNode *BV = dyn_cast(N)) { - ConstantSDNode *CN = BV->getConstantSplatValue(); + BitVector UndefElements; + ConstantSDNode *CN = BV->getConstantSplatNode(&UndefElements); // BuildVectors can truncate their operands. Ignore that case here. - if (CN && CN->getValueType(0) == N.getValueType().getScalarType()) + // FIXME: We blindly ignore splats which include undef which is overly + // pessimistic. + if (CN && UndefElements.none() && + CN->getValueType(0) == N.getValueType().getScalarType()) return CN; } @@ -683,7 +682,7 @@ SDValue DAGCombiner::ReassociateOps(unsigned Opc, SDLoc DL, SDValue OpNode = DAG.getNode(Opc, SDLoc(N0), VT, N0.getOperand(0), N1); if (!OpNode.getNode()) return SDValue(); - AddToWorkList(OpNode.getNode()); + AddToWorklist(OpNode.getNode()); return DAG.getNode(Opc, DL, VT, OpNode, N0.getOperand(1)); } } @@ -704,7 +703,7 @@ SDValue DAGCombiner::ReassociateOps(unsigned Opc, SDLoc DL, SDValue OpNode = DAG.getNode(Opc, SDLoc(N0), VT, N1.getOperand(0), N0); if (!OpNode.getNode()) return SDValue(); - AddToWorkList(OpNode.getNode()); + AddToWorklist(OpNode.getNode()); return DAG.getNode(Opc, DL, VT, OpNode, N1.getOperand(1)); } } @@ -726,14 +725,14 @@ SDValue DAGCombiner::CombineTo(SDNode *N, const SDValue *To, unsigned NumTo, assert((!To[i].getNode() || N->getValueType(i) == To[i].getValueType()) && "Cannot combine value to value of different type!")); - WorkListRemover DeadNodes(*this); + WorklistRemover DeadNodes(*this); DAG.ReplaceAllUsesWith(N, To); if (AddTo) { // Push the new nodes and any users onto the worklist for (unsigned i = 0, e = NumTo; i != e; ++i) { if (To[i].getNode()) { - AddToWorkList(To[i].getNode()); - AddUsersToWorkList(To[i].getNode()); + AddToWorklist(To[i].getNode()); + AddUsersToWorklist(To[i].getNode()); } } } @@ -744,7 +743,7 @@ SDValue DAGCombiner::CombineTo(SDNode *N, const SDValue *To, unsigned NumTo, if (N->use_empty()) { // Nodes can be reintroduced into the worklist. 
Make sure we do not // process a node that has been replaced. - removeFromWorkList(N); + removeFromWorklist(N); // Finally, since the node is now dead, remove it from the graph. DAG.DeleteNode(N); @@ -756,24 +755,24 @@ void DAGCombiner:: CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO) { // Replace all uses. If any nodes become isomorphic to other nodes and // are deleted, make sure to remove them from our worklist. - WorkListRemover DeadNodes(*this); + WorklistRemover DeadNodes(*this); DAG.ReplaceAllUsesOfValueWith(TLO.Old, TLO.New); // Push the new node and any (possibly new) users onto the worklist. - AddToWorkList(TLO.New.getNode()); - AddUsersToWorkList(TLO.New.getNode()); + AddToWorklist(TLO.New.getNode()); + AddUsersToWorklist(TLO.New.getNode()); // Finally, if the node is now dead, remove it from the graph. The node // may not be dead if the replacement process recursively simplified to // something else needing this node. if (TLO.Old.getNode()->use_empty()) { - removeFromWorkList(TLO.Old.getNode()); + removeFromWorklist(TLO.Old.getNode()); // If the operands of this node are only used by the node, they will now // be dead. Make sure to visit them first to delete dead nodes early. for (unsigned i = 0, e = TLO.Old.getNode()->getNumOperands(); i != e; ++i) if (TLO.Old.getNode()->getOperand(i).getNode()->hasOneUse()) - AddToWorkList(TLO.Old.getNode()->getOperand(i).getNode()); + AddToWorklist(TLO.Old.getNode()->getOperand(i).getNode()); DAG.DeleteNode(TLO.Old.getNode()); } @@ -789,7 +788,7 @@ bool DAGCombiner::SimplifyDemandedBits(SDValue Op, const APInt &Demanded) { return false; // Revisit the node. - AddToWorkList(Op.getNode()); + AddToWorklist(Op.getNode()); // Replace the old value with the new one. ++NodesCombined; @@ -813,12 +812,12 @@ void DAGCombiner::ReplaceLoadWithPromotedLoad(SDNode *Load, SDNode *ExtLoad) { dbgs() << "\nWith: "; Trunc.getNode()->dump(&DAG); dbgs() << '\n'); - WorkListRemover DeadNodes(*this); + WorklistRemover DeadNodes(*this); DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 0), Trunc); DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), SDValue(ExtLoad, 1)); - removeFromWorkList(Load); + removeFromWorklist(Load); DAG.DeleteNode(Load); - AddToWorkList(Trunc.getNode()); + AddToWorklist(Trunc.getNode()); } SDValue DAGCombiner::PromoteOperand(SDValue Op, EVT PVT, bool &Replace) { @@ -868,7 +867,7 @@ SDValue DAGCombiner::SExtPromoteOperand(SDValue Op, EVT PVT) { SDValue NewOp = PromoteOperand(Op, PVT, Replace); if (!NewOp.getNode()) return SDValue(); - AddToWorkList(NewOp.getNode()); + AddToWorklist(NewOp.getNode()); if (Replace) ReplaceLoadWithPromotedLoad(Op.getNode(), NewOp.getNode()); @@ -883,7 +882,7 @@ SDValue DAGCombiner::ZExtPromoteOperand(SDValue Op, EVT PVT) { SDValue NewOp = PromoteOperand(Op, PVT, Replace); if (!NewOp.getNode()) return SDValue(); - AddToWorkList(NewOp.getNode()); + AddToWorklist(NewOp.getNode()); if (Replace) ReplaceLoadWithPromotedLoad(Op.getNode(), NewOp.getNode()); @@ -930,9 +929,9 @@ SDValue DAGCombiner::PromoteIntBinOp(SDValue Op) { return SDValue(); } - AddToWorkList(NN0.getNode()); + AddToWorklist(NN0.getNode()); if (NN1.getNode()) - AddToWorkList(NN1.getNode()); + AddToWorklist(NN1.getNode()); if (Replace0) ReplaceLoadWithPromotedLoad(N0.getNode(), NN0.getNode()); @@ -982,7 +981,7 @@ SDValue DAGCombiner::PromoteIntShiftOp(SDValue Op) { if (!N0.getNode()) return SDValue(); - AddToWorkList(N0.getNode()); + AddToWorklist(N0.getNode()); if (Replace) ReplaceLoadWithPromotedLoad(Op.getOperand(0).getNode(), 
N0.getNode()); @@ -1062,12 +1061,12 @@ bool DAGCombiner::PromoteLoad(SDValue Op) { dbgs() << "\nTo: "; Result.getNode()->dump(&DAG); dbgs() << '\n'); - WorkListRemover DeadNodes(*this); + WorklistRemover DeadNodes(*this); DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result); DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), NewLD.getValue(1)); - removeFromWorkList(N); + removeFromWorklist(N); DAG.DeleteNode(N); - AddToWorkList(Result.getNode()); + AddToWorklist(Result.getNode()); return true; } return false; @@ -1087,7 +1086,7 @@ void DAGCombiner::Run(CombineLevel AtLevel) { // Add all the dag nodes to the worklist. for (SelectionDAG::allnodes_iterator I = DAG.allnodes_begin(), E = DAG.allnodes_end(); I != E; ++I) - AddToWorkList(I); + AddToWorklist(I); // Create a dummy node (which is not added to allnodes), that adds a reference // to the root node, preventing it from being deleted, and tracking any // changes of the root. @@ -1100,23 +1099,23 @@ void DAGCombiner::Run(CombineLevel AtLevel) { // while the worklist isn't empty, find a node and // try and combine it. - while (!WorkListContents.empty()) { + while (!WorklistContents.empty()) { SDNode *N; - // The WorkListOrder holds the SDNodes in order, but it may contain + // The WorklistOrder holds the SDNodes in order, but it may contain // duplicates. // In order to avoid a linear scan, we use a set (O(log N)) to hold what the // worklist *should* contain, and check that the node we want to visit should // actually be visited. do { - N = WorkListOrder.pop_back_val(); - } while (!WorkListContents.erase(N)); + N = WorklistOrder.pop_back_val(); + } while (!WorklistContents.erase(N)); // If N has no uses, it is dead. Make sure to revisit all N's operands once // N is deleted from the DAG, since they too may now be dead or may have a // reduced number of uses, allowing other xforms. - if (N->use_empty() && N != &Dummy) { + if (N->use_empty()) { for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) - AddToWorkList(N->getOperand(i).getNode()); + AddToWorklist(N->getOperand(i).getNode()); DAG.DeleteNode(N); continue; @@ -1148,7 +1147,7 @@ void DAGCombiner::Run(CombineLevel AtLevel) { // Transfer debug value. DAG.TransferDbgValues(SDValue(N, 0), RV); - WorkListRemover DeadNodes(*this); + WorklistRemover DeadNodes(*this); if (N->getNumValues() == RV.getNode()->getNumValues()) DAG.ReplaceAllUsesWith(N, RV.getNode()); else { @@ -1159,14 +1158,14 @@ void DAGCombiner::Run(CombineLevel AtLevel) { } // Push the new node and any users onto the worklist - AddToWorkList(RV.getNode()); - AddUsersToWorkList(RV.getNode()); + AddToWorklist(RV.getNode()); + AddUsersToWorklist(RV.getNode()); // Add any uses of the old node to the worklist in case this node is the // last one that uses them. They may become dead after this node is // deleted. for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) - AddToWorkList(N->getOperand(i).getNode()); + AddToWorklist(N->getOperand(i).getNode()); // Finally, if the node is now dead, remove it from the graph. The node // may not be dead if the replacement process recursively simplified to // something else needing this node. if (N->use_empty()) { // Nodes can be reintroduced into the worklist. Make sure we do not // process a node that has been replaced. - removeFromWorkList(N); + removeFromWorklist(N); // Finally, since the node is now dead, remove it from the graph. DAG.DeleteNode(N); @@ -1398,7 +1397,7 @@ SDValue DAGCombiner::visitTokenFactor(SDNode *N) { // Queue up for processing.
TFs.push_back(Op.getNode()); // Clean up in case the token factor is removed. - AddToWorkList(Op.getNode()); + AddToWorklist(Op.getNode()); Changed = true; break; } @@ -1436,18 +1435,18 @@ SDValue DAGCombiner::visitTokenFactor(SDNode *N) { /// MERGE_VALUES can always be eliminated. SDValue DAGCombiner::visitMERGE_VALUES(SDNode *N) { - WorkListRemover DeadNodes(*this); + WorklistRemover DeadNodes(*this); // Replacing results may cause a different MERGE_VALUES to suddenly // be CSE'd with N, and carry its uses with it. Iterate until no // uses remain, to ensure that the node can be safely deleted. // First add the users of this node to the work list so that they // can be tried again once they have new operands. - AddUsersToWorkList(N); + AddUsersToWorklist(N); do { for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) DAG.ReplaceAllUsesOfValueWith(SDValue(N, i), N->getOperand(i)); } while (!N->use_empty()); - removeFromWorkList(N); + removeFromWorklist(N); DAG.DeleteNode(N); return SDValue(N, 0); // Return N so it doesn't get rechecked! } @@ -1929,7 +1928,7 @@ SDValue DAGCombiner::visitMUL(SDNode *N) { isa(N0.getOperand(1)))) { SDValue C3 = DAG.getNode(ISD::SHL, SDLoc(N), VT, N1, N0.getOperand(1)); - AddToWorkList(C3.getNode()); + AddToWorklist(C3.getNode()); return DAG.getNode(ISD::MUL, SDLoc(N), VT, N0.getOperand(0), C3); } @@ -2022,7 +2021,7 @@ SDValue DAGCombiner::visitSDIV(SDNode *N) { DAG.getNode(ISD::SRA, SDLoc(N), VT, N0, DAG.getConstant(VT.getScalarSizeInBits() - 1, getShiftAmountTy(N0.getValueType()))); - AddToWorkList(SGN.getNode()); + AddToWorklist(SGN.getNode()); // Add (N0 < 0) ? abs2 - 1 : 0; SDValue SRL = @@ -2030,8 +2029,8 @@ SDValue DAGCombiner::visitSDIV(SDNode *N) { DAG.getConstant(VT.getScalarSizeInBits() - lg2, getShiftAmountTy(SGN.getValueType()))); SDValue ADD = DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, SRL); - AddToWorkList(SRL.getNode()); - AddToWorkList(ADD.getNode()); // Divide by pow2 + AddToWorklist(SRL.getNode()); + AddToWorklist(ADD.getNode()); // Divide by pow2 SDValue SRA = DAG.getNode(ISD::SRA, SDLoc(N), VT, ADD, DAG.getConstant(lg2, getShiftAmountTy(ADD.getValueType()))); @@ -2040,7 +2039,7 @@ SDValue DAGCombiner::visitSDIV(SDNode *N) { if (N1C->getAPIntValue().isNonNegative()) return SRA; - AddToWorkList(SRA.getNode()); + AddToWorklist(SRA.getNode()); return DAG.getNode(ISD::SUB, SDLoc(N), VT, DAG.getConstant(0, VT), SRA); } @@ -2092,7 +2091,7 @@ SDValue DAGCombiner::visitUDIV(SDNode *N) { DAG.getConstant(SHC->getAPIntValue() .logBase2(), ADDVT)); - AddToWorkList(Add.getNode()); + AddToWorklist(Add.getNode()); return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0, Add); } } @@ -2134,13 +2133,13 @@ SDValue DAGCombiner::visitSREM(SDNode *N) { // X%C to the equivalent of X-X/C*C. 
if (N1C && !N1C->isNullValue()) { SDValue Div = DAG.getNode(ISD::SDIV, SDLoc(N), VT, N0, N1); - AddToWorkList(Div.getNode()); + AddToWorklist(Div.getNode()); SDValue OptimizedDiv = combine(Div.getNode()); if (OptimizedDiv.getNode() && OptimizedDiv.getNode() != Div.getNode()) { SDValue Mul = DAG.getNode(ISD::MUL, SDLoc(N), VT, OptimizedDiv, N1); SDValue Sub = DAG.getNode(ISD::SUB, SDLoc(N), VT, N0, Mul); - AddToWorkList(Mul.getNode()); + AddToWorklist(Mul.getNode()); return Sub; } } @@ -2177,7 +2176,7 @@ SDValue DAGCombiner::visitUREM(SDNode *N) { DAG.getNode(ISD::ADD, SDLoc(N), VT, N1, DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), VT)); - AddToWorkList(Add.getNode()); + AddToWorklist(Add.getNode()); return DAG.getNode(ISD::AND, SDLoc(N), VT, N0, Add); } } @@ -2187,13 +2186,13 @@ SDValue DAGCombiner::visitUREM(SDNode *N) { // X%C to the equivalent of X-X/C*C. if (N1C && !N1C->isNullValue()) { SDValue Div = DAG.getNode(ISD::UDIV, SDLoc(N), VT, N0, N1); - AddToWorkList(Div.getNode()); + AddToWorklist(Div.getNode()); SDValue OptimizedDiv = combine(Div.getNode()); if (OptimizedDiv.getNode() && OptimizedDiv.getNode() != Div.getNode()) { SDValue Mul = DAG.getNode(ISD::MUL, SDLoc(N), VT, OptimizedDiv, N1); SDValue Sub = DAG.getNode(ISD::SUB, SDLoc(N), VT, N0, Mul); - AddToWorkList(Mul.getNode()); + AddToWorklist(Mul.getNode()); return Sub; } } @@ -2316,7 +2315,7 @@ SDValue DAGCombiner::SimplifyNodeWithTwoResults(SDNode *N, unsigned LoOp, if (LoExists) { SDValue Lo = DAG.getNode(LoOp, SDLoc(N), N->getValueType(0), ArrayRef(N->op_begin(), N->op_end())); - AddToWorkList(Lo.getNode()); + AddToWorklist(Lo.getNode()); SDValue LoOpt = combine(Lo.getNode()); if (LoOpt.getNode() && LoOpt.getNode() != Lo.getNode() && (!LegalOperations || @@ -2327,7 +2326,7 @@ SDValue DAGCombiner::SimplifyNodeWithTwoResults(SDNode *N, unsigned LoOp, if (HiExists) { SDValue Hi = DAG.getNode(HiOp, SDLoc(N), N->getValueType(1), ArrayRef(N->op_begin(), N->op_end())); - AddToWorkList(Hi.getNode()); + AddToWorklist(Hi.getNode()); SDValue HiOpt = combine(Hi.getNode()); if (HiOpt.getNode() && HiOpt != Hi && (!LegalOperations || @@ -2466,7 +2465,7 @@ SDValue DAGCombiner::SimplifyBinOpWithSameOpcodeHands(SDNode *N) { SDValue ORNode = DAG.getNode(N->getOpcode(), SDLoc(N0), N0.getOperand(0).getValueType(), N0.getOperand(0), N1.getOperand(0)); - AddToWorkList(ORNode.getNode()); + AddToWorklist(ORNode.getNode()); return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, ORNode); } @@ -2480,7 +2479,7 @@ SDValue DAGCombiner::SimplifyBinOpWithSameOpcodeHands(SDNode *N) { SDValue ORNode = DAG.getNode(N->getOpcode(), SDLoc(N0), N0.getOperand(0).getValueType(), N0.getOperand(0), N1.getOperand(0)); - AddToWorkList(ORNode.getNode()); + AddToWorklist(ORNode.getNode()); return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, ORNode, N0.getOperand(1)); } @@ -2505,7 +2504,7 @@ SDValue DAGCombiner::SimplifyBinOpWithSameOpcodeHands(SDNode *N) { if (In0Ty.isInteger() && In1Ty.isInteger() && In0Ty == In1Ty) { SDValue Op = DAG.getNode(N->getOpcode(), DL, In0Ty, In0, In1); SDValue BC = DAG.getNode(N0.getOpcode(), DL, VT, Op); - AddToWorkList(Op.getNode()); + AddToWorklist(Op.getNode()); return BC; } } @@ -2552,7 +2551,7 @@ SDValue DAGCombiner::SimplifyBinOpWithSameOpcodeHands(SDNode *N) { if (N0.getOperand(1) == N1.getOperand(1) && ShOp.getNode()) { SDValue NewNode = DAG.getNode(N->getOpcode(), SDLoc(N), VT, N0->getOperand(0), N1->getOperand(0)); - AddToWorkList(NewNode.getNode()); + AddToWorklist(NewNode.getNode()); return DAG.getVectorShuffle(VT, SDLoc(N), 
NewNode, ShOp, &SVN0->getMask()[0]); } @@ -2573,7 +2572,7 @@ SDValue DAGCombiner::SimplifyBinOpWithSameOpcodeHands(SDNode *N) { if (N0->getOperand(0) == N1->getOperand(0) && ShOp.getNode()) { SDValue NewNode = DAG.getNode(N->getOpcode(), SDLoc(N), VT, N0->getOperand(1), N1->getOperand(1)); - AddToWorkList(NewNode.getNode()); + AddToWorklist(NewNode.getNode()); return DAG.getVectorShuffle(VT, SDLoc(N), ShOp, NewNode, &SVN0->getMask()[0]); } @@ -2764,21 +2763,21 @@ SDValue DAGCombiner::visitAND(SDNode *N) { if (cast(LR)->isNullValue() && Op1 == ISD::SETEQ) { SDValue ORNode = DAG.getNode(ISD::OR, SDLoc(N0), LR.getValueType(), LL, RL); - AddToWorkList(ORNode.getNode()); + AddToWorklist(ORNode.getNode()); return DAG.getSetCC(SDLoc(N), VT, ORNode, LR, Op1); } // fold (and (seteq X, -1), (seteq Y, -1)) -> (seteq (and X, Y), -1) if (cast(LR)->isAllOnesValue() && Op1 == ISD::SETEQ) { SDValue ANDNode = DAG.getNode(ISD::AND, SDLoc(N0), LR.getValueType(), LL, RL); - AddToWorkList(ANDNode.getNode()); + AddToWorklist(ANDNode.getNode()); return DAG.getSetCC(SDLoc(N), VT, ANDNode, LR, Op1); } // fold (and (setgt X, -1), (setgt Y, -1)) -> (setgt (or X, Y), -1) if (cast(LR)->isAllOnesValue() && Op1 == ISD::SETGT) { SDValue ORNode = DAG.getNode(ISD::OR, SDLoc(N0), LR.getValueType(), LL, RL); - AddToWorkList(ORNode.getNode()); + AddToWorklist(ORNode.getNode()); return DAG.getSetCC(SDLoc(N), VT, ORNode, LR, Op1); } } @@ -2791,7 +2790,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) { cast(RR)->isNullValue()))) { SDValue ADDNode = DAG.getNode(ISD::ADD, SDLoc(N0), LL.getValueType(), LL, DAG.getConstant(1, LL.getValueType())); - AddToWorkList(ADDNode.getNode()); + AddToWorklist(ADDNode.getNode()); return DAG.getSetCC(SDLoc(N), VT, ADDNode, DAG.getConstant(2, LL.getValueType()), ISD::SETUGE); } @@ -2839,7 +2838,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) { SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N0), VT, LN0->getChain(), LN0->getBasePtr(), MemVT, LN0->getMemOperand()); - AddToWorkList(N); + AddToWorklist(N); CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1)); return SDValue(N, 0); // Return N so it doesn't get rechecked! } @@ -2859,7 +2858,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) { SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N0), VT, LN0->getChain(), LN0->getBasePtr(), MemVT, LN0->getMemOperand()); - AddToWorkList(N); + AddToWorklist(N); CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1)); return SDValue(N, 0); // Return N so it doesn't get rechecked! } @@ -2890,7 +2889,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) { DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(LN0), LoadResultTy, LN0->getChain(), LN0->getBasePtr(), ExtVT, LN0->getMemOperand()); - AddToWorkList(N); + AddToWorklist(N); CombineTo(LN0, NewLoad, NewLoad.getValue(1)); return SDValue(N, 0); // Return N so it doesn't get rechecked! } @@ -2917,7 +2916,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) { Alignment = MinAlign(Alignment, PtrOff); } - AddToWorkList(NewPtr.getNode()); + AddToWorklist(NewPtr.getNode()); EVT LoadResultTy = HasAnyExt ? LN0->getValueType(0) : VT; SDValue Load = @@ -2926,7 +2925,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) { LN0->getPointerInfo(), ExtVT, LN0->isVolatile(), LN0->isNonTemporal(), Alignment, LN0->getTBAAInfo()); - AddToWorkList(N); + AddToWorklist(N); CombineTo(LN0, Load, Load.getValue(1)); return SDValue(N, 0); // Return N so it doesn't get rechecked! } @@ -3251,6 +3250,8 @@ SDValue DAGCombiner::visitOR(SDNode *N) { // Do this only if the resulting shuffle is legal. 
if (isa(N0) && isa(N1) && + // Avoid folding a node with illegal type. + TLI.isTypeLegal(VT) && N0->getOperand(1) == N1->getOperand(1) && ISD::isBuildVectorAllZeros(N0.getOperand(1).getNode())) { bool CanFold = true; @@ -3362,7 +3363,7 @@ SDValue DAGCombiner::visitOR(SDNode *N) { (Op1 == ISD::SETNE || Op1 == ISD::SETLT)) { SDValue ORNode = DAG.getNode(ISD::OR, SDLoc(LR), LR.getValueType(), LL, RL); - AddToWorkList(ORNode.getNode()); + AddToWorklist(ORNode.getNode()); return DAG.getSetCC(SDLoc(N), VT, ORNode, LR, Op1); } // fold (or (setne X, -1), (setne Y, -1)) -> (setne (and X, Y), -1) @@ -3371,7 +3372,7 @@ SDValue DAGCombiner::visitOR(SDNode *N) { (Op1 == ISD::SETNE || Op1 == ISD::SETGT)) { SDValue ANDNode = DAG.getNode(ISD::AND, SDLoc(LR), LR.getValueType(), LL, RL); - AddToWorkList(ANDNode.getNode()); + AddToWorklist(ANDNode.getNode()); return DAG.getSetCC(SDLoc(N), VT, ANDNode, LR, Op1); } } @@ -3757,7 +3758,7 @@ SDValue DAGCombiner::visitXOR(SDNode *N) { SDValue V = N0.getOperand(0); V = DAG.getNode(ISD::XOR, SDLoc(N0), V.getValueType(), V, DAG.getConstant(1, V.getValueType())); - AddToWorkList(V.getNode()); + AddToWorklist(V.getNode()); return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, V); } @@ -3769,7 +3770,7 @@ SDValue DAGCombiner::visitXOR(SDNode *N) { unsigned NewOpcode = N0.getOpcode() == ISD::AND ? ISD::OR : ISD::AND; LHS = DAG.getNode(ISD::XOR, SDLoc(LHS), VT, LHS, N1); // LHS = ~LHS RHS = DAG.getNode(ISD::XOR, SDLoc(RHS), VT, RHS, N1); // RHS = ~RHS - AddToWorkList(LHS.getNode()); AddToWorkList(RHS.getNode()); + AddToWorklist(LHS.getNode()); AddToWorklist(RHS.getNode()); return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS); } } @@ -3781,7 +3782,7 @@ SDValue DAGCombiner::visitXOR(SDNode *N) { unsigned NewOpcode = N0.getOpcode() == ISD::AND ? 
ISD::OR : ISD::AND; LHS = DAG.getNode(ISD::XOR, SDLoc(LHS), VT, LHS, N1); // LHS = ~LHS RHS = DAG.getNode(ISD::XOR, SDLoc(RHS), VT, RHS, N1); // RHS = ~RHS - AddToWorkList(LHS.getNode()); AddToWorkList(RHS.getNode()); + AddToWorklist(LHS.getNode()); AddToWorklist(RHS.getNode()); return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS); } } @@ -3790,7 +3791,7 @@ SDValue DAGCombiner::visitXOR(SDNode *N) { N0->getOperand(1) == N1) { SDValue X = N0->getOperand(0); SDValue NotX = DAG.getNOT(SDLoc(X), X, VT); - AddToWorkList(NotX.getNode()); + AddToWorklist(NotX.getNode()); return DAG.getNode(ISD::AND, SDLoc(N), VT, NotX, N1); } // fold (xor (xor x, c1), c2) -> (xor x, (xor c1, c2)) @@ -3954,14 +3955,14 @@ SDValue DAGCombiner::visitSHL(SDNode *N) { // If setcc produces all-one true value then: // (shl (and (setcc) N01CV) N1CV) -> (and (setcc) N01CV<<N1CV) if (N1CV && N1CV->isConstant()) { - if (N0.getOpcode() == ISD::AND && - TLI.getBooleanContents(true) == - TargetLowering::ZeroOrNegativeOneBooleanContent) { + if (N0.getOpcode() == ISD::AND) { SDValue N00 = N0->getOperand(0); SDValue N01 = N0->getOperand(1); BuildVectorSDNode *N01CV = dyn_cast(N01); - if (N01CV && N01CV->isConstant() && N00.getOpcode() == ISD::SETCC) { + if (N01CV && N01CV->isConstant() && N00.getOpcode() == ISD::SETCC && + TLI.getBooleanContents(N00.getOperand(0).getValueType()) == + TargetLowering::ZeroOrNegativeOneBooleanContent) { SDValue C = DAG.FoldConstantArithmetic(ISD::SHL, VT, N01CV, N1CV); if (C.getNode()) return DAG.getNode(ISD::AND, SDLoc(N), VT, N00, C); @@ -4055,7 +4056,7 @@ SDValue DAGCombiner::visitSHL(SDNode *N) { EVT CountVT = NewOp0.getOperand(1).getValueType(); SDValue NewSHL = DAG.getNode(ISD::SHL, SDLoc(N), NewOp0.getValueType(), NewOp0, DAG.getConstant(c2, CountVT)); - AddToWorkList(NewSHL.getNode()); + AddToWorklist(NewSHL.getNode()); return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N0), VT, NewSHL); } } @@ -4341,7 +4342,7 @@ SDValue DAGCombiner::visitSRL(SDNode *N) { SDValue SmallShift = DAG.getNode(ISD::SRL, SDLoc(N0), SmallVT, N0.getOperand(0), DAG.getConstant(ShiftAmt, getShiftAmountTy(SmallVT))); - AddToWorkList(SmallShift.getNode()); + AddToWorklist(SmallShift.getNode()); APInt Mask = APInt::getAllOnesValue(OpSizeInBits).lshr(ShiftAmt); return DAG.getNode(ISD::AND, SDLoc(N), VT, DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), VT, SmallShift), @@ -4383,7 +4384,7 @@ SDValue DAGCombiner::visitSRL(SDNode *N) { if (ShAmt) { Op = DAG.getNode(ISD::SRL, SDLoc(N0), VT, Op, DAG.getConstant(ShAmt, getShiftAmountTy(Op.getValueType()))); - AddToWorkList(Op.getNode()); + AddToWorklist(Op.getNode()); } return DAG.getNode(ISD::XOR, SDLoc(N), VT, @@ -4435,12 +4436,12 @@ SDValue DAGCombiner::visitSRL(SDNode *N) { if (N->hasOneUse()) { SDNode *Use = *N->use_begin(); if (Use->getOpcode() == ISD::BRCOND) - AddToWorkList(Use); + AddToWorklist(Use); else if (Use->getOpcode() == ISD::TRUNCATE && Use->hasOneUse()) { // Also look past the truncate. Use = *Use->use_begin(); if (Use->getOpcode() == ISD::BRCOND) - AddToWorkList(Use); + AddToWorklist(Use); } } @@ -4520,11 +4521,20 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) { if (VT == MVT::i1 && N1C && N1C->getAPIntValue() == 1) return DAG.getNode(ISD::OR, SDLoc(N), VT, N0, N2); // fold (select C, 0, 1) -> (xor C, 1) + // We can't do this reliably if integer-based booleans have different contents + // from floating-point-based booleans.
This is because we can't tell whether we + // have an integer-based boolean or a floating-point-based boolean unless we + // can find the SETCC that produced it and inspect its operands. This is + // fairly easy if C is the SETCC node, but it can potentially be + // undiscoverable (or not reasonably discoverable). For example, it could be + // in another basic block or it could require searching a complicated + // expression. if (VT.isInteger() && - (VT0 == MVT::i1 || - (VT0.isInteger() && - TLI.getBooleanContents(false) == - TargetLowering::ZeroOrOneBooleanContent)) && + (VT0 == MVT::i1 || (VT0.isInteger() && + TLI.getBooleanContents(false, false) == + TLI.getBooleanContents(false, true) && + TLI.getBooleanContents(false, false) == + TargetLowering::ZeroOrOneBooleanContent)) && N1C && N2C && N1C->isNullValue() && N2C->getAPIntValue() == 1) { SDValue XORNode; if (VT == VT0) @@ -4532,7 +4542,7 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) { N0, DAG.getConstant(1, VT0)); XORNode = DAG.getNode(ISD::XOR, SDLoc(N0), VT0, N0, DAG.getConstant(1, VT0)); - AddToWorkList(XORNode.getNode()); + AddToWorklist(XORNode.getNode()); if (VT.bitsGT(VT0)) return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, XORNode); return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, XORNode); @@ -4540,13 +4550,13 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) { // fold (select C, 0, X) -> (and (not C), X) if (VT == VT0 && VT == MVT::i1 && N1C && N1C->isNullValue()) { SDValue NOTNode = DAG.getNOT(SDLoc(N0), N0, VT); - AddToWorkList(NOTNode.getNode()); + AddToWorklist(NOTNode.getNode()); return DAG.getNode(ISD::AND, SDLoc(N), VT, NOTNode, N2); } // fold (select C, X, 1) -> (or (not C), X) if (VT == VT0 && VT == MVT::i1 && N2C && N2C->getAPIntValue() == 1) { SDValue NOTNode = DAG.getNOT(SDLoc(N0), N0, VT); - AddToWorkList(NOTNode.getNode()); + AddToWorklist(NOTNode.getNode()); return DAG.getNode(ISD::OR, SDLoc(N), VT, NOTNode, N1); } // fold (select C, X, 0) -> (and C, X) @@ -4677,8 +4687,8 @@ SDValue DAGCombiner::visitVSELECT(SDNode *N) { ISD::SRA, DL, VT, LHS, DAG.getConstant(VT.getScalarType().getSizeInBits() - 1, VT)); SDValue Add = DAG.getNode(ISD::ADD, DL, VT, LHS, Shift); - AddToWorkList(Shift.getNode()); - AddToWorkList(Add.getNode()); + AddToWorklist(Shift.getNode()); + AddToWorklist(Add.getNode()); return DAG.getNode(ISD::XOR, DL, VT, Add, Shift); } } @@ -4705,8 +4715,8 @@ SDValue DAGCombiner::visitVSELECT(SDNode *N) { // Add the new VSELECT nodes to the work list in case they need to be split // again. - AddToWorkList(Lo.getNode()); - AddToWorkList(Hi.getNode()); + AddToWorklist(Lo.getNode()); + AddToWorklist(Hi.getNode()); return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi); } @@ -4748,7 +4758,7 @@ SDValue DAGCombiner::visitSELECT_CC(SDNode *N) { SDValue SCC = SimplifySetCC(getSetCCResultType(N0.getValueType()), N0, N1, CC, SDLoc(N), false); if (SCC.getNode()) { - AddToWorkList(SCC.getNode()); + AddToWorklist(SCC.getNode()); if (ConstantSDNode *SCCC = dyn_cast(SCC.getNode())) { if (!SCCC->isNullValue()) @@ -4945,7 +4955,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { if (NarrowLoad.getNode() != N0.getNode()) { CombineTo(N0.getNode(), NarrowLoad); // CombineTo deleted the truncate, if needed, but not what's under it. - AddToWorkList(oye); + AddToWorklist(oye); } return SDValue(N, 0); // Return N so it doesn't get rechecked! 
} @@ -5073,12 +5083,12 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { } if (N0.getOpcode() == ISD::SETCC) { + EVT N0VT = N0.getOperand(0).getValueType(); // sext(setcc) -> sext_in_reg(vsetcc) for vectors. // Only do this before legalize for now. if (VT.isVector() && !LegalOperations && - TLI.getBooleanContents(true) == - TargetLowering::ZeroOrNegativeOneBooleanContent) { - EVT N0VT = N0.getOperand(0).getValueType(); + TLI.getBooleanContents(N0VT) == + TargetLowering::ZeroOrNegativeOneBooleanContent) { // On some architectures (such as SSE/NEON/etc) the SETCC result type is // of the same size as the compared operands. Only optimize sext(setcc()) // if this is the case. @@ -5226,7 +5236,7 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { if (NarrowLoad.getNode() != N0.getNode()) { CombineTo(N0.getNode(), NarrowLoad); // CombineTo deleted the truncate, if needed, but not what's under it. - AddToWorkList(oye); + AddToWorklist(oye); } return SDValue(N, 0); // Return N so it doesn't get rechecked! } @@ -5244,7 +5254,7 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { if (NarrowLoad.getNode() != N0.getNode()) { CombineTo(N0.getNode(), NarrowLoad); // CombineTo deleted the truncate, if needed, but not what's under it. - AddToWorkList(oye); + AddToWorklist(oye); } return SDValue(N, 0); // Return N so it doesn't get rechecked! } @@ -5252,10 +5262,10 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { SDValue Op = N0.getOperand(0); if (Op.getValueType().bitsLT(VT)) { Op = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), VT, Op); - AddToWorkList(Op.getNode()); + AddToWorklist(Op.getNode()); } else if (Op.getValueType().bitsGT(VT)) { Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Op); - AddToWorkList(Op.getNode()); + AddToWorklist(Op.getNode()); } return DAG.getZeroExtendInReg(Op, SDLoc(N), N0.getValueType().getScalarType()); @@ -5474,7 +5484,7 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) { if (NarrowLoad.getNode() != N0.getNode()) { CombineTo(N0.getNode(), NarrowLoad); // CombineTo deleted the truncate, if needed, but not what's under it. - AddToWorkList(oye); + AddToWorklist(oye); } return SDValue(N, 0); // Return N so it doesn't get rechecked! } @@ -5515,8 +5525,7 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) { // scalars. if (ISD::isNON_EXTLoad(N0.getNode()) && !VT.isVector() && ISD::isUNINDEXEDLoad(N0.getNode()) && - ((!LegalOperations && !cast(N0)->isVolatile()) || - TLI.isLoadExtLegal(ISD::EXTLOAD, N0.getValueType()))) { + TLI.isLoadExtLegal(ISD::EXTLOAD, N0.getValueType())) { bool DoXform = true; SmallVector SetCCs; if (!N0.hasOneUse()) @@ -5773,7 +5782,7 @@ SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) { SDValue NewPtr = DAG.getNode(ISD::ADD, SDLoc(LN0), PtrType, LN0->getBasePtr(), DAG.getConstant(PtrOff, PtrType)); - AddToWorkList(NewPtr.getNode()); + AddToWorklist(NewPtr.getNode()); SDValue Load; if (ExtType == ISD::NON_EXTLOAD) @@ -5788,7 +5797,7 @@ SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) { NewAlign, LN0->getTBAAInfo()); // Replace the old load's chain with the new load's chain. - WorkListRemover DeadNodes(*this); + WorklistRemover DeadNodes(*this); DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1)); // Shift the result left, if we've swallowed a left shift. 
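The WorkList-to-Worklist renames running through these DAGCombiner hunks all touch the combiner's two-part worklist described in DAGCombiner::Run: an order stack that may accumulate duplicates and stale nodes, paired with a contents set that records what should actually be visited. A minimal self-contained sketch of that dedup scheme, with a hypothetical Node type standing in for SDNode (the real code uses LLVM containers; std::set here matches the comment's O(log N) claim):

```cpp
#include <set>
#include <vector>

struct Node { int id; }; // hypothetical stand-in for SDNode

struct Worklist {
  std::set<Node *> Contents;  // source of truth: what should be visited
  std::vector<Node *> Order;  // LIFO visit order; may hold stale duplicates

  void add(Node *N) {
    Contents.insert(N);
    Order.push_back(N); // duplicates are fine; pop() filters them out
  }
  void remove(Node *N) { Contents.erase(N); }

  // Pop entries until one is still live in the contents set.
  Node *pop() {
    while (!Order.empty()) {
      Node *N = Order.back();
      Order.pop_back();
      if (Contents.erase(N)) // erase() returns 1 only if N was still live
        return N;
    }
    return nullptr;
  }
};

int main() {
  Node A{1}, B{2};
  Worklist WL;
  WL.add(&A);
  WL.add(&B);
  WL.add(&A); // duplicate entry in Order
  WL.remove(&B); // B becomes a stale entry in Order
  while (Node *N = WL.pop()) {
    (void)N; // the combiner would call visit(N) here; A is popped once
  }
}
```

Deleted or re-added nodes thus cost nothing at removal time; the stale entries are skipped lazily when popped.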
@@ -5887,7 +5896,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) { LN0->getMemOperand()); CombineTo(N, ExtLoad); CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1)); - AddToWorkList(ExtLoad.getNode()); + AddToWorklist(ExtLoad.getNode()); return SDValue(N, 0); // Return N so it doesn't get rechecked! } // fold (sext_inreg (zextload x)) -> (sextload x) iff load has one use @@ -6009,6 +6018,19 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { } } + // trunc (select c, a, b) -> select c, (trunc a), (trunc b) + if (N0.getOpcode() == ISD::SELECT) { + EVT SrcVT = N0.getValueType(); + if ((!LegalOperations || TLI.isOperationLegal(ISD::SELECT, SrcVT)) && + TLI.isTruncateFree(SrcVT, VT)) { + SDLoc SL(N0); + SDValue Cond = N0.getOperand(0); + SDValue TruncOp0 = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(1)); + SDValue TruncOp1 = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(2)); + return DAG.getNode(ISD::SELECT, SDLoc(N), VT, Cond, TruncOp0, TruncOp1); + } + } + // Fold a series of buildvector, bitcast, and truncate if possible. // For example fold // (2xi32 trunc (bitcast ((4xi32)buildvector x, x, y, y) 2xi64)) to @@ -6108,7 +6130,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { continue; } SDValue NV = DAG.getNode(ISD::TRUNCATE, SDLoc(V), VTs[i], V); - AddToWorkList(NV.getNode()); + AddToWorklist(NV.getNode()); Opnds.push_back(NV); } return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Opnds); @@ -6210,6 +6232,9 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) { if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() && // Do not change the width of a volatile load. !cast(N0)->isVolatile() && + // Do not remove the cast if the types differ in endian layout. + TLI.hasBigEndianPartOrdering(N0.getValueType()) == + TLI.hasBigEndianPartOrdering(VT) && (!LegalOperations || TLI.isOperationLegal(ISD::LOAD, VT)) && TLI.isLoadBitCastBeneficial(N0.getValueType(), VT)) { LoadSDNode *LN0 = cast(N0); @@ -6223,7 +6248,7 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) { LN0->isVolatile(), LN0->isNonTemporal(), LN0->isInvariant(), OrigAlign, LN0->getTBAAInfo()); - AddToWorkList(N); + AddToWorklist(N); CombineTo(N0.getNode(), DAG.getNode(ISD::BITCAST, SDLoc(N0), N0.getValueType(), Load), @@ -6241,7 +6266,7 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) { !VT.isVector() && !N0.getValueType().isVector()) { SDValue NewConv = DAG.getNode(ISD::BITCAST, SDLoc(N0), VT, N0.getOperand(0)); - AddToWorkList(NewConv.getNode()); + AddToWorklist(NewConv.getNode()); APInt SignBit = APInt::getSignBit(VT.getSizeInBits()); if (N0.getOpcode() == ISD::FNEG) @@ -6264,34 +6289,34 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) { if (isTypeLegal(IntXVT)) { SDValue X = DAG.getNode(ISD::BITCAST, SDLoc(N0), IntXVT, N0.getOperand(1)); - AddToWorkList(X.getNode()); + AddToWorklist(X.getNode()); // If X has a different width than the result/lhs, sext it or truncate it. unsigned VTWidth = VT.getSizeInBits(); if (OrigXWidth < VTWidth) { X = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, X); - AddToWorkList(X.getNode()); + AddToWorklist(X.getNode()); } else if (OrigXWidth > VTWidth) { // To get the sign bit in the right place, we have to shift it right // before truncating. 
X = DAG.getNode(ISD::SRL, SDLoc(X), X.getValueType(), X, DAG.getConstant(OrigXWidth-VTWidth, X.getValueType())); - AddToWorkList(X.getNode()); + AddToWorklist(X.getNode()); X = DAG.getNode(ISD::TRUNCATE, SDLoc(X), VT, X); - AddToWorkList(X.getNode()); + AddToWorklist(X.getNode()); } APInt SignBit = APInt::getSignBit(VT.getSizeInBits()); X = DAG.getNode(ISD::AND, SDLoc(X), VT, X, DAG.getConstant(SignBit, VT)); - AddToWorkList(X.getNode()); + AddToWorklist(X.getNode()); SDValue Cst = DAG.getNode(ISD::BITCAST, SDLoc(N0), VT, N0.getOperand(0)); Cst = DAG.getNode(ISD::AND, SDLoc(Cst), VT, Cst, DAG.getConstant(~SignBit, VT)); - AddToWorkList(Cst.getNode()); + AddToWorklist(Cst.getNode()); return DAG.getNode(ISD::OR, SDLoc(N), VT, X, Cst); } @@ -6347,7 +6372,7 @@ ConstantFoldBITCASTofBUILD_VECTOR(SDNode *BV, EVT DstEltVT) { Op = DAG.getNode(ISD::TRUNCATE, SDLoc(BV), SrcEltVT, Op); Ops.push_back(DAG.getNode(ISD::BITCAST, SDLoc(BV), DstEltVT, Op)); - AddToWorkList(Ops.back().getNode()); + AddToWorklist(Ops.back().getNode()); } return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(BV), VT, Ops); } @@ -6862,7 +6887,7 @@ SDValue DAGCombiner::visitFMA(SDNode *N) { if (N1CFP->isExactlyValue(-1.0) && (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))) { SDValue RHSNeg = DAG.getNode(ISD::FNEG, dl, VT, N0); - AddToWorkList(RHSNeg.getNode()); + AddToWorklist(RHSNeg.getNode()); return DAG.getNode(ISD::FADD, dl, VT, N2, RHSNeg); } } @@ -7146,7 +7171,7 @@ SDValue DAGCombiner::visitFP_ROUND(SDNode *N) { if (N0.getOpcode() == ISD::FCOPYSIGN && N0.getNode()->hasOneUse()) { SDValue Tmp = DAG.getNode(ISD::FP_ROUND, SDLoc(N0), VT, N0.getOperand(0), N1); - AddToWorkList(Tmp.getNode()); + AddToWorklist(Tmp.getNode()); return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT, Tmp, N0.getOperand(1)); } @@ -7197,8 +7222,7 @@ SDValue DAGCombiner::visitFP_EXTEND(SDNode *N) { // fold (fpext (load x)) -> (fpext (fptrunc (extload x))) if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() && - ((!LegalOperations && !cast(N0)->isVolatile()) || - TLI.isLoadExtLegal(ISD::EXTLOAD, N0.getValueType()))) { + TLI.isLoadExtLegal(ISD::EXTLOAD, N0.getValueType())) { LoadSDNode *LN0 = cast(N0); SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT, LN0->getChain(), @@ -7239,7 +7263,7 @@ SDValue DAGCombiner::visitFNEG(SDNode *N) { if (IntVT.isInteger() && !IntVT.isVector()) { Int = DAG.getNode(ISD::XOR, SDLoc(N0), IntVT, Int, DAG.getConstant(APInt::getSignBit(IntVT.getSizeInBits()), IntVT)); - AddToWorkList(Int.getNode()); + AddToWorklist(Int.getNode()); return DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Int); } @@ -7331,7 +7355,7 @@ SDValue DAGCombiner::visitFABS(SDNode *N) { if (IntVT.isInteger() && !IntVT.isVector()) { Int = DAG.getNode(ISD::AND, SDLoc(N0), IntVT, Int, DAG.getConstant(~APInt::getSignBit(IntVT.getSizeInBits()), IntVT)); - AddToWorkList(Int.getNode()); + AddToWorklist(Int.getNode()); return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getValueType(0), Int); } @@ -7414,13 +7438,13 @@ SDValue DAGCombiner::visitBRCOND(SDNode *N) { CombineTo(N, NewBRCond, false); // Truncate is dead. if (Trunc) { - removeFromWorkList(Trunc); + removeFromWorklist(Trunc); DAG.DeleteNode(Trunc); } // Replace the uses of SRL with SETCC - WorkListRemover DeadNodes(*this); + WorklistRemover DeadNodes(*this); DAG.ReplaceAllUsesOfValueWith(N1, SetCC); - removeFromWorkList(N1.getNode()); + removeFromWorklist(N1.getNode()); DAG.DeleteNode(N1.getNode()); return SDValue(N, 0); // Return N so it doesn't get rechecked! 
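The visitFNEG and visitFABS hunks above fold the floating-point operation into integer bit twiddling once the operand is bitcast to an integer: XOR flips the sign bit, AND with the inverted sign mask clears it. A behavioral sketch for f32, assuming C++20 std::bit_cast and IEEE-754 binary32 (function names illustrative):

#include <bit>
#include <cassert>
#include <cstdint>

static float fnegViaXor(float x) {
  // fneg == flip the sign bit.
  return std::bit_cast<float>(std::bit_cast<uint32_t>(x) ^ 0x80000000u);
}
static float fabsViaAnd(float x) {
  // fabs == clear the sign bit.
  return std::bit_cast<float>(std::bit_cast<uint32_t>(x) & 0x7fffffffu);
}

int main() {
  assert(fnegViaXor(1.5f) == -1.5f);
  assert(fabsViaAnd(-2.25f) == 2.25f);
}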
} @@ -7448,9 +7472,9 @@ SDValue DAGCombiner::visitBRCOND(SDNode *N) { dbgs() << "\nWith: "; Tmp.getNode()->dump(&DAG); dbgs() << '\n'); - WorkListRemover DeadNodes(*this); + WorklistRemover DeadNodes(*this); DAG.ReplaceAllUsesOfValueWith(N1, Tmp); - removeFromWorkList(TheXor); + removeFromWorklist(TheXor); DAG.DeleteNode(TheXor); return DAG.getNode(ISD::BRCOND, SDLoc(N), MVT::Other, Chain, Tmp, N2); @@ -7479,9 +7503,9 @@ SDValue DAGCombiner::visitBRCOND(SDNode *N) { Op0, Op1, Equal ? ISD::SETEQ : ISD::SETNE); // Replace the uses of XOR with SETCC - WorkListRemover DeadNodes(*this); + WorklistRemover DeadNodes(*this); DAG.ReplaceAllUsesOfValueWith(N1, SetCC); - removeFromWorkList(N1.getNode()); + removeFromWorklist(N1.getNode()); DAG.DeleteNode(N1.getNode()); return DAG.getNode(ISD::BRCOND, SDLoc(N), MVT::Other, Chain, SetCC, N2); @@ -7507,7 +7531,7 @@ SDValue DAGCombiner::visitBR_CC(SDNode *N) { SDValue Simp = SimplifySetCC(getSetCCResultType(CondLHS.getValueType()), CondLHS, CondRHS, CC->get(), SDLoc(N), false); - if (Simp.getNode()) AddToWorkList(Simp.getNode()); + if (Simp.getNode()) AddToWorklist(Simp.getNode()); // fold to a simpler setcc if (Simp.getNode() && Simp.getOpcode() == ISD::SETCC) @@ -7717,7 +7741,7 @@ bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) { dbgs() << "\nWith: "; Result.getNode()->dump(&DAG); dbgs() << '\n'); - WorkListRemover DeadNodes(*this); + WorklistRemover DeadNodes(*this); if (isLoad) { DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(0)); DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Result.getValue(2)); @@ -7776,13 +7800,13 @@ bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) { SDLoc(OtherUses[i]), OtherUses[i]->getValueType(0), NewOp1, NewOp2); DAG.ReplaceAllUsesOfValueWith(SDValue(OtherUses[i], 0), NewUse); - removeFromWorkList(OtherUses[i]); + removeFromWorklist(OtherUses[i]); DAG.DeleteNode(OtherUses[i]); } // Replace the uses of Ptr with uses of the updated base value. DAG.ReplaceAllUsesOfValueWith(Ptr, Result.getValue(isLoad ? 1 : 0)); - removeFromWorkList(Ptr.getNode()); + removeFromWorklist(Ptr.getNode()); DAG.DeleteNode(Ptr.getNode()); return true; @@ -7887,7 +7911,7 @@ bool DAGCombiner::CombineToPostIndexedLoadStore(SDNode *N) { dbgs() << "\nWith: "; Result.getNode()->dump(&DAG); dbgs() << '\n'); - WorkListRemover DeadNodes(*this); + WorklistRemover DeadNodes(*this); if (isLoad) { DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(0)); DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Result.getValue(2)); @@ -7901,7 +7925,7 @@ bool DAGCombiner::CombineToPostIndexedLoadStore(SDNode *N) { // Replace the uses of Use with uses of the updated base value. DAG.ReplaceAllUsesOfValueWith(SDValue(Op, 0), Result.getValue(isLoad ? 
1 : 0)); - removeFromWorkList(Op); + removeFromWorklist(Op); DAG.DeleteNode(Op); return true; } @@ -7934,11 +7958,11 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) { dbgs() << "\nWith chain: "; Chain.getNode()->dump(&DAG); dbgs() << "\n"); - WorkListRemover DeadNodes(*this); + WorklistRemover DeadNodes(*this); DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Chain); if (N->use_empty()) { - removeFromWorkList(N); + removeFromWorklist(N); DAG.DeleteNode(N); } @@ -7954,12 +7978,12 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) { dbgs() << "\nWith: "; Undef.getNode()->dump(&DAG); dbgs() << " and 2 other values\n"); - WorkListRemover DeadNodes(*this); + WorklistRemover DeadNodes(*this); DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Undef); DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), DAG.getUNDEF(N->getValueType(1))); DAG.ReplaceAllUsesOfValueWith(SDValue(N, 2), Chain); - removeFromWorkList(N); + removeFromWorklist(N); DAG.DeleteNode(N); return SDValue(N, 0); // Return N so it doesn't get rechecked! } @@ -8026,7 +8050,7 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) { MVT::Other, Chain, ReplLoad.getValue(1)); // Make sure the new and old chains are cleaned up. - AddToWorkList(Token.getNode()); + AddToWorklist(Token.getNode()); // Replace uses with load result and token factor. Don't add users // to work list. @@ -8833,10 +8857,10 @@ SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) { ST->getPointerInfo().getWithOffset(PtrOff), false, false, NewAlign); - AddToWorkList(NewPtr.getNode()); - AddToWorkList(NewLD.getNode()); - AddToWorkList(NewVal.getNode()); - WorkListRemover DeadNodes(*this); + AddToWorklist(NewPtr.getNode()); + AddToWorklist(NewLD.getNode()); + AddToWorklist(NewVal.getNode()); + WorklistRemover DeadNodes(*this); DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), NewLD.getValue(1)); ++OpsNarrowed; return NewST; @@ -8891,9 +8915,9 @@ SDValue DAGCombiner::TransformFPLoadStorePair(SDNode *N) { ST->getPointerInfo(), false, false, STAlign); - AddToWorkList(NewLD.getNode()); - AddToWorkList(NewST.getNode()); - WorkListRemover DeadNodes(*this); + AddToWorklist(NewLD.getNode()); + AddToWorklist(NewST.getNode()); + WorklistRemover DeadNodes(*this); DAG.ReplaceAllUsesOfValueWith(Value.getValue(1), NewLD.getValue(1)); ++LdStFP2Int; return NewST; @@ -9285,7 +9309,7 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) { // Since we know that St is redundant, just iterate. while (!St->use_empty()) DAG.ReplaceAllUsesWith(SDValue(St, 0), St->getChain()); - removeFromWorkList(St); + removeFromWorklist(St); DAG.DeleteNode(St); } @@ -9460,7 +9484,7 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) { continue; StoreSDNode *St = cast(StoreNodes[i].MemNode); DAG.ReplaceAllUsesOfValueWith(SDValue(St, 0), St->getChain()); - removeFromWorkList(St); + removeFromWorklist(St); DAG.DeleteNode(St); } @@ -9609,7 +9633,7 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) { MVT::Other, Chain, ReplStore); // Make sure the new and old chains are cleaned up. - AddToWorkList(Token.getNode()); + AddToWorklist(Token.getNode()); // Don't add users to work list. 
return CombineTo(N, Token, false); @@ -9631,7 +9655,7 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) { APInt::getLowBitsSet( Value.getValueType().getScalarType().getSizeInBits(), ST->getMemoryVT().getScalarType().getSizeInBits())); - AddToWorkList(Value.getNode()); + AddToWorklist(Value.getNode()); if (Shorter.getNode()) return DAG.getTruncStore(Chain, SDLoc(N), Shorter, Ptr, ST->getMemoryVT(), ST->getMemOperand()); @@ -9725,7 +9749,7 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) { // Swap nodes. SDValue NewOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(N), VT, InVec.getOperand(0), InVal, EltNo); - AddToWorkList(NewOp.getNode()); + AddToWorklist(NewOp.getNode()); return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(InVec.getNode()), VT, NewOp, InVec.getOperand(1), InVec.getOperand(2)); } @@ -9763,86 +9787,6 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) { return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); } -SDValue DAGCombiner::ReplaceExtractVectorEltOfLoadWithNarrowedLoad( - SDNode *EVE, EVT InVecVT, SDValue EltNo, LoadSDNode *OriginalLoad) { - EVT ResultVT = EVE->getValueType(0); - EVT VecEltVT = InVecVT.getVectorElementType(); - unsigned Align = OriginalLoad->getAlignment(); - unsigned NewAlign = TLI.getDataLayout()->getABITypeAlignment( - VecEltVT.getTypeForEVT(*DAG.getContext())); - - if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VecEltVT)) - return SDValue(); - - Align = NewAlign; - - SDValue NewPtr = OriginalLoad->getBasePtr(); - SDValue Offset; - EVT PtrType = NewPtr.getValueType(); - MachinePointerInfo MPI; - if (auto *ConstEltNo = dyn_cast(EltNo)) { - int Elt = ConstEltNo->getZExtValue(); - unsigned PtrOff = VecEltVT.getSizeInBits() * Elt / 8; - if (TLI.isBigEndian()) - PtrOff = InVecVT.getSizeInBits() / 8 - PtrOff; - Offset = DAG.getConstant(PtrOff, PtrType); - MPI = OriginalLoad->getPointerInfo().getWithOffset(PtrOff); - } else { - Offset = DAG.getNode( - ISD::MUL, SDLoc(EVE), EltNo.getValueType(), EltNo, - DAG.getConstant(VecEltVT.getStoreSize(), EltNo.getValueType())); - if (TLI.isBigEndian()) - Offset = DAG.getNode( - ISD::SUB, SDLoc(EVE), EltNo.getValueType(), - DAG.getConstant(InVecVT.getStoreSize(), EltNo.getValueType()), Offset); - MPI = OriginalLoad->getPointerInfo(); - } - NewPtr = DAG.getNode(ISD::ADD, SDLoc(EVE), PtrType, NewPtr, Offset); - - // The replacement we need to do here is a little tricky: we need to - // replace an extractelement of a load with a load. - // Use ReplaceAllUsesOfValuesWith to do the replacement. - // Note that this replacement assumes that the extractvalue is the only - // use of the load; that's okay because we don't want to perform this - // transformation in other cases anyway. - SDValue Load; - SDValue Chain; - if (ResultVT.bitsGT(VecEltVT)) { - // If the result type of vextract is wider than the load, then issue an - // extending load instead. - ISD::LoadExtType ExtType = TLI.isLoadExtLegal(ISD::ZEXTLOAD, VecEltVT) - ? 
ISD::ZEXTLOAD - : ISD::EXTLOAD; - Load = DAG.getExtLoad(ExtType, SDLoc(EVE), ResultVT, OriginalLoad->getChain(), - NewPtr, MPI, VecEltVT, OriginalLoad->isVolatile(), - OriginalLoad->isNonTemporal(), Align, - OriginalLoad->getTBAAInfo()); - Chain = Load.getValue(1); - } else { - Load = DAG.getLoad( - VecEltVT, SDLoc(EVE), OriginalLoad->getChain(), NewPtr, MPI, - OriginalLoad->isVolatile(), OriginalLoad->isNonTemporal(), - OriginalLoad->isInvariant(), Align, OriginalLoad->getTBAAInfo()); - Chain = Load.getValue(1); - if (ResultVT.bitsLT(VecEltVT)) - Load = DAG.getNode(ISD::TRUNCATE, SDLoc(EVE), ResultVT, Load); - else - Load = DAG.getNode(ISD::BITCAST, SDLoc(EVE), ResultVT, Load); - } - WorkListRemover DeadNodes(*this); - SDValue From[] = { SDValue(EVE, 0), SDValue(OriginalLoad, 1) }; - SDValue To[] = { Load, Chain }; - DAG.ReplaceAllUsesOfValuesWith(From, To, 2); - // Since we're explicitly calling ReplaceAllUses, add the new node to the - // worklist explicitly as well. - AddToWorkList(Load.getNode()); - AddUsersToWorkList(Load.getNode()); // Add users too - // Make sure to revisit this node to clean it up; it will usually be dead. - AddToWorkList(EVE); - ++OpsNarrowed; - return SDValue(EVE, 0); -} - SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) { // (vextract (scalar_to_vector val, 0) -> val SDValue InVec = N->getOperand(0); @@ -9911,38 +9855,6 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) { } } - bool BCNumEltsChanged = false; - EVT ExtVT = VT.getVectorElementType(); - EVT LVT = ExtVT; - - // If the result of load has to be truncated, then it's not necessarily - // profitable. - if (NVT.bitsLT(LVT) && !TLI.isTruncateFree(LVT, NVT)) - return SDValue(); - - if (InVec.getOpcode() == ISD::BITCAST) { - // Don't duplicate a load with other uses. - if (!InVec.hasOneUse()) - return SDValue(); - - EVT BCVT = InVec.getOperand(0).getValueType(); - if (!BCVT.isVector() || ExtVT.bitsGT(BCVT.getVectorElementType())) - return SDValue(); - if (VT.getVectorNumElements() != BCVT.getVectorNumElements()) - BCNumEltsChanged = true; - InVec = InVec.getOperand(0); - ExtVT = BCVT.getVectorElementType(); - } - - // (vextract (vN[if]M load $addr), i) -> ([if]M load $addr + i * size) - if (!LegalOperations && !ConstEltNo && InVec.hasOneUse() && - ISD::isNormalLoad(InVec.getNode())) { - SDValue Index = N->getOperand(1); - if (LoadSDNode *OrigLoad = dyn_cast(InVec)) - return ReplaceExtractVectorEltOfLoadWithNarrowedLoad(N, VT, Index, - OrigLoad); - } - // Perform only after legalization to ensure build_vector / vector_shuffle // optimizations have already been done. if (!LegalOperations) return SDValue(); @@ -9953,6 +9865,30 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) { if (ConstEltNo) { int Elt = cast(EltNo)->getZExtValue(); + bool NewLoad = false; + bool BCNumEltsChanged = false; + EVT ExtVT = VT.getVectorElementType(); + EVT LVT = ExtVT; + + // If the result of load has to be truncated, then it's not necessarily + // profitable. + if (NVT.bitsLT(LVT) && !TLI.isTruncateFree(LVT, NVT)) + return SDValue(); + + if (InVec.getOpcode() == ISD::BITCAST) { + // Don't duplicate a load with other uses. 
+ if (!InVec.hasOneUse()) + return SDValue(); + + EVT BCVT = InVec.getOperand(0).getValueType(); + if (!BCVT.isVector() || ExtVT.bitsGT(BCVT.getVectorElementType())) + return SDValue(); + if (VT.getVectorNumElements() != BCVT.getVectorNumElements()) + BCNumEltsChanged = true; + InVec = InVec.getOperand(0); + ExtVT = BCVT.getVectorElementType(); + NewLoad = true; + } LoadSDNode *LN0 = nullptr; const ShuffleVectorSDNode *SVN = nullptr; @@ -9995,7 +9931,6 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) { if (ISD::isNormalLoad(InVec.getNode())) { LN0 = cast<LoadSDNode>(InVec); Elt = (Idx < (int)NumElems) ? Idx : Idx - (int)NumElems; - EltNo = DAG.getConstant(Elt, EltNo.getValueType()); } } @@ -10008,7 +9943,72 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) { if (Elt == -1) return DAG.getUNDEF(LVT); - return ReplaceExtractVectorEltOfLoadWithNarrowedLoad(N, VT, EltNo, LN0); + unsigned Align = LN0->getAlignment(); + if (NewLoad) { + // Check the resultant load doesn't need a higher alignment than the + // original load. + unsigned NewAlign = + TLI.getDataLayout() + ->getABITypeAlignment(LVT.getTypeForEVT(*DAG.getContext())); + + if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, LVT)) + return SDValue(); + + Align = NewAlign; + } + + SDValue NewPtr = LN0->getBasePtr(); + unsigned PtrOff = 0; + + if (Elt) { + PtrOff = LVT.getSizeInBits() * Elt / 8; + EVT PtrType = NewPtr.getValueType(); + if (TLI.isBigEndian()) + PtrOff = VT.getSizeInBits() / 8 - PtrOff; + NewPtr = DAG.getNode(ISD::ADD, SDLoc(N), PtrType, NewPtr, + DAG.getConstant(PtrOff, PtrType)); + } + + // The replacement we need to do here is a little tricky: we need to + // replace an extractelement of a load with a load. + // Use ReplaceAllUsesOfValuesWith to do the replacement. + // Note that this replacement assumes that the extractvalue is the only + // use of the load; that's okay because we don't want to perform this + // transformation in other cases anyway. + SDValue Load; + SDValue Chain; + if (NVT.bitsGT(LVT)) { + // If the result type of vextract is wider than the load, then issue an + // extending load instead. + ISD::LoadExtType ExtType = TLI.isLoadExtLegal(ISD::ZEXTLOAD, LVT) + ? ISD::ZEXTLOAD : ISD::EXTLOAD; + Load = DAG.getExtLoad(ExtType, SDLoc(N), NVT, LN0->getChain(), + NewPtr, LN0->getPointerInfo().getWithOffset(PtrOff), + LVT, LN0->isVolatile(), LN0->isNonTemporal(), + Align, LN0->getTBAAInfo()); + Chain = Load.getValue(1); + } else { + Load = DAG.getLoad(LVT, SDLoc(N), LN0->getChain(), NewPtr, + LN0->getPointerInfo().getWithOffset(PtrOff), + LN0->isVolatile(), LN0->isNonTemporal(), + LN0->isInvariant(), Align, LN0->getTBAAInfo()); + Chain = Load.getValue(1); + if (NVT.bitsLT(LVT)) + Load = DAG.getNode(ISD::TRUNCATE, SDLoc(N), NVT, Load); + else + Load = DAG.getNode(ISD::BITCAST, SDLoc(N), NVT, Load); + } + WorklistRemover DeadNodes(*this); + SDValue From[] = { SDValue(N, 0), SDValue(LN0,1) }; + SDValue To[] = { Load, Chain }; + DAG.ReplaceAllUsesOfValuesWith(From, To, 2); + // Since we're explicitly calling ReplaceAllUses, add the new node to the + // worklist explicitly as well. + AddToWorklist(Load.getNode()); + AddUsersToWorklist(Load.getNode()); // Add users too + // Make sure to revisit this node to clean it up; it will usually be dead. + AddToWorklist(N); + return SDValue(N, 0); } return SDValue();
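The code re-inlined above replaces (extractelement (load v), i) with a scalar load of just that element; on a little-endian target the element lives at base + i * sizeof(element). A behavioral sketch with i32 elements assumed (illustrative helper, not the DAG transform itself):

#include <cassert>
#include <cstdint>
#include <cstring>

// Load only the idx-th 32-bit element of a vector in memory, instead of
// loading the whole vector and extracting in registers.
static uint32_t extractViaNarrowLoad(const void *vecAddr, unsigned idx) {
  uint32_t elt;
  std::memcpy(&elt,
              static_cast<const uint8_t *>(vecAddr) + idx * sizeof(elt),
              sizeof(elt));
  return elt;
}

int main() {
  uint32_t vec[4] = {10, 20, 30, 40};
  assert(extractViaNarrowLoad(vec, 2) == 30);
}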
@@ -10119,7 +10119,7 @@ SDValue DAGCombiner::reduceBuildVecExtToExtBuildVec(SDNode *N) { SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, VecVT, Ops); // The new BUILD_VECTOR node has the potential to be further optimized. - AddToWorkList(BV.getNode()); + AddToWorklist(BV.getNode()); // Bitcast to the desired type. return DAG.getNode(ISD::BITCAST, dl, VT, BV); } @@ -10185,7 +10185,7 @@ SDValue DAGCombiner::reduceBuildVecConvertToConvertBuildVec(SDNode *N) { Opnds.push_back(In.getOperand(0)); } SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, Opnds); - AddToWorkList(BV.getNode()); + AddToWorklist(BV.getNode()); return DAG.getNode(Opcode, dl, VT, BV); } @@ -10369,10 +10369,24 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) { SmallVector<SDValue, 8> Opnds; unsigned BuildVecNumElts = N0.getNumOperands(); - for (unsigned i = 0; i != BuildVecNumElts; ++i) - Opnds.push_back(N0.getOperand(i)); - for (unsigned i = 0; i != BuildVecNumElts; ++i) - Opnds.push_back(N1.getOperand(i)); + EVT SclTy0 = N0.getOperand(0)->getValueType(0); + EVT SclTy1 = N1.getOperand(0)->getValueType(0); + if (SclTy0.isFloatingPoint()) { + for (unsigned i = 0; i != BuildVecNumElts; ++i) + Opnds.push_back(N0.getOperand(i)); + for (unsigned i = 0; i != BuildVecNumElts; ++i) + Opnds.push_back(N1.getOperand(i)); + } else { + // If the BUILD_VECTORs are built from integers, they may have different + // operand types. Get the smaller type and truncate all operands to it. + EVT MinTy = SclTy0.bitsLE(SclTy1) ? SclTy0 : SclTy1; + for (unsigned i = 0; i != BuildVecNumElts; ++i) + Opnds.push_back(DAG.getNode(ISD::TRUNCATE, SDLoc(N), MinTy, + N0.getOperand(i))); + for (unsigned i = 0; i != BuildVecNumElts; ++i) + Opnds.push_back(DAG.getNode(ISD::TRUNCATE, SDLoc(N), MinTy, + N1.getOperand(i))); + } return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), VT, Opnds); }
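The visitCONCAT_VECTORS change above handles BUILD_VECTORs whose integer operands have different scalar widths by truncating everything to the narrower width before concatenating. A sketch of that step, modeling ISD::TRUNCATE as masking (plain C++, helper names illustrative):

#include <cassert>
#include <cstdint>
#include <vector>

// Concatenate two operand lists after truncating both to the smaller width.
static std::vector<uint64_t> concatTruncated(const std::vector<uint64_t> &a,
                                             unsigned widthA,
                                             const std::vector<uint64_t> &b,
                                             unsigned widthB) {
  unsigned minWidth = widthA < widthB ? widthA : widthB;
  uint64_t mask = (minWidth == 64) ? ~0ull : ((1ull << minWidth) - 1);
  std::vector<uint64_t> out;
  for (uint64_t v : a) out.push_back(v & mask); // the ISD::TRUNCATE equivalent
  for (uint64_t v : b) out.push_back(v & mask);
  return out;
}

int main() {
  auto r = concatTruncated({0x1ffff}, 32, {0xffff}, 16);
  assert(r[0] == 0xffff && r[1] == 0xffff); // both truncated to 16 bits
}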
@@ -10647,22 +10661,19 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) { } // If this shuffle node is simply a swizzle of another shuffle node, - // and it reverses the swizzle of the previous shuffle then we can - // optimize shuffle(shuffle(x, undef), undef) -> x. + // then try to simplify it. if (N0.getOpcode() == ISD::VECTOR_SHUFFLE && Level < AfterLegalizeDAG && N1.getOpcode() == ISD::UNDEF) { ShuffleVectorSDNode *OtherSV = cast<ShuffleVectorSDNode>(N0); - // Shuffle nodes can only reverse shuffles with a single non-undef value. - if (N0.getOperand(1).getOpcode() != ISD::UNDEF) - return SDValue(); - // The incoming shuffle must be of the same type as the result of the // current shuffle. assert(OtherSV->getOperand(0).getValueType() == VT && "Shuffle types don't match"); + SmallVector<int, 4> Mask; + // Compute the combined shuffle mask. for (unsigned i = 0; i != NumElts; ++i) { int Idx = SVN->getMaskElt(i); assert(Idx < (int)NumElts && "Index references undef operand"); @@ -10670,13 +10681,147 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) { // shuffle. Adopt the incoming index. if (Idx >= 0) Idx = OtherSV->getMaskElt(Idx); + Mask.push_back(Idx); + } + + bool CommuteOperands = false; + if (N0.getOperand(1).getOpcode() != ISD::UNDEF) { + // To be valid, the combined shuffle mask should only reference elements + // from one of the two input vectors of the inner shufflevector. + bool IsValidMask = true; + for (unsigned i = 0; i != NumElts && IsValidMask; ++i) + // See if the combined mask only references undefs or elements coming + // from the first shufflevector operand. + IsValidMask = Mask[i] < 0 || (unsigned)Mask[i] < NumElts; + + if (!IsValidMask) { + IsValidMask = true; + for (unsigned i = 0; i != NumElts && IsValidMask; ++i) + // Check that all the elements come from the second shuffle operand. + IsValidMask = Mask[i] < 0 || (unsigned)Mask[i] >= NumElts; + CommuteOperands = IsValidMask; + } - // The combined shuffle must map each index to itself. - if (Idx >= 0 && (unsigned)Idx != i) + // Early exit if the combined shuffle mask is not valid. + if (!IsValidMask) return SDValue(); } - return OtherSV->getOperand(0); + // See if this pair of shuffles can be safely folded according to one + // of the following rules: + // shuffle(shuffle(x, y), undef) -> x + // shuffle(shuffle(x, undef), undef) -> x + // shuffle(shuffle(x, y), undef) -> y + bool IsIdentityMask = true; + unsigned BaseMaskIndex = CommuteOperands ? NumElts : 0; + for (unsigned i = 0; i != NumElts && IsIdentityMask; ++i) { + // Skip Undefs. + if (Mask[i] < 0) + continue; + + // The combined shuffle must map each index to itself. + IsIdentityMask = (unsigned)Mask[i] == i + BaseMaskIndex; + } + + if (IsIdentityMask) { + if (CommuteOperands) + // optimize shuffle(shuffle(x, y), undef) -> y. + return OtherSV->getOperand(1); + + // optimize shuffle(shuffle(x, undef), undef) -> x + // optimize shuffle(shuffle(x, y), undef) -> x + return OtherSV->getOperand(0); + } + + // It may still be beneficial to combine the two shuffles if the + // resulting shuffle is legal. + if (TLI.isTypeLegal(VT) && TLI.isShuffleMaskLegal(Mask, VT)) { + if (!CommuteOperands) + // shuffle(shuffle(x, undef, M1), undef, M2) -> shuffle(x, undef, M3). + // shuffle(shuffle(x, y, M1), undef, M2) -> shuffle(x, undef, M3) + return DAG.getVectorShuffle(VT, SDLoc(N), N0->getOperand(0), N1, + &Mask[0]); + + // shuffle(shuffle(x, y, M1), undef, M2) -> shuffle(undef, y, M3) + return DAG.getVectorShuffle(VT, SDLoc(N), N1, N0->getOperand(1), + &Mask[0]); + } + } + + // Canonicalize shuffles according to rules: + // shuffle(A, shuffle(A, B)) -> shuffle(shuffle(A,B), A) + // shuffle(B, shuffle(A, B)) -> shuffle(shuffle(A,B), B) + // shuffle(B, shuffle(A, Undef)) -> shuffle(shuffle(A, Undef), B) + if (N1.getOpcode() == ISD::VECTOR_SHUFFLE && N0.getOpcode() != ISD::UNDEF && + N0.getOpcode() != ISD::VECTOR_SHUFFLE && Level < AfterLegalizeDAG && + TLI.isTypeLegal(VT)) { + // The incoming shuffle must be of the same type as the result of the + // current shuffle. + assert(N1->getOperand(0).getValueType() == VT && + "Shuffle types don't match"); + + SDValue SV0 = N1->getOperand(0); + SDValue SV1 = N1->getOperand(1); + bool HasSameOp0 = N0 == SV0; + bool IsSV1Undef = SV1.getOpcode() == ISD::UNDEF; + if (HasSameOp0 || IsSV1Undef || N0 == SV1) + // Commute the operands of this shuffle so that the next rule + // will trigger. + return DAG.getCommutedVectorShuffle(*SVN); + }
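The mask combination used in these shuffle folds is ordinary composition: element i of the outer mask picks an entry of the inner mask, and -1 propagates undef. A sketch of the single-input case (the real code above additionally handles indices that refer to a second operand):

#include <cassert>
#include <vector>

// Mask[i] = Inner[Outer[i]], with -1 meaning undef.
static std::vector<int> composeMasks(const std::vector<int> &Inner,
                                     const std::vector<int> &Outer) {
  std::vector<int> Mask;
  for (int Idx : Outer)
    Mask.push_back(Idx < 0 ? -1 : Inner[Idx]);
  return Mask;
}

int main() {
  // Inner reverses a 4-element vector; reversing again yields the identity.
  std::vector<int> Rev = {3, 2, 1, 0};
  std::vector<int> M = composeMasks(Rev, Rev);
  assert((M == std::vector<int>{0, 1, 2, 3}));
}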
+ + // Try to fold according to rules: + // shuffle(shuffle(A, B, M0), B, M1) -> shuffle(A, B, M2) + // shuffle(shuffle(A, B, M0), A, M1) -> shuffle(A, B, M2) + // shuffle(shuffle(A, Undef, M0), B, M1) -> shuffle(A, B, M2) + // shuffle(shuffle(A, Undef, M0), A, M1) -> shuffle(A, Undef, M2) + // Don't try to fold shuffles with illegal type. + if (N0.getOpcode() == ISD::VECTOR_SHUFFLE && Level < AfterLegalizeDAG && + N1.getOpcode() != ISD::UNDEF && TLI.isTypeLegal(VT)) { + ShuffleVectorSDNode *OtherSV = cast<ShuffleVectorSDNode>(N0); + + // The incoming shuffle must be of the same type as the result of the + // current shuffle. + assert(OtherSV->getOperand(0).getValueType() == VT && + "Shuffle types don't match"); + + SDValue SV0 = OtherSV->getOperand(0); + SDValue SV1 = OtherSV->getOperand(1); + bool HasSameOp0 = N1 == SV0; + bool IsSV1Undef = SV1.getOpcode() == ISD::UNDEF; + if (!HasSameOp0 && !IsSV1Undef && N1 != SV1) + // Early exit. + return SDValue(); + + SmallVector<int, 4> Mask; + // Compute the combined shuffle mask for a shuffle with SV0 as the first + // operand, and SV1 as the second operand. + for (unsigned i = 0; i != NumElts; ++i) { + int Idx = SVN->getMaskElt(i); + if (Idx < 0) { + // Propagate Undef. + Mask.push_back(Idx); + continue; + } + + if (Idx < (int)NumElts) { + Idx = OtherSV->getMaskElt(Idx); + if (IsSV1Undef && Idx >= (int) NumElts) + Idx = -1; // Propagate Undef. + } else + Idx = HasSameOp0 ? Idx - NumElts : Idx; + + Mask.push_back(Idx); + } + + // Avoid introducing shuffles with illegal mask. + if (TLI.isShuffleMaskLegal(Mask, VT)) { + if (IsSV1Undef) + // shuffle(shuffle(A, Undef, M0), B, M1) -> shuffle(A, B, M2) + // shuffle(shuffle(A, Undef, M0), A, M1) -> shuffle(A, Undef, M2) + return DAG.getVectorShuffle(VT, SDLoc(N), SV0, N1, &Mask[0]); + return DAG.getVectorShuffle(VT, SDLoc(N), SV0, SV1, &Mask[0]); + } } return SDValue(); @@ -10811,7 +10956,7 @@ SDValue DAGCombiner::SimplifyVBinOp(SDNode *N) { FoldOp.getOpcode() != ISD::ConstantFP) break; Ops.push_back(FoldOp); - AddToWorkList(FoldOp.getNode()); + AddToWorklist(FoldOp.getNode()); } if (Ops.size() == LHS.getNumOperands()) @@ -10833,7 +10978,7 @@ SDValue DAGCombiner::SimplifyVBinOp(SDNode *N) { SDValue UndefVector = LHS.getOperand(1); SDValue NewBinOp = DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS.getOperand(0), RHS.getOperand(0)); - AddUsersToWorkList(N); + AddUsersToWorklist(N); return DAG.getVectorShuffle(VT, SDLoc(N), NewBinOp, UndefVector, &SVN0->getMask()[0]); } @@ -10865,7 +11010,7 @@ SDValue DAGCombiner::SimplifyVUnaryOp(SDNode *N) { FoldOp.getOpcode() != ISD::ConstantFP) break; Ops.push_back(FoldOp); - AddToWorkList(FoldOp.getNode()); + AddToWorklist(FoldOp.getNode()); } if (Ops.size() != N0.getNumOperands()) @@ -10892,7 +11037,7 @@ SDValue DAGCombiner::SimplifySelect(SDLoc DL, SDValue N0, N0.getValueType(), SCC.getOperand(0), SCC.getOperand(1), SCC.getOperand(4)); - AddToWorkList(SETCC.getNode()); + AddToWorklist(SETCC.getNode()); return DAG.getSelect(SDLoc(SCC), SCC.getValueType(), SCC.getOperand(2), SCC.getOperand(3), SETCC); } @@ -11033,7 +11178,7 @@ SDValue DAGCombiner::SimplifySelectCC(SDLoc DL, SDValue N0, SDValue N1, // Determine if the condition we're dealing with is constant SDValue SCC = SimplifySetCC(getSetCCResultType(N0.getValueType()), N0, N1, CC, DL, false); - if (SCC.getNode()) AddToWorkList(SCC.getNode()); + if (SCC.getNode()) AddToWorklist(SCC.getNode()); ConstantSDNode *SCCC = dyn_cast_or_null<ConstantSDNode>(SCC.getNode()); // fold select_cc true, x, y -> x @@ -11101,13 +11246,13 @@ SDValue DAGCombiner::SimplifySelectCC(SDLoc DL, SDValue N0, SDValue N1, SDValue Cond = DAG.getSetCC(DL, getSetCCResultType(N0.getValueType()), N0, N1, CC); - AddToWorkList(Cond.getNode()); + AddToWorklist(Cond.getNode()); SDValue CstOffset = DAG.getSelect(DL, Zero.getValueType(), Cond, One, Zero); - AddToWorkList(CstOffset.getNode()); + AddToWorklist(CstOffset.getNode()); CPIdx = DAG.getNode(ISD::ADD, DL, CPIdx.getValueType(), CPIdx, CstOffset); - AddToWorkList(CPIdx.getNode()); + AddToWorklist(CPIdx.getNode()); return DAG.getLoad(TV->getValueType(0), DL, DAG.getEntryNode(), CPIdx, MachinePointerInfo::getConstantPool(), false, false, false,
Alignment); @@ -11132,11 +11277,11 @@ SDValue DAGCombiner::SimplifySelectCC(SDLoc DL, SDValue N0, SDValue N1, getShiftAmountTy(N0.getValueType())); SDValue Shift = DAG.getNode(ISD::SRL, SDLoc(N0), XType, N0, ShCt); - AddToWorkList(Shift.getNode()); + AddToWorklist(Shift.getNode()); if (XType.bitsGT(AType)) { Shift = DAG.getNode(ISD::TRUNCATE, DL, AType, Shift); - AddToWorkList(Shift.getNode()); + AddToWorklist(Shift.getNode()); } return DAG.getNode(ISD::AND, DL, AType, Shift, N2); @@ -11146,11 +11291,11 @@ SDValue DAGCombiner::SimplifySelectCC(SDLoc DL, SDValue N0, SDValue N1, XType, N0, DAG.getConstant(XType.getSizeInBits()-1, getShiftAmountTy(N0.getValueType()))); - AddToWorkList(Shift.getNode()); + AddToWorklist(Shift.getNode()); if (XType.bitsGT(AType)) { Shift = DAG.getNode(ISD::TRUNCATE, DL, AType, Shift); - AddToWorkList(Shift.getNode()); + AddToWorklist(Shift.getNode()); } return DAG.getNode(ISD::AND, DL, AType, Shift, N2); @@ -11190,8 +11335,8 @@ SDValue DAGCombiner::SimplifySelectCC(SDLoc DL, SDValue N0, SDValue N1, // fold select C, 16, 0 -> shl C, 4 if (N2C && N3C && N3C->isNullValue() && N2C->getAPIntValue().isPowerOf2() && - TLI.getBooleanContents(N0.getValueType().isVector()) == - TargetLowering::ZeroOrOneBooleanContent) { + TLI.getBooleanContents(N0.getValueType()) == + TargetLowering::ZeroOrOneBooleanContent) { // If the caller doesn't want us to simplify this into a zext of a compare, // don't do it. @@ -11220,8 +11365,8 @@ SDValue DAGCombiner::SimplifySelectCC(SDLoc DL, SDValue N0, SDValue N1, N2.getValueType(), SCC); } - AddToWorkList(SCC.getNode()); - AddToWorkList(Temp.getNode()); + AddToWorklist(SCC.getNode()); + AddToWorklist(Temp.getNode()); if (N2C->getAPIntValue() == 1) return Temp; @@ -11300,8 +11445,8 @@ SDValue DAGCombiner::SimplifySelectCC(SDLoc DL, SDValue N0, SDValue N1, getShiftAmountTy(N0.getValueType()))); SDValue Add = DAG.getNode(ISD::ADD, SDLoc(N0), XType, N0, Shift); - AddToWorkList(Shift.getNode()); - AddToWorkList(Add.getNode()); + AddToWorklist(Shift.getNode()); + AddToWorklist(Add.getNode()); return DAG.getNode(ISD::XOR, DL, XType, Add, Shift); } } @@ -11336,7 +11481,7 @@ SDValue DAGCombiner::BuildSDIV(SDNode *N) { TLI.BuildSDIV(N, C->getAPIntValue(), DAG, LegalOperations, &Built); for (SDNode *N : Built) - AddToWorkList(N); + AddToWorklist(N); return S; } @@ -11358,7 +11503,7 @@ SDValue DAGCombiner::BuildUDIV(SDNode *N) { TLI.BuildUDIV(N, C->getAPIntValue(), DAG, LegalOperations, &Built); for (SDNode *N : Built) - AddToWorkList(N); + AddToWorklist(N); return S; } diff --git a/lib/CodeGen/SelectionDAG/FastISel.cpp b/lib/CodeGen/SelectionDAG/FastISel.cpp index f7da4d546d87..ad75e916cefa 100644 --- a/lib/CodeGen/SelectionDAG/FastISel.cpp +++ b/lib/CodeGen/SelectionDAG/FastISel.cpp @@ -39,6 +39,7 @@ // //===----------------------------------------------------------------------===// +#include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/FastISel.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/Statistic.h" @@ -74,6 +75,21 @@ STATISTIC(NumFastIselSuccessTarget, "Number of insts selected by " "target-specific selector"); STATISTIC(NumFastIselDead, "Number of dead insts removed on failure"); +/// \brief Set CallLoweringInfo attribute flags based on a call instruction +/// and called function attributes. 
+void FastISel::ArgListEntry::setAttributes(ImmutableCallSite *CS, + unsigned AttrIdx) { + isSExt = CS->paramHasAttr(AttrIdx, Attribute::SExt); + isZExt = CS->paramHasAttr(AttrIdx, Attribute::ZExt); + isInReg = CS->paramHasAttr(AttrIdx, Attribute::InReg); + isSRet = CS->paramHasAttr(AttrIdx, Attribute::StructRet); + isNest = CS->paramHasAttr(AttrIdx, Attribute::Nest); + isByVal = CS->paramHasAttr(AttrIdx, Attribute::ByVal); + isInAlloca = CS->paramHasAttr(AttrIdx, Attribute::InAlloca); + isReturned = CS->paramHasAttr(AttrIdx, Attribute::Returned); + Alignment = CS->getParamAlignment(AttrIdx); +} + /// startNewBlock - Set the current block to which generated machine /// instructions will be appended, and clear the local CSE map. /// @@ -561,13 +577,13 @@ bool FastISel::SelectGetElementPtr(const User *I) { return true; } -/// \brief Add a stack map intrinsic call's live variable operands to a stackmap -/// or patchpoint machine instruction. -/// +/// \brief Add a stackmap or patchpoint intrinsic call's live variable operands +/// to a stackmap or patchpoint machine instruction. bool FastISel::addStackMapLiveVars(SmallVectorImpl<MachineOperand> &Ops, const CallInst *CI, unsigned StartIdx) { for (unsigned i = StartIdx, e = CI->getNumArgOperands(); i != e; ++i) { Value *Val = CI->getArgOperand(i); + // Check for constants and encode them with a StackMaps::ConstantOp prefix. if (auto *C = dyn_cast<ConstantInt>(Val)) { Ops.push_back(MachineOperand::CreateImm(StackMaps::ConstantOp)); Ops.push_back(MachineOperand::CreateImm(C->getSExtValue())); @@ -575,6 +591,9 @@ bool FastISel::addStackMapLiveVars(SmallVectorImpl<MachineOperand> &Ops, Ops.push_back(MachineOperand::CreateImm(StackMaps::ConstantOp)); Ops.push_back(MachineOperand::CreateImm(0)); } else if (auto *AI = dyn_cast<AllocaInst>(Val)) { + // Values coming from a stack location also require a special encoding, + // but that is added later on by the target-specific frame index + // elimination implementation. auto SI = FuncInfo.StaticAllocaMap.find(AI); if (SI != FuncInfo.StaticAllocaMap.end()) Ops.push_back(MachineOperand::CreateFI(SI->second)); @@ -591,11 +610,439 @@ bool FastISel::addStackMapLiveVars(SmallVectorImpl<MachineOperand> &Ops, return true; } +bool FastISel::SelectStackmap(const CallInst *I) { + // void @llvm.experimental.stackmap(i64 <id>, i32 <numBytes>, + // [live variables...]) + assert(I->getCalledFunction()->getReturnType()->isVoidTy() && + "Stackmap cannot return a value."); + + // The stackmap intrinsic only records the live variables (the arguments + // passed to it) and emits NOPS (if requested). Unlike the patchpoint + // intrinsic, this won't be lowered to a function call. This means we don't + // have to worry about calling conventions and target-specific lowering code. + // Instead we perform the call lowering right here. + // + // CALLSEQ_START(0) + // STACKMAP(id, nbytes, ...) + // CALLSEQ_END(0, 0) + // + SmallVector<MachineOperand, 32> Ops; + + // Add the <id> and <numBytes> constants. + assert(isa<ConstantInt>(I->getOperand(PatchPointOpers::IDPos)) && + "Expected a constant integer.") ; + const auto *ID = cast<ConstantInt>(I->getOperand(PatchPointOpers::IDPos)); + Ops.push_back(MachineOperand::CreateImm(ID->getZExtValue())); + + assert(isa<ConstantInt>(I->getOperand(PatchPointOpers::NBytesPos)) && + "Expected a constant integer."); + const auto *NumBytes = + cast<ConstantInt>(I->getOperand(PatchPointOpers::NBytesPos)); + Ops.push_back(MachineOperand::CreateImm(NumBytes->getZExtValue()));
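SelectStackmap builds the STACKMAP operand list as the <id> constant, the <numBytes> shadow size, and then one entry per live value, with constants prefixed by a StackMaps::ConstantOp marker. A container-level sketch of that layout with plain integers; kConstantOp is an illustrative stand-in for the real marker value:

#include <cassert>
#include <cstdint>
#include <vector>

enum : uint64_t { kConstantOp = 0xffffffffull }; // illustrative marker

static std::vector<uint64_t>
buildStackmapOps(uint64_t id, uint32_t nbytes,
                 const std::vector<int64_t> &liveConsts) {
  std::vector<uint64_t> ops = {id, nbytes};
  for (int64_t c : liveConsts) {
    ops.push_back(kConstantOp); // prefix marking an encoded constant
    ops.push_back(static_cast<uint64_t>(c));
  }
  return ops;
}

int main() {
  auto ops = buildStackmapOps(7, 4, {42});
  assert(ops.size() == 4 && ops[0] == 7 && ops[2] == kConstantOp);
}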
+ + // Push live variables for the stack map (skipping the first two arguments + // <id> and <numBytes>). + if (!addStackMapLiveVars(Ops, I, 2)) + return false; + + // We are not adding any register mask info here, because the stackmap doesn't + // clobber anything. + + // Add scratch registers as implicit def and early clobber. + CallingConv::ID CC = I->getCallingConv(); + const MCPhysReg *ScratchRegs = TLI.getScratchRegisters(CC); + for (unsigned i = 0; ScratchRegs[i]; ++i) + Ops.push_back(MachineOperand::CreateReg( + ScratchRegs[i], /*IsDef=*/true, /*IsImp=*/true, /*IsKill=*/false, + /*IsDead=*/false, /*IsUndef=*/false, /*IsEarlyClobber=*/true)); + + // Issue CALLSEQ_START + unsigned AdjStackDown = TII.getCallFrameSetupOpcode(); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown)) + .addImm(0); + + // Issue STACKMAP. + MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::STACKMAP)); + for (auto const &MO : Ops) + MIB.addOperand(MO); + + // Issue CALLSEQ_END + unsigned AdjStackUp = TII.getCallFrameDestroyOpcode(); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackUp)) + .addImm(0).addImm(0); + + // Inform the Frame Information that we have a stackmap in this function. + FuncInfo.MF->getFrameInfo()->setHasStackMap(); + + return true; +} + +/// \brief Lower an argument list according to the target calling convention. +/// +/// This is a helper for lowering intrinsics that follow a target calling +/// convention or require stack pointer adjustment. Only a subset of the +/// intrinsic's operands need to participate in the calling convention. +bool FastISel::lowerCallOperands(const CallInst *CI, unsigned ArgIdx, + unsigned NumArgs, const Value *Callee, + bool ForceRetVoidTy, CallLoweringInfo &CLI) { + ArgListTy Args; + Args.reserve(NumArgs); + + // Populate the argument list. + // Attributes for args start at offset 1, after the return attribute. + ImmutableCallSite CS(CI); + for (unsigned ArgI = ArgIdx, ArgE = ArgIdx + NumArgs, AttrI = ArgIdx + 1; + ArgI != ArgE; ++ArgI) { + Value *V = CI->getOperand(ArgI); + + assert(!V->getType()->isEmptyTy() && "Empty type passed to intrinsic."); + + ArgListEntry Entry; + Entry.Val = V; + Entry.Ty = V->getType(); + Entry.setAttributes(&CS, AttrI); + Args.push_back(Entry); + } + + Type *RetTy = ForceRetVoidTy ? Type::getVoidTy(CI->getType()->getContext()) + : CI->getType(); + CLI.setCallee(CI->getCallingConv(), RetTy, Callee, std::move(Args), NumArgs); + + return LowerCallTo(CLI); +} + +bool FastISel::SelectPatchpoint(const CallInst *I) { + // void|i64 @llvm.experimental.patchpoint.void|i64(i64 <id>, + // i32 <numBytes>, + // i8* <target>, + // i32 <numArgs>, + // [Args...], + // [live variables...]) + CallingConv::ID CC = I->getCallingConv(); + bool IsAnyRegCC = CC == CallingConv::AnyReg; + bool HasDef = !I->getType()->isVoidTy(); + Value *Callee = I->getOperand(PatchPointOpers::TargetPos); + + // Get the real number of arguments participating in the call. + assert(isa<ConstantInt>(I->getOperand(PatchPointOpers::NArgPos)) && + "Expected a constant integer."); + const auto *NumArgsVal = + cast<ConstantInt>(I->getOperand(PatchPointOpers::NArgPos)); + unsigned NumArgs = NumArgsVal->getZExtValue(); + + // Skip the four meta args: <id>, <numBytes>, <target>, <numArgs>. + // This includes all meta-operands up to but not including CC. + unsigned NumMetaOpers = PatchPointOpers::CCPos; + assert(I->getNumArgOperands() >= NumMetaOpers + NumArgs && + "Not enough arguments provided to the patchpoint intrinsic");
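SelectPatchpoint slices the intrinsic's operand list into the four meta operands (<id>, <numBytes>, <target>, <numArgs>), then NumArgs call arguments, then trailing stackmap live variables. A sketch of that partitioning with plain integers standing in for the IR operands (illustrative struct and helper names):

#include <cassert>
#include <vector>

struct PatchpointOperands {
  std::vector<int> meta, callArgs, liveVars;
};

static PatchpointOperands splitOperands(const std::vector<int> &ops,
                                        unsigned numArgs) {
  const unsigned numMeta = 4; // up to but not including the calling conv
  PatchpointOperands r;
  r.meta.assign(ops.begin(), ops.begin() + numMeta);
  r.callArgs.assign(ops.begin() + numMeta, ops.begin() + numMeta + numArgs);
  r.liveVars.assign(ops.begin() + numMeta + numArgs, ops.end());
  return r;
}

int main() {
  // id, nbytes, target, nargs=2, two call args, one live var.
  std::vector<int> ops = {7, 8, 0, 2, 100, 200, 42};
  PatchpointOperands r = splitOperands(ops, 2);
  assert(r.callArgs.size() == 2 && r.liveVars.size() == 1 && r.liveVars[0] == 42);
}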
+ // For AnyRegCC the arguments are lowered later on manually. + unsigned NumCallArgs = IsAnyRegCC ? 0 : NumArgs; + CallLoweringInfo CLI; + if (!lowerCallOperands(I, NumMetaOpers, NumCallArgs, Callee, IsAnyRegCC, CLI)) + return false; + + assert(CLI.Call && "No call instruction specified."); + + SmallVector<MachineOperand, 32> Ops; + + // Add an explicit result reg if we use the anyreg calling convention. + if (IsAnyRegCC && HasDef) { + assert(CLI.NumResultRegs == 0 && "Unexpected result register."); + CLI.ResultReg = createResultReg(TLI.getRegClassFor(MVT::i64)); + CLI.NumResultRegs = 1; + Ops.push_back(MachineOperand::CreateReg(CLI.ResultReg, /*IsDef=*/true)); + } + + // Add the <id> and <numBytes> constants. + assert(isa<ConstantInt>(I->getOperand(PatchPointOpers::IDPos)) && + "Expected a constant integer."); + const auto *ID = cast<ConstantInt>(I->getOperand(PatchPointOpers::IDPos)); + Ops.push_back(MachineOperand::CreateImm(ID->getZExtValue())); + + assert(isa<ConstantInt>(I->getOperand(PatchPointOpers::NBytesPos)) && + "Expected a constant integer."); + const auto *NumBytes = + cast<ConstantInt>(I->getOperand(PatchPointOpers::NBytesPos)); + Ops.push_back(MachineOperand::CreateImm(NumBytes->getZExtValue())); + + // Assume that the callee is a constant address or null pointer. + // FIXME: handle function symbols in the future. + unsigned CalleeAddr; + if (const auto *C = dyn_cast<IntToPtrInst>(Callee)) + CalleeAddr = cast<ConstantInt>(C->getOperand(0))->getZExtValue(); + else if (const auto *C = dyn_cast<ConstantExpr>(Callee)) { + if (C->getOpcode() == Instruction::IntToPtr) + CalleeAddr = cast<ConstantInt>(C->getOperand(0))->getZExtValue(); + else + llvm_unreachable("Unsupported ConstantExpr."); + } else if (isa<ConstantPointerNull>(Callee)) + CalleeAddr = 0; + else + llvm_unreachable("Unsupported callee address."); + + Ops.push_back(MachineOperand::CreateImm(CalleeAddr)); + + // Adjust <numArgs> to account for any arguments that have been passed on + // the stack instead. + unsigned NumCallRegArgs = IsAnyRegCC ? NumArgs : CLI.OutRegs.size(); + Ops.push_back(MachineOperand::CreateImm(NumCallRegArgs)); + + // Add the calling convention. + Ops.push_back(MachineOperand::CreateImm((unsigned)CC)); + + // Add the arguments we omitted previously. The register allocator should + // place these in any free register. + if (IsAnyRegCC) { + for (unsigned i = NumMetaOpers, e = NumMetaOpers + NumArgs; i != e; ++i) { + unsigned Reg = getRegForValue(I->getArgOperand(i)); + if (!Reg) + return false; + Ops.push_back(MachineOperand::CreateReg(Reg, /*IsDef=*/false)); + } + } + + // Push the arguments from the call instruction. + for (auto Reg : CLI.OutRegs) + Ops.push_back(MachineOperand::CreateReg(Reg, /*IsDef=*/false)); + + // Push live variables for the stack map. + if (!addStackMapLiveVars(Ops, I, NumMetaOpers + NumArgs)) + return false; + + // Push the register mask info. + Ops.push_back(MachineOperand::CreateRegMask(TRI.getCallPreservedMask(CC))); + + // Add scratch registers as implicit def and early clobber. + const MCPhysReg *ScratchRegs = TLI.getScratchRegisters(CC); + for (unsigned i = 0; ScratchRegs[i]; ++i) + Ops.push_back(MachineOperand::CreateReg( + ScratchRegs[i], /*IsDef=*/true, /*IsImp=*/true, /*IsKill=*/false, + /*IsDead=*/false, /*IsUndef=*/false, /*IsEarlyClobber=*/true)); + + // Add implicit defs (return values). + for (auto Reg : CLI.InRegs) + Ops.push_back(MachineOperand::CreateReg(Reg, /*IsDef=*/true, + /*IsImpl=*/true)); + + // Insert the patchpoint instruction before the call generated by the target. + MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, CLI.Call, DbgLoc, + TII.get(TargetOpcode::PATCHPOINT)); + + for (auto &MO : Ops) + MIB.addOperand(MO); + + MIB->setPhysRegsDeadExcept(CLI.InRegs, TRI); + + // Delete the original call instruction.
+ CLI.Call->eraseFromParent(); + + // Inform the Frame Information that we have a patchpoint in this function. + FuncInfo.MF->getFrameInfo()->setHasPatchPoint(); + + if (CLI.NumResultRegs) + UpdateValueMap(I, CLI.ResultReg, CLI.NumResultRegs); + return true; +} + +/// Returns an AttributeSet representing the attributes applied to the return +/// value of the given call. +static AttributeSet getReturnAttrs(FastISel::CallLoweringInfo &CLI) { + SmallVector Attrs; + if (CLI.RetSExt) + Attrs.push_back(Attribute::SExt); + if (CLI.RetZExt) + Attrs.push_back(Attribute::ZExt); + if (CLI.IsInReg) + Attrs.push_back(Attribute::InReg); + + return AttributeSet::get(CLI.RetTy->getContext(), AttributeSet::ReturnIndex, + Attrs); +} + +bool FastISel::LowerCallTo(const CallInst *CI, const char *SymName, + unsigned NumArgs) { + ImmutableCallSite CS(CI); + + PointerType *PT = cast(CS.getCalledValue()->getType()); + FunctionType *FTy = cast(PT->getElementType()); + Type *RetTy = FTy->getReturnType(); + + ArgListTy Args; + Args.reserve(NumArgs); + + // Populate the argument list. + // Attributes for args start at offset 1, after the return attribute. + for (unsigned ArgI = 0; ArgI != NumArgs; ++ArgI) { + Value *V = CI->getOperand(ArgI); + + assert(!V->getType()->isEmptyTy() && "Empty type passed to intrinsic."); + + ArgListEntry Entry; + Entry.Val = V; + Entry.Ty = V->getType(); + Entry.setAttributes(&CS, ArgI + 1); + Args.push_back(Entry); + } + + CallLoweringInfo CLI; + CLI.setCallee(RetTy, FTy, SymName, std::move(Args), CS, NumArgs); + + return LowerCallTo(CLI); +} + +bool FastISel::LowerCallTo(CallLoweringInfo &CLI) { + // Handle the incoming return values from the call. + CLI.clearIns(); + SmallVector RetTys; + ComputeValueVTs(TLI, CLI.RetTy, RetTys); + + SmallVector Outs; + GetReturnInfo(CLI.RetTy, getReturnAttrs(CLI), Outs, TLI); + + bool CanLowerReturn = TLI.CanLowerReturn(CLI.CallConv, *FuncInfo.MF, + CLI.IsVarArg, Outs, + CLI.RetTy->getContext()); + + // FIXME: sret demotion isn't supported yet - bail out. + if (!CanLowerReturn) + return false; + + for (unsigned I = 0, E = RetTys.size(); I != E; ++I) { + EVT VT = RetTys[I]; + MVT RegisterVT = TLI.getRegisterType(CLI.RetTy->getContext(), VT); + unsigned NumRegs = TLI.getNumRegisters(CLI.RetTy->getContext(), VT); + for (unsigned i = 0; i != NumRegs; ++i) { + ISD::InputArg MyFlags; + MyFlags.VT = RegisterVT; + MyFlags.ArgVT = VT; + MyFlags.Used = CLI.IsReturnValueUsed; + if (CLI.RetSExt) + MyFlags.Flags.setSExt(); + if (CLI.RetZExt) + MyFlags.Flags.setZExt(); + if (CLI.IsInReg) + MyFlags.Flags.setInReg(); + CLI.Ins.push_back(MyFlags); + } + } + + // Handle all of the outgoing arguments. + CLI.clearOuts(); + for (auto &Arg : CLI.getArgs()) { + Type *FinalType = Arg.Ty; + if (Arg.isByVal) + FinalType = cast(Arg.Ty)->getElementType(); + bool NeedsRegBlock = TLI.functionArgumentNeedsConsecutiveRegisters( + FinalType, CLI.CallConv, CLI.IsVarArg); + + ISD::ArgFlagsTy Flags; + if (Arg.isZExt) + Flags.setZExt(); + if (Arg.isSExt) + Flags.setSExt(); + if (Arg.isInReg) + Flags.setInReg(); + if (Arg.isSRet) + Flags.setSRet(); + if (Arg.isByVal) + Flags.setByVal(); + if (Arg.isInAlloca) { + Flags.setInAlloca(); + // Set the byval flag for CCAssignFn callbacks that don't know about + // inalloca. This way we can know how many bytes we should've allocated + // and how many bytes a callee cleanup function will pop. If we port + // inalloca to more targets, we'll have to add custom inalloca handling in + // the various CC lowering callbacks. 
+ Flags.setByVal(); + } + if (Arg.isByVal || Arg.isInAlloca) { + PointerType *Ty = cast(Arg.Ty); + Type *ElementTy = Ty->getElementType(); + unsigned FrameSize = DL.getTypeAllocSize(ElementTy); + // For ByVal, alignment should come from FE. BE will guess if this info is + // not there, but there are cases it cannot get right. + unsigned FrameAlign = Arg.Alignment; + if (!FrameAlign) + FrameAlign = TLI.getByValTypeAlignment(ElementTy); + Flags.setByValSize(FrameSize); + Flags.setByValAlign(FrameAlign); + } + if (Arg.isNest) + Flags.setNest(); + if (NeedsRegBlock) + Flags.setInConsecutiveRegs(); + unsigned OriginalAlignment = DL.getABITypeAlignment(Arg.Ty); + Flags.setOrigAlign(OriginalAlignment); + + CLI.OutVals.push_back(Arg.Val); + CLI.OutFlags.push_back(Flags); + } + + if (!FastLowerCall(CLI)) + return false; + + // Set all unused physreg defs as dead. + assert(CLI.Call && "No call instruction specified."); + CLI.Call->setPhysRegsDeadExcept(CLI.InRegs, TRI); + + if (CLI.NumResultRegs && CLI.CS) + UpdateValueMap(CLI.CS->getInstruction(), CLI.ResultReg, CLI.NumResultRegs); + + return true; +} + +bool FastISel::LowerCall(const CallInst *CI) { + ImmutableCallSite CS(CI); + + PointerType *PT = cast(CS.getCalledValue()->getType()); + FunctionType *FuncTy = cast(PT->getElementType()); + Type *RetTy = FuncTy->getReturnType(); + + ArgListTy Args; + ArgListEntry Entry; + Args.reserve(CS.arg_size()); + + for (ImmutableCallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end(); + i != e; ++i) { + Value *V = *i; + + // Skip empty types + if (V->getType()->isEmptyTy()) + continue; + + Entry.Val = V; + Entry.Ty = V->getType(); + + // Skip the first return-type Attribute to get to params. + Entry.setAttributes(&CS, i - CS.arg_begin() + 1); + Args.push_back(Entry); + } + + // Check if target-independent constraints permit a tail call here. + // Target-dependent constraints are checked within FastLowerCall. + bool IsTailCall = CI->isTailCall(); + if (IsTailCall && !isInTailCallPosition(CS, TM)) + IsTailCall = false; + + CallLoweringInfo CLI; + CLI.setCallee(RetTy, FuncTy, CI->getCalledValue(), std::move(Args), CS) + .setTailCall(IsTailCall); + + return LowerCallTo(CLI); +} + bool FastISel::SelectCall(const User *I) { const CallInst *Call = cast(I); // Handle simple inline asms. if (const InlineAsm *IA = dyn_cast(Call->getCalledValue())) { + // If the inline asm has side effects, then make sure that no local value + // lives across by flushing the local value map. + if (IA->hasSideEffects()) + flushLocalValueMap(); + // Don't attempt to handle constraints. if (!IA->getConstraintString().empty()) return false; @@ -616,26 +1063,37 @@ bool FastISel::SelectCall(const User *I) { MachineModuleInfo &MMI = FuncInfo.MF->getMMI(); ComputeUsesVAFloatArgument(*Call, &MMI); - const Function *F = Call->getCalledFunction(); - if (!F) return false; + // Handle intrinsic function calls. + if (const auto *II = dyn_cast(Call)) + return SelectIntrinsicCall(II); + + // Usually, it does not make sense to initialize a value, + // make an unrelated function call and use the value, because + // it tends to be spilled on the stack. So, we move the pointer + // to the last local value to the beginning of the block, so that + // all the values which have already been materialized, + // appear after the call. It also makes sense to skip intrinsics + // since they tend to be inlined. + flushLocalValueMap(); + + return LowerCall(Call); +} - // Handle selected intrinsic function calls. 
-  switch (F->getIntrinsicID()) {
+bool FastISel::SelectIntrinsicCall(const IntrinsicInst *II) {
+  switch (II->getIntrinsicID()) {
   default: break;
-    // At -O0 we don't care about the lifetime intrinsics.
+  // At -O0 we don't care about the lifetime intrinsics.
   case Intrinsic::lifetime_start:
   case Intrinsic::lifetime_end:
-    // The donothing intrinsic does, well, nothing.
+  // The donothing intrinsic does, well, nothing.
   case Intrinsic::donothing:
     return true;
-
   case Intrinsic::dbg_declare: {
-    const DbgDeclareInst *DI = cast<DbgDeclareInst>(Call);
+    const DbgDeclareInst *DI = cast<DbgDeclareInst>(II);
     DIVariable DIVar(DI->getVariable());
     assert((!DIVar || DIVar.isVariable()) &&
-      "Variable in DbgDeclareInst should be either null or a DIVariable.");
-    if (!DIVar ||
-        !FuncInfo.MF->getMMI().hasDebugInfo()) {
+           "Variable in DbgDeclareInst should be either null or a DIVariable.");
+    if (!DIVar || !FuncInfo.MF->getMMI().hasDebugInfo()) {
       DEBUG(dbgs() << "Dropping debug info for " << *DI << "\n");
       return true;
     }
@@ -652,7 +1110,7 @@ bool FastISel::SelectCall(const User *I) {
       // Some arguments' frame index is recorded during argument lowering.
       Offset = FuncInfo.getArgumentFrameIndex(Arg);
       if (Offset)
-	Op = MachineOperand::CreateFI(Offset);
+        Op = MachineOperand::CreateFI(Offset);
       if (!Op)
         if (unsigned Reg = lookUpRegForValue(Address))
           Op = MachineOperand::CreateReg(Reg, false);
@@ -683,9 +1141,9 @@ bool FastISel::SelectCall(const User *I) {
     } else
       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
               TII.get(TargetOpcode::DBG_VALUE))
-        .addOperand(*Op)
-        .addImm(0)
-        .addMetadata(DI->getVariable());
+          .addOperand(*Op)
+          .addImm(0)
+          .addMetadata(DI->getVariable());
     } else {
       // We can't yet handle anything else here because it would require
       // generating code, thus altering codegen because of debug info.
@@ -695,7 +1153,7 @@ bool FastISel::SelectCall(const User *I) {
   }
   case Intrinsic::dbg_value: {
     // This form of DBG_VALUE is target-independent.
-    const DbgValueInst *DI = cast<DbgValueInst>(Call);
+    const DbgValueInst *DI = cast<DbgValueInst>(II);
     const MCInstrDesc &II = TII.get(TargetOpcode::DBG_VALUE);
     const Value *V = DI->getValue();
     if (!V) {
@@ -730,106 +1188,30 @@ bool FastISel::SelectCall(const User *I) {
     return true;
   }
   case Intrinsic::objectsize: {
-    ConstantInt *CI = cast<ConstantInt>(Call->getArgOperand(1));
+    ConstantInt *CI = cast<ConstantInt>(II->getArgOperand(1));
     unsigned long long Res = CI->isZero() ? -1ULL : 0;
-    Constant *ResCI = ConstantInt::get(Call->getType(), Res);
+    Constant *ResCI = ConstantInt::get(II->getType(), Res);
     unsigned ResultReg = getRegForValue(ResCI);
     if (ResultReg == 0)
       return false;
-    UpdateValueMap(Call, ResultReg);
+    UpdateValueMap(II, ResultReg);
     return true;
   }
   case Intrinsic::expect: {
-    unsigned ResultReg = getRegForValue(Call->getArgOperand(0));
+    unsigned ResultReg = getRegForValue(II->getArgOperand(0));
     if (ResultReg == 0)
       return false;
-    UpdateValueMap(Call, ResultReg);
-    return true;
-  }
-  case Intrinsic::experimental_stackmap: {
-    // void @llvm.experimental.stackmap(i64 <id>, i32 <numBytes>,
-    //                                  [live variables...])
-
-    assert(Call->getCalledFunction()->getReturnType()->isVoidTy() &&
-           "Stackmap cannot return a value.");
-
-    // The stackmap intrinsic only records the live variables (the arguments
-    // passed to it) and emits NOPS (if requested). Unlike the patchpoint
-    // intrinsic, this won't be lowered to a function call. This means we don't
-    // have to worry about calling conventions and target-specific lowering
-    // code. Instead we perform the call lowering right here.
-    //
-    // CALLSEQ_START(0)
-    // STACKMAP(id, nbytes, ...)
-    // CALLSEQ_END(0, 0)
-    //
-
-    SmallVector<MachineOperand, 32> Ops;
-
-    // Add the <id> and <numBytes> constants.
-    assert(isa<ConstantInt>(Call->getOperand(PatchPointOpers::IDPos)) &&
-           "Expected a constant integer.");
-    auto IDVal = cast<ConstantInt>(Call->getOperand(PatchPointOpers::IDPos));
-    Ops.push_back(MachineOperand::CreateImm(IDVal->getZExtValue()));
-
-    assert(isa<ConstantInt>(Call->getOperand(PatchPointOpers::NBytesPos)) &&
-           "Expected a constant integer.");
-    auto NBytesVal =
-      cast<ConstantInt>(Call->getOperand(PatchPointOpers::NBytesPos));
-    Ops.push_back(MachineOperand::CreateImm(NBytesVal->getZExtValue()));
-
-    // Push live variables for the stack map.
-    if (!addStackMapLiveVars(Ops, Call, 2))
-      return false;
-
-    // We are not adding any register mask info here, because the stackmap
-    // doesn't clobber anything.
-
-    // Add scratch registers as implicit def and early clobber.
-    CallingConv::ID CC = Call->getCallingConv();
-    const MCPhysReg *ScratchRegs = TLI.getScratchRegisters(CC);
-    for (unsigned i = 0; ScratchRegs[i]; ++i)
-      Ops.push_back(MachineOperand::CreateReg(
-        ScratchRegs[i], /*IsDef=*/true, /*IsImp=*/true, /*IsKill=*/false,
-        /*IsDead=*/false, /*IsUndef=*/false, /*IsEarlyClobber=*/true));
-
-    // Issue CALLSEQ_START
-    unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
-    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown))
-      .addImm(0);
-
-    // Issue STACKMAP.
-    MachineInstrBuilder MIB;
-    MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
-                  TII.get(TargetOpcode::STACKMAP));
-
-    for (auto const &MO : Ops)
-      MIB.addOperand(MO);
-
-    // Issue CALLSEQ_END
-    unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
-    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackUp))
-      .addImm(0).addImm(0);
-
-    // Inform the Frame Information that we have a stackmap in this function.
-    FuncInfo.MF->getFrameInfo()->setHasStackMap();
-
+    UpdateValueMap(II, ResultReg);
     return true;
   }
+  case Intrinsic::experimental_stackmap:
+    return SelectStackmap(II);
+  case Intrinsic::experimental_patchpoint_void:
+  case Intrinsic::experimental_patchpoint_i64:
+    return SelectPatchpoint(II);
   }
-
-  // Usually, it does not make sense to initialize a value,
-  // make an unrelated function call and use the value, because
-  // it tends to be spilled on the stack. So, we move the pointer
-  // to the last local value to the beginning of the block, so that
-  // all the values which have already been materialized,
-  // appear after the call. It also makes sense to skip intrinsics
-  // since they tend to be inlined.
-  if (!isa<IntrinsicInst>(Call))
-    flushLocalValueMap();
-
-  // An arbitrary call. Bail.
-  return false;
+  return FastLowerIntrinsicCall(II);
 }

 bool FastISel::SelectCast(const User *I, unsigned Opcode) {
@@ -1207,6 +1589,7 @@ FastISel::SelectOperator(const User *I, unsigned Opcode) {
 FastISel::FastISel(FunctionLoweringInfo &funcInfo,
                    const TargetLibraryInfo *libInfo)
   : FuncInfo(funcInfo),
+    MF(funcInfo.MF),
     MRI(FuncInfo.MF->getRegInfo()),
     MFI(*FuncInfo.MF->getFrameInfo()),
     MCP(*FuncInfo.MF->getConstantPool()),
@@ -1224,6 +1607,14 @@ bool FastISel::FastLowerArguments() {
   return false;
 }

+bool FastISel::FastLowerCall(CallLoweringInfo &/*CLI*/) {
+  return false;
+}
+
+bool FastISel::FastLowerIntrinsicCall(const IntrinsicInst * /*II*/) {
+  return false;
+}
+
 unsigned FastISel::FastEmit_(MVT, MVT, unsigned) {
   return 0;
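// --- Editorial aside (not part of the patch): a minimal sketch of the
// dispatch shape introduced above. Generic code claims the intrinsics it can
// handle and otherwise defers to a target hook whose default returns false,
// meaning "not handled, fall back to SelectionDAG". All names here are
// illustrative, not the LLVM API.
#include <functional>

enum class IntrinsicID { LifetimeStart, LifetimeEnd, DoNothing, Other };

struct IntrinsicSelector {
  // Target override point, analogous to FastLowerIntrinsicCall.
  std::function<bool(IntrinsicID)> TargetHook = [](IntrinsicID) {
    return false;
  };

  bool selectIntrinsicCall(IntrinsicID ID) {
    switch (ID) {
    case IntrinsicID::LifetimeStart: // ignorable at -O0
    case IntrinsicID::LifetimeEnd:
    case IntrinsicID::DoNothing:     // trivially handled
      return true;
    default:
      break;
    }
    return TargetHook(ID); // last chance before bailing out
  }
};
// --- End editorial aside.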
diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index a4245a6016c2..16c5b4ba7768 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -270,7 +270,7 @@ SelectionDAGLegalize::ExpandConstantFP(ConstantFPSDNode *CFP, bool UseCP) {
   EVT OrigVT = VT;
   EVT SVT = VT;
-  while (SVT != MVT::f32) {
+  while (SVT != MVT::f32 && SVT != MVT::f16) {
     SVT = (MVT::SimpleValueType)(SVT.getSimpleVT().SimpleTy - 1);
     if (ConstantFPSDNode::isValueValidForType(SVT, CFP->getValueAPF()) &&
         // Only do this if the target has a native EXTLOAD instruction from
@@ -1186,6 +1186,7 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) {
     if (Action != TargetLowering::Promote)
       Action = TLI.getOperationAction(Node->getOpcode(), MVT::Other);
     break;
+  case ISD::FP_TO_FP16:
   case ISD::SINT_TO_FP:
   case ISD::UINT_TO_FP:
   case ISD::EXTRACT_VECTOR_ELT:
@@ -2060,7 +2061,7 @@ SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, SDNode *Node,
   TargetLowering::CallLoweringInfo CLI(DAG);
   CLI.setDebugLoc(SDLoc(Node)).setChain(InChain)
-    .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, &Args, 0)
+    .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args), 0)
     .setTailCall(isTailCall).setSExtResult(isSigned).setZExtResult(!isSigned);

   std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);
@@ -2095,7 +2096,7 @@ SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, EVT RetVT,
   TargetLowering::CallLoweringInfo CLI(DAG);
   CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
-    .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, &Args, 0)
+    .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args), 0)
     .setSExtResult(isSigned).setZExtResult(!isSigned);

   std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);
@@ -2129,7 +2130,7 @@ SelectionDAGLegalize::ExpandChainLibCall(RTLIB::Libcall LC,
   TargetLowering::CallLoweringInfo CLI(DAG);
   CLI.setDebugLoc(SDLoc(Node)).setChain(InChain)
-    .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, &Args, 0)
+    .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args), 0)
     .setSExtResult(isSigned).setZExtResult(!isSigned);

   std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);
@@ -2266,7 +2267,7 @@ SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node,
   SDLoc dl(Node);
   TargetLowering::CallLoweringInfo CLI(DAG);
   CLI.setDebugLoc(dl).setChain(InChain)
-    .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, &Args, 0)
+    .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args), 0)
     .setSExtResult(isSigned).setZExtResult(!isSigned);

   std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);
@@ -2381,7 +2382,7 @@ SelectionDAGLegalize::ExpandSinCosLibCall(SDNode *Node,
   TargetLowering::CallLoweringInfo CLI(DAG);
   CLI.setDebugLoc(dl).setChain(InChain)
     .setCallee(TLI.getLibcallCallingConv(LC),
-               Type::getVoidTy(*DAG.getContext()), Callee, &Args, 0);
+               Type::getVoidTy(*DAG.getContext()), Callee, std::move(Args), 0);

   std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);
@@ -2999,8 +3000,8 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) {
     TargetLowering::CallLoweringInfo CLI(DAG);
     CLI.setDebugLoc(dl).setChain(Node->getOperand(0))
       .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
-                 DAG.getExternalSymbol("__sync_synchronize", TLI.getPointerTy()),
-                 &Args, 0);
+                 DAG.getExternalSymbol("__sync_synchronize",
+                                       TLI.getPointerTy()), std::move(Args), 0);

     std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
@@ -3098,7 +3099,8 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) {
     TargetLowering::CallLoweringInfo CLI(DAG);
     CLI.setDebugLoc(dl).setChain(Node->getOperand(0))
       .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
-                 DAG.getExternalSymbol("abort", TLI.getPointerTy()), &Args, 0);
+                 DAG.getExternalSymbol("abort", TLI.getPointerTy()),
+                 std::move(Args), 0);
     std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);

     Results.push_back(CallResult.second);
@@ -3152,65 +3154,10 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) {
                              Node->getOperand(0), Node->getValueType(0), dl);
     Results.push_back(Tmp1);
     break;
-  case ISD::FP_TO_SINT: {
-    EVT VT = Node->getOperand(0).getValueType();
-    EVT NVT = Node->getValueType(0);
-
-    // FIXME: Only f32 to i64 conversions are supported.
-    if (VT != MVT::f32 || NVT != MVT::i64)
-      break;
-
-    // Expand f32 -> i64 conversion
-    // This algorithm comes from compiler-rt's implementation of fixsfdi:
-    // https://github.com/llvm-mirror/compiler-rt/blob/master/lib/builtins/fixsfdi.c
-    EVT IntVT = EVT::getIntegerVT(*DAG.getContext(),
-                                  VT.getSizeInBits());
-    SDValue ExponentMask = DAG.getConstant(0x7F800000, IntVT);
-    SDValue ExponentLoBit = DAG.getConstant(23, IntVT);
-    SDValue Bias = DAG.getConstant(127, IntVT);
-    SDValue SignMask = DAG.getConstant(APInt::getSignBit(VT.getSizeInBits()),
-                                       IntVT);
-    SDValue SignLowBit = DAG.getConstant(VT.getSizeInBits() - 1, IntVT);
-    SDValue MantissaMask = DAG.getConstant(0x007FFFFF, IntVT);
-
-    SDValue Bits = DAG.getNode(ISD::BITCAST, dl, IntVT, Node->getOperand(0));
-
-    SDValue ExponentBits = DAG.getNode(ISD::SRL, dl, IntVT,
-        DAG.getNode(ISD::AND, dl, IntVT, Bits, ExponentMask),
-        DAG.getZExtOrTrunc(ExponentLoBit, dl, TLI.getShiftAmountTy(IntVT)));
-    SDValue Exponent = DAG.getNode(ISD::SUB, dl, IntVT, ExponentBits, Bias);
-
-    SDValue Sign = DAG.getNode(ISD::SRA, dl, IntVT,
-        DAG.getNode(ISD::AND, dl, IntVT, Bits, SignMask),
-        DAG.getZExtOrTrunc(SignLowBit, dl, TLI.getShiftAmountTy(IntVT)));
-    Sign = DAG.getSExtOrTrunc(Sign, dl, NVT);
-
-    SDValue R = DAG.getNode(ISD::OR, dl, IntVT,
-        DAG.getNode(ISD::AND, dl, IntVT, Bits, MantissaMask),
-        DAG.getConstant(0x00800000, IntVT));
-
-    R = DAG.getZExtOrTrunc(R, dl, NVT);
-
-
-    R = DAG.getSelectCC(dl, Exponent, ExponentLoBit,
-        DAG.getNode(ISD::SHL, dl, NVT, R,
-                    DAG.getZExtOrTrunc(
-                        DAG.getNode(ISD::SUB, dl, IntVT, Exponent, ExponentLoBit),
-                        dl, TLI.getShiftAmountTy(IntVT))),
-        DAG.getNode(ISD::SRL, dl, NVT, R,
-                    DAG.getZExtOrTrunc(
-                        DAG.getNode(ISD::SUB, dl, IntVT, ExponentLoBit, Exponent),
-                        dl, TLI.getShiftAmountTy(IntVT))),
-        ISD::SETGT);
-
-    SDValue Ret = DAG.getNode(ISD::SUB, dl, NVT,
-        DAG.getNode(ISD::XOR, dl, NVT, R, Sign),
-        Sign);
-
-    Results.push_back(DAG.getSelectCC(dl, Exponent, DAG.getConstant(0, IntVT),
-        DAG.getConstant(0, NVT), Ret, ISD::SETLT));
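// --- Editorial aside (not part of the patch): the removed DAG expansion above
// is easier to follow as the scalar routine it mirrors, compiler-rt's
// fixsfdi. A hedged model only; like the DAG version it ignores NaN and
// overflow edge cases.
#include <cstdint>
#include <cstring>

int64_t fixsfdi_model(float F) {
  uint32_t Bits;
  std::memcpy(&Bits, &F, sizeof(Bits));            // the ISD::BITCAST
  int64_t Sign = (int32_t)Bits >> 31;              // 0 or -1, sign-extended
  int32_t Exponent = (int32_t)((Bits & 0x7F800000) >> 23) - 127;
  uint64_t R = (Bits & 0x007FFFFF) | 0x00800000;   // mantissa + implicit one
  if (Exponent < 0)                                // |F| < 1 truncates to 0
    return 0;
  int64_t Mag = Exponent > 23 ? (int64_t)(R << (Exponent - 23))
                              : (int64_t)(R >> (23 - Exponent));
  return (Mag ^ Sign) - Sign;                      // conditionally negate
}
// --- End editorial aside.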
+  case ISD::FP_TO_SINT:
+    if (TLI.expandFP_TO_SINT(Node, Tmp1, DAG))
+      Results.push_back(Tmp1);
     break;
-  }
   case ISD::FP_TO_UINT: {
     SDValue True, False;
     EVT VT =  Node->getOperand(0).getValueType();
@@ -3567,12 +3514,28 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) {
                                       RTLIB::FMA_F80, RTLIB::FMA_F128,
                                       RTLIB::FMA_PPCF128));
     break;
-  case ISD::FP16_TO_FP32:
-    Results.push_back(ExpandLibCall(RTLIB::FPEXT_F16_F32, Node, false));
+  case ISD::FP16_TO_FP: {
+    if (Node->getValueType(0) == MVT::f32) {
+      Results.push_back(ExpandLibCall(RTLIB::FPEXT_F16_F32, Node, false));
+      break;
+    }
+
+    // We can extend to types bigger than f32 in two steps without changing the
+    // result. Since "f16 -> f32" is much more commonly available, give CodeGen
+    // the option of emitting that before resorting to a libcall.
+    SDValue Res =
+        DAG.getNode(ISD::FP16_TO_FP, dl, MVT::f32, Node->getOperand(0));
+    Results.push_back(
+        DAG.getNode(ISD::FP_EXTEND, dl, Node->getValueType(0), Res));
     break;
-  case ISD::FP32_TO_FP16:
-    Results.push_back(ExpandLibCall(RTLIB::FPROUND_F32_F16, Node, false));
+  }
+  case ISD::FP_TO_FP16: {
+    RTLIB::Libcall LC =
+        RTLIB::getFPROUND(Node->getOperand(0).getValueType(), MVT::f16);
+    assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unable to expand fp_to_fp16");
+    Results.push_back(ExpandLibCall(LC, Node, false));
     break;
+  }
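// --- Editorial aside (not part of the patch): why the two-step FP16_TO_FP
// expansion above is sound. Every f16 value is exactly representable in f32,
// and f32 -> f64 widening is exact, so f16 -> f32 -> f64 equals a direct
// extension. The extern below stands in for the RTLIB f16 libcall; its name
// is an assumption, not a symbol this patch defines.
#include <cstdint>

extern float extend_half_to_float(uint16_t Bits); // assumed libcall shim

double extend_half_to_double(uint16_t Bits) {
  float F = extend_half_to_float(Bits); // step 1: the common f16 -> f32 call
  return (double)F;                     // step 2: exact hardware widening
}
// --- End editorial aside.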
   case ISD::ConstantFP: {
     ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Node);
     // Check to see if this FP immediate is already legal.
@@ -3760,7 +3723,7 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) {
       SDValue SumSignNE = DAG.getSetCC(dl, OType, LHSSign, SumSign, ISD::SETNE);

       SDValue Cmp = DAG.getNode(ISD::AND, dl, OType, SignsMatch, SumSignNE);
-      Results.push_back(DAG.getBoolExtOrTrunc(Cmp, dl, ResultType));
+      Results.push_back(DAG.getBoolExtOrTrunc(Cmp, dl, ResultType, ResultType));
       break;
     }
   case ISD::UADDO:
@@ -3778,7 +3741,7 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) {
         = Node->getOpcode() == ISD::UADDO ? ISD::SETULT : ISD::SETUGT;

       SDValue SetCC = DAG.getSetCC(dl, SetCCType, Sum, LHS, CC);
-      Results.push_back(DAG.getBoolExtOrTrunc(SetCC, dl, ResultType));
+      Results.push_back(DAG.getBoolExtOrTrunc(SetCC, dl, ResultType, ResultType));
       break;
     }
   case ISD::UMULO:
@@ -3968,7 +3931,7 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) {
     // illegal; expand it into a SELECT_CC.
     EVT VT = Node->getValueType(0);
     int TrueValue;
-    switch (TLI.getBooleanContents(VT.isVector())) {
+    switch (TLI.getBooleanContents(Tmp1->getValueType(0))) {
     case TargetLowering::ZeroOrOneBooleanContent:
     case TargetLowering::UndefinedBooleanContent:
       TrueValue = 1;
diff --git a/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index 6b8fec6e824a..649dd7a349ff 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -85,7 +85,7 @@ void DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) {
     case ISD::FNEG:        R = SoftenFloatRes_FNEG(N); break;
     case ISD::FP_EXTEND:   R = SoftenFloatRes_FP_EXTEND(N); break;
     case ISD::FP_ROUND:    R = SoftenFloatRes_FP_ROUND(N); break;
-    case ISD::FP16_TO_FP32:R = SoftenFloatRes_FP16_TO_FP32(N); break;
+    case ISD::FP16_TO_FP:  R = SoftenFloatRes_FP16_TO_FP(N); break;
     case ISD::FPOW:        R = SoftenFloatRes_FPOW(N); break;
     case ISD::FPOWI:       R = SoftenFloatRes_FPOWI(N); break;
     case ISD::FREM:        R = SoftenFloatRes_FREM(N); break;
@@ -373,6 +373,13 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FNEG(SDNode *N) {
 SDValue DAGTypeLegalizer::SoftenFloatRes_FP_EXTEND(SDNode *N) {
   EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
   SDValue Op = N->getOperand(0);
+
+  // There's only a libcall for f16 -> f32, so proceed in two stages. Also, it's
+  // entirely possible for both f16 and f32 to be legal, so use the fully
+  // hard-float FP_EXTEND rather than FP16_TO_FP.
+  if (Op.getValueType() == MVT::f16 && N->getValueType(0) != MVT::f32)
+    Op = DAG.getNode(ISD::FP_EXTEND, SDLoc(N), MVT::f32, Op);
+
   RTLIB::Libcall LC = RTLIB::getFPEXT(Op.getValueType(), N->getValueType(0));
   assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_EXTEND!");
   return TLI.makeLibCall(DAG, LC, NVT, &Op, 1, false, SDLoc(N)).first;
@@ -380,16 +387,29 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FP_EXTEND(SDNode *N) {

 // FIXME: Should we just use 'normal' FP_EXTEND / FP_TRUNC instead of special
 // nodes?
-SDValue DAGTypeLegalizer::SoftenFloatRes_FP16_TO_FP32(SDNode *N) {
-  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+SDValue DAGTypeLegalizer::SoftenFloatRes_FP16_TO_FP(SDNode *N) {
+  EVT MidVT = TLI.getTypeToTransformTo(*DAG.getContext(), MVT::f32);
   SDValue Op = N->getOperand(0);
-  return TLI.makeLibCall(DAG, RTLIB::FPEXT_F16_F32, NVT, &Op, 1, false,
-                         SDLoc(N)).first;
+  SDValue Res32 = TLI.makeLibCall(DAG, RTLIB::FPEXT_F16_F32, MidVT, &Op, 1,
+                                  false, SDLoc(N)).first;
+  if (N->getValueType(0) == MVT::f32)
+    return Res32;
+
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  RTLIB::Libcall LC = RTLIB::getFPEXT(MVT::f32, N->getValueType(0));
+  assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_EXTEND!");
+  return TLI.makeLibCall(DAG, LC, NVT, &Res32, 1, false, SDLoc(N)).first;
 }

 SDValue DAGTypeLegalizer::SoftenFloatRes_FP_ROUND(SDNode *N) {
   EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
   SDValue Op = N->getOperand(0);
+  if (N->getValueType(0) == MVT::f16) {
+    // Semi-soften first, to FP_TO_FP16, so that targets which support f16 as a
+    // storage-only type get a chance to select things.
+    return DAG.getNode(ISD::FP_TO_FP16, SDLoc(N), NVT, Op);
+  }
+
   RTLIB::Libcall LC = RTLIB::getFPROUND(Op.getValueType(), N->getValueType(0));
   assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_ROUND!");
   return TLI.makeLibCall(DAG, LC, NVT, &Op, 1, false, SDLoc(N)).first;
@@ -498,6 +518,9 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FSUB(SDNode *N) {

 SDValue DAGTypeLegalizer::SoftenFloatRes_FTRUNC(SDNode *N) {
   EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  if (N->getValueType(0) == MVT::f16)
+    return DAG.getNode(ISD::FP_TO_FP16, SDLoc(N), NVT, N->getOperand(0));
+
   SDValue Op = GetSoftenedFloat(N->getOperand(0));
   return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
                                            RTLIB::TRUNC_F32,
@@ -625,10 +648,11 @@ bool DAGTypeLegalizer::SoftenFloatOperand(SDNode *N, unsigned OpNo) {

   case ISD::BITCAST:     Res = SoftenFloatOp_BITCAST(N); break;
   case ISD::BR_CC:       Res = SoftenFloatOp_BR_CC(N); break;
+  case ISD::FP_EXTEND:   Res = SoftenFloatOp_FP_EXTEND(N); break;
+  case ISD::FP_TO_FP16:  // Same as FP_ROUND for softening purposes
   case ISD::FP_ROUND:    Res = SoftenFloatOp_FP_ROUND(N); break;
   case ISD::FP_TO_SINT:  Res = SoftenFloatOp_FP_TO_SINT(N); break;
   case ISD::FP_TO_UINT:  Res = SoftenFloatOp_FP_TO_UINT(N); break;
-  case ISD::FP32_TO_FP16:Res = SoftenFloatOp_FP32_TO_FP16(N); break;
   case ISD::SELECT_CC:   Res = SoftenFloatOp_SELECT_CC(N); break;
   case ISD::SETCC:       Res = SoftenFloatOp_SETCC(N); break;
   case ISD::STORE:       Res = SoftenFloatOp_STORE(N, OpNo); break;
@@ -654,11 +678,32 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_BITCAST(SDNode *N) {
                      GetSoftenedFloat(N->getOperand(0)));
 }

+SDValue DAGTypeLegalizer::SoftenFloatOp_FP_EXTEND(SDNode *N) {
+  // If we get here, the result must be legal but the source illegal.
+  EVT SVT = N->getOperand(0).getValueType();
+  EVT RVT = N->getValueType(0);
+  SDValue Op = GetSoftenedFloat(N->getOperand(0));
+
+  if (SVT == MVT::f16)
+    return DAG.getNode(ISD::FP16_TO_FP, SDLoc(N), RVT, Op);
+
+  RTLIB::Libcall LC = RTLIB::getFPEXT(SVT, RVT);
+  assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_EXTEND libcall");
+
+  return TLI.makeLibCall(DAG, LC, RVT, &Op, 1, false, SDLoc(N)).first;
+}
+
+
 SDValue DAGTypeLegalizer::SoftenFloatOp_FP_ROUND(SDNode *N) {
+  // We actually deal with the partially-softened FP_TO_FP16 node too, which
+  // returns an i16 so doesn't meet the constraints necessary for FP_ROUND.
+  assert(N->getOpcode() == ISD::FP_ROUND || N->getOpcode() == ISD::FP_TO_FP16);
+
   EVT SVT = N->getOperand(0).getValueType();
   EVT RVT = N->getValueType(0);
+  EVT FloatRVT = N->getOpcode() == ISD::FP_TO_FP16 ? MVT::f16 : RVT;

-  RTLIB::Libcall LC = RTLIB::getFPROUND(SVT, RVT);
+  RTLIB::Libcall LC = RTLIB::getFPROUND(SVT, FloatRVT);
   assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_ROUND libcall");

   SDValue Op = GetSoftenedFloat(N->getOperand(0));
@@ -704,13 +749,6 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_FP_TO_UINT(SDNode *N) {
   return TLI.makeLibCall(DAG, LC, RVT, &Op, 1, false, SDLoc(N)).first;
 }

-SDValue DAGTypeLegalizer::SoftenFloatOp_FP32_TO_FP16(SDNode *N) {
-  EVT RVT = N->getValueType(0);
-  RTLIB::Libcall LC = RTLIB::FPROUND_F32_F16;
-  SDValue Op = GetSoftenedFloat(N->getOperand(0));
-  return TLI.makeLibCall(DAG, LC, RVT, &Op, 1, false, SDLoc(N)).first;
-}
-
 SDValue DAGTypeLegalizer::SoftenFloatOp_SELECT_CC(SDNode *N) {
   SDValue NewLHS = N->getOperand(0), NewRHS = N->getOperand(1);
   ISD::CondCode CCCode = cast<CondCodeSDNode>(N->getOperand(4))->get();
diff --git a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index a8603423e32a..44d9e3875b83 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -99,7 +99,7 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) {
   case ISD::FP_TO_SINT:
   case ISD::FP_TO_UINT:  Res = PromoteIntRes_FP_TO_XINT(N); break;

-  case ISD::FP32_TO_FP16:Res = PromoteIntRes_FP32_TO_FP16(N); break;
+  case ISD::FP_TO_FP16:  Res = PromoteIntRes_FP_TO_FP16(N); break;

   case ISD::AND:
   case ISD::OR:
@@ -225,10 +225,9 @@ SDValue DAGTypeLegalizer::PromoteIntRes_AtomicCmpSwap(AtomicSDNode *N,
       N->getOpcode(), SDLoc(N), N->getMemoryVT(), VTs, N->getChain(),
       N->getBasePtr(), Op2, Op3, N->getMemOperand(), N->getSuccessOrdering(),
       N->getFailureOrdering(), N->getSynchScope());
-  // Legalized the chain result - switch anything that used the old chain to
-  // use the new one.
-  unsigned ChainOp = N->getNumValues() - 1;
-  ReplaceValueWith(SDValue(N, ChainOp), Res.getValue(ChainOp));
+  // Update the use to N with the newly created Res.
+  for (unsigned i = 1, NumResults = N->getNumValues(); i < NumResults; ++i)
+    ReplaceValueWith(SDValue(N, i), Res.getValue(i));
   return Res;
 }

@@ -401,7 +400,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_FP_TO_XINT(SDNode *N) {
                      DAG.getValueType(N->getValueType(0).getScalarType()));
 }

-SDValue DAGTypeLegalizer::PromoteIntRes_FP32_TO_FP16(SDNode *N) {
+SDValue DAGTypeLegalizer::PromoteIntRes_FP_TO_FP16(SDNode *N) {
   EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
   SDLoc dl(N);

@@ -519,7 +518,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_VSELECT(SDNode *N) {
   EVT OpTy = N->getOperand(1).getValueType();

   // Promote all the way up to the canonical SetCC type.
-  Mask = PromoteTargetBoolean(Mask, getSetCCResultType(OpTy));
+  Mask = PromoteTargetBoolean(Mask, OpTy);
   SDValue LHS = GetPromotedInteger(N->getOperand(1));
   SDValue RHS = GetPromotedInteger(N->getOperand(2));
   return DAG.getNode(ISD::VSELECT, SDLoc(N),
@@ -826,7 +825,7 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) {
   case ISD::STORE:        Res = PromoteIntOp_STORE(cast<StoreSDNode>(N),
                                                    OpNo); break;
   case ISD::TRUNCATE:     Res = PromoteIntOp_TRUNCATE(N); break;
-  case ISD::FP16_TO_FP32:
+  case ISD::FP16_TO_FP:
   case ISD::UINT_TO_FP:   Res = PromoteIntOp_UINT_TO_FP(N); break;
   case ISD::ZERO_EXTEND:  Res = PromoteIntOp_ZERO_EXTEND(N); break;

@@ -919,8 +918,7 @@ SDValue DAGTypeLegalizer::PromoteIntOp_BRCOND(SDNode *N, unsigned OpNo) {
   assert(OpNo == 1 && "only know how to promote condition");

   // Promote all the way up to the canonical SetCC type.
-  EVT SVT = getSetCCResultType(MVT::Other);
-  SDValue Cond = PromoteTargetBoolean(N->getOperand(1), SVT);
+  SDValue Cond = PromoteTargetBoolean(N->getOperand(1), MVT::Other);

   // The chain (Op#0) and basic block destination (Op#2) are always legal types.
   return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), Cond,
@@ -1013,9 +1011,8 @@ SDValue DAGTypeLegalizer::PromoteIntOp_SELECT(SDNode *N, unsigned OpNo) {
   EVT OpTy = N->getOperand(1).getValueType();

   // Promote all the way up to the canonical SetCC type.
-  EVT SVT = getSetCCResultType(N->getOpcode() == ISD::SELECT ?
-                                   OpTy.getScalarType() : OpTy);
-  Cond = PromoteTargetBoolean(Cond, SVT);
+  EVT OpVT = N->getOpcode() == ISD::SELECT ? OpTy.getScalarType() : OpTy;
+  Cond = PromoteTargetBoolean(Cond, OpVT);

   return SDValue(DAG.UpdateNodeOperands(N, Cond, N->getOperand(1),
                                         N->getOperand(2)), 0);
@@ -2348,7 +2345,7 @@ void DAGTypeLegalizer::ExpandIntRes_XMULO(SDNode *N,

   TargetLowering::CallLoweringInfo CLI(DAG);
   CLI.setDebugLoc(dl).setChain(Chain)
-    .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Func, &Args, 0)
+    .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Func, std::move(Args), 0)
     .setSExtResult();

   std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
index 3971fc334982..bd7dacf2bc69 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
@@ -1054,7 +1054,7 @@ DAGTypeLegalizer::ExpandChainLibCall(RTLIB::Libcall LC,
   TargetLowering::CallLoweringInfo CLI(DAG);
   CLI.setDebugLoc(SDLoc(Node)).setChain(InChain)
-    .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, &Args, 0)
+    .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args), 0)
     .setSExtResult(isSigned).setZExtResult(!isSigned);

   std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);
@@ -1065,11 +1065,14 @@ DAGTypeLegalizer::ExpandChainLibCall(RTLIB::Libcall LC,
 /// PromoteTargetBoolean - Promote the given target boolean to a target boolean
 /// of the given type. A target boolean is an integer value, not necessarily of
 /// type i1, the bits of which conform to getBooleanContents.
-SDValue DAGTypeLegalizer::PromoteTargetBoolean(SDValue Bool, EVT VT) {
+///
+/// ValVT is the type of values that produced the boolean.
+SDValue DAGTypeLegalizer::PromoteTargetBoolean(SDValue Bool, EVT ValVT) {
   SDLoc dl(Bool);
+  EVT BoolVT = getSetCCResultType(ValVT);
   ISD::NodeType ExtendCode =
-    TargetLowering::getExtendForContent(TLI.getBooleanContents(VT.isVector()));
-  return DAG.getNode(ExtendCode, dl, VT, Bool);
+      TargetLowering::getExtendForContent(TLI.getBooleanContents(ValVT));
+  return DAG.getNode(ExtendCode, dl, BoolVT, Bool);
 }

 /// SplitInteger - Return the lower LoVT bits of Op in Lo and the upper HiVT
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index 04c200c9528c..117ff31e2e8b 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -167,7 +167,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
                                  SDNode *Node, bool isSigned);
   std::pair<SDValue, SDValue> ExpandAtomic(SDNode *Node);

-  SDValue PromoteTargetBoolean(SDValue Bool, EVT VT);
+  SDValue PromoteTargetBoolean(SDValue Bool, EVT ValVT);
   void ReplaceValueWith(SDValue From, SDValue To);
   void SplitInteger(SDValue Op, SDValue &Lo, SDValue &Hi);
   void SplitInteger(SDValue Op, EVT LoVT, EVT HiVT,
@@ -237,7 +237,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
   SDValue PromoteIntRes_CTTZ(SDNode *N);
   SDValue PromoteIntRes_EXTRACT_VECTOR_ELT(SDNode *N);
   SDValue PromoteIntRes_FP_TO_XINT(SDNode *N);
-  SDValue PromoteIntRes_FP32_TO_FP16(SDNode *N);
+  SDValue PromoteIntRes_FP_TO_FP16(SDNode *N);
   SDValue PromoteIntRes_INT_EXTEND(SDNode *N);
   SDValue PromoteIntRes_LOAD(LoadSDNode *N);
   SDValue PromoteIntRes_Overflow(SDNode *N);
@@ -403,7 +403,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
   SDValue SoftenFloatRes_FNEARBYINT(SDNode *N);
   SDValue SoftenFloatRes_FNEG(SDNode *N);
   SDValue SoftenFloatRes_FP_EXTEND(SDNode *N);
-  SDValue SoftenFloatRes_FP16_TO_FP32(SDNode *N);
+  SDValue SoftenFloatRes_FP16_TO_FP(SDNode *N);
   SDValue SoftenFloatRes_FP_ROUND(SDNode *N);
   SDValue SoftenFloatRes_FPOW(SDNode *N);
   SDValue SoftenFloatRes_FPOWI(SDNode *N);
@@ -425,10 +425,10 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
   bool SoftenFloatOperand(SDNode *N, unsigned OpNo);
   SDValue SoftenFloatOp_BITCAST(SDNode *N);
   SDValue SoftenFloatOp_BR_CC(SDNode *N);
+  SDValue SoftenFloatOp_FP_EXTEND(SDNode *N);
   SDValue SoftenFloatOp_FP_ROUND(SDNode *N);
   SDValue SoftenFloatOp_FP_TO_SINT(SDNode *N);
   SDValue SoftenFloatOp_FP_TO_UINT(SDNode *N);
-  SDValue SoftenFloatOp_FP32_TO_FP16(SDNode *N);
   SDValue SoftenFloatOp_SELECT_CC(SDNode *N);
   SDValue SoftenFloatOp_SETCC(SDNode *N);
   SDValue SoftenFloatOp_STORE(SDNode *N, unsigned OpNo);
@@ -570,6 +570,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
   void SplitVecRes_BUILD_VECTOR(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_CONCAT_VECTORS(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_EXTRACT_SUBVECTOR(SDNode *N, SDValue &Lo, SDValue &Hi);
+  void SplitVecRes_INSERT_SUBVECTOR(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_FPOWI(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_LOAD(LoadSDNode *N, SDValue &Lo, SDValue &Hi);
@@ -644,6 +645,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
   bool WidenVectorOperand(SDNode *N, unsigned OpNo);
   SDValue WidenVecOp_BITCAST(SDNode *N);
   SDValue WidenVecOp_CONCAT_VECTORS(SDNode *N);
+  SDValue WidenVecOp_EXTEND(SDNode *N);
   SDValue WidenVecOp_EXTRACT_VECTOR_ELT(SDNode *N);
   SDValue WidenVecOp_EXTRACT_SUBVECTOR(SDNode *N);
   SDValue WidenVecOp_STORE(SDNode* N);
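// --- Editorial aside (not part of the patch): what PromoteTargetBoolean now
// encodes. The type of the *compared values* (ValVT) selects both the boolean
// type and the extension kind, because a target may materialize scalar and
// vector booleans differently. A minimal model, not the LLVM API:
enum class BooleanContent { ZeroOrOne, ZeroOrNegativeOne, Undefined };
enum class ExtKind { ZExt, SExt, AnyExt };

ExtKind extendForContent(BooleanContent BC) {
  switch (BC) {
  case BooleanContent::ZeroOrOne:
    return ExtKind::ZExt; // 0/1 survives zero-extension
  case BooleanContent::ZeroOrNegativeOne:
    return ExtKind::SExt; // all-ones survives sign-extension
  case BooleanContent::Undefined:
    return ExtKind::AnyExt; // high bits are unspecified anyway
  }
  return ExtKind::AnyExt;
}
// --- End editorial aside.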
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
index f40ed76b01be..7e2f7b6ffb55 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
@@ -60,12 +60,15 @@ void DAGTypeLegalizer::ExpandRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi) {
     case TargetLowering::TypeExpandFloat:
       // Convert the expanded pieces of the input.
       GetExpandedOp(InOp, Lo, Hi);
+      if (TLI.hasBigEndianPartOrdering(InVT) !=
+          TLI.hasBigEndianPartOrdering(OutVT))
+        std::swap(Lo, Hi);
       Lo = DAG.getNode(ISD::BITCAST, dl, NOutVT, Lo);
       Hi = DAG.getNode(ISD::BITCAST, dl, NOutVT, Hi);
       return;
     case TargetLowering::TypeSplitVector:
       GetSplitVector(InOp, Lo, Hi);
-      if (TLI.isBigEndian())
+      if (TLI.hasBigEndianPartOrdering(OutVT))
        std::swap(Lo, Hi);
       Lo = DAG.getNode(ISD::BITCAST, dl, NOutVT, Lo);
       Hi = DAG.getNode(ISD::BITCAST, dl, NOutVT, Hi);
@@ -82,7 +85,7 @@ void DAGTypeLegalizer::ExpandRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi) {
       EVT LoVT, HiVT;
       std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(InVT);
       std::tie(Lo, Hi) = DAG.SplitVector(InOp, dl, LoVT, HiVT);
-      if (TLI.isBigEndian())
+      if (TLI.hasBigEndianPartOrdering(OutVT))
         std::swap(Lo, Hi);
       Lo = DAG.getNode(ISD::BITCAST, dl, NOutVT, Lo);
       Hi = DAG.getNode(ISD::BITCAST, dl, NOutVT, Hi);
@@ -176,7 +179,7 @@ void DAGTypeLegalizer::ExpandRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi) {
                    false, false, MinAlign(Alignment, IncrementSize));

   // Handle endianness of the load.
-  if (TLI.isBigEndian())
+  if (TLI.hasBigEndianPartOrdering(OutVT))
     std::swap(Lo, Hi);
 }

@@ -245,7 +248,8 @@ void DAGTypeLegalizer::ExpandRes_NormalLoad(SDNode *N, SDValue &Lo,
   SDLoc dl(N);

   LoadSDNode *LD = cast<LoadSDNode>(N);
-  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), LD->getValueType(0));
+  EVT ValueVT = LD->getValueType(0);
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), ValueVT);
   SDValue Chain = LD->getChain();
   SDValue Ptr = LD->getBasePtr();
   unsigned Alignment = LD->getAlignment();
@@ -275,7 +279,7 @@ void DAGTypeLegalizer::ExpandRes_NormalLoad(SDNode *N, SDValue &Lo,
                       Hi.getValue(1));

   // Handle endianness of the load.
-  if (TLI.isBigEndian())
+  if (TLI.hasBigEndianPartOrdering(ValueVT))
     std::swap(Lo, Hi);

   // Modified the chain - switch anything that used the old chain to use
@@ -295,7 +299,7 @@ void DAGTypeLegalizer::ExpandRes_VAARG(SDNode *N, SDValue &Lo, SDValue &Hi) {
   Hi = DAG.getVAArg(NVT, dl, Lo.getValue(1), Ptr, N->getOperand(2), 0);

   // Handle endianness of the load.
-  if (TLI.isBigEndian())
+  if (TLI.hasBigEndianPartOrdering(OVT))
     std::swap(Lo, Hi);

   // Modified the chain - switch anything that used the old chain to use
@@ -459,8 +463,8 @@ SDValue DAGTypeLegalizer::ExpandOp_NormalStore(SDNode *N, unsigned OpNo) {
   SDLoc dl(N);

   StoreSDNode *St = cast<StoreSDNode>(N);
-  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(),
-                                     St->getValue().getValueType());
+  EVT ValueVT = St->getValue().getValueType();
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), ValueVT);
   SDValue Chain = St->getChain();
   SDValue Ptr = St->getBasePtr();
   unsigned Alignment = St->getAlignment();
@@ -474,7 +478,7 @@ SDValue DAGTypeLegalizer::ExpandOp_NormalStore(SDNode *N, unsigned OpNo) {
   SDValue Lo, Hi;
   GetExpandedOp(St->getValue(), Lo, Hi);

-  if (TLI.isBigEndian())
+  if (TLI.hasBigEndianPartOrdering(ValueVT))
     std::swap(Lo, Hi);

   Lo = DAG.getStore(Chain, dl, Lo, Ptr, St->getPointerInfo(),
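// --- Editorial aside (not part of the patch): the part-ordering rule being
// threaded through hasBigEndianPartOrdering() above. When a value is expanded
// into a (Lo, Hi) pair, big-endian part ordering places the most significant
// half at the lower address, hence the std::swap before the paired stores.
// A scalar model of ExpandOp_NormalStore's swap:
#include <cstdint>
#include <cstring>
#include <utility>

void storeExpanded(uint64_t V, uint32_t *Mem, bool BigEndianParts) {
  uint32_t Lo = (uint32_t)V;         // least significant half
  uint32_t Hi = (uint32_t)(V >> 32); // most significant half
  if (BigEndianParts)
    std::swap(Lo, Hi);               // Hi lands at the lower address
  std::memcpy(&Mem[0], &Lo, sizeof(Lo));
  std::memcpy(&Mem[1], &Hi, sizeof(Hi));
}
// --- End editorial aside.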
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 898cd29c9141..507e7ffb1d45 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -37,12 +37,12 @@ class VectorLegalizer {
   const TargetLowering &TLI;
   bool Changed; // Keep track of whether anything changed

-  /// LegalizedNodes - For nodes that are of legal width, and that have more
-  /// than one use, this map indicates what regularized operand to use. This
-  /// allows us to avoid legalizing the same thing more than once.
+  /// For nodes that are of legal width, and that have more than one use, this
+  /// map indicates what regularized operand to use. This allows us to avoid
+  /// legalizing the same thing more than once.
   SmallDenseMap<SDValue, SDValue> LegalizedNodes;

-  // Adds a node to the translation cache
+  /// \brief Adds a node to the translation cache.
   void AddLegalizedOperand(SDValue From, SDValue To) {
     LegalizedNodes.insert(std::make_pair(From, To));
     // If someone requests legalization of the new node, return itself.
@@ -50,41 +50,81 @@ class VectorLegalizer {
       LegalizedNodes.insert(std::make_pair(To, To));
   }

-  // Legalizes the given node
+  /// \brief Legalizes the given node.
   SDValue LegalizeOp(SDValue Op);
-  // Assuming the node is legal, "legalize" the results
+
+  /// \brief Assuming the node is legal, "legalize" the results.
   SDValue TranslateLegalizeResults(SDValue Op, SDValue Result);
-  // Implements unrolling a VSETCC.
+
+  /// \brief Implements unrolling a VSETCC.
   SDValue UnrollVSETCC(SDValue Op);
-  // Implements expansion for FNEG; falls back to UnrollVectorOp if FSUB
-  // isn't legal.
-  // Implements expansion for UINT_TO_FLOAT; falls back to UnrollVectorOp if
-  // SINT_TO_FLOAT and SHR on vectors isn't legal.
+
+  /// \brief Implement expand-based legalization of vector operations.
+  ///
+  /// This is just a high-level routine to dispatch to specific code paths for
+  /// operations to legalize them.
+  SDValue Expand(SDValue Op);
+
+  /// \brief Implements expansion for FNEG; falls back to UnrollVectorOp if
+  /// FSUB isn't legal.
+  ///
+  /// Implements expansion for UINT_TO_FLOAT; falls back to UnrollVectorOp if
+  /// SINT_TO_FLOAT and SHR on vectors isn't legal.
   SDValue ExpandUINT_TO_FLOAT(SDValue Op);
-  // Implement expansion for SIGN_EXTEND_INREG using SRL and SRA.
+
+  /// \brief Implement expansion for SIGN_EXTEND_INREG using SRL and SRA.
   SDValue ExpandSEXTINREG(SDValue Op);
-  // Expand bswap of vectors into a shuffle if legal.
+
+  /// \brief Implement expansion for ANY_EXTEND_VECTOR_INREG.
+  ///
+  /// Shuffles the low lanes of the operand into place and bitcasts to the proper
+  /// type. The contents of the bits in the extended part of each element are
+  /// undef.
+  SDValue ExpandANY_EXTEND_VECTOR_INREG(SDValue Op);
+
+  /// \brief Implement expansion for SIGN_EXTEND_VECTOR_INREG.
+  ///
+  /// Shuffles the low lanes of the operand into place, bitcasts to the proper
+  /// type, then shifts left and arithmetic shifts right to introduce a sign
+  /// extension.
+  SDValue ExpandSIGN_EXTEND_VECTOR_INREG(SDValue Op);
+
+  /// \brief Implement expansion for ZERO_EXTEND_VECTOR_INREG.
+  ///
+  /// Shuffles the low lanes of the operand into place and blends zeros into
+  /// the remaining lanes, finally bitcasting to the proper type.
+  SDValue ExpandZERO_EXTEND_VECTOR_INREG(SDValue Op);
+
+  /// \brief Expand bswap of vectors into a shuffle if legal.
   SDValue ExpandBSWAP(SDValue Op);
-  // Implement vselect in terms of XOR, AND, OR when blend is not supported
-  // by the target.
+
+  /// \brief Implement vselect in terms of XOR, AND, OR when blend is not
+  /// supported by the target.
   SDValue ExpandVSELECT(SDValue Op);
   SDValue ExpandSELECT(SDValue Op);
   SDValue ExpandLoad(SDValue Op);
   SDValue ExpandStore(SDValue Op);
   SDValue ExpandFNEG(SDValue Op);
-  // Implements vector promotion; this is essentially just bitcasting the
-  // operands to a different type and bitcasting the result back to the
-  // original type.
-  SDValue PromoteVectorOp(SDValue Op);
-  // Implements [SU]INT_TO_FP vector promotion; this is a [zs]ext of the input
-  // operand to the next size up.
-  SDValue PromoteVectorOpINT_TO_FP(SDValue Op);
-  // Implements FP_TO_[SU]INT vector promotion of the result type; it is
-  // promoted to the next size up integer type. The result is then truncated
-  // back to the original type.
-  SDValue PromoteVectorOpFP_TO_INT(SDValue Op, bool isSigned);
-
- public:
+
+  /// \brief Implements vector promotion.
+  ///
+  /// This is essentially just bitcasting the operands to a different type and
+  /// bitcasting the result back to the original type.
+  SDValue Promote(SDValue Op);
+
+  /// \brief Implements [SU]INT_TO_FP vector promotion.
+  ///
+  /// This is a [zs]ext of the input operand to the next size up.
+  SDValue PromoteINT_TO_FP(SDValue Op);
+
+  /// \brief Implements FP_TO_[SU]INT vector promotion of the result type.
+  ///
+  /// It is promoted to the next size up integer type. The result is then
+  /// truncated back to the original type.
+  SDValue PromoteFP_TO_INT(SDValue Op, bool isSigned);
+
+public:
+  /// \brief Begin legalizing the vector operations in the DAG.
   bool Run();

   VectorLegalizer(SelectionDAG& dag) :
       DAG(dag), TLI(dag.getTargetLoweringInfo()), Changed(false) {}
@@ -254,6 +294,9 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
   case ISD::FP_EXTEND:
   case ISD::FMA:
   case ISD::SIGN_EXTEND_INREG:
+  case ISD::ANY_EXTEND_VECTOR_INREG:
+  case ISD::SIGN_EXTEND_VECTOR_INREG:
+  case ISD::ZERO_EXTEND_VECTOR_INREG:
     QueryType = Node->getValueType(0);
     break;
   case ISD::FP_ROUND_INREG:
@@ -267,27 +310,11 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {

   switch (TLI.getOperationAction(Node->getOpcode(), QueryType)) {
   case TargetLowering::Promote:
-    switch (Op.getOpcode()) {
-    default:
-      // "Promote" the operation by bitcasting
-      Result = PromoteVectorOp(Op);
-      Changed = true;
-      break;
-    case ISD::SINT_TO_FP:
-    case ISD::UINT_TO_FP:
-      // "Promote" the operation by extending the operand.
-      Result = PromoteVectorOpINT_TO_FP(Op);
-      Changed = true;
-      break;
-    case ISD::FP_TO_UINT:
-    case ISD::FP_TO_SINT:
-      // Promote the operation by extending the operand.
-      Result = PromoteVectorOpFP_TO_INT(Op, Op->getOpcode() == ISD::FP_TO_SINT);
-      Changed = true;
-      break;
-    }
+    Result = Promote(Op);
+    Changed = true;
+    break;
+  case TargetLowering::Legal:
     break;
-  case TargetLowering::Legal: break;
   case TargetLowering::Custom: {
     SDValue Tmp1 = TLI.LowerOperation(Op, DAG);
     if (Tmp1.getNode()) {
@@ -297,23 +324,7 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
     // FALL THROUGH
   }
   case TargetLowering::Expand:
-    if (Node->getOpcode() == ISD::SIGN_EXTEND_INREG)
-      Result = ExpandSEXTINREG(Op);
-    else if (Node->getOpcode() == ISD::BSWAP)
-      Result = ExpandBSWAP(Op);
-    else if (Node->getOpcode() == ISD::VSELECT)
-      Result = ExpandVSELECT(Op);
-    else if (Node->getOpcode() == ISD::SELECT)
-      Result = ExpandSELECT(Op);
-    else if (Node->getOpcode() == ISD::UINT_TO_FP)
-      Result = ExpandUINT_TO_FLOAT(Op);
-    else if (Node->getOpcode() == ISD::FNEG)
-      Result = ExpandFNEG(Op);
-    else if (Node->getOpcode() == ISD::SETCC)
-      Result = UnrollVSETCC(Op);
-    else
-      Result = DAG.UnrollVectorOp(Op.getNode());
-    break;
+    Result = Expand(Op);
   }

   // Make sure that the generated code is itself legal.
@@ -328,10 +339,23 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
   return Result;
 }

-SDValue VectorLegalizer::PromoteVectorOp(SDValue Op) {
-  // Vector "promotion" is basically just bitcasting and doing the operation
-  // in a different type.  For example, x86 promotes ISD::AND on v2i32 to
-  // v1i64.
+SDValue VectorLegalizer::Promote(SDValue Op) {
+  // For a few operations there is a specific concept for promotion based on
+  // the operand's type.
+  switch (Op.getOpcode()) {
+  case ISD::SINT_TO_FP:
+  case ISD::UINT_TO_FP:
+    // "Promote" the operation by extending the operand.
+    return PromoteINT_TO_FP(Op);
+  case ISD::FP_TO_UINT:
+  case ISD::FP_TO_SINT:
+    // Promote the operation by extending the operand.
+    return PromoteFP_TO_INT(Op, Op->getOpcode() == ISD::FP_TO_SINT);
+  }
+
+  // The rest of the time, vector "promotion" is basically just bitcasting and
+  // doing the operation in a different type.  For example, x86 promotes
+  // ISD::AND on v2i32 to v1i64.
   MVT VT = Op.getSimpleValueType();
   assert(Op.getNode()->getNumValues() == 1 &&
          "Can't promote a vector with multiple results!");
@@ -351,7 +375,7 @@ SDValue VectorLegalizer::PromoteVectorOp(SDValue Op) {
   return DAG.getNode(ISD::BITCAST, dl, VT, Op);
 }

-SDValue VectorLegalizer::PromoteVectorOpINT_TO_FP(SDValue Op) {
+SDValue VectorLegalizer::PromoteINT_TO_FP(SDValue Op) {
   // INT_TO_FP operations may require the input operand be promoted even
   // when the type is otherwise legal.
   EVT VT = Op.getOperand(0).getValueType();
@@ -387,7 +411,7 @@
   // elements and then truncate the result.  This is different from the default
   // PromoteVector which uses bitcast to promote thus assuming that the
   // promoted vector type has the same overall size.
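// --- Editorial aside (not part of the patch): the two promotion strategies
// the new Promote() dispatch separates, modeled on scalars. Generic promotion
// is a pure bitcast (same bits, wider type, e.g. v2i32 -> v1i64 for AND);
// [SU]INT_TO_FP promotion must instead widen the integer *value* first.
#include <cstdint>

double promoteSIntToFP(int16_t V) {
  int32_t Wide = V;    // sign-extend the operand to the next size up
  return (double)Wide; // converting the widened value changes nothing
}
// --- End editorial aside.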
-SDValue VectorLegalizer::PromoteVectorOpFP_TO_INT(SDValue Op, bool isSigned) {
+SDValue VectorLegalizer::PromoteFP_TO_INT(SDValue Op, bool isSigned) {
   assert(Op.getNode()->getNumValues() == 1 &&
          "Can't promote a vector with multiple results!");
   EVT VT = Op.getValueType();
@@ -609,6 +633,33 @@ SDValue VectorLegalizer::ExpandStore(SDValue Op) {
   return TF;
 }

+SDValue VectorLegalizer::Expand(SDValue Op) {
+  switch (Op->getOpcode()) {
+  case ISD::SIGN_EXTEND_INREG:
+    return ExpandSEXTINREG(Op);
+  case ISD::ANY_EXTEND_VECTOR_INREG:
+    return ExpandANY_EXTEND_VECTOR_INREG(Op);
+  case ISD::SIGN_EXTEND_VECTOR_INREG:
+    return ExpandSIGN_EXTEND_VECTOR_INREG(Op);
+  case ISD::ZERO_EXTEND_VECTOR_INREG:
+    return ExpandZERO_EXTEND_VECTOR_INREG(Op);
+  case ISD::BSWAP:
+    return ExpandBSWAP(Op);
+  case ISD::VSELECT:
+    return ExpandVSELECT(Op);
+  case ISD::SELECT:
+    return ExpandSELECT(Op);
+  case ISD::UINT_TO_FP:
+    return ExpandUINT_TO_FLOAT(Op);
+  case ISD::FNEG:
+    return ExpandFNEG(Op);
+  case ISD::SETCC:
+    return UnrollVSETCC(Op);
+  default:
+    return DAG.UnrollVectorOp(Op.getNode());
+  }
+}
+
 SDValue VectorLegalizer::ExpandSELECT(SDValue Op) {
   // Lower a select instruction where the condition is a scalar and the
   // operands are vectors. Lower this select to VSELECT and implement it
@@ -686,6 +737,85 @@ SDValue VectorLegalizer::ExpandSEXTINREG(SDValue Op) {
   return DAG.getNode(ISD::SRA, DL, VT, Op, ShiftSz);
 }

+// Generically expand a vector anyext in register to a shuffle of the relevant
+// lanes into the appropriate locations, with other lanes left undef.
+SDValue VectorLegalizer::ExpandANY_EXTEND_VECTOR_INREG(SDValue Op) {
+  SDLoc DL(Op);
+  EVT VT = Op.getValueType();
+  int NumElements = VT.getVectorNumElements();
+  SDValue Src = Op.getOperand(0);
+  EVT SrcVT = Src.getValueType();
+  int NumSrcElements = SrcVT.getVectorNumElements();
+
+  // Build a base mask of undef shuffles.
+  SmallVector<int, 16> ShuffleMask;
+  ShuffleMask.resize(NumSrcElements, -1);
+
+  // Place the extended lanes into the correct locations.
+  int ExtLaneScale = NumSrcElements / NumElements;
+  int EndianOffset = TLI.isBigEndian() ? ExtLaneScale - 1 : 0;
+  for (int i = 0; i < NumElements; ++i)
+    ShuffleMask[i * ExtLaneScale + EndianOffset] = i;
+
+  return DAG.getNode(
+      ISD::BITCAST, DL, VT,
+      DAG.getVectorShuffle(SrcVT, DL, Src, DAG.getUNDEF(SrcVT), ShuffleMask));
+}
+
+SDValue VectorLegalizer::ExpandSIGN_EXTEND_VECTOR_INREG(SDValue Op) {
+  SDLoc DL(Op);
+  EVT VT = Op.getValueType();
+  SDValue Src = Op.getOperand(0);
+  EVT SrcVT = Src.getValueType();
+
+  // First build an any-extend node which can be legalized above when we
+  // recurse through it.
+  Op = DAG.getAnyExtendVectorInReg(Src, DL, VT);
+
+  // Now we need sign extend. Do this by shifting the elements. Even if these
+  // aren't legal operations, they have a better chance of being legalized
+  // without full scalarization than the sign extension does.
+  unsigned EltWidth = VT.getVectorElementType().getSizeInBits();
+  unsigned SrcEltWidth = SrcVT.getVectorElementType().getSizeInBits();
+  SDValue ShiftAmount = DAG.getConstant(EltWidth - SrcEltWidth, VT);
+  return DAG.getNode(ISD::SRA, DL, VT,
+                     DAG.getNode(ISD::SHL, DL, VT, Op, ShiftAmount),
+                     ShiftAmount);
+}
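// --- Editorial aside (not part of the patch): the scalar analogue of the
// shl/sra pair ExpandSIGN_EXTEND_VECTOR_INREG emits. Shifting left until the
// source sign bit reaches the lane's sign bit, then arithmetic-shifting back,
// replicates the sign across the widened element.
#include <cstdint>

int32_t signExtendInReg(int32_t Lane, unsigned SrcEltWidth) {
  const unsigned Shift = 32 - SrcEltWidth; // EltWidth - SrcEltWidth
  return (int32_t)((uint32_t)Lane << Shift) >> Shift;
}
// --- End editorial aside.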
+
+// Generically expand a vector zext in register to a shuffle of the relevant
+// lanes into the appropriate locations, a blend of zero into the high bits,
+// and a bitcast to the wider element type.
+SDValue VectorLegalizer::ExpandZERO_EXTEND_VECTOR_INREG(SDValue Op) {
+  SDLoc DL(Op);
+  EVT VT = Op.getValueType();
+  int NumElements = VT.getVectorNumElements();
+  SDValue Src = Op.getOperand(0);
+  EVT SrcVT = Src.getValueType();
+  int NumSrcElements = SrcVT.getVectorNumElements();
+
+  // Build up a zero vector to blend into this one.
+  EVT SrcScalarVT = SrcVT.getScalarType();
+  SDValue ScalarZero = DAG.getTargetConstant(0, SrcScalarVT);
+  SmallVector<SDValue, 8> BuildVectorOperands(NumSrcElements, ScalarZero);
+  SDValue Zero = DAG.getNode(ISD::BUILD_VECTOR, DL, SrcVT, BuildVectorOperands);
+
+  // Shuffle the incoming lanes into the correct position, and pull all other
+  // lanes from the zero vector.
+  SmallVector<int, 16> ShuffleMask;
+  ShuffleMask.reserve(NumSrcElements);
+  for (int i = 0; i < NumSrcElements; ++i)
+    ShuffleMask.push_back(i);
+
+  int ExtLaneScale = NumSrcElements / NumElements;
+  int EndianOffset = TLI.isBigEndian() ? ExtLaneScale - 1 : 0;
+  for (int i = 0; i < NumElements; ++i)
+    ShuffleMask[i * ExtLaneScale + EndianOffset] = NumSrcElements + i;
+
+  return DAG.getNode(ISD::BITCAST, DL, VT,
+                     DAG.getVectorShuffle(SrcVT, DL, Zero, Src, ShuffleMask));
+}
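// --- Editorial aside (not part of the patch): the shuffle mask the zext
// expansion above builds for v4i8 -> v4i16 on a little-endian target. The
// zero vector is shuffle operand 0 (lanes 0..7) and the source is operand 1
// (lanes 8..15), so every other result byte is pulled from the source.
#include <vector>

std::vector<int> zextInRegMask(int NumElements, int NumSrcElements) {
  std::vector<int> Mask(NumSrcElements);
  for (int i = 0; i < NumSrcElements; ++i)
    Mask[i] = i;                                 // default: take zeros
  int ExtLaneScale = NumSrcElements / NumElements;
  for (int i = 0; i < NumElements; ++i)
    Mask[i * ExtLaneScale] = NumSrcElements + i; // EndianOffset == 0 (LE)
  return Mask; // for (4, 8): {8, 1, 9, 3, 10, 5, 11, 7}
}
// --- End editorial aside.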
+
 SDValue VectorLegalizer::ExpandBSWAP(SDValue Op) {
   EVT VT = Op.getValueType();

@@ -729,9 +859,9 @@ SDValue VectorLegalizer::ExpandVSELECT(SDValue Op) {
   // FIXME: Sign extend 1 to all ones if that's legal on the target.
   if (TLI.getOperationAction(ISD::AND, VT) == TargetLowering::Expand ||
       TLI.getOperationAction(ISD::XOR, VT) == TargetLowering::Expand ||
-      TLI.getOperationAction(ISD::OR, VT) == TargetLowering::Expand ||
-      TLI.getBooleanContents(true) !=
-      TargetLowering::ZeroOrNegativeOneBooleanContent)
+      TLI.getOperationAction(ISD::OR, VT) == TargetLowering::Expand ||
+      TLI.getBooleanContents(Op1.getValueType()) !=
+          TargetLowering::ZeroOrNegativeOneBooleanContent)
     return DAG.UnrollVectorOp(Op.getNode());

   // If the mask and the type are different sizes, unroll the vector op. This
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 368eba396603..f77c592fddb7 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -257,8 +257,26 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_SCALAR_TO_VECTOR(SDNode *N) {
 SDValue DAGTypeLegalizer::ScalarizeVecRes_VSELECT(SDNode *N) {
   SDValue Cond = GetScalarizedVector(N->getOperand(0));
   SDValue LHS = GetScalarizedVector(N->getOperand(1));
-  TargetLowering::BooleanContent ScalarBool = TLI.getBooleanContents(false);
-  TargetLowering::BooleanContent VecBool = TLI.getBooleanContents(true);
+  TargetLowering::BooleanContent ScalarBool =
+      TLI.getBooleanContents(false, false);
+  TargetLowering::BooleanContent VecBool = TLI.getBooleanContents(true, false);
+
+  // If integer and float booleans have different contents then we can't
+  // reliably optimize in all cases. There is a full explanation for this in
+  // DAGCombiner::visitSELECT() where the same issue affects folding
+  // (select C, 0, 1) to (xor C, 1).
+  if (TLI.getBooleanContents(false, false) !=
+      TLI.getBooleanContents(false, true)) {
+    // At least try the common case where the boolean is generated by a
+    // comparison.
+    if (Cond->getOpcode() == ISD::SETCC) {
+      EVT OpVT = Cond->getOperand(0)->getValueType(0);
+      ScalarBool = TLI.getBooleanContents(OpVT.getScalarType());
+      VecBool = TLI.getBooleanContents(OpVT);
+    } else
+      ScalarBool = TargetLowering::UndefinedBooleanContent;
+  }
+
   if (ScalarBool != VecBool) {
     EVT CondVT = Cond.getValueType();
     switch (ScalarBool) {
@@ -357,7 +375,7 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_VSETCC(SDNode *N) {
   // Vectors may have a different boolean contents to scalars.  Promote the
   // value appropriately.
   ISD::NodeType ExtendCode =
-    TargetLowering::getExtendForContent(TLI.getBooleanContents(true));
+      TargetLowering::getExtendForContent(TLI.getBooleanContents(OpVT));
   return DAG.getNode(ExtendCode, DL, NVT, Res);
 }

@@ -545,6 +563,7 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::BUILD_VECTOR:      SplitVecRes_BUILD_VECTOR(N, Lo, Hi); break;
   case ISD::CONCAT_VECTORS:    SplitVecRes_CONCAT_VECTORS(N, Lo, Hi); break;
   case ISD::EXTRACT_SUBVECTOR: SplitVecRes_EXTRACT_SUBVECTOR(N, Lo, Hi); break;
+  case ISD::INSERT_SUBVECTOR:  SplitVecRes_INSERT_SUBVECTOR(N, Lo, Hi); break;
   case ISD::FP_ROUND_INREG:    SplitVecRes_InregOp(N, Lo, Hi); break;
   case ISD::FPOWI:             SplitVecRes_FPOWI(N, Lo, Hi); break;
   case ISD::INSERT_VECTOR_ELT: SplitVecRes_INSERT_VECTOR_ELT(N, Lo, Hi); break;
@@ -765,6 +784,43 @@ void DAGTypeLegalizer::SplitVecRes_EXTRACT_SUBVECTOR(SDNode *N, SDValue &Lo,
                    TLI.getVectorIdxTy()));
 }

+void DAGTypeLegalizer::SplitVecRes_INSERT_SUBVECTOR(SDNode *N, SDValue &Lo,
+                                                    SDValue &Hi) {
+  SDValue Vec = N->getOperand(0);
+  SDValue SubVec = N->getOperand(1);
+  SDValue Idx = N->getOperand(2);
+  SDLoc dl(N);
+  GetSplitVector(Vec, Lo, Hi);
+
+  // Spill the vector to the stack.
+  EVT VecVT = Vec.getValueType();
+  EVT SubVecVT = VecVT.getVectorElementType();
+  SDValue StackPtr = DAG.CreateStackTemporary(VecVT);
+  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr,
+                               MachinePointerInfo(), false, false, 0);
+
+  // Store the new subvector into the specified index.
+  SDValue SubVecPtr = GetVectorElementPointer(StackPtr, SubVecVT, Idx);
+  Type *VecType = VecVT.getTypeForEVT(*DAG.getContext());
+  unsigned Alignment = TLI.getDataLayout()->getPrefTypeAlignment(VecType);
+  Store = DAG.getStore(Store, dl, SubVec, SubVecPtr, MachinePointerInfo(),
+                       false, false, 0);
+
+  // Load the Lo part from the stack slot.
+  Lo = DAG.getLoad(Lo.getValueType(), dl, Store, StackPtr, MachinePointerInfo(),
+                   false, false, false, 0);
+
+  // Increment the pointer to the other part.
+  unsigned IncrementSize = Lo.getValueType().getSizeInBits() / 8;
+  StackPtr =
+      DAG.getNode(ISD::ADD, dl, StackPtr.getValueType(), StackPtr,
+                  DAG.getConstant(IncrementSize, StackPtr.getValueType()));
+
+  // Load the Hi part from the stack slot.
+  Hi = DAG.getLoad(Hi.getValueType(), dl, Store, StackPtr, MachinePointerInfo(),
+                   false, false, false, MinAlign(Alignment, IncrementSize));
+}
+
 void DAGTypeLegalizer::SplitVecRes_FPOWI(SDNode *N, SDValue &Lo,
                                          SDValue &Hi) {
   SDLoc dl(N);
@@ -1511,7 +1567,6 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {

   case ISD::ADD:
   case ISD::AND:
-  case ISD::BSWAP:
   case ISD::MUL:
   case ISD::MULHS:
   case ISD::MULHU:
@@ -1558,6 +1613,7 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
     Res = WidenVecRes_Convert(N);
     break;

+  case ISD::BSWAP:
   case ISD::CTLZ:
   case ISD::CTPOP:
   case ISD::CTTZ:
@@ -2343,15 +2399,18 @@ bool DAGTypeLegalizer::WidenVectorOperand(SDNode *N, unsigned OpNo) {
   case ISD::STORE:              Res = WidenVecOp_STORE(N); break;
   case ISD::SETCC:              Res = WidenVecOp_SETCC(N); break;

+  case ISD::ANY_EXTEND:
+  case ISD::SIGN_EXTEND:
+  case ISD::ZERO_EXTEND:
+    Res = WidenVecOp_EXTEND(N);
+    break;
+
   case ISD::FP_EXTEND:
   case ISD::FP_TO_SINT:
   case ISD::FP_TO_UINT:
   case ISD::SINT_TO_FP:
   case ISD::UINT_TO_FP:
   case ISD::TRUNCATE:
-  case ISD::SIGN_EXTEND:
-  case ISD::ZERO_EXTEND:
-  case ISD::ANY_EXTEND:
     Res = WidenVecOp_Convert(N);
     break;
   }
@@ -2372,6 +2431,68 @@ bool DAGTypeLegalizer::WidenVectorOperand(SDNode *N, unsigned OpNo) {
   return false;
 }

+SDValue DAGTypeLegalizer::WidenVecOp_EXTEND(SDNode *N) {
+  SDLoc DL(N);
+  EVT VT = N->getValueType(0);
+
+  SDValue InOp = N->getOperand(0);
+  // If some legalization strategy other than widening is used on the operand,
+  // we can't safely assume that just extending the low lanes is the correct
+  // transformation.
+  if (getTypeAction(InOp.getValueType()) != TargetLowering::TypeWidenVector)
+    return WidenVecOp_Convert(N);
+  InOp = GetWidenedVector(InOp);
+  assert(VT.getVectorNumElements() <
+             InOp.getValueType().getVectorNumElements() &&
+         "Input wasn't widened!");
+
+  // We may need to further widen the operand until it has the same total
+  // vector size as the result.
+  EVT InVT = InOp.getValueType();
+  if (InVT.getSizeInBits() != VT.getSizeInBits()) {
+    EVT InEltVT = InVT.getVectorElementType();
+    for (int i = MVT::FIRST_VECTOR_VALUETYPE, e = MVT::LAST_VECTOR_VALUETYPE; i < e; ++i) {
+      EVT FixedVT = (MVT::SimpleValueType)i;
+      EVT FixedEltVT = FixedVT.getVectorElementType();
+      if (TLI.isTypeLegal(FixedVT) &&
+          FixedVT.getSizeInBits() == VT.getSizeInBits() &&
+          FixedEltVT == InEltVT) {
+        assert(FixedVT.getVectorNumElements() >= VT.getVectorNumElements() &&
+               "Not enough elements in the fixed type for the operand!");
+        assert(FixedVT.getVectorNumElements() != InVT.getVectorNumElements() &&
+               "We can't have the same type as we started with!");
+        if (FixedVT.getVectorNumElements() > InVT.getVectorNumElements())
+          InOp = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, FixedVT,
+                             DAG.getUNDEF(FixedVT), InOp,
+                             DAG.getConstant(0, TLI.getVectorIdxTy()));
+        else
+          InOp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, FixedVT, InOp,
+                             DAG.getConstant(0, TLI.getVectorIdxTy()));
+        break;
+      }
+    }
+    InVT = InOp.getValueType();
+    if (InVT.getSizeInBits() != VT.getSizeInBits())
+      // We couldn't find a legal vector type that was a widening of the input
+      // and could be extended in-register to the result type, so we have to
+      // scalarize.
+      return WidenVecOp_Convert(N);
+  }
+
+  // Use special DAG nodes to represent the operation of extending the
+  // low lanes.
+  switch (N->getOpcode()) {
+  default:
+    llvm_unreachable("Extend legalization on an extend operation!");
+  case ISD::ANY_EXTEND:
+    return DAG.getAnyExtendVectorInReg(InOp, DL, VT);
+  case ISD::SIGN_EXTEND:
+    return DAG.getSignExtendVectorInReg(InOp, DL, VT);
+  case ISD::ZERO_EXTEND:
+    return DAG.getZeroExtendVectorInReg(InOp, DL, VT);
+  }
+}
+
 SDValue DAGTypeLegalizer::WidenVecOp_Convert(SDNode *N) {
   // Since the result is legal and the input is illegal, it is unlikely
   // that we can fix the input to a legal type so unroll the convert
diff --git a/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp b/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp
index f92230cc497d..624003f5070e 100644
--- a/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp
+++ b/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp
@@ -442,7 +442,7 @@ signed ResourcePriorityQueue::SUSchedulingCost(SUnit *SU) {
       ResCount -= (regPressureDelta(SU) * ScaleTwo);
   }

-  // These are platform specific things.
+  // These are platform-specific things.
   // Will need to go into the back end
   // and accessed from here via a hook.
   for (SDNode *N = SU->getNode(); N; N = N->getGluedNode()) {
diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
index 13cfae7515b8..dedca41c3aab 100644
--- a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
@@ -1373,7 +1373,7 @@ SUnit *ScheduleDAGRRList::PickNodeToScheduleBottomUp() {
         Interferences.push_back(CurSU);
       }
       else {
-        assert(CurSU->isPending && "Intereferences are pending");
+        assert(CurSU->isPending && "Interferences are pending");
         // Update the interference with current live regs.
         LRegsPair.first->second = LRegs;
       }
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 639eb462e7ff..1b62e77e168c 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -148,33 +148,34 @@ bool ISD::isBuildVectorAllZeros(const SDNode *N) {
   if (N->getOpcode() != ISD::BUILD_VECTOR) return false;

-  unsigned i = 0, e = N->getNumOperands();
-
-  // Skip over all of the undef values.
-  while (i != e && N->getOperand(i).getOpcode() == ISD::UNDEF)
-    ++i;
+  bool IsAllUndef = true;
+  for (unsigned i = 0, e = N->getNumOperands(); i < e; ++i) {
+    if (N->getOperand(i).getOpcode() == ISD::UNDEF)
+      continue;
+    IsAllUndef = false;
+    // Do not accept build_vectors that aren't all constants or which have non-0
+    // elements. We have to be a bit careful here, as the type of the constant
+    // may not be the same as the type of the vector elements due to type
+    // legalization (the elements are promoted to a legal type for the target
+    // and a vector of a type may be legal when the base element type is not).
+    // We only want to check enough bits to cover the vector elements, because
+    // we care if the resultant vector is all zeros, not whether the individual
+    // constants are.
+    SDValue Zero = N->getOperand(i);
+    unsigned EltSize = N->getValueType(0).getVectorElementType().getSizeInBits();
+    if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Zero)) {
+      if (CN->getAPIntValue().countTrailingZeros() < EltSize)
+        return false;
+    } else if (ConstantFPSDNode *CFPN = dyn_cast<ConstantFPSDNode>(Zero)) {
+      if (CFPN->getValueAPF().bitcastToAPInt().countTrailingZeros() < EltSize)
+        return false;
+    } else
+      return false;
+  }

   // Do not accept an all-undef vector.
-  if (i == e) return false;
-
-  // Do not accept build_vectors that aren't all constants or which have non-0
-  // elements.
-  SDValue Zero = N->getOperand(i);
-  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Zero)) {
-    if (!CN->isNullValue())
-      return false;
-  } else if (ConstantFPSDNode *CFPN = dyn_cast<ConstantFPSDNode>(Zero)) {
-    if (!CFPN->getValueAPF().isPosZero())
-      return false;
-  } else
+  if (IsAllUndef)
     return false;
-
-  // Okay, we have at least one 0 value, check to see if the rest match or are
-  // undefs.
-  for (++i; i != e; ++i)
-    if (N->getOperand(i) != Zero &&
-        N->getOperand(i).getOpcode() != ISD::UNDEF)
-      return false;

   return true;
 }
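// --- Editorial aside (not part of the patch): why the rewritten
// isBuildVectorAllZeros checks countTrailingZeros() >= EltSize rather than
// isNullValue(). After type legalization a zero v4i8 can be rebuilt from i32
// constants whose high bits are junk (say 0xAB00 with EltSize == 8); only the
// low EltSize bits land in the vector, so such an element still counts as 0.
#include <cstdint>

bool eltIsZero(uint64_t Bits, unsigned EltSize) {
  uint64_t Mask = EltSize >= 64 ? ~0ull : ((1ull << EltSize) - 1);
  return (Bits & Mask) == 0; // the APInt::countTrailingZeros test, in plain C++
}
// --- End editorial aside.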
@@ -556,7 +557,7 @@ static void AddNodeIDNode(FoldingSetNodeID &ID, const SDNode *N) {
   // Add the return value info.
   AddNodeIDValueTypes(ID, N->getVTList());
   // Add the operand info.
-  AddNodeIDOperands(ID, makeArrayRef(N->op_begin(), N->op_end()));
+  AddNodeIDOperands(ID, N->ops());

   // Handle SDNode leafs with special info.
   AddNodeIDCustom(ID, N);
@@ -701,6 +702,57 @@ void SelectionDAG::DeallocateNode(SDNode *N) {
       DbgVals[i]->setIsInvalidated();
 }

+#ifndef NDEBUG
+/// VerifySDNode - Sanity check the given SDNode.  Aborts if it is invalid.
+static void VerifySDNode(SDNode *N) {
+  switch (N->getOpcode()) {
+  default:
+    break;
+  case ISD::BUILD_PAIR: {
+    EVT VT = N->getValueType(0);
+    assert(N->getNumValues() == 1 && "Too many results!");
+    assert(!VT.isVector() && (VT.isInteger() || VT.isFloatingPoint()) &&
+           "Wrong return type!");
+    assert(N->getNumOperands() == 2 && "Wrong number of operands!");
+    assert(N->getOperand(0).getValueType() == N->getOperand(1).getValueType() &&
+           "Mismatched operand types!");
+    assert(N->getOperand(0).getValueType().isInteger() == VT.isInteger() &&
+           "Wrong operand type!");
+    assert(VT.getSizeInBits() == 2 * N->getOperand(0).getValueSizeInBits() &&
+           "Wrong return type size");
+    break;
+  }
+  case ISD::BUILD_VECTOR: {
+    assert(N->getNumValues() == 1 && "Too many results!");
+    assert(N->getValueType(0).isVector() && "Wrong return type!");
+    assert(N->getNumOperands() == N->getValueType(0).getVectorNumElements() &&
+           "Wrong number of operands!");
+    EVT EltVT = N->getValueType(0).getVectorElementType();
+    for (SDNode::op_iterator I = N->op_begin(), E = N->op_end(); I != E; ++I) {
+      assert((I->getValueType() == EltVT ||
+              (EltVT.isInteger() && I->getValueType().isInteger() &&
+               EltVT.bitsLE(I->getValueType()))) &&
+             "Wrong operand type!");
+      assert(I->getValueType() == N->getOperand(0).getValueType() &&
+             "Operands must all have the same type");
+    }
+    break;
+  }
+  }
+}
+#endif // NDEBUG
+
+/// \brief Insert a newly allocated node into the DAG.
+///
+/// Handles insertion into the all nodes list and CSE map, as well as
+/// verification and other common operations when a new node is allocated.
+void SelectionDAG::InsertNode(SDNode *N) {
+  AllNodes.push_back(N);
+#ifndef NDEBUG
+  VerifySDNode(N);
+#endif
+}
+
 /// RemoveNodeFromCSEMaps - Take the specified node out of the CSE map that
 /// correspond to it. This is useful when we're about to delete or repurpose
 /// the node. We don't want future request for structurally identical nodes
@@ -838,83 +890,6 @@ SDNode *SelectionDAG::FindModifiedNodeSlot(SDNode *N, ArrayRef<SDValue> Ops,
   return Node;
 }

-#ifndef NDEBUG
-/// VerifyNodeCommon - Sanity check the given node.  Aborts if it is invalid.
-static void VerifyNodeCommon(SDNode *N) {
-  switch (N->getOpcode()) {
-  default:
-    break;
-  case ISD::BUILD_PAIR: {
-    EVT VT = N->getValueType(0);
-    assert(N->getNumValues() == 1 && "Too many results!");
-    assert(!VT.isVector() && (VT.isInteger() || VT.isFloatingPoint()) &&
-           "Wrong return type!");
-    assert(N->getNumOperands() == 2 && "Wrong number of operands!");
-    assert(N->getOperand(0).getValueType() == N->getOperand(1).getValueType() &&
-           "Mismatched operand types!");
-    assert(N->getOperand(0).getValueType().isInteger() == VT.isInteger() &&
-           "Wrong operand type!");
-    assert(VT.getSizeInBits() == 2 * N->getOperand(0).getValueSizeInBits() &&
-           "Wrong return type size");
-    break;
-  }
-  case ISD::BUILD_VECTOR: {
-    assert(N->getNumValues() == 1 && "Too many results!");
-    assert(N->getValueType(0).isVector() && "Wrong return type!");
-    assert(N->getNumOperands() == N->getValueType(0).getVectorNumElements() &&
-           "Wrong number of operands!");
-    EVT EltVT = N->getValueType(0).getVectorElementType();
-    for (SDNode::op_iterator I = N->op_begin(), E = N->op_end(); I != E; ++I) {
-      assert((I->getValueType() == EltVT ||
-              (EltVT.isInteger() && I->getValueType().isInteger() &&
-               EltVT.bitsLE(I->getValueType()))) &&
-             "Wrong operand type!");
-      assert(I->getValueType() == N->getOperand(0).getValueType() &&
-             "Operands must all have the same type");
-    }
-    break;
-  }
-  }
-}
-
-/// VerifySDNode - Sanity check the given SDNode.  Aborts if it is invalid.
-static void VerifySDNode(SDNode *N) {
-  // The SDNode allocators cannot be used to allocate nodes with fields that are
-  // not present in an SDNode!
-  assert(!isa<MemSDNode>(N) && "Bad MemSDNode!");
-  assert(!isa<ShuffleVectorSDNode>(N) && "Bad ShuffleVectorSDNode!");
-  assert(!isa<ConstantSDNode>(N) && "Bad ConstantSDNode!");
-  assert(!isa<ConstantFPSDNode>(N) && "Bad ConstantFPSDNode!");
-  assert(!isa<GlobalAddressSDNode>(N) && "Bad GlobalAddressSDNode!");
-  assert(!isa<FrameIndexSDNode>(N) && "Bad FrameIndexSDNode!");
-  assert(!isa<JumpTableSDNode>(N) && "Bad JumpTableSDNode!");
-  assert(!isa<ConstantPoolSDNode>(N) && "Bad ConstantPoolSDNode!");
-  assert(!isa<BasicBlockSDNode>(N) && "Bad BasicBlockSDNode!");
-  assert(!isa<SrcValueSDNode>(N) && "Bad SrcValueSDNode!");
-  assert(!isa<MDNodeSDNode>(N) && "Bad MDNodeSDNode!");
-  assert(!isa<RegisterSDNode>(N) && "Bad RegisterSDNode!");
-  assert(!isa<BlockAddressSDNode>(N) && "Bad BlockAddressSDNode!");
-  assert(!isa<EHLabelSDNode>(N) && "Bad EHLabelSDNode!");
-  assert(!isa<ExternalSymbolSDNode>(N) && "Bad ExternalSymbolSDNode!");
-  assert(!isa<CondCodeSDNode>(N) && "Bad CondCodeSDNode!");
-  assert(!isa<CvtRndSatSDNode>(N) && "Bad CvtRndSatSDNode!");
-  assert(!isa<VTSDNode>(N) && "Bad VTSDNode!");
-  assert(!isa<MachineSDNode>(N) && "Bad MachineSDNode!");
-
-  VerifyNodeCommon(N);
-}
-
-/// VerifyMachineNode - Sanity check the given MachineNode.  Aborts if it is
-/// invalid.
-static void VerifyMachineNode(SDNode *N) {
-  // The MachineNode allocators cannot be used to allocate nodes with fields
-  // that are not present in a MachineNode!
-  // Currently there are no such nodes.
-
-  VerifyNodeCommon(N);
-}
-#endif // NDEBUG
-
 /// getEVTAlignment - Compute the default alignment value for the
 /// given type.
/// @@ -1011,11 +986,12 @@ SDValue SelectionDAG::getZExtOrTrunc(SDValue Op, SDLoc DL, EVT VT) { getNode(ISD::TRUNCATE, DL, VT, Op); } -SDValue SelectionDAG::getBoolExtOrTrunc(SDValue Op, SDLoc SL, EVT VT) { +SDValue SelectionDAG::getBoolExtOrTrunc(SDValue Op, SDLoc SL, EVT VT, + EVT OpVT) { if (VT.bitsLE(Op.getValueType())) return getNode(ISD::TRUNCATE, SL, VT, Op); - TargetLowering::BooleanContent BType = TLI->getBooleanContents(VT.isVector()); + TargetLowering::BooleanContent BType = TLI->getBooleanContents(OpVT); return getNode(TLI->getExtendForContent(BType), SL, VT, Op); } @@ -1031,6 +1007,36 @@ SDValue SelectionDAG::getZeroExtendInReg(SDValue Op, SDLoc DL, EVT VT) { getConstant(Imm, Op.getValueType())); } +SDValue SelectionDAG::getAnyExtendVectorInReg(SDValue Op, SDLoc DL, EVT VT) { + assert(VT.isVector() && "This DAG node is restricted to vector types."); + assert(VT.getSizeInBits() == Op.getValueType().getSizeInBits() && + "The sizes of the input and result must match in order to perform the " + "extend in-register."); + assert(VT.getVectorNumElements() < Op.getValueType().getVectorNumElements() && + "The destination vector type must have fewer lanes than the input."); + return getNode(ISD::ANY_EXTEND_VECTOR_INREG, DL, VT, Op); +} + +SDValue SelectionDAG::getSignExtendVectorInReg(SDValue Op, SDLoc DL, EVT VT) { + assert(VT.isVector() && "This DAG node is restricted to vector types."); + assert(VT.getSizeInBits() == Op.getValueType().getSizeInBits() && + "The sizes of the input and result must match in order to perform the " + "extend in-register."); + assert(VT.getVectorNumElements() < Op.getValueType().getVectorNumElements() && + "The destination vector type must have fewer lanes than the input."); + return getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, VT, Op); +} + +SDValue SelectionDAG::getZeroExtendVectorInReg(SDValue Op, SDLoc DL, EVT VT) { + assert(VT.isVector() && "This DAG node is restricted to vector types."); + assert(VT.getSizeInBits() == Op.getValueType().getSizeInBits() && + "The sizes of the input and result must match in order to perform the " + "extend in-register."); + assert(VT.getVectorNumElements() < Op.getValueType().getVectorNumElements() && + "The destination vector type must have fewer lanes than the input."); + return getNode(ISD::ZERO_EXTEND_VECTOR_INREG, DL, VT, Op); +} + /// getNOT - Create a bitwise NOT operation as (XOR Val, -1). 
/// SDValue SelectionDAG::getNOT(SDLoc DL, SDValue Val, EVT VT) { @@ -1043,7 +1049,7 @@ SDValue SelectionDAG::getNOT(SDLoc DL, SDValue Val, EVT VT) { SDValue SelectionDAG::getLogicalNOT(SDLoc DL, SDValue Val, EVT VT) { EVT EltVT = VT.getScalarType(); SDValue TrueValue; - switch (TLI->getBooleanContents(VT.isVector())) { + switch (TLI->getBooleanContents(VT)) { case TargetLowering::ZeroOrOneBooleanContent: case TargetLowering::UndefinedBooleanContent: TrueValue = getConstant(1, VT); @@ -1153,7 +1159,7 @@ SDValue SelectionDAG::getConstant(const ConstantInt &Val, EVT VT, bool isT, if (!N) { N = new (NodeAllocator) ConstantSDNode(isT, isO, Elt, EltVT); CSEMap.InsertNode(N, IP); - AllNodes.push_back(N); + InsertNode(N); } SDValue Result(N, 0); @@ -1195,7 +1201,7 @@ SDValue SelectionDAG::getConstantFP(const ConstantFP& V, EVT VT, bool isTarget){ if (!N) { N = new (NodeAllocator) ConstantFPSDNode(isTarget, &V, EltVT); CSEMap.InsertNode(N, IP); - AllNodes.push_back(N); + InsertNode(N); } SDValue Result(N, 0); @@ -1258,7 +1264,7 @@ SDValue SelectionDAG::getGlobalAddress(const GlobalValue *GV, SDLoc DL, DL.getDebugLoc(), GV, VT, Offset, TargetFlags); CSEMap.InsertNode(N, IP); - AllNodes.push_back(N); + InsertNode(N); return SDValue(N, 0); } @@ -1273,7 +1279,7 @@ SDValue SelectionDAG::getFrameIndex(int FI, EVT VT, bool isTarget) { SDNode *N = new (NodeAllocator) FrameIndexSDNode(FI, VT, isTarget); CSEMap.InsertNode(N, IP); - AllNodes.push_back(N); + InsertNode(N); return SDValue(N, 0); } @@ -1293,7 +1299,7 @@ SDValue SelectionDAG::getJumpTable(int JTI, EVT VT, bool isTarget, SDNode *N = new (NodeAllocator) JumpTableSDNode(JTI, VT, isTarget, TargetFlags); CSEMap.InsertNode(N, IP); - AllNodes.push_back(N); + InsertNode(N); return SDValue(N, 0); } @@ -1320,7 +1326,7 @@ SDValue SelectionDAG::getConstantPool(const Constant *C, EVT VT, SDNode *N = new (NodeAllocator) ConstantPoolSDNode(isTarget, C, VT, Offset, Alignment, TargetFlags); CSEMap.InsertNode(N, IP); - AllNodes.push_back(N); + InsertNode(N); return SDValue(N, 0); } @@ -1348,7 +1354,7 @@ SDValue SelectionDAG::getConstantPool(MachineConstantPoolValue *C, EVT VT, SDNode *N = new (NodeAllocator) ConstantPoolSDNode(isTarget, C, VT, Offset, Alignment, TargetFlags); CSEMap.InsertNode(N, IP); - AllNodes.push_back(N); + InsertNode(N); return SDValue(N, 0); } @@ -1366,7 +1372,7 @@ SDValue SelectionDAG::getTargetIndex(int Index, EVT VT, int64_t Offset, SDNode *N = new (NodeAllocator) TargetIndexSDNode(Index, VT, Offset, TargetFlags); CSEMap.InsertNode(N, IP); - AllNodes.push_back(N); + InsertNode(N); return SDValue(N, 0); } @@ -1380,7 +1386,7 @@ SDValue SelectionDAG::getBasicBlock(MachineBasicBlock *MBB) { SDNode *N = new (NodeAllocator) BasicBlockSDNode(MBB); CSEMap.InsertNode(N, IP); - AllNodes.push_back(N); + InsertNode(N); return SDValue(N, 0); } @@ -1394,7 +1400,7 @@ SDValue SelectionDAG::getValueType(EVT VT) { if (N) return SDValue(N, 0); N = new (NodeAllocator) VTSDNode(VT); - AllNodes.push_back(N); + InsertNode(N); return SDValue(N, 0); } @@ -1402,7 +1408,7 @@ SDValue SelectionDAG::getExternalSymbol(const char *Sym, EVT VT) { SDNode *&N = ExternalSymbols[Sym]; if (N) return SDValue(N, 0); N = new (NodeAllocator) ExternalSymbolSDNode(false, Sym, 0, VT); - AllNodes.push_back(N); + InsertNode(N); return SDValue(N, 0); } @@ -1413,7 +1419,7 @@ SDValue SelectionDAG::getTargetExternalSymbol(const char *Sym, EVT VT, TargetFlags)]; if (N) return SDValue(N, 0); N = new (NodeAllocator) ExternalSymbolSDNode(true, Sym, TargetFlags, VT); - AllNodes.push_back(N); 
+ InsertNode(N);
 return SDValue(N, 0);
 }

@@ -1424,7 +1430,7 @@ SDValue SelectionDAG::getCondCode(ISD::CondCode Cond) {
 if (!CondCodeNodes[Cond]) {
 CondCodeSDNode *N = new (NodeAllocator) CondCodeSDNode(Cond);
 CondCodeNodes[Cond] = N;
- AllNodes.push_back(N);
+ InsertNode(N);
 }

 return SDValue(CondCodeNodes[Cond], 0);
@@ -1495,6 +1501,11 @@ SDValue SelectionDAG::getVectorShuffle(EVT VT, SDLoc dl, SDValue N1,
 N1 = getUNDEF(VT);
 commuteShuffle(N1, N2, MaskVec);
 }
+ // Reset our undef status after accounting for the mask.
+ N2Undef = N2.getOpcode() == ISD::UNDEF;
+ // Re-check whether both sides ended up undef.
+ if (N1.getOpcode() == ISD::UNDEF && N2Undef)
+ return getUNDEF(VT);

 // If Identity shuffle return that node.
 bool Identity = true;
@@ -1505,9 +1516,36 @@ SDValue SelectionDAG::getVectorShuffle(EVT VT, SDLoc dl, SDValue N1,
 return N1;

 // Shuffling a constant splat doesn't change the result.
- if (N2Undef && N1.getOpcode() == ISD::BUILD_VECTOR)
- if (cast<BuildVectorSDNode>(N1)->getConstantSplatValue())
- return N1;
+ if (N2Undef) {
+ SDValue V = N1;
+
+ // Look through any bitcasts. We check that these don't change the number
+ // (and size) of elements and just change their types.
+ while (V.getOpcode() == ISD::BITCAST)
+ V = V->getOperand(0);
+
+ // A splat should always show up as a build vector node.
+ if (auto *BV = dyn_cast<BuildVectorSDNode>(V)) {
+ BitVector UndefElements;
+ SDValue Splat = BV->getSplatValue(&UndefElements);
+ // If this is a splat of an undef, shuffling it is also undef.
+ if (Splat && Splat.getOpcode() == ISD::UNDEF)
+ return getUNDEF(VT);
+
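The fold that continues below rests on a simple observation: a shuffle only re-reads source lanes, so when every lane holds the same value the result equals the input for any mask that selects only defined lanes. A self-contained illustration:

    #include <array>
    #include <cstdio>

    int main() {
      std::array<int, 4> Splat = {7, 7, 7, 7};
      std::array<int, 4> Mask = {2, 0, 3, 1}; // any permutation of lanes
      std::array<int, 4> Result;
      for (int i = 0; i != 4; ++i)
        Result[i] = Splat[Mask[i]]; // always reads the value 7
      std::printf("%d %d %d %d\n", Result[0], Result[1], Result[2], Result[3]);
    }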
+ // We only have a splat which can skip shuffles if there is a splatted
+ // value and no undef lanes rearranged by the shuffle.
+ if (Splat && UndefElements.none()) {
+ // Splat of <x, x, ..., x>, return <x, x, ..., x>, provided that the
+ // number of elements match or the value splatted is a zero constant.
+ if (V.getValueType().getVectorNumElements() ==
+ VT.getVectorNumElements())
+ return N1;
+ if (auto *C = dyn_cast<ConstantSDNode>(Splat))
+ if (C->isNullValue())
+ return N1;
+ }
+ }
+ }

 FoldingSetNodeID ID;
 SDValue Ops[2] = { N1, N2 };
@@ -1530,10 +1568,31 @@ SDValue SelectionDAG::getVectorShuffle(EVT VT, SDLoc dl, SDValue N1,
 dl.getDebugLoc(), N1, N2, MaskAlloc);
 CSEMap.InsertNode(N, IP);

- AllNodes.push_back(N);
+ InsertNode(N);
 return SDValue(N, 0);
 }

+SDValue SelectionDAG::getCommutedVectorShuffle(const ShuffleVectorSDNode &SV) {
+ MVT VT = SV.getSimpleValueType(0);
+ unsigned NumElems = VT.getVectorNumElements();
+ SmallVector<int, 8> MaskVec;
+
+ for (unsigned i = 0; i != NumElems; ++i) {
+ int Idx = SV.getMaskElt(i);
+ if (Idx >= 0) {
+ if (Idx < (int)NumElems)
+ Idx += NumElems;
+ else
+ Idx -= NumElems;
+ }
+ MaskVec.push_back(Idx);
+ }
+
+ SDValue Op0 = SV.getOperand(0);
+ SDValue Op1 = SV.getOperand(1);
+ return getVectorShuffle(VT, SDLoc(&SV), Op1, Op0, &MaskVec[0]);
+}
+
 SDValue SelectionDAG::getConvertRndSat(EVT VT, SDLoc dl,
 SDValue Val, SDValue DTy,
 SDValue STy, SDValue Rnd, SDValue Sat,
@@ -1555,7 +1614,7 @@ SDValue SelectionDAG::getConvertRndSat(EVT VT, SDLoc dl,
 dl.getDebugLoc(), Ops, Code);
 CSEMap.InsertNode(N, IP);

- AllNodes.push_back(N);
+ InsertNode(N);
 return SDValue(N, 0);
 }

@@ -1569,7 +1628,7 @@ SDValue SelectionDAG::getRegister(unsigned RegNo, EVT VT) {
 SDNode *N = new (NodeAllocator) RegisterSDNode(RegNo, VT);
 CSEMap.InsertNode(N, IP);
- AllNodes.push_back(N);
+ InsertNode(N);
 return SDValue(N, 0);
 }

@@ -1583,7 +1642,7 @@ SDValue SelectionDAG::getRegisterMask(const uint32_t *RegMask) {
 SDNode *N = new (NodeAllocator) RegisterMaskSDNode(RegMask);
 CSEMap.InsertNode(N, IP);
- AllNodes.push_back(N);
+ InsertNode(N);
 return SDValue(N, 0);
 }

@@ -1599,7 +1658,7 @@ SDValue SelectionDAG::getEHLabel(SDLoc dl, SDValue Root, MCSymbol *Label) {
 SDNode *N = new (NodeAllocator) EHLabelSDNode(dl.getIROrder(),
 dl.getDebugLoc(), Root, Label);
 CSEMap.InsertNode(N, IP);
- AllNodes.push_back(N);
+ InsertNode(N);
 return SDValue(N, 0);
 }

@@ -1622,7 +1681,7 @@ SDValue SelectionDAG::getBlockAddress(const BlockAddress *BA, EVT VT,
 SDNode *N = new (NodeAllocator) BlockAddressSDNode(Opc, VT, BA, Offset,
 TargetFlags);
 CSEMap.InsertNode(N, IP);
- AllNodes.push_back(N);
+ InsertNode(N);
 return SDValue(N, 0);
 }

@@ -1640,7 +1699,7 @@ SDValue SelectionDAG::getSrcValue(const Value *V) {
 SDNode *N = new (NodeAllocator) SrcValueSDNode(V);
 CSEMap.InsertNode(N, IP);
- AllNodes.push_back(N);
+ InsertNode(N);
 return SDValue(N, 0);
 }

@@ -1656,7 +1715,7 @@ SDValue SelectionDAG::getMDNode(const MDNode *MD) {
 SDNode *N = new (NodeAllocator) MDNodeSDNode(MD);
 CSEMap.InsertNode(N, IP);
- AllNodes.push_back(N);
+ InsertNode(N);
 return SDValue(N, 0);
 }

@@ -1677,7 +1736,7 @@ SDValue SelectionDAG::getAddrSpaceCast(SDLoc dl, EVT VT, SDValue Ptr,
 dl.getDebugLoc(), VT, Ptr, SrcAS, DestAS);
 CSEMap.InsertNode(N, IP);
- AllNodes.push_back(N);
+ InsertNode(N);
 return SDValue(N, 0);
 }

@@ -1733,7 +1792,8 @@ SDValue SelectionDAG::FoldSetCC(EVT VT, SDValue N1,
 case ISD::SETTRUE:
 case ISD::SETTRUE2: {
 const TargetLowering *TLI = TM.getTargetLowering();
- TargetLowering::BooleanContent Cnt = TLI->getBooleanContents(VT.isVector());
+ TargetLowering::BooleanContent Cnt =
+ TLI->getBooleanContents(N1->getValueType(0));
 return getConstant(
 Cnt == TargetLowering::ZeroOrNegativeOneBooleanContent ?
-1ULL : 1, VT); } @@ -1964,11 +2024,20 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero, case ISD::UMULO: if (Op.getResNo() != 1) break; - // The boolean result conforms to getBooleanContents. Fall through. + // The boolean result conforms to getBooleanContents. + // If we know the result of a setcc has the top bits zero, use this info. + // We know that we have an integer-based boolean since these operations + // are only available for integer. + if (TLI->getBooleanContents(Op.getValueType().isVector(), false) == + TargetLowering::ZeroOrOneBooleanContent && + BitWidth > 1) + KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1); + break; case ISD::SETCC: // If we know the result of a setcc has the top bits zero, use this info. - if (TLI->getBooleanContents(Op.getValueType().isVector()) == - TargetLowering::ZeroOrOneBooleanContent && BitWidth > 1) + if (TLI->getBooleanContents(Op.getOperand(0).getValueType()) == + TargetLowering::ZeroOrOneBooleanContent && + BitWidth > 1) KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1); break; case ISD::SHL: @@ -2084,7 +2153,7 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero, unsigned MemBits = VT.getScalarType().getSizeInBits(); KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits); } else if (const MDNode *Ranges = LD->getRanges()) { - computeKnownBitsLoad(*Ranges, KnownZero); + computeKnownBitsFromRangeMetadata(*Ranges, KnownZero); } break; } @@ -2367,9 +2436,16 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const{ if (Op.getResNo() != 1) break; // The boolean result conforms to getBooleanContents. Fall through. + // If setcc returns 0/-1, all bits are sign bits. + // We know that we have an integer-based boolean since these operations + // are only available for integer. + if (TLI->getBooleanContents(Op.getValueType().isVector(), false) == + TargetLowering::ZeroOrNegativeOneBooleanContent) + return VTBits; + break; case ISD::SETCC: // If setcc returns 0/-1, all bits are sign bits. - if (TLI->getBooleanContents(Op.getValueType().isVector()) == + if (TLI->getBooleanContents(Op.getOperand(0).getValueType()) == TargetLowering::ZeroOrNegativeOneBooleanContent) return VTBits; break; @@ -2574,10 +2650,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT) { DL.getDebugLoc(), getVTList(VT)); CSEMap.InsertNode(N, IP); - AllNodes.push_back(N); -#ifndef NDEBUG - VerifySDNode(N); -#endif + InsertNode(N); return SDValue(N, 0); } @@ -2691,6 +2764,32 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, } } + // Constant fold unary operations with a vector integer operand. + if (BuildVectorSDNode *BV = dyn_cast(Operand.getNode())) { + APInt Val; + APInt DummyUndefs; + unsigned SplatBitSize; + bool DummyHasUndefs; + if (BV->isConstantSplat(Val, DummyUndefs, SplatBitSize, DummyHasUndefs)) { + switch (Opcode) { + default: + // FIXME: Entirely reasonable to perform folding of other unary + // operations here as the need arises. 
+ break;
+ case ISD::UINT_TO_FP:
+ case ISD::SINT_TO_FP: {
+ APFloat APF(
+ EVTToAPFloatSemantics(VT.getVectorElementType()),
+ APInt::getNullValue(VT.getVectorElementType().getSizeInBits()));
+ (void)APF.convertFromAPInt(Val, Opcode == ISD::SINT_TO_FP,
+ APFloat::rmNearestTiesToEven);
+
+ return getConstantFP(APF, VT);
+ }
+ }
+ }
+ }
+
 unsigned OpOpcode = Operand.getNode()->getOpcode();
 switch (Opcode) {
 case ISD::TokenFactor:
@@ -2850,10 +2949,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL,
 DL.getDebugLoc(), VTs, Operand);
 }

- AllNodes.push_back(N);
-#ifndef NDEBUG
- VerifySDNode(N);
-#endif
+ InsertNode(N);
 return SDValue(N, 0);
 }

@@ -3445,10 +3541,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1,
 N = GetBinarySDNode(Opcode, DL, VTs, N1, N2, nuw, nsw, exact);
 }

- AllNodes.push_back(N);
-#ifndef NDEBUG
- VerifySDNode(N);
-#endif
+ InsertNode(N);
 return SDValue(N, 0);
 }

@@ -3552,10 +3645,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT,
 DL.getDebugLoc(), VTs, N1, N2, N3);
 }

- AllNodes.push_back(N);
-#ifndef NDEBUG
- VerifySDNode(N);
-#endif
+ InsertNode(N);
 return SDValue(N, 0);
 }

@@ -3630,7 +3720,7 @@ static SDValue getMemsetStringVal(EVT VT, SDLoc dl, SelectionDAG &DAG,
 if (Str.empty()) {
 if (VT.isInteger())
 return DAG.getConstant(0, VT);
- else if (VT == MVT::f32 || VT == MVT::f64)
+ else if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f128)
 return DAG.getConstantFP(0.0, VT);
 else if (VT.isVector()) {
 unsigned NumElts = VT.getVectorNumElements();
@@ -4157,7 +4247,7 @@ SDValue SelectionDAG::getMemcpy(SDValue Chain, SDLoc dl, SDValue Dst,
 .setCallee(TLI->getLibcallCallingConv(RTLIB::MEMCPY),
 Type::getVoidTy(*getContext()),
 getExternalSymbol(TLI->getLibcallName(RTLIB::MEMCPY),
- TLI->getPointerTy()), &Args, 0)
+ TLI->getPointerTy()), std::move(Args), 0)
 .setDiscardResult();
 std::pair<SDValue,SDValue> CallResult = TLI->LowerCallTo(CLI);

@@ -4213,7 +4303,7 @@ SDValue SelectionDAG::getMemmove(SDValue Chain, SDLoc dl, SDValue Dst,
 .setCallee(TLI->getLibcallCallingConv(RTLIB::MEMMOVE),
 Type::getVoidTy(*getContext()),
 getExternalSymbol(TLI->getLibcallName(RTLIB::MEMMOVE),
- TLI->getPointerTy()), &Args, 0)
+ TLI->getPointerTy()), std::move(Args), 0)
 .setDiscardResult();
 std::pair<SDValue,SDValue> CallResult = TLI->LowerCallTo(CLI);

@@ -4277,7 +4367,7 @@ SDValue SelectionDAG::getMemset(SDValue Chain, SDLoc dl, SDValue Dst,
 .setCallee(TLI->getLibcallCallingConv(RTLIB::MEMSET),
 Type::getVoidTy(*getContext()),
 getExternalSymbol(TLI->getLibcallName(RTLIB::MEMSET),
- TLI->getPointerTy()), &Args, 0)
+ TLI->getPointerTy()), std::move(Args), 0)
 .setDiscardResult();

 std::pair<SDValue,SDValue> CallResult = TLI->LowerCallTo(CLI);
@@ -4315,7 +4405,7 @@ SDValue SelectionDAG::getAtomic(unsigned Opcode, SDLoc dl, EVT MemVT,
 SuccessOrdering, FailureOrdering,
 SynchScope);
 CSEMap.InsertNode(N, IP);
- AllNodes.push_back(N);
+ InsertNode(N);
 return SDValue(N, 0);
 }

@@ -4512,7 +4602,7 @@ SelectionDAG::getMemIntrinsicNode(unsigned Opcode, SDLoc dl, SDVTList VTList,
 dl.getDebugLoc(), VTList, Ops,
 MemVT, MMO);
 }

- AllNodes.push_back(N);
+ InsertNode(N);
 return SDValue(N, 0);
 }

@@ -4628,7 +4718,7 @@ SelectionDAG::getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType,
 dl.getDebugLoc(), VTs, AM, ExtType,
 MemVT, MMO);
 CSEMap.InsertNode(N, IP);
- AllNodes.push_back(N);
+ InsertNode(N);
 return SDValue(N, 0);
 }
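The `&Args, 0` to `std::move(Args), 0` changes in the memcpy/memmove/memset libcall paths above go with setCallee() now taking the argument list by value rather than by pointer. A minimal sketch of that ownership transfer; Builder, Arg, and this setCallee signature are invented for illustration and are not the real CallLoweringInfo interface:

    #include <string>
    #include <utility>
    #include <vector>

    struct Arg { std::string Name; };
    using ArgListTy = std::vector<Arg>;

    struct Builder {
      ArgListTy Args;
      // Taking the list by value and moving it in lets callers hand over
      // ownership without a copy, which is what std::move(Args) enables.
      Builder &setCallee(ArgListTy List) {
        Args = std::move(List);
        return *this;
      }
    };

    int main() {
      ArgListTy Args{{"x"}, {"y"}};
      Builder B;
      B.setCallee(std::move(Args)); // Args is left in a moved-from state.
    }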
@@ -4735,7 +4825,7 @@ SDValue SelectionDAG::getStore(SDValue Chain, SDLoc dl, SDValue Val,
 dl.getDebugLoc(), VTs, ISD::UNINDEXED,
 false, VT, MMO);
 CSEMap.InsertNode(N, IP);
- AllNodes.push_back(N);
+ InsertNode(N);
 return SDValue(N, 0);
 }

@@ -4804,7 +4894,7 @@ SDValue SelectionDAG::getTruncStore(SDValue Chain, SDLoc dl, SDValue Val,
 dl.getDebugLoc(), VTs,
 ISD::UNINDEXED, true, SVT, MMO);
 CSEMap.InsertNode(N, IP);
- AllNodes.push_back(N);
+ InsertNode(N);
 return SDValue(N, 0);
 }

@@ -4831,7 +4921,7 @@ SelectionDAG::getIndexedStore(SDValue OrigStore, SDLoc dl, SDValue Base,
 ST->getMemoryVT(),
 ST->getMemOperand());
 CSEMap.InsertNode(N, IP);
- AllNodes.push_back(N);
+ InsertNode(N);
 return SDValue(N, 0);
 }

@@ -4910,10 +5000,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT,
 VTs, Ops);
 }

- AllNodes.push_back(N);
-#ifndef NDEBUG
- VerifySDNode(N);
-#endif
+ InsertNode(N);
 return SDValue(N, 0);
 }

@@ -4993,10 +5080,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, SDVTList VTList,
 VTList, Ops);
 }
 }
- AllNodes.push_back(N);
-#ifndef NDEBUG
- VerifySDNode(N);
-#endif
+ InsertNode(N);
 return SDValue(N, 0);
 }

@@ -5621,10 +5705,7 @@ SelectionDAG::getMachineNode(unsigned Opcode, SDLoc DL, SDVTList VTs,
 if (DoCSE)
 CSEMap.InsertNode(N, IP);

- AllNodes.push_back(N);
-#ifndef NDEBUG
- VerifyMachineNode(N);
-#endif
+ InsertNode(N);
 return N;
 }

@@ -6602,16 +6683,43 @@ bool BuildVectorSDNode::isConstantSplat(APInt &SplatValue,
 return true;
 }

-ConstantSDNode *BuildVectorSDNode::getConstantSplatValue() const {
- SDValue Op0 = getOperand(0);
- if (Op0.getOpcode() != ISD::Constant)
- return nullptr;
+SDValue BuildVectorSDNode::getSplatValue(BitVector *UndefElements) const {
+ if (UndefElements) {
+ UndefElements->clear();
+ UndefElements->resize(getNumOperands());
+ }
+ SDValue Splatted;
+ for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
+ SDValue Op = getOperand(i);
+ if (Op.getOpcode() == ISD::UNDEF) {
+ if (UndefElements)
+ (*UndefElements)[i] = true;
+ } else if (!Splatted) {
+ Splatted = Op;
+ } else if (Splatted != Op) {
+ return SDValue();
+ }
+ }
+
+ if (!Splatted) {
+ assert(getOperand(0).getOpcode() == ISD::UNDEF &&
+ "Can only have a splat without a constant for all undefs.");
+ return getOperand(0);
+ }

- for (unsigned i = 1, e = getNumOperands(); i != e; ++i)
- if (getOperand(i) != Op0)
- return nullptr;
+ return Splatted;
+}
+
+ConstantSDNode *
+BuildVectorSDNode::getConstantSplatNode(BitVector *UndefElements) const {
+ return dyn_cast_or_null<ConstantSDNode>(
+ getSplatValue(UndefElements).getNode());
+}

- return cast<ConstantSDNode>(Op0);
+ConstantFPSDNode *
+BuildVectorSDNode::getConstantFPSplatNode(BitVector *UndefElements) const {
+ return dyn_cast_or_null<ConstantFPSDNode>(
+ getSplatValue(UndefElements).getNode());
 }

 bool BuildVectorSDNode::isConstant() const {
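A short usage sketch for the splat accessors added above. This is a fragment, not a complete program: it presumes an existing BuildVectorSDNode *BV, say one describing <42, undef, 42, 42>:

    BitVector UndefElements;
    // Returns the common operand, treating undef lanes as wildcards.
    SDValue Splat = BV->getSplatValue(&UndefElements);
    // Here Splat would be the constant 42 and UndefElements[1] would be set.

    // Typed convenience wrapper: null unless the splatted value is a
    // ConstantSDNode, so callers can pattern-match constants directly.
    if (ConstantSDNode *C = BV->getConstantSplatNode(&UndefElements))
      (void)C->getAPIntValue(); // the splatted APInt, 42 in the example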
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index e6dc27219787..c07b5e6a7362 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -169,7 +169,7 @@ static SDValue getCopyFromParts(SelectionDAG &DAG, SDLoc DL,
 SDValue Lo, Hi;
 Lo = DAG.getNode(ISD::BITCAST, DL, EVT(MVT::f64), Parts[0]);
 Hi = DAG.getNode(ISD::BITCAST, DL, EVT(MVT::f64), Parts[1]);
- if (TLI.isBigEndian())
+ if (TLI.hasBigEndianPartOrdering(ValueVT))
 std::swap(Lo, Hi);
 Val = DAG.getNode(ISD::BUILD_PAIR, DL, ValueVT, Lo, Hi);
 } else {
@@ -5155,12 +5155,16 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
 return nullptr;
 }
 case Intrinsic::convert_to_fp16:
- setValue(&I, DAG.getNode(ISD::FP32_TO_FP16, sdl,
- MVT::i16, getValue(I.getArgOperand(0))));
+ setValue(&I, DAG.getNode(ISD::BITCAST, sdl, MVT::i16,
+ DAG.getNode(ISD::FP_ROUND, sdl, MVT::f16,
+ getValue(I.getArgOperand(0)),
+ DAG.getTargetConstant(0, MVT::i32))));
 return nullptr;
 case Intrinsic::convert_from_fp16:
- setValue(&I, DAG.getNode(ISD::FP16_TO_FP32, sdl,
- MVT::f32, getValue(I.getArgOperand(0))));
+ setValue(&I,
+ DAG.getNode(ISD::FP_EXTEND, sdl, TLI->getValueType(I.getType()),
+ DAG.getNode(ISD::BITCAST, sdl, MVT::f16,
+ getValue(I.getArgOperand(0)))));
 return nullptr;
 case Intrinsic::pcmarker: {
 SDValue Tmp = getValue(I.getArgOperand(0));
@@ -5322,7 +5326,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
 CLI.setDebugLoc(sdl).setChain(getRoot())
 .setCallee(CallingConv::C, I.getType(),
 DAG.getExternalSymbol(TrapFuncName.data(), TLI->getPointerTy()),
- &Args, 0);
+ std::move(Args), 0);

 std::pair<SDValue, SDValue> Result = TLI->LowerCallTo(CLI);
 DAG.setRoot(Result.second);
@@ -5490,12 +5494,12 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee,

 // Check if target-independent constraints permit a tail call here.
 // Target-dependent constraints are checked within TLI->LowerCallTo.
- if (isTailCall && !isInTailCallPosition(CS, DAG))
+ if (isTailCall && !isInTailCallPosition(CS, DAG.getTarget()))
 isTailCall = false;

 TargetLowering::CallLoweringInfo CLI(DAG);
 CLI.setDebugLoc(getCurSDLoc()).setChain(getRoot())
- .setCallee(RetTy, FTy, Callee, &Args, CS).setTailCall(isTailCall);
+ .setCallee(RetTy, FTy, Callee, std::move(Args), CS).setTailCall(isTailCall);

 std::pair<SDValue, SDValue> Result = TLI->LowerCallTo(CLI);
 assert((isTailCall || Result.second.getNode()) &&
@@ -6798,7 +6802,7 @@ SelectionDAGBuilder::LowerCallOperands(const CallInst &CI, unsigned ArgIdx,
 Type *retTy = useVoidTy ? Type::getVoidTy(*DAG.getContext()) : CI.getType();
 TargetLowering::CallLoweringInfo CLI(DAG);
 CLI.setDebugLoc(getCurSDLoc()).setChain(getRoot())
- .setCallee(CI.getCallingConv(), retTy, Callee, &Args, NumArgs)
+ .setCallee(CI.getCallingConv(), retTy, Callee, std::move(Args), NumArgs)
 .setDiscardResult(!CI.use_empty());

 const TargetLowering *TLI = TM.getTargetLowering();
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
index fb29691ee417..84679f98d84e 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -320,7 +320,7 @@ class SelectionDAGBuilder {
 /// 1. Preserve the architecture independence of stack protector generation.
 ///
 /// 2. Preserve the normal IR level stack protector check for platforms like
-/// OpenBSD for which we support platform specific stack protector
+/// OpenBSD for which we support platform-specific stack protector
 /// generation.
 ///
 /// The main problem that guided the present solution is that one can not
@@ -338,7 +338,7 @@ class SelectionDAGBuilder {
 /// basic block (where the return inst is placed) and then move it back
 /// later at SelectionDAG/MI time before the stack protector check if the
 /// tail call optimization failed. The MI level option was nixed
-/// immediately since it would require platform specific pattern
+/// immediately since it would require platform-specific pattern
 /// matching. The SelectionDAG level option was nixed because
 /// SelectionDAG only processes one IR level basic block at a time
 /// implying one could not create a DAG Combine to move the callinst.
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index c92fb2453c2a..a71cc6859ea0 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -221,6 +221,9 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
 case ISD::ZERO_EXTEND: return "zero_extend";
 case ISD::ANY_EXTEND: return "any_extend";
 case ISD::SIGN_EXTEND_INREG: return "sign_extend_inreg";
+ case ISD::ANY_EXTEND_VECTOR_INREG: return "any_extend_vector_inreg";
+ case ISD::SIGN_EXTEND_VECTOR_INREG: return "sign_extend_vector_inreg";
+ case ISD::ZERO_EXTEND_VECTOR_INREG: return "zero_extend_vector_inreg";
 case ISD::TRUNCATE: return "truncate";
 case ISD::FP_ROUND: return "fp_round";
 case ISD::FLT_ROUNDS_: return "flt_rounds";
@@ -233,8 +236,8 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
 case ISD::FP_TO_UINT: return "fp_to_uint";
 case ISD::BITCAST: return "bitcast";
 case ISD::ADDRSPACECAST: return "addrspacecast";
- case ISD::FP16_TO_FP32: return "fp16_to_fp32";
- case ISD::FP32_TO_FP16: return "fp32_to_fp16";
+ case ISD::FP16_TO_FP: return "fp16_to_fp";
+ case ISD::FP_TO_FP16: return "fp_to_fp16";

 case ISD::CONVERT_RNDSAT: {
 switch (cast<CvtRndSatSDNode>(this)->getCvtCode()) {
diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 75bbbe749e58..05ace413bfdf 100644
--- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -105,7 +105,7 @@ TargetLowering::makeLibCall(SelectionDAG &DAG,
 Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext());
 TargetLowering::CallLoweringInfo CLI(DAG);
 CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
- .setCallee(getLibcallCallingConv(LC), RetTy, Callee, &Args, 0)
+ .setCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args), 0)
 .setNoReturn(doesNotReturn).setDiscardResult(!isReturnValueUsed)
 .setSExtResult(isSigned).setZExtResult(!isSigned);
 return LowerCallTo(CLI);
@@ -1150,18 +1150,21 @@ bool TargetLowering::isConstTrueVal(const SDNode *N) const {
 if (!N)
 return false;

- bool IsVec = false;
 const ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N);
 if (!CN) {
 const BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(N);
 if (!BV)
 return false;

- IsVec = true;
- CN = BV->getConstantSplatValue();
+ BitVector UndefElements;
+ CN = BV->getConstantSplatNode(&UndefElements);
+ // Only interested in constant splats, and we don't try to handle undef
+ // elements in identifying boolean constants.
+ if (!CN || UndefElements.none())
+ return false;
 }

- switch (getBooleanContents(IsVec)) {
+ switch (getBooleanContents(N->getValueType(0))) {
 case UndefinedBooleanContent:
 return CN->getAPIntValue()[0];
 case ZeroOrOneBooleanContent:
@@ -1177,18 +1180,21 @@ bool TargetLowering::isConstFalseVal(const SDNode *N) const {
 if (!N)
 return false;

- bool IsVec = false;
 const ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N);
 if (!CN) {
 const BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(N);
 if (!BV)
 return false;

- IsVec = true;
- CN = BV->getConstantSplatValue();
+ BitVector UndefElements;
+ CN = BV->getConstantSplatNode(&UndefElements);
+ // Only interested in constant splats, and we don't try to handle undef
+ // elements in identifying boolean constants.
+ if (!CN || UndefElements.none()) + return false; } - if (getBooleanContents(IsVec) == UndefinedBooleanContent) + if (getBooleanContents(N->getValueType(0)) == UndefinedBooleanContent) return !CN->getAPIntValue()[0]; return CN->isNullValue(); @@ -1209,7 +1215,8 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, case ISD::SETFALSE2: return DAG.getConstant(0, VT); case ISD::SETTRUE: case ISD::SETTRUE2: { - TargetLowering::BooleanContent Cnt = getBooleanContents(VT.isVector()); + TargetLowering::BooleanContent Cnt = + getBooleanContents(N0->getValueType(0)); return DAG.getConstant( Cnt == TargetLowering::ZeroOrNegativeOneBooleanContent ? -1ULL : 1, VT); } @@ -1416,7 +1423,7 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, SDValue NewSetCC = DAG.getSetCC(dl, NewSetCCVT, N0.getOperand(0), NewConst, Cond); - return DAG.getBoolExtOrTrunc(NewSetCC, dl, VT); + return DAG.getBoolExtOrTrunc(NewSetCC, dl, VT, N0.getValueType()); } break; } @@ -1500,7 +1507,8 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, } } else if (N1C->getAPIntValue() == 1 && (VT == MVT::i1 || - getBooleanContents(false) == ZeroOrOneBooleanContent)) { + getBooleanContents(N0->getValueType(0)) == + ZeroOrOneBooleanContent)) { SDValue Op0 = N0; if (Op0.getOpcode() == ISD::TRUNCATE) Op0 = Op0.getOperand(0); @@ -1771,7 +1779,7 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, // The sext(setcc()) => setcc() optimization relies on the appropriate // constant being emitted. uint64_t EqVal = 0; - switch (getBooleanContents(N0.getValueType().isVector())) { + switch (getBooleanContents(N0.getValueType())) { case UndefinedBooleanContent: case ZeroOrOneBooleanContent: EqVal = ISD::isTrueWhenEqual(Cond); @@ -2877,3 +2885,65 @@ bool TargetLowering::expandMUL(SDNode *N, SDValue &Lo, SDValue &Hi, EVT HiLoVT, } return false; } + +bool TargetLowering::expandFP_TO_SINT(SDNode *Node, SDValue &Result, + SelectionDAG &DAG) const { + EVT VT = Node->getOperand(0).getValueType(); + EVT NVT = Node->getValueType(0); + SDLoc dl(SDValue(Node, 0)); + + // FIXME: Only f32 to i64 conversions are supported. 
+ if (VT != MVT::f32 || NVT != MVT::i64)
+ return false;
+
+ // Expand f32 -> i64 conversion
+ // This algorithm comes from compiler-rt's implementation of fixsfdi:
+ // https://github.com/llvm-mirror/compiler-rt/blob/master/lib/builtins/fixsfdi.c
+ EVT IntVT = EVT::getIntegerVT(*DAG.getContext(),
+ VT.getSizeInBits());
+ SDValue ExponentMask = DAG.getConstant(0x7F800000, IntVT);
+ SDValue ExponentLoBit = DAG.getConstant(23, IntVT);
+ SDValue Bias = DAG.getConstant(127, IntVT);
+ SDValue SignMask = DAG.getConstant(APInt::getSignBit(VT.getSizeInBits()),
+ IntVT);
+ SDValue SignLowBit = DAG.getConstant(VT.getSizeInBits() - 1, IntVT);
+ SDValue MantissaMask = DAG.getConstant(0x007FFFFF, IntVT);
+
+ SDValue Bits = DAG.getNode(ISD::BITCAST, dl, IntVT, Node->getOperand(0));
+
+ SDValue ExponentBits = DAG.getNode(ISD::SRL, dl, IntVT,
+ DAG.getNode(ISD::AND, dl, IntVT, Bits, ExponentMask),
+ DAG.getZExtOrTrunc(ExponentLoBit, dl, getShiftAmountTy(IntVT)));
+ SDValue Exponent = DAG.getNode(ISD::SUB, dl, IntVT, ExponentBits, Bias);
+
+ SDValue Sign = DAG.getNode(ISD::SRA, dl, IntVT,
+ DAG.getNode(ISD::AND, dl, IntVT, Bits, SignMask),
+ DAG.getZExtOrTrunc(SignLowBit, dl, getShiftAmountTy(IntVT)));
+ Sign = DAG.getSExtOrTrunc(Sign, dl, NVT);
+
+ SDValue R = DAG.getNode(ISD::OR, dl, IntVT,
+ DAG.getNode(ISD::AND, dl, IntVT, Bits, MantissaMask),
+ DAG.getConstant(0x00800000, IntVT));
+
+ R = DAG.getZExtOrTrunc(R, dl, NVT);
+
+ R = DAG.getSelectCC(dl, Exponent, ExponentLoBit,
+ DAG.getNode(ISD::SHL, dl, NVT, R,
+ DAG.getZExtOrTrunc(
+ DAG.getNode(ISD::SUB, dl, IntVT, Exponent, ExponentLoBit),
+ dl, getShiftAmountTy(IntVT))),
+ DAG.getNode(ISD::SRL, dl, NVT, R,
+ DAG.getZExtOrTrunc(
+ DAG.getNode(ISD::SUB, dl, IntVT, ExponentLoBit, Exponent),
+ dl, getShiftAmountTy(IntVT))),
+ ISD::SETGT);
+
+ SDValue Ret = DAG.getNode(ISD::SUB, dl, NVT,
+ DAG.getNode(ISD::XOR, dl, NVT, R, Sign),
+ Sign);
+
+ Result = DAG.getSelectCC(dl, Exponent, DAG.getConstant(0, IntVT),
+ DAG.getConstant(0, NVT), Ret, ISD::SETLT);
+ return true;
+}
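To make the expansion above concrete, here is a self-contained scalar analogue of the same bit-level algorithm, mirroring compiler-rt's fixsfdi; like the DAG sequence it does no NaN or overflow handling:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // f32 -> i64 using only integer ops, like the expansion above.
    static int64_t fixsfdi(float F) {
      uint32_t Bits;
      std::memcpy(&Bits, &F, sizeof(Bits));                 // ISD::BITCAST
      int64_t Exponent = (int64_t)((Bits >> 23) & 0xFF) - 127; // unbias
      int64_t Sign = (Bits & 0x80000000) ? -1 : 0;          // all-ones if negative
      int64_t R = (Bits & 0x007FFFFF) | 0x00800000;         // implicit leading 1
      if (Exponent < 0)                                     // |F| < 1 truncates to 0
        return 0;
      R = Exponent > 23 ? R << (Exponent - 23) : R >> (23 - Exponent);
      return (R ^ Sign) - Sign;                             // conditional negate
    }

    int main() {
      std::printf("%lld == %lld\n", (long long)fixsfdi(-42.5f),
                  (long long)(int64_t)-42.5f); // -42 == -42
    }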
diff --git a/lib/CodeGen/SjLjEHPrepare.cpp b/lib/CodeGen/SjLjEHPrepare.cpp
index d2f395594860..b0950ded270a 100644
--- a/lib/CodeGen/SjLjEHPrepare.cpp
+++ b/lib/CodeGen/SjLjEHPrepare.cpp
@@ -249,34 +249,16 @@ void SjLjEHPrepare::lowerIncomingArguments(Function &F) {
 ++AI) {
 Type *Ty = AI->getType();

- // Aggregate types can't be cast, but are legal argument types, so we have
- // to handle them differently. We use an extract/insert pair as a
- // lightweight method to achieve the same goal.
- if (isa<StructType>(Ty) || isa<ArrayType>(Ty)) {
- Instruction *EI = ExtractValueInst::Create(AI, 0, "", AfterAllocaInsPt);
- Instruction *NI = InsertValueInst::Create(AI, EI, 0);
- NI->insertAfter(EI);
- AI->replaceAllUsesWith(NI);
-
- // Set the operand of the instructions back to the AllocaInst.
- EI->setOperand(0, AI);
- NI->setOperand(0, AI);
- } else {
- // This is always a no-op cast because we're casting AI to AI->getType()
- // so src and destination types are identical. BitCast is the only
- // possibility.
- CastInst *NC = new BitCastInst(AI, AI->getType(), AI->getName() + ".tmp",
- AfterAllocaInsPt);
- AI->replaceAllUsesWith(NC);
-
- // Set the operand of the cast instruction back to the AllocaInst.
- // Normally it's forbidden to replace a CastInst's operand because it
- // could cause the opcode to reflect an illegal conversion. However, we're
- // replacing it here with the same value it was constructed with. We do
- // this because the above replaceAllUsesWith() clobbered the operand, but
- // we want this one to remain.
- NC->setOperand(0, AI);
- }
+ // Use 'select i1 true, %arg, undef' to simulate a 'no-op' instruction.
+ Value *TrueValue = ConstantInt::getTrue(F.getContext());
+ Value *UndefValue = UndefValue::get(Ty);
+ Instruction *SI = SelectInst::Create(TrueValue, AI, UndefValue,
+ AI->getName() + ".tmp",
+ AfterAllocaInsPt);
+ AI->replaceAllUsesWith(SI);
+
+ // Reset the operand, because it was clobbered by the RAUW above.
+ SI->setOperand(1, AI);
 }
 }

diff --git a/lib/CodeGen/StackMapLivenessAnalysis.cpp b/lib/CodeGen/StackMapLivenessAnalysis.cpp
index 4dd87ddb88c4..3ba502fff695 100644
--- a/lib/CodeGen/StackMapLivenessAnalysis.cpp
+++ b/lib/CodeGen/StackMapLivenessAnalysis.cpp
@@ -28,10 +28,9 @@ using namespace llvm;
 #define DEBUG_TYPE "stackmaps"

 namespace llvm {
-cl::opt<bool> EnableStackMapLiveness("enable-stackmap-liveness",
- cl::Hidden, cl::desc("Enable StackMap Liveness Analysis Pass"));
 cl::opt<bool> EnablePatchPointLiveness("enable-patchpoint-liveness",
- cl::Hidden, cl::desc("Enable PatchPoint Liveness Analysis Pass"));
+ cl::Hidden, cl::init(true),
+ cl::desc("Enable PatchPoint Liveness Analysis Pass"));
}

 STATISTIC(NumStackMapFuncVisited, "Number of functions visited");
@@ -62,15 +61,17 @@ void StackMapLiveness::getAnalysisUsage(AnalysisUsage &AU) const {
 /// Calculate the liveness information for the given machine function.
 bool StackMapLiveness::runOnMachineFunction(MachineFunction &_MF) {
+ if (!EnablePatchPointLiveness)
+ return false;
+
 DEBUG(dbgs() << "********** COMPUTING STACKMAP LIVENESS: " << _MF.getName()
 << " **********\n");
 MF = &_MF;
 TRI = MF->getTarget().getRegisterInfo();
 ++NumStackMapFuncVisited;

- // Skip this function if there are no stackmaps or patchpoints to process.
- if (!((MF->getFrameInfo()->hasStackMap() && EnableStackMapLiveness) ||
- (MF->getFrameInfo()->hasPatchPoint() && EnablePatchPointLiveness))) {
+ // Skip this function if there are no patchpoints to process.
+ if (!MF->getFrameInfo()->hasPatchPoint()) {
 ++NumStackMapFuncSkipped;
 return false;
 }
@@ -88,13 +89,10 @@ bool StackMapLiveness::calculateLiveness() {
 LiveRegs.addLiveOuts(MBBI);
 bool HasStackMap = false;
 // Reverse iterate over all instructions and add the current live register
- // set to an instruction if we encounter a stackmap or patchpoint
- // instruction.
+ // set to an instruction if we encounter a patchpoint instruction.
for (MachineBasicBlock::reverse_iterator I = MBBI->rbegin(), E = MBBI->rend(); I != E; ++I) { - int Opc = I->getOpcode(); - if ((EnableStackMapLiveness && (Opc == TargetOpcode::STACKMAP)) || - (EnablePatchPointLiveness && (Opc == TargetOpcode::PATCHPOINT))) { + if (I->getOpcode() == TargetOpcode::PATCHPOINT) { addLiveOutSetToMI(*I); HasChanged = true; HasStackMap = true; diff --git a/lib/CodeGen/TargetLoweringBase.cpp b/lib/CodeGen/TargetLoweringBase.cpp index f53048554322..e80ef7176c21 100644 --- a/lib/CodeGen/TargetLoweringBase.cpp +++ b/lib/CodeGen/TargetLoweringBase.cpp @@ -220,6 +220,10 @@ static void InitLibcallNames(const char **Names, const Triple &TT) { Names[RTLIB::FPEXT_F32_F64] = "__extendsfdf2"; Names[RTLIB::FPEXT_F16_F32] = "__gnu_h2f_ieee"; Names[RTLIB::FPROUND_F32_F16] = "__gnu_f2h_ieee"; + Names[RTLIB::FPROUND_F64_F16] = "__truncdfhf2"; + Names[RTLIB::FPROUND_F80_F16] = "__truncxfhf2"; + Names[RTLIB::FPROUND_F128_F16] = "__trunctfhf2"; + Names[RTLIB::FPROUND_PPCF128_F16] = "__trunctfhf2"; Names[RTLIB::FPROUND_F64_F32] = "__truncdfsf2"; Names[RTLIB::FPROUND_F80_F32] = "__truncxfsf2"; Names[RTLIB::FPROUND_F128_F32] = "__trunctfsf2"; @@ -418,7 +422,10 @@ static void InitLibcallCallingConvs(CallingConv::ID *CCs) { /// getFPEXT - Return the FPEXT_*_* value for the given types, or /// UNKNOWN_LIBCALL if there is none. RTLIB::Libcall RTLIB::getFPEXT(EVT OpVT, EVT RetVT) { - if (OpVT == MVT::f32) { + if (OpVT == MVT::f16) { + if (RetVT == MVT::f32) + return FPEXT_F16_F32; + } else if (OpVT == MVT::f32) { if (RetVT == MVT::f64) return FPEXT_F32_F64; if (RetVT == MVT::f128) @@ -434,7 +441,18 @@ RTLIB::Libcall RTLIB::getFPEXT(EVT OpVT, EVT RetVT) { /// getFPROUND - Return the FPROUND_*_* value for the given types, or /// UNKNOWN_LIBCALL if there is none. RTLIB::Libcall RTLIB::getFPROUND(EVT OpVT, EVT RetVT) { - if (RetVT == MVT::f32) { + if (RetVT == MVT::f16) { + if (OpVT == MVT::f32) + return FPROUND_F32_F16; + if (OpVT == MVT::f64) + return FPROUND_F64_F16; + if (OpVT == MVT::f80) + return FPROUND_F80_F16; + if (OpVT == MVT::f128) + return FPROUND_F128_F16; + if (OpVT == MVT::ppcf128) + return FPROUND_PPCF128_F16; + } else if (RetVT == MVT::f32) { if (OpVT == MVT::f64) return FPROUND_F64_F32; if (OpVT == MVT::f80) @@ -690,6 +708,7 @@ TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm, ExceptionPointerRegister = 0; ExceptionSelectorRegister = 0; BooleanContents = UndefinedBooleanContent; + BooleanFloatContents = UndefinedBooleanContent; BooleanVectorContents = UndefinedBooleanContent; SchedPreferenceInfo = Sched::ILP; JumpBufSize = 0; @@ -743,8 +762,15 @@ void TargetLoweringBase::initActions() { // These operations default to expand for vector types. if (VT >= MVT::FIRST_VECTOR_VALUETYPE && - VT <= MVT::LAST_VECTOR_VALUETYPE) + VT <= MVT::LAST_VECTOR_VALUETYPE) { setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::ANY_EXTEND_VECTOR_INREG, + (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, + (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, + (MVT::SimpleValueType)VT, Expand); + } } // Most targets ignore the @llvm.prefetch intrinsic. 
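With the FPROUND_*_F16 entries added above, RTLIB::getFPROUND now resolves half-precision truncations to the new libcalls. A self-contained analogue of that lookup shape, with invented enum names standing in for the RTLIB tables:

    #include <cstdio>

    enum Libcall { UNKNOWN_LIBCALL, FPROUND_F32_F16, FPROUND_F64_F16 };
    enum SimpleVT { f16, f32, f64 };

    // Mirrors the shape of RTLIB::getFPROUND: pick a libcall from the
    // (source, destination) type pair, or UNKNOWN_LIBCALL.
    static Libcall getFPRound(SimpleVT OpVT, SimpleVT RetVT) {
      if (RetVT == f16) {
        if (OpVT == f32) return FPROUND_F32_F16;   // "__gnu_f2h_ieee"
        if (OpVT == f64) return FPROUND_F64_F16;   // "__truncdfhf2"
      }
      return UNKNOWN_LIBCALL;
    }

    int main() {
      std::printf("%d\n", getFPRound(f64, f16) == FPROUND_F64_F16); // 1
    }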
@@ -1081,27 +1107,35 @@ void TargetLoweringBase::computeRegisterProperties() { } } + if (!isTypeLegal(MVT::f16)) { + NumRegistersForVT[MVT::f16] = NumRegistersForVT[MVT::i16]; + RegisterTypeForVT[MVT::f16] = RegisterTypeForVT[MVT::i16]; + TransformToType[MVT::f16] = MVT::i16; + ValueTypeActions.setTypeAction(MVT::f16, TypeSoftenFloat); + } + // Loop over all of the vector value types to see which need transformations. for (unsigned i = MVT::FIRST_VECTOR_VALUETYPE; i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++i) { - MVT VT = (MVT::SimpleValueType)i; - if (isTypeLegal(VT)) continue; + MVT VT = (MVT::SimpleValueType) i; + if (isTypeLegal(VT)) + continue; - // Determine if there is a legal wider type. If so, we should promote to - // that wider vector type. MVT EltVT = VT.getVectorElementType(); unsigned NElts = VT.getVectorNumElements(); - if (NElts != 1 && !shouldSplitVectorType(VT)) { - bool IsLegalWiderType = false; - // First try to promote the elements of integer vectors. If no legal - // promotion was found, fallback to the widen-vector method. - for (unsigned nVT = i+1; nVT <= MVT::LAST_VECTOR_VALUETYPE; ++nVT) { - MVT SVT = (MVT::SimpleValueType)nVT; + bool IsLegalWiderType = false; + LegalizeTypeAction PreferredAction = getPreferredVectorAction(VT); + switch (PreferredAction) { + case TypePromoteInteger: { + // Try to promote the elements of integer vectors. If no legal + // promotion was found, fall through to the widen-vector method. + for (unsigned nVT = i + 1; nVT <= MVT::LAST_VECTOR_VALUETYPE; ++nVT) { + MVT SVT = (MVT::SimpleValueType) nVT; // Promote vectors of integers to vectors with the same number // of elements, with a wider element type. if (SVT.getVectorElementType().getSizeInBits() > EltVT.getSizeInBits() - && SVT.getVectorNumElements() == NElts && - isTypeLegal(SVT) && SVT.getScalarType().isInteger()) { + && SVT.getVectorNumElements() == NElts && isTypeLegal(SVT) + && SVT.getScalarType().isInteger()) { TransformToType[i] = SVT; RegisterTypeForVT[i] = SVT; NumRegistersForVT[i] = 1; @@ -1110,15 +1144,15 @@ void TargetLoweringBase::computeRegisterProperties() { break; } } - - if (IsLegalWiderType) continue; - + if (IsLegalWiderType) + break; + } + case TypeWidenVector: { // Try to widen the vector. - for (unsigned nVT = i+1; nVT <= MVT::LAST_VECTOR_VALUETYPE; ++nVT) { - MVT SVT = (MVT::SimpleValueType)nVT; - if (SVT.getVectorElementType() == EltVT && - SVT.getVectorNumElements() > NElts && - isTypeLegal(SVT)) { + for (unsigned nVT = i + 1; nVT <= MVT::LAST_VECTOR_VALUETYPE; ++nVT) { + MVT SVT = (MVT::SimpleValueType) nVT; + if (SVT.getVectorElementType() == EltVT + && SVT.getVectorNumElements() > NElts && isTypeLegal(SVT)) { TransformToType[i] = SVT; RegisterTypeForVT[i] = SVT; NumRegistersForVT[i] = 1; @@ -1127,27 +1161,34 @@ void TargetLoweringBase::computeRegisterProperties() { break; } } - if (IsLegalWiderType) continue; + if (IsLegalWiderType) + break; } - - MVT IntermediateVT; - MVT RegisterVT; - unsigned NumIntermediates; - NumRegistersForVT[i] = - getVectorTypeBreakdownMVT(VT, IntermediateVT, NumIntermediates, - RegisterVT, this); - RegisterTypeForVT[i] = RegisterVT; - - MVT NVT = VT.getPow2VectorType(); - if (NVT == VT) { - // Type is already a power of 2. The default action is to split. - TransformToType[i] = MVT::Other; - unsigned NumElts = VT.getVectorNumElements(); - ValueTypeActions.setTypeAction(VT, - NumElts > 1 ? 
TypeSplitVector : TypeScalarizeVector); - } else { - TransformToType[i] = NVT; - ValueTypeActions.setTypeAction(VT, TypeWidenVector); + case TypeSplitVector: + case TypeScalarizeVector: { + MVT IntermediateVT; + MVT RegisterVT; + unsigned NumIntermediates; + NumRegistersForVT[i] = getVectorTypeBreakdownMVT(VT, IntermediateVT, + NumIntermediates, RegisterVT, this); + RegisterTypeForVT[i] = RegisterVT; + + MVT NVT = VT.getPow2VectorType(); + if (NVT == VT) { + // Type is already a power of 2. The default action is to split. + TransformToType[i] = MVT::Other; + if (PreferredAction == TypeScalarizeVector) + ValueTypeActions.setTypeAction(VT, TypeScalarizeVector); + else + ValueTypeActions.setTypeAction(VT, TypeSplitVector); + } else { + TransformToType[i] = NVT; + ValueTypeActions.setTypeAction(VT, TypeWidenVector); + } + break; + } + default: + llvm_unreachable("Unknown vector legalization action!"); } } diff --git a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp index 02abc282e6d6..f59efa35031c 100644 --- a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp +++ b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp @@ -192,6 +192,18 @@ getELFSectionFlags(SectionKind K) { return Flags; } +static const Comdat *getELFComdat(const GlobalValue *GV) { + const Comdat *C = GV->getComdat(); + if (!C) + return nullptr; + + if (C->getSelectionKind() != Comdat::Any) + report_fatal_error("ELF COMDATs only support SelectionKind::Any, '" + + C->getName() + "' cannot be lowered."); + + return C; +} + const MCSection *TargetLoweringObjectFileELF::getExplicitSectionGlobal( const GlobalValue *GV, SectionKind Kind, Mangler &Mang, const TargetMachine &TM) const { @@ -200,14 +212,20 @@ const MCSection *TargetLoweringObjectFileELF::getExplicitSectionGlobal( // Infer section flags from the section name if we can. Kind = getELFKindForNamedSection(SectionName, Kind); + StringRef Group = ""; + unsigned Flags = getELFSectionFlags(Kind); + if (const Comdat *C = getELFComdat(GV)) { + Group = C->getName(); + Flags |= ELF::SHF_GROUP; + } return getContext().getELFSection(SectionName, - getELFSectionType(SectionName, Kind), - getELFSectionFlags(Kind), Kind); + getELFSectionType(SectionName, Kind), Flags, + Kind, /*EntrySize=*/0, Group); } /// getSectionPrefixForGlobal - Return the section prefix name used by options /// FunctionsSections and DataSections. -static const char *getSectionPrefixForGlobal(SectionKind Kind) { +static StringRef getSectionPrefixForGlobal(SectionKind Kind) { if (Kind.isText()) return ".text."; if (Kind.isReadOnly()) return ".rodata."; if (Kind.isBSS()) return ".bss."; @@ -224,7 +242,6 @@ static const char *getSectionPrefixForGlobal(SectionKind Kind) { return ".data.rel.ro."; } - const MCSection *TargetLoweringObjectFileELF:: SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind, Mangler &Mang, const TargetMachine &TM) const { @@ -238,18 +255,20 @@ SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind, // If this global is linkonce/weak and the target handles this by emitting it // into a 'uniqued' section name, create and return the section now. 
- if ((GV->isWeakForLinker() || EmitUniquedSection) && + if ((GV->isWeakForLinker() || EmitUniquedSection || GV->hasComdat()) && !Kind.isCommon()) { - const char *Prefix; - Prefix = getSectionPrefixForGlobal(Kind); + StringRef Prefix = getSectionPrefixForGlobal(Kind); - SmallString<128> Name(Prefix, Prefix+strlen(Prefix)); + SmallString<128> Name(Prefix); TM.getNameWithPrefix(Name, GV, Mang, true); StringRef Group = ""; unsigned Flags = getELFSectionFlags(Kind); - if (GV->isWeakForLinker()) { - Group = Name.substr(strlen(Prefix)); + if (GV->isWeakForLinker() || GV->hasComdat()) { + if (const Comdat *C = getELFComdat(GV)) + Group = C->getName(); + else + Group = Name.substr(Prefix.size()); Flags |= ELF::SHF_GROUP; } @@ -319,8 +338,9 @@ SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind, /// getSectionForConstant - Given a mergeable constant with the /// specified size and relocation information, return a section that it /// should be placed in. -const MCSection *TargetLoweringObjectFileELF:: -getSectionForConstant(SectionKind Kind) const { +const MCSection * +TargetLoweringObjectFileELF::getSectionForConstant(SectionKind Kind, + const Constant *C) const { if (Kind.isMergeableConst4() && MergeableConst4Section) return MergeableConst4Section; if (Kind.isMergeableConst8() && MergeableConst8Section) @@ -483,6 +503,15 @@ emitModuleFlags(MCStreamer &Streamer, Streamer.AddBlankLine(); } +static void checkMachOComdat(const GlobalValue *GV) { + const Comdat *C = GV->getComdat(); + if (!C) + return; + + report_fatal_error("MachO doesn't support COMDATs, '" + C->getName() + + "' cannot be lowered."); +} + const MCSection *TargetLoweringObjectFileMachO::getExplicitSectionGlobal( const GlobalValue *GV, SectionKind Kind, Mangler &Mang, const TargetMachine &TM) const { @@ -490,6 +519,9 @@ const MCSection *TargetLoweringObjectFileMachO::getExplicitSectionGlobal( StringRef Segment, Section; unsigned TAA = 0, StubSize = 0; bool TAAParsed; + + checkMachOComdat(GV); + std::string ErrorCode = MCSectionMachO::ParseSectionSpecifier(GV->getSection(), Segment, Section, TAA, TAAParsed, StubSize); @@ -560,6 +592,7 @@ bool TargetLoweringObjectFileMachO::isSectionAtomizableBySymbols( const MCSection *TargetLoweringObjectFileMachO:: SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind, Mangler &Mang, const TargetMachine &TM) const { + checkMachOComdat(GV); // Handle thread local data. if (Kind.isThreadBSS()) return TLSBSSSection; @@ -622,7 +655,8 @@ SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind, } const MCSection * -TargetLoweringObjectFileMachO::getSectionForConstant(SectionKind Kind) const { +TargetLoweringObjectFileMachO::getSectionForConstant(SectionKind Kind, + const Constant *C) const { // If this constant requires a relocation, we have to put it in the data // segment, not in the text segment. 
 if (Kind.isDataRel() || Kind.isReadOnlyWithRel())
@@ -728,6 +762,50 @@ getCOFFSectionFlags(SectionKind K) {
 return Flags;
 }

+static const GlobalValue *getComdatGVForCOFF(const GlobalValue *GV) {
+ const Comdat *C = GV->getComdat();
+ assert(C && "expected GV to have a Comdat!");
+
+ StringRef ComdatGVName = C->getName();
+ const GlobalValue *ComdatGV = GV->getParent()->getNamedValue(ComdatGVName);
+ if (!ComdatGV)
+ report_fatal_error("Associative COMDAT symbol '" + ComdatGVName +
+ "' does not exist.");
+
+ if (ComdatGV->getComdat() != C)
+ report_fatal_error("Associative COMDAT symbol '" + ComdatGVName +
+ "' is not a key for its COMDAT.");
+
+ return ComdatGV;
+}
+
+static int getSelectionForCOFF(const GlobalValue *GV) {
+ if (const Comdat *C = GV->getComdat()) {
+ const GlobalValue *ComdatKey = getComdatGVForCOFF(GV);
+ if (const auto *GA = dyn_cast<GlobalAlias>(ComdatKey))
+ ComdatKey = GA->getBaseObject();
+ if (ComdatKey == GV) {
+ switch (C->getSelectionKind()) {
+ case Comdat::Any:
+ return COFF::IMAGE_COMDAT_SELECT_ANY;
+ case Comdat::ExactMatch:
+ return COFF::IMAGE_COMDAT_SELECT_EXACT_MATCH;
+ case Comdat::Largest:
+ return COFF::IMAGE_COMDAT_SELECT_LARGEST;
+ case Comdat::NoDuplicates:
+ return COFF::IMAGE_COMDAT_SELECT_NODUPLICATES;
+ case Comdat::SameSize:
+ return COFF::IMAGE_COMDAT_SELECT_SAME_SIZE;
+ }
+ } else {
+ return COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE;
+ }
+ } else if (GV->isWeakForLinker()) {
+ return COFF::IMAGE_COMDAT_SELECT_ANY;
+ }
+ return 0;
+}
+
 const MCSection *TargetLoweringObjectFileCOFF::getExplicitSectionGlobal(
 const GlobalValue *GV, SectionKind Kind, Mangler &Mang,
 const TargetMachine &TM) const {
@@ -735,11 +813,21 @@ const MCSection *TargetLoweringObjectFileCOFF::getExplicitSectionGlobal(
 unsigned Characteristics = getCOFFSectionFlags(Kind);
 StringRef Name = GV->getSection();
 StringRef COMDATSymName = "";
- if (GV->isWeakForLinker()) {
- Selection = COFF::IMAGE_COMDAT_SELECT_ANY;
- Characteristics |= COFF::IMAGE_SCN_LNK_COMDAT;
- MCSymbol *Sym = TM.getSymbol(GV, Mang);
- COMDATSymName = Sym->getName();
+ if ((GV->isWeakForLinker() || GV->hasComdat()) && !Kind.isCommon()) {
+ Selection = getSelectionForCOFF(GV);
+ const GlobalValue *ComdatGV;
+ if (Selection == COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE)
+ ComdatGV = getComdatGVForCOFF(GV);
+ else
+ ComdatGV = GV;
+
+ if (!ComdatGV->hasPrivateLinkage()) {
+ MCSymbol *Sym = TM.getSymbol(ComdatGV, Mang);
+ COMDATSymName = Sym->getName();
+ Characteristics |= COFF::IMAGE_SCN_LNK_COMDAT;
+ } else {
+ Selection = 0;
+ }
 }
 return getContext().getCOFFSection(Name,
 Characteristics,
@@ -776,17 +864,27 @@ SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind,
 // into a 'uniqued' section name, create and return the section now.
 // Section names depend on the name of the symbol which is not feasible if the
 // symbol has private linkage.
- if ((GV->isWeakForLinker() || EmitUniquedSection) &&
- !GV->hasPrivateLinkage() && !Kind.isCommon()) {
+ if ((GV->isWeakForLinker() || EmitUniquedSection || GV->hasComdat()) &&
+ !Kind.isCommon()) {
 const char *Name = getCOFFSectionNameForUniqueGlobal(Kind);
 unsigned Characteristics = getCOFFSectionFlags(Kind);

 Characteristics |= COFF::IMAGE_SCN_LNK_COMDAT;
- MCSymbol *Sym = TM.getSymbol(GV, Mang);
- return getContext().getCOFFSection(
- Name, Characteristics, Kind, Sym->getName(),
- GV->isWeakForLinker() ?
COFF::IMAGE_COMDAT_SELECT_ANY - : COFF::IMAGE_COMDAT_SELECT_NODUPLICATES); + int Selection = getSelectionForCOFF(GV); + if (!Selection) + Selection = COFF::IMAGE_COMDAT_SELECT_NODUPLICATES; + const GlobalValue *ComdatGV; + if (GV->hasComdat()) + ComdatGV = getComdatGVForCOFF(GV); + else + ComdatGV = GV; + + if (!ComdatGV->hasPrivateLinkage()) { + MCSymbol *Sym = TM.getSymbol(ComdatGV, Mang); + StringRef COMDATSymName = Sym->getName(); + return getContext().getCOFFSection(Name, Characteristics, Kind, + COMDATSymName, Selection); + } } if (Kind.isText()) diff --git a/lib/ExecutionEngine/ExecutionEngine.cpp b/lib/ExecutionEngine/ExecutionEngine.cpp index 9154fe2f5ff4..b0e985d6dd54 100644 --- a/lib/ExecutionEngine/ExecutionEngine.cpp +++ b/lib/ExecutionEngine/ExecutionEngine.cpp @@ -411,13 +411,14 @@ ExecutionEngine *ExecutionEngine::create(Module *M, std::string *ErrorStr, CodeGenOpt::Level OptLevel, bool GVsWithCode) { - EngineBuilder EB = EngineBuilder(M) - .setEngineKind(ForceInterpreter - ? EngineKind::Interpreter - : EngineKind::JIT) - .setErrorStr(ErrorStr) - .setOptLevel(OptLevel) - .setAllocateGVsWithCode(GVsWithCode); + + EngineBuilder EB = + EngineBuilder(M) + .setEngineKind(ForceInterpreter ? EngineKind::Interpreter + : EngineKind::Either) + .setErrorStr(ErrorStr) + .setOptLevel(OptLevel) + .setAllocateGVsWithCode(GVsWithCode); return EB.create(); } diff --git a/lib/ExecutionEngine/JIT/JITEmitter.cpp b/lib/ExecutionEngine/JIT/JITEmitter.cpp index 50b8c10b638b..2ba1f8695d7c 100644 --- a/lib/ExecutionEngine/JIT/JITEmitter.cpp +++ b/lib/ExecutionEngine/JIT/JITEmitter.cpp @@ -36,7 +36,6 @@ #include "llvm/IR/ValueHandle.h" #include "llvm/IR/ValueMap.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/Disassembler.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/ManagedStatic.h" #include "llvm/Support/Memory.h" @@ -929,11 +928,6 @@ bool JITEmitter::finishFunction(MachineFunction &F) { MemMgr->setMemoryExecutable(); DEBUG({ - if (sys::hasDisassembler()) { - dbgs() << "JIT: Disassembled code:\n"; - dbgs() << sys::disassembleBuffer(FnStart, FnEnd-FnStart, - (uintptr_t)FnStart); - } else { dbgs() << "JIT: Binary code:\n"; uint8_t* q = FnStart; for (int i = 0; q < FnEnd; q += 4, ++i) { @@ -955,7 +949,6 @@ bool JITEmitter::finishFunction(MachineFunction &F) { dbgs() << '\n'; } dbgs()<< '\n'; - } }); if (MMI) diff --git a/lib/ExecutionEngine/RuntimeDyld/CMakeLists.txt b/lib/ExecutionEngine/RuntimeDyld/CMakeLists.txt index cbf7cf14d491..eb1a60b60d08 100644 --- a/lib/ExecutionEngine/RuntimeDyld/CMakeLists.txt +++ b/lib/ExecutionEngine/RuntimeDyld/CMakeLists.txt @@ -1,6 +1,7 @@ add_llvm_library(LLVMRuntimeDyld GDBRegistrar.cpp RuntimeDyld.cpp + RuntimeDyldChecker.cpp RuntimeDyldELF.cpp RuntimeDyldMachO.cpp ) diff --git a/lib/ExecutionEngine/RuntimeDyld/LLVMBuild.txt b/lib/ExecutionEngine/RuntimeDyld/LLVMBuild.txt index 97dc86129a33..8bd562191891 100644 --- a/lib/ExecutionEngine/RuntimeDyld/LLVMBuild.txt +++ b/lib/ExecutionEngine/RuntimeDyld/LLVMBuild.txt @@ -19,4 +19,4 @@ type = Library name = RuntimeDyld parent = ExecutionEngine -required_libraries = Object Support +required_libraries = MC Object Support diff --git a/lib/ExecutionEngine/RuntimeDyld/ObjectImageCommon.h b/lib/ExecutionEngine/RuntimeDyld/ObjectImageCommon.h index 4917b93a96e5..c3a21823bbc8 100644 --- a/lib/ExecutionEngine/RuntimeDyld/ObjectImageCommon.h +++ b/lib/ExecutionEngine/RuntimeDyld/ObjectImageCommon.h @@ -48,7 +48,8 @@ class ObjectImageCommon : public ObjectImage { { // FIXME: error checking? 
createObjectFile returns an ErrorOr<ObjectFile*>
 // and should probably be checked for failure.
- ObjFile.reset(object::ObjectFile::createObjectFile(Buffer->getMemBuffer()).get());
+ std::unique_ptr<MemoryBuffer> Buf(Buffer->getMemBuffer());
+ ObjFile.reset(object::ObjectFile::createObjectFile(Buf).get());
 }

 ObjectImageCommon(std::unique_ptr<object::ObjectFile> Input)
 : ObjectImage(nullptr), ObjFile(std::move(Input)) {}
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
index 9dfd1678de8b..d86a75130685 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
@@ -517,7 +517,8 @@ void RuntimeDyldImpl::addRelocationForSymbol(const RelocationEntry &RE,
 }
 }

-uint8_t *RuntimeDyldImpl::createStubFunction(uint8_t *Addr) {
+uint8_t *RuntimeDyldImpl::createStubFunction(uint8_t *Addr,
+ unsigned AbiVariant) {
 if (Arch == Triple::aarch64 || Arch == Triple::aarch64_be ||
 Arch == Triple::arm64 || Arch == Triple::arm64_be) {
 // This stub has to be able to access the full address space,
@@ -561,22 +562,31 @@ uint8_t *RuntimeDyldImpl::createStubFunction(uint8_t *Addr) {
 *StubAddr = NopInstr;
 return Addr;
 } else if (Arch == Triple::ppc64 || Arch == Triple::ppc64le) {
- // PowerPC64 stub: the address points to a function descriptor
- // instead of the function itself. Load the function address
- // on r11 and sets it to control register. Also loads the function
- // TOC in r2 and environment pointer to r11.
+ // Depending on which version of the ELF ABI is in use, we need to
+ // generate one of two variants of the stub. They both start with
+ // the same sequence to load the target address into r12.
 writeInt32BE(Addr, 0x3D800000); // lis r12, highest(addr)
 writeInt32BE(Addr+4, 0x618C0000); // ori r12, higher(addr)
 writeInt32BE(Addr+8, 0x798C07C6); // sldi r12, r12, 32
 writeInt32BE(Addr+12, 0x658C0000); // oris r12, r12, h(addr)
 writeInt32BE(Addr+16, 0x618C0000); // ori r12, r12, l(addr)
- writeInt32BE(Addr+20, 0xF8410028); // std r2, 40(r1)
- writeInt32BE(Addr+24, 0xE96C0000); // ld r11, 0(r12)
- writeInt32BE(Addr+28, 0xE84C0008); // ld r2, 0(r12)
- writeInt32BE(Addr+32, 0x7D6903A6); // mtctr r11
- writeInt32BE(Addr+36, 0xE96C0010); // ld r11, 16(r2)
- writeInt32BE(Addr+40, 0x4E800420); // bctr
-
+ if (AbiVariant == 2) {
+ // PowerPC64 stub ELFv2 ABI: The address points to the function itself.
+ // The address is already in r12 as required by the ABI. Branch to it.
+ writeInt32BE(Addr+20, 0xF8410018); // std r2, 24(r1)
+ writeInt32BE(Addr+24, 0x7D8903A6); // mtctr r12
+ writeInt32BE(Addr+28, 0x4E800420); // bctr
+ } else {
+ // PowerPC64 stub ELFv1 ABI: The address points to a function descriptor.
+ // Load the function address into r11 and move it to the count register.
+ // Also load the function TOC into r2 and the environment pointer into r11.
+ writeInt32BE(Addr+20, 0xF8410028); // std r2, 40(r1) + writeInt32BE(Addr+24, 0xE96C0000); // ld r11, 0(r12) + writeInt32BE(Addr+28, 0xE84C0008); // ld r2, 8(r12) + writeInt32BE(Addr+32, 0x7D6903A6); // mtctr r11 + writeInt32BE(Addr+36, 0xE96C0010); // ld r11, 16(r12) + writeInt32BE(Addr+40, 0x4E800420); // bctr + } return Addr; } else if (Arch == Triple::systemz) { writeInt16BE(Addr, 0xC418); // lgrl %r1,.+8 @@ -697,8 +707,9 @@ createRuntimeDyldELF(RTDyldMemoryManager *MM, bool ProcessAllSections) { } static std::unique_ptr -createRuntimeDyldMachO(RTDyldMemoryManager *MM, bool ProcessAllSections) { - std::unique_ptr Dyld(new RuntimeDyldMachO(MM)); +createRuntimeDyldMachO(Triple::ArchType Arch, RTDyldMemoryManager *MM, + bool ProcessAllSections) { + std::unique_ptr Dyld(RuntimeDyldMachO::create(Arch, MM)); Dyld->setProcessAllSections(ProcessAllSections); return Dyld; } @@ -715,7 +726,9 @@ ObjectImage *RuntimeDyld::loadObject(std::unique_ptr InputObject) { } else if (InputObject->isMachO()) { InputImage.reset(RuntimeDyldMachO::createObjectImageFromFile(std::move(InputObject))); if (!Dyld) - Dyld = createRuntimeDyldMachO(MM, ProcessAllSections).release(); + Dyld = createRuntimeDyldMachO( + static_cast(InputImage->getArch()), + MM, ProcessAllSections).release(); } else report_fatal_error("Incompatible object format!"); @@ -751,7 +764,9 @@ ObjectImage *RuntimeDyld::loadObject(ObjectBuffer *InputBuffer) { case sys::fs::file_magic::macho_dsym_companion: InputImage.reset(RuntimeDyldMachO::createObjectImage(InputBuffer)); if (!Dyld) - Dyld = createRuntimeDyldMachO(MM, ProcessAllSections).release(); + Dyld = createRuntimeDyldMachO( + static_cast(InputImage->getArch()), + MM, ProcessAllSections).release(); break; case sys::fs::file_magic::unknown: case sys::fs::file_magic::bitcode: diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp new file mode 100644 index 000000000000..1e63d9207f73 --- /dev/null +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp @@ -0,0 +1,656 @@ +//===--- RuntimeDyldChecker.cpp - RuntimeDyld tester framework --*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "llvm/ExecutionEngine/RuntimeDyldChecker.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCDisassembler.h" +#include "llvm/MC/MCInst.h" +#include "llvm/Support/StringRefMemoryObject.h" +#include "RuntimeDyldImpl.h" +#include +#include + +#define DEBUG_TYPE "rtdyld" + +using namespace llvm; + +namespace llvm { + + // Helper class that implements the language evaluated by RuntimeDyldChecker. + class RuntimeDyldCheckerExprEval { + public: + + RuntimeDyldCheckerExprEval(const RuntimeDyldChecker &Checker, + llvm::raw_ostream &ErrStream) + : Checker(Checker), ErrStream(ErrStream) {} + + bool evaluate(StringRef Expr) const { + // Expect equality expression of the form 'LHS = RHS'. + Expr = Expr.trim(); + size_t EQIdx = Expr.find('='); + + // Evaluate LHS.
+ StringRef LHSExpr = Expr.substr(0, EQIdx).rtrim(); + StringRef RemainingExpr; + EvalResult LHSResult; + std::tie(LHSResult, RemainingExpr) = + evalComplexExpr(evalSimpleExpr(LHSExpr)); + if (LHSResult.hasError()) + return handleError(Expr, LHSResult); + if (RemainingExpr != "") + return handleError(Expr, unexpectedToken(RemainingExpr, LHSExpr, "")); + + // Evaluate RHS. + StringRef RHSExpr = Expr.substr(EQIdx + 1).ltrim(); + EvalResult RHSResult; + std::tie(RHSResult, RemainingExpr) = + evalComplexExpr(evalSimpleExpr(RHSExpr)); + if (RHSResult.hasError()) + return handleError(Expr, RHSResult); + if (RemainingExpr != "") + return handleError(Expr, unexpectedToken(RemainingExpr, RHSExpr, "")); + + if (LHSResult.getValue() != RHSResult.getValue()) { + ErrStream << "Expression '" << Expr << "' is false: " + << format("0x%lx", LHSResult.getValue()) << " != " + << format("0x%lx", RHSResult.getValue()) << "\n"; + return false; + } + return true; + } + + private: + const RuntimeDyldChecker &Checker; + llvm::raw_ostream &ErrStream; + + enum class BinOpToken : unsigned { Invalid, Add, Sub, BitwiseAnd, + BitwiseOr, ShiftLeft, ShiftRight }; + + class EvalResult { + public: + EvalResult() + : Value(0), ErrorMsg("") {} + EvalResult(uint64_t Value) + : Value(Value), ErrorMsg("") {} + EvalResult(std::string ErrorMsg) + : Value(0), ErrorMsg(ErrorMsg) {} + uint64_t getValue() const { return Value; } + bool hasError() const { return ErrorMsg != ""; } + const std::string& getErrorMsg() const { return ErrorMsg; } + private: + uint64_t Value; + std::string ErrorMsg; + }; + + StringRef getTokenForError(StringRef Expr) const { + if (Expr.empty()) + return ""; + + StringRef Token, Remaining; + if (isalpha(Expr[0])) + std::tie(Token, Remaining) = parseSymbol(Expr); + else if (isdigit(Expr[0])) + std::tie(Token, Remaining) = parseNumberString(Expr); + else { + unsigned TokLen = 1; + if (Expr.startswith("<<") || Expr.startswith(">>")) + TokLen = 2; + Token = Expr.substr(0, TokLen); + } + return Token; + } + + EvalResult unexpectedToken(StringRef TokenStart, + StringRef SubExpr, + StringRef ErrText) const { + std::string ErrorMsg("Encountered unexpected token '"); + ErrorMsg += getTokenForError(TokenStart); + if (SubExpr != "") { + ErrorMsg += "' while parsing subexpression '"; + ErrorMsg += SubExpr; + } + ErrorMsg += "'"; + if (ErrText != "") { + ErrorMsg += " "; + ErrorMsg += ErrText; + } + return EvalResult(std::move(ErrorMsg)); + } + + bool handleError(StringRef Expr, const EvalResult &R) const { + assert(R.hasError() && "Not an error result."); + ErrStream << "Error evaluating expression '" << Expr << "': " + << R.getErrorMsg() << "\n"; + return false; + } + + std::pair parseBinOpToken(StringRef Expr) const { + if (Expr.empty()) + return std::make_pair(BinOpToken::Invalid, ""); + + // Handle the two 2-character tokens. + if (Expr.startswith("<<")) + return std::make_pair(BinOpToken::ShiftLeft, + Expr.substr(2).ltrim()); + if (Expr.startswith(">>")) + return std::make_pair(BinOpToken::ShiftRight, + Expr.substr(2).ltrim()); + + // Handle one-character tokens. 
+ BinOpToken Op; + switch (Expr[0]) { + default: return std::make_pair(BinOpToken::Invalid, Expr); + case '+': Op = BinOpToken::Add; break; + case '-': Op = BinOpToken::Sub; break; + case '&': Op = BinOpToken::BitwiseAnd; break; + case '|': Op = BinOpToken::BitwiseOr; break; + } + + return std::make_pair(Op, Expr.substr(1).ltrim()); + } + + EvalResult computeBinOpResult(BinOpToken Op, const EvalResult &LHSResult, + const EvalResult &RHSResult) const { + switch (Op) { + default: llvm_unreachable("Tried to evaluate unrecognized operation."); + case BinOpToken::Add: + return EvalResult(LHSResult.getValue() + RHSResult.getValue()); + case BinOpToken::Sub: + return EvalResult(LHSResult.getValue() - RHSResult.getValue()); + case BinOpToken::BitwiseAnd: + return EvalResult(LHSResult.getValue() & RHSResult.getValue()); + case BinOpToken::BitwiseOr: + return EvalResult(LHSResult.getValue() | RHSResult.getValue()); + case BinOpToken::ShiftLeft: + return EvalResult(LHSResult.getValue() << RHSResult.getValue()); + case BinOpToken::ShiftRight: + return EvalResult(LHSResult.getValue() >> RHSResult.getValue()); + } + } + + // Parse a symbol and return a (string, string) pair representing the symbol + // name and expression remaining to be parsed. + std::pair parseSymbol(StringRef Expr) const { + size_t FirstNonSymbol = + Expr.find_first_not_of("0123456789" + "abcdefghijklmnopqrstuvwxyz" + "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + ":_"); + return std::make_pair(Expr.substr(0, FirstNonSymbol), + Expr.substr(FirstNonSymbol).ltrim()); + } + + // Evaluate a call to decode_operand. Decode the instruction operand at the + // given symbol and get the value of the requested operand. + // Returns an error if the instruction cannot be decoded, or the requested + // operand is not an immediate. + // On success, returns a pair containing the value of the operand, plus + // the expression remaining to be evaluated. + std::pair evalDecodeOperand(StringRef Expr) const { + if (!Expr.startswith("(")) + return std::make_pair(unexpectedToken(Expr, Expr, "expected '('"), ""); + StringRef RemainingExpr = Expr.substr(1).ltrim(); + StringRef Symbol; + std::tie(Symbol, RemainingExpr) = parseSymbol(RemainingExpr); + + if (!Checker.isSymbolValid(Symbol)) + return std::make_pair(EvalResult(("Cannot decode unknown symbol '" + + Symbol + "'").str()), + ""); + + if (!RemainingExpr.startswith(",")) + return std::make_pair(unexpectedToken(RemainingExpr, RemainingExpr, + "expected ','"), + ""); + RemainingExpr = RemainingExpr.substr(1).ltrim(); + + EvalResult OpIdxExpr; + std::tie(OpIdxExpr, RemainingExpr) = evalNumberExpr(RemainingExpr); + if (OpIdxExpr.hasError()) + return std::make_pair(OpIdxExpr, ""); + + if (!RemainingExpr.startswith(")")) + return std::make_pair(unexpectedToken(RemainingExpr, RemainingExpr, + "expected ')'"), + ""); + RemainingExpr = RemainingExpr.substr(1).ltrim(); + + MCInst Inst; + uint64_t Size; + if (!decodeInst(Symbol, Inst, Size)) + return std::make_pair(EvalResult(("Couldn't decode instruction at '" + + Symbol + "'").str()), + ""); + + unsigned OpIdx = OpIdxExpr.getValue(); + if (OpIdx >= Inst.getNumOperands()) { + std::string ErrMsg; + raw_string_ostream ErrMsgStream(ErrMsg); + ErrMsgStream << "Invalid operand index '" << format("%i", OpIdx) + << "' for instruction '" << Symbol + << "'. 
Instruction has only " + << format("%i", Inst.getNumOperands()) + << " operands.\nInstruction is:\n "; + Inst.dump_pretty(ErrMsgStream, + Checker.Disassembler->getContext().getAsmInfo(), + Checker.InstPrinter); + return std::make_pair(EvalResult(ErrMsgStream.str()), ""); + } + + const MCOperand &Op = Inst.getOperand(OpIdx); + if (!Op.isImm()) { + std::string ErrMsg; + raw_string_ostream ErrMsgStream(ErrMsg); + ErrMsgStream << "Operand '" << format("%i", OpIdx) + << "' of instruction '" << Symbol + << "' is not an immediate.\nInstruction is:\n "; + Inst.dump_pretty(ErrMsgStream, + Checker.Disassembler->getContext().getAsmInfo(), + Checker.InstPrinter); + + return std::make_pair(EvalResult(ErrMsgStream.str()), ""); + } + + return std::make_pair(EvalResult(Op.getImm()), RemainingExpr); + } + + // Evaluate a call to next_pc. Decode the instruction at the given + // symbol and return the following program counter. + // Returns an error if the instruction cannot be decoded. + // On success, returns a pair containing the next PC, plus the expression + // remaining to be evaluated. + std::pair evalNextPC(StringRef Expr) const { + if (!Expr.startswith("(")) + return std::make_pair(unexpectedToken(Expr, Expr, "expected '('"), ""); + StringRef RemainingExpr = Expr.substr(1).ltrim(); + StringRef Symbol; + std::tie(Symbol, RemainingExpr) = parseSymbol(RemainingExpr); + + if (!Checker.isSymbolValid(Symbol)) + return std::make_pair(EvalResult(("Cannot decode unknown symbol '" + + Symbol + "'").str()), + ""); + + if (!RemainingExpr.startswith(")")) + return std::make_pair(unexpectedToken(RemainingExpr, RemainingExpr, + "expected ')'"), + ""); + RemainingExpr = RemainingExpr.substr(1).ltrim(); + + MCInst Inst; + uint64_t Size; + if (!decodeInst(Symbol, Inst, Size)) + return std::make_pair(EvalResult(("Couldn't decode instruction at '" + + Symbol + "'").str()), + ""); + uint64_t NextPC = Checker.getSymbolAddress(Symbol) + Size; + + return std::make_pair(EvalResult(NextPC), RemainingExpr); + } + + // Evaluate an identifier expr, which may be a symbol, or a call to + // one of the builtin functions: decode_operand or next_pc. + // Return the result, plus the expression remaining to be parsed. + std::pair evalIdentifierExpr(StringRef Expr) const { + StringRef Symbol; + StringRef RemainingExpr; + std::tie(Symbol, RemainingExpr) = parseSymbol(Expr); + + // Check for builtin function calls. + if (Symbol == "decode_operand") + return evalDecodeOperand(RemainingExpr); + else if (Symbol == "next_pc") + return evalNextPC(RemainingExpr); + + if (!Checker.isSymbolValid(Symbol)) { + std::string ErrMsg("No known address for symbol '"); + ErrMsg += Symbol; + ErrMsg += "'"; + if (Symbol.startswith("L")) + ErrMsg += " (this appears to be an assembler local label - " + " perhaps drop the 'L'?)"; + + return std::make_pair(EvalResult(ErrMsg), ""); + } + + // Looks like a plain symbol reference. + return std::make_pair(EvalResult(Checker.getSymbolAddress(Symbol)), + RemainingExpr); + } + + // Parse a number (hexadecimal or decimal) and return a (string, string) + // pair representing the number and the expression remaining to be parsed.
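+ // For example (illustrative): parsing "0x1f)" yields the pair ("0x1f", ")"), + // and parsing "42]" yields ("42", "]"); the remainder is returned untrimmed.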
+ std::pair parseNumberString(StringRef Expr) const { + size_t FirstNonDigit = StringRef::npos; + if (Expr.startswith("0x")) { + FirstNonDigit = Expr.find_first_not_of("0123456789abcdefABCDEF", 2); + if (FirstNonDigit == StringRef::npos) + FirstNonDigit = Expr.size(); + } else { + FirstNonDigit = Expr.find_first_not_of("0123456789"); + if (FirstNonDigit == StringRef::npos) + FirstNonDigit = Expr.size(); + } + return std::make_pair(Expr.substr(0, FirstNonDigit), + Expr.substr(FirstNonDigit)); + } + + // Evaluate a constant numeric expression (hexadecimal or decimal) and + // return a pair containing the result, and the expression remaining to be + // evaluated. + std::pair evalNumberExpr(StringRef Expr) const { + StringRef ValueStr; + StringRef RemainingExpr; + std::tie(ValueStr, RemainingExpr) = parseNumberString(Expr); + + if (ValueStr.empty() || !isdigit(ValueStr[0])) + return std::make_pair(unexpectedToken(RemainingExpr, RemainingExpr, + "expected number"), + ""); + uint64_t Value; + ValueStr.getAsInteger(0, Value); + return std::make_pair(EvalResult(Value), RemainingExpr); + } + + // Evaluate an expression of the form "()" and return a pair + // containing the result of evaluating , plus the expression + // remaining to be parsed. + std::pair evalParensExpr(StringRef Expr) const { + assert(Expr.startswith("(") && "Not a parenthesized expression"); + EvalResult SubExprResult; + StringRef RemainingExpr; + std::tie(SubExprResult, RemainingExpr) = + evalComplexExpr(evalSimpleExpr(Expr.substr(1).ltrim())); + if (SubExprResult.hasError()) + return std::make_pair(SubExprResult, ""); + if (!RemainingExpr.startswith(")")) + return std::make_pair(unexpectedToken(RemainingExpr, Expr, + "expected ')'"), + ""); + RemainingExpr = RemainingExpr.substr(1).ltrim(); + return std::make_pair(SubExprResult, RemainingExpr); + } + + // Evaluate an expression in one of the following forms: + // *{} + // *{}( + ) + // *{}( - ) + // Return a pair containing the result, plus the expression remaining to be + // parsed. + std::pair evalLoadExpr(StringRef Expr) const { + assert(Expr.startswith("*") && "Not a load expression"); + StringRef RemainingExpr = Expr.substr(1).ltrim(); + // Parse read size. + if (!RemainingExpr.startswith("{")) + return std::make_pair(EvalResult("Expected '{' following '*'."), ""); + RemainingExpr = RemainingExpr.substr(1).ltrim(); + EvalResult ReadSizeExpr; + std::tie(ReadSizeExpr, RemainingExpr) = evalNumberExpr(RemainingExpr); + if (ReadSizeExpr.hasError()) + return std::make_pair(ReadSizeExpr, RemainingExpr); + uint64_t ReadSize = ReadSizeExpr.getValue(); + if (ReadSize < 1 || ReadSize > 8) + return std::make_pair(EvalResult("Invalid size for dereference."), ""); + if (!RemainingExpr.startswith("}")) + return std::make_pair(EvalResult("Missing '}' for dereference."), ""); + RemainingExpr = RemainingExpr.substr(1).ltrim(); + + // Check for '(symbol +/- constant)' form. + bool SymbolPlusConstant = false; + if (RemainingExpr.startswith("(")) { + SymbolPlusConstant = true; + RemainingExpr = RemainingExpr.substr(1).ltrim(); + } + + // Read symbol. + StringRef Symbol; + std::tie(Symbol, RemainingExpr) = parseSymbol(RemainingExpr); + + if (!Checker.isSymbolValid(Symbol)) + return std::make_pair(EvalResult(("Cannot dereference unknown symbol '" + + Symbol + "'").str()), + ""); + + // Set up default offset. + int64_t Offset = 0; + + // Handle "+/- constant)" portion if necessary.
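+ // For example (illustrative): in *{8}(foo - 4) the OpChar parsed below is + // '-' and the final Offset is -4, so eight bytes are read starting four + // bytes before foo's address.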
+ if (SymbolPlusConstant) { + char OpChar = RemainingExpr[0]; + if (OpChar != '+' && OpChar != '-') + return std::make_pair(EvalResult("Invalid operator in load address."), + ""); + RemainingExpr = RemainingExpr.substr(1).ltrim(); + + EvalResult OffsetExpr; + std::tie(OffsetExpr, RemainingExpr) = evalNumberExpr(RemainingExpr); + + Offset = (OpChar == '+') ? + OffsetExpr.getValue() : -1 * OffsetExpr.getValue(); + + if (!RemainingExpr.startswith(")")) + return std::make_pair(EvalResult("Missing ')' in load address."), + ""); + + RemainingExpr = RemainingExpr.substr(1).ltrim(); + } + + return std::make_pair( + EvalResult(Checker.readMemoryAtSymbol(Symbol, Offset, ReadSize)), + RemainingExpr); + } + + // Evaluate a "simple" expression. This is any expression that _isn't_ an + // un-parenthesized binary expression. + // + // "Simple" expressions can be optionally bit-sliced. See evalSliceExpr. + // + // Returns a pair containing the result of the evaluation, plus the + // expression remaining to be parsed. + std::pair evalSimpleExpr(StringRef Expr) const { + EvalResult SubExprResult; + StringRef RemainingExpr; + + if (Expr.empty()) + return std::make_pair(EvalResult("Unexpected end of expression"), ""); + + if (Expr[0] == '(') + std::tie(SubExprResult, RemainingExpr) = evalParensExpr(Expr); + else if (Expr[0] == '*') + std::tie(SubExprResult, RemainingExpr) = evalLoadExpr(Expr); + else if (isalpha(Expr[0])) + std::tie(SubExprResult, RemainingExpr) = evalIdentifierExpr(Expr); + else if (isdigit(Expr[0])) + std::tie(SubExprResult, RemainingExpr) = evalNumberExpr(Expr); + + if (SubExprResult.hasError()) + return std::make_pair(SubExprResult, RemainingExpr); + + // Evaluate bit-slice if present. + if (RemainingExpr.startswith("[")) + std::tie(SubExprResult, RemainingExpr) = + evalSliceExpr(std::make_pair(SubExprResult, RemainingExpr)); + + return std::make_pair(SubExprResult, RemainingExpr); + } + + // Evaluate a bit-slice of an expression. + // A bit-slice has the form "[high:low]". The result of evaluating a + // slice is the bits between high and low (inclusive) in the original + // expression, right shifted so that the "low" bit is in position 0 in the + // result. + // Returns a pair containing the result of the slice operation, plus the + // expression remaining to be parsed.
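+ // For example (illustrative): if label_sym has address 0x10a4, then + // label_sym[7:2] evaluates to 0x29, since bits 7..2 of 0x10a4 are 101001b, + // right-shifted so the low bit lands in position 0.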
+ std::pair evalSliceExpr( + std::pair Ctx) const { + EvalResult SubExprResult; + StringRef RemainingExpr; + std::tie(SubExprResult, RemainingExpr) = Ctx; + + assert(RemainingExpr.startswith("[") && "Not a slice expr."); + RemainingExpr = RemainingExpr.substr(1).ltrim(); + + EvalResult HighBitExpr; + std::tie(HighBitExpr, RemainingExpr) = evalNumberExpr(RemainingExpr); + + if (HighBitExpr.hasError()) + return std::make_pair(HighBitExpr, RemainingExpr); + + if (!RemainingExpr.startswith(":")) + return std::make_pair(unexpectedToken(RemainingExpr, RemainingExpr, + "expected ':'"), + ""); + RemainingExpr = RemainingExpr.substr(1).ltrim(); + + EvalResult LowBitExpr; + std::tie(LowBitExpr, RemainingExpr) = evalNumberExpr(RemainingExpr); + + if (LowBitExpr.hasError()) + return std::make_pair(LowBitExpr, RemainingExpr); + + if (!RemainingExpr.startswith("]")) + return std::make_pair(unexpectedToken(RemainingExpr, RemainingExpr, + "expected ']'"), + ""); + RemainingExpr = RemainingExpr.substr(1).ltrim(); + + unsigned HighBit = HighBitExpr.getValue(); + unsigned LowBit = LowBitExpr.getValue(); + uint64_t Mask = ((uint64_t)1 << (HighBit - LowBit + 1)) - 1; + uint64_t SlicedValue = (SubExprResult.getValue() >> LowBit) & Mask; + return std::make_pair(EvalResult(SlicedValue), RemainingExpr); + } + + // Evaluate a "complex" expression. + // Takes an already evaluated subexpression and checks for the presence of a + // binary operator, computing the result of the binary operation if one is + // found. Used to make arithmetic expressions left-associative. + // Returns a pair containing the ultimate result of evaluating the + // expression, plus the expression remaining to be evaluated. + std::pair evalComplexExpr( + std::pair Ctx) const { + EvalResult LHSResult; + StringRef RemainingExpr; + std::tie(LHSResult, RemainingExpr) = Ctx; + + // If there was an error, or there's nothing left to evaluate, return the + // result. + if (LHSResult.hasError() || RemainingExpr == "") + return std::make_pair(LHSResult, RemainingExpr); + + // Otherwise check if this is a binary expression. + BinOpToken BinOp; + std::tie(BinOp, RemainingExpr) = parseBinOpToken(RemainingExpr); + + // If this isn't a recognized expression just return. + if (BinOp == BinOpToken::Invalid) + return std::make_pair(LHSResult, RemainingExpr); + + // This is a recognized bin-op. Evaluate the RHS, then evaluate the binop. + EvalResult RHSResult; + std::tie(RHSResult, RemainingExpr) = evalSimpleExpr(RemainingExpr); + + // If there was an error evaluating the RHS, return it. + if (RHSResult.hasError()) + return std::make_pair(RHSResult, RemainingExpr); + + // This is a binary expression - evaluate and try to continue as a + // complex expr.
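+ // For example (illustrative): "1+2<<3" evaluates to (1 + 2) << 3 == 0x18; + // there is no operator precedence here, only left-to-right association.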
+ EvalResult ThisResult(computeBinOpResult(BinOp, LHSResult, RHSResult)); + + return evalComplexExpr(std::make_pair(ThisResult, RemainingExpr)); + } + + bool decodeInst(StringRef Symbol, MCInst &Inst, uint64_t &Size) const { + MCDisassembler *Dis = Checker.Disassembler; + StringRef SectionMem = Checker.getSubsectionStartingAt(Symbol); + StringRefMemoryObject SectionBytes(SectionMem, 0); + + MCDisassembler::DecodeStatus S = + Dis->getInstruction(Inst, Size, SectionBytes, 0, nulls(), nulls()); + + return (S == MCDisassembler::Success); + } + + }; + +} + +bool RuntimeDyldChecker::check(StringRef CheckExpr) const { + CheckExpr = CheckExpr.trim(); + DEBUG(llvm::dbgs() << "RuntimeDyldChecker: Checking '" << CheckExpr + << "'...\n"); + RuntimeDyldCheckerExprEval P(*this, ErrStream); + bool Result = P.evaluate(CheckExpr); + (void)Result; + DEBUG(llvm::dbgs() << "RuntimeDyldChecker: '" << CheckExpr << "' " + << (Result ? "passed" : "FAILED") << ".\n"); + return Result; +} + +bool RuntimeDyldChecker::checkAllRulesInBuffer(StringRef RulePrefix, + MemoryBuffer* MemBuf) const { + bool DidAllTestsPass = true; + unsigned NumRules = 0; + + const char *LineStart = MemBuf->getBufferStart(); + + // Eat whitespace. + while (LineStart != MemBuf->getBufferEnd() && + std::isspace(*LineStart)) + ++LineStart; + + while (LineStart != MemBuf->getBufferEnd() && *LineStart != '\0') { + const char *LineEnd = LineStart; + while (LineEnd != MemBuf->getBufferEnd() && + *LineEnd != '\r' && *LineEnd != '\n') + ++LineEnd; + + StringRef Line(LineStart, LineEnd - LineStart); + if (Line.startswith(RulePrefix)) { + DidAllTestsPass &= check(Line.substr(RulePrefix.size())); + ++NumRules; + } + + // Eat whitespace. + LineStart = LineEnd; + while (LineStart != MemBuf->getBufferEnd() && + std::isspace(*LineStart)) + ++LineStart; + } + return DidAllTestsPass && (NumRules != 0); +} + +bool RuntimeDyldChecker::isSymbolValid(StringRef Symbol) const { + return RTDyld.getSymbolAddress(Symbol) != nullptr; +} + +uint64_t RuntimeDyldChecker::getSymbolAddress(StringRef Symbol) const { + return RTDyld.getAnySymbolRemoteAddress(Symbol); +} + +uint64_t RuntimeDyldChecker::readMemoryAtSymbol(StringRef Symbol, + int64_t Offset, + unsigned Size) const { + uint8_t *Src = RTDyld.getSymbolAddress(Symbol); + uint64_t Result = 0; + memcpy(&Result, Src + Offset, Size); + return Result; +} + +StringRef RuntimeDyldChecker::getSubsectionStartingAt(StringRef Name) const { + RuntimeDyldImpl::SymbolTableMap::const_iterator pos = + RTDyld.GlobalSymbolTable.find(Name); + if (pos == RTDyld.GlobalSymbolTable.end()) + return StringRef(); + RuntimeDyldImpl::SymbolLoc Loc = pos->second; + uint8_t *SectionAddr = RTDyld.getSectionAddress(Loc.first); + return StringRef(reinterpret_cast(SectionAddr) + Loc.second, + RTDyld.Sections[Loc.first].Size - Loc.second); +} diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp index 56471f43b2e2..728138ed8c16 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp @@ -55,9 +55,9 @@ template class DyldELFObject : public ELFObjectFile { public: DyldELFObject(std::unique_ptr UnderlyingFile, - MemoryBuffer *Wrapper, std::error_code &ec); + std::unique_ptr Wrapper, std::error_code &ec); - DyldELFObject(MemoryBuffer *Wrapper, std::error_code &ec); + DyldELFObject(std::unique_ptr Wrapper, std::error_code &ec); void updateSectionAddress(const SectionRef &Sec, uint64_t Addr); void updateSymbolAddress(const 
SymbolRef &Sym, uint64_t Addr); @@ -109,15 +109,17 @@ template class ELFObjectImage : public ObjectImageCommon { // actual memory. Ultimately, the Binary parent class will take ownership of // this MemoryBuffer object but not the underlying memory. template -DyldELFObject::DyldELFObject(MemoryBuffer *Wrapper, std::error_code &ec) - : ELFObjectFile(Wrapper, ec) { +DyldELFObject::DyldELFObject(std::unique_ptr Wrapper, + std::error_code &EC) + : ELFObjectFile(std::move(Wrapper), EC) { this->isDyldELFObject = true; } template DyldELFObject::DyldELFObject(std::unique_ptr UnderlyingFile, - MemoryBuffer *Wrapper, std::error_code &ec) - : ELFObjectFile(Wrapper, ec), + std::unique_ptr Wrapper, + std::error_code &EC) + : ELFObjectFile(std::move(Wrapper), EC), UnderlyingFile(std::move(UnderlyingFile)) { this->isDyldELFObject = true; } @@ -183,29 +185,29 @@ RuntimeDyldELF::createObjectImageFromFile(std::unique_ptr Ob return nullptr; std::error_code ec; - MemoryBuffer *Buffer = - MemoryBuffer::getMemBuffer(ObjFile->getData(), "", false); + std::unique_ptr Buffer( + MemoryBuffer::getMemBuffer(ObjFile->getData(), "", false)); if (ObjFile->getBytesInAddress() == 4 && ObjFile->isLittleEndian()) { auto Obj = llvm::make_unique>>( - std::move(ObjFile), Buffer, ec); + std::move(ObjFile), std::move(Buffer), ec); return new ELFObjectImage>( nullptr, std::move(Obj)); } else if (ObjFile->getBytesInAddress() == 4 && !ObjFile->isLittleEndian()) { auto Obj = llvm::make_unique>>( - std::move(ObjFile), Buffer, ec); + std::move(ObjFile), std::move(Buffer), ec); return new ELFObjectImage>(nullptr, std::move(Obj)); } else if (ObjFile->getBytesInAddress() == 8 && !ObjFile->isLittleEndian()) { auto Obj = llvm::make_unique>>( - std::move(ObjFile), Buffer, ec); + std::move(ObjFile), std::move(Buffer), ec); return new ELFObjectImage>(nullptr, std::move(Obj)); } else if (ObjFile->getBytesInAddress() == 8 && ObjFile->isLittleEndian()) { auto Obj = llvm::make_unique>>( - std::move(ObjFile), Buffer, ec); + std::move(ObjFile), std::move(Buffer), ec); return new ELFObjectImage>( nullptr, std::move(Obj)); } else @@ -220,29 +222,31 @@ ObjectImage *RuntimeDyldELF::createObjectImage(ObjectBuffer *Buffer) { (uint8_t)Buffer->getBufferStart()[ELF::EI_DATA]); std::error_code ec; + std::unique_ptr Buf(Buffer->getMemBuffer()); + if (Ident.first == ELF::ELFCLASS32 && Ident.second == ELF::ELFDATA2LSB) { auto Obj = llvm::make_unique>>( - Buffer->getMemBuffer(), ec); + std::move(Buf), ec); return new ELFObjectImage>( Buffer, std::move(Obj)); } else if (Ident.first == ELF::ELFCLASS32 && Ident.second == ELF::ELFDATA2MSB) { auto Obj = llvm::make_unique>>( - Buffer->getMemBuffer(), ec); + std::move(Buf), ec); return new ELFObjectImage>(Buffer, std::move(Obj)); } else if (Ident.first == ELF::ELFCLASS64 && Ident.second == ELF::ELFDATA2MSB) { auto Obj = llvm::make_unique>>( - Buffer->getMemBuffer(), ec); + std::move(Buf), ec); return new ELFObjectImage>(Buffer, std::move(Obj)); } else if (Ident.first == ELF::ELFCLASS64 && Ident.second == ELF::ELFDATA2LSB) { auto Obj = llvm::make_unique>>( - Buffer->getMemBuffer(), ec); + std::move(Buf), ec); return new ELFObjectImage>(Buffer, std::move(Obj)); } else llvm_unreachable("Unexpected ELF format"); @@ -612,30 +616,38 @@ void RuntimeDyldELF::resolveMIPSRelocation(const SectionEntry &Section, } } -// Return the .TOC. section address to R_PPC64_TOC relocations. -uint64_t RuntimeDyldELF::findPPC64TOC() const { +// Return the .TOC. section and offset. 
+void RuntimeDyldELF::findPPC64TOCSection(ObjectImage &Obj, + ObjSectionToIDMap &LocalSections, + RelocationValueRef &Rel) { + // Set a default SectionID in case we do not find a TOC section below. + // This may happen for references to the TOC base (sym@toc, .odp + // relocation) without a .toc directive. In this case just use the + // first section (which is usually the .odp) since the code won't + // reference the .toc base directly. + Rel.SymbolName = NULL; + Rel.SectionID = 0; + + // The TOC consists of sections .got, .toc, .tocbss, .plt in that + // order. The TOC starts where the first of these sections starts. - SectionList::const_iterator it = Sections.begin(); - SectionList::const_iterator ite = Sections.end(); - for (; it != ite; ++it) { - if (it->Name == ".got" || it->Name == ".toc" || it->Name == ".tocbss" || - it->Name == ".plt") + for (section_iterator si = Obj.begin_sections(), se = Obj.end_sections(); + si != se; ++si) { + + StringRef SectionName; + check(si->getName(SectionName)); + + if (SectionName == ".got" + || SectionName == ".toc" + || SectionName == ".tocbss" + || SectionName == ".plt") { + Rel.SectionID = findOrEmitSection(Obj, *si, false, LocalSections); break; + } } - if (it == ite) { - // This may happen for - // * references to TOC base base (sym@toc, .odp relocation) without - // a .toc directive. - // In this case just use the first section (which is usually - // the .odp) since the code won't reference the .toc base - // directly. - it = Sections.begin(); - } - assert(it != ite); + // Per the ppc64-elf-linux ABI, the TOC base is the TOC value plus 0x8000, // thus permitting a full 64 Kbytes segment. - return it->LoadAddress + 0x8000; + Rel.Addend = 0x8000; } // Returns the sections and offset associated with the ODP entry referenced @@ -702,24 +714,37 @@ void RuntimeDyldELF::findOPDEntrySection(ObjectImage &Obj, llvm_unreachable("Attempting to get address of ODP entry!"); } -// Relocation masks following the #lo(value), #hi(value), #higher(value), -// and #highest(value) macros defined in section 4.5.1. Relocation Types -// in PPC-elf64abi document. -// +// Relocation masks following the #lo(value), #hi(value), #ha(value), +// #higher(value), #highera(value), #highest(value), and #highesta(value) +// macros defined in section 4.5.1. Relocation Types of the PPC-elf64abi +// document.
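+// For example (illustrative): for the value 0x123456789ABCDEF0, #lo is +// 0xDEF0, #hi is 0x9ABC, and #ha is 0x9ABD. The extra 0x8000 in the 'a' +// forms compensates for the low half being sign-extended (0xDEF0 is negative +// as a signed 16-bit value) when the halves are recombined with arithmetic +// instructions such as addi.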
+ static inline uint16_t applyPPClo(uint64_t value) { return value & 0xffff; } static inline uint16_t applyPPChi(uint64_t value) { return (value >> 16) & 0xffff; } +static inline uint16_t applyPPCha (uint64_t value) { + return ((value + 0x8000) >> 16) & 0xffff; +} + static inline uint16_t applyPPChigher(uint64_t value) { return (value >> 32) & 0xffff; } +static inline uint16_t applyPPChighera (uint64_t value) { + return ((value + 0x8000) >> 32) & 0xffff; +} + static inline uint16_t applyPPChighest(uint64_t value) { return (value >> 48) & 0xffff; } +static inline uint16_t applyPPChighesta (uint64_t value) { + return ((value + 0x8000) >> 48) & 0xffff; +} + void RuntimeDyldELF::resolvePPC64Relocation(const SectionEntry &Section, uint64_t Offset, uint64_t Value, uint32_t Type, int64_t Addend) { @@ -728,24 +753,57 @@ void RuntimeDyldELF::resolvePPC64Relocation(const SectionEntry &Section, default: llvm_unreachable("Relocation type not implemented yet!"); break; + case ELF::R_PPC64_ADDR16: + writeInt16BE(LocalAddress, applyPPClo(Value + Addend)); + break; + case ELF::R_PPC64_ADDR16_DS: + writeInt16BE(LocalAddress, applyPPClo(Value + Addend) & ~3); + break; case ELF::R_PPC64_ADDR16_LO: writeInt16BE(LocalAddress, applyPPClo(Value + Addend)); break; + case ELF::R_PPC64_ADDR16_LO_DS: + writeInt16BE(LocalAddress, applyPPClo(Value + Addend) & ~3); + break; case ELF::R_PPC64_ADDR16_HI: writeInt16BE(LocalAddress, applyPPChi(Value + Addend)); break; + case ELF::R_PPC64_ADDR16_HA: + writeInt16BE(LocalAddress, applyPPCha(Value + Addend)); + break; case ELF::R_PPC64_ADDR16_HIGHER: writeInt16BE(LocalAddress, applyPPChigher(Value + Addend)); break; + case ELF::R_PPC64_ADDR16_HIGHERA: + writeInt16BE(LocalAddress, applyPPChighera(Value + Addend)); + break; case ELF::R_PPC64_ADDR16_HIGHEST: writeInt16BE(LocalAddress, applyPPChighest(Value + Addend)); break; + case ELF::R_PPC64_ADDR16_HIGHESTA: + writeInt16BE(LocalAddress, applyPPChighesta(Value + Addend)); + break; case ELF::R_PPC64_ADDR14: { assert(((Value + Addend) & 3) == 0); // Preserve the AA/LK bits in the branch instruction uint8_t aalk = *(LocalAddress + 3); writeInt16BE(LocalAddress + 2, (aalk & 3) | ((Value + Addend) & 0xfffc)); } break; + case ELF::R_PPC64_REL16_LO: { + uint64_t FinalAddress = (Section.LoadAddress + Offset); + uint64_t Delta = Value - FinalAddress + Addend; + writeInt16BE(LocalAddress, applyPPClo(Delta)); + } break; + case ELF::R_PPC64_REL16_HI: { + uint64_t FinalAddress = (Section.LoadAddress + Offset); + uint64_t Delta = Value - FinalAddress + Addend; + writeInt16BE(LocalAddress, applyPPChi(Delta)); + } break; + case ELF::R_PPC64_REL16_HA: { + uint64_t FinalAddress = (Section.LoadAddress + Offset); + uint64_t Delta = Value - FinalAddress + Addend; + writeInt16BE(LocalAddress, applyPPCha(Delta)); + } break; case ELF::R_PPC64_ADDR32: { int32_t Result = static_cast(Value + Addend); if (SignExtend32<32>(Result) != Result) @@ -775,19 +833,6 @@ void RuntimeDyldELF::resolvePPC64Relocation(const SectionEntry &Section, case ELF::R_PPC64_ADDR64: writeInt64BE(LocalAddress, Value + Addend); break; - case ELF::R_PPC64_TOC: - writeInt64BE(LocalAddress, findPPC64TOC()); - break; - case ELF::R_PPC64_TOC16: { - uint64_t TOCStart = findPPC64TOC(); - Value = applyPPClo((Value + Addend) - TOCStart); - writeInt16BE(LocalAddress, applyPPClo(Value)); - } break; - case ELF::R_PPC64_TOC16_DS: { - uint64_t TOCStart = findPPC64TOC(); - Value = ((Value + Addend) - TOCStart); - writeInt16BE(LocalAddress, applyPPClo(Value)); - } break; } } @@ -1096,6 +1141,10 @@ 
relocation_iterator RuntimeDyldELF::processRelocationRef( } } else if (Arch == Triple::ppc64 || Arch == Triple::ppc64le) { if (RelType == ELF::R_PPC64_REL24) { + // Determine ABI variant in use for this object. + unsigned AbiVariant; + Obj.getObjectFile()->getPlatformFlags(AbiVariant); + AbiVariant &= ELF::EF_PPC64_ABI; // A PPC branch relocation will need a stub function if the target is // an external symbol (Symbol::ST_Unknown) or if the target address // is not within the signed 24-bits branch address. @@ -1103,10 +1152,18 @@ relocation_iterator RuntimeDyldELF::processRelocationRef( uint8_t *Target = Section.Address + Offset; bool RangeOverflow = false; if (SymType != SymbolRef::ST_Unknown) { - // A function call may points to the .opd entry, so the final symbol - // value - // in calculated based in the relocation values in .opd section. - findOPDEntrySection(Obj, ObjSectionToID, Value); + if (AbiVariant != 2) { + // In the ELFv1 ABI, a function call may point to the .opd entry, + // so the final symbol value is calculated based on the relocation + // values in the .opd section. + findOPDEntrySection(Obj, ObjSectionToID, Value); + } else { + // In the ELFv2 ABI, a function symbol may provide a local entry + // point, which must be used for direct calls. + uint8_t SymOther; + Symbol->getOther(SymOther); + Value.Addend += ELF::decodePPC64LocalEntryOffset(SymOther); + } uint8_t *RelocTarget = Sections[Value.SectionID].Address + Value.Addend; int32_t delta = static_cast(Target - RelocTarget); // If it is within 24-bits branch range, just set the branch target @@ -1134,19 +1191,26 @@ relocation_iterator RuntimeDyldELF::processRelocationRef( DEBUG(dbgs() << " Create a new stub function\n"); Stubs[Value] = Section.StubOffset; uint8_t *StubTargetAddr = - createStubFunction(Section.Address + Section.StubOffset); + createStubFunction(Section.Address + Section.StubOffset, + AbiVariant); RelocationEntry RE(SectionID, StubTargetAddr - Section.Address, ELF::R_PPC64_ADDR64, Value.Addend); // Generates the 64-bits address loads as exemplified in section - // 4.5.1 in PPC64 ELF ABI. - RelocationEntry REhst(SectionID, StubTargetAddr - Section.Address + 2, + // 4.5.1 in PPC64 ELF ABI. Note that the relocations need to + // apply to the low part of the instructions, so we have to update + // the offset according to the target endianness. 
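+ // For example (illustrative): 'lis r12, highest(addr)' encodes its 16-bit + // immediate in the low half of the 32-bit instruction word, which on a + // big-endian target is the second pair of bytes; hence the +2 below.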
+ uint64_t StubRelocOffset = StubTargetAddr - Section.Address; + if (!IsTargetLittleEndian) + StubRelocOffset += 2; + + RelocationEntry REhst(SectionID, StubRelocOffset + 0, ELF::R_PPC64_ADDR16_HIGHEST, Value.Addend); - RelocationEntry REhr(SectionID, StubTargetAddr - Section.Address + 6, + RelocationEntry REhr(SectionID, StubRelocOffset + 4, ELF::R_PPC64_ADDR16_HIGHER, Value.Addend); - RelocationEntry REh(SectionID, StubTargetAddr - Section.Address + 14, + RelocationEntry REh(SectionID, StubRelocOffset + 12, ELF::R_PPC64_ADDR16_HI, Value.Addend); - RelocationEntry REl(SectionID, StubTargetAddr - Section.Address + 18, + RelocationEntry REl(SectionID, StubRelocOffset + 16, ELF::R_PPC64_ADDR16_LO, Value.Addend); if (Value.SymbolName) { @@ -1166,16 +1230,60 @@ relocation_iterator RuntimeDyldELF::processRelocationRef( RelType, 0); Section.StubOffset += getMaxStubSize(); } - if (SymType == SymbolRef::ST_Unknown) + if (SymType == SymbolRef::ST_Unknown) { // Restore the TOC for external calls - writeInt32BE(Target + 4, 0xE8410028); // ld r2,40(r1) + if (AbiVariant == 2) + writeInt32BE(Target + 4, 0xE8410018); // ld r2,24(r1) + else + writeInt32BE(Target + 4, 0xE8410028); // ld r2,40(r1) + } + } + } else if (RelType == ELF::R_PPC64_TOC16 || + RelType == ELF::R_PPC64_TOC16_DS || + RelType == ELF::R_PPC64_TOC16_LO || + RelType == ELF::R_PPC64_TOC16_LO_DS || + RelType == ELF::R_PPC64_TOC16_HI || + RelType == ELF::R_PPC64_TOC16_HA) { + // These relocations are supposed to subtract the TOC address from + // the final value. This does not fit cleanly into the RuntimeDyld + // scheme, since there may be *two* sections involved in determining + // the relocation value (the section of the symbol referred to by the + // relocation, and the TOC section associated with the current module). + // + // Fortunately, these relocations are currently only ever generated + // referring to symbols that themselves reside in the TOC, which means + // that the two sections are actually the same. Thus they cancel out + // and we can immediately resolve the relocation right now. + switch (RelType) { + case ELF::R_PPC64_TOC16: RelType = ELF::R_PPC64_ADDR16; break; + case ELF::R_PPC64_TOC16_DS: RelType = ELF::R_PPC64_ADDR16_DS; break; + case ELF::R_PPC64_TOC16_LO: RelType = ELF::R_PPC64_ADDR16_LO; break; + case ELF::R_PPC64_TOC16_LO_DS: RelType = ELF::R_PPC64_ADDR16_LO_DS; break; + case ELF::R_PPC64_TOC16_HI: RelType = ELF::R_PPC64_ADDR16_HI; break; + case ELF::R_PPC64_TOC16_HA: RelType = ELF::R_PPC64_ADDR16_HA; break; + default: llvm_unreachable("Wrong relocation type."); } + + RelocationValueRef TOCValue; + findPPC64TOCSection(Obj, ObjSectionToID, TOCValue); + if (Value.SymbolName || Value.SectionID != TOCValue.SectionID) + llvm_unreachable("Unsupported TOC relocation."); + Value.Addend -= TOCValue.Addend; + resolveRelocation(Sections[SectionID], Offset, Value.Addend, RelType, 0); } else { + // There are two ways to refer to the TOC address directly: either + // via an ELF::R_PPC64_TOC relocation (where both symbol and addend are + // ignored), or via any relocation that refers to the magic ".TOC." + // symbol (in which case the addend is respected).
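+ // For example (illustrative): ELFv2 global entry code materializes r2 with + // a sequence like 'addis r2, r12, (.TOC.-entry)@ha; addi r2, r2, + // (.TOC.-entry)@l', which emits R_PPC64_REL16_HA/LO relocations against the + // ".TOC." symbol and is resolved through the path below.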
+ if (RelType == ELF::R_PPC64_TOC) { + RelType = ELF::R_PPC64_ADDR64; + findPPC64TOCSection(Obj, ObjSectionToID, Value); + } else if (TargetName == ".TOC.") { + findPPC64TOCSection(Obj, ObjSectionToID, Value); + Value.Addend += Addend; + } + RelocationEntry RE(SectionID, Offset, RelType, Value.Addend); - // Extra check to avoid relocation againt empty symbols (usually - // the R_PPC64_TOC). - if (SymType != SymbolRef::ST_Unknown && TargetName.empty()) - Value.SymbolName = nullptr; if (Value.SymbolName) addRelocationForSymbol(RE, Value.SymbolName); diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h index b84883310b4f..59fdfbe32521 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h @@ -82,7 +82,8 @@ class RuntimeDyldELF : public RuntimeDyldImpl { return 1; } - uint64_t findPPC64TOC() const; + void findPPC64TOCSection(ObjectImage &Obj, ObjSectionToIDMap &LocalSections, + RelocationValueRef &Rel); void findOPDEntrySection(ObjectImage &Obj, ObjSectionToIDMap &LocalSections, RelocationValueRef &Rel); diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h index 11cc3b246aca..0211d2bbbb05 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h @@ -20,6 +20,7 @@ #include "llvm/ADT/Triple.h" #include "llvm/ExecutionEngine/ObjectImage.h" #include "llvm/ExecutionEngine/RuntimeDyld.h" +#include "llvm/ExecutionEngine/RuntimeDyldChecker.h" #include "llvm/Object/ObjectFile.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" @@ -158,6 +159,15 @@ class RelocationValueRef { }; class RuntimeDyldImpl { + friend class RuntimeDyldChecker; +private: + + uint64_t getAnySymbolRemoteAddress(StringRef Symbol) { + if (uint64_t InternalSymbolAddr = getSymbolLoadAddress(Symbol)) + return InternalSymbolAddr; + return MemMgr->getSymbolAddress(Symbol); + } + protected: // The MemoryManager to load objects into. RTDyldMemoryManager *MemMgr; @@ -302,7 +312,7 @@ class RuntimeDyldImpl { /// \brief Emits long jump instruction to Addr. /// \return Pointer to the memory area for emitting target address. - uint8_t *createStubFunction(uint8_t *Addr); + uint8_t *createStubFunction(uint8_t *Addr, unsigned AbiVariant = 0); /// \brief Resolves relocations from Relocs list with address from Value. void resolveRelocationList(const RelocationList &Relocs, uint64_t Value); @@ -339,7 +349,8 @@ class RuntimeDyldImpl { public: RuntimeDyldImpl(RTDyldMemoryManager *mm) - : MemMgr(mm), ProcessAllSections(false), HasError(false) {} + : MemMgr(mm), ProcessAllSections(false), HasError(false) { + } virtual ~RuntimeDyldImpl(); @@ -349,7 +360,7 @@ class RuntimeDyldImpl { ObjectImage *loadObject(ObjectImage *InputObject); - void *getSymbolAddress(StringRef Name) { + uint8_t* getSymbolAddress(StringRef Name) { // FIXME: Just look up as a function for now. Overly simple of course. // Work in progress. 
SymbolTableMap::const_iterator pos = GlobalSymbolTable.find(Name); diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp index 2b425fbdd339..58fb51557c93 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp @@ -14,6 +14,12 @@ #include "RuntimeDyldMachO.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringRef.h" + +#include "Targets/RuntimeDyldMachOARM.h" +#include "Targets/RuntimeDyldMachOAArch64.h" +#include "Targets/RuntimeDyldMachOI386.h" +#include "Targets/RuntimeDyldMachOX86_64.h" + using namespace llvm; using namespace llvm::object; @@ -21,6 +27,117 @@ using namespace llvm::object; namespace llvm { +uint64_t RuntimeDyldMachO::decodeAddend(uint8_t *LocalAddress, unsigned NumBytes, + uint32_t RelType) const { + uint64_t Addend = 0; + memcpy(&Addend, LocalAddress, NumBytes); + return Addend; +} + +RelocationValueRef RuntimeDyldMachO::getRelocationValueRef( + ObjectImage &ObjImg, const relocation_iterator &RI, + const RelocationEntry &RE, ObjSectionToIDMap &ObjSectionToID, + const SymbolTableMap &Symbols) { + + const MachOObjectFile &Obj = + static_cast(*ObjImg.getObjectFile()); + MachO::any_relocation_info RelInfo = + Obj.getRelocation(RI->getRawDataRefImpl()); + RelocationValueRef Value; + + bool IsExternal = Obj.getPlainRelocationExternal(RelInfo); + if (IsExternal) { + symbol_iterator Symbol = RI->getSymbol(); + StringRef TargetName; + Symbol->getName(TargetName); + SymbolTableMap::const_iterator SI = Symbols.find(TargetName.data()); + if (SI != Symbols.end()) { + Value.SectionID = SI->second.first; + Value.Addend = SI->second.second + RE.Addend; + } else { + SI = GlobalSymbolTable.find(TargetName.data()); + if (SI != GlobalSymbolTable.end()) { + Value.SectionID = SI->second.first; + Value.Addend = SI->second.second + RE.Addend; + } else { + Value.SymbolName = TargetName.data(); + Value.Addend = RE.Addend; + } + } + } else { + SectionRef Sec = Obj.getRelocationSection(RelInfo); + bool IsCode = false; + Sec.isText(IsCode); + Value.SectionID = findOrEmitSection(ObjImg, Sec, IsCode, ObjSectionToID); + uint64_t Addr; + Sec.getAddress(Addr); + Value.Addend = RE.Addend - Addr; + } + + return Value; +} + +void RuntimeDyldMachO::makeValueAddendPCRel(RelocationValueRef &Value, + ObjectImage &ObjImg, + const relocation_iterator &RI) { + const MachOObjectFile &Obj = + static_cast(*ObjImg.getObjectFile()); + MachO::any_relocation_info RelInfo = + Obj.getRelocation(RI->getRawDataRefImpl()); + + bool IsPCRel = Obj.getAnyRelocationPCRel(RelInfo); + if (IsPCRel) { + uint64_t RelocAddr = 0; + RI->getAddress(RelocAddr); + unsigned RelocSize = Obj.getAnyRelocationLength(RelInfo); + Value.Addend += RelocAddr + (1ULL << RelocSize); + } +} + +void RuntimeDyldMachO::dumpRelocationToResolve(const RelocationEntry &RE, + uint64_t Value) const { + const SectionEntry &Section = Sections[RE.SectionID]; + uint8_t *LocalAddress = Section.Address + RE.Offset; + uint64_t FinalAddress = Section.LoadAddress + RE.Offset; + + dbgs() << "resolveRelocation Section: " << RE.SectionID + << " LocalAddress: " << format("%p", LocalAddress) + << " FinalAddress: " << format("%p", FinalAddress) + << " Value: " << format("%p", Value) << " Addend: " << RE.Addend + << " isPCRel: " << RE.IsPCRel << " MachoType: " << RE.RelType + << " Size: " << (1 << RE.Size) << "\n"; +} + +bool RuntimeDyldMachO::writeBytesUnaligned(uint8_t *Addr, uint64_t Value, + unsigned Size) { + for (unsigned i = 0; i 
< Size; ++i) { + *Addr++ = (uint8_t)Value; + Value >>= 8; + } + + return false; +} + +bool +RuntimeDyldMachO::isCompatibleFormat(const ObjectBuffer *InputBuffer) const { + if (InputBuffer->getBufferSize() < 4) + return false; + StringRef Magic(InputBuffer->getBufferStart(), 4); + if (Magic == "\xFE\xED\xFA\xCE") + return true; + if (Magic == "\xCE\xFA\xED\xFE") + return true; + if (Magic == "\xFE\xED\xFA\xCF") + return true; + if (Magic == "\xCF\xFA\xED\xFE") + return true; + return false; +} + +bool RuntimeDyldMachO::isCompatibleFile(const object::ObjectFile *Obj) const { + return Obj->isMachO(); +} + static unsigned char *processFDE(unsigned char *P, intptr_t DeltaForText, intptr_t DeltaForEH) { DEBUG(dbgs() << "Processing FDE: Delta for text: " << DeltaForText @@ -90,614 +207,17 @@ void RuntimeDyldMachO::registerEHFrames() { UnregisteredEHFrameSections.clear(); } -void RuntimeDyldMachO::finalizeLoad(ObjectImage &ObjImg, - ObjSectionToIDMap &SectionMap) { - unsigned EHFrameSID = RTDYLD_INVALID_SECTION_ID; - unsigned TextSID = RTDYLD_INVALID_SECTION_ID; - unsigned ExceptTabSID = RTDYLD_INVALID_SECTION_ID; - ObjSectionToIDMap::iterator i, e; - for (i = SectionMap.begin(), e = SectionMap.end(); i != e; ++i) { - const SectionRef &Section = i->first; - StringRef Name; - Section.getName(Name); - if (Name == "__eh_frame") - EHFrameSID = i->second; - else if (Name == "__text") - TextSID = i->second; - else if (Name == "__gcc_except_tab") - ExceptTabSID = i->second; - else if (Name == "__jump_table") - populateJumpTable(cast(*ObjImg.getObjectFile()), - Section, i->second); - else if (Name == "__pointers") - populatePointersSection(cast(*ObjImg.getObjectFile()), - Section, i->second); - } - UnregisteredEHFrameSections.push_back( - EHFrameRelatedSections(EHFrameSID, TextSID, ExceptTabSID)); -} - -// The target location for the relocation is described by RE.SectionID and -// RE.Offset. RE.SectionID can be used to find the SectionEntry. Each -// SectionEntry has three members describing its location. -// SectionEntry::Address is the address at which the section has been loaded -// into memory in the current (host) process. SectionEntry::LoadAddress is the -// address that the section will have in the target process. -// SectionEntry::ObjAddress is the address of the bits for this section in the -// original emitted object image (also in the current address space). -// -// Relocations will be applied as if the section were loaded at -// SectionEntry::LoadAddress, but they will be applied at an address based -// on SectionEntry::Address. SectionEntry::ObjAddress will be used to refer to -// Target memory contents if they are required for value calculations. -// -// The Value parameter here is the load address of the symbol for the -// relocation to be applied. For relocations which refer to symbols in the -// current object Value will be the LoadAddress of the section in which -// the symbol resides (RE.Addend provides additional information about the -// symbol location). For external symbols, Value will be the address of the -// symbol in the target address space. 
-void RuntimeDyldMachO::resolveRelocation(const RelocationEntry &RE, - uint64_t Value) { - DEBUG ( - const SectionEntry &Section = Sections[RE.SectionID]; - uint8_t* LocalAddress = Section.Address + RE.Offset; - uint64_t FinalAddress = Section.LoadAddress + RE.Offset; - - dbgs() << "resolveRelocation Section: " << RE.SectionID - << " LocalAddress: " << format("%p", LocalAddress) - << " FinalAddress: " << format("%p", FinalAddress) - << " Value: " << format("%p", Value) - << " Addend: " << RE.Addend - << " isPCRel: " << RE.IsPCRel - << " MachoType: " << RE.RelType - << " Size: " << (1 << RE.Size) << "\n"; - ); - - // This just dispatches to the proper target specific routine. +std::unique_ptr +llvm::RuntimeDyldMachO::create(Triple::ArchType Arch, RTDyldMemoryManager *MM) { switch (Arch) { default: - llvm_unreachable("Unsupported CPU type!"); - case Triple::x86_64: - resolveX86_64Relocation(RE, Value); - break; - case Triple::x86: - resolveI386Relocation(RE, Value); - break; - case Triple::arm: // Fall through. - case Triple::thumb: - resolveARMRelocation(RE, Value); - break; - case Triple::aarch64: - case Triple::arm64: - resolveAArch64Relocation(RE, Value); - break; - } -} - -bool RuntimeDyldMachO::resolveI386Relocation(const RelocationEntry &RE, - uint64_t Value) { - const SectionEntry &Section = Sections[RE.SectionID]; - uint8_t* LocalAddress = Section.Address + RE.Offset; - - if (RE.IsPCRel) { - uint64_t FinalAddress = Section.LoadAddress + RE.Offset; - Value -= FinalAddress + 4; // see MachOX86_64::resolveRelocation. - } - - switch (RE.RelType) { - default: - llvm_unreachable("Invalid relocation type!"); - case MachO::GENERIC_RELOC_VANILLA: - return applyRelocationValue(LocalAddress, Value + RE.Addend, - 1 << RE.Size); - case MachO::GENERIC_RELOC_SECTDIFF: - case MachO::GENERIC_RELOC_LOCAL_SECTDIFF: { - uint64_t SectionABase = Sections[RE.Sections.SectionA].LoadAddress; - uint64_t SectionBBase = Sections[RE.Sections.SectionB].LoadAddress; - assert((Value == SectionABase || Value == SectionBBase) && - "Unexpected SECTDIFF relocation value."); - Value = SectionABase - SectionBBase + RE.Addend; - return applyRelocationValue(LocalAddress, Value, 1 << RE.Size); - } - case MachO::GENERIC_RELOC_PB_LA_PTR: - return Error("Relocation type not implemented yet!"); - } -} - -bool RuntimeDyldMachO::resolveX86_64Relocation(const RelocationEntry &RE, - uint64_t Value) { - const SectionEntry &Section = Sections[RE.SectionID]; - uint8_t* LocalAddress = Section.Address + RE.Offset; - - // If the relocation is PC-relative, the value to be encoded is the - // pointer difference. - if (RE.IsPCRel) { - // FIXME: It seems this value needs to be adjusted by 4 for an effective PC - // address. Is that expected? Only for branches, perhaps? - uint64_t FinalAddress = Section.LoadAddress + RE.Offset; - Value -= FinalAddress + 4; // see MachOX86_64::resolveRelocation. 
- } - - switch (RE.RelType) { - default: - llvm_unreachable("Invalid relocation type!"); - case MachO::X86_64_RELOC_SIGNED_1: - case MachO::X86_64_RELOC_SIGNED_2: - case MachO::X86_64_RELOC_SIGNED_4: - case MachO::X86_64_RELOC_SIGNED: - case MachO::X86_64_RELOC_UNSIGNED: - case MachO::X86_64_RELOC_BRANCH: - return applyRelocationValue(LocalAddress, Value + RE.Addend, 1 << RE.Size); - case MachO::X86_64_RELOC_GOT_LOAD: - case MachO::X86_64_RELOC_GOT: - case MachO::X86_64_RELOC_SUBTRACTOR: - case MachO::X86_64_RELOC_TLV: - return Error("Relocation type not implemented yet!"); - } -} - -bool RuntimeDyldMachO::resolveARMRelocation(const RelocationEntry &RE, - uint64_t Value) { - const SectionEntry &Section = Sections[RE.SectionID]; - uint8_t* LocalAddress = Section.Address + RE.Offset; - - // If the relocation is PC-relative, the value to be encoded is the - // pointer difference. - if (RE.IsPCRel) { - uint64_t FinalAddress = Section.LoadAddress + RE.Offset; - Value -= FinalAddress; - // ARM PCRel relocations have an effective-PC offset of two instructions - // (four bytes in Thumb mode, 8 bytes in ARM mode). - // FIXME: For now, assume ARM mode. - Value -= 8; - } - - switch (RE.RelType) { - default: - llvm_unreachable("Invalid relocation type!"); - case MachO::ARM_RELOC_VANILLA: - return applyRelocationValue(LocalAddress, Value, 1 << RE.Size); - case MachO::ARM_RELOC_BR24: { - // Mask the value into the target address. We know instructions are - // 32-bit aligned, so we can do it all at once. - uint32_t *p = (uint32_t *)LocalAddress; - // The low two bits of the value are not encoded. - Value >>= 2; - // Mask the value to 24 bits. - uint64_t FinalValue = Value & 0xffffff; - // Check for overflow. - if (Value != FinalValue) - return Error("ARM BR24 relocation out of range."); - // FIXME: If the destination is a Thumb function (and the instruction - // is a non-predicated BL instruction), we need to change it to a BLX - // instruction instead. - - // Insert the value into the instruction. - *p = (*p & ~0xffffff) | FinalValue; + llvm_unreachable("Unsupported target for RuntimeDyldMachO."); break; + case Triple::arm: return make_unique(MM); + case Triple::arm64: return make_unique(MM); + case Triple::x86: return make_unique(MM); + case Triple::x86_64: return make_unique(MM); } - case MachO::ARM_THUMB_RELOC_BR22: - case MachO::ARM_THUMB_32BIT_BRANCH: - case MachO::ARM_RELOC_HALF: - case MachO::ARM_RELOC_HALF_SECTDIFF: - case MachO::ARM_RELOC_PAIR: - case MachO::ARM_RELOC_SECTDIFF: - case MachO::ARM_RELOC_LOCAL_SECTDIFF: - case MachO::ARM_RELOC_PB_LA_PTR: - return Error("Relocation type not implemented yet!"); - } - return false; -} - -bool RuntimeDyldMachO::resolveAArch64Relocation(const RelocationEntry &RE, - uint64_t Value) { - const SectionEntry &Section = Sections[RE.SectionID]; - uint8_t* LocalAddress = Section.Address + RE.Offset; - - // If the relocation is PC-relative, the value to be encoded is the - // pointer difference. - if (RE.IsPCRel) { - uint64_t FinalAddress = Section.LoadAddress + RE.Offset; - Value -= FinalAddress; - } - - switch (RE.RelType) { - default: - llvm_unreachable("Invalid relocation type!"); - case MachO::ARM64_RELOC_UNSIGNED: - return applyRelocationValue(LocalAddress, Value, 1 << RE.Size); - case MachO::ARM64_RELOC_BRANCH26: { - // Mask the value into the target address. We know instructions are - // 32-bit aligned, so we can do it all at once. - uint32_t *p = (uint32_t *)LocalAddress; - // The low two bits of the value are not encoded. 
- Value >>= 2; - // Mask the value to 26 bits. - uint64_t FinalValue = Value & 0x3ffffff; - // Check for overflow. - if (FinalValue != Value) - return Error("ARM64 BRANCH26 relocation out of range."); - // Insert the value into the instruction. - *p = (*p & ~0x3ffffff) | FinalValue; - break; - } - case MachO::ARM64_RELOC_SUBTRACTOR: - case MachO::ARM64_RELOC_PAGE21: - case MachO::ARM64_RELOC_PAGEOFF12: - case MachO::ARM64_RELOC_GOT_LOAD_PAGE21: - case MachO::ARM64_RELOC_GOT_LOAD_PAGEOFF12: - case MachO::ARM64_RELOC_POINTER_TO_GOT: - case MachO::ARM64_RELOC_TLVP_LOAD_PAGE21: - case MachO::ARM64_RELOC_TLVP_LOAD_PAGEOFF12: - case MachO::ARM64_RELOC_ADDEND: - return Error("Relocation type not implemented yet!"); - } - return false; -} - -void RuntimeDyldMachO::populateJumpTable(MachOObjectFile &Obj, - const SectionRef &JTSection, - unsigned JTSectionID) { - assert(!Obj.is64Bit() && - "__jump_table section not supported in 64-bit MachO."); - - MachO::dysymtab_command DySymTabCmd = Obj.getDysymtabLoadCommand(); - MachO::section Sec32 = Obj.getSection(JTSection.getRawDataRefImpl()); - uint32_t JTSectionSize = Sec32.size; - unsigned FirstIndirectSymbol = Sec32.reserved1; - unsigned JTEntrySize = Sec32.reserved2; - unsigned NumJTEntries = JTSectionSize / JTEntrySize; - uint8_t* JTSectionAddr = getSectionAddress(JTSectionID); - unsigned JTEntryOffset = 0; - - assert((JTSectionSize % JTEntrySize) == 0 && - "Jump-table section does not contain a whole number of stubs?"); - - for (unsigned i = 0; i < NumJTEntries; ++i) { - unsigned SymbolIndex = - Obj.getIndirectSymbolTableEntry(DySymTabCmd, FirstIndirectSymbol + i); - symbol_iterator SI = Obj.getSymbolByIndex(SymbolIndex); - StringRef IndirectSymbolName; - SI->getName(IndirectSymbolName); - uint8_t* JTEntryAddr = JTSectionAddr + JTEntryOffset; - createStubFunction(JTEntryAddr); - RelocationEntry RE(JTSectionID, JTEntryOffset + 1, - MachO::GENERIC_RELOC_VANILLA, 0, true, 2); - addRelocationForSymbol(RE, IndirectSymbolName); - JTEntryOffset += JTEntrySize; - } -} - -void RuntimeDyldMachO::populatePointersSection(MachOObjectFile &Obj, - const SectionRef &PTSection, - unsigned PTSectionID) { - assert(!Obj.is64Bit() && - "__pointers section not supported in 64-bit MachO."); - - MachO::dysymtab_command DySymTabCmd = Obj.getDysymtabLoadCommand(); - MachO::section Sec32 = Obj.getSection(PTSection.getRawDataRefImpl()); - uint32_t PTSectionSize = Sec32.size; - unsigned FirstIndirectSymbol = Sec32.reserved1; - const unsigned PTEntrySize = 4; - unsigned NumPTEntries = PTSectionSize / PTEntrySize; - unsigned PTEntryOffset = 0; - - assert((PTSectionSize % PTEntrySize) == 0 && - "Pointers section does not contain a whole number of stubs?"); - - DEBUG(dbgs() << "Populating __pointers, Section ID " << PTSectionID - << ", " << NumPTEntries << " entries, " - << PTEntrySize << " bytes each:\n"); - - for (unsigned i = 0; i < NumPTEntries; ++i) { - unsigned SymbolIndex = - Obj.getIndirectSymbolTableEntry(DySymTabCmd, FirstIndirectSymbol + i); - symbol_iterator SI = Obj.getSymbolByIndex(SymbolIndex); - StringRef IndirectSymbolName; - SI->getName(IndirectSymbolName); - DEBUG(dbgs() << " " << IndirectSymbolName << ": index " << SymbolIndex - << ", PT offset: " << PTEntryOffset << "\n"); - RelocationEntry RE(PTSectionID, PTEntryOffset, - MachO::GENERIC_RELOC_VANILLA, 0, false, 2); - addRelocationForSymbol(RE, IndirectSymbolName); - PTEntryOffset += PTEntrySize; - } -} - - -section_iterator getSectionByAddress(const MachOObjectFile &Obj, - uint64_t Addr) { - section_iterator SI 
= Obj.section_begin(); - section_iterator SE = Obj.section_end(); - - for (; SI != SE; ++SI) { - uint64_t SAddr, SSize; - SI->getAddress(SAddr); - SI->getSize(SSize); - if ((Addr >= SAddr) && (Addr < SAddr + SSize)) - return SI; - } - - return SE; -} - -relocation_iterator RuntimeDyldMachO::processSECTDIFFRelocation( - unsigned SectionID, - relocation_iterator RelI, - ObjectImage &Obj, - ObjSectionToIDMap &ObjSectionToID) { - const MachOObjectFile *MachO = - static_cast<const MachOObjectFile *>(Obj.getObjectFile()); - MachO::any_relocation_info RE = - MachO->getRelocation(RelI->getRawDataRefImpl()); - - SectionEntry &Section = Sections[SectionID]; - uint32_t RelocType = MachO->getAnyRelocationType(RE); - bool IsPCRel = MachO->getAnyRelocationPCRel(RE); - unsigned Size = MachO->getAnyRelocationLength(RE); - uint64_t Offset; - RelI->getOffset(Offset); - uint8_t *LocalAddress = Section.Address + Offset; - unsigned NumBytes = 1 << Size; - int64_t Addend = 0; - memcpy(&Addend, LocalAddress, NumBytes); - - ++RelI; - MachO::any_relocation_info RE2 = - MachO->getRelocation(RelI->getRawDataRefImpl()); - - uint32_t AddrA = MachO->getScatteredRelocationValue(RE); - section_iterator SAI = getSectionByAddress(*MachO, AddrA); - assert(SAI != MachO->section_end() && "Can't find section for address A"); - uint64_t SectionABase; - SAI->getAddress(SectionABase); - uint64_t SectionAOffset = AddrA - SectionABase; - SectionRef SectionA = *SAI; - bool IsCode; - SectionA.isText(IsCode); - uint32_t SectionAID = findOrEmitSection(Obj, SectionA, IsCode, - ObjSectionToID); - - uint32_t AddrB = MachO->getScatteredRelocationValue(RE2); - section_iterator SBI = getSectionByAddress(*MachO, AddrB); - assert(SBI != MachO->section_end() && "Can't find section for address B"); - uint64_t SectionBBase; - SBI->getAddress(SectionBBase); - uint64_t SectionBOffset = AddrB - SectionBBase; - SectionRef SectionB = *SBI; - uint32_t SectionBID = findOrEmitSection(Obj, SectionB, IsCode, - ObjSectionToID); - - if (Addend != AddrA - AddrB) - Error("Unexpected SECTDIFF relocation addend."); - - DEBUG(dbgs() << "Found SECTDIFF: AddrA: " << AddrA << ", AddrB: " << AddrB - << ", Addend: " << Addend << ", SectionA ID: " - << SectionAID << ", SectionAOffset: " << SectionAOffset - << ", SectionB ID: " << SectionBID << ", SectionBOffset: " - << SectionBOffset << "\n"); - RelocationEntry R(SectionID, Offset, RelocType, 0, - SectionAID, SectionAOffset, SectionBID, SectionBOffset, - IsPCRel, Size); - - addRelocationForSection(R, SectionAID); - addRelocationForSection(R, SectionBID); - - return ++RelI; -} - -relocation_iterator RuntimeDyldMachO::processI386ScatteredVANILLA( - unsigned SectionID, - relocation_iterator RelI, - ObjectImage &Obj, - ObjSectionToIDMap &ObjSectionToID) { - const MachOObjectFile *MachO = - static_cast<const MachOObjectFile *>(Obj.getObjectFile()); - MachO::any_relocation_info RE = - MachO->getRelocation(RelI->getRawDataRefImpl()); - - SectionEntry &Section = Sections[SectionID]; - uint32_t RelocType = MachO->getAnyRelocationType(RE); - bool IsPCRel = MachO->getAnyRelocationPCRel(RE); - unsigned Size = MachO->getAnyRelocationLength(RE); - uint64_t Offset; - RelI->getOffset(Offset); - uint8_t *LocalAddress = Section.Address + Offset; - unsigned NumBytes = 1 << Size; - int64_t Addend = 0; - memcpy(&Addend, LocalAddress, NumBytes); - - unsigned SymbolBaseAddr = MachO->getScatteredRelocationValue(RE); - section_iterator TargetSI = getSectionByAddress(*MachO, SymbolBaseAddr); - assert(TargetSI != MachO->section_end() && "Can't find section for symbol"); - uint64_t
SectionBaseAddr; - TargetSI->getAddress(SectionBaseAddr); - SectionRef TargetSection = *TargetSI; - bool IsCode; - TargetSection.isText(IsCode); - uint32_t TargetSectionID = findOrEmitSection(Obj, TargetSection, IsCode, - ObjSectionToID); - - Addend -= SectionBaseAddr; - RelocationEntry R(SectionID, Offset, RelocType, Addend, - IsPCRel, Size); - - addRelocationForSection(R, TargetSectionID); - - return ++RelI; -} - -relocation_iterator RuntimeDyldMachO::processRelocationRef( - unsigned SectionID, relocation_iterator RelI, ObjectImage &Obj, - ObjSectionToIDMap &ObjSectionToID, const SymbolTableMap &Symbols, - StubMap &Stubs) { - const ObjectFile *OF = Obj.getObjectFile(); - const MachOObjectFile *MachO = static_cast<const MachOObjectFile *>(OF); - MachO::any_relocation_info RE = - MachO->getRelocation(RelI->getRawDataRefImpl()); - - uint32_t RelType = MachO->getAnyRelocationType(RE); - - // FIXME: Properly handle scattered relocations. - // Special case the couple of scattered relocations that we know how - // to handle: SECTDIFF relocations, and scattered VANILLA relocations - // on I386. - // For all other scattered relocations, just bail out and hope for the - // best, since the offsets computed by scattered relocations have often - // been optimistically filled in by the compiler. This will fail - // horribly where the relocations *do* need to be applied, but that was - // already the case. - if (MachO->isRelocationScattered(RE)) { - if (RelType == MachO::GENERIC_RELOC_SECTDIFF || - RelType == MachO::GENERIC_RELOC_LOCAL_SECTDIFF) - return processSECTDIFFRelocation(SectionID, RelI, Obj, ObjSectionToID); - else if (Arch == Triple::x86 && RelType == MachO::GENERIC_RELOC_VANILLA) - return processI386ScatteredVANILLA(SectionID, RelI, Obj, ObjSectionToID); - else - return ++RelI; - } - - RelocationValueRef Value; - SectionEntry &Section = Sections[SectionID]; - - bool IsExtern = MachO->getPlainRelocationExternal(RE); - bool IsPCRel = MachO->getAnyRelocationPCRel(RE); - unsigned Size = MachO->getAnyRelocationLength(RE); - uint64_t Offset; - RelI->getOffset(Offset); - uint8_t *LocalAddress = Section.Address + Offset; - unsigned NumBytes = 1 << Size; - uint64_t Addend = 0; - memcpy(&Addend, LocalAddress, NumBytes); - - if (IsExtern) { - // Obtain the symbol name which is referenced in the relocation - symbol_iterator Symbol = RelI->getSymbol(); - StringRef TargetName; - Symbol->getName(TargetName); - // First search for the symbol in the local symbol table - SymbolTableMap::const_iterator lsi = Symbols.find(TargetName.data()); - if (lsi != Symbols.end()) { - Value.SectionID = lsi->second.first; - Value.Addend = lsi->second.second + Addend; - } else { - // Search for the symbol in the global symbol table - SymbolTableMap::const_iterator gsi = - GlobalSymbolTable.find(TargetName.data()); - if (gsi != GlobalSymbolTable.end()) { - Value.SectionID = gsi->second.first; - Value.Addend = gsi->second.second + Addend; - } else { - Value.SymbolName = TargetName.data(); - Value.Addend = Addend; - } - } - - // Addends for external, PC-rel relocations on i386 point back to the zero - // offset. Calculate the final offset from the relocation target instead. - // This allows us to use the same logic for both external and internal - // relocations in resolveI386RelocationRef.
- if (Arch == Triple::x86 && IsPCRel) { - uint64_t RelocAddr = 0; - RelI->getAddress(RelocAddr); - Value.Addend += RelocAddr + 4; - } - - } else { - SectionRef Sec = MachO->getRelocationSection(RE); - bool IsCode = false; - Sec.isText(IsCode); - Value.SectionID = findOrEmitSection(Obj, Sec, IsCode, ObjSectionToID); - uint64_t Addr; - Sec.getAddress(Addr); - Value.Addend = Addend - Addr; - if (IsPCRel) - Value.Addend += Offset + NumBytes; - } - - if (Arch == Triple::x86_64 && (RelType == MachO::X86_64_RELOC_GOT || - RelType == MachO::X86_64_RELOC_GOT_LOAD)) { - assert(IsPCRel); - assert(Size == 2); - - // FIXME: Teach the generic code above not to prematurely conflate - // relocation addends and symbol offsets. - Value.Addend -= Addend; - StubMap::const_iterator i = Stubs.find(Value); - uint8_t *Addr; - if (i != Stubs.end()) { - Addr = Section.Address + i->second; - } else { - Stubs[Value] = Section.StubOffset; - uint8_t *GOTEntry = Section.Address + Section.StubOffset; - RelocationEntry GOTRE(SectionID, Section.StubOffset, - MachO::X86_64_RELOC_UNSIGNED, Value.Addend, false, - 3); - if (Value.SymbolName) - addRelocationForSymbol(GOTRE, Value.SymbolName); - else - addRelocationForSection(GOTRE, Value.SectionID); - Section.StubOffset += 8; - Addr = GOTEntry; - } - RelocationEntry TargetRE(SectionID, Offset, - MachO::X86_64_RELOC_UNSIGNED, Addend, true, - 2); - resolveRelocation(TargetRE, (uint64_t)Addr); - } else if (Arch == Triple::arm && (RelType & 0xf) == MachO::ARM_RELOC_BR24) { - // This is an ARM branch relocation, need to use a stub function. - - // Look up for existing stub. - StubMap::const_iterator i = Stubs.find(Value); - uint8_t *Addr; - if (i != Stubs.end()) { - Addr = Section.Address + i->second; - } else { - // Create a new stub function. 
- Stubs[Value] = Section.StubOffset; - uint8_t *StubTargetAddr = - createStubFunction(Section.Address + Section.StubOffset); - RelocationEntry StubRE(SectionID, StubTargetAddr - Section.Address, - MachO::GENERIC_RELOC_VANILLA, Value.Addend); - if (Value.SymbolName) - addRelocationForSymbol(StubRE, Value.SymbolName); - else - addRelocationForSection(StubRE, Value.SectionID); - Addr = Section.Address + Section.StubOffset; - Section.StubOffset += getMaxStubSize(); - } - RelocationEntry TargetRE(Value.SectionID, Offset, RelType, 0, IsPCRel, - Size); - resolveRelocation(TargetRE, (uint64_t)Addr); - } else { - RelocationEntry RE(SectionID, Offset, RelType, Value.Addend, IsPCRel, Size); - if (Value.SymbolName) - addRelocationForSymbol(RE, Value.SymbolName); - else - addRelocationForSection(RE, Value.SectionID); - } - return ++RelI; -} - -bool -RuntimeDyldMachO::isCompatibleFormat(const ObjectBuffer *InputBuffer) const { - if (InputBuffer->getBufferSize() < 4) - return false; - StringRef Magic(InputBuffer->getBufferStart(), 4); - if (Magic == "\xFE\xED\xFA\xCE") - return true; - if (Magic == "\xCE\xFA\xED\xFE") - return true; - if (Magic == "\xFE\xED\xFA\xCF") - return true; - if (Magic == "\xCF\xFA\xED\xFE") - return true; - return false; -} - -bool RuntimeDyldMachO::isCompatibleFile(const object::ObjectFile *Obj) const { - return Obj->isMachO(); } } // end namespace llvm diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h index 060eb8c29a2b..7d1dc0263db0 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h @@ -16,63 +16,21 @@ #include "ObjectImageCommon.h" #include "RuntimeDyldImpl.h" -#include "llvm/ADT/IndexedMap.h" #include "llvm/Object/MachO.h" #include "llvm/Support/Format.h" +#define DEBUG_TYPE "dyld" + using namespace llvm; using namespace llvm::object; namespace llvm { class RuntimeDyldMachO : public RuntimeDyldImpl { -private: - - /// Write the least significant 'Size' bytes in 'Value' out at the address - /// pointed to by Addr. - bool applyRelocationValue(uint8_t *Addr, uint64_t Value, unsigned Size) { - for (unsigned i = 0; i < Size; ++i) { - *Addr++ = (uint8_t)Value; - Value >>= 8; - } - - return false; - } - - bool resolveI386Relocation(const RelocationEntry &RE, uint64_t Value); - bool resolveX86_64Relocation(const RelocationEntry &RE, uint64_t Value); - bool resolveARMRelocation(const RelocationEntry &RE, uint64_t Value); - bool resolveAArch64Relocation(const RelocationEntry &RE, uint64_t Value); - - // Populate stubs in __jump_table section. - void populateJumpTable(MachOObjectFile &Obj, const SectionRef &JTSection, - unsigned JTSectionID); - - // Populate __pointers section. 
- void populatePointersSection(MachOObjectFile &Obj, const SectionRef &PTSection, - unsigned PTSectionID); - - unsigned getMaxStubSize() override { - if (Arch == Triple::arm || Arch == Triple::thumb) - return 8; // 32-bit instruction and 32-bit address - else if (Arch == Triple::x86_64) - return 8; // GOT entry - else - return 0; - } - - unsigned getStubAlignment() override { return 1; } - - relocation_iterator processSECTDIFFRelocation( - unsigned SectionID, - relocation_iterator RelI, - ObjectImage &ObjImg, - ObjSectionToIDMap &ObjSectionToID); - - relocation_iterator processI386ScatteredVANILLA( - unsigned SectionID, - relocation_iterator RelI, - ObjectImage &ObjImg, - ObjSectionToIDMap &ObjSectionToID); +protected: + struct SectionOffsetPair { + unsigned SectionID; + uint64_t Offset; + }; struct EHFrameRelatedSections { EHFrameRelatedSections() @@ -91,30 +49,129 @@ class RuntimeDyldMachO : public RuntimeDyldImpl { // EH frame sections with the memory manager. SmallVector<EHFrameRelatedSections, 2> UnregisteredEHFrameSections; -public: RuntimeDyldMachO(RTDyldMemoryManager *mm) : RuntimeDyldImpl(mm) {} - void resolveRelocation(const RelocationEntry &RE, uint64_t Value) override; - relocation_iterator - processRelocationRef(unsigned SectionID, relocation_iterator RelI, - ObjectImage &Obj, ObjSectionToIDMap &ObjSectionToID, - const SymbolTableMap &Symbols, StubMap &Stubs) override; - bool isCompatibleFormat(const ObjectBuffer *Buffer) const override; - bool isCompatibleFile(const object::ObjectFile *Obj) const override; - void registerEHFrames() override; - void finalizeLoad(ObjectImage &ObjImg, - ObjSectionToIDMap &SectionMap) override; + /// Extract the addend encoded in the instruction. + uint64_t decodeAddend(uint8_t *LocalAddress, unsigned NumBytes, + uint32_t RelType) const; + + /// Construct a RelocationValueRef representing the relocation target. + /// For Symbols in known sections, this will return a RelocationValueRef + /// representing a (SectionID, Offset) pair. + /// For Symbols whose section is not known, this will return a + /// (SymbolName, Offset) pair, where the Offset is taken from the instruction + /// immediate (held in RE.Addend). + /// In both cases the Addend field is *NOT* fixed up to be PC-relative. That + /// should be done by the caller where appropriate by calling + /// makeValueAddendPCRel on the RelocationValueRef. + RelocationValueRef getRelocationValueRef(ObjectImage &ObjImg, + const relocation_iterator &RI, + const RelocationEntry &RE, + ObjSectionToIDMap &ObjSectionToID, + const SymbolTableMap &Symbols); + + /// Make the RelocationValueRef addend PC-relative. + void makeValueAddendPCRel(RelocationValueRef &Value, ObjectImage &ObjImg, + const relocation_iterator &RI); + + /// Dump information about the relocation entry (RE) and resolved value. + void dumpRelocationToResolve(const RelocationEntry &RE, uint64_t Value) const; +public: + /// Create an ObjectImage from the given ObjectBuffer. static ObjectImage *createObjectImage(ObjectBuffer *InputBuffer) { return new ObjectImageCommon(InputBuffer); } + /// Create an ObjectImage from the given ObjectFile. static ObjectImage * createObjectImageFromFile(std::unique_ptr<object::ObjectFile> InputObject) { return new ObjectImageCommon(std::move(InputObject)); } + + /// Create a RuntimeDyldMachO instance for the given target architecture. + static std::unique_ptr<RuntimeDyldMachO> create(Triple::ArchType Arch, + RTDyldMemoryManager *mm); + + /// Write the least significant 'Size' bytes in 'Value' out at the address + /// pointed to by Addr. Check for overflow.
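+ /// (For illustration, mirroring the old applyRelocationValue helper this replaces: the intended semantics are a little-endian byte store, i.e. for (unsigned i = 0; i < Size; ++i) { *Addr++ = (uint8_t)Value; Value >>= 8; }, plus a check that Value actually fits in Size bytes.)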
+ bool writeBytesUnaligned(uint8_t *Addr, uint64_t Value, unsigned Size); + + SectionEntry &getSection(unsigned SectionID) { return Sections[SectionID]; } + + bool isCompatibleFormat(const ObjectBuffer *Buffer) const override; + bool isCompatibleFile(const object::ObjectFile *Obj) const override; + void registerEHFrames() override; +}; + +/// RuntimeDyldMachOCRTPBase - Templated base class for generic MachO linker +/// algorithms and data structures. +/// +/// Concrete, target-specific sub-classes can be accessed via the impl() +/// methods. (i.e. the RuntimeDyldMachO hierarchy uses the Curiously +/// Recurring Template Idiom). Concrete subclasses for each target +/// can be found in ./Targets. +template <typename Impl> +class RuntimeDyldMachOCRTPBase : public RuntimeDyldMachO { +private: + Impl &impl() { return static_cast<Impl &>(*this); } + const Impl &impl() const { return static_cast<const Impl &>(*this); } + +protected: + + /// Parse the given relocation, which must be non-scattered, and + /// return a RelocationEntry representing the information. The 'Addend' field + /// will contain the unmodified instruction immediate. + RelocationEntry getBasicRelocationEntry(unsigned SectionID, + ObjectImage &ObjImg, + const relocation_iterator &RI) const { + const MachOObjectFile &Obj = + static_cast<const MachOObjectFile &>(*ObjImg.getObjectFile()); + MachO::any_relocation_info RelInfo = + Obj.getRelocation(RI->getRawDataRefImpl()); + + const SectionEntry &Section = Sections[SectionID]; + bool IsPCRel = Obj.getAnyRelocationPCRel(RelInfo); + unsigned Size = Obj.getAnyRelocationLength(RelInfo); + uint64_t Offset; + RI->getOffset(Offset); + uint8_t *LocalAddress = Section.Address + Offset; + unsigned NumBytes = 1 << Size; + uint32_t RelType = Obj.getAnyRelocationType(RelInfo); + uint64_t Addend = impl().decodeAddend(LocalAddress, NumBytes, RelType); + + return RelocationEntry(SectionID, Offset, RelType, Addend, IsPCRel, Size); + } + +public: + RuntimeDyldMachOCRTPBase(RTDyldMemoryManager *mm) : RuntimeDyldMachO(mm) {} + + void finalizeLoad(ObjectImage &ObjImg, ObjSectionToIDMap &SectionMap) { + unsigned EHFrameSID = RTDYLD_INVALID_SECTION_ID; + unsigned TextSID = RTDYLD_INVALID_SECTION_ID; + unsigned ExceptTabSID = RTDYLD_INVALID_SECTION_ID; + ObjSectionToIDMap::iterator i, e; + + for (i = SectionMap.begin(), e = SectionMap.end(); i != e; ++i) { + const SectionRef &Section = i->first; + StringRef Name; + Section.getName(Name); + if (Name == "__eh_frame") + EHFrameSID = i->second; + else if (Name == "__text") + TextSID = i->second; + else if (Name == "__gcc_except_tab") + ExceptTabSID = i->second; + else + impl().finalizeSection(ObjImg, i->second, Section); + } + UnregisteredEHFrameSections.push_back( + EHFrameRelatedSections(EHFrameSID, TextSID, ExceptTabSID)); + } +}; } // end namespace llvm +#undef DEBUG_TYPE + #endif diff --git a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOAArch64.h b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOAArch64.h new file mode 100644 index 000000000000..775ed9ec3635 --- /dev/null +++ b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOAArch64.h @@ -0,0 +1,255 @@ +//===-- RuntimeDyldMachOAArch64.h -- MachO/AArch64 specific code. -*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details.
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_RUNTIMEDYLDMACHOAARCH64_H +#define LLVM_RUNTIMEDYLDMACHOAARCH64_H + +#include "../RuntimeDyldMachO.h" + +#define DEBUG_TYPE "dyld" + +namespace llvm { + +class RuntimeDyldMachOAArch64 + : public RuntimeDyldMachOCRTPBase<RuntimeDyldMachOAArch64> { +public: + RuntimeDyldMachOAArch64(RTDyldMemoryManager *MM) + : RuntimeDyldMachOCRTPBase(MM) {} + + unsigned getMaxStubSize() override { return 8; } + + unsigned getStubAlignment() override { return 8; } + + relocation_iterator + processRelocationRef(unsigned SectionID, relocation_iterator RelI, + ObjectImage &ObjImg, ObjSectionToIDMap &ObjSectionToID, + const SymbolTableMap &Symbols, StubMap &Stubs) override { + const MachOObjectFile &Obj = + static_cast<const MachOObjectFile &>(*ObjImg.getObjectFile()); + MachO::any_relocation_info RelInfo = + Obj.getRelocation(RelI->getRawDataRefImpl()); + + assert(!Obj.isRelocationScattered(RelInfo) && + "Scattered relocations not supported on AArch64."); + + // ARM64 has an ARM64_RELOC_ADDEND relocation type that carries an explicit + // addend for the following relocation. If found: (1) store the associated + // addend, (2) consume the next relocation, and (3) use the stored addend to + // override the following relocation's addend. + bool HasExplicitAddend = false; + int64_t ExplicitAddend = 0; + if (Obj.getAnyRelocationType(RelInfo) == MachO::ARM64_RELOC_ADDEND) { + assert(!Obj.getPlainRelocationExternal(RelInfo)); + assert(!Obj.getAnyRelocationPCRel(RelInfo)); + assert(Obj.getAnyRelocationLength(RelInfo) == 2); + HasExplicitAddend = true; + int64_t RawAddend = Obj.getPlainRelocationSymbolNum(RelInfo); + // Sign-extend the 24-bit addend to 64 bits. + ExplicitAddend = (RawAddend << 40) >> 40; + ++RelI; + RelInfo = Obj.getRelocation(RelI->getRawDataRefImpl()); + } + + RelocationEntry RE(getBasicRelocationEntry(SectionID, ObjImg, RelI)); + RelocationValueRef Value( + getRelocationValueRef(ObjImg, RelI, RE, ObjSectionToID, Symbols)); + + if (HasExplicitAddend) { + RE.Addend = ExplicitAddend; + Value.Addend = ExplicitAddend; + } + + bool IsExtern = Obj.getPlainRelocationExternal(RelInfo); + if (!IsExtern && RE.IsPCRel) + makeValueAddendPCRel(Value, ObjImg, RelI); + + RE.Addend = Value.Addend; + + if (RE.RelType == MachO::ARM64_RELOC_GOT_LOAD_PAGE21 || + RE.RelType == MachO::ARM64_RELOC_GOT_LOAD_PAGEOFF12) + processGOTRelocation(RE, Value, Stubs); + else { + if (Value.SymbolName) + addRelocationForSymbol(RE, Value.SymbolName); + else + addRelocationForSection(RE, Value.SectionID); + } + + return ++RelI; + } + + void resolveRelocation(const RelocationEntry &RE, uint64_t Value) { + DEBUG(dumpRelocationToResolve(RE, Value)); + + const SectionEntry &Section = Sections[RE.SectionID]; + uint8_t *LocalAddress = Section.Address + RE.Offset; + + switch (RE.RelType) { + default: + llvm_unreachable("Invalid relocation type!"); + case MachO::ARM64_RELOC_UNSIGNED: { + assert(!RE.IsPCRel && "PCRel and ARM64_RELOC_UNSIGNED not supported"); + // Mask in the target value a byte at a time (we don't have an alignment + // guarantee for the target address, so this is safest). + if (RE.Size < 2) + llvm_unreachable("Invalid size for ARM64_RELOC_UNSIGNED"); + + writeBytesUnaligned(LocalAddress, Value + RE.Addend, 1 << RE.Size); + break; + } + case MachO::ARM64_RELOC_BRANCH26: { + assert(RE.IsPCRel && "non-PCRel ARM64_RELOC_BRANCH26 not supported"); + // Mask the value into the target address. We know instructions are + // 32-bit aligned, so we can do it all at once.
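+ // (Encoding note, for illustration: AArch64 B/BL carry a signed 26-bit word offset in bits [25:0], giving a branch range of +/-128 MiB; that is why the low two bits are dropped and the result is checked with isInt<26> below.)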
+ uint32_t *p = (uint32_t *)LocalAddress; + // Check if the addend is encoded in the instruction. + uint32_t EncodedAddend = *p & 0x03FFFFFF; + if (EncodedAddend != 0) { + if (RE.Addend == 0) + llvm_unreachable("branch26 instruction has embedded addend."); + else + llvm_unreachable("branch26 instruction has embedded addend and " + "ARM64_RELOC_ADDEND."); + } + // Check if branch is in range. + uint64_t FinalAddress = Section.LoadAddress + RE.Offset; + uint64_t PCRelVal = Value - FinalAddress + RE.Addend; + assert(isInt<26>(PCRelVal) && "Branch target out of range!"); + // Insert the value into the instruction. + *p = (*p & 0xFC000000) | ((uint32_t)(PCRelVal >> 2) & 0x03FFFFFF); + break; + } + case MachO::ARM64_RELOC_GOT_LOAD_PAGE21: + case MachO::ARM64_RELOC_PAGE21: { + assert(RE.IsPCRel && "non-PCRel ARM64_RELOC_PAGE21 not supported"); + // Mask the value into the target address. We know instructions are + // 32-bit aligned, so we can do it all at once. + uint32_t *p = (uint32_t *)LocalAddress; + // Check if the addend is encoded in the instruction. + uint32_t EncodedAddend = + ((*p & 0x60000000) >> 29) | ((*p & 0x01FFFFE0) >> 3); + if (EncodedAddend != 0) { + if (RE.Addend == 0) + llvm_unreachable("adrp instruction has embedded addend."); + else + llvm_unreachable("adrp instruction has embedded addend and " + "ARM64_RELOC_ADDEND."); + } + // Adjust for PC-relative relocation and offset. + uint64_t FinalAddress = Section.LoadAddress + RE.Offset; + uint64_t PCRelVal = + ((Value + RE.Addend) & (-4096)) - (FinalAddress & (-4096)); + // Check that the value fits into 21 bits (+ 12 lower bits). + assert(isInt<33>(PCRelVal) && "Invalid page reloc value!"); + // Insert the value into the instruction. + uint32_t ImmLoValue = (uint32_t)(PCRelVal << 17) & 0x60000000; + uint32_t ImmHiValue = (uint32_t)(PCRelVal >> 9) & 0x00FFFFE0; + *p = (*p & 0x9F00001F) | ImmHiValue | ImmLoValue; + break; + } + case MachO::ARM64_RELOC_GOT_LOAD_PAGEOFF12: + case MachO::ARM64_RELOC_PAGEOFF12: { + assert(!RE.IsPCRel && "PCRel and ARM64_RELOC_PAGEOFF12 not supported"); + // Mask the value into the target address. We know instructions are + // 32-bit aligned, so we can do it all at once. + uint32_t *p = (uint32_t *)LocalAddress; + // Check if the addend is encoded in the instruction. + uint32_t EncodedAddend = *p & 0x003FFC00; + if (EncodedAddend != 0) { + if (RE.Addend == 0) + llvm_unreachable("ldr/str/add instruction has embedded addend."); + else + llvm_unreachable("ldr/str/add instruction has embedded addend and " + "ARM64_RELOC_ADDEND."); + } + // Add the offset from the symbol. + Value += RE.Addend; + // Mask out the page address and only use the lower 12 bits. + Value &= 0xFFF; + // Check which instruction we are updating to obtain the implicit shift + // factor from LDR/STR instructions. + if (*p & 0x08000000) { + uint32_t ImplicitShift = ((*p >> 30) & 0x3); + switch (ImplicitShift) { + case 0: + // Check if this is a vector op. + if ((*p & 0x04800000) == 0x04800000) { + ImplicitShift = 4; + assert(((Value & 0xF) == 0) && + "128-bit LDR/STR not 16-byte aligned."); + } + break; + case 1: + assert(((Value & 0x1) == 0) && "16-bit LDR/STR not 2-byte aligned."); + break; + case 2: + assert(((Value & 0x3) == 0) && "32-bit LDR/STR not 4-byte aligned."); + break; + case 3: + assert(((Value & 0x7) == 0) && "64-bit LDR/STR not 8-byte aligned."); + break; + } + // Compensate for implicit shift. + Value >>= ImplicitShift; + } + // Insert the value into the instruction.
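+ // (Worked example, for illustration: a page offset of 0x28 in a 64-bit LDR, implicit shift 3, is stored as imm12 = 0x28 >> 3 = 5 in bits [21:10], which is exactly the 0x003FFC00 field masked below.)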
+ *p = (*p & 0xFFC003FF) | ((uint32_t)(Value << 10) & 0x003FFC00); + break; + } + case MachO::ARM64_RELOC_SUBTRACTOR: + case MachO::ARM64_RELOC_POINTER_TO_GOT: + case MachO::ARM64_RELOC_TLVP_LOAD_PAGE21: + case MachO::ARM64_RELOC_TLVP_LOAD_PAGEOFF12: + llvm_unreachable("Relocation type not implemented yet!"); + case MachO::ARM64_RELOC_ADDEND: + llvm_unreachable("ARM64_RELOC_ADDEND should have been handled by " + "processRelocationRef!"); + } + } + + void finalizeSection(ObjectImage &ObjImg, unsigned SectionID, + const SectionRef &Section) {} + +private: + void processGOTRelocation(const RelocationEntry &RE, + RelocationValueRef &Value, StubMap &Stubs) { + assert(RE.Size == 2); + SectionEntry &Section = Sections[RE.SectionID]; + StubMap::const_iterator i = Stubs.find(Value); + uint8_t *Addr; + if (i != Stubs.end()) + Addr = Section.Address + i->second; + else { + // FIXME: There must be a better way to do this than to check and fix the + // alignment every time! + uintptr_t BaseAddress = uintptr_t(Section.Address); + uintptr_t StubAlignment = getStubAlignment(); + uintptr_t StubAddress = + (BaseAddress + Section.StubOffset + StubAlignment - 1) & + -StubAlignment; + unsigned StubOffset = StubAddress - BaseAddress; + Stubs[Value] = StubOffset; + assert(((StubAddress % getStubAlignment()) == 0) && + "GOT entry not aligned"); + RelocationEntry GOTRE(RE.SectionID, StubOffset, + MachO::ARM64_RELOC_UNSIGNED, Value.Addend, + /*IsPCRel=*/false, /*Size=*/3); + if (Value.SymbolName) + addRelocationForSymbol(GOTRE, Value.SymbolName); + else + addRelocationForSection(GOTRE, Value.SectionID); + Section.StubOffset = StubOffset + getMaxStubSize(); + Addr = (uint8_t *)StubAddress; + } + RelocationEntry TargetRE(RE.SectionID, RE.Offset, RE.RelType, /*Addend=*/0, + RE.IsPCRel, RE.Size); + resolveRelocation(TargetRE, (uint64_t)Addr); + } +}; +} + +#undef DEBUG_TYPE + +#endif // LLVM_RUNTIMEDYLDMACHOAARCH64_H diff --git a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOARM.h b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOARM.h new file mode 100644 index 000000000000..1de994219824 --- /dev/null +++ b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOARM.h @@ -0,0 +1,154 @@ +//===----- RuntimeDyldMachOARM.h ---- MachO/ARM specific code. ----*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details.
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_RUNTIMEDYLDMACHOARM_H +#define LLVM_RUNTIMEDYLDMACHOARM_H + +#include "../RuntimeDyldMachO.h" + +#define DEBUG_TYPE "dyld" + +namespace llvm { + +class RuntimeDyldMachOARM + : public RuntimeDyldMachOCRTPBase<RuntimeDyldMachOARM> { +public: + RuntimeDyldMachOARM(RTDyldMemoryManager *MM) : RuntimeDyldMachOCRTPBase(MM) {} + + unsigned getMaxStubSize() override { return 8; } + + unsigned getStubAlignment() override { return 4; } + + relocation_iterator + processRelocationRef(unsigned SectionID, relocation_iterator RelI, + ObjectImage &ObjImg, ObjSectionToIDMap &ObjSectionToID, + const SymbolTableMap &Symbols, StubMap &Stubs) override { + const MachOObjectFile &Obj = + static_cast<const MachOObjectFile &>(*ObjImg.getObjectFile()); + MachO::any_relocation_info RelInfo = + Obj.getRelocation(RelI->getRawDataRefImpl()); + + if (Obj.isRelocationScattered(RelInfo)) + // Unhandled scattered relocations come in pairs (e.g. SECTDIFF followed + // by PAIR), so skip both entries. + return ++++RelI; + + RelocationEntry RE(getBasicRelocationEntry(SectionID, ObjImg, RelI)); + RelocationValueRef Value( + getRelocationValueRef(ObjImg, RelI, RE, ObjSectionToID, Symbols)); + + bool IsExtern = Obj.getPlainRelocationExternal(RelInfo); + if (!IsExtern && RE.IsPCRel) + makeValueAddendPCRel(Value, ObjImg, RelI); + + if ((RE.RelType & 0xf) == MachO::ARM_RELOC_BR24) + processBranchRelocation(RE, Value, Stubs); + else { + RE.Addend = Value.Addend; + if (Value.SymbolName) + addRelocationForSymbol(RE, Value.SymbolName); + else + addRelocationForSection(RE, Value.SectionID); + } + + return ++RelI; + } + + void resolveRelocation(const RelocationEntry &RE, uint64_t Value) { + DEBUG(dumpRelocationToResolve(RE, Value)); + const SectionEntry &Section = Sections[RE.SectionID]; + uint8_t *LocalAddress = Section.Address + RE.Offset; + + // If the relocation is PC-relative, the value to be encoded is the + // pointer difference. + if (RE.IsPCRel) { + uint64_t FinalAddress = Section.LoadAddress + RE.Offset; + Value -= FinalAddress; + // ARM PCRel relocations have an effective-PC offset of two instructions + // (4 bytes in Thumb mode, 8 bytes in ARM mode). + // FIXME: For now, assume ARM mode. + Value -= 8; + } + + switch (RE.RelType) { + default: + llvm_unreachable("Invalid relocation type!"); + case MachO::ARM_RELOC_VANILLA: + writeBytesUnaligned(LocalAddress, Value, 1 << RE.Size); + break; + case MachO::ARM_RELOC_BR24: { + // Mask the value into the target address. We know instructions are + // 32-bit aligned, so we can do it all at once. + uint32_t *p = (uint32_t *)LocalAddress; + // The low two bits of the value are not encoded. + Value >>= 2; + // Mask the value to 24 bits. + uint64_t FinalValue = Value & 0xffffff; + // Check for overflow. + if (Value != FinalValue) { + Error("ARM BR24 relocation out of range."); + return; + } + // FIXME: If the destination is a Thumb function (and the instruction + // is a non-predicated BL instruction), we need to change it to a BLX + // instruction instead. + + // Insert the value into the instruction.
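+ // (Encoding note, for illustration: ARM-mode B/BL hold a signed 24-bit word offset in bits [23:0], so the reachable range is +/-32 MiB around the effective PC.)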
+ *p = (*p & ~0xffffff) | FinalValue; + break; + } + case MachO::ARM_THUMB_RELOC_BR22: + case MachO::ARM_THUMB_32BIT_BRANCH: + case MachO::ARM_RELOC_HALF: + case MachO::ARM_RELOC_HALF_SECTDIFF: + case MachO::ARM_RELOC_PAIR: + case MachO::ARM_RELOC_SECTDIFF: + case MachO::ARM_RELOC_LOCAL_SECTDIFF: + case MachO::ARM_RELOC_PB_LA_PTR: + Error("Relocation type not implemented yet!"); + return; + } + } + + void finalizeSection(ObjectImage &ObjImg, unsigned SectionID, + const SectionRef &Section) {} + +private: + void processBranchRelocation(const RelocationEntry &RE, + const RelocationValueRef &Value, + StubMap &Stubs) { + // This is an ARM branch relocation; we need to use a stub function. + // Look for an existing stub. + SectionEntry &Section = Sections[RE.SectionID]; + RuntimeDyldMachO::StubMap::const_iterator i = Stubs.find(Value); + uint8_t *Addr; + if (i != Stubs.end()) { + Addr = Section.Address + i->second; + } else { + // Create a new stub function. + Stubs[Value] = Section.StubOffset; + uint8_t *StubTargetAddr = + createStubFunction(Section.Address + Section.StubOffset); + RelocationEntry StubRE(RE.SectionID, StubTargetAddr - Section.Address, + MachO::GENERIC_RELOC_VANILLA, Value.Addend); + if (Value.SymbolName) + addRelocationForSymbol(StubRE, Value.SymbolName); + else + addRelocationForSection(StubRE, Value.SectionID); + Addr = Section.Address + Section.StubOffset; + Section.StubOffset += getMaxStubSize(); + } + RelocationEntry TargetRE(Value.SectionID, RE.Offset, RE.RelType, 0, + RE.IsPCRel, RE.Size); + resolveRelocation(TargetRE, (uint64_t)Addr); + } +}; +} + +#undef DEBUG_TYPE + +#endif // LLVM_RUNTIMEDYLDMACHOARM_H diff --git a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOI386.h b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOI386.h new file mode 100644 index 000000000000..856c6ca3035c --- /dev/null +++ b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOI386.h @@ -0,0 +1,315 @@ +//===---- RuntimeDyldMachOI386.h ---- MachO/I386 specific code. ---*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details.
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_RUNTIMEDYLDMACHOI386_H +#define LLVM_RUNTIMEDYLDMACHOI386_H + +#include "../RuntimeDyldMachO.h" + +#define DEBUG_TYPE "dyld" + +namespace llvm { + +class RuntimeDyldMachOI386 + : public RuntimeDyldMachOCRTPBase<RuntimeDyldMachOI386> { +public: + RuntimeDyldMachOI386(RTDyldMemoryManager *MM) + : RuntimeDyldMachOCRTPBase(MM) {} + + unsigned getMaxStubSize() override { return 0; } + + unsigned getStubAlignment() override { return 1; } + + relocation_iterator + processRelocationRef(unsigned SectionID, relocation_iterator RelI, + ObjectImage &ObjImg, ObjSectionToIDMap &ObjSectionToID, + const SymbolTableMap &Symbols, StubMap &Stubs) override { + const MachOObjectFile &Obj = + static_cast<const MachOObjectFile &>(*ObjImg.getObjectFile()); + MachO::any_relocation_info RelInfo = + Obj.getRelocation(RelI->getRawDataRefImpl()); + uint32_t RelType = Obj.getAnyRelocationType(RelInfo); + + if (Obj.isRelocationScattered(RelInfo)) { + if (RelType == MachO::GENERIC_RELOC_SECTDIFF || + RelType == MachO::GENERIC_RELOC_LOCAL_SECTDIFF) + return processSECTDIFFRelocation(SectionID, RelI, ObjImg, + ObjSectionToID); + else if (Arch == Triple::x86 && RelType == MachO::GENERIC_RELOC_VANILLA) + return processI386ScatteredVANILLA(SectionID, RelI, ObjImg, + ObjSectionToID); + llvm_unreachable("Unhandled scattered relocation."); + } + + RelocationEntry RE(getBasicRelocationEntry(SectionID, ObjImg, RelI)); + RelocationValueRef Value( + getRelocationValueRef(ObjImg, RelI, RE, ObjSectionToID, Symbols)); + + // Addends for external, PC-rel relocations on i386 point back to the zero + // offset. Calculate the final offset from the relocation target instead. + // This allows us to use the same logic for both external and internal + // relocations in resolveI386RelocationRef. + // bool IsExtern = Obj.getPlainRelocationExternal(RelInfo); + // if (IsExtern && RE.IsPCRel) { + // uint64_t RelocAddr = 0; + // RelI->getAddress(RelocAddr); + // Value.Addend += RelocAddr + 4; + // } + if (RE.IsPCRel) + makeValueAddendPCRel(Value, ObjImg, RelI); + + RE.Addend = Value.Addend; + + if (Value.SymbolName) + addRelocationForSymbol(RE, Value.SymbolName); + else + addRelocationForSection(RE, Value.SectionID); + + return ++RelI; + } + + void resolveRelocation(const RelocationEntry &RE, uint64_t Value) { + DEBUG(dumpRelocationToResolve(RE, Value)); + + const SectionEntry &Section = Sections[RE.SectionID]; + uint8_t *LocalAddress = Section.Address + RE.Offset; + + if (RE.IsPCRel) { + uint64_t FinalAddress = Section.LoadAddress + RE.Offset; + Value -= FinalAddress + 4; // see MachOX86_64::resolveRelocation.
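+ // (For illustration: an i386 PC-relative fixup is measured from the end of the 4-byte immediate, i.e. disp = Target - (FixupAddress + 4), which is what the adjustment above implements.)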
+ } + + switch (RE.RelType) { + default: + llvm_unreachable("Invalid relocation type!"); + case MachO::GENERIC_RELOC_VANILLA: + writeBytesUnaligned(LocalAddress, Value + RE.Addend, 1 << RE.Size); + break; + case MachO::GENERIC_RELOC_SECTDIFF: + case MachO::GENERIC_RELOC_LOCAL_SECTDIFF: { + uint64_t SectionABase = Sections[RE.Sections.SectionA].LoadAddress; + uint64_t SectionBBase = Sections[RE.Sections.SectionB].LoadAddress; + assert((Value == SectionABase || Value == SectionBBase) && + "Unexpected SECTDIFF relocation value."); + Value = SectionABase - SectionBBase + RE.Addend; + writeBytesUnaligned(LocalAddress, Value, 1 << RE.Size); + break; + } + case MachO::GENERIC_RELOC_PB_LA_PTR: + Error("Relocation type not implemented yet!"); + } + } + + void finalizeSection(ObjectImage &ObjImg, unsigned SectionID, + const SectionRef &Section) { + StringRef Name; + Section.getName(Name); + + if (Name == "__jump_table") + populateJumpTable(cast<MachOObjectFile>(*ObjImg.getObjectFile()), Section, + SectionID); + else if (Name == "__pointers") + populatePointersSection(cast<MachOObjectFile>(*ObjImg.getObjectFile()), + Section, SectionID); + } + +private: + relocation_iterator + processSECTDIFFRelocation(unsigned SectionID, relocation_iterator RelI, + ObjectImage &Obj, + ObjSectionToIDMap &ObjSectionToID) { + const MachOObjectFile *MachO = + static_cast<const MachOObjectFile *>(Obj.getObjectFile()); + MachO::any_relocation_info RE = + MachO->getRelocation(RelI->getRawDataRefImpl()); + + SectionEntry &Section = Sections[SectionID]; + uint32_t RelocType = MachO->getAnyRelocationType(RE); + bool IsPCRel = MachO->getAnyRelocationPCRel(RE); + unsigned Size = MachO->getAnyRelocationLength(RE); + uint64_t Offset; + RelI->getOffset(Offset); + uint8_t *LocalAddress = Section.Address + Offset; + unsigned NumBytes = 1 << Size; + int64_t Addend = 0; + memcpy(&Addend, LocalAddress, NumBytes); + + ++RelI; + MachO::any_relocation_info RE2 = + MachO->getRelocation(RelI->getRawDataRefImpl()); + + uint32_t AddrA = MachO->getScatteredRelocationValue(RE); + section_iterator SAI = getSectionByAddress(*MachO, AddrA); + assert(SAI != MachO->section_end() && "Can't find section for address A"); + uint64_t SectionABase; + SAI->getAddress(SectionABase); + uint64_t SectionAOffset = AddrA - SectionABase; + SectionRef SectionA = *SAI; + bool IsCode; + SectionA.isText(IsCode); + uint32_t SectionAID = + findOrEmitSection(Obj, SectionA, IsCode, ObjSectionToID); + + uint32_t AddrB = MachO->getScatteredRelocationValue(RE2); + section_iterator SBI = getSectionByAddress(*MachO, AddrB); + assert(SBI != MachO->section_end() && "Can't find section for address B"); + uint64_t SectionBBase; + SBI->getAddress(SectionBBase); + uint64_t SectionBOffset = AddrB - SectionBBase; + SectionRef SectionB = *SBI; + uint32_t SectionBID = + findOrEmitSection(Obj, SectionB, IsCode, ObjSectionToID); + + if (Addend != AddrA - AddrB) + Error("Unexpected SECTDIFF relocation addend."); + + DEBUG(dbgs() << "Found SECTDIFF: AddrA: " << AddrA << ", AddrB: " << AddrB + << ", Addend: " << Addend << ", SectionA ID: " << SectionAID + << ", SectionAOffset: " << SectionAOffset + << ", SectionB ID: " << SectionBID + << ", SectionBOffset: " << SectionBOffset << "\n"); + RelocationEntry R(SectionID, Offset, RelocType, 0, SectionAID, + SectionAOffset, SectionBID, SectionBOffset, IsPCRel, + Size); + + addRelocationForSection(R, SectionAID); + addRelocationForSection(R, SectionBID); + + return ++RelI; + } + + relocation_iterator processI386ScatteredVANILLA( + unsigned SectionID, relocation_iterator RelI, ObjectImage &Obj, +
RuntimeDyldMachO::ObjSectionToIDMap &ObjSectionToID) { + const MachOObjectFile *MachO = + static_cast<const MachOObjectFile *>(Obj.getObjectFile()); + MachO::any_relocation_info RE = + MachO->getRelocation(RelI->getRawDataRefImpl()); + + SectionEntry &Section = Sections[SectionID]; + uint32_t RelocType = MachO->getAnyRelocationType(RE); + bool IsPCRel = MachO->getAnyRelocationPCRel(RE); + unsigned Size = MachO->getAnyRelocationLength(RE); + uint64_t Offset; + RelI->getOffset(Offset); + uint8_t *LocalAddress = Section.Address + Offset; + unsigned NumBytes = 1 << Size; + int64_t Addend = 0; + memcpy(&Addend, LocalAddress, NumBytes); + + unsigned SymbolBaseAddr = MachO->getScatteredRelocationValue(RE); + section_iterator TargetSI = getSectionByAddress(*MachO, SymbolBaseAddr); + assert(TargetSI != MachO->section_end() && "Can't find section for symbol"); + uint64_t SectionBaseAddr; + TargetSI->getAddress(SectionBaseAddr); + SectionRef TargetSection = *TargetSI; + bool IsCode; + TargetSection.isText(IsCode); + uint32_t TargetSectionID = + findOrEmitSection(Obj, TargetSection, IsCode, ObjSectionToID); + + Addend -= SectionBaseAddr; + RelocationEntry R(SectionID, Offset, RelocType, Addend, IsPCRel, Size); + + addRelocationForSection(R, TargetSectionID); + + return ++RelI; + } + + // Populate stubs in __jump_table section. + void populateJumpTable(MachOObjectFile &Obj, const SectionRef &JTSection, + unsigned JTSectionID) { + assert(!Obj.is64Bit() && + "__jump_table section not supported in 64-bit MachO."); + + MachO::dysymtab_command DySymTabCmd = Obj.getDysymtabLoadCommand(); + MachO::section Sec32 = Obj.getSection(JTSection.getRawDataRefImpl()); + uint32_t JTSectionSize = Sec32.size; + unsigned FirstIndirectSymbol = Sec32.reserved1; + unsigned JTEntrySize = Sec32.reserved2; + unsigned NumJTEntries = JTSectionSize / JTEntrySize; + uint8_t *JTSectionAddr = getSectionAddress(JTSectionID); + unsigned JTEntryOffset = 0; + + assert((JTSectionSize % JTEntrySize) == 0 && + "Jump-table section does not contain a whole number of stubs?"); + + for (unsigned i = 0; i < NumJTEntries; ++i) { + unsigned SymbolIndex = + Obj.getIndirectSymbolTableEntry(DySymTabCmd, FirstIndirectSymbol + i); + symbol_iterator SI = Obj.getSymbolByIndex(SymbolIndex); + StringRef IndirectSymbolName; + SI->getName(IndirectSymbolName); + uint8_t *JTEntryAddr = JTSectionAddr + JTEntryOffset; + createStubFunction(JTEntryAddr); + RelocationEntry RE(JTSectionID, JTEntryOffset + 1, + MachO::GENERIC_RELOC_VANILLA, 0, true, 2); + addRelocationForSymbol(RE, IndirectSymbolName); + JTEntryOffset += JTEntrySize; + } + } + + // Populate __pointers section.
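+ // (Analogous to populateJumpTable above, for illustration: each 4-byte pointer entry receives a GENERIC_RELOC_VANILLA relocation against its indirect symbol.)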
+ void populatePointersSection(MachOObjectFile &Obj, + const SectionRef &PTSection, + unsigned PTSectionID) { + assert(!Obj.is64Bit() && + "__pointers section not supported in 64-bit MachO."); + + MachO::dysymtab_command DySymTabCmd = Obj.getDysymtabLoadCommand(); + MachO::section Sec32 = Obj.getSection(PTSection.getRawDataRefImpl()); + uint32_t PTSectionSize = Sec32.size; + unsigned FirstIndirectSymbol = Sec32.reserved1; + const unsigned PTEntrySize = 4; + unsigned NumPTEntries = PTSectionSize / PTEntrySize; + unsigned PTEntryOffset = 0; + + assert((PTSectionSize % PTEntrySize) == 0 && + "Pointers section does not contain a whole number of stubs?"); + + DEBUG(dbgs() << "Populating __pointers, Section ID " << PTSectionID << ", " + << NumPTEntries << " entries, " << PTEntrySize + << " bytes each:\n"); + + for (unsigned i = 0; i < NumPTEntries; ++i) { + unsigned SymbolIndex = + Obj.getIndirectSymbolTableEntry(DySymTabCmd, FirstIndirectSymbol + i); + symbol_iterator SI = Obj.getSymbolByIndex(SymbolIndex); + StringRef IndirectSymbolName; + SI->getName(IndirectSymbolName); + DEBUG(dbgs() << " " << IndirectSymbolName << ": index " << SymbolIndex + << ", PT offset: " << PTEntryOffset << "\n"); + RelocationEntry RE(PTSectionID, PTEntryOffset, + MachO::GENERIC_RELOC_VANILLA, 0, false, 2); + addRelocationForSymbol(RE, IndirectSymbolName); + PTEntryOffset += PTEntrySize; + } + } + + static section_iterator getSectionByAddress(const MachOObjectFile &Obj, + uint64_t Addr) { + section_iterator SI = Obj.section_begin(); + section_iterator SE = Obj.section_end(); + + for (; SI != SE; ++SI) { + uint64_t SAddr, SSize; + SI->getAddress(SAddr); + SI->getSize(SSize); + if ((Addr >= SAddr) && (Addr < SAddr + SSize)) + return SI; + } + + return SE; + } +}; +} + +#undef DEBUG_TYPE + +#endif // LLVM_RUNTIMEDYLDMACHOI386_H diff --git a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOX86_64.h b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOX86_64.h new file mode 100644 index 000000000000..99efe9da4486 --- /dev/null +++ b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOX86_64.h @@ -0,0 +1,132 @@ +//===-- RuntimeDyldMachOX86_64.h ---- MachO/X86_64 specific code. -*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_RUNTIMEDYLDMACHOX86_64_H +#define LLVM_RUNTIMEDYLDMACHOX86_64_H + +#include "../RuntimeDyldMachO.h" + +#define DEBUG_TYPE "dyld" + +namespace llvm { + +class RuntimeDyldMachOX86_64 + : public RuntimeDyldMachOCRTPBase<RuntimeDyldMachOX86_64> { +public: + RuntimeDyldMachOX86_64(RTDyldMemoryManager *MM) + : RuntimeDyldMachOCRTPBase(MM) {} + + unsigned getMaxStubSize() override { return 8; } + + unsigned getStubAlignment() override { return 1; } + + relocation_iterator + processRelocationRef(unsigned SectionID, relocation_iterator RelI, + ObjectImage &ObjImg, ObjSectionToIDMap &ObjSectionToID, + const SymbolTableMap &Symbols, StubMap &Stubs) override { + const MachOObjectFile &Obj = + static_cast<const MachOObjectFile &>(*ObjImg.getObjectFile()); + MachO::any_relocation_info RelInfo = + Obj.getRelocation(RelI->getRawDataRefImpl()); + + assert(!Obj.isRelocationScattered(RelInfo) && + "Scattered relocations not supported on X86_64"); + + RelocationEntry RE(getBasicRelocationEntry(SectionID, ObjImg, RelI)); + RelocationValueRef Value( + getRelocationValueRef(ObjImg, RelI, RE, ObjSectionToID, Symbols)); + + bool IsExtern = Obj.getPlainRelocationExternal(RelInfo); + if (!IsExtern && RE.IsPCRel) + makeValueAddendPCRel(Value, ObjImg, RelI); + + if (RE.RelType == MachO::X86_64_RELOC_GOT || + RE.RelType == MachO::X86_64_RELOC_GOT_LOAD) + processGOTRelocation(RE, Value, Stubs); + else { + RE.Addend = Value.Addend; + if (Value.SymbolName) + addRelocationForSymbol(RE, Value.SymbolName); + else + addRelocationForSection(RE, Value.SectionID); + } + + return ++RelI; + } + + void resolveRelocation(const RelocationEntry &RE, uint64_t Value) { + DEBUG(dumpRelocationToResolve(RE, Value)); + const SectionEntry &Section = Sections[RE.SectionID]; + uint8_t *LocalAddress = Section.Address + RE.Offset; + + // If the relocation is PC-relative, the value to be encoded is the + // pointer difference. + if (RE.IsPCRel) { + // FIXME: It seems this value needs to be adjusted by 4 for an effective + // PC address. Is that expected? Only for branches, perhaps?
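+ // (Background, for what it's worth: x86-64 RIP-relative displacements are defined relative to the next instruction, and the disp32 is normally the instruction's final four bytes, hence the +4; the SIGNED_1/2/4 variants exist for instructions with trailing immediates.)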
+ uint64_t FinalAddress = Section.LoadAddress + RE.Offset; + Value -= FinalAddress + 4; + } + + switch (RE.RelType) { + default: + llvm_unreachable("Invalid relocation type!"); + case MachO::X86_64_RELOC_SIGNED_1: + case MachO::X86_64_RELOC_SIGNED_2: + case MachO::X86_64_RELOC_SIGNED_4: + case MachO::X86_64_RELOC_SIGNED: + case MachO::X86_64_RELOC_UNSIGNED: + case MachO::X86_64_RELOC_BRANCH: + writeBytesUnaligned(LocalAddress, Value + RE.Addend, 1 << RE.Size); + break; + case MachO::X86_64_RELOC_GOT_LOAD: + case MachO::X86_64_RELOC_GOT: + case MachO::X86_64_RELOC_SUBTRACTOR: + case MachO::X86_64_RELOC_TLV: + Error("Relocation type not implemented yet!"); + } + } + + void finalizeSection(ObjectImage &ObjImg, unsigned SectionID, + const SectionRef &Section) {} + +private: + void processGOTRelocation(const RelocationEntry &RE, + RelocationValueRef &Value, StubMap &Stubs) { + SectionEntry &Section = Sections[RE.SectionID]; + assert(RE.IsPCRel); + assert(RE.Size == 2); + Value.Addend -= RE.Addend; + RuntimeDyldMachO::StubMap::const_iterator i = Stubs.find(Value); + uint8_t *Addr; + if (i != Stubs.end()) { + Addr = Section.Address + i->second; + } else { + Stubs[Value] = Section.StubOffset; + uint8_t *GOTEntry = Section.Address + Section.StubOffset; + RelocationEntry GOTRE(RE.SectionID, Section.StubOffset, + MachO::X86_64_RELOC_UNSIGNED, Value.Addend, false, + 3); + if (Value.SymbolName) + addRelocationForSymbol(GOTRE, Value.SymbolName); + else + addRelocationForSection(GOTRE, Value.SectionID); + Section.StubOffset += 8; + Addr = GOTEntry; + } + RelocationEntry TargetRE(RE.SectionID, RE.Offset, + MachO::X86_64_RELOC_UNSIGNED, RE.Addend, true, 2); + resolveRelocation(TargetRE, (uint64_t)Addr); + } +}; +} + +#undef DEBUG_TYPE + +#endif // LLVM_RUNTIMEDYLDMACHOX86_64_H diff --git a/lib/IR/AsmWriter.cpp b/lib/IR/AsmWriter.cpp index 398e3d5f2a1b..a7499bc09b30 100644 --- a/lib/IR/AsmWriter.cpp +++ b/lib/IR/AsmWriter.cpp @@ -106,6 +106,7 @@ static void PrintEscapedString(StringRef Name, raw_ostream &Out) { enum PrefixType { GlobalPrefix, + ComdatPrefix, LabelPrefix, LocalPrefix, NoPrefix @@ -119,6 +120,7 @@ static void PrintLLVMName(raw_ostream &OS, StringRef Name, PrefixType Prefix) { switch (Prefix) { case NoPrefix: break; case GlobalPrefix: OS << '@'; break; + case ComdatPrefix: OS << '$'; break; case LabelPrefix: break; case LocalPrefix: OS << '%'; break; } @@ -1165,8 +1167,15 @@ static void WriteAsOperandInternal(raw_ostream &Out, const Value *V, } void AssemblyWriter::init() { - if (TheModule) - TypePrinter.incorporateTypes(*TheModule); + if (!TheModule) + return; + TypePrinter.incorporateTypes(*TheModule); + for (const Function &F : *TheModule) + if (const Comdat *C = F.getComdat()) + Comdats.insert(C); + for (const GlobalVariable &GV : TheModule->globals()) + if (const Comdat *C = GV.getComdat()) + Comdats.insert(C); } @@ -1308,6 +1317,15 @@ void AssemblyWriter::printModule(const Module *M) { printTypeIdentities(); + // Output all comdats. + if (!Comdats.empty()) + Out << '\n'; + for (const Comdat *C : Comdats) { + printComdat(C); + if (C != Comdats.back()) + Out << '\n'; + } + // Output all globals. 
if (!M->global_empty()) Out << '\n'; for (Module::const_global_iterator I = M->global_begin(), E = M->global_end(); @@ -1470,6 +1488,10 @@ void AssemblyWriter::printGlobal(const GlobalVariable *GV) { PrintEscapedString(GV->getSection(), Out); Out << '"'; } + if (GV->hasComdat()) { + Out << ", comdat "; + PrintLLVMName(Out, GV->getComdat()->getName(), ComdatPrefix); + } if (GV->getAlignment()) Out << ", align " << GV->getAlignment(); @@ -1510,6 +1532,10 @@ void AssemblyWriter::printAlias(const GlobalAlias *GA) { Out << '\n'; } +void AssemblyWriter::printComdat(const Comdat *C) { + C->print(Out); +} + void AssemblyWriter::printTypeIdentities() { if (TypePrinter.NumberedTypes.empty() && TypePrinter.NamedTypes.empty()) @@ -1647,6 +1673,10 @@ void AssemblyWriter::printFunction(const Function *F) { PrintEscapedString(F->getSection(), Out); Out << '"'; } + if (F->hasComdat()) { + Out << " comdat "; + PrintLLVMName(Out, F->getComdat()->getName(), ComdatPrefix); + } if (F->getAlignment()) Out << " align " << F->getAlignment(); if (F->hasGC()) @@ -2158,6 +2188,31 @@ void NamedMDNode::print(raw_ostream &ROS) const { W.printNamedMDNode(this); } +void Comdat::print(raw_ostream &ROS) const { + PrintLLVMName(ROS, getName(), ComdatPrefix); + ROS << " = comdat "; + + switch (getSelectionKind()) { + case Comdat::Any: + ROS << "any"; + break; + case Comdat::ExactMatch: + ROS << "exactmatch"; + break; + case Comdat::Largest: + ROS << "largest"; + break; + case Comdat::NoDuplicates: + ROS << "noduplicates"; + break; + case Comdat::SameSize: + ROS << "samesize"; + break; + } + + ROS << '\n'; +} + void Type::print(raw_ostream &OS) const { TypePrinting TP; TP.print(const_cast(this), OS); @@ -2241,5 +2296,8 @@ void Type::dump() const { print(dbgs()); } // Module::dump() - Allow printing of Modules from the debugger. void Module::dump() const { print(dbgs(), nullptr); } +// \brief Allow printing of Comdats from the debugger. +void Comdat::dump() const { print(dbgs()); } + // NamedMDNode::dump() - Allow printing of NamedMDNodes from the debugger. 
void NamedMDNode::dump() const { print(dbgs()); } diff --git a/lib/IR/AsmWriter.h b/lib/IR/AsmWriter.h index b4ce6de10dde..aef9c8a3e9f7 100644 --- a/lib/IR/AsmWriter.h +++ b/lib/IR/AsmWriter.h @@ -16,6 +16,7 @@ #define LLVM_IR_ASSEMBLYWRITER_H #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SetVector.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/TypeFinder.h" @@ -26,6 +27,7 @@ namespace llvm { class BasicBlock; class Function; class GlobalValue; +class Comdat; class Module; class NamedMDNode; class Value; @@ -70,6 +72,7 @@ class AssemblyWriter { SlotTracker &Machine; TypePrinting TypePrinter; AssemblyAnnotationWriter *AnnotationWriter; + SetVector<const Comdat *> Comdats; public: /// Construct an AssemblyWriter with an external SlotTracker @@ -101,6 +104,7 @@ class AssemblyWriter { void printTypeIdentities(); void printGlobal(const GlobalVariable *GV); void printAlias(const GlobalAlias *GV); + void printComdat(const Comdat *C); void printFunction(const Function *F); void printArgument(const Argument *FA, AttributeSet Attrs, unsigned Idx); void printBasicBlock(const BasicBlock *BB); diff --git a/lib/IR/AttributeImpl.h b/lib/IR/AttributeImpl.h index 9f3fd3e606d3..cc6d557ab4c7 100644 --- a/lib/IR/AttributeImpl.h +++ b/lib/IR/AttributeImpl.h @@ -39,7 +39,7 @@ class AttributeImpl : public FoldingSetNode { protected: enum AttrEntryKind { EnumAttrEntry, - AlignAttrEntry, + IntAttrEntry, StringAttrEntry }; @@ -49,7 +49,7 @@ class AttributeImpl : public FoldingSetNode { virtual ~AttributeImpl(); bool isEnumAttribute() const { return KindID == EnumAttrEntry; } - bool isAlignAttribute() const { return KindID == AlignAttrEntry; } + bool isIntAttribute() const { return KindID == IntAttrEntry; } bool isStringAttribute() const { return KindID == StringAttrEntry; } bool hasAttribute(Attribute::AttrKind A) const; @@ -67,7 +67,7 @@ class AttributeImpl : public FoldingSetNode { void Profile(FoldingSetNodeID &ID) const { if (isEnumAttribute()) Profile(ID, getKindAsEnum(), 0); - else if (isAlignAttribute()) + else if (isIntAttribute()) Profile(ID, getKindAsEnum(), getValueAsInt()); else Profile(ID, getKindAsString(), getValueAsString()); @@ -108,19 +108,20 @@ class EnumAttributeImpl : public AttributeImpl { Attribute::AttrKind getEnumKind() const { return Kind; } }; -class AlignAttributeImpl : public EnumAttributeImpl { +class IntAttributeImpl : public EnumAttributeImpl { void anchor() override; - unsigned Align; + uint64_t Val; public: - AlignAttributeImpl(Attribute::AttrKind Kind, unsigned Align) - : EnumAttributeImpl(AlignAttrEntry, Kind), Align(Align) { + IntAttributeImpl(Attribute::AttrKind Kind, uint64_t Val) - : EnumAttributeImpl(IntAttrEntry, Kind), Val(Val) { assert( - (Kind == Attribute::Alignment || Kind == Attribute::StackAlignment) && - "Wrong kind for alignment attribute!"); + (Kind == Attribute::Alignment || Kind == Attribute::StackAlignment || + Kind == Attribute::Dereferenceable) && + "Wrong kind for int attribute!"); } - unsigned getAlignment() const { return Align; } + uint64_t getValue() const { return Val; } }; class StringAttributeImpl : public AttributeImpl { @@ -164,6 +165,7 @@ class AttributeSetNode : public FoldingSetNode { unsigned getAlignment() const; unsigned getStackAlignment() const; + uint64_t getDereferenceableBytes() const; std::string getAsString(bool InAttrGrp) const; typedef const Attribute *iterator; diff --git a/lib/IR/Attributes.cpp b/lib/IR/Attributes.cpp index 58475e2d3cba..04545ea919a4 100644 --- a/lib/IR/Attributes.cpp +++
b/lib/IR/Attributes.cpp @@ -47,7 +47,7 @@ Attribute Attribute::get(LLVMContext &Context, Attribute::AttrKind Kind, if (!Val) PA = new EnumAttributeImpl(Kind); else - PA = new AlignAttributeImpl(Kind, Val); + PA = new IntAttributeImpl(Kind, Val); pImpl->AttrsSet.InsertNode(PA, InsertPoint); } @@ -88,6 +88,12 @@ Attribute Attribute::getWithStackAlignment(LLVMContext &Context, return get(Context, StackAlignment, Align); } +Attribute Attribute::getWithDereferenceableBytes(LLVMContext &Context, + uint64_t Bytes) { + assert(Bytes && "Bytes must be non-zero."); + return get(Context, Dereferenceable, Bytes); +} + //===----------------------------------------------------------------------===// // Attribute Accessor Methods //===----------------------------------------------------------------------===// @@ -96,8 +102,8 @@ bool Attribute::isEnumAttribute() const { return pImpl && pImpl->isEnumAttribute(); } -bool Attribute::isAlignAttribute() const { - return pImpl && pImpl->isAlignAttribute(); +bool Attribute::isIntAttribute() const { + return pImpl && pImpl->isIntAttribute(); } bool Attribute::isStringAttribute() const { @@ -106,15 +112,15 @@ bool Attribute::isStringAttribute() const { Attribute::AttrKind Attribute::getKindAsEnum() const { if (!pImpl) return None; - assert((isEnumAttribute() || isAlignAttribute()) && + assert((isEnumAttribute() || isIntAttribute()) && "Invalid attribute type to get the kind as an enum!"); return pImpl ? pImpl->getKindAsEnum() : None; } uint64_t Attribute::getValueAsInt() const { if (!pImpl) return 0; - assert(isAlignAttribute() && - "Expected the attribute to be an alignment attribute!"); + assert(isIntAttribute() && + "Expected the attribute to be an integer attribute!"); return pImpl ? pImpl->getValueAsInt() : 0; } @@ -156,6 +162,14 @@ unsigned Attribute::getStackAlignment() const { return pImpl->getValueAsInt(); } +/// This returns the number of dereferenceable bytes. +uint64_t Attribute::getDereferenceableBytes() const { + assert(hasAttribute(Attribute::Dereferenceable) && + "Trying to get dereferenceable bytes from " + "non-dereferenceable attribute!"); + return pImpl->getValueAsInt(); +} + std::string Attribute::getAsString(bool InAttrGrp) const { if (!pImpl) return ""; @@ -263,6 +277,20 @@ std::string Attribute::getAsString(bool InAttrGrp) const { return Result; } + if (hasAttribute(Attribute::Dereferenceable)) { + std::string Result; + Result += "dereferenceable"; + if (InAttrGrp) { + Result += "="; + Result += utostr(getValueAsInt()); + } else { + Result += "("; + Result += utostr(getValueAsInt()); + Result += ")"; + } + return Result; + } + // Convert target-dependent attributes to strings of the form: // // "kind" @@ -293,10 +321,10 @@ bool Attribute::operator<(Attribute A) const { // AttributeImpl Definition //===----------------------------------------------------------------------===// -// Pin the vtabels to this file. +// Pin the vtables to this file. 
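+// (Each anchor() defined below is its class's only out-of-line virtual method, which is what makes the compiler emit the vtable in this translation unit.)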
AttributeImpl::~AttributeImpl() {} void EnumAttributeImpl::anchor() {} -void AlignAttributeImpl::anchor() {} +void IntAttributeImpl::anchor() {} void StringAttributeImpl::anchor() {} bool AttributeImpl::hasAttribute(Attribute::AttrKind A) const { @@ -310,13 +338,13 @@ bool AttributeImpl::hasAttribute(StringRef Kind) const { } Attribute::AttrKind AttributeImpl::getKindAsEnum() const { - assert(isEnumAttribute() || isAlignAttribute()); + assert(isEnumAttribute() || isIntAttribute()); return static_cast(this)->getEnumKind(); } uint64_t AttributeImpl::getValueAsInt() const { - assert(isAlignAttribute()); - return static_cast(this)->getAlignment(); + assert(isIntAttribute()); + return static_cast(this)->getValue(); } StringRef AttributeImpl::getKindAsString() const { @@ -334,18 +362,18 @@ bool AttributeImpl::operator<(const AttributeImpl &AI) const { // relative to their enum value) and then strings. if (isEnumAttribute()) { if (AI.isEnumAttribute()) return getKindAsEnum() < AI.getKindAsEnum(); - if (AI.isAlignAttribute()) return true; + if (AI.isIntAttribute()) return true; if (AI.isStringAttribute()) return true; } - if (isAlignAttribute()) { + if (isIntAttribute()) { if (AI.isEnumAttribute()) return false; - if (AI.isAlignAttribute()) return getValueAsInt() < AI.getValueAsInt(); + if (AI.isIntAttribute()) return getValueAsInt() < AI.getValueAsInt(); if (AI.isStringAttribute()) return true; } if (AI.isEnumAttribute()) return false; - if (AI.isAlignAttribute()) return false; + if (AI.isIntAttribute()) return false; if (getKindAsString() == AI.getKindAsString()) return getValueAsString() < AI.getValueAsString(); return getKindAsString() < AI.getKindAsString(); @@ -398,6 +426,8 @@ uint64_t AttributeImpl::getAttrMask(Attribute::AttrKind Val) { case Attribute::InAlloca: return 1ULL << 43; case Attribute::NonNull: return 1ULL << 44; case Attribute::JumpTable: return 1ULL << 45; + case Attribute::Dereferenceable: + llvm_unreachable("dereferenceable attribute not supported in raw format"); } llvm_unreachable("Unsupported attribute type"); } @@ -482,6 +512,13 @@ unsigned AttributeSetNode::getStackAlignment() const { return 0; } +uint64_t AttributeSetNode::getDereferenceableBytes() const { + for (iterator I = begin(), E = end(); I != E; ++I) + if (I->hasAttribute(Attribute::Dereferenceable)) + return I->getDereferenceableBytes(); + return 0; +} + std::string AttributeSetNode::getAsString(bool InAttrGrp) const { std::string Str; for (iterator I = begin(), E = end(); I != E; ++I) { @@ -515,6 +552,8 @@ uint64_t AttributeSetImpl::Raw(unsigned Index) const { Mask |= (Log2_32(ASN->getAlignment()) + 1) << 16; else if (Kind == Attribute::StackAlignment) Mask |= (Log2_32(ASN->getStackAlignment()) + 1) << 26; + else if (Kind == Attribute::Dereferenceable) + llvm_unreachable("dereferenceable not supported in bit mask"); else Mask |= AttributeImpl::getAttrMask(Kind); } @@ -620,6 +659,10 @@ AttributeSet AttributeSet::get(LLVMContext &C, unsigned Index, else if (Kind == Attribute::StackAlignment) Attrs.push_back(std::make_pair(Index, Attribute:: getWithStackAlignment(C, B.getStackAlignment()))); + else if (Kind == Attribute::Dereferenceable) + Attrs.push_back(std::make_pair(Index, + Attribute::getWithDereferenceableBytes(C, + B.getDereferenceableBytes()))); else Attrs.push_back(std::make_pair(Index, Attribute::get(C, Kind))); } @@ -877,6 +920,11 @@ unsigned AttributeSet::getStackAlignment(unsigned Index) const { return ASN ? 
ASN->getStackAlignment() : 0; } +uint64_t AttributeSet::getDereferenceableBytes(unsigned Index) const { + AttributeSetNode *ASN = getAttributes(Index); + return ASN ? ASN->getDereferenceableBytes() : 0; +} + std::string AttributeSet::getAsString(unsigned Index, bool InAttrGrp) const { AttributeSetNode *ASN = getAttributes(Index); @@ -956,7 +1004,7 @@ void AttributeSet::dump() const { //===----------------------------------------------------------------------===// AttrBuilder::AttrBuilder(AttributeSet AS, unsigned Index) - : Attrs(0), Alignment(0), StackAlignment(0) { + : Attrs(0), Alignment(0), StackAlignment(0), DerefBytes(0) { AttributeSetImpl *pImpl = AS.pImpl; if (!pImpl) return; @@ -973,13 +1021,14 @@ AttrBuilder::AttrBuilder(AttributeSet AS, unsigned Index) void AttrBuilder::clear() { Attrs.reset(); - Alignment = StackAlignment = 0; + Alignment = StackAlignment = DerefBytes = 0; } AttrBuilder &AttrBuilder::addAttribute(Attribute::AttrKind Val) { assert((unsigned)Val < Attribute::EndAttrKinds && "Attribute out of range!"); assert(Val != Attribute::Alignment && Val != Attribute::StackAlignment && - "Adding alignment attribute without adding alignment value!"); + Val != Attribute::Dereferenceable && + "Adding integer attribute without adding a value!"); Attrs[Val] = true; return *this; } @@ -997,6 +1046,8 @@ AttrBuilder &AttrBuilder::addAttribute(Attribute Attr) { Alignment = Attr.getAlignment(); else if (Kind == Attribute::StackAlignment) StackAlignment = Attr.getStackAlignment(); + else if (Kind == Attribute::Dereferenceable) + DerefBytes = Attr.getDereferenceableBytes(); return *this; } @@ -1013,6 +1064,8 @@ AttrBuilder &AttrBuilder::removeAttribute(Attribute::AttrKind Val) { Alignment = 0; else if (Val == Attribute::StackAlignment) StackAlignment = 0; + else if (Val == Attribute::Dereferenceable) + DerefBytes = 0; return *this; } @@ -1029,7 +1082,7 @@ AttrBuilder &AttrBuilder::removeAttributes(AttributeSet A, uint64_t Index) { for (AttributeSet::iterator I = A.begin(Slot), E = A.end(Slot); I != E; ++I) { Attribute Attr = *I; - if (Attr.isEnumAttribute() || Attr.isAlignAttribute()) { + if (Attr.isEnumAttribute() || Attr.isIntAttribute()) { Attribute::AttrKind Kind = I->getKindAsEnum(); Attrs[Kind] = false; @@ -1037,6 +1090,8 @@ AttrBuilder &AttrBuilder::removeAttributes(AttributeSet A, uint64_t Index) { Alignment = 0; else if (Kind == Attribute::StackAlignment) StackAlignment = 0; + else if (Kind == Attribute::Dereferenceable) + DerefBytes = 0; } else { assert(Attr.isStringAttribute() && "Invalid attribute type!"); std::map::iterator @@ -1079,6 +1134,14 @@ AttrBuilder &AttrBuilder::addStackAlignmentAttr(unsigned Align) { return *this; } +AttrBuilder &AttrBuilder::addDereferenceableAttr(uint64_t Bytes) { + if (Bytes == 0) return *this; + + Attrs[Attribute::Dereferenceable] = true; + DerefBytes = Bytes; + return *this; +} + AttrBuilder &AttrBuilder::merge(const AttrBuilder &B) { // FIXME: What if both have alignments, but they don't match?! 
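// A minimal usage sketch for the integer-valued attributes introduced above,
// assuming the AttrBuilder/AttributeSet API in this hunk (the helper name and
// the byte count are illustrative):
//
//   #include "llvm/IR/Attributes.h"
//   #include "llvm/IR/Function.h"
//   using namespace llvm;
//
//   // Mark the first parameter of F as dereferenceable for 16 bytes.
//   static void addDeref16(Function &F) {
//     LLVMContext &Ctx = F.getContext();
//     AttrBuilder B;
//     B.addDereferenceableAttr(16); // records DerefBytes and sets the Attrs bit
//     F.setAttributes(F.getAttributes().addAttributes(
//         Ctx, /*Index=*/1, AttributeSet::get(Ctx, /*Index=*/1, B)));
//   }
//
// Per getAsString() above, this prints as "dereferenceable(16)" on the
// parameter, or as "dereferenceable=16" inside an attribute group.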
if (!Alignment) @@ -1087,6 +1150,9 @@ AttrBuilder &AttrBuilder::merge(const AttrBuilder &B) { if (!StackAlignment) StackAlignment = B.StackAlignment; + if (!DerefBytes) + DerefBytes = B.DerefBytes; + Attrs |= B.Attrs; for (td_const_iterator I = B.TargetDepAttrs.begin(), @@ -1117,7 +1183,7 @@ bool AttrBuilder::hasAttributes(AttributeSet A, uint64_t Index) const { for (AttributeSet::iterator I = A.begin(Slot), E = A.end(Slot); I != E; ++I) { Attribute Attr = *I; - if (Attr.isEnumAttribute() || Attr.isAlignAttribute()) { + if (Attr.isEnumAttribute() || Attr.isIntAttribute()) { if (Attrs[I->getKindAsEnum()]) return true; } else { @@ -1142,7 +1208,8 @@ bool AttrBuilder::operator==(const AttrBuilder &B) { if (B.TargetDepAttrs.find(I->first) == B.TargetDepAttrs.end()) return false; - return Alignment == B.Alignment && StackAlignment == B.StackAlignment; + return Alignment == B.Alignment && StackAlignment == B.StackAlignment && + DerefBytes == B.DerefBytes; } AttrBuilder &AttrBuilder::addRawValue(uint64_t Val) { @@ -1151,6 +1218,8 @@ AttrBuilder &AttrBuilder::addRawValue(uint64_t Val) { for (Attribute::AttrKind I = Attribute::None; I != Attribute::EndAttrKinds; I = Attribute::AttrKind(I + 1)) { + if (I == Attribute::Dereferenceable) + continue; if (uint64_t A = (Val & AttributeImpl::getAttrMask(I))) { Attrs[I] = true; @@ -1184,6 +1253,7 @@ AttributeSet AttributeFuncs::typeIncompatible(Type *Ty, uint64_t Index) { .addAttribute(Attribute::NoAlias) .addAttribute(Attribute::NoCapture) .addAttribute(Attribute::NonNull) + .addDereferenceableAttr(1) // the int here is ignored .addAttribute(Attribute::ReadNone) .addAttribute(Attribute::ReadOnly) .addAttribute(Attribute::StructRet) diff --git a/lib/IR/AutoUpgrade.cpp b/lib/IR/AutoUpgrade.cpp index 05b3745ab0f9..459bd880ccb0 100644 --- a/lib/IR/AutoUpgrade.cpp +++ b/lib/IR/AutoUpgrade.cpp @@ -577,3 +577,12 @@ bool llvm::UpgradeDebugInfo(Module &M) { } return RetCode; } + +void llvm::UpgradeMDStringConstant(std::string &String) { + const std::string OldPrefix = "llvm.vectorizer."; + if (String == "llvm.vectorizer.unroll") { + String = "llvm.loop.interleave.count"; + } else if (String.find(OldPrefix) == 0) { + String.replace(0, OldPrefix.size(), "llvm.loop.vectorize."); + } +} diff --git a/lib/IR/CMakeLists.txt b/lib/IR/CMakeLists.txt index b027ae5f503c..38a80b18bd5e 100644 --- a/lib/IR/CMakeLists.txt +++ b/lib/IR/CMakeLists.txt @@ -3,6 +3,7 @@ add_llvm_library(LLVMCore Attributes.cpp AutoUpgrade.cpp BasicBlock.cpp + Comdat.cpp ConstantFold.cpp ConstantRange.cpp Constants.cpp diff --git a/lib/IR/Comdat.cpp b/lib/IR/Comdat.cpp new file mode 100644 index 000000000000..80715ff40ba9 --- /dev/null +++ b/lib/IR/Comdat.cpp @@ -0,0 +1,25 @@ +//===-- Comdat.cpp - Implement Metadata classes --------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the Comdat class. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/IR/Comdat.h" +#include "llvm/ADT/StringMap.h" +using namespace llvm; + +Comdat::Comdat(SelectionKind SK, StringMapEntry<Comdat> *Name) + : Name(Name), SK(SK) {} + +Comdat::Comdat(Comdat &&C) : Name(C.Name), SK(C.SK) {} + +Comdat::Comdat() : Name(nullptr), SK(Comdat::Any) {} + +StringRef Comdat::getName() const { return Name->first(); } diff --git a/lib/IR/ConstantFold.cpp b/lib/IR/ConstantFold.cpp index c23ab71eaf3a..395ac3907baf 100644 --- a/lib/IR/ConstantFold.cpp +++ b/lib/IR/ConstantFold.cpp @@ -1334,6 +1334,15 @@ static FCmpInst::Predicate evaluateFCmpRelation(Constant *V1, Constant *V2) { return FCmpInst::BAD_FCMP_PREDICATE; } +static ICmpInst::Predicate areGlobalsPotentiallyEqual(const GlobalValue *GV1, + const GlobalValue *GV2) { + // Don't try to decide equality of aliases. + if (!isa<GlobalAlias>(GV1) && !isa<GlobalAlias>(GV2)) + if (!GV1->hasExternalWeakLinkage() || !GV2->hasExternalWeakLinkage()) + return ICmpInst::ICMP_NE; + return ICmpInst::BAD_ICMP_PREDICATE; +} + /// evaluateICmpRelation - This function determines if there is anything we can /// decide about the two constants provided. This doesn't need to handle simple /// things like integer comparisons, but should instead handle ConstantExprs @@ -1395,10 +1404,7 @@ static ICmpInst::Predicate evaluateICmpRelation(Constant *V1, Constant *V2, // constant (which, since the types must match, means that it's a // ConstantPointerNull). if (const GlobalValue *GV2 = dyn_cast<GlobalValue>(V2)) { - // Don't try to decide equality of aliases. - if (!isa<GlobalAlias>(GV) && !isa<GlobalAlias>(GV2)) - if (!GV->hasExternalWeakLinkage() || !GV2->hasExternalWeakLinkage()) - return ICmpInst::ICMP_NE; + return areGlobalsPotentiallyEqual(GV, GV2); } else if (isa<BlockAddress>(V2)) { return ICmpInst::ICMP_NE; // Globals never equal labels. } else { @@ -1463,7 +1469,8 @@ static ICmpInst::Predicate evaluateICmpRelation(Constant *V1, Constant *V2, } break; - case Instruction::GetElementPtr: + case Instruction::GetElementPtr: { + GEPOperator *CE1GEP = cast<GEPOperator>(CE1); // Ok, since this is a getelementptr, we know that the constant has a // pointer type. Check the various cases. if (isa<ConstantPointerNull>(V2)) { @@ -1510,7 +1517,8 @@ static ICmpInst::Predicate evaluateICmpRelation(Constant *V1, Constant *V2, "Surprising getelementptr!"); return isSigned ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT; } else { - // If they are different globals, we don't know what the value is. + if (CE1GEP->hasAllZeroIndices()) + return areGlobalsPotentiallyEqual(GV, GV2); return ICmpInst::BAD_ICMP_PREDICATE; } } @@ -1526,8 +1534,14 @@ static ICmpInst::Predicate evaluateICmpRelation(Constant *V1, Constant *V2, // By far the most common case to handle is when the base pointers are // obviously to the same global. if (isa<GlobalValue>(CE1Op0) && isa<GlobalValue>(CE2Op0)) { - if (CE1Op0 != CE2Op0) // Don't know relative ordering. + // Don't know relative ordering, but check for inequality. + if (CE1Op0 != CE2Op0) { + GEPOperator *CE2GEP = cast<GEPOperator>(CE2); + if (CE1GEP->hasAllZeroIndices() && CE2GEP->hasAllZeroIndices()) + return areGlobalsPotentiallyEqual(cast<GlobalValue>(CE1Op0), + cast<GlobalValue>(CE2Op0)); return ICmpInst::BAD_ICMP_PREDICATE; + } // Ok, we know that both getelementptr instructions are based on the // same global. From this, we can precisely determine the relative // ordering of the resultant pointers.
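// Sketched effect of areGlobalsPotentiallyEqual on folded IR (illustrative
// globals):
//
//   @a = global i32 0
//   @b = global i32 0
//   ; icmp eq (i32* @a, i32* @b) folds to false: distinct globals that are
//   ; neither aliases nor external_weak cannot compare equal, and with this
//   ; change the same now holds when either operand is a constant
//   ; getelementptr with all-zero indices over such a global.
//   ; If either global is external_weak (it may yet resolve to null) or an
//   ; alias is involved, the helper returns BAD_ICMP_PREDICATE ("unknown").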
@@ -1573,6 +1587,7 @@ static ICmpInst::Predicate evaluateICmpRelation(Constant *V1, Constant *V2, } } } + } default: break; } diff --git a/lib/IR/Constants.cpp b/lib/IR/Constants.cpp index aa26cff6a7b6..b815936ac428 100644 --- a/lib/IR/Constants.cpp +++ b/lib/IR/Constants.cpp @@ -107,6 +107,28 @@ bool Constant::isAllOnesValue() const { return false; } +bool Constant::isMinSignedValue() const { + // Check for INT_MIN integers + if (const ConstantInt *CI = dyn_cast<ConstantInt>(this)) + return CI->isMinValue(/*isSigned=*/true); + + // Check for FP which are bitcasted from INT_MIN integers + if (const ConstantFP *CFP = dyn_cast<ConstantFP>(this)) + return CFP->getValueAPF().bitcastToAPInt().isMinSignedValue(); + + // Check for constant vectors which are splats of INT_MIN values. + if (const ConstantVector *CV = dyn_cast<ConstantVector>(this)) + if (Constant *Splat = CV->getSplatValue()) + return Splat->isMinSignedValue(); + + // Check for constant vectors which are splats of INT_MIN values. + if (const ConstantDataVector *CV = dyn_cast<ConstantDataVector>(this)) + if (Constant *Splat = CV->getSplatValue()) + return Splat->isMinSignedValue(); + + return false; +} + // Constructor to create a '0' constant of arbitrary type... Constant *Constant::getNullValue(Type *Ty) { switch (Ty->getTypeID()) { @@ -278,35 +300,48 @@ bool Constant::canTrap() const { return canTrapImpl(this, NonTrappingOps); } -/// isThreadDependent - Return true if the value can vary between threads. -bool Constant::isThreadDependent() const { - SmallPtrSet<const Constant*, 64> Visited; - SmallVector<const Constant*, 64> WorkList; - WorkList.push_back(this); - Visited.insert(this); +/// Check if C contains a GlobalValue for which Predicate is true. +static bool +ConstHasGlobalValuePredicate(const Constant *C, + bool (*Predicate)(const GlobalValue *)) { + SmallPtrSet<const Constant *, 8> Visited; + SmallVector<const Constant *, 8> WorkList; + WorkList.push_back(C); + Visited.insert(C); while (!WorkList.empty()) { - const Constant *C = WorkList.pop_back_val(); - - if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(C)) { - if (GV->isThreadLocal()) + const Constant *WorkItem = WorkList.pop_back_val(); + if (const auto *GV = dyn_cast<GlobalValue>(WorkItem)) + if (Predicate(GV)) return true; - } - - for (unsigned I = 0, E = C->getNumOperands(); I != E; ++I) { - const Constant *D = dyn_cast<Constant>(C->getOperand(I)); - if (!D) + for (const Value *Op : WorkItem->operands()) { + const Constant *ConstOp = dyn_cast<Constant>(Op); + if (!ConstOp) continue; - if (Visited.insert(D)) - WorkList.push_back(D); + if (Visited.insert(ConstOp)) + WorkList.push_back(ConstOp); } } - return false; } -/// isConstantUsed - Return true if the constant has users other than constant -/// exprs and other dangling things. +/// Return true if the value can vary between threads. +bool Constant::isThreadDependent() const { + auto ThreadDependentPredicate = [](const GlobalValue *GV) { + return GV->isThreadLocal(); + }; + return ConstHasGlobalValuePredicate(this, ThreadDependentPredicate); +} + +bool Constant::isDLLImportDependent() const { + auto DLLImportPredicate = [](const GlobalValue *GV) { + return GV->hasDLLImportStorageClass(); + }; + return ConstHasGlobalValuePredicate(this, DLLImportPredicate); +} + +/// Return true if the constant has users other than constant exprs and other +/// dangling things.
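// A quick illustration of the new Constant::isMinSignedValue() (i8/i32
// cases; the same checks handle FP bit patterns and vector splats):
//
//   i8 -128                            --> true  (INT8_MIN)
//   i8 127                             --> false
//   <4 x i8> splat of i8 -128          --> true  (splat of INT_MIN)
//   float with bit pattern 0x80000000  --> true  (bitcastToAPInt() is INT32_MIN)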
bool Constant::isConstantUsed() const { for (const User *U : users()) { const Constant *UC = dyn_cast(U); diff --git a/lib/IR/Core.cpp b/lib/IR/Core.cpp index 197b6cb9054e..87099a6c4e13 100644 --- a/lib/IR/Core.cpp +++ b/lib/IR/Core.cpp @@ -281,8 +281,11 @@ char *LLVMPrintTypeToString(LLVMTypeRef Ty) { std::string buf; raw_string_ostream os(buf); - assert(unwrap(Ty) != nullptr && "Expecting non-null Type"); - unwrap(Ty)->print(os); + if (unwrap(Ty)) + unwrap(Ty)->print(os); + else + os << "Printing Type"; + os.flush(); return strdup(buf.c_str()); @@ -532,8 +535,11 @@ char* LLVMPrintValueToString(LLVMValueRef Val) { std::string buf; raw_string_ostream os(buf); - assert(unwrap(Val) != nullptr && "Expecting non-null Value"); - unwrap(Val)->print(os); + if (unwrap(Val)) + unwrap(Val)->print(os); + else + os << "Printing Value"; + os.flush(); return strdup(buf.c_str()); @@ -2600,28 +2606,24 @@ LLVMBool LLVMCreateMemoryBufferWithContentsOfFile( LLVMMemoryBufferRef *OutMemBuf, char **OutMessage) { - std::unique_ptr MB; - std::error_code ec; - if (!(ec = MemoryBuffer::getFile(Path, MB))) { - *OutMemBuf = wrap(MB.release()); - return 0; + ErrorOr> MBOrErr = MemoryBuffer::getFile(Path); + if (std::error_code EC = MBOrErr.getError()) { + *OutMessage = strdup(EC.message().c_str()); + return 1; } - - *OutMessage = strdup(ec.message().c_str()); - return 1; + *OutMemBuf = wrap(MBOrErr.get().release()); + return 0; } LLVMBool LLVMCreateMemoryBufferWithSTDIN(LLVMMemoryBufferRef *OutMemBuf, char **OutMessage) { - std::unique_ptr MB; - std::error_code ec; - if (!(ec = MemoryBuffer::getSTDIN(MB))) { - *OutMemBuf = wrap(MB.release()); - return 0; + ErrorOr> MBOrErr = MemoryBuffer::getSTDIN(); + if (std::error_code EC = MBOrErr.getError()) { + *OutMessage = strdup(EC.message().c_str()); + return 1; } - - *OutMessage = strdup(ec.message().c_str()); - return 1; + *OutMemBuf = wrap(MBOrErr.get().release()); + return 0; } LLVMMemoryBufferRef LLVMCreateMemoryBufferWithMemoryRange( @@ -2702,11 +2704,10 @@ void LLVMDisposePassManager(LLVMPassManagerRef PM) { /*===-- Threading ------------------------------------------------------===*/ LLVMBool LLVMStartMultithreaded() { - return llvm_start_multithreaded(); + return LLVMIsMultithreaded(); } void LLVMStopMultithreaded() { - llvm_stop_multithreaded(); } LLVMBool LLVMIsMultithreaded() { diff --git a/lib/IR/DIBuilder.cpp b/lib/IR/DIBuilder.cpp index 92edacc04a77..218787c9933a 100644 --- a/lib/IR/DIBuilder.cpp +++ b/lib/IR/DIBuilder.cpp @@ -102,7 +102,8 @@ DICompileUnit DIBuilder::createCompileUnit(unsigned Lang, StringRef Filename, StringRef Producer, bool isOptimized, StringRef Flags, unsigned RunTimeVer, StringRef SplitName, - DebugEmissionKind Kind) { + DebugEmissionKind Kind, + bool EmitDebugInfo) { assert(((Lang <= dwarf::DW_LANG_OCaml && Lang >= dwarf::DW_LANG_C89) || (Lang <= dwarf::DW_LANG_hi_user && Lang >= dwarf::DW_LANG_lo_user)) && @@ -140,8 +141,14 @@ DICompileUnit DIBuilder::createCompileUnit(unsigned Lang, StringRef Filename, MDNode *CUNode = MDNode::get(VMContext, Elts); // Create a named metadata so that it is easier to find cu in a module. - NamedMDNode *NMD = M.getOrInsertNamedMetadata("llvm.dbg.cu"); - NMD->addOperand(CUNode); + // Note that we only generate this when the caller wants to actually + // emit debug information. When we are only interested in tracking + // source line locations throughout the backend, we prevent codegen from + // emitting debug info in the final output by not generating llvm.dbg.cu. 
+ if (EmitDebugInfo) { + NamedMDNode *NMD = M.getOrInsertNamedMetadata("llvm.dbg.cu"); + NMD->addOperand(CUNode); + } return DICompileUnit(CUNode); } @@ -1068,18 +1075,19 @@ DIVariable DIBuilder::createComplexVariable(unsigned Tag, DIDescriptor Scope, DITypeRef Ty, ArrayRef Addr, unsigned ArgNo) { - SmallVector Elts; - Elts.push_back(GetTagConstant(VMContext, Tag)); - Elts.push_back(getNonCompileUnitScope(Scope)), - Elts.push_back(MDString::get(VMContext, Name)); - Elts.push_back(F); - Elts.push_back(ConstantInt::get(Type::getInt32Ty(VMContext), - (LineNo | (ArgNo << 24)))); - Elts.push_back(Ty); - Elts.push_back(Constant::getNullValue(Type::getInt32Ty(VMContext))); - Elts.push_back(Constant::getNullValue(Type::getInt32Ty(VMContext))); - Elts.append(Addr.begin(), Addr.end()); - + assert(Addr.size() > 0 && "complex address is empty"); + Value *Elts[] = { + GetTagConstant(VMContext, Tag), + getNonCompileUnitScope(Scope), + MDString::get(VMContext, Name), + F, + ConstantInt::get(Type::getInt32Ty(VMContext), + (LineNo | (ArgNo << 24))), + Ty, + Constant::getNullValue(Type::getInt32Ty(VMContext)), + Constant::getNullValue(Type::getInt32Ty(VMContext)), + MDNode::get(VMContext, Addr) + }; return DIVariable(MDNode::get(VMContext, Elts)); } diff --git a/lib/IR/DebugInfo.cpp b/lib/IR/DebugInfo.cpp index db9e56defa12..5e39b242dbbc 100644 --- a/lib/IR/DebugInfo.cpp +++ b/lib/IR/DebugInfo.cpp @@ -138,8 +138,14 @@ void DIDescriptor::replaceFunctionField(unsigned Elt, Function *F) { } } -unsigned DIVariable::getNumAddrElements() const { - return DbgNode->getNumOperands() - 8; +uint64_t DIVariable::getAddrElement(unsigned Idx) const { + DIDescriptor ComplexExpr = getDescriptorField(8); + if (Idx < ComplexExpr->getNumOperands()) + if (auto *CI = dyn_cast_or_null(ComplexExpr->getOperand(Idx))) + return CI->getZExtValue(); + + assert(false && "non-existing complex address element requested"); + return 0; } /// getInlinedAt - If this variable is inlined then return inline location. @@ -566,7 +572,13 @@ bool DIVariable::Verify() const { // Make sure that type @ field 5 is a DITypeRef. if (!fieldIsTypeRef(DbgNode, 5)) return false; - return DbgNode->getNumOperands() >= 8; + + // Variable without a complex expression. + if (DbgNode->getNumOperands() == 8) + return true; + + // Make sure the complex expression is an MDNode. + return (DbgNode->getNumOperands() == 9 && fieldIsMDNode(DbgNode, 8)); } /// Verify - Verify that a location descriptor is well formed. 
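// The DIVariable layout assumed by getAddrElement() and Verify() above:
// operands 0-7 are the fixed fields, and an optional operand 8 now holds the
// complex address elements as one nested MDNode instead of inline trailing
// operands. A hypothetical variable with an OpPlus/offset expression:
//
//   !1 = metadata !{
//     i32 786688,                ; tag: DW_TAG_auto_variable (plus version)
//     metadata !scope, metadata !"x", metadata !file,
//     i32 7,                     ; line (argument number in the high bits)
//     metadata !type, i32 0, i32 0,
//     metadata !{i64 1, i64 8}   ; operand 8: complex expression (OpPlus, 8)
//   }
//
// getAddrElement(Idx) reads element Idx out of that trailing node and asserts
// on an out-of-range index instead of returning data past the node.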
@@ -1514,3 +1526,23 @@ unsigned llvm::getDebugMetadataVersionFromModule(const Module &M) { return 0; return cast(Val)->getZExtValue(); } + +llvm::DenseMap +llvm::makeSubprogramMap(const Module &M) { + DenseMap R; + + NamedMDNode *CU_Nodes = M.getNamedMetadata("llvm.dbg.cu"); + if (!CU_Nodes) + return R; + + for (MDNode *N : CU_Nodes->operands()) { + DICompileUnit CUNode(N); + DIArray SPs = CUNode.getSubprograms(); + for (unsigned i = 0, e = SPs.getNumElements(); i != e; ++i) { + DISubprogram SP(SPs.getElement(i)); + if (Function *F = SP.getFunction()) + R.insert(std::make_pair(F, SP)); + } + } + return R; +} diff --git a/lib/IR/DiagnosticInfo.cpp b/lib/IR/DiagnosticInfo.cpp index 27270636004f..37cce2b0d781 100644 --- a/lib/IR/DiagnosticInfo.cpp +++ b/lib/IR/DiagnosticInfo.cpp @@ -127,20 +127,20 @@ void DiagnosticInfoSampleProfile::print(DiagnosticPrinter &DP) const { DP << getMsg(); } -bool DiagnosticInfoOptimizationRemarkBase::isLocationAvailable() const { +bool DiagnosticInfoOptimizationBase::isLocationAvailable() const { return getDebugLoc().isUnknown() == false; } -void DiagnosticInfoOptimizationRemarkBase::getLocation(StringRef *Filename, - unsigned *Line, - unsigned *Column) const { +void DiagnosticInfoOptimizationBase::getLocation(StringRef *Filename, + unsigned *Line, + unsigned *Column) const { DILocation DIL(getDebugLoc().getAsMDNode(getFunction().getContext())); *Filename = DIL.getFilename(); *Line = DIL.getLineNumber(); *Column = DIL.getColumnNumber(); } -const std::string DiagnosticInfoOptimizationRemarkBase::getLocationStr() const { +const std::string DiagnosticInfoOptimizationBase::getLocationStr() const { StringRef Filename(""); unsigned Line = 0; unsigned Column = 0; @@ -149,7 +149,7 @@ const std::string DiagnosticInfoOptimizationRemarkBase::getLocationStr() const { return Twine(Filename + ":" + Twine(Line) + ":" + Twine(Column)).str(); } -void DiagnosticInfoOptimizationRemarkBase::print(DiagnosticPrinter &DP) const { +void DiagnosticInfoOptimizationBase::print(DiagnosticPrinter &DP) const { DP << getLocationStr() << ": " << getMsg(); } @@ -189,3 +189,20 @@ void llvm::emitOptimizationRemarkAnalysis(LLVMContext &Ctx, Ctx.diagnose( DiagnosticInfoOptimizationRemarkAnalysis(PassName, Fn, DLoc, Msg)); } + +bool DiagnosticInfoOptimizationFailure::isEnabled() const { + // Only print warnings. + return getSeverity() == DS_Warning; +} + +void llvm::emitLoopVectorizeWarning(LLVMContext &Ctx, const Function &Fn, + const DebugLoc &DLoc, const Twine &Msg) { + Ctx.diagnose(DiagnosticInfoOptimizationFailure( + Fn, DLoc, Twine("loop not vectorized: " + Msg))); +} + +void llvm::emitLoopInterleaveWarning(LLVMContext &Ctx, const Function &Fn, + const DebugLoc &DLoc, const Twine &Msg) { + Ctx.diagnose(DiagnosticInfoOptimizationFailure( + Fn, DLoc, Twine("loop not interleaved: " + Msg))); +} diff --git a/lib/IR/Function.cpp b/lib/IR/Function.cpp index fe32c4613e72..de59b26ec92a 100644 --- a/lib/IR/Function.cpp +++ b/lib/IR/Function.cpp @@ -77,11 +77,17 @@ unsigned Argument::getArgNo() const { } /// hasNonNullAttr - Return true if this argument has the nonnull attribute on -/// it in its containing function. +/// it in its containing function. Also returns true if at least one byte is +/// known to be dereferenceable and the pointer is in addrspace(0). bool Argument::hasNonNullAttr() const { if (!getType()->isPointerTy()) return false; - return getParent()->getAttributes(). - hasAttribute(getArgNo()+1, Attribute::NonNull); + if (getParent()->getAttributes(). 
+ hasAttribute(getArgNo()+1, Attribute::NonNull)) + return true; + else if (getDereferenceableBytes() > 0 && + getType()->getPointerAddressSpace() == 0) + return true; + return false; } /// hasByValAttr - Return true if this argument has the byval attribute on it @@ -113,6 +119,12 @@ unsigned Argument::getParamAlignment() const { } +uint64_t Argument::getDereferenceableBytes() const { + assert(getType()->isPointerTy() && + "Only pointers have dereferenceable bytes"); + return getParent()->getDereferenceableBytes(getArgNo()+1); +} + /// hasNestAttr - Return true if this argument has the nest attribute on /// it in its containing function. bool Argument::hasNestAttr() const { @@ -735,6 +747,11 @@ Function *Intrinsic::getDeclaration(Module *M, ID id, ArrayRef Tys) { #include "llvm/IR/Intrinsics.gen" #undef GET_LLVM_INTRINSIC_FOR_GCC_BUILTIN +// This defines the "Intrinsic::getIntrinsicForMSBuiltin()" method. +#define GET_LLVM_INTRINSIC_FOR_MS_BUILTIN +#include "llvm/IR/Intrinsics.gen" +#undef GET_LLVM_INTRINSIC_FOR_MS_BUILTIN + /// hasAddressTaken - returns true if there are any uses of this function /// other than direct calls or invokes to it. bool Function::hasAddressTaken(const User* *PutOffender) const { diff --git a/lib/IR/GCOV.cpp b/lib/IR/GCOV.cpp index b4c7977d52e5..1667401f88d0 100644 --- a/lib/IR/GCOV.cpp +++ b/lib/IR/GCOV.cpp @@ -438,11 +438,15 @@ class LineConsumer { StringRef Remaining; public: LineConsumer(StringRef Filename) { - if (std::error_code EC = MemoryBuffer::getFileOrSTDIN(Filename, Buffer)) { + ErrorOr> BufferOrErr = + MemoryBuffer::getFileOrSTDIN(Filename); + if (std::error_code EC = BufferOrErr.getError()) { errs() << Filename << ": " << EC.message() << "\n"; Remaining = ""; - } else + } else { + Buffer = std::move(BufferOrErr.get()); Remaining = Buffer->getBuffer(); + } } bool empty() { return Remaining.empty(); } void printNext(raw_ostream &OS, uint32_t LineNum) { diff --git a/lib/IR/Globals.cpp b/lib/IR/Globals.cpp index 5410cc031dad..244e3e4baeed 100644 --- a/lib/IR/Globals.cpp +++ b/lib/IR/Globals.cpp @@ -59,15 +59,10 @@ void GlobalValue::copyAttributesFrom(const GlobalValue *Src) { setDLLStorageClass(Src->getDLLStorageClass()); } -static const GlobalObject *getBaseObject(const Constant &C) { - // FIXME: We should probably return a base + offset pair for non-zero GEPs. - return dyn_cast(C.stripPointerCasts()); -} - unsigned GlobalValue::getAlignment() const { if (auto *GA = dyn_cast(this)) { // In general we cannot compute this at the IR level, but we try. - if (const GlobalObject *GO = getBaseObject(*GA->getAliasee())) + if (const GlobalObject *GO = GA->getBaseObject()) return GO->getAlignment(); // FIXME: we should also be able to handle: @@ -96,13 +91,23 @@ void GlobalObject::copyAttributesFrom(const GlobalValue *Src) { const char *GlobalValue::getSection() const { if (auto *GA = dyn_cast(this)) { // In general we cannot compute this at the IR level, but we try. - if (const GlobalObject *GO = getBaseObject(*GA->getAliasee())) + if (const GlobalObject *GO = GA->getBaseObject()) return GO->getSection(); return ""; } return cast(this)->getSection(); } +Comdat *GlobalValue::getComdat() { + if (auto *GA = dyn_cast(this)) { + // In general we cannot compute this at the IR level, but we try. 
+ if (const GlobalObject *GO = GA->getBaseObject()) + return const_cast(GO)->getComdat(); + return nullptr; + } + return cast(this)->getComdat(); +} + void GlobalObject::setSection(StringRef S) { Section = S; } bool GlobalValue::isDeclaration() const { diff --git a/lib/IR/Instructions.cpp b/lib/IR/Instructions.cpp index a5ceacb5637c..9553252f4e96 100644 --- a/lib/IR/Instructions.cpp +++ b/lib/IR/Instructions.cpp @@ -2478,11 +2478,7 @@ CastInst *CastInst::CreatePointerCast(Value *S, Type *Ty, if (Ty->isIntOrIntVectorTy()) return Create(Instruction::PtrToInt, S, Ty, Name, InsertAtEnd); - Type *STy = S->getType(); - if (STy->getPointerAddressSpace() != Ty->getPointerAddressSpace()) - return Create(Instruction::AddrSpaceCast, S, Ty, Name, InsertAtEnd); - - return Create(Instruction::BitCast, S, Ty, Name, InsertAtEnd); + return CreatePointerBitCastOrAddrSpaceCast(S, Ty, Name, InsertAtEnd); } /// @brief Create a BitCast or a PtrToInt cast instruction @@ -2500,14 +2496,36 @@ CastInst *CastInst::CreatePointerCast(Value *S, Type *Ty, if (Ty->isIntOrIntVectorTy()) return Create(Instruction::PtrToInt, S, Ty, Name, InsertBefore); - Type *STy = S->getType(); - if (STy->getPointerAddressSpace() != Ty->getPointerAddressSpace()) + return CreatePointerBitCastOrAddrSpaceCast(S, Ty, Name, InsertBefore); +} + +CastInst *CastInst::CreatePointerBitCastOrAddrSpaceCast( + Value *S, Type *Ty, + const Twine &Name, + BasicBlock *InsertAtEnd) { + assert(S->getType()->isPtrOrPtrVectorTy() && "Invalid cast"); + assert(Ty->isPtrOrPtrVectorTy() && "Invalid cast"); + + if (S->getType()->getPointerAddressSpace() != Ty->getPointerAddressSpace()) + return Create(Instruction::AddrSpaceCast, S, Ty, Name, InsertAtEnd); + + return Create(Instruction::BitCast, S, Ty, Name, InsertAtEnd); +} + +CastInst *CastInst::CreatePointerBitCastOrAddrSpaceCast( + Value *S, Type *Ty, + const Twine &Name, + Instruction *InsertBefore) { + assert(S->getType()->isPtrOrPtrVectorTy() && "Invalid cast"); + assert(Ty->isPtrOrPtrVectorTy() && "Invalid cast"); + + if (S->getType()->getPointerAddressSpace() != Ty->getPointerAddressSpace()) return Create(Instruction::AddrSpaceCast, S, Ty, Name, InsertBefore); return Create(Instruction::BitCast, S, Ty, Name, InsertBefore); } -CastInst *CastInst::CreateIntegerCast(Value *C, Type *Ty, +CastInst *CastInst::CreateIntegerCast(Value *C, Type *Ty, bool isSigned, const Twine &Name, Instruction *InsertBefore) { assert(C->getType()->isIntOrIntVectorTy() && Ty->isIntOrIntVectorTy() && diff --git a/lib/IR/LegacyPassManager.cpp b/lib/IR/LegacyPassManager.cpp index d3f3482dc024..d14f139db185 100644 --- a/lib/IR/LegacyPassManager.cpp +++ b/lib/IR/LegacyPassManager.cpp @@ -607,8 +607,7 @@ void PMTopLevelManager::schedulePass(Pass *P) { // If P is an analysis pass and it is available then do not // generate the analysis again. Stale analysis info should not be // available at this point. 
- const PassInfo *PI = - PassRegistry::getPassRegistry()->getPassInfo(P->getPassID()); + const PassInfo *PI = P->getPassInfo(); if (PI && PI->isAnalysis() && findAnalysisPass(P->getPassID())) { delete P; return; @@ -723,8 +722,7 @@ Pass *PMTopLevelManager::findAnalysisPass(AnalysisID AID) { return *I; // If Pass not found then check the interfaces implemented by Immutable Pass - const PassInfo *PassInf = - PassRegistry::getPassRegistry()->getPassInfo(PI); + const PassInfo *PassInf = (*I)->getPassInfo(); assert(PassInf && "Expected all immutable passes to be initialized"); const std::vector &ImmPI = PassInf->getInterfacesImplemented(); @@ -766,8 +764,7 @@ void PMTopLevelManager::dumpArguments() const { dbgs() << "Pass Arguments: "; for (SmallVectorImpl::const_iterator I = ImmutablePasses.begin(), E = ImmutablePasses.end(); I != E; ++I) - if (const PassInfo *PI = - PassRegistry::getPassRegistry()->getPassInfo((*I)->getPassID())) { + if (const PassInfo *PI = (*I)->getPassInfo()) { assert(PI && "Expected all immutable passes to be initialized"); if (!PI->isAnalysisGroup()) dbgs() << " -" << PI->getPassArgument(); @@ -831,8 +828,8 @@ void PMDataManager::recordAvailableAnalysis(Pass *P) { // This pass is the current implementation of all of the interfaces it // implements as well. - const PassInfo *PInf = PassRegistry::getPassRegistry()->getPassInfo(PI); - if (!PInf) return; + const PassInfo *PInf = P->getPassInfo(); + if (PInf == 0) return; const std::vector &II = PInf->getInterfacesImplemented(); for (unsigned i = 0, e = II.size(); i != e; ++i) AvailableAnalysis[II[i]->getTypeInfo()] = P; @@ -963,10 +960,9 @@ void PMDataManager::freePass(Pass *P, StringRef Msg, P->releaseMemory(); } - AnalysisID PI = P->getPassID(); - if (const PassInfo *PInf = PassRegistry::getPassRegistry()->getPassInfo(PI)) { + if (const PassInfo *PInf = P->getPassInfo()) { // Remove the pass itself (if it is not already removed). - AvailableAnalysis.erase(PI); + AvailableAnalysis.erase(P->getPassID()); // Remove all interfaces this pass implements, for which it is also // listed as the available implementation. @@ -1148,8 +1144,7 @@ void PMDataManager::dumpPassArguments() const { if (PMDataManager *PMD = (*I)->getAsPMDataManager()) PMD->dumpPassArguments(); else - if (const PassInfo *PI = - PassRegistry::getPassRegistry()->getPassInfo((*I)->getPassID())) + if (const PassInfo *PI = (*I)->getPassInfo()) if (!PI->isAnalysisGroup()) dbgs() << " -" << PI->getPassArgument(); } diff --git a/lib/IR/Metadata.cpp b/lib/IR/Metadata.cpp index 4d932d03962b..59137e47fa02 100644 --- a/lib/IR/Metadata.cpp +++ b/lib/IR/Metadata.cpp @@ -663,7 +663,7 @@ void Instruction::setMetadata(unsigned KindID, MDNode *Node) { // Otherwise, we're removing metadata from an instruction. assert((hasMetadataHashEntry() == - getContext().pImpl->MetadataStore.count(this)) && + (getContext().pImpl->MetadataStore.count(this) > 0)) && "HasMetadata bit out of date!"); if (!hasMetadataHashEntry()) return; // Nothing to remove! 
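// A minimal standalone sketch of the pattern the LegacyPassManager hunks
// above converge on: ask the pass for its own PassInfo instead of querying
// the global PassRegistry at every use.
#include "llvm/Pass.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

// Print the command-line argument of a registered pass, if it has one.
static void dumpPassArg(const Pass *P) {
  // Equivalent to PassRegistry::getPassRegistry()->getPassInfo(P->getPassID()),
  // minus the repeated registry lookup.
  if (const PassInfo *PI = P->getPassInfo())
    if (!PI->isAnalysisGroup())
      errs() << " -" << PI->getPassArgument();
}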
diff --git a/lib/IR/Module.cpp b/lib/IR/Module.cpp index 6a5b386c199f..f1b1f9a2acc8 100644 --- a/lib/IR/Module.cpp +++ b/lib/IR/Module.cpp @@ -24,6 +24,8 @@ #include "llvm/IR/LLVMContext.h" #include "llvm/IR/LeakDetector.h" #include "llvm/Support/Dwarf.h" +#include "llvm/Support/Path.h" +#include "llvm/Support/RandomNumberGenerator.h" #include #include #include @@ -44,7 +46,7 @@ template class llvm::SymbolTableListTraits; // Module::Module(StringRef MID, LLVMContext &C) - : Context(C), Materializer(), ModuleID(MID), DL("") { + : Context(C), Materializer(), ModuleID(MID), RNG(nullptr), DL("") { ValSymTab = new ValueSymbolTable(); NamedMDSymTab = new StringMap(); Context.addModule(this); @@ -59,6 +61,7 @@ Module::~Module() { NamedMDList.clear(); delete ValSymTab; delete static_cast *>(NamedMDSymTab); + delete RNG; } /// getNamedValue - Return the first global value in the module with @@ -355,6 +358,16 @@ const DataLayout *Module::getDataLayout() const { return &DL; } +// We want reproducible builds, but ModuleID may be a full path so we just use +// the filename to salt the RNG (although it is not guaranteed to be unique). +RandomNumberGenerator &Module::getRNG() const { + if (RNG == nullptr) { + StringRef Salt = sys::path::filename(ModuleID); + RNG = new RandomNumberGenerator(Salt); + } + return *RNG; +} + //===----------------------------------------------------------------------===// // Methods to control the materialization of GlobalValues in the Module. // @@ -400,10 +413,13 @@ std::error_code Module::materializeAll() { return Materializer->MaterializeModule(this); } -std::error_code Module::materializeAllPermanently() { +std::error_code Module::materializeAllPermanently(bool ReleaseBuffer) { if (std::error_code EC = materializeAll()) return EC; + if (ReleaseBuffer) + Materializer->releaseBuffer(); + Materializer.reset(); return std::error_code(); } @@ -421,14 +437,14 @@ std::error_code Module::materializeAllPermanently() { // has "dropped all references", except operator delete. // void Module::dropAllReferences() { - for(Module::iterator I = begin(), E = end(); I != E; ++I) - I->dropAllReferences(); + for (Function &F : *this) + F.dropAllReferences(); - for(Module::global_iterator I = global_begin(), E = global_end(); I != E; ++I) - I->dropAllReferences(); + for (GlobalVariable &GV : globals()) + GV.dropAllReferences(); - for(Module::alias_iterator I = alias_begin(), E = alias_end(); I != E; ++I) - I->dropAllReferences(); + for (GlobalAlias &GA : aliases()) + GA.dropAllReferences(); } unsigned Module::getDwarfVersion() const { @@ -437,3 +453,11 @@ unsigned Module::getDwarfVersion() const { return dwarf::DWARF_VERSION; return cast(Val)->getZExtValue(); } + +Comdat *Module::getOrInsertComdat(StringRef Name) { + Comdat C; + StringMapEntry &Entry = + ComdatSymTab.GetOrCreateValue(Name, std::move(C)); + Entry.second.Name = &Entry; + return &Entry.second; +} diff --git a/lib/IR/PassManager.cpp b/lib/IR/PassManager.cpp index 0defb6ab3e42..2e2a7cb4956c 100644 --- a/lib/IR/PassManager.cpp +++ b/lib/IR/PassManager.cpp @@ -53,7 +53,7 @@ ModuleAnalysisManager::getResultImpl(void *PassID, Module *M) { // If we don't have a cached result for this module, look up the pass and run // it to produce a result, which we then add to the cache. 
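// Usage sketch for two of the Module additions above (the helper is
// illustrative; setComdat is assumed from GlobalObject in this API):
//
//   #include "llvm/IR/Module.h"
//   using namespace llvm;
//
//   static void attachComdat(Module &M, GlobalObject &GO) {
//     // Creates or finds the comdat; the default selection kind is Any.
//     Comdat *C = M.getOrInsertComdat(GO.getName());
//     GO.setComdat(C);
//   }
//
// The per-module RNG is created lazily and salted with only the filename
// portion of the ModuleID, trading uniqueness for reproducible builds:
//
//   RandomNumberGenerator &RNG = M.getRNG();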
if (Inserted) - RI->second = std::move(lookupPass(PassID).run(M, this)); + RI->second = lookupPass(PassID).run(M, this); return *RI->second; } diff --git a/lib/IR/Value.cpp b/lib/IR/Value.cpp index 463024a28c76..1ab2183b6565 100644 --- a/lib/IR/Value.cpp +++ b/lib/IR/Value.cpp @@ -15,8 +15,10 @@ #include "LLVMContextImpl.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallString.h" +#include "llvm/IR/CallSite.h" #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/InstrTypes.h" @@ -453,7 +455,8 @@ Value *Value::stripAndAccumulateInBoundsConstantOffsets(const DataLayout &DL, return V; Offset = GEPOffset; V = GEP->getPointerOperand(); - } else if (Operator::getOpcode(V) == Instruction::BitCast) { + } else if (Operator::getOpcode(V) == Instruction::BitCast || + Operator::getOpcode(V) == Instruction::AddrSpaceCast) { V = cast(V)->getOperand(0); } else if (GlobalAlias *GA = dyn_cast(V)) { V = GA->getAliasee(); @@ -472,32 +475,67 @@ Value *Value::stripInBoundsOffsets() { /// isDereferenceablePointer - Test if this value is always a pointer to /// allocated and suitably aligned memory for a simple load or store. -static bool isDereferenceablePointer(const Value *V, +static bool isDereferenceablePointer(const Value *V, const DataLayout *DL, SmallPtrSet &Visited) { // Note that it is not safe to speculate into a malloc'd region because // malloc may return null. - // It's also not always safe to follow a bitcast, for example: - // bitcast i8* (alloca i8) to i32* - // would result in a 4-byte load from a 1-byte alloca. Some cases could - // be handled using DataLayout to check sizes and alignments though. // These are obviously ok. if (isa(V)) return true; + // It's not always safe to follow a bitcast, for example: + // bitcast i8* (alloca i8) to i32* + // would result in a 4-byte load from a 1-byte alloca. However, + // if we're casting from a pointer from a type of larger size + // to a type of smaller size (or the same size), and the alignment + // is at least as large as for the resulting pointer type, then + // we can look through the bitcast. + if (DL) + if (const BitCastInst* BC = dyn_cast(V)) { + Type *STy = BC->getSrcTy()->getPointerElementType(), + *DTy = BC->getDestTy()->getPointerElementType(); + if (STy->isSized() && DTy->isSized() && + (DL->getTypeStoreSize(STy) >= + DL->getTypeStoreSize(DTy)) && + (DL->getABITypeAlignment(STy) >= + DL->getABITypeAlignment(DTy))) + return isDereferenceablePointer(BC->getOperand(0), DL, Visited); + } + // Global variables which can't collapse to null are ok. if (const GlobalVariable *GV = dyn_cast(V)) return !GV->hasExternalWeakLinkage(); - // byval arguments are ok. - if (const Argument *A = dyn_cast(V)) - return A->hasByValAttr(); + // byval arguments are okay. Arguments specifically marked as + // dereferenceable are okay too. + if (const Argument *A = dyn_cast(V)) { + if (A->hasByValAttr()) + return true; + else if (uint64_t Bytes = A->getDereferenceableBytes()) { + Type *Ty = V->getType()->getPointerElementType(); + if (Ty->isSized() && DL && DL->getTypeStoreSize(Ty) <= Bytes) + return true; + } + + return false; + } + + // Return values from call sites specifically marked as dereferenceable are + // also okay. 
+ if (ImmutableCallSite CS = V) { + if (uint64_t Bytes = CS.getDereferenceableBytes(0)) { + Type *Ty = V->getType()->getPointerElementType(); + if (Ty->isSized() && DL && DL->getTypeStoreSize(Ty) <= Bytes) + return true; + } + } // For GEPs, determine if the indexing lands within the allocated object. if (const GEPOperator *GEP = dyn_cast(V)) { // Conservatively require that the base pointer be fully dereferenceable. if (!Visited.insert(GEP->getOperand(0))) return false; - if (!isDereferenceablePointer(GEP->getOperand(0), Visited)) + if (!isDereferenceablePointer(GEP->getOperand(0), DL, Visited)) return false; // Check the indices. gep_type_iterator GTI = gep_type_begin(GEP); @@ -527,15 +565,39 @@ static bool isDereferenceablePointer(const Value *V, return true; } + if (const AddrSpaceCastInst *ASC = dyn_cast(V)) + return isDereferenceablePointer(ASC->getOperand(0), DL, Visited); + // If we don't know, assume the worst. return false; } /// isDereferenceablePointer - Test if this value is always a pointer to /// allocated and suitably aligned memory for a simple load or store. -bool Value::isDereferenceablePointer() const { +bool Value::isDereferenceablePointer(const DataLayout *DL) const { + // When dereferenceability information is provided by a dereferenceable + // attribute, we know exactly how many bytes are dereferenceable. If we can + // determine the exact offset to the attributed variable, we can use that + // information here. + Type *Ty = getType()->getPointerElementType(); + if (Ty->isSized() && DL) { + APInt Offset(DL->getTypeStoreSizeInBits(getType()), 0); + const Value *BV = stripAndAccumulateInBoundsConstantOffsets(*DL, Offset); + + APInt DerefBytes(Offset.getBitWidth(), 0); + if (const Argument *A = dyn_cast(BV)) + DerefBytes = A->getDereferenceableBytes(); + else if (ImmutableCallSite CS = BV) + DerefBytes = CS.getDereferenceableBytes(0); + + if (DerefBytes.getBoolValue() && Offset.isNonNegative()) { + if (DerefBytes.uge(Offset + DL->getTypeStoreSize(Ty))) + return true; + } + } + SmallPtrSet Visited; - return ::isDereferenceablePointer(this, Visited); + return ::isDereferenceablePointer(this, DL, Visited); } /// DoPHITranslation - If this value is a PHI node with CurBB as its parent, diff --git a/lib/IR/Verifier.cpp b/lib/IR/Verifier.cpp index d1c7f7d25c39..9cf911b51a4b 100644 --- a/lib/IR/Verifier.cpp +++ b/lib/IR/Verifier.cpp @@ -107,6 +107,12 @@ struct VerifierSupport { OS << ' ' << *T; } + void WriteComdat(const Comdat *C) { + if (!C) + return; + OS << *C; + } + // CheckFailed - A check failed, so print out the condition and the message // that failed. This provides a nice place to put a breakpoint if you want // to see why something is not correct. 
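// What the DataLayout-aware dereferenceability check above now accepts,
// sketched as illustrative IR (%p carries the new attribute):
//
//   define i32 @f(i32* dereferenceable(16) %p) {
//     ; offset 8 plus the 4-byte store size fits in the 16 attributed bytes,
//     ; so the pointer is considered dereferenceable and the load is safe to
//     ; speculate:
//     %q = getelementptr inbounds i32* %p, i64 2
//     %v = load i32* %q
//     ret i32 %v
//   }
//
// From C++ the query is Value::isDereferenceablePointer(DL); DL may be null,
// in which case the attribute-size and bitcast cases above conservatively
// fail.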
@@ -138,6 +144,12 @@ struct VerifierSupport { WriteType(T3); Broken = true; } + + void CheckFailed(const Twine &Message, const Comdat *C) { + OS << Message.str() << "\n"; + WriteComdat(C); + Broken = true; + } }; class Verifier : public InstVisitor, VerifierSupport { friend class InstVisitor; @@ -230,6 +242,9 @@ class Verifier : public InstVisitor, VerifierSupport { I != E; ++I) visitNamedMDNode(*I); + for (const StringMapEntry &SMEC : M.getComdatSymbolTable()) + visitComdat(SMEC.getValue()); + visitModuleFlags(M); visitModuleIdents(M); @@ -246,6 +261,7 @@ class Verifier : public InstVisitor, VerifierSupport { const GlobalAlias &A, const Constant &C); void visitNamedMDNode(const NamedMDNode &NMD); void visitMDNode(MDNode &MD, Function *F); + void visitComdat(const Comdat &C); void visitModuleIdents(const Module &M); void visitModuleFlags(const Module &M); void visitModuleFlag(const MDNode *Op, @@ -364,6 +380,8 @@ void Verifier::visitGlobalValue(const GlobalValue &GV) { "Global is external, but doesn't have external or weak linkage!", &GV); + Assert1(GV.getAlignment() <= Value::MaximumAlignment, + "huge alignment values are unsupported", &GV); Assert1(!GV.hasAppendingLinkage() || isa(GV), "Only global variables can have appending linkage!", &GV); @@ -387,6 +405,7 @@ void Verifier::visitGlobalVariable(const GlobalVariable &GV) { "'common' global must have a zero initializer!", &GV); Assert1(!GV.isConstant(), "'common' global may not be marked constant!", &GV); + Assert1(!GV.hasComdat(), "'common' global may not be in a Comdat!", &GV); } } else { Assert1(GV.hasExternalLinkage() || GV.hasExternalWeakLinkage(), @@ -516,7 +535,9 @@ void Verifier::visitGlobalAlias(const GlobalAlias &GA) { Assert1(!GA.getName().empty(), "Alias name cannot be empty!", &GA); Assert1(GlobalAlias::isValidLinkage(GA.getLinkage()), - "Alias should have external or external weak linkage!", &GA); + "Alias should have private, internal, linkonce, weak, linkonce_odr, " + "weak_odr, or external linkage!", + &GA); const Constant *Aliasee = GA.getAliasee(); Assert1(Aliasee, "Aliasee cannot be NULL!", &GA); Assert1(GA.getType() == Aliasee->getType(), @@ -578,6 +599,21 @@ void Verifier::visitMDNode(MDNode &MD, Function *F) { } } +void Verifier::visitComdat(const Comdat &C) { + // All Comdat::SelectionKind values other than Comdat::Any require a + // GlobalValue with the same name as the Comdat. + const GlobalValue *GV = M->getNamedValue(C.getName()); + if (C.getSelectionKind() != Comdat::Any) + Assert1(GV, + "comdat selection kind requires a global value with the same name", + &C); + // The Module is invalid if the GlobalValue has private linkage. Entities + // with private linkage don't have entries in the symbol table. 
+ if (GV) + Assert1(!GV->hasPrivateLinkage(), "comdat global value has private linkage", + GV); +} + void Verifier::visitModuleIdents(const Module &M) { const NamedMDNode *Idents = M.getNamedMetadata("llvm.ident"); if (!Idents) @@ -1857,6 +1893,8 @@ void Verifier::visitLoadInst(LoadInst &LI) { Type *ElTy = PTy->getElementType(); Assert2(ElTy == LI.getType(), "Load result type does not match pointer operand type!", &LI, ElTy); + Assert1(LI.getAlignment() <= Value::MaximumAlignment, + "huge alignment values are unsupported", &LI); if (LI.isAtomic()) { Assert1(LI.getOrdering() != Release && LI.getOrdering() != AcquireRelease, "Load cannot have Release ordering", &LI); @@ -1932,6 +1970,8 @@ void Verifier::visitStoreInst(StoreInst &SI) { Assert2(ElTy == SI.getOperand(0)->getType(), "Stored value type does not match pointer operand type!", &SI, ElTy); + Assert1(SI.getAlignment() <= Value::MaximumAlignment, + "huge alignment values are unsupported", &SI); if (SI.isAtomic()) { Assert1(SI.getOrdering() != Acquire && SI.getOrdering() != AcquireRelease, "Store cannot have Acquire ordering", &SI); @@ -1963,6 +2003,8 @@ void Verifier::visitAllocaInst(AllocaInst &AI) { &AI); Assert1(AI.getArraySize()->getType()->isIntegerTy(), "Alloca array size must have integer type", &AI); + Assert1(AI.getAlignment() <= Value::MaximumAlignment, + "huge alignment values are unsupported", &AI); visitInstruction(AI); } @@ -2233,7 +2275,8 @@ void Verifier::visitInstruction(Instruction &I) { } MDNode *MD = I.getMetadata(LLVMContext::MD_range); - Assert1(!MD || isa(I), "Ranges are only for loads!", &I); + Assert1(!MD || isa(I) || isa(I) || isa(I), + "Ranges are only for loads, calls and invokes!", &I); InstsInThisBlock.insert(&I); } diff --git a/lib/IRReader/IRReader.cpp b/lib/IRReader/IRReader.cpp index 01aa074abad6..f8d2f5a9bd8c 100644 --- a/lib/IRReader/IRReader.cpp +++ b/lib/IRReader/IRReader.cpp @@ -29,9 +29,8 @@ namespace llvm { static const char *const TimeIRParsingGroupName = "LLVM IR Parsing"; static const char *const TimeIRParsingName = "Parse IR"; - -Module *llvm::getLazyIRModule(MemoryBuffer *Buffer, SMDiagnostic &Err, - LLVMContext &Context) { +static Module *getLazyIRModule(MemoryBuffer *Buffer, SMDiagnostic &Err, + LLVMContext &Context) { if (isBitcode((const unsigned char *)Buffer->getBufferStart(), (const unsigned char *)Buffer->getBufferEnd())) { std::string ErrMsg; @@ -39,7 +38,7 @@ Module *llvm::getLazyIRModule(MemoryBuffer *Buffer, SMDiagnostic &Err, if (std::error_code EC = ModuleOrErr.getError()) { Err = SMDiagnostic(Buffer->getBufferIdentifier(), SourceMgr::DK_Error, EC.message()); - // ParseBitcodeFile does not take ownership of the Buffer in the + // getLazyBitcodeModule does not take ownership of the Buffer in the // case of an error. 
delete Buffer; return nullptr; @@ -52,14 +51,15 @@ Module *llvm::getLazyIRModule(MemoryBuffer *Buffer, SMDiagnostic &Err, Module *llvm::getLazyIRFileModule(const std::string &Filename, SMDiagnostic &Err, LLVMContext &Context) { - std::unique_ptr File; - if (std::error_code ec = MemoryBuffer::getFileOrSTDIN(Filename, File)) { + ErrorOr> FileOrErr = + MemoryBuffer::getFileOrSTDIN(Filename); + if (std::error_code EC = FileOrErr.getError()) { Err = SMDiagnostic(Filename, SourceMgr::DK_Error, - "Could not open input file: " + ec.message()); + "Could not open input file: " + EC.message()); return nullptr; } - return getLazyIRModule(File.release(), Err, Context); + return getLazyIRModule(FileOrErr.get().release(), Err, Context); } Module *llvm::ParseIR(MemoryBuffer *Buffer, SMDiagnostic &Err, @@ -76,23 +76,25 @@ Module *llvm::ParseIR(MemoryBuffer *Buffer, SMDiagnostic &Err, else M = ModuleOrErr.get(); // parseBitcodeFile does not take ownership of the Buffer. - delete Buffer; return M; } - return ParseAssembly(Buffer, nullptr, Err, Context); + return ParseAssembly(MemoryBuffer::getMemBuffer( + Buffer->getBuffer(), Buffer->getBufferIdentifier()), + nullptr, Err, Context); } Module *llvm::ParseIRFile(const std::string &Filename, SMDiagnostic &Err, LLVMContext &Context) { - std::unique_ptr File; - if (std::error_code ec = MemoryBuffer::getFileOrSTDIN(Filename, File)) { + ErrorOr> FileOrErr = + MemoryBuffer::getFileOrSTDIN(Filename); + if (std::error_code EC = FileOrErr.getError()) { Err = SMDiagnostic(Filename, SourceMgr::DK_Error, - "Could not open input file: " + ec.message()); + "Could not open input file: " + EC.message()); return nullptr; } - return ParseIR(File.release(), Err, Context); + return ParseIR(FileOrErr.get().get(), Err, Context); } //===----------------------------------------------------------------------===// @@ -104,7 +106,8 @@ LLVMBool LLVMParseIRInContext(LLVMContextRef ContextRef, char **OutMessage) { SMDiagnostic Diag; - *OutM = wrap(ParseIR(unwrap(MemBuf), Diag, *unwrap(ContextRef))); + std::unique_ptr MB(unwrap(MemBuf)); + *OutM = wrap(ParseIR(MB.get(), Diag, *unwrap(ContextRef))); if(!*OutM) { if (OutMessage) { diff --git a/lib/LTO/LLVMBuild.txt b/lib/LTO/LLVMBuild.txt index c9b5212da164..c493f436acf5 100644 --- a/lib/LTO/LLVMBuild.txt +++ b/lib/LTO/LLVMBuild.txt @@ -19,4 +19,4 @@ type = Library name = LTO parent = Libraries -required_libraries = BitReader BitWriter Core IPA IPO InstCombine Linker MC MCParser ObjCARC Scalar Support Target TransformUtils +required_libraries = BitReader BitWriter Core IPA IPO InstCombine Linker MC ObjCARC Object Scalar Support Target TransformUtils diff --git a/lib/LTO/LTOCodeGenerator.cpp b/lib/LTO/LTOCodeGenerator.cpp index 9009958613ed..45a49e4817b9 100644 --- a/lib/LTO/LTOCodeGenerator.cpp +++ b/lib/LTO/LTOCodeGenerator.cpp @@ -107,6 +107,7 @@ void LTOCodeGenerator::initializeLTOPasses() { initializeFunctionAttrsPass(R); initializeGlobalsModRefPass(R); initializeLICMPass(R); + initializeMergedLoadStoreMotionPass(R); initializeGVNPass(R); initializeMemCpyOptPass(R); initializeDCEPass(R); @@ -114,7 +115,7 @@ void LTOCodeGenerator::initializeLTOPasses() { } bool LTOCodeGenerator::addModule(LTOModule* mod, std::string& errMsg) { - bool ret = IRLinker.linkInModule(mod->getLLVVMModule(), &errMsg); + bool ret = IRLinker.linkInModule(&mod->getModule(), &errMsg); const std::vector &undefs = mod->getAsmUndefinedRefs(); for (int i = 0, e = undefs.size(); i != e; ++i) @@ -124,23 +125,7 @@ bool LTOCodeGenerator::addModule(LTOModule* mod, std::string& 
errMsg) { } void LTOCodeGenerator::setTargetOptions(TargetOptions options) { - Options.LessPreciseFPMADOption = options.LessPreciseFPMADOption; - Options.NoFramePointerElim = options.NoFramePointerElim; - Options.AllowFPOpFusion = options.AllowFPOpFusion; - Options.UnsafeFPMath = options.UnsafeFPMath; - Options.NoInfsFPMath = options.NoInfsFPMath; - Options.NoNaNsFPMath = options.NoNaNsFPMath; - Options.HonorSignDependentRoundingFPMathOption = - options.HonorSignDependentRoundingFPMathOption; - Options.UseSoftFloat = options.UseSoftFloat; - Options.FloatABIType = options.FloatABIType; - Options.NoZerosInBSS = options.NoZerosInBSS; - Options.GuaranteedTailCallOpt = options.GuaranteedTailCallOpt; - Options.DisableTailCalls = options.DisableTailCalls; - Options.StackAlignmentOverride = options.StackAlignmentOverride; - Options.TrapFuncName = options.TrapFuncName; - Options.PositionIndependentExecutable = options.PositionIndependentExecutable; - Options.UseInitArray = options.UseInitArray; + Options = options; } void LTOCodeGenerator::setDebugInfo(lto_debug_model debug) { @@ -252,13 +237,14 @@ const void* LTOCodeGenerator::compile(size_t* length, delete NativeObjectFile; // read .o file into memory buffer - std::unique_ptr BuffPtr; - if (std::error_code ec = MemoryBuffer::getFile(name, BuffPtr, -1, false)) { - errMsg = ec.message(); + ErrorOr> BufferOrErr = + MemoryBuffer::getFile(name, -1, false); + if (std::error_code EC = BufferOrErr.getError()) { + errMsg = EC.message(); sys::fs::remove(NativeObjectPath); return nullptr; } - NativeObjectFile = BuffPtr.release(); + NativeObjectFile = BufferOrErr.get().release(); // remove temp files sys::fs::remove(NativeObjectPath); diff --git a/lib/LTO/LTOModule.cpp b/lib/LTO/LTOModule.cpp index eac48e16e8a0..844c0f2d8e1f 100644 --- a/lib/LTO/LTOModule.cpp +++ b/lib/LTO/LTOModule.cpp @@ -24,7 +24,6 @@ #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCParser/MCAsmParser.h" #include "llvm/MC/MCSection.h" -#include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCTargetAsmParser.h" @@ -44,14 +43,9 @@ #include using namespace llvm; -LTOModule::LTOModule(llvm::Module *m, llvm::TargetMachine *t) - : _module(m), _target(t), - _context(_target->getMCAsmInfo(), _target->getRegisterInfo(), &ObjFileInfo), - _mangler(t->getDataLayout()) { - ObjFileInfo.InitMCObjectFileInfo(t->getTargetTriple(), - t->getRelocationModel(), t->getCodeModel(), - _context); -} +LTOModule::LTOModule(std::unique_ptr Obj, + llvm::TargetMachine *TM) + : IRFile(std::move(Obj)), _target(TM) {} /// isBitcodeFile - Returns 'true' if the file (or memory contents) is LLVM /// bitcode. @@ -67,87 +61,63 @@ bool LTOModule::isBitcodeFile(const char *path) { return type == sys::fs::file_magic::bitcode; } -/// isBitcodeFileForTarget - Returns 'true' if the file (or memory contents) is -/// LLVM bitcode for the specified triple. -bool LTOModule::isBitcodeFileForTarget(const void *mem, size_t length, - const char *triplePrefix) { - MemoryBuffer *buffer = makeBuffer(mem, length); - if (!buffer) - return false; - return isTargetMatch(buffer, triplePrefix); -} - -bool LTOModule::isBitcodeFileForTarget(const char *path, - const char *triplePrefix) { - std::unique_ptr buffer; - if (MemoryBuffer::getFile(path, buffer)) - return false; - return isTargetMatch(buffer.release(), triplePrefix); -} - -/// isTargetMatch - Returns 'true' if the memory buffer is for the specified -/// target triple. 
-bool LTOModule::isTargetMatch(MemoryBuffer *buffer, const char *triplePrefix) { +bool LTOModule::isBitcodeForTarget(MemoryBuffer *buffer, + StringRef triplePrefix) { std::string Triple = getBitcodeTargetTriple(buffer, getGlobalContext()); - delete buffer; - return strncmp(Triple.c_str(), triplePrefix, strlen(triplePrefix)) == 0; + return StringRef(Triple).startswith(triplePrefix); } -/// makeLTOModule - Create an LTOModule. N.B. These methods take ownership of -/// the buffer. -LTOModule *LTOModule::makeLTOModule(const char *path, TargetOptions options, - std::string &errMsg) { - std::unique_ptr buffer; - if (std::error_code ec = MemoryBuffer::getFile(path, buffer)) { - errMsg = ec.message(); +LTOModule *LTOModule::createFromFile(const char *path, TargetOptions options, + std::string &errMsg) { + ErrorOr> BufferOrErr = + MemoryBuffer::getFile(path); + if (std::error_code EC = BufferOrErr.getError()) { + errMsg = EC.message(); return nullptr; } - return makeLTOModule(buffer.release(), options, errMsg); + return makeLTOModule(std::move(BufferOrErr.get()), options, errMsg); } -LTOModule *LTOModule::makeLTOModule(int fd, const char *path, - size_t size, TargetOptions options, - std::string &errMsg) { - return makeLTOModule(fd, path, size, 0, options, errMsg); +LTOModule *LTOModule::createFromOpenFile(int fd, const char *path, size_t size, + TargetOptions options, + std::string &errMsg) { + return createFromOpenFileSlice(fd, path, size, 0, options, errMsg); } -LTOModule *LTOModule::makeLTOModule(int fd, const char *path, - size_t map_size, - off_t offset, - TargetOptions options, - std::string &errMsg) { - std::unique_ptr buffer; - if (std::error_code ec = - MemoryBuffer::getOpenFileSlice(fd, path, buffer, map_size, offset)) { - errMsg = ec.message(); +LTOModule *LTOModule::createFromOpenFileSlice(int fd, const char *path, + size_t map_size, off_t offset, + TargetOptions options, + std::string &errMsg) { + ErrorOr> BufferOrErr = + MemoryBuffer::getOpenFileSlice(fd, path, map_size, offset); + if (std::error_code EC = BufferOrErr.getError()) { + errMsg = EC.message(); return nullptr; } - return makeLTOModule(buffer.release(), options, errMsg); + return makeLTOModule(std::move(BufferOrErr.get()), options, errMsg); } -LTOModule *LTOModule::makeLTOModule(const void *mem, size_t length, - TargetOptions options, - std::string &errMsg, StringRef path) { +LTOModule *LTOModule::createFromBuffer(const void *mem, size_t length, + TargetOptions options, + std::string &errMsg, StringRef path) { std::unique_ptr buffer(makeBuffer(mem, length, path)); if (!buffer) return nullptr; - return makeLTOModule(buffer.release(), options, errMsg); + return makeLTOModule(std::move(buffer), options, errMsg); } -LTOModule *LTOModule::makeLTOModule(MemoryBuffer *buffer, +LTOModule *LTOModule::makeLTOModule(std::unique_ptr Buffer, TargetOptions options, std::string &errMsg) { - // parse bitcode buffer - ErrorOr ModuleOrErr = - getLazyBitcodeModule(buffer, getGlobalContext()); - if (std::error_code EC = ModuleOrErr.getError()) { + ErrorOr MOrErr = + getLazyBitcodeModule(Buffer.get(), getGlobalContext()); + if (std::error_code EC = MOrErr.getError()) { errMsg = EC.message(); - delete buffer; return nullptr; } - std::unique_ptr m(ModuleOrErr.get()); + std::unique_ptr M(MOrErr.get()); - std::string TripleStr = m->getTargetTriple(); + std::string TripleStr = M->getTargetTriple(); if (TripleStr.empty()) TripleStr = sys::getDefaultTargetTriple(); llvm::Triple Triple(TripleStr); @@ -175,18 +145,13 @@ LTOModule 
*LTOModule::makeLTOModule(MemoryBuffer *buffer, TargetMachine *target = march->createTargetMachine(TripleStr, CPU, FeatureStr, options); - m->materializeAllPermanently(); + M->materializeAllPermanently(true); + M->setDataLayout(target->getDataLayout()); - LTOModule *Ret = new LTOModule(m.release(), target); + std::unique_ptr IRObj( + new object::IRObjectFile(std::move(Buffer), std::move(M))); - // We need a MCContext set up in order to get mangled names of private - // symbols. It is a bit odd that we need to report uses and definitions - // of private symbols, but it does look like ld64 expects to be informed - // of at least the ones with an 'l' prefix. - MCContext &Context = Ret->_context; - const TargetLoweringObjectFile &TLOF = - target->getTargetLowering()->getObjFileLowering(); - const_cast(TLOF).Initialize(Context, *target); + LTOModule *Ret = new LTOModule(std::move(IRObj), target); if (Ret->parseSymbols(errMsg)) { delete Ret; @@ -305,10 +270,20 @@ void LTOModule::addObjCClassRef(const GlobalVariable *clgv) { entry.setValue(info); } -/// addDefinedDataSymbol - Add a data symbol as defined to the list. -void LTOModule::addDefinedDataSymbol(const GlobalValue *v) { +void LTOModule::addDefinedDataSymbol(const object::BasicSymbolRef &Sym) { + SmallString<64> Buffer; + { + raw_svector_ostream OS(Buffer); + Sym.printName(OS); + } + + const GlobalValue *V = IRFile->getSymbolGV(Sym.getRawDataRefImpl()); + addDefinedDataSymbol(Buffer.c_str(), V); +} + +void LTOModule::addDefinedDataSymbol(const char *Name, const GlobalValue *v) { // Add to list of defined symbols. - addDefinedSymbol(v, false); + addDefinedSymbol(Name, v, false); if (!v->hasSection() /* || !isTargetDarwin */) return; @@ -356,10 +331,21 @@ void LTOModule::addDefinedDataSymbol(const GlobalValue *v) { } } -/// addDefinedFunctionSymbol - Add a function symbol as defined to the list. -void LTOModule::addDefinedFunctionSymbol(const Function *f) { +void LTOModule::addDefinedFunctionSymbol(const object::BasicSymbolRef &Sym) { + SmallString<64> Buffer; + { + raw_svector_ostream OS(Buffer); + Sym.printName(OS); + } + + const Function *F = + cast(IRFile->getSymbolGV(Sym.getRawDataRefImpl())); + addDefinedFunctionSymbol(Buffer.c_str(), F); +} + +void LTOModule::addDefinedFunctionSymbol(const char *Name, const Function *F) { // add to list of defined symbols - addDefinedSymbol(f, true); + addDefinedSymbol(Name, F, true); } static bool canBeHidden(const GlobalValue *GV) { @@ -386,16 +372,8 @@ static bool canBeHidden(const GlobalValue *GV) { return !GS.IsCompared; } -/// addDefinedSymbol - Add a defined symbol to the list. -void LTOModule::addDefinedSymbol(const GlobalValue *def, bool isFunction) { - // ignore all llvm.* symbols - if (def->getName().startswith("llvm.")) - return; - - // string is owned by _defines - SmallString<64> Buffer; - _target->getNameWithPrefix(Buffer, def, _mangler); - +void LTOModule::addDefinedSymbol(const char *Name, const GlobalValue *def, + bool isFunction) { // set alignment part log2() can have rounding errors uint32_t align = def->getAlignment(); uint32_t attr = align ? 
countTrailingZeros(align) : 0; @@ -432,14 +410,14 @@ void LTOModule::addDefinedSymbol(const GlobalValue *def, bool isFunction) { else attr |= LTO_SYMBOL_SCOPE_DEFAULT; - StringSet::value_type &entry = _defines.GetOrCreateValue(Buffer); + StringSet::value_type &entry = _defines.GetOrCreateValue(Name); entry.setValue(1); // fill information structure NameAndAttributes info; - StringRef Name = entry.getKey(); - info.name = Name.data(); - assert(info.name[Name.size()] == '\0'); + StringRef NameRef = entry.getKey(); + info.name = NameRef.data(); + assert(info.name[NameRef.size()] == '\0'); info.attributes = attr; info.isFunction = isFunction; info.symbol = def; @@ -484,9 +462,9 @@ void LTOModule::addAsmGlobalSymbol(const char *name, } if (info.isFunction) - addDefinedFunctionSymbol(cast(info.symbol)); + addDefinedFunctionSymbol(info.name, cast(info.symbol)); else - addDefinedDataSymbol(info.symbol); + addDefinedDataSymbol(info.name, info.symbol); _symbols.back().attributes &= ~LTO_SYMBOL_SCOPE_MASK; _symbols.back().attributes |= scope; @@ -515,20 +493,14 @@ void LTOModule::addAsmGlobalSymbolUndef(const char *name) { entry.setValue(info); } -/// addPotentialUndefinedSymbol - Add a symbol which isn't defined just yet to a -/// list to be resolved later. -void -LTOModule::addPotentialUndefinedSymbol(const GlobalValue *decl, bool isFunc) { - // ignore all llvm.* symbols - if (decl->getName().startswith("llvm.")) - return; - - // ignore all aliases - if (isa(decl)) - return; - +/// Add a symbol which isn't defined just yet to a list to be resolved later. +void LTOModule::addPotentialUndefinedSymbol(const object::BasicSymbolRef &Sym, + bool isFunc) { SmallString<64> name; - _target->getNameWithPrefix(name, decl, _mangler); + { + raw_svector_ostream OS(name); + Sym.printName(OS); + } StringMap::value_type &entry = _undefines.GetOrCreateValue(name); @@ -541,6 +513,8 @@ LTOModule::addPotentialUndefinedSymbol(const GlobalValue *decl, bool isFunc) { info.name = entry.getKey().data(); + const GlobalValue *decl = IRFile->getSymbolGV(Sym.getRawDataRefImpl()); + if (decl->hasExternalWeakLinkage()) info.attributes = LTO_SYMBOL_DEFINITION_WEAKUNDEF; else @@ -552,259 +526,54 @@ LTOModule::addPotentialUndefinedSymbol(const GlobalValue *decl, bool isFunc) { entry.setValue(info); } -namespace { - - class RecordStreamer : public MCStreamer { - public: - enum State { NeverSeen, Global, Defined, DefinedGlobal, Used }; - - private: - StringMap Symbols; - - void markDefined(const MCSymbol &Symbol) { - State &S = Symbols[Symbol.getName()]; - switch (S) { - case DefinedGlobal: - case Global: - S = DefinedGlobal; - break; - case NeverSeen: - case Defined: - case Used: - S = Defined; - break; - } - } - void markGlobal(const MCSymbol &Symbol) { - State &S = Symbols[Symbol.getName()]; - switch (S) { - case DefinedGlobal: - case Defined: - S = DefinedGlobal; - break; - - case NeverSeen: - case Global: - case Used: - S = Global; - break; - } - } - void markUsed(const MCSymbol &Symbol) { - State &S = Symbols[Symbol.getName()]; - switch (S) { - case DefinedGlobal: - case Defined: - case Global: - break; - - case NeverSeen: - case Used: - S = Used; - break; - } - } - - // FIXME: mostly copied for the obj streamer. - void AddValueSymbols(const MCExpr *Value) { - switch (Value->getKind()) { - case MCExpr::Target: - // FIXME: What should we do in here? 
- break; - - case MCExpr::Constant: - break; - - case MCExpr::Binary: { - const MCBinaryExpr *BE = cast<MCBinaryExpr>(Value); - AddValueSymbols(BE->getLHS()); - AddValueSymbols(BE->getRHS()); - break; - } - - case MCExpr::SymbolRef: - markUsed(cast<MCSymbolRefExpr>(Value)->getSymbol()); - break; - - case MCExpr::Unary: - AddValueSymbols(cast<MCUnaryExpr>(Value)->getSubExpr()); - break; +/// parseSymbols - Parse the symbols from the module and module-level ASM and add +/// them to either the defined or undefined lists. +bool LTOModule::parseSymbols(std::string &errMsg) { + for (auto &Sym : IRFile->symbols()) { + const GlobalValue *GV = IRFile->getSymbolGV(Sym.getRawDataRefImpl()); + uint32_t Flags = Sym.getFlags(); + if (Flags & object::BasicSymbolRef::SF_FormatSpecific) + continue; + + bool IsUndefined = Flags & object::BasicSymbolRef::SF_Undefined; + + if (!GV) { + SmallString<64> Buffer; + { + raw_svector_ostream OS(Buffer); + Sym.printName(OS); } + const char *Name = Buffer.c_str(); + + if (IsUndefined) + addAsmGlobalSymbolUndef(Name); + else if (Flags & object::BasicSymbolRef::SF_Global) + addAsmGlobalSymbol(Name, LTO_SYMBOL_SCOPE_DEFAULT); + else + addAsmGlobalSymbol(Name, LTO_SYMBOL_SCOPE_INTERNAL); + continue; } - public: - typedef StringMap<State>::const_iterator const_iterator; - - const_iterator begin() { - return Symbols.begin(); + auto *F = dyn_cast<Function>(GV); + if (IsUndefined) { + addPotentialUndefinedSymbol(Sym, F != nullptr); + continue; } - const_iterator end() { - return Symbols.end(); + if (F) { + addDefinedFunctionSymbol(Sym); + continue; } - RecordStreamer(MCContext &Context) : MCStreamer(Context) {} - - void EmitInstruction(const MCInst &Inst, - const MCSubtargetInfo &STI) override { - // Scan for values. - for (unsigned i = Inst.getNumOperands(); i--; ) - if (Inst.getOperand(i).isExpr()) - AddValueSymbols(Inst.getOperand(i).getExpr()); - } - void EmitLabel(MCSymbol *Symbol) override { - Symbol->setSection(*getCurrentSection().first); - markDefined(*Symbol); - } - void EmitDebugLabel(MCSymbol *Symbol) override { - EmitLabel(Symbol); - } - void EmitAssignment(MCSymbol *Symbol, const MCExpr *Value) override { - // FIXME: should we handle aliases? - markDefined(*Symbol); - AddValueSymbols(Value); - } - bool EmitSymbolAttribute(MCSymbol *Symbol, - MCSymbolAttr Attribute) override { - if (Attribute == MCSA_Global) - markGlobal(*Symbol); - return true; + if (isa<GlobalVariable>(GV)) { + addDefinedDataSymbol(Sym); + continue; } - void EmitZerofill(const MCSection *Section, MCSymbol *Symbol, - uint64_t Size , unsigned ByteAlignment) override { - markDefined(*Symbol); - } - void EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size, - unsigned ByteAlignment) override { - markDefined(*Symbol); - } - - void EmitBundleAlignMode(unsigned AlignPow2) override {} - void EmitBundleLock(bool AlignToEnd) override {} - void EmitBundleUnlock() override {} - - // Noop calls.
- void ChangeSection(const MCSection *Section, - const MCExpr *Subsection) override {} - void EmitAssemblerFlag(MCAssemblerFlag Flag) override {} - void EmitThumbFunc(MCSymbol *Func) override {} - void EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) override {} - void EmitWeakReference(MCSymbol *Alias, const MCSymbol *Symbol) override {} - void BeginCOFFSymbolDef(const MCSymbol *Symbol) override {} - void EmitCOFFSymbolStorageClass(int StorageClass) override {} - void EmitCOFFSymbolType(int Type) override {} - void EndCOFFSymbolDef() override {} - void EmitELFSize(MCSymbol *Symbol, const MCExpr *Value) override {} - void EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size, - unsigned ByteAlignment) override {} - void EmitTBSSSymbol(const MCSection *Section, MCSymbol *Symbol, - uint64_t Size, unsigned ByteAlignment) override {} - void EmitBytes(StringRef Data) override {} - void EmitValueImpl(const MCExpr *Value, unsigned Size, - const SMLoc &Loc) override {} - void EmitULEB128Value(const MCExpr *Value) override {} - void EmitSLEB128Value(const MCExpr *Value) override {} - void EmitValueToAlignment(unsigned ByteAlignment, int64_t Value, - unsigned ValueSize, - unsigned MaxBytesToEmit) override {} - void EmitCodeAlignment(unsigned ByteAlignment, - unsigned MaxBytesToEmit) override {} - bool EmitValueToOffset(const MCExpr *Offset, - unsigned char Value) override { return false; } - void EmitFileDirective(StringRef Filename) override {} - void FinishImpl() override {} - void EmitCFIEndProcImpl(MCDwarfFrameInfo &Frame) override { - RecordProcEnd(Frame); - } - }; -} // end anonymous namespace - -/// addAsmGlobalSymbols - Add global symbols from module-level ASM to the -/// defined or undefined lists. -bool LTOModule::addAsmGlobalSymbols(std::string &errMsg) { - const std::string &inlineAsm = _module->getModuleInlineAsm(); - if (inlineAsm.empty()) - return false; - - std::unique_ptr Streamer(new RecordStreamer(_context)); - MemoryBuffer *Buffer = MemoryBuffer::getMemBuffer(inlineAsm); - SourceMgr SrcMgr; - SrcMgr.AddNewSourceBuffer(Buffer, SMLoc()); - std::unique_ptr Parser( - createMCAsmParser(SrcMgr, _context, *Streamer, *_target->getMCAsmInfo())); - const Target &T = _target->getTarget(); - std::unique_ptr MCII(T.createMCInstrInfo()); - std::unique_ptr STI(T.createMCSubtargetInfo( - _target->getTargetTriple(), _target->getTargetCPU(), - _target->getTargetFeatureString())); - std::unique_ptr TAP( - T.createMCAsmParser(*STI, *Parser.get(), *MCII, - _target->Options.MCOptions)); - if (!TAP) { - errMsg = "target " + std::string(T.getName()) + - " does not define AsmParser."; - return true; - } - - Parser->setTargetParser(*TAP); - if (Parser->Run(false)) - return true; - for (RecordStreamer::const_iterator i = Streamer->begin(), - e = Streamer->end(); i != e; ++i) { - StringRef Key = i->first(); - RecordStreamer::State Value = i->second; - if (Value == RecordStreamer::DefinedGlobal) - addAsmGlobalSymbol(Key.data(), LTO_SYMBOL_SCOPE_DEFAULT); - else if (Value == RecordStreamer::Defined) - addAsmGlobalSymbol(Key.data(), LTO_SYMBOL_SCOPE_INTERNAL); - else if (Value == RecordStreamer::Global || - Value == RecordStreamer::Used) - addAsmGlobalSymbolUndef(Key.data()); + assert(isa(GV)); + addDefinedDataSymbol(Sym); } - return false; -} - -/// isDeclaration - Return 'true' if the global value is a declaration. 
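The rewritten parseSymbols() (new version above; its old Module-walking counterpart is removed just below) drives everything off object::BasicSymbolRef flag bits, and takes already-mangled names from Sym.printName() instead of running the Mangler against an MCContext. A condensed, standalone sketch of that classification, illustrative only:

#include "llvm/ADT/SmallString.h"
#include "llvm/Object/IRObjectFile.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

// Illustrative restatement of the dispatch the patch performs per symbol.
static void classifySymbols(object::IRObjectFile &Obj) {
  for (const object::BasicSymbolRef &Sym : Obj.symbols()) {
    uint32_t Flags = Sym.getFlags();
    if (Flags & object::BasicSymbolRef::SF_FormatSpecific)
      continue; // not a linker-visible symbol; filtered up front
    // The printed name is already mangled; no Mangler/MCContext needed.
    SmallString<64> Name;
    {
      raw_svector_ostream OS(Name);
      Sym.printName(OS);
    }
    const GlobalValue *GV = Obj.getSymbolGV(Sym.getRawDataRefImpl());
    if (!GV)
      outs() << Name << ": module-level asm symbol\n";
    else if (Flags & object::BasicSymbolRef::SF_Undefined)
      outs() << Name << ": undefined reference\n";
    else
      outs() << Name << ": definition (function, variable, or alias)\n";
  }
}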
-static bool isDeclaration(const GlobalValue &V) { - if (V.hasAvailableExternallyLinkage()) - return true; - - if (V.isMaterializable()) - return false; - - return V.isDeclaration(); -} - -/// parseSymbols - Parse the symbols from the module and model-level ASM and add -/// them to either the defined or undefined lists. -bool LTOModule::parseSymbols(std::string &errMsg) { - // add functions - for (Module::iterator f = _module->begin(), e = _module->end(); f != e; ++f) { - if (isDeclaration(*f)) - addPotentialUndefinedSymbol(f, true); - else - addDefinedFunctionSymbol(f); - } - - // add data - for (Module::global_iterator v = _module->global_begin(), - e = _module->global_end(); v != e; ++v) { - if (isDeclaration(*v)) - addPotentialUndefinedSymbol(v, false); - else - addDefinedDataSymbol(v); - } - - // add asm globals - if (addAsmGlobalSymbols(errMsg)) - return true; - - // add aliases - for (const auto &Alias : _module->aliases()) - addDefinedDataSymbol(&Alias); - // make symbols for all undefines for (StringMap::iterator u =_undefines.begin(), e = _undefines.end(); u != e; ++u) { @@ -821,7 +590,7 @@ bool LTOModule::parseSymbols(std::string &errMsg) { /// parseMetadata - Parse metadata from the module void LTOModule::parseMetadata() { // Linker Options - if (Value *Val = _module->getModuleFlag("Linker Options")) { + if (Value *Val = getModule().getModuleFlag("Linker Options")) { MDNode *LinkerOptions = cast(Val); for (unsigned i = 0, e = LinkerOptions->getNumOperands(); i != e; ++i) { MDNode *MDOptions = cast(LinkerOptions->getOperand(i)); diff --git a/lib/Linker/LinkModules.cpp b/lib/Linker/LinkModules.cpp index 688b13f228d0..5bb2862cca08 100644 --- a/lib/Linker/LinkModules.cpp +++ b/lib/Linker/LinkModules.cpp @@ -24,6 +24,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/Cloning.h" #include +#include using namespace llvm; @@ -426,6 +427,18 @@ namespace { return true; } + bool getComdatLeader(Module *M, StringRef ComdatName, + const GlobalVariable *&GVar); + bool computeResultingSelectionKind(StringRef ComdatName, + Comdat::SelectionKind Src, + Comdat::SelectionKind Dst, + Comdat::SelectionKind &Result, + bool &LinkFromSrc); + std::map> + ComdatsChosen; + bool getComdatResult(const Comdat *SrcC, Comdat::SelectionKind &SK, + bool &LinkFromSrc); + /// getLinkageResult - This analyzes the two global values and determines /// what the result will look like in the destination module. bool getLinkageResult(GlobalValue *Dest, const GlobalValue *Src, @@ -534,6 +547,115 @@ Value *ValueMaterializerTy::materializeValueFor(Value *V) { return DF; } +bool ModuleLinker::getComdatLeader(Module *M, StringRef ComdatName, + const GlobalVariable *&GVar) { + const GlobalValue *GVal = M->getNamedValue(ComdatName); + if (const auto *GA = dyn_cast_or_null(GVal)) { + GVal = GA->getBaseObject(); + if (!GVal) + // We cannot resolve the size of the aliasee yet. + return emitError("Linking COMDATs named '" + ComdatName + + "': COMDAT key involves incomputable alias size."); + } + + GVar = dyn_cast_or_null(GVal); + if (!GVar) + return emitError( + "Linking COMDATs named '" + ComdatName + + "': GlobalVariable required for data dependent selection!"); + + return false; +} + +bool ModuleLinker::computeResultingSelectionKind(StringRef ComdatName, + Comdat::SelectionKind Src, + Comdat::SelectionKind Dst, + Comdat::SelectionKind &Result, + bool &LinkFromSrc) { + // The ability to mix Comdat::SelectionKind::Any with + // Comdat::SelectionKind::Largest is a behavior that comes from COFF. 
+ bool DstAnyOrLargest = Dst == Comdat::SelectionKind::Any || + Dst == Comdat::SelectionKind::Largest; + bool SrcAnyOrLargest = Src == Comdat::SelectionKind::Any || + Src == Comdat::SelectionKind::Largest; + if (DstAnyOrLargest && SrcAnyOrLargest) { + if (Dst == Comdat::SelectionKind::Largest || + Src == Comdat::SelectionKind::Largest) + Result = Comdat::SelectionKind::Largest; + else + Result = Comdat::SelectionKind::Any; + } else if (Src == Dst) { + Result = Dst; + } else { + return emitError("Linking COMDATs named '" + ComdatName + + "': invalid selection kinds!"); + } + + switch (Result) { + case Comdat::SelectionKind::Any: + // Go with Dst. + LinkFromSrc = false; + break; + case Comdat::SelectionKind::NoDuplicates: + return emitError("Linking COMDATs named '" + ComdatName + + "': noduplicates has been violated!"); + case Comdat::SelectionKind::ExactMatch: + case Comdat::SelectionKind::Largest: + case Comdat::SelectionKind::SameSize: { + const GlobalVariable *DstGV; + const GlobalVariable *SrcGV; + if (getComdatLeader(DstM, ComdatName, DstGV) || + getComdatLeader(SrcM, ComdatName, SrcGV)) + return true; + + const DataLayout *DstDL = DstM->getDataLayout(); + const DataLayout *SrcDL = SrcM->getDataLayout(); + if (!DstDL || !SrcDL) { + return emitError( + "Linking COMDATs named '" + ComdatName + + "': can't do size dependent selection without DataLayout!"); + } + uint64_t DstSize = + DstDL->getTypeAllocSize(DstGV->getType()->getPointerElementType()); + uint64_t SrcSize = + SrcDL->getTypeAllocSize(SrcGV->getType()->getPointerElementType()); + if (Result == Comdat::SelectionKind::ExactMatch) { + if (SrcGV->getInitializer() != DstGV->getInitializer()) + return emitError("Linking COMDATs named '" + ComdatName + + "': ExactMatch violated!"); + LinkFromSrc = false; + } else if (Result == Comdat::SelectionKind::Largest) { + LinkFromSrc = SrcSize > DstSize; + } else if (Result == Comdat::SelectionKind::SameSize) { + if (SrcSize != DstSize) + return emitError("Linking COMDATs named '" + ComdatName + + "': SameSize violated!"); + LinkFromSrc = false; + } else { + llvm_unreachable("unknown selection kind"); + } + break; + } + } + + return false; +} + +bool ModuleLinker::getComdatResult(const Comdat *SrcC, + Comdat::SelectionKind &Result, + bool &LinkFromSrc) { + StringRef ComdatName = SrcC->getName(); + Module::ComdatSymTabType &ComdatSymTab = DstM->getComdatSymbolTable(); + Module::ComdatSymTabType::iterator DstCI = ComdatSymTab.find(ComdatName); + if (DstCI != ComdatSymTab.end()) { + const Comdat *DstC = &DstCI->second; + Comdat::SelectionKind SSK = SrcC->getSelectionKind(); + Comdat::SelectionKind DSK = DstC->getSelectionKind(); + if (computeResultingSelectionKind(ComdatName, SSK, DSK, Result, LinkFromSrc)) + return true; + } + return false; +} /// getLinkageResult - This analyzes the two global values and determines what /// the result will look like in the destination module. In particular, it @@ -764,34 +886,47 @@ bool ModuleLinker::linkGlobalProto(GlobalVariable *SGV) { llvm::Optional NewVisibility; bool HasUnnamedAddr = SGV->hasUnnamedAddr(); + bool LinkFromSrc = false; + Comdat *DC = nullptr; + if (const Comdat *SC = SGV->getComdat()) { + Comdat::SelectionKind SK; + std::tie(SK, LinkFromSrc) = ComdatsChosen[SC]; + DC = DstM->getOrInsertComdat(SC->getName()); + DC->setSelectionKind(SK); + } + if (DGV) { - // Concatenation of appending linkage variables is magic and handled later. 
- if (DGV->hasAppendingLinkage() || SGV->hasAppendingLinkage()) - return linkAppendingVarProto(cast(DGV), SGV); - - // Determine whether linkage of these two globals follows the source - // module's definition or the destination module's definition. - GlobalValue::LinkageTypes NewLinkage = GlobalValue::InternalLinkage; - GlobalValue::VisibilityTypes NV; - bool LinkFromSrc = false; - if (getLinkageResult(DGV, SGV, NewLinkage, NV, LinkFromSrc)) - return true; - NewVisibility = NV; - HasUnnamedAddr = HasUnnamedAddr && DGV->hasUnnamedAddr(); + if (!DC) { + // Concatenation of appending linkage variables is magic and handled later. + if (DGV->hasAppendingLinkage() || SGV->hasAppendingLinkage()) + return linkAppendingVarProto(cast(DGV), SGV); + + // Determine whether linkage of these two globals follows the source + // module's definition or the destination module's definition. + GlobalValue::LinkageTypes NewLinkage = GlobalValue::InternalLinkage; + GlobalValue::VisibilityTypes NV; + if (getLinkageResult(DGV, SGV, NewLinkage, NV, LinkFromSrc)) + return true; + NewVisibility = NV; + HasUnnamedAddr = HasUnnamedAddr && DGV->hasUnnamedAddr(); + + // If we're not linking from the source, then keep the definition that we + // have. + if (!LinkFromSrc) { + // Special case for const propagation. + if (GlobalVariable *DGVar = dyn_cast(DGV)) + if (DGVar->isDeclaration() && SGV->isConstant() && + !DGVar->isConstant()) + DGVar->setConstant(true); + + // Set calculated linkage, visibility and unnamed_addr. + DGV->setLinkage(NewLinkage); + DGV->setVisibility(*NewVisibility); + DGV->setUnnamedAddr(HasUnnamedAddr); + } + } - // If we're not linking from the source, then keep the definition that we - // have. if (!LinkFromSrc) { - // Special case for const propagation. - if (GlobalVariable *DGVar = dyn_cast(DGV)) - if (DGVar->isDeclaration() && SGV->isConstant() && !DGVar->isConstant()) - DGVar->setConstant(true); - - // Set calculated linkage, visibility and unnamed_addr. - DGV->setLinkage(NewLinkage); - DGV->setVisibility(*NewVisibility); - DGV->setUnnamedAddr(HasUnnamedAddr); - // Make sure to remember this mapping. ValueMap[SGV] = ConstantExpr::getBitCast(DGV,TypeMap.get(SGV->getType())); @@ -803,6 +938,12 @@ bool ModuleLinker::linkGlobalProto(GlobalVariable *SGV) { } } + // If the Comdat this variable was inside of wasn't selected, skip it. + if (DC && !DGV && !LinkFromSrc) { + DoNotLinkFromSource.insert(SGV); + return false; + } + // No linking to be performed or linking from the source: simply create an // identical version of the symbol over in the dest module... the // initializer will be filled in later by LinkGlobalInits. 
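The comdat handling added above splits into two steps: merge the source and destination selection kinds, then, for the size- or content-dependent kinds, compare the comdat leaders. The kind-merging rule on its own, restated as a sketch (illustrative, not part of the patch; it follows the same true-on-error convention):

#include "llvm/IR/Comdat.h"
using namespace llvm;

static bool mergeSelectionKinds(Comdat::SelectionKind Src,
                                Comdat::SelectionKind Dst,
                                Comdat::SelectionKind &Result) {
  auto IsAnyOrLargest = [](Comdat::SelectionKind K) {
    return K == Comdat::SelectionKind::Any ||
           K == Comdat::SelectionKind::Largest;
  };
  if (IsAnyOrLargest(Src) && IsAnyOrLargest(Dst)) {
    // COFF permits mixing Any with Largest; Largest wins if present.
    Result = (Src == Comdat::SelectionKind::Largest ||
              Dst == Comdat::SelectionKind::Largest)
                 ? Comdat::SelectionKind::Largest
                 : Comdat::SelectionKind::Any;
    return false;
  }
  if (Src == Dst) { // identical kinds are always compatible
    Result = Dst;
    return false;
  }
  return true; // e.g. Any vs SameSize: "invalid selection kinds!"
}

Note also that run() resolves each source comdat exactly once into ComdatsChosen before any globals are linked, so linkGlobalProto, linkFunctionProto, and linkAliasProto only look up the cached (kind, LinkFromSrc) pair.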
@@ -818,6 +959,9 @@ bool ModuleLinker::linkGlobalProto(GlobalVariable *SGV) { NewDGV->setVisibility(*NewVisibility); NewDGV->setUnnamedAddr(HasUnnamedAddr); + if (DC) + NewDGV->setComdat(DC); + if (DGV) { DGV->replaceAllUsesWith(ConstantExpr::getBitCast(NewDGV, DGV->getType())); DGV->eraseFromParent(); @@ -835,21 +979,33 @@ bool ModuleLinker::linkFunctionProto(Function *SF) { llvm::Optional NewVisibility; bool HasUnnamedAddr = SF->hasUnnamedAddr(); + bool LinkFromSrc = false; + Comdat *DC = nullptr; + if (const Comdat *SC = SF->getComdat()) { + Comdat::SelectionKind SK; + std::tie(SK, LinkFromSrc) = ComdatsChosen[SC]; + DC = DstM->getOrInsertComdat(SC->getName()); + DC->setSelectionKind(SK); + } + if (DGV) { - GlobalValue::LinkageTypes NewLinkage = GlobalValue::InternalLinkage; - bool LinkFromSrc = false; - GlobalValue::VisibilityTypes NV; - if (getLinkageResult(DGV, SF, NewLinkage, NV, LinkFromSrc)) - return true; - NewVisibility = NV; - HasUnnamedAddr = HasUnnamedAddr && DGV->hasUnnamedAddr(); + if (!DC) { + GlobalValue::LinkageTypes NewLinkage = GlobalValue::InternalLinkage; + GlobalValue::VisibilityTypes NV; + if (getLinkageResult(DGV, SF, NewLinkage, NV, LinkFromSrc)) + return true; + NewVisibility = NV; + HasUnnamedAddr = HasUnnamedAddr && DGV->hasUnnamedAddr(); + + if (!LinkFromSrc) { + // Set calculated linkage + DGV->setLinkage(NewLinkage); + DGV->setVisibility(*NewVisibility); + DGV->setUnnamedAddr(HasUnnamedAddr); + } + } if (!LinkFromSrc) { - // Set calculated linkage - DGV->setLinkage(NewLinkage); - DGV->setVisibility(*NewVisibility); - DGV->setUnnamedAddr(HasUnnamedAddr); - // Make sure to remember this mapping. ValueMap[SF] = ConstantExpr::getBitCast(DGV, TypeMap.get(SF->getType())); @@ -869,6 +1025,12 @@ bool ModuleLinker::linkFunctionProto(Function *SF) { return false; } + // If the Comdat this function was inside of wasn't selected, skip it. + if (DC && !DGV && !LinkFromSrc) { + DoNotLinkFromSource.insert(SF); + return false; + } + // If there is no linkage to be performed or we are linking from the source, // bring SF over. Function *NewDF = Function::Create(TypeMap.get(SF->getFunctionType()), @@ -878,6 +1040,9 @@ bool ModuleLinker::linkFunctionProto(Function *SF) { NewDF->setVisibility(*NewVisibility); NewDF->setUnnamedAddr(HasUnnamedAddr); + if (DC) + NewDF->setComdat(DC); + if (DGV) { // Any uses of DF need to change to NewDF, with cast. DGV->replaceAllUsesWith(ConstantExpr::getBitCast(NewDF, DGV->getType())); @@ -895,21 +1060,33 @@ bool ModuleLinker::linkAliasProto(GlobalAlias *SGA) { llvm::Optional NewVisibility; bool HasUnnamedAddr = SGA->hasUnnamedAddr(); + bool LinkFromSrc = false; + Comdat *DC = nullptr; + if (const Comdat *SC = SGA->getComdat()) { + Comdat::SelectionKind SK; + std::tie(SK, LinkFromSrc) = ComdatsChosen[SC]; + DC = DstM->getOrInsertComdat(SC->getName()); + DC->setSelectionKind(SK); + } + if (DGV) { - GlobalValue::LinkageTypes NewLinkage = GlobalValue::InternalLinkage; - GlobalValue::VisibilityTypes NV; - bool LinkFromSrc = false; - if (getLinkageResult(DGV, SGA, NewLinkage, NV, LinkFromSrc)) - return true; - NewVisibility = NV; - HasUnnamedAddr = HasUnnamedAddr && DGV->hasUnnamedAddr(); + if (!DC) { + GlobalValue::LinkageTypes NewLinkage = GlobalValue::InternalLinkage; + GlobalValue::VisibilityTypes NV; + if (getLinkageResult(DGV, SGA, NewLinkage, NV, LinkFromSrc)) + return true; + NewVisibility = NV; + HasUnnamedAddr = HasUnnamedAddr && DGV->hasUnnamedAddr(); + + if (!LinkFromSrc) { + // Set calculated linkage. 
+ DGV->setLinkage(NewLinkage); + DGV->setVisibility(*NewVisibility); + DGV->setUnnamedAddr(HasUnnamedAddr); + } + } if (!LinkFromSrc) { - // Set calculated linkage. - DGV->setLinkage(NewLinkage); - DGV->setVisibility(*NewVisibility); - DGV->setUnnamedAddr(HasUnnamedAddr); - // Make sure to remember this mapping. ValueMap[SGA] = ConstantExpr::getBitCast(DGV,TypeMap.get(SGA->getType())); @@ -920,6 +1097,12 @@ bool ModuleLinker::linkAliasProto(GlobalAlias *SGA) { } } + // If the Comdat this alias was inside of wasn't selected, skip it. + if (DC && !DGV && !LinkFromSrc) { + DoNotLinkFromSource.insert(SGA); + return false; + } + // If there is no linkage to be performed or we're linking from the source, // bring over SGA. auto *PTy = cast(TypeMap.get(SGA->getType())); @@ -1254,6 +1437,18 @@ bool ModuleLinker::run() { // Loop over all of the linked values to compute type mappings. computeTypeMapping(); + ComdatsChosen.clear(); + for (const StringMapEntry &SMEC : SrcM->getComdatSymbolTable()) { + const Comdat &C = SMEC.getValue(); + if (ComdatsChosen.count(&C)) + continue; + Comdat::SelectionKind SK; + bool LinkFromSrc; + if (getComdatResult(&C, SK, LinkFromSrc)) + return true; + ComdatsChosen[&C] = std::make_pair(SK, LinkFromSrc); + } + // Insert all of the globals in src into the DstM module... without linking // initializers (which could refer to functions not yet mapped over). for (Module::global_iterator I = SrcM->global_begin(), diff --git a/lib/MC/CMakeLists.txt b/lib/MC/CMakeLists.txt index 78bd8c4ba144..330519ece009 100644 --- a/lib/MC/CMakeLists.txt +++ b/lib/MC/CMakeLists.txt @@ -8,7 +8,6 @@ add_llvm_library(LLVMMC MCAsmInfoELF.cpp MCAsmStreamer.cpp MCAssembler.cpp - MCAtom.cpp MCCodeEmitter.cpp MCCodeGenInfo.cpp MCContext.cpp @@ -17,7 +16,6 @@ add_llvm_library(LLVMMC MCELF.cpp MCELFObjectTargetWriter.cpp MCELFStreamer.cpp - MCFunction.cpp MCExpr.cpp MCExternalSymbolizer.cpp MCInst.cpp @@ -27,13 +25,9 @@ add_llvm_library(LLVMMC MCLinkerOptimizationHint.cpp MCMachOStreamer.cpp MCMachObjectTargetWriter.cpp - MCModule.cpp - MCModuleYAML.cpp MCNullStreamer.cpp MCObjectFileInfo.cpp - MCObjectDisassembler.cpp MCObjectStreamer.cpp - MCObjectSymbolizer.cpp MCObjectWriter.cpp MCRegisterInfo.cpp MCRelocationInfo.cpp @@ -49,10 +43,13 @@ add_llvm_library(LLVMMC MCValue.cpp MCWin64EH.cpp MachObjectWriter.cpp + StringTableBuilder.cpp SubtargetFeature.cpp WinCOFFObjectWriter.cpp WinCOFFStreamer.cpp + YAML.cpp ) +add_subdirectory(MCAnalysis) add_subdirectory(MCParser) add_subdirectory(MCDisassembler) diff --git a/lib/MC/ConstantPools.cpp b/lib/MC/ConstantPools.cpp index f979dad47da5..c4cea604b146 100644 --- a/lib/MC/ConstantPools.cpp +++ b/lib/MC/ConstantPools.cpp @@ -24,21 +24,22 @@ using namespace llvm; void ConstantPool::emitEntries(MCStreamer &Streamer) { if (Entries.empty()) return; - Streamer.EmitCodeAlignment(4); // align to 4-byte address Streamer.EmitDataRegion(MCDR_DataRegion); for (EntryVecTy::const_iterator I = Entries.begin(), E = Entries.end(); I != E; ++I) { - Streamer.EmitLabel(I->first); - Streamer.EmitValue(I->second, 4); + Streamer.EmitCodeAlignment(I->Size); // align naturally + Streamer.EmitLabel(I->Label); + Streamer.EmitValue(I->Value, I->Size); } Streamer.EmitDataRegion(MCDR_DataRegionEnd); Entries.clear(); } -const MCExpr *ConstantPool::addEntry(const MCExpr *Value, MCContext &Context) { +const MCExpr *ConstantPool::addEntry(const MCExpr *Value, MCContext &Context, + unsigned Size) { MCSymbol *CPEntryLabel = Context.CreateTempSymbol(); - 
Entries.push_back(std::make_pair(CPEntryLabel, Value)); + Entries.push_back(ConstantPoolEntry(CPEntryLabel, Value, Size)); return MCSymbolRefExpr::Create(CPEntryLabel, Context); } @@ -89,7 +90,9 @@ void AssemblerConstantPools::emitForCurrentSection(MCStreamer &Streamer) { } const MCExpr *AssemblerConstantPools::addEntry(MCStreamer &Streamer, - const MCExpr *Expr) { + const MCExpr *Expr, + unsigned Size) { const MCSection *Section = Streamer.getCurrentSection().first; - return getOrCreateConstantPool(Section).addEntry(Expr, Streamer.getContext()); + return getOrCreateConstantPool(Section).addEntry(Expr, Streamer.getContext(), + Size); } diff --git a/lib/MC/ELFObjectWriter.cpp b/lib/MC/ELFObjectWriter.cpp index a98a13e0cd42..5779b27a2c40 100644 --- a/lib/MC/ELFObjectWriter.cpp +++ b/lib/MC/ELFObjectWriter.cpp @@ -28,7 +28,7 @@ #include "llvm/MC/MCObjectWriter.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCValue.h" -#include "llvm/Object/StringTableBuilder.h" +#include "llvm/MC/StringTableBuilder.h" #include "llvm/Support/Compression.h" #include "llvm/Support/Debug.h" #include "llvm/Support/Endian.h" @@ -782,7 +782,7 @@ bool ELFObjectWriter::shouldRelocateWithSymbol(const MCAssembler &Asm, if (Asm.isThumbFunc(&Sym)) return true; - if (TargetObjectWriter->needsRelocateWithSymbol(Type)) + if (TargetObjectWriter->needsRelocateWithSymbol(*SD, Type)) return true; return false; } @@ -1565,6 +1565,7 @@ void ELFObjectWriter::WriteSection(MCAssembler &Asm, case ELF::SHT_X86_64_UNWIND: case ELF::SHT_MIPS_REGINFO: case ELF::SHT_MIPS_OPTIONS: + case ELF::SHT_MIPS_ABIFLAGS: // Nothing to do. break; @@ -1574,8 +1575,7 @@ void ELFObjectWriter::WriteSection(MCAssembler &Asm, break; default: - assert(0 && "FIXME: sh_type value not supported!"); - break; + llvm_unreachable("FIXME: sh_type value not supported!"); } if (TargetObjectWriter->getEMachine() == ELF::EM_ARM && diff --git a/lib/MC/LLVMBuild.txt b/lib/MC/LLVMBuild.txt index f35dbe4d5d35..3fcb50b97c6d 100644 --- a/lib/MC/LLVMBuild.txt +++ b/lib/MC/LLVMBuild.txt @@ -16,10 +16,10 @@ ;===------------------------------------------------------------------------===; [common] -subdirectories = MCDisassembler MCParser +subdirectories = MCAnalysis MCDisassembler MCParser [component_0] type = Library name = MC parent = Libraries -required_libraries = Object Support +required_libraries = Support diff --git a/lib/MC/MCAnalysis/CMakeLists.txt b/lib/MC/MCAnalysis/CMakeLists.txt new file mode 100644 index 000000000000..81eae2dfb153 --- /dev/null +++ b/lib/MC/MCAnalysis/CMakeLists.txt @@ -0,0 +1,8 @@ +add_llvm_library(LLVMMCAnalysis + MCAtom.cpp + MCFunction.cpp + MCModule.cpp + MCModuleYAML.cpp + MCObjectDisassembler.cpp + MCObjectSymbolizer.cpp +) diff --git a/lib/MC/MCAnalysis/LLVMBuild.txt b/lib/MC/MCAnalysis/LLVMBuild.txt new file mode 100644 index 000000000000..1b58fec6cc45 --- /dev/null +++ b/lib/MC/MCAnalysis/LLVMBuild.txt @@ -0,0 +1,5 @@ +[component_0] +type = Library +name = MCAnalysis +parent = Libraries +required_libraries = MC Object Support diff --git a/lib/MC/MCAtom.cpp b/lib/MC/MCAnalysis/MCAtom.cpp similarity index 97% rename from lib/MC/MCAtom.cpp rename to lib/MC/MCAnalysis/MCAtom.cpp index bc353cdcf65d..82056eed1ea9 100644 --- a/lib/MC/MCAtom.cpp +++ b/lib/MC/MCAnalysis/MCAtom.cpp @@ -7,8 +7,8 @@ // //===----------------------------------------------------------------------===// -#include "llvm/MC/MCAtom.h" -#include "llvm/MC/MCModule.h" +#include "llvm/MC/MCAnalysis/MCAtom.h" +#include "llvm/MC/MCAnalysis/MCModule.h" #include 
"llvm/Support/ErrorHandling.h" #include diff --git a/lib/MC/MCFunction.cpp b/lib/MC/MCAnalysis/MCFunction.cpp similarity index 94% rename from lib/MC/MCFunction.cpp rename to lib/MC/MCAnalysis/MCFunction.cpp index 1ddc2505f071..4e09d1a52dac 100644 --- a/lib/MC/MCFunction.cpp +++ b/lib/MC/MCAnalysis/MCFunction.cpp @@ -7,9 +7,9 @@ // //===----------------------------------------------------------------------===// -#include "llvm/MC/MCFunction.h" -#include "llvm/MC/MCAtom.h" -#include "llvm/MC/MCModule.h" +#include "llvm/MC/MCAnalysis/MCFunction.h" +#include "llvm/MC/MCAnalysis/MCAtom.h" +#include "llvm/MC/MCAnalysis/MCModule.h" #include using namespace llvm; diff --git a/lib/MC/MCModule.cpp b/lib/MC/MCAnalysis/MCModule.cpp similarity index 97% rename from lib/MC/MCModule.cpp rename to lib/MC/MCAnalysis/MCModule.cpp index 3ed735689d74..7512299c9e0a 100644 --- a/lib/MC/MCModule.cpp +++ b/lib/MC/MCAnalysis/MCModule.cpp @@ -7,10 +7,10 @@ // //===----------------------------------------------------------------------===// +#include "llvm/MC/MCAnalysis/MCModule.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/MC/MCModule.h" -#include "llvm/MC/MCAtom.h" -#include "llvm/MC/MCFunction.h" +#include "llvm/MC/MCAnalysis/MCAtom.h" +#include "llvm/MC/MCAnalysis/MCFunction.h" #include using namespace llvm; diff --git a/lib/MC/MCModuleYAML.cpp b/lib/MC/MCAnalysis/MCModuleYAML.cpp similarity index 98% rename from lib/MC/MCModuleYAML.cpp rename to lib/MC/MCAnalysis/MCModuleYAML.cpp index f6b7431eb3bf..876b06de9c90 100644 --- a/lib/MC/MCModuleYAML.cpp +++ b/lib/MC/MCAnalysis/MCModuleYAML.cpp @@ -11,13 +11,13 @@ // //===----------------------------------------------------------------------===// -#include "llvm/MC/MCModuleYAML.h" +#include "llvm/MC/MCAnalysis/MCModuleYAML.h" #include "llvm/ADT/StringMap.h" -#include "llvm/MC/MCAtom.h" -#include "llvm/MC/MCFunction.h" +#include "llvm/MC/MCAnalysis/MCAtom.h" +#include "llvm/MC/MCAnalysis/MCFunction.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" -#include "llvm/Object/YAML.h" +#include "llvm/MC/YAML.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/Casting.h" #include "llvm/Support/MathExtras.h" @@ -102,7 +102,7 @@ struct Atom { uint64_t Size; std::vector Insts; - object::yaml::BinaryRef Data; + yaml::BinaryRef Data; }; struct BasicBlock { diff --git a/lib/MC/MCObjectDisassembler.cpp b/lib/MC/MCAnalysis/MCObjectDisassembler.cpp similarity index 99% rename from lib/MC/MCObjectDisassembler.cpp rename to lib/MC/MCAnalysis/MCObjectDisassembler.cpp index 8a258cb09097..0f789ff040ff 100644 --- a/lib/MC/MCObjectDisassembler.cpp +++ b/lib/MC/MCAnalysis/MCObjectDisassembler.cpp @@ -13,11 +13,11 @@ #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Twine.h" -#include "llvm/MC/MCAtom.h" +#include "llvm/MC/MCAnalysis/MCAtom.h" +#include "llvm/MC/MCAnalysis/MCFunction.h" +#include "llvm/MC/MCAnalysis/MCModule.h" #include "llvm/MC/MCDisassembler.h" -#include "llvm/MC/MCFunction.h" #include "llvm/MC/MCInstrAnalysis.h" -#include "llvm/MC/MCModule.h" #include "llvm/MC/MCObjectSymbolizer.h" #include "llvm/Object/MachO.h" #include "llvm/Object/ObjectFile.h" diff --git a/lib/MC/MCObjectSymbolizer.cpp b/lib/MC/MCAnalysis/MCObjectSymbolizer.cpp similarity index 100% rename from lib/MC/MCObjectSymbolizer.cpp rename to lib/MC/MCAnalysis/MCObjectSymbolizer.cpp diff --git a/unittests/Object/Makefile b/lib/MC/MCAnalysis/Makefile similarity index 56% rename from unittests/Object/Makefile rename to 
lib/MC/MCAnalysis/Makefile index 9062149a24d9..add2dbd81eaa 100644 --- a/unittests/Object/Makefile +++ b/lib/MC/MCAnalysis/Makefile @@ -1,4 +1,4 @@ -##===- unittests/Object/Makefile ---------------------------*- Makefile -*-===## +##===- lib/MC/MCAnalysis/Makefile --------------------------*- Makefile -*-===## # # The LLVM Compiler Infrastructure # @@ -7,9 +7,8 @@ # ##===----------------------------------------------------------------------===## -LEVEL = ../.. -TESTNAME = Object -LINK_COMPONENTS := object +LEVEL = ../../.. +LIBRARYNAME = LLVMMCAnalysis +BUILD_ARCHIVE := 1 -include $(LEVEL)/Makefile.config -include $(LLVM_SRC_ROOT)/unittests/Makefile.unittest +include $(LEVEL)/Makefile.common diff --git a/lib/MC/MCAsmInfo.cpp b/lib/MC/MCAsmInfo.cpp index 29105834499d..f8081ef97d3f 100644 --- a/lib/MC/MCAsmInfo.cpp +++ b/lib/MC/MCAsmInfo.cpp @@ -39,7 +39,7 @@ MCAsmInfo::MCAsmInfo() { SeparatorString = ";"; CommentString = "#"; LabelSuffix = ":"; - DebugLabelSuffix = ":"; + UseAssignmentForEHBegin = false; PrivateGlobalPrefix = "L"; LinkerPrivateGlobalPrefix = ""; InlineAsmStart = "APP"; diff --git a/lib/MC/MCAsmStreamer.cpp b/lib/MC/MCAsmStreamer.cpp index bccf5b9cd287..14f0f05edd1f 100644 --- a/lib/MC/MCAsmStreamer.cpp +++ b/lib/MC/MCAsmStreamer.cpp @@ -120,7 +120,6 @@ class MCAsmStreamer : public MCStreamer { void EmitLOHDirective(MCLOHType Kind, const MCLOHArgs &Args) override; void EmitLabel(MCSymbol *Symbol) override; - void EmitDebugLabel(MCSymbol *Symbol) override; void EmitAssemblerFlag(MCAssemblerFlag Flag) override; void EmitLinkerOptions(ArrayRef<std::string> Options) override; @@ -213,20 +212,20 @@ class MCAsmStreamer : public MCStreamer { void EmitCFIRegister(int64_t Register1, int64_t Register2) override; void EmitCFIWindowSave() override; - void EmitWin64EHStartProc(const MCSymbol *Symbol) override; - void EmitWin64EHEndProc() override; - void EmitWin64EHStartChained() override; - void EmitWin64EHEndChained() override; - void EmitWin64EHHandler(const MCSymbol *Sym, bool Unwind, - bool Except) override; - void EmitWin64EHHandlerData() override; - void EmitWin64EHPushReg(unsigned Register) override; - void EmitWin64EHSetFrame(unsigned Register, unsigned Offset) override; - void EmitWin64EHAllocStack(unsigned Size) override; - void EmitWin64EHSaveReg(unsigned Register, unsigned Offset) override; - void EmitWin64EHSaveXMM(unsigned Register, unsigned Offset) override; - void EmitWin64EHPushFrame(bool Code) override; - void EmitWin64EHEndProlog() override; + void EmitWinCFIStartProc(const MCSymbol *Symbol) override; + void EmitWinCFIEndProc() override; + void EmitWinCFIStartChained() override; + void EmitWinCFIEndChained() override; + void EmitWinCFIPushReg(unsigned Register) override; + void EmitWinCFISetFrame(unsigned Register, unsigned Offset) override; + void EmitWinCFIAllocStack(unsigned Size) override; + void EmitWinCFISaveReg(unsigned Register, unsigned Offset) override; + void EmitWinCFISaveXMM(unsigned Register, unsigned Offset) override; + void EmitWinCFIPushFrame(bool Code) override; + void EmitWinCFIEndProlog() override; + + void EmitWinEHHandler(const MCSymbol *Sym, bool Unwind, bool Except) override; + void EmitWinEHHandlerData() override; void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI) override; @@ -334,14 +333,6 @@ void MCAsmStreamer::EmitLOHDirective(MCLOHType Kind, const MCLOHArgs &Args) { EmitEOL(); } -void MCAsmStreamer::EmitDebugLabel(MCSymbol *Symbol) { - assert(Symbol->isUndefined() && "Cannot define a symbol twice!"); -
MCStreamer::EmitDebugLabel(Symbol); - - OS << *Symbol << MAI->getDebugLabelSuffix(); - EmitEOL(); -} - void MCAsmStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) { switch (Flag) { case MCAF_SyntaxUnified: OS << "\t.syntax unified"; break; @@ -944,10 +935,7 @@ void MCAsmStreamer::EmitCFIStartProcImpl(MCDwarfFrameInfo &Frame) { } void MCAsmStreamer::EmitCFIEndProcImpl(MCDwarfFrameInfo &Frame) { - // Put a dummy non-null value in Frame.End to mark that this frame has been - // closed. - Frame.End = (MCSymbol *) 1; - + MCStreamer::EmitCFIEndProcImpl(Frame); OS << "\t.cfi_endproc"; EmitEOL(); } @@ -1061,37 +1049,37 @@ void MCAsmStreamer::EmitCFIWindowSave() { EmitEOL(); } -void MCAsmStreamer::EmitWin64EHStartProc(const MCSymbol *Symbol) { - MCStreamer::EmitWin64EHStartProc(Symbol); +void MCAsmStreamer::EmitWinCFIStartProc(const MCSymbol *Symbol) { + MCStreamer::EmitWinCFIStartProc(Symbol); OS << ".seh_proc " << *Symbol; EmitEOL(); } -void MCAsmStreamer::EmitWin64EHEndProc() { - MCStreamer::EmitWin64EHEndProc(); +void MCAsmStreamer::EmitWinCFIEndProc() { + MCStreamer::EmitWinCFIEndProc(); OS << "\t.seh_endproc"; EmitEOL(); } -void MCAsmStreamer::EmitWin64EHStartChained() { - MCStreamer::EmitWin64EHStartChained(); +void MCAsmStreamer::EmitWinCFIStartChained() { + MCStreamer::EmitWinCFIStartChained(); OS << "\t.seh_startchained"; EmitEOL(); } -void MCAsmStreamer::EmitWin64EHEndChained() { - MCStreamer::EmitWin64EHEndChained(); +void MCAsmStreamer::EmitWinCFIEndChained() { + MCStreamer::EmitWinCFIEndChained(); OS << "\t.seh_endchained"; EmitEOL(); } -void MCAsmStreamer::EmitWin64EHHandler(const MCSymbol *Sym, bool Unwind, - bool Except) { - MCStreamer::EmitWin64EHHandler(Sym, Unwind, Except); +void MCAsmStreamer::EmitWinEHHandler(const MCSymbol *Sym, bool Unwind, + bool Except) { + MCStreamer::EmitWinEHHandler(Sym, Unwind, Except); OS << "\t.seh_handler " << *Sym; if (Unwind) @@ -1114,14 +1102,14 @@ static const MCSection *getWin64EHTableSection(StringRef suffix, SectionKind::getDataRel()); } -void MCAsmStreamer::EmitWin64EHHandlerData() { - MCStreamer::EmitWin64EHHandlerData(); +void MCAsmStreamer::EmitWinEHHandlerData() { + MCStreamer::EmitWinEHHandlerData(); // Switch sections. Don't call SwitchSection directly, because that will // cause the section switch to be visible in the emitted assembly. // We only do this so the section switch that terminates the handler // data block is visible. 
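Since the Win64EH rename touches many call sites in this file, a compact map of the hooks may help review (summary is ours; the CFI-prologue directives gain a WinCFI spelling, while the handler-related pair becomes WinEH):

// Old streamer hook                  New streamer hook
// EmitWin64EHStartProc(Sym)       -> EmitWinCFIStartProc(Sym)
// EmitWin64EHEndProc()            -> EmitWinCFIEndProc()
// EmitWin64EHPushReg(Reg)         -> EmitWinCFIPushReg(Reg)
// EmitWin64EHSetFrame(Reg, Off)   -> EmitWinCFISetFrame(Reg, Off)
// EmitWin64EHAllocStack(Size)     -> EmitWinCFIAllocStack(Size)
// EmitWin64EHHandler(Sym, U, E)   -> EmitWinEHHandler(Sym, U, E)
// EmitWin64EHHandlerData()        -> EmitWinEHHandlerData()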
- MCWin64EHUnwindInfo *CurFrame = getCurrentW64UnwindInfo(); + MCWinFrameInfo *CurFrame = getCurrentWinFrameInfo(); StringRef suffix=MCWin64EHUnwindEmitter::GetSectionSuffix(CurFrame->Function); const MCSection *xdataSect = getWin64EHTableSection(suffix, getContext()); if (xdataSect) @@ -1131,43 +1119,43 @@ void MCAsmStreamer::EmitWin64EHHandlerData() { EmitEOL(); } -void MCAsmStreamer::EmitWin64EHPushReg(unsigned Register) { - MCStreamer::EmitWin64EHPushReg(Register); +void MCAsmStreamer::EmitWinCFIPushReg(unsigned Register) { + MCStreamer::EmitWinCFIPushReg(Register); OS << "\t.seh_pushreg " << Register; EmitEOL(); } -void MCAsmStreamer::EmitWin64EHSetFrame(unsigned Register, unsigned Offset) { - MCStreamer::EmitWin64EHSetFrame(Register, Offset); +void MCAsmStreamer::EmitWinCFISetFrame(unsigned Register, unsigned Offset) { + MCStreamer::EmitWinCFISetFrame(Register, Offset); OS << "\t.seh_setframe " << Register << ", " << Offset; EmitEOL(); } -void MCAsmStreamer::EmitWin64EHAllocStack(unsigned Size) { - MCStreamer::EmitWin64EHAllocStack(Size); +void MCAsmStreamer::EmitWinCFIAllocStack(unsigned Size) { + MCStreamer::EmitWinCFIAllocStack(Size); OS << "\t.seh_stackalloc " << Size; EmitEOL(); } -void MCAsmStreamer::EmitWin64EHSaveReg(unsigned Register, unsigned Offset) { - MCStreamer::EmitWin64EHSaveReg(Register, Offset); +void MCAsmStreamer::EmitWinCFISaveReg(unsigned Register, unsigned Offset) { + MCStreamer::EmitWinCFISaveReg(Register, Offset); OS << "\t.seh_savereg " << Register << ", " << Offset; EmitEOL(); } -void MCAsmStreamer::EmitWin64EHSaveXMM(unsigned Register, unsigned Offset) { - MCStreamer::EmitWin64EHSaveXMM(Register, Offset); +void MCAsmStreamer::EmitWinCFISaveXMM(unsigned Register, unsigned Offset) { + MCStreamer::EmitWinCFISaveXMM(Register, Offset); OS << "\t.seh_savexmm " << Register << ", " << Offset; EmitEOL(); } -void MCAsmStreamer::EmitWin64EHPushFrame(bool Code) { - MCStreamer::EmitWin64EHPushFrame(Code); +void MCAsmStreamer::EmitWinCFIPushFrame(bool Code) { + MCStreamer::EmitWinCFIPushFrame(Code); OS << "\t.seh_pushframe"; if (Code) @@ -1175,8 +1163,8 @@ void MCAsmStreamer::EmitWin64EHPushFrame(bool Code) { EmitEOL(); } -void MCAsmStreamer::EmitWin64EHEndProlog(void) { - MCStreamer::EmitWin64EHEndProlog(); +void MCAsmStreamer::EmitWinCFIEndProlog(void) { + MCStreamer::EmitWinCFIEndProlog(); OS << "\t.seh_endprologue"; EmitEOL(); diff --git a/lib/MC/MCAssembler.cpp b/lib/MC/MCAssembler.cpp index 886a5f554531..a8aad7124bdb 100644 --- a/lib/MC/MCAssembler.cpp +++ b/lib/MC/MCAssembler.cpp @@ -27,6 +27,7 @@ #include "llvm/Support/LEB128.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/MC/MCSectionELF.h" #include using namespace llvm; @@ -433,12 +434,27 @@ const MCSymbolData *MCAssembler::getAtom(const MCSymbolData *SD) const { return SD->getFragment()->getAtom(); } +// Try to fully compute Expr to an absolute value and if that fails produce +// a relocatable expr. +// FIXME: Should this be the behavior of EvaluateAsRelocatable itself? 
+static bool evaluate(const MCExpr &Expr, const MCAsmLayout &Layout, + MCValue &Target) { + if (Expr.EvaluateAsValue(Target, &Layout)) + if (Target.isAbsolute()) + return true; + return Expr.EvaluateAsRelocatable(Target, &Layout); +} + bool MCAssembler::evaluateFixup(const MCAsmLayout &Layout, const MCFixup &Fixup, const MCFragment *DF, MCValue &Target, uint64_t &Value) const { ++stats::evaluateFixup; - if (!Fixup.getValue()->EvaluateAsRelocatable(Target, &Layout)) + // FIXME: This code has some duplication with RecordRelocation. We should + // probably merge the two into a single callback that tries to evaluate a + // fixup and records a relocation if one is needed. + const MCExpr *Expr = Fixup.getValue(); + if (!evaluate(*Expr, Layout, Target)) getContext().FatalError(Fixup.getLoc(), "expected relocatable expression"); bool IsPCRel = Backend.getFixupKindInfo( @@ -782,8 +798,13 @@ void MCAssembler::writeSectionData(const MCSectionData *SD, assert(DF.fixup_begin() == DF.fixup_end() && "Cannot have fixups in virtual section!"); for (unsigned i = 0, e = DF.getContents().size(); i != e; ++i) - assert(DF.getContents()[i] == 0 && - "Invalid data value for virtual section!"); + if (DF.getContents()[i]) { + if (auto *ELFSec = dyn_cast(&SD->getSection())) + report_fatal_error("non-zero initializer found in section '" + + ELFSec->getSectionName() + "'"); + else + report_fatal_error("non-zero initializer found in virtual section"); + } break; } case MCFragment::FT_Align: @@ -1222,7 +1243,7 @@ void MCSectionData::dump() { OS << "]>"; } -void MCSymbolData::dump() { +void MCSymbolData::dump() const { raw_ostream &OS = llvm::errs(); OS << "getNumBuffers() > 0) - MainFileName = SrcMgr->getMemoryBuffer(0)->getBufferIdentifier(); + if (SrcMgr && SrcMgr->getNumBuffers()) + MainFileName = + SrcMgr->getMemoryBuffer(SrcMgr->getMainFileID())->getBufferIdentifier(); } MCContext::~MCContext() { @@ -284,17 +285,17 @@ const MCSectionCOFF *MCContext::getCOFFSection(StringRef Section, int Selection) { // Do the lookup, if we have a hit, return it. - SectionGroupPair P(Section, COMDATSymName); - auto IterBool = COFFUniquingMap.insert(std::make_pair(P, nullptr)); + SectionGroupTriple T(Section, COMDATSymName, Selection); + auto IterBool = COFFUniquingMap.insert(std::make_pair(T, nullptr)); auto Iter = IterBool.first; if (!IterBool.second) return Iter->second; - const MCSymbol *COMDATSymbol = nullptr; + MCSymbol *COMDATSymbol = nullptr; if (!COMDATSymName.empty()) COMDATSymbol = GetOrCreateSymbol(COMDATSymName); - StringRef CachedName = Iter->first.first; + StringRef CachedName = std::get<0>(Iter->first); MCSectionCOFF *Result = new (*this) MCSectionCOFF(CachedName, Characteristics, COMDATSymbol, Selection, Kind); @@ -309,8 +310,8 @@ MCContext::getCOFFSection(StringRef Section, unsigned Characteristics, } const MCSectionCOFF *MCContext::getCOFFSection(StringRef Section) { - SectionGroupPair P(Section, ""); - auto Iter = COFFUniquingMap.find(P); + SectionGroupTriple T(Section, "", 0); + auto Iter = COFFUniquingMap.find(T); if (Iter == COFFUniquingMap.end()) return nullptr; return Iter->second; @@ -340,6 +341,29 @@ bool MCContext::isValidDwarfFileNumber(unsigned FileNumber, unsigned CUID) { return !MCDwarfFiles[FileNumber].Name.empty(); } +/// finalizeDwarfSections - Emit end symbols for each non-empty code section. +/// Also remove empty sections from SectionStartEndSyms, to avoid generating +/// useless debug info for them. 
+void MCContext::finalizeDwarfSections(MCStreamer &MCOS) { + MCContext &context = MCOS.getContext(); + + auto sec = SectionStartEndSyms.begin(); + while (sec != SectionStartEndSyms.end()) { + assert(sec->second.first && "Start symbol must be set by now"); + MCOS.SwitchSection(sec->first); + if (MCOS.mayHaveInstructions()) { + MCSymbol *SectionEndSym = context.CreateTempSymbol(); + MCOS.EmitLabel(SectionEndSym); + sec->second.second = SectionEndSym; + ++sec; + } else { + MapVector >::iterator + to_erase = sec; + sec = SectionStartEndSyms.erase(to_erase); + } + } +} + void MCContext::FatalError(SMLoc Loc, const Twine &Msg) const { // If we have a source manager and a location, use it. Otherwise just // use the generic report_fatal_error(). diff --git a/lib/MC/MCDwarf.cpp b/lib/MC/MCDwarf.cpp index be6731abedd9..968cbc96a954 100644 --- a/lib/MC/MCDwarf.cpp +++ b/lib/MC/MCDwarf.cpp @@ -19,6 +19,7 @@ #include "llvm/MC/MCObjectFileInfo.h" #include "llvm/MC/MCObjectStreamer.h" #include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSection.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" @@ -518,8 +519,12 @@ static void EmitGenDwarfAbbrev(MCStreamer *MCOS) { MCOS->EmitULEB128IntValue(dwarf::DW_TAG_compile_unit); MCOS->EmitIntValue(dwarf::DW_CHILDREN_yes, 1); EmitAbbrev(MCOS, dwarf::DW_AT_stmt_list, dwarf::DW_FORM_data4); - EmitAbbrev(MCOS, dwarf::DW_AT_low_pc, dwarf::DW_FORM_addr); - EmitAbbrev(MCOS, dwarf::DW_AT_high_pc, dwarf::DW_FORM_addr); + if (MCOS->getContext().getGenDwarfSectionSyms().size() > 1) { + EmitAbbrev(MCOS, dwarf::DW_AT_ranges, dwarf::DW_FORM_data4); + } else { + EmitAbbrev(MCOS, dwarf::DW_AT_low_pc, dwarf::DW_FORM_addr); + EmitAbbrev(MCOS, dwarf::DW_AT_high_pc, dwarf::DW_FORM_addr); + } EmitAbbrev(MCOS, dwarf::DW_AT_name, dwarf::DW_FORM_string); if (!context.getCompilationDir().empty()) EmitAbbrev(MCOS, dwarf::DW_AT_comp_dir, dwarf::DW_FORM_string); @@ -552,20 +557,14 @@ static void EmitGenDwarfAbbrev(MCStreamer *MCOS) { } // When generating dwarf for assembly source files this emits the data for -// .debug_aranges section. Which contains a header and a table of pairs of -// PointerSize'ed values for the address and size of section(s) with line table -// entries (just the default .text in our case) and a terminating pair of zeros. +// .debug_aranges section. This section contains a header and a table of pairs +// of PointerSize'ed values for the address and size of section(s) with line +// table entries. static void EmitGenDwarfAranges(MCStreamer *MCOS, const MCSymbol *InfoSectionSymbol) { MCContext &context = MCOS->getContext(); - // Create a symbol at the end of the section that we are creating the dwarf - // debugging info to use later in here as part of the expression to calculate - // the size of the section for the table. - MCOS->SwitchSection(context.getGenDwarfSection()); - MCSymbol *SectionEndSym = context.CreateTempSymbol(); - MCOS->EmitLabel(SectionEndSym); - context.setGenDwarfSectionEndSym(SectionEndSym); + auto &Sections = context.getGenDwarfSectionSyms(); MCOS->SwitchSection(context.getObjectFileInfo()->getDwarfARangesSection()); @@ -583,8 +582,8 @@ static void EmitGenDwarfAranges(MCStreamer *MCOS, Length += Pad; // Add the size of the pair of PointerSize'ed values for the address and size - // of the one default .text section we have in the table. - Length += 2 * AddrSize; + // of each section we have in the table. + Length += 2 * AddrSize * Sections.size(); // And the pair of terminating zeros. 
Length += 2 * AddrSize; @@ -608,14 +607,21 @@ static void EmitGenDwarfAranges(MCStreamer *MCOS, for(int i = 0; i < Pad; i++) MCOS->EmitIntValue(0, 1); - // Now emit the table of pairs of PointerSize'ed values for the section(s) - // address and size, in our case just the one default .text section. - const MCExpr *Addr = MCSymbolRefExpr::Create( - context.getGenDwarfSectionStartSym(), MCSymbolRefExpr::VK_None, context); - const MCExpr *Size = MakeStartMinusEndExpr(*MCOS, - *context.getGenDwarfSectionStartSym(), *SectionEndSym, 0); - MCOS->EmitValue(Addr, AddrSize); - MCOS->EmitAbsValue(Size, AddrSize); + // Now emit the table of pairs of PointerSize'ed values for the section + // addresses and sizes. + for (const auto &sec : Sections) { + MCSymbol *StartSymbol = sec.second.first; + MCSymbol *EndSymbol = sec.second.second; + assert(StartSymbol && "StartSymbol must not be NULL"); + assert(EndSymbol && "EndSymbol must not be NULL"); + + const MCExpr *Addr = MCSymbolRefExpr::Create( + StartSymbol, MCSymbolRefExpr::VK_None, context); + const MCExpr *Size = MakeStartMinusEndExpr(*MCOS, + *StartSymbol, *EndSymbol, 0); + MCOS->EmitValue(Addr, AddrSize); + MCOS->EmitAbsValue(Size, AddrSize); + } // And finally the pair of terminating zeros. MCOS->EmitIntValue(0, AddrSize); @@ -627,7 +633,8 @@ static void EmitGenDwarfAranges(MCStreamer *MCOS, // DIE and a list of label DIEs. static void EmitGenDwarfInfo(MCStreamer *MCOS, const MCSymbol *AbbrevSectionSymbol, - const MCSymbol *LineSectionSymbol) { + const MCSymbol *LineSectionSymbol, + const MCSymbol *RangesSectionSymbol) { MCContext &context = MCOS->getContext(); MCOS->SwitchSection(context.getObjectFileInfo()->getDwarfInfoSection()); @@ -648,13 +655,14 @@ static void EmitGenDwarfInfo(MCStreamer *MCOS, // The 2 byte DWARF version. MCOS->EmitIntValue(context.getDwarfVersion(), 2); + const MCAsmInfo &AsmInfo = *context.getAsmInfo(); // The 4 byte offset to the debug abbrevs from the start of the .debug_abbrev, // it is at the start of that section so this is zero. - if (AbbrevSectionSymbol) { - MCOS->EmitSymbolValue(AbbrevSectionSymbol, 4); - } else { + if (AbbrevSectionSymbol == nullptr) MCOS->EmitIntValue(0, 4); - } + else + MCOS->EmitSymbolValue(AbbrevSectionSymbol, 4, + AsmInfo.needsDwarfSectionOffsetDirective()); const MCAsmInfo *asmInfo = context.getAsmInfo(); int AddrSize = asmInfo->getPointerSize(); @@ -674,15 +682,37 @@ static void EmitGenDwarfInfo(MCStreamer *MCOS, MCOS->EmitIntValue(0, 4); } - // AT_low_pc, the first address of the default .text section. - const MCExpr *Start = MCSymbolRefExpr::Create( - context.getGenDwarfSectionStartSym(), MCSymbolRefExpr::VK_None, context); - MCOS->EmitValue(Start, AddrSize); + if (RangesSectionSymbol) { + // There are multiple sections containing code, so we must use the + // .debug_ranges sections. - // AT_high_pc, the last address of the default .text section. - const MCExpr *End = MCSymbolRefExpr::Create( - context.getGenDwarfSectionEndSym(), MCSymbolRefExpr::VK_None, context); - MCOS->EmitValue(End, AddrSize); + // AT_ranges, the 4 byte offset from the start of the .debug_ranges section + // to the address range list for this compilation unit. + MCOS->EmitSymbolValue(RangesSectionSymbol, 4); + } else { + // If we only have one non-empty code section, we can use the simpler + // AT_low_pc and AT_high_pc attributes. 
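To make the aranges size arithmetic above concrete: the DWARF .debug_aranges header is 12 bytes (4-byte unit length, 2-byte version, 4-byte debug_info offset, 1-byte address size, 1-byte segment size), padded so the address/size tuples start on a 2*AddrSize boundary. Worked numbers for a hypothetical 64-bit object with two code sections:

// AddrSize == 8, Sections.size() == 2 (hypothetical):
//   header              12
//   pad to 16          + 4   // next multiple of 2*AddrSize
//   address/size pairs +32   // 2 * 8 * 2, one pair per section
//   terminator         +16   // the final {0, 0} pair
//                      ----
//   Length              64   // the emitted unit-length field is Length - 4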
+ + // Find the first (and only) non-empty text section + auto &Sections = context.getGenDwarfSectionSyms(); + const auto TextSection = Sections.begin(); + assert(TextSection != Sections.end() && "No text section found"); + + MCSymbol *StartSymbol = TextSection->second.first; + MCSymbol *EndSymbol = TextSection->second.second; + assert(StartSymbol && "StartSymbol must not be NULL"); + assert(EndSymbol && "EndSymbol must not be NULL"); + + // AT_low_pc, the first address of the default .text section. + const MCExpr *Start = MCSymbolRefExpr::Create( + StartSymbol, MCSymbolRefExpr::VK_None, context); + MCOS->EmitValue(Start, AddrSize); + + // AT_high_pc, the last address of the default .text section. + const MCExpr *End = MCSymbolRefExpr::Create( + EndSymbol, MCSymbolRefExpr::VK_None, context); + MCOS->EmitValue(End, AddrSize); + } // AT_name, the name of the source file. Reconstruct from the first directory // and file table entries. @@ -711,14 +741,10 @@ static void EmitGenDwarfInfo(MCStreamer *MCOS, // AT_producer, the version of the assembler tool. StringRef DwarfDebugProducer = context.getDwarfDebugProducer(); - if (!DwarfDebugProducer.empty()){ + if (!DwarfDebugProducer.empty()) MCOS->EmitBytes(DwarfDebugProducer); - } - else { - MCOS->EmitBytes(StringRef("llvm-mc (based on LLVM ")); - MCOS->EmitBytes(StringRef(PACKAGE_VERSION)); - MCOS->EmitBytes(StringRef(")")); - } + else + MCOS->EmitBytes(StringRef("llvm-mc (based on LLVM " PACKAGE_VERSION ")")); MCOS->EmitIntValue(0, 1); // NULL byte to terminate the string. // AT_language, a 4 byte value. We use DW_LANG_Mips_Assembler as the dwarf2 @@ -766,13 +792,51 @@ static void EmitGenDwarfInfo(MCStreamer *MCOS, MCOS->EmitLabel(InfoEnd); } +// When generating dwarf for assembly source files this emits the data for +// .debug_ranges section. We only emit one range list, which spans all of the +// executable sections of this file. +static void EmitGenDwarfRanges(MCStreamer *MCOS) { + MCContext &context = MCOS->getContext(); + auto &Sections = context.getGenDwarfSectionSyms(); + + const MCAsmInfo *AsmInfo = context.getAsmInfo(); + int AddrSize = AsmInfo->getPointerSize(); + + MCOS->SwitchSection(context.getObjectFileInfo()->getDwarfRangesSection()); + + for (const auto sec : Sections) { + + MCSymbol *StartSymbol = sec.second.first; + MCSymbol *EndSymbol = sec.second.second; + assert(StartSymbol && "StartSymbol must not be NULL"); + assert(EndSymbol && "EndSymbol must not be NULL"); + + // Emit a base address selection entry for the start of this section + const MCExpr *SectionStartAddr = MCSymbolRefExpr::Create( + StartSymbol, MCSymbolRefExpr::VK_None, context); + MCOS->EmitFill(AddrSize, 0xFF); + MCOS->EmitValue(SectionStartAddr, AddrSize); + + // Emit a range list entry spanning this section + const MCExpr *SectionSize = MakeStartMinusEndExpr(*MCOS, + *StartSymbol, *EndSymbol, 0); + MCOS->EmitIntValue(0, AddrSize); + MCOS->EmitAbsValue(SectionSize, AddrSize); + } + + // Emit end of list entry + MCOS->EmitIntValue(0, AddrSize); + MCOS->EmitIntValue(0, AddrSize); +} + // // When generating dwarf for assembly source files this emits the Dwarf // sections. // void MCGenDwarfInfo::Emit(MCStreamer *MCOS) { - // Create the dwarf sections in this order (.debug_line already created). MCContext &context = MCOS->getContext(); + + // Create the dwarf sections in this order (.debug_line already created). 
const MCAsmInfo *AsmInfo = context.getAsmInfo(); bool CreateDwarfSectionSymbols = AsmInfo->doesDwarfUseRelocationsAcrossSections(); @@ -781,6 +845,22 @@ void MCGenDwarfInfo::Emit(MCStreamer *MCOS) { LineSectionSymbol = MCOS->getDwarfLineTableSymbol(0); MCSymbol *AbbrevSectionSymbol = nullptr; MCSymbol *InfoSectionSymbol = nullptr; + MCSymbol *RangesSectionSymbol = NULL; + + // Create end symbols for each section, and remove empty sections + MCOS->getContext().finalizeDwarfSections(*MCOS); + + // If there are no sections to generate debug info for, we don't need + // to do anything + if (MCOS->getContext().getGenDwarfSectionSyms().empty()) + return; + + // We only need to use the .debug_ranges section if we have multiple + // code sections. + const bool UseRangesSection = + MCOS->getContext().getGenDwarfSectionSyms().size() > 1; + CreateDwarfSectionSymbols |= UseRangesSection; + MCOS->SwitchSection(context.getObjectFileInfo()->getDwarfInfoSection()); if (CreateDwarfSectionSymbols) { InfoSectionSymbol = context.CreateTempSymbol(); @@ -791,20 +871,30 @@ void MCGenDwarfInfo::Emit(MCStreamer *MCOS) { AbbrevSectionSymbol = context.CreateTempSymbol(); MCOS->EmitLabel(AbbrevSectionSymbol); } - MCOS->SwitchSection(context.getObjectFileInfo()->getDwarfARangesSection()); + if (UseRangesSection) { + MCOS->SwitchSection(context.getObjectFileInfo()->getDwarfRangesSection()); + if (CreateDwarfSectionSymbols) { + RangesSectionSymbol = context.CreateTempSymbol(); + MCOS->EmitLabel(RangesSectionSymbol); + } + } - // If there are no line table entries then do not emit any section contents. - if (!context.hasMCLineSections()) - return; + assert((RangesSectionSymbol != NULL) || !UseRangesSection); + + MCOS->SwitchSection(context.getObjectFileInfo()->getDwarfARangesSection()); // Output the data for .debug_aranges section. EmitGenDwarfAranges(MCOS, InfoSectionSymbol); + if (UseRangesSection) + EmitGenDwarfRanges(MCOS); + // Output the data for .debug_abbrev section. EmitGenDwarfAbbrev(MCOS); // Output the data for .debug_info section. - EmitGenDwarfInfo(MCOS, AbbrevSectionSymbol, LineSectionSymbol); + EmitGenDwarfInfo(MCOS, AbbrevSectionSymbol, LineSectionSymbol, + RangesSectionSymbol); } // @@ -815,12 +905,13 @@ void MCGenDwarfInfo::Emit(MCStreamer *MCOS) { // void MCGenDwarfLabelEntry::Make(MCSymbol *Symbol, MCStreamer *MCOS, SourceMgr &SrcMgr, SMLoc &Loc) { - // We won't create dwarf labels for temporary symbols or symbols not in - // the default text. + // We won't create dwarf labels for temporary symbols. if (Symbol->isTemporary()) return; MCContext &context = MCOS->getContext(); - if (context.getGenDwarfSection() != MCOS->getCurrentSection().first) + // We won't create dwarf labels for symbols in sections that we are not + // generating debug info for. + if (!context.getGenDwarfSectionSyms().count(MCOS->getCurrentSection().first)) return; // The dwarf label's name does not have the symbol name's leading @@ -834,7 +925,7 @@ void MCGenDwarfLabelEntry::Make(MCSymbol *Symbol, MCStreamer *MCOS, // Finding the line number is the expensive part which is why we just don't // pass it in as for some symbols we won't create a dwarf label. 
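The policy driving both the ranges emission and the low-pc/high-pc fallback is the one-liner above: count the non-empty code sections. Condensed into a sketch (the names here are illustrative, not MC API):

    #include <cstddef>

    // A single non-empty code section can be described with
    // DW_AT_low_pc/DW_AT_high_pc; two or more need a DW_AT_ranges entry
    // pointing into .debug_ranges.
    enum class CURangeKind { LowHighPC, Ranges };

    static CURangeKind pickRangeKind(size_t NumNonEmptyCodeSections) {
      return NumNonEmptyCodeSections > 1 ? CURangeKind::Ranges
                                         : CURangeKind::LowHighPC;
    }

    int main() { return pickRangeKind(2) == CURangeKind::Ranges ? 0 : 1; }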
- int CurBuffer = SrcMgr.FindBufferContainingLoc(Loc); + unsigned CurBuffer = SrcMgr.FindBufferContainingLoc(Loc); unsigned LineNumber = SrcMgr.FindLineNumber(Loc, CurBuffer); // We create a temporary symbol for use for the AT_high_pc and AT_low_pc @@ -1203,7 +1294,7 @@ void FrameEmitterImpl::EmitCompactUnwind(MCStreamer &Streamer, unsigned FDEEncoding = MOFI->getFDEEncoding(); unsigned Size = getSizeForEncoding(Streamer, FDEEncoding); if (VerboseAsm) Streamer.AddComment("Range Start"); - Streamer.EmitSymbolValue(Frame.Function, Size); + Streamer.EmitSymbolValue(Frame.Begin, Size); // Range Length const MCExpr *Range = MakeStartMinusEndExpr(Streamer, *Frame.Begin, @@ -1246,12 +1337,7 @@ const MCSymbol &FrameEmitterImpl::EmitCIE(MCObjectStreamer &streamer, const MCObjectFileInfo *MOFI = context.getObjectFileInfo(); bool verboseAsm = streamer.isVerboseAsm(); - MCSymbol *sectionStart; - if (MOFI->isFunctionEHFrameSymbolPrivate() || !IsEH) - sectionStart = context.CreateTempSymbol(); - else - sectionStart = context.GetOrCreateSymbol(Twine("EH_frame") + Twine(CIENum)); - + MCSymbol *sectionStart = context.CreateTempSymbol(); streamer.EmitLabel(sectionStart); CIENum++; @@ -1270,7 +1356,10 @@ const MCSymbol &FrameEmitterImpl::EmitCIE(MCObjectStreamer &streamer, // Version if (verboseAsm) streamer.AddComment("DW_CIE_VERSION"); - streamer.EmitIntValue(dwarf::DW_CIE_VERSION, 1); + // For DWARF2, we use CIE version 1 + // For DWARF3+, we use CIE version 3 + uint8_t CIEVersion = context.getDwarfVersion() <= 2 ? 1 : 3; + streamer.EmitIntValue(CIEVersion, 1); // Augmentation String SmallString<8> Augmentation; @@ -1298,7 +1387,14 @@ const MCSymbol &FrameEmitterImpl::EmitCIE(MCObjectStreamer &streamer, // Return Address Register if (verboseAsm) streamer.AddComment("CIE Return Address Column"); - streamer.EmitULEB128IntValue(MRI->getDwarfRegNum(MRI->getRARegister(), true)); + if (CIEVersion == 1) { + assert(MRI->getRARegister() <= 255 && + "DWARF 2 encodes return_address_register in one byte"); + streamer.EmitIntValue(MRI->getDwarfRegNum(MRI->getRARegister(), true), 1); + } else { + streamer.EmitULEB128IntValue( + MRI->getDwarfRegNum(MRI->getRARegister(), true)); + } // Augmentation Data Length (optional) @@ -1360,13 +1456,6 @@ MCSymbol *FrameEmitterImpl::EmitFDE(MCObjectStreamer &streamer, const MCObjectFileInfo *MOFI = context.getObjectFileInfo(); bool verboseAsm = streamer.isVerboseAsm(); - if (IsEH && frame.Function && !MOFI->isFunctionEHFrameSymbolPrivate()) { - MCSymbol *EHSym = - context.GetOrCreateSymbol(frame.Function->getName() + Twine(".eh")); - streamer.EmitEHSymAttributes(frame.Function, EHSym); - streamer.EmitLabel(EHSym); - } - // Length const MCExpr *Length = MakeStartMinusEndExpr(streamer, *fdeStart, *fdeEnd, 0); if (verboseAsm) streamer.AddComment("FDE Length"); @@ -1435,13 +1524,12 @@ namespace { return CIEKey(nullptr, -1, 0, false, false); } - CIEKey(const MCSymbol* Personality_, unsigned PersonalityEncoding_, - unsigned LsdaEncoding_, bool IsSignalFrame_, bool IsSimple_) : - Personality(Personality_), PersonalityEncoding(PersonalityEncoding_), - LsdaEncoding(LsdaEncoding_), IsSignalFrame(IsSignalFrame_), - IsSimple(IsSimple_) { - } - const MCSymbol* Personality; + CIEKey(const MCSymbol *Personality_, unsigned PersonalityEncoding_, + unsigned LsdaEncoding_, bool IsSignalFrame_, bool IsSimple_) + : Personality(Personality_), PersonalityEncoding(PersonalityEncoding_), + LsdaEncoding(LsdaEncoding_), IsSignalFrame(IsSignalFrame_), + IsSimple(IsSimple_) {} + const MCSymbol *Personality; 
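The EmitCIE hunk above ties two encodings together: a DWARF v2 compilation gets a version 1 CIE whose return_address_register is a single byte, while v3 and later get a version 3 CIE with a ULEB128 register number. A standalone sketch of both paths, with the ULEB128 encoder written out rather than taken from llvm/Support:

    #include <cassert>
    #include <cstdint>
    #include <vector>

    static void encodeULEB128(uint64_t V, std::vector<uint8_t> &Out) {
      do {
        uint8_t Byte = V & 0x7f;
        V >>= 7;
        if (V) Byte |= 0x80; // more bytes follow
        Out.push_back(Byte);
      } while (V);
    }

    // DWARF2 -> CIE version 1 (one-byte RA register); DWARF3+ -> version 3
    // (ULEB128 RA register), matching the logic added to EmitCIE.
    static void emitRAColumn(unsigned DwarfVersion, uint64_t RAReg,
                             std::vector<uint8_t> &Out) {
      uint8_t CIEVersion = DwarfVersion <= 2 ? 1 : 3;
      Out.push_back(CIEVersion);
      if (CIEVersion == 1) {
        assert(RAReg <= 255 &&
               "DWARF 2 encodes return_address_register in one byte");
        Out.push_back(static_cast<uint8_t>(RAReg));
      } else {
        encodeULEB128(RAReg, Out);
      }
    }

    int main() {
      std::vector<uint8_t> Out;
      emitRAColumn(2, 16, Out); // version byte 1, then a raw register byte
      emitRAColumn(4, 16, Out); // version byte 3, then a ULEB128 register
      return Out.size() == 4 ? 0 : 1;
    }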
unsigned PersonalityEncoding; unsigned LsdaEncoding; bool IsSignalFrame; @@ -1483,7 +1571,7 @@ void MCDwarfFrameEmitter::Emit(MCObjectStreamer &Streamer, MCAsmBackend *MAB, MCContext &Context = Streamer.getContext(); const MCObjectFileInfo *MOFI = Context.getObjectFileInfo(); FrameEmitterImpl Emitter(IsEH); - ArrayRef FrameArray = Streamer.getFrameInfos(); + ArrayRef FrameArray = Streamer.getDwarfFrameInfos(); // Emit the compact unwind info if available. bool NeedsEHFrameSection = !MOFI->getSupportsCompactUnwindWithoutEHFrame(); @@ -1516,7 +1604,7 @@ void MCDwarfFrameEmitter::Emit(MCObjectStreamer &Streamer, MCAsmBackend *MAB, Emitter.setSectionStart(SectionStart); MCSymbol *FDEEnd = nullptr; - DenseMap CIEStarts; + DenseMap CIEStarts; const MCSymbol *DummyDebugKey = nullptr; NeedsEHFrameSection = !MOFI->getSupportsCompactUnwindWithoutEHFrame(); diff --git a/lib/MC/MCELF.cpp b/lib/MC/MCELF.cpp index 0a9cd31dda00..386c2099f2e2 100644 --- a/lib/MC/MCELF.cpp +++ b/lib/MC/MCELF.cpp @@ -61,7 +61,7 @@ void MCELF::SetVisibility(MCSymbolData &SD, unsigned Visibility) { SD.setFlags(OtherFlags | (Visibility << ELF_STV_Shift)); } -unsigned MCELF::GetVisibility(MCSymbolData &SD) { +unsigned MCELF::GetVisibility(const MCSymbolData &SD) { unsigned Visibility = (SD.getFlags() & (0x3 << ELF_STV_Shift)) >> ELF_STV_Shift; assert(Visibility == ELF::STV_DEFAULT || Visibility == ELF::STV_INTERNAL || @@ -76,7 +76,7 @@ void MCELF::setOther(MCSymbolData &SD, unsigned Other) { SD.setFlags(OtherFlags | (Other << ELF_STO_Shift)); } -unsigned MCELF::getOther(MCSymbolData &SD) { +unsigned MCELF::getOther(const MCSymbolData &SD) { unsigned Other = (SD.getFlags() & (0x3f << ELF_STO_Shift)) >> ELF_STO_Shift; return Other; diff --git a/lib/MC/MCELFObjectTargetWriter.cpp b/lib/MC/MCELFObjectTargetWriter.cpp index 4012c442e4f6..84176dc1e907 100644 --- a/lib/MC/MCELFObjectTargetWriter.cpp +++ b/lib/MC/MCELFObjectTargetWriter.cpp @@ -24,6 +24,7 @@ MCELFObjectTargetWriter::MCELFObjectTargetWriter(bool Is64Bit_, IsN64(IsN64_){ } -bool MCELFObjectTargetWriter::needsRelocateWithSymbol(unsigned Type) const { +bool MCELFObjectTargetWriter::needsRelocateWithSymbol(const MCSymbolData &SD, + unsigned Type) const { return false; } diff --git a/lib/MC/MCELFStreamer.cpp b/lib/MC/MCELFStreamer.cpp index 767348ca4471..7c70540dd5b9 100644 --- a/lib/MC/MCELFStreamer.cpp +++ b/lib/MC/MCELFStreamer.cpp @@ -65,10 +65,6 @@ void MCELFStreamer::EmitLabel(MCSymbol *Symbol) { MCELF::SetType(SD, ELF::STT_TLS); } -void MCELFStreamer::EmitDebugLabel(MCSymbol *Symbol) { - EmitLabel(Symbol); -} - void MCELFStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) { // Let the target do whatever target specific stuff it needs to do. 
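The MCELF accessors above only gain const-correct signatures, but the scheme they wrap is worth spelling out: symbol visibility and the target-specific st_other bits are packed into one flags word at fixed shifts. A minimal model of that packing; the shift values below are illustrative, not the real ELF_STV_Shift/ELF_STO_Shift:

    #include <cstdint>

    constexpr unsigned STV_Shift = 5; // 2 bits of visibility (illustrative)
    constexpr unsigned STO_Shift = 7; // 6 bits of target "other" (illustrative)

    static uint32_t setVisibility(uint32_t Flags, uint32_t Visibility) {
      Flags &= ~(0x3u << STV_Shift);            // clear the old field
      return Flags | (Visibility << STV_Shift); // store the new one
    }

    static uint32_t getVisibility(uint32_t Flags) {
      return (Flags >> STV_Shift) & 0x3; // STV_DEFAULT..STV_PROTECTED
    }

    int main() { return getVisibility(setVisibility(0, 2)) == 2 ? 0 : 1; }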
getAssembler().getBackend().handleAssemblerFlag(Flag); diff --git a/lib/MC/MCMachOStreamer.cpp b/lib/MC/MCMachOStreamer.cpp index 37d05e910a4a..9e8bc942e494 100644 --- a/lib/MC/MCMachOStreamer.cpp +++ b/lib/MC/MCMachOStreamer.cpp @@ -60,7 +60,6 @@ class MCMachOStreamer : public MCObjectStreamer { void ChangeSection(const MCSection *Sect, const MCExpr *Subsect) override; void EmitLabel(MCSymbol *Symbol) override; - void EmitDebugLabel(MCSymbol *Symbol) override; void EmitEHSymAttributes(const MCSymbol *Symbol, MCSymbol *EHSymbol) override; void EmitAssemblerFlag(MCAssemblerFlag Flag) override; void EmitLinkerOptions(ArrayRef Options) override; @@ -162,9 +161,6 @@ void MCMachOStreamer::EmitLabel(MCSymbol *Symbol) { SD.setFlags(SD.getFlags() & ~SF_ReferenceTypeMask); } -void MCMachOStreamer::EmitDebugLabel(MCSymbol *Symbol) { - EmitLabel(Symbol); -} void MCMachOStreamer::EmitDataRegion(DataRegionData::KindTy Kind) { if (!getAssembler().getBackend().hasDataInCodeSupport()) return; diff --git a/lib/MC/MCNullStreamer.cpp b/lib/MC/MCNullStreamer.cpp index 4f2740ed3ae9..d5434023db72 100644 --- a/lib/MC/MCNullStreamer.cpp +++ b/lib/MC/MCNullStreamer.cpp @@ -24,83 +24,17 @@ namespace { /// @name MCStreamer Interface /// @{ - void ChangeSection(const MCSection *Section, - const MCExpr *Subsection) override { - } - - void EmitLabel(MCSymbol *Symbol) override { - assert(Symbol->isUndefined() && "Cannot define a symbol twice!"); - assert(getCurrentSection().first &&"Cannot emit before setting section!"); - AssignSection(Symbol, getCurrentSection().first); - } - void EmitDebugLabel(MCSymbol *Symbol) override { - EmitLabel(Symbol); - } - void EmitAssemblerFlag(MCAssemblerFlag Flag) override {} - void EmitThumbFunc(MCSymbol *Func) override {} - - void EmitAssignment(MCSymbol *Symbol, const MCExpr *Value) override {} - void EmitWeakReference(MCSymbol *Alias, const MCSymbol *Symbol) override {} bool EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute) override { return true; } - void EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) override {} - - void BeginCOFFSymbolDef(const MCSymbol *Symbol) override {} - void EmitCOFFSymbolStorageClass(int StorageClass) override {} - void EmitCOFFSymbolType(int Type) override {} - void EndCOFFSymbolDef() override {} void EmitCOFFSecRel32(MCSymbol const *Symbol) override {} - - void EmitELFSize(MCSymbol *Symbol, const MCExpr *Value) override {} void EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size, unsigned ByteAlignment) override {} - void EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size, - unsigned ByteAlignment) override {} void EmitZerofill(const MCSection *Section, MCSymbol *Symbol = nullptr, uint64_t Size = 0, unsigned ByteAlignment = 0) override {} - void EmitTBSSSymbol(const MCSection *Section, MCSymbol *Symbol, - uint64_t Size, unsigned ByteAlignment) override {} - void EmitBytes(StringRef Data) override {} - - void EmitValueImpl(const MCExpr *Value, unsigned Size, - const SMLoc &Loc = SMLoc()) override {} - void EmitULEB128Value(const MCExpr *Value) override {} - void EmitSLEB128Value(const MCExpr *Value) override {} void EmitGPRel32Value(const MCExpr *Value) override {} - void EmitValueToAlignment(unsigned ByteAlignment, int64_t Value = 0, - unsigned ValueSize = 1, - unsigned MaxBytesToEmit = 0) override {} - - void EmitCodeAlignment(unsigned ByteAlignment, - unsigned MaxBytesToEmit = 0) override {} - - bool EmitValueToOffset(const MCExpr *Offset, - unsigned char Value = 0) override { return false; } - - void EmitFileDirective(StringRef 
Filename) override {} - unsigned EmitDwarfFileDirective(unsigned FileNo, StringRef Directory, - StringRef Filename, - unsigned CUID = 0) override { - return 0; - } - void EmitDwarfLocDirective(unsigned FileNo, unsigned Line, - unsigned Column, unsigned Flags, - unsigned Isa, unsigned Discriminator, - StringRef FileName) override {} - void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo&) override {} - - void EmitBundleAlignMode(unsigned AlignPow2) override {} - void EmitBundleLock(bool AlignToEnd) override {} - void EmitBundleUnlock() override {} - - void FinishImpl() override {} - - void EmitCFIEndProcImpl(MCDwarfFrameInfo &Frame) override { - RecordProcEnd(Frame); - } }; } diff --git a/lib/MC/MCObjectFileInfo.cpp b/lib/MC/MCObjectFileInfo.cpp index d12f60c09920..d490ef30b692 100644 --- a/lib/MC/MCObjectFileInfo.cpp +++ b/lib/MC/MCObjectFileInfo.cpp @@ -18,9 +18,29 @@ #include "llvm/MC/MCSectionMachO.h" using namespace llvm; +static bool useCompactUnwind(const Triple &T) { + // Only on darwin. + if (!T.isOSDarwin()) + return false; + + // aarch64 always has it. + if (T.getArch() == Triple::arm64 || T.getArch() == Triple::aarch64) + return true; + + // Use it on newer version of OS X. + if (T.isMacOSX() && !T.isMacOSXVersionLT(10, 6)) + return true; + + // And the iOS simulator. + if (T.isiOS() && + (T.getArch() == Triple::x86_64 || T.getArch() == Triple::x86)) + return true; + + return false; +} + void MCObjectFileInfo::InitMachOMCObjectFileInfo(Triple T) { // MachO - IsFunctionEHFrameSymbolPrivate = false; SupportsWeakOmittedEHFrame = false; if (T.isOSDarwin() && @@ -151,13 +171,10 @@ void MCObjectFileInfo::InitMachOMCObjectFileInfo(Triple T) { COFFDebugSymbolsSection = nullptr; - if ((T.isMacOSX() && !T.isMacOSXVersionLT(10, 6)) || - (T.isOSDarwin() && - (T.getArch() == Triple::arm64 || T.getArch() == Triple::aarch64))) { + if (useCompactUnwind(T)) { CompactUnwindSection = - Ctx->getMachOSection("__LD", "__compact_unwind", - MachO::S_ATTR_DEBUG, - SectionKind::getReadOnly()); + Ctx->getMachOSection("__LD", "__compact_unwind", MachO::S_ATTR_DEBUG, + SectionKind::getReadOnly()); if (T.getArch() == Triple::x86_64 || T.getArch() == Triple::x86) CompactUnwindDwarfEHFrameOnly = 0x04000000; @@ -632,11 +649,16 @@ void MCObjectFileInfo::InitCOFFMCObjectFileInfo(Triple T) { // though it contains relocatable pointers. In PIC mode, this is probably a // big runtime hit for C++ apps. Either the contents of the LSDA need to be // adjusted or this should be a data section. - LSDASection = - Ctx->getCOFFSection(".gcc_except_table", - COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | - COFF::IMAGE_SCN_MEM_READ, - SectionKind::getReadOnly()); + assert(T.isOSWindows() && "Windows is the only supported COFF target"); + if (T.getArch() == Triple::x86_64) { + // On Windows 64 with SEH, the LSDA is emitted into the .xdata section + LSDASection = 0; + } else { + LSDASection = Ctx->getCOFFSection(".gcc_except_table", + COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | + COFF::IMAGE_SCN_MEM_READ, + SectionKind::getReadOnly()); + } // Debug info. COFFDebugSymbolsSection = @@ -774,7 +796,7 @@ void MCObjectFileInfo::InitCOFFMCObjectFileInfo(Triple T) { SectionKind::getDataRel()); } -void MCObjectFileInfo::InitMCObjectFileInfo(StringRef TT, Reloc::Model relocm, +void MCObjectFileInfo::InitMCObjectFileInfo(StringRef T, Reloc::Model relocm, CodeModel::Model cm, MCContext &ctx) { RelocM = relocm; @@ -784,7 +806,6 @@ void MCObjectFileInfo::InitMCObjectFileInfo(StringRef TT, Reloc::Model relocm, // Common. 
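useCompactUnwind above replaces a condition that had grown unwieldy at its call site. The same decision tree, sketched against a simplified triple rather than llvm::Triple:

    struct TripleInfo {
      bool IsOSDarwin, IsAArch64, IsMacOSXAtLeast10_6, IsIOSSimulatorX86;
    };

    // Compact unwind is Darwin-only: always for arm64/aarch64, on OS X 10.6
    // and newer for the desktop archs, and for the x86/x86_64 iOS simulator.
    static bool useCompactUnwindSketch(const TripleInfo &T) {
      if (!T.IsOSDarwin)
        return false;
      if (T.IsAArch64)
        return true;
      if (T.IsMacOSXAtLeast10_6)
        return true;
      return T.IsIOSSimulatorX86;
    }

    int main() {
      TripleInfo T{true, false, true, false};
      return useCompactUnwindSketch(T) ? 0 : 1;
    }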
CommDirectiveSupportsAlignment = true; SupportsWeakOmittedEHFrame = true; - IsFunctionEHFrameSymbolPrivate = true; SupportsCompactUnwindWithoutEHFrame = false; PersonalityEncoding = LSDAEncoding = FDECFIEncoding = TTypeEncoding = @@ -799,8 +820,9 @@ void MCObjectFileInfo::InitMCObjectFileInfo(StringRef TT, Reloc::Model relocm, DwarfAccelNamespaceSection = nullptr; // Used only by selected targets. DwarfAccelTypesSection = nullptr; // Used only by selected targets. - Triple T(TT); - Triple::ArchType Arch = T.getArch(); + TT = Triple(T); + + Triple::ArchType Arch = TT.getArch(); // FIXME: Checking for Arch here to filter out bogus triples such as // cellspu-apple-darwin. Perhaps we should fix in Triple? if ((Arch == Triple::x86 || Arch == Triple::x86_64 || @@ -808,17 +830,17 @@ void MCObjectFileInfo::InitMCObjectFileInfo(StringRef TT, Reloc::Model relocm, Arch == Triple::arm64 || Arch == Triple::aarch64 || Arch == Triple::ppc || Arch == Triple::ppc64 || Arch == Triple::UnknownArch) && - (T.isOSDarwin() || T.isOSBinFormatMachO())) { + (TT.isOSDarwin() || TT.isOSBinFormatMachO())) { Env = IsMachO; - InitMachOMCObjectFileInfo(T); + InitMachOMCObjectFileInfo(TT); } else if ((Arch == Triple::x86 || Arch == Triple::x86_64 || Arch == Triple::arm || Arch == Triple::thumb) && - (T.isOSWindows() && T.getObjectFormat() == Triple::COFF)) { + (TT.isOSWindows() && TT.getObjectFormat() == Triple::COFF)) { Env = IsCOFF; - InitCOFFMCObjectFileInfo(T); + InitCOFFMCObjectFileInfo(TT); } else { Env = IsELF; - InitELFMCObjectFileInfo(T); + InitELFMCObjectFileInfo(TT); } } diff --git a/lib/MC/MCObjectStreamer.cpp b/lib/MC/MCObjectStreamer.cpp index a1aa60283cd0..a721b59bd111 100644 --- a/lib/MC/MCObjectStreamer.cpp +++ b/lib/MC/MCObjectStreamer.cpp @@ -83,32 +83,8 @@ MCDataFragment *MCObjectStreamer::getOrCreateDataFragment() const { return F; } -const MCExpr *MCObjectStreamer::AddValueSymbols(const MCExpr *Value) { - switch (Value->getKind()) { - case MCExpr::Target: - cast(Value)->AddValueSymbols(Assembler); - break; - - case MCExpr::Constant: - break; - - case MCExpr::Binary: { - const MCBinaryExpr *BE = cast(Value); - AddValueSymbols(BE->getLHS()); - AddValueSymbols(BE->getRHS()); - break; - } - - case MCExpr::SymbolRef: - Assembler->getOrCreateSymbolData(cast(Value)->getSymbol()); - break; - - case MCExpr::Unary: - AddValueSymbols(cast(Value)->getSubExpr()); - break; - } - - return Value; +void MCObjectStreamer::visitUsedSymbol(const MCSymbol &Sym) { + Assembler->getOrCreateSymbolData(Sym); } void MCObjectStreamer::EmitCFISections(bool EH, bool Debug) { @@ -119,13 +95,14 @@ void MCObjectStreamer::EmitCFISections(bool EH, bool Debug) { void MCObjectStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size, const SMLoc &Loc) { + MCStreamer::EmitValueImpl(Value, Size, Loc); MCDataFragment *DF = getOrCreateDataFragment(); MCLineEntry::Make(this, getCurrentSection().first); // Avoid fixups when possible. int64_t AbsValue; - if (AddValueSymbols(Value)->EvaluateAsAbsolute(AbsValue, getAssembler())) { + if (Value->EvaluateAsAbsolute(AbsValue, getAssembler())) { EmitIntValue(AbsValue, Size); return; } @@ -136,11 +113,14 @@ void MCObjectStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size, } void MCObjectStreamer::EmitCFIStartProcImpl(MCDwarfFrameInfo &Frame) { - RecordProcStart(Frame); + // We need to create a local symbol to avoid relocations. 
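Deleting AddValueSymbols works because registering a value's symbols no longer needs a bespoke expression walk inside the object streamer: a generic visit over the expression hands each referenced symbol to visitUsedSymbol. A toy version of that shape, with Expr standing in for MCExpr:

    #include <functional>
    #include <memory>
    #include <string>
    #include <vector>

    struct Expr {
      enum Kind { Constant, SymbolRef, Unary, Binary } K;
      std::string Symbol;                     // valid when K == SymbolRef
      std::vector<std::unique_ptr<Expr>> Ops; // children for Unary/Binary
    };

    // Generic walk: find every symbol an expression references and hand it
    // to a callback; the streamer subclass supplies only the per-symbol hook.
    static void visitUsedSymbols(const Expr &E,
                                 const std::function<void(const std::string &)> &F) {
      if (E.K == Expr::SymbolRef)
        F(E.Symbol);
      for (const auto &Op : E.Ops)
        visitUsedSymbols(*Op, F);
    }

    int main() {
      Expr Sym{Expr::SymbolRef, "foo", {}};
      int Seen = 0;
      visitUsedSymbols(Sym, [&](const std::string &) { ++Seen; });
      return Seen == 1 ? 0 : 1;
    }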
+ Frame.Begin = getContext().CreateTempSymbol(); + EmitLabel(Frame.Begin); } void MCObjectStreamer::EmitCFIEndProcImpl(MCDwarfFrameInfo &Frame) { - RecordProcEnd(Frame); + Frame.End = getContext().CreateTempSymbol(); + EmitLabel(Frame.End); } void MCObjectStreamer::EmitLabel(MCSymbol *Symbol) { @@ -158,10 +138,6 @@ void MCObjectStreamer::EmitLabel(MCSymbol *Symbol) { SD.setOffset(F->getContents().size()); } -void MCObjectStreamer::EmitDebugLabel(MCSymbol *Symbol) { - EmitLabel(Symbol); -} - void MCObjectStreamer::EmitULEB128Value(const MCExpr *Value) { int64_t IntValue; if (Value->EvaluateAsAbsolute(IntValue, getAssembler())) { @@ -205,15 +181,12 @@ void MCObjectStreamer::ChangeSection(const MCSection *Section, void MCObjectStreamer::EmitAssignment(MCSymbol *Symbol, const MCExpr *Value) { getAssembler().getOrCreateSymbolData(*Symbol); - AddValueSymbols(Value); MCStreamer::EmitAssignment(Symbol, Value); } -void MCObjectStreamer::EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI) { - // Scan for values. - for (unsigned i = Inst.getNumOperands(); i--; ) - if (Inst.getOperand(i).isExpr()) - AddValueSymbols(Inst.getOperand(i).getExpr()); +void MCObjectStreamer::EmitInstruction(const MCInst &Inst, + const MCSubtargetInfo &STI) { + MCStreamer::EmitInstruction(Inst, STI); MCSectionData *SD = getCurrentSectionData(); SD->setHasInstructions(true); diff --git a/lib/MC/MCParser/AsmLexer.cpp b/lib/MC/MCParser/AsmLexer.cpp index bca516eca027..145ad4a56123 100644 --- a/lib/MC/MCParser/AsmLexer.cpp +++ b/lib/MC/MCParser/AsmLexer.cpp @@ -22,7 +22,6 @@ using namespace llvm; AsmLexer::AsmLexer(const MCAsmInfo &_MAI) : MAI(_MAI) { - CurBuf = nullptr; CurPtr = nullptr; isAtStartOfLine = true; AllowAtInIdentifier = !StringRef(MAI.getCommentString()).startswith("@"); @@ -31,13 +30,13 @@ AsmLexer::AsmLexer(const MCAsmInfo &_MAI) : MAI(_MAI) { AsmLexer::~AsmLexer() { } -void AsmLexer::setBuffer(const MemoryBuffer *buf, const char *ptr) { - CurBuf = buf; +void AsmLexer::setBuffer(StringRef Buf, const char *ptr) { + CurBuf = Buf; if (ptr) CurPtr = ptr; else - CurPtr = CurBuf->getBufferStart(); + CurPtr = CurBuf.begin(); TokStart = nullptr; } @@ -58,7 +57,7 @@ int AsmLexer::getNextChar() { case 0: // A nul character in the stream is either the end of the current buffer or // a random nul in the file. Disambiguate that here. - if (CurPtr-1 != CurBuf->getBufferEnd()) + if (CurPtr - 1 != CurBuf.end()) return 0; // Just whitespace. // Otherwise, return end of file. @@ -201,8 +200,8 @@ AsmToken AsmLexer::LexLineComment() { CurChar = getNextChar(); if (CurChar == EOF) - return AsmToken(AsmToken::Eof, StringRef(CurPtr, 0)); - return AsmToken(AsmToken::EndOfStatement, StringRef(CurPtr, 0)); + return AsmToken(AsmToken::Eof, StringRef(TokStart, 0)); + return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 0)); } static void SkipIgnoredIntegerSuffix(const char *&CurPtr) { @@ -420,9 +419,8 @@ StringRef AsmLexer::LexUntilEndOfStatement() { while (!isAtStartOfComment(*CurPtr) && // Start of line comment. !isAtStatementSeparator(CurPtr) && // End of statement marker. 
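The AsmLexer hunks swap the MemoryBuffer pointer for a StringRef, and the subtle invariant they preserve is the nul handling in getNextChar: a '\0' is end-of-input only when the cursor has run one past the end of the buffer; otherwise it is an embedded nul treated as whitespace. A sketch with std::string_view, assuming (as MemoryBuffer guarantees) a nul-terminated backing buffer:

    #include <string_view>

    // Returns -1 (EOF) only for the terminating nul one past the buffer end;
    // an embedded '\0' inside the buffer is just skippable garbage.
    static int nextChar(std::string_view Buf, const char *&CurPtr) {
      char C = *CurPtr++;
      if (C != '\0')
        return (unsigned char)C;
      if (CurPtr - 1 != Buf.data() + Buf.size())
        return 0;  // random nul in the stream: whitespace
      --CurPtr;    // stay parked at the end
      return -1;   // real end of file
    }

    int main() {
      const char Raw[] = "a\0b";        // embedded nul, then a real end
      std::string_view Buf(Raw, 3);     // the view excludes the terminator
      const char *P = Buf.data();
      while (nextChar(Buf, P) != -1) {} // stops at the terminating nul
    }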
- *CurPtr != '\n' && - *CurPtr != '\r' && - (*CurPtr != 0 || CurPtr != CurBuf->getBufferEnd())) { + *CurPtr != '\n' && *CurPtr != '\r' && + (*CurPtr != 0 || CurPtr != CurBuf.end())) { ++CurPtr; } return StringRef(TokStart, CurPtr-TokStart); @@ -431,9 +429,8 @@ StringRef AsmLexer::LexUntilEndOfStatement() { StringRef AsmLexer::LexUntilEndOfLine() { TokStart = CurPtr; - while (*CurPtr != '\n' && - *CurPtr != '\r' && - (*CurPtr != 0 || CurPtr != CurBuf->getBufferEnd())) { + while (*CurPtr != '\n' && *CurPtr != '\r' && + (*CurPtr != 0 || CurPtr != CurBuf.end())) { ++CurPtr; } return StringRef(TokStart, CurPtr-TokStart); diff --git a/lib/MC/MCParser/AsmParser.cpp b/lib/MC/MCParser/AsmParser.cpp index cbff7beccae0..ed1d704c1d67 100644 --- a/lib/MC/MCParser/AsmParser.cpp +++ b/lib/MC/MCParser/AsmParser.cpp @@ -133,7 +133,7 @@ class AsmParser : public MCAsmParser { /// This is the current buffer index we're lexing from as managed by the /// SourceMgr object. - int CurBuffer; + unsigned CurBuffer; AsmCond TheCondState; std::vector TheCondStack; @@ -162,13 +162,13 @@ class AsmParser : public MCAsmParser { StringRef CppHashFilename; int64_t CppHashLineNumber; SMLoc CppHashLoc; - int CppHashBuf; + unsigned CppHashBuf; /// When generating dwarf for assembly source files we need to calculate the /// logical line number based on the last parsed cpp hash file line comment /// and current line. Since this is slow and messes up the SourceMgr's /// cache we save the last info we queried with SrcMgr.FindLineNumber(). SMLoc LastQueryIDLoc; - int LastQueryBuffer; + unsigned LastQueryBuffer; unsigned LastQueryLine; /// AssemblerDialect. ~OU means unset value and use value provided by MAI. @@ -310,9 +310,9 @@ class AsmParser : public MCAsmParser { /// current token is not set; clients should ensure Lex() is called /// subsequently. /// - /// \param InBuffer If not -1, should be the known buffer id that contains the + /// \param InBuffer If not 0, should be the known buffer id that contains the /// location. - void jumpToLoc(SMLoc Loc, int InBuffer=-1); + void jumpToLoc(SMLoc Loc, unsigned InBuffer = 0); /// \brief Parse up to the end of statement and a return the contents from the /// current token until the end of the statement; the current token on exit @@ -345,8 +345,9 @@ class AsmParser : public MCAsmParser { DK_REFERENCE, DK_WEAK_DEFINITION, DK_WEAK_REFERENCE, DK_WEAK_DEF_CAN_BE_HIDDEN, DK_COMM, DK_COMMON, DK_LCOMM, DK_ABORT, DK_INCLUDE, DK_INCBIN, DK_CODE16, DK_CODE16GCC, DK_REPT, DK_IRP, DK_IRPC, - DK_IF, DK_IFNE, DK_IFB, DK_IFNB, DK_IFC, DK_IFEQS, DK_IFNC, DK_IFDEF, - DK_IFNDEF, DK_IFNOTDEF, DK_ELSEIF, DK_ELSE, DK_ENDIF, + DK_IF, DK_IFEQ, DK_IFGE, DK_IFGT, DK_IFLE, DK_IFLT, DK_IFNE, DK_IFB, + DK_IFNB, DK_IFC, DK_IFEQS, DK_IFNC, DK_IFDEF, DK_IFNDEF, DK_IFNOTDEF, + DK_ELSEIF, DK_ELSE, DK_ENDIF, DK_SPACE, DK_SKIP, DK_FILE, DK_LINE, DK_LOC, DK_STABS, DK_CFI_SECTIONS, DK_CFI_STARTPROC, DK_CFI_ENDPROC, DK_CFI_DEF_CFA, DK_CFI_DEF_CFA_OFFSET, DK_CFI_ADJUST_CFA_OFFSET, DK_CFI_DEF_CFA_REGISTER, @@ -433,8 +434,8 @@ class AsmParser : public MCAsmParser { bool parseDirectiveInclude(); // ".include" bool parseDirectiveIncbin(); // ".incbin" - // ".if" or ".ifne" - bool parseDirectiveIf(SMLoc DirectiveLoc); + // ".if", ".ifeq", ".ifge", ".ifgt" , ".ifle", ".iflt" or ".ifne" + bool parseDirectiveIf(SMLoc DirectiveLoc, DirectiveKind DirKind); // ".ifb" or ".ifnb", depending on ExpectBlank. bool parseDirectiveIfb(SMLoc DirectiveLoc, bool ExpectBlank); // ".ifc" or ".ifnc", depending on ExpectEqual. 
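This and the surrounding AsmParser hunks move buffer IDs from int with -1 as the failure value to unsigned with 0 invalid, matching the SourceMgr convention that valid IDs start at 1. The caller-side pattern, sketched with a hypothetical stand-in for SourceMgr::AddIncludeFile:

    #include <string>

    // Hypothetical stand-in for SourceMgr::AddIncludeFile under the new
    // convention: valid buffer IDs start at 1, 0 signals failure.
    static unsigned addIncludeFile(const std::string &) { return 0; } // stub

    static bool enterInclude(const std::string &Filename, unsigned &CurBuffer) {
      unsigned NewBuf = addIncludeFile(Filename);
      if (!NewBuf)          // 0 is the invalid ID: no more "== -1" on an int
        return true;        // parser convention: true means failure
      CurBuffer = NewBuf;   // switch the lexer to the included buffer
      return false;
    }

    int main() {
      unsigned CurBuffer = 1;
      return enterInclude("missing.inc", CurBuffer) ? 0 : 1;
    }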
@@ -490,15 +491,15 @@ enum { DEFAULT_ADDRSPACE = 0 }; AsmParser::AsmParser(SourceMgr &_SM, MCContext &_Ctx, MCStreamer &_Out, const MCAsmInfo &_MAI) : Lexer(_MAI), Ctx(_Ctx), Out(_Out), MAI(_MAI), SrcMgr(_SM), - PlatformParser(nullptr), CurBuffer(0), MacrosEnabledFlag(true), - HadError(false), CppHashLineNumber(0), AssemblerDialect(~0U), - IsDarwin(false), ParsingInlineAsm(false) { + PlatformParser(nullptr), CurBuffer(_SM.getMainFileID()), + MacrosEnabledFlag(true), HadError(false), CppHashLineNumber(0), + AssemblerDialect(~0U), IsDarwin(false), ParsingInlineAsm(false) { // Save the old handler. SavedDiagHandler = SrcMgr.getDiagHandler(); SavedDiagContext = SrcMgr.getDiagContext(); // Set our own handler which calls the saved handler. SrcMgr.setDiagHandler(DiagHandler, this); - Lexer.setBuffer(SrcMgr.getMemoryBuffer(CurBuffer)); + Lexer.setBuffer(SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer()); // Initialize the platform / file format parser. switch (_Ctx.getObjectFileInfo()->getObjectFileType()) { @@ -565,14 +566,13 @@ bool AsmParser::Error(SMLoc L, const Twine &Msg, ArrayRef Ranges) { bool AsmParser::enterIncludeFile(const std::string &Filename) { std::string IncludedFile; - int NewBuf = SrcMgr.AddIncludeFile(Filename, Lexer.getLoc(), IncludedFile); - if (NewBuf == -1) + unsigned NewBuf = + SrcMgr.AddIncludeFile(Filename, Lexer.getLoc(), IncludedFile); + if (!NewBuf) return true; CurBuffer = NewBuf; - - Lexer.setBuffer(SrcMgr.getMemoryBuffer(CurBuffer)); - + Lexer.setBuffer(SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer()); return false; } @@ -581,8 +581,9 @@ bool AsmParser::enterIncludeFile(const std::string &Filename) { /// returns true on failure. bool AsmParser::processIncbinFile(const std::string &Filename) { std::string IncludedFile; - int NewBuf = SrcMgr.AddIncludeFile(Filename, Lexer.getLoc(), IncludedFile); - if (NewBuf == -1) + unsigned NewBuf = + SrcMgr.AddIncludeFile(Filename, Lexer.getLoc(), IncludedFile); + if (!NewBuf) return true; // Pick up the bytes from the file and emit them. @@ -590,13 +591,10 @@ bool AsmParser::processIncbinFile(const std::string &Filename) { return false; } -void AsmParser::jumpToLoc(SMLoc Loc, int InBuffer) { - if (InBuffer != -1) { - CurBuffer = InBuffer; - } else { - CurBuffer = SrcMgr.FindBufferContainingLoc(Loc); - } - Lexer.setBuffer(SrcMgr.getMemoryBuffer(CurBuffer), Loc.getPointer()); +void AsmParser::jumpToLoc(SMLoc Loc, unsigned InBuffer) { + CurBuffer = InBuffer ? InBuffer : SrcMgr.FindBufferContainingLoc(Loc); + Lexer.setBuffer(SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer(), + Loc.getPointer()); } const AsmToken &AsmParser::Lex() { @@ -632,10 +630,12 @@ bool AsmParser::Run(bool NoInitialTextSection, bool NoFinalize) { // If we are generating dwarf for assembly source files save the initial text // section and generate a .file directive. 
if (getContext().getGenDwarfForAssembly()) { - getContext().setGenDwarfSection(getStreamer().getCurrentSection().first); MCSymbol *SectionStartSym = getContext().CreateTempSymbol(); getStreamer().EmitLabel(SectionStartSym); - getContext().setGenDwarfSectionStartSym(SectionStartSym); + auto InsertResult = getContext().addGenDwarfSection( + getStreamer().getCurrentSection().first); + assert(InsertResult.second && ".text section should not have debug info yet"); + InsertResult.first->second.first = SectionStartSym; getContext().setGenDwarfFileNumber(getStreamer().EmitDwarfFileDirective( 0, StringRef(), getContext().getMainFileName())); } @@ -811,7 +811,19 @@ bool AsmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) { // Parse symbol variant std::pair Split; if (!MAI.useParensForSymbolVariant()) { - Split = Identifier.split('@'); + if (FirstTokenKind == AsmToken::String) { + if (Lexer.is(AsmToken::At)) { + Lexer.Lex(); // eat @ + SMLoc AtLoc = getLexer().getLoc(); + StringRef VName; + if (parseIdentifier(VName)) + return Error(AtLoc, "expected symbol variant after '@'"); + + Split = std::make_pair(Identifier, VName); + } + } else { + Split = Identifier.split('@'); + } } else if (Lexer.is(AsmToken::LParen)) { Lexer.Lex(); // eat ( StringRef VName; @@ -1229,8 +1241,13 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info) { default: break; case DK_IF: + case DK_IFEQ: + case DK_IFGE: + case DK_IFGT: + case DK_IFLE: + case DK_IFLT: case DK_IFNE: - return parseDirectiveIf(IDLoc); + return parseDirectiveIf(IDLoc, DirKind); case DK_IFB: return parseDirectiveIfb(IDLoc, true); case DK_IFNB: @@ -1574,12 +1591,11 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info) { printMessage(IDLoc, SourceMgr::DK_Note, OS.str()); } - // If we are generating dwarf for assembly source files and the current - // section is the initial text section then generate a .loc directive for - // the instruction. + // If we are generating dwarf for the current section then generate a .loc + // directive for the instruction. if (!HadError && getContext().getGenDwarfForAssembly() && - getContext().getGenDwarfSection() == - getStreamer().getCurrentSection().first) { + getContext().getGenDwarfSectionSyms().count( + getStreamer().getCurrentSection().first)) { unsigned Line = SrcMgr.FindLineNumber(IDLoc, CurBuffer); @@ -1678,13 +1694,15 @@ void AsmParser::DiagHandler(const SMDiagnostic &Diag, void *Context) { const SourceMgr &DiagSrcMgr = *Diag.getSourceMgr(); const SMLoc &DiagLoc = Diag.getLoc(); - int DiagBuf = DiagSrcMgr.FindBufferContainingLoc(DiagLoc); - int CppHashBuf = Parser->SrcMgr.FindBufferContainingLoc(Parser->CppHashLoc); + unsigned DiagBuf = DiagSrcMgr.FindBufferContainingLoc(DiagLoc); + unsigned CppHashBuf = + Parser->SrcMgr.FindBufferContainingLoc(Parser->CppHashLoc); // Like SourceMgr::printMessage() we need to print the include stack if any // before printing the message. 
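The parsePrimaryExpr change above covers a case Identifier.split('@') cannot: when the symbol came from a quoted string token, the '@' arrives as a separate token, so the variant name must be lexed explicitly (e.g. "foo bar"@GOT). The two paths, reduced to a sketch:

    #include <string>
    #include <utility>

    // For a quoted symbol the variant is lexed after a standalone '@' token;
    // for a plain identifier the '@' is part of the token and split suffices.
    static std::pair<std::string, std::string>
    splitSymbolVariant(const std::string &Tok, bool WasQuoted,
                       const std::string &VariantAfterAt) {
      if (WasQuoted)
        return {Tok, VariantAfterAt};
      auto At = Tok.find('@');
      if (At == std::string::npos)
        return {Tok, ""};
      return {Tok.substr(0, At), Tok.substr(At + 1)};
    }

    int main() {
      auto R = splitSymbolVariant("foo bar", /*WasQuoted=*/true, "GOT");
      return R.second == "GOT" ? 0 : 1;
    }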
- int DiagCurBuffer = DiagSrcMgr.FindBufferContainingLoc(DiagLoc); - if (!Parser->SavedDiagHandler && DiagCurBuffer > 0) { + unsigned DiagCurBuffer = DiagSrcMgr.FindBufferContainingLoc(DiagLoc); + if (!Parser->SavedDiagHandler && DiagCurBuffer && + DiagCurBuffer != DiagSrcMgr.getMainFileID()) { SMLoc ParentIncludeLoc = DiagSrcMgr.getParentIncludeLoc(DiagCurBuffer); DiagSrcMgr.PrintIncludeStack(ParentIncludeLoc, OS); } @@ -2011,7 +2029,7 @@ bool AsmParser::parseMacroArguments(const MCAsmMacro *M, break; if (FAI >= NParameters) { - assert(M && "expected macro to be defined"); + assert(M && "expected macro to be defined"); Error(IDLoc, "parameter named '" + FA.Name + "' does not exist for macro '" + M->Name + "'"); @@ -2110,7 +2128,7 @@ bool AsmParser::handleMacroEntry(const MCAsmMacro *M, SMLoc NameLoc) { // Jump to the macro instantiation and prime the lexer. CurBuffer = SrcMgr.AddNewSourceBuffer(MI->Instantiation, SMLoc()); - Lexer.setBuffer(SrcMgr.getMemoryBuffer(CurBuffer)); + Lexer.setBuffer(SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer()); Lex(); return false; @@ -3792,9 +3810,8 @@ bool AsmParser::parseDirectiveIncbin() { } /// parseDirectiveIf -/// ::= .if expression -/// ::= .ifne expression -bool AsmParser::parseDirectiveIf(SMLoc DirectiveLoc) { +/// ::= .if{,eq,ge,gt,le,lt,ne} expression +bool AsmParser::parseDirectiveIf(SMLoc DirectiveLoc, DirectiveKind DirKind) { TheCondStack.push_back(TheCondState); TheCondState.TheCond = AsmCond::IfCond; if (TheCondState.Ignore) { @@ -3809,6 +3826,29 @@ bool AsmParser::parseDirectiveIf(SMLoc DirectiveLoc) { Lex(); + switch (DirKind) { + default: + llvm_unreachable("unsupported directive"); + case DK_IF: + case DK_IFNE: + break; + case DK_IFEQ: + ExprValue = ExprValue == 0; + break; + case DK_IFGE: + ExprValue = ExprValue >= 0; + break; + case DK_IFGT: + ExprValue = ExprValue > 0; + break; + case DK_IFLE: + ExprValue = ExprValue <= 0; + break; + case DK_IFLT: + ExprValue = ExprValue < 0; + break; + } + TheCondState.CondMet = ExprValue; TheCondState.Ignore = !TheCondState.CondMet; } @@ -4111,6 +4151,11 @@ void AsmParser::initializeDirectiveKindMap() { DirectiveKindMap[".bundle_lock"] = DK_BUNDLE_LOCK; DirectiveKindMap[".bundle_unlock"] = DK_BUNDLE_UNLOCK; DirectiveKindMap[".if"] = DK_IF; + DirectiveKindMap[".ifeq"] = DK_IFEQ; + DirectiveKindMap[".ifge"] = DK_IFGE; + DirectiveKindMap[".ifgt"] = DK_IFGT; + DirectiveKindMap[".ifle"] = DK_IFLE; + DirectiveKindMap[".iflt"] = DK_IFLT; DirectiveKindMap[".ifne"] = DK_IFNE; DirectiveKindMap[".ifb"] = DK_IFB; DirectiveKindMap[".ifnb"] = DK_IFNB; @@ -4220,7 +4265,7 @@ void AsmParser::instantiateMacroLikeBody(MCAsmMacro *M, SMLoc DirectiveLoc, // Jump to the macro instantiation and prime the lexer. CurBuffer = SrcMgr.AddNewSourceBuffer(MI->Instantiation, SMLoc()); - Lexer.setBuffer(SrcMgr.getMemoryBuffer(CurBuffer)); + Lexer.setBuffer(SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer()); Lex(); } @@ -4465,7 +4510,8 @@ bool AsmParser::parseMSInlineAsm( continue; // Register operand. - if (Operand.isReg() && !Operand.needAddressOf()) { + if (Operand.isReg() && !Operand.needAddressOf() && + !getTargetParser().OmitRegisterFromClobberLists(Operand.getReg())) { unsigned NumDefs = Desc.getNumDefs(); // Clobber. if (NumDefs && Operand.getMCOperandNum() < NumDefs) @@ -4499,9 +4545,9 @@ bool AsmParser::parseMSInlineAsm( } // Consider implicit defs to be clobbers. Think of cpuid and push. 
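All of the new conditional directives funnel into parseDirectiveIf; only the comparison against zero differs, as the switch above shows. The mapping, condensed:

    #include <cstdint>

    enum DirectiveKind { DK_IF, DK_IFEQ, DK_IFGE, DK_IFGT,
                         DK_IFLE, DK_IFLT, DK_IFNE };

    // .if/.ifne are truthiness tests; the others compare the parsed
    // expression against zero, mirroring the switch in parseDirectiveIf.
    static bool evalIfCondition(DirectiveKind DK, int64_t V) {
      switch (DK) {
      case DK_IF:
      case DK_IFNE: return V != 0;
      case DK_IFEQ: return V == 0;
      case DK_IFGE: return V >= 0;
      case DK_IFGT: return V > 0;
      case DK_IFLE: return V <= 0;
      case DK_IFLT: return V < 0;
      }
      return false; // unreachable with a valid kind
    }

    int main() { return evalIfCondition(DK_IFGE, 0) ? 0 : 1; }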
- const uint16_t *ImpDefs = Desc.getImplicitDefs(); - for (unsigned I = 0, E = Desc.getNumImplicitDefs(); I != E; ++I) - ClobberRegs.push_back(ImpDefs[I]); + ArrayRef ImpDefs(Desc.getImplicitDefs(), + Desc.getNumImplicitDefs()); + ClobberRegs.insert(ClobberRegs.end(), ImpDefs.begin(), ImpDefs.end()); } // Set the number of Outputs and Inputs. @@ -4536,27 +4582,26 @@ bool AsmParser::parseMSInlineAsm( // Build the IR assembly string. std::string AsmStringIR; raw_string_ostream OS(AsmStringIR); - const char *AsmStart = SrcMgr.getMemoryBuffer(0)->getBufferStart(); - const char *AsmEnd = SrcMgr.getMemoryBuffer(0)->getBufferEnd(); + StringRef ASMString = + SrcMgr.getMemoryBuffer(SrcMgr.getMainFileID())->getBuffer(); + const char *AsmStart = ASMString.begin(); + const char *AsmEnd = ASMString.end(); array_pod_sort(AsmStrRewrites.begin(), AsmStrRewrites.end(), rewritesSort); - for (SmallVectorImpl::iterator I = AsmStrRewrites.begin(), - E = AsmStrRewrites.end(); - I != E; ++I) { - AsmRewriteKind Kind = (*I).Kind; + for (const AsmRewrite &AR : AsmStrRewrites) { + AsmRewriteKind Kind = AR.Kind; if (Kind == AOK_Delete) continue; - const char *Loc = (*I).Loc.getPointer(); + const char *Loc = AR.Loc.getPointer(); assert(Loc >= AsmStart && "Expected Loc to be at or after Start!"); // Emit everything up to the immediate/expression. - unsigned Len = Loc - AsmStart; - if (Len) + if (unsigned Len = Loc - AsmStart) OS << StringRef(AsmStart, Len); // Skip the original expression. if (Kind == AOK_Skip) { - AsmStart = Loc + (*I).Len; + AsmStart = Loc + AR.Len; continue; } @@ -4566,7 +4611,7 @@ bool AsmParser::parseMSInlineAsm( default: break; case AOK_Imm: - OS << "$$" << (*I).Val; + OS << "$$" << AR.Val; break; case AOK_ImmPrefix: OS << "$$"; @@ -4578,7 +4623,7 @@ bool AsmParser::parseMSInlineAsm( OS << '$' << OutputIdx++; break; case AOK_SizeDirective: - switch ((*I).Val) { + switch (AR.Val) { default: break; case 8: OS << "byte ptr "; break; case 16: OS << "word ptr "; break; @@ -4593,7 +4638,7 @@ bool AsmParser::parseMSInlineAsm( OS << ".byte"; break; case AOK_Align: { - unsigned Val = (*I).Val; + unsigned Val = AR.Val; OS << ".align " << Val; // Skip the original immediate. @@ -4606,12 +4651,12 @@ bool AsmParser::parseMSInlineAsm( OS.flush(); if (AsmStringIR.back() != '.') OS << '.'; - OS << (*I).Val; + OS << AR.Val; break; } // Skip the original expression. - AsmStart = Loc + (*I).Len + AdditionalSkip; + AsmStart = Loc + AR.Len + AdditionalSkip; } // Emit the remainder of the asm string. diff --git a/lib/MC/MCParser/COFFAsmParser.cpp b/lib/MC/MCParser/COFFAsmParser.cpp index 8e8447f7b89b..5ecf9e5c64bc 100644 --- a/lib/MC/MCParser/COFFAsmParser.cpp +++ b/lib/MC/MCParser/COFFAsmParser.cpp @@ -13,6 +13,7 @@ #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCObjectFileInfo.h" #include "llvm/MC/MCParser/MCAsmLexer.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSectionCOFF.h" @@ -169,8 +170,8 @@ bool COFFAsmParser::ParseSectionFlags(StringRef FlagsString, unsigned* Flags) { bool ReadOnlyRemoved = false; unsigned SecFlags = None; - for (unsigned i = 0; i < FlagsString.size(); ++i) { - switch (FlagsString[i]) { + for (char FlagChar : FlagsString) { + switch (FlagChar) { case 'a': // Ignored. 
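The clobber collection above trades an index loop for an ArrayRef-backed bulk append; implicit definitions (think of cpuid writing EAX through EDX) are treated as clobbers of the MS inline asm. The same shape without the LLVM types:

    #include <cstdint>
    #include <vector>

    // Bulk-append an instruction's implicit defs to the clobber list instead
    // of pushing them one element at a time.
    static void addImplicitDefClobbers(const uint16_t *ImpDefs,
                                       unsigned NumImpDefs,
                                       std::vector<uint16_t> &ClobberRegs) {
      ClobberRegs.insert(ClobberRegs.end(), ImpDefs, ImpDefs + NumImpDefs);
    }

    int main() {
      const uint16_t Defs[] = {1, 2, 3}; // e.g. EAX, ECX, EDX for cpuid
      std::vector<uint16_t> Clobbers;
      addImplicitDefClobbers(Defs, 3, Clobbers);
      return Clobbers.size() == 3 ? 0 : 1;
    }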
break; @@ -291,8 +292,7 @@ bool COFFAsmParser::ParseDirectiveSymbolAttribute(StringRef Directive, SMLoc) { bool COFFAsmParser::ParseSectionSwitch(StringRef Section, unsigned Characteristics, SectionKind Kind) { - return ParseSectionSwitch(Section, Characteristics, Kind, "", - COFF::IMAGE_COMDAT_SELECT_ANY); + return ParseSectionSwitch(Section, Characteristics, Kind, "", (COFF::COMDATType)0); } bool COFFAsmParser::ParseSectionSwitch(StringRef Section, @@ -356,9 +356,10 @@ bool COFFAsmParser::ParseDirectiveSection(StringRef, SMLoc) { return true; } - COFF::COMDATType Type = COFF::IMAGE_COMDAT_SELECT_ANY; + COFF::COMDATType Type = (COFF::COMDATType)0; StringRef COMDATSymName; if (getLexer().is(AsmToken::Comma)) { + Type = COFF::IMAGE_COMDAT_SELECT_ANY;; Lex(); Flags |= COFF::IMAGE_SCN_LNK_COMDAT; @@ -378,6 +379,11 @@ bool COFFAsmParser::ParseDirectiveSection(StringRef, SMLoc) { return TokError("unexpected token in directive"); SectionKind Kind = computeSectionKind(Flags); + if (Kind.isText()) { + const Triple &T = getContext().getObjectFileInfo()->getTargetTriple(); + if (T.getArch() == Triple::arm || T.getArch() == Triple::thumb) + Flags |= COFF::IMAGE_SCN_MEM_16BIT; + } ParseSectionSwitch(SectionName, Flags, Kind, COMDATSymName, Type); return false; } @@ -517,25 +523,25 @@ bool COFFAsmParser::ParseSEHDirectiveStartProc(StringRef, SMLoc) { MCSymbol *Symbol = getContext().GetOrCreateSymbol(SymbolID); Lex(); - getStreamer().EmitWin64EHStartProc(Symbol); + getStreamer().EmitWinCFIStartProc(Symbol); return false; } bool COFFAsmParser::ParseSEHDirectiveEndProc(StringRef, SMLoc) { Lex(); - getStreamer().EmitWin64EHEndProc(); + getStreamer().EmitWinCFIEndProc(); return false; } bool COFFAsmParser::ParseSEHDirectiveStartChained(StringRef, SMLoc) { Lex(); - getStreamer().EmitWin64EHStartChained(); + getStreamer().EmitWinCFIStartChained(); return false; } bool COFFAsmParser::ParseSEHDirectiveEndChained(StringRef, SMLoc) { Lex(); - getStreamer().EmitWin64EHEndChained(); + getStreamer().EmitWinCFIEndChained(); return false; } @@ -561,13 +567,13 @@ bool COFFAsmParser::ParseSEHDirectiveHandler(StringRef, SMLoc) { MCSymbol *handler = getContext().GetOrCreateSymbol(SymbolID); Lex(); - getStreamer().EmitWin64EHHandler(handler, unwind, except); + getStreamer().EmitWinEHHandler(handler, unwind, except); return false; } bool COFFAsmParser::ParseSEHDirectiveHandlerData(StringRef, SMLoc) { Lex(); - getStreamer().EmitWin64EHHandlerData(); + getStreamer().EmitWinEHHandlerData(); return false; } @@ -580,7 +586,7 @@ bool COFFAsmParser::ParseSEHDirectivePushReg(StringRef, SMLoc L) { return TokError("unexpected token in directive"); Lex(); - getStreamer().EmitWin64EHPushReg(Reg); + getStreamer().EmitWinCFIPushReg(Reg); return false; } @@ -604,7 +610,7 @@ bool COFFAsmParser::ParseSEHDirectiveSetFrame(StringRef, SMLoc L) { return TokError("unexpected token in directive"); Lex(); - getStreamer().EmitWin64EHSetFrame(Reg, Off); + getStreamer().EmitWinCFISetFrame(Reg, Off); return false; } @@ -621,7 +627,7 @@ bool COFFAsmParser::ParseSEHDirectiveAllocStack(StringRef, SMLoc) { return TokError("unexpected token in directive"); Lex(); - getStreamer().EmitWin64EHAllocStack(Size); + getStreamer().EmitWinCFIAllocStack(Size); return false; } @@ -646,7 +652,7 @@ bool COFFAsmParser::ParseSEHDirectiveSaveReg(StringRef, SMLoc L) { Lex(); // FIXME: Err on %xmm* registers - getStreamer().EmitWin64EHSaveReg(Reg, Off); + getStreamer().EmitWinCFISaveReg(Reg, Off); return false; } @@ -673,7 +679,7 @@ bool 
COFFAsmParser::ParseSEHDirectiveSaveXMM(StringRef, SMLoc L) { Lex(); // FIXME: Err on non-%xmm* registers - getStreamer().EmitWin64EHSaveXMM(Reg, Off); + getStreamer().EmitWinCFISaveXMM(Reg, Off); return false; } @@ -694,13 +700,13 @@ bool COFFAsmParser::ParseSEHDirectivePushFrame(StringRef, SMLoc) { return TokError("unexpected token in directive"); Lex(); - getStreamer().EmitWin64EHPushFrame(Code); + getStreamer().EmitWinCFIPushFrame(Code); return false; } bool COFFAsmParser::ParseSEHDirectiveEndProlog(StringRef, SMLoc) { Lex(); - getStreamer().EmitWin64EHEndProlog(); + getStreamer().EmitWinCFIEndProlog(); return false; } diff --git a/lib/MC/MCParser/DarwinAsmParser.cpp b/lib/MC/MCParser/DarwinAsmParser.cpp index f74b30a4f126..b2a67856da0a 100644 --- a/lib/MC/MCParser/DarwinAsmParser.cpp +++ b/lib/MC/MCParser/DarwinAsmParser.cpp @@ -650,7 +650,7 @@ bool DarwinAsmParser::parseDirectiveSecureLogUnique(StringRef, SMLoc IDLoc) { } // Write the message. - int CurBuf = getSourceManager().FindBufferContainingLoc(IDLoc); + unsigned CurBuf = getSourceManager().FindBufferContainingLoc(IDLoc); *OS << getSourceManager().getBufferInfo(CurBuf).Buffer->getBufferIdentifier() << ":" << getSourceManager().FindLineNumber(IDLoc, CurBuf) << ":" << LogMessage + "\n"; diff --git a/lib/MC/MCParser/ELFAsmParser.cpp b/lib/MC/MCParser/ELFAsmParser.cpp index 78bb6c7aad0e..98b2b3bd60b2 100644 --- a/lib/MC/MCParser/ELFAsmParser.cpp +++ b/lib/MC/MCParser/ELFAsmParser.cpp @@ -150,7 +150,7 @@ class ELFAsmParser : public MCAsmParserExtension { private: bool ParseSectionName(StringRef &SectionName); - bool ParseSectionArguments(bool IsPush); + bool ParseSectionArguments(bool IsPush, SMLoc loc); unsigned parseSunStyleSectionFlags(); }; @@ -382,7 +382,7 @@ unsigned ELFAsmParser::parseSunStyleSectionFlags() { bool ELFAsmParser::ParseDirectivePushSection(StringRef s, SMLoc loc) { getStreamer().PushSection(); - if (ParseSectionArguments(/*IsPush=*/true)) { + if (ParseSectionArguments(/*IsPush=*/true, loc)) { getStreamer().PopSection(); return true; } @@ -397,11 +397,11 @@ bool ELFAsmParser::ParseDirectivePopSection(StringRef, SMLoc) { } // FIXME: This is a work in progress. 
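The loc parameter threaded through ParseSectionArguments below exists so the parser can diagnose a DWARF2 limitation: when generating debug info for assembly, every newly seen section must be registered with a start label, and pre-DWARF3 output cannot describe more than one code section per compilation unit. The registration pattern, sketched with a plain map and an opaque Symbol stand-in:

    #include <map>
    #include <string>
    #include <utility>

    struct Symbol;                                      // stand-in for MCSymbol
    using SectionSyms = std::pair<Symbol *, Symbol *>;  // (start, end) labels

    // First time a section shows up, record it and store its start label;
    // under DWARF2 a second section is an error, since .text was already
    // registered when parsing began. Returns false when the error fires.
    static bool registerGenDwarfSection(std::map<std::string, SectionSyms> &Secs,
                                        const std::string &Name,
                                        unsigned DwarfVersion,
                                        Symbol *StartLabel, std::string &Err) {
      auto Ins = Secs.insert({Name, {nullptr, nullptr}});
      if (!Ins.second)
        return true; // already tracked, nothing to do
      if (DwarfVersion <= 2 && Secs.size() > 1)
        Err = "DWARF2 only supports one section per compilation unit";
      Ins.first->second.first = StartLabel; // label is emitted regardless
      return Err.empty();
    }

    int main() {
      std::map<std::string, SectionSyms> Secs;
      std::string Err;
      registerGenDwarfSection(Secs, ".text", 2, nullptr, Err);
      registerGenDwarfSection(Secs, ".text.hot", 2, nullptr, Err);
      return Err.empty() ? 1 : 0; // the second section must trip the error
    }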
-bool ELFAsmParser::ParseDirectiveSection(StringRef, SMLoc) { - return ParseSectionArguments(/*IsPush=*/false); +bool ELFAsmParser::ParseDirectiveSection(StringRef, SMLoc loc) { + return ParseSectionArguments(/*IsPush=*/false, loc); } -bool ELFAsmParser::ParseSectionArguments(bool IsPush) { +bool ELFAsmParser::ParseSectionArguments(bool IsPush, SMLoc loc) { StringRef SectionName; if (ParseSectionName(SectionName)) @@ -545,10 +545,24 @@ bool ELFAsmParser::ParseSectionArguments(bool IsPush) { } SectionKind Kind = computeSectionKind(Flags, Size); - getStreamer().SwitchSection(getContext().getELFSection(SectionName, Type, - Flags, Kind, Size, - GroupName), - Subsection); + const MCSection *ELFSection = getContext().getELFSection( + SectionName, Type, Flags, Kind, Size, GroupName); + getStreamer().SwitchSection(ELFSection, Subsection); + + if (getContext().getGenDwarfForAssembly()) { + auto &Sections = getContext().getGenDwarfSectionSyms(); + auto InsertResult = Sections.insert( + std::make_pair(ELFSection, std::make_pair(nullptr, nullptr))); + if (InsertResult.second) { + if (getContext().getDwarfVersion() <= 2) + Error(loc, "DWARF2 only supports one section per compilation unit"); + + MCSymbol *SectionStartSymbol = getContext().CreateTempSymbol(); + getStreamer().EmitLabel(SectionStartSymbol); + InsertResult.first->second.first = SectionStartSymbol; + } + } + return false; } diff --git a/lib/MC/MCStreamer.cpp b/lib/MC/MCStreamer.cpp index 7dccf0d751d6..46e80cc0c0d8 100644 --- a/lib/MC/MCStreamer.cpp +++ b/lib/MC/MCStreamer.cpp @@ -17,6 +17,7 @@ #include "llvm/MC/MCObjectFileInfo.h" #include "llvm/MC/MCObjectWriter.h" #include "llvm/MC/MCSymbol.h" +#include "llvm/MC/MCWin64EH.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/LEB128.h" #include "llvm/Support/raw_ostream.h" @@ -37,21 +38,20 @@ void MCTargetStreamer::finish() {} void MCTargetStreamer::emitAssignment(MCSymbol *Symbol, const MCExpr *Value) {} MCStreamer::MCStreamer(MCContext &Ctx) - : Context(Ctx), CurrentW64UnwindInfo(nullptr), LastSymbol(nullptr) { + : Context(Ctx), CurrentWinFrameInfo(nullptr) { SectionStack.push_back(std::pair()); } MCStreamer::~MCStreamer() { - for (unsigned i = 0; i < getNumW64UnwindInfos(); ++i) - delete W64UnwindInfos[i]; + for (unsigned i = 0; i < getNumWinFrameInfos(); ++i) + delete WinFrameInfos[i]; } void MCStreamer::reset() { - for (unsigned i = 0; i < getNumW64UnwindInfos(); ++i) - delete W64UnwindInfos[i]; - W64UnwindInfos.clear(); - CurrentW64UnwindInfo = nullptr; - LastSymbol = nullptr; + for (unsigned i = 0; i < getNumWinFrameInfos(); ++i) + delete WinFrameInfos[i]; + WinFrameInfos.clear(); + CurrentWinFrameInfo = nullptr; SectionStack.clear(); SectionStack.push_back(std::pair()); } @@ -87,10 +87,9 @@ raw_ostream &MCStreamer::GetCommentOS() { void MCStreamer::emitRawComment(const Twine &T, bool TabPrefix) {} void MCStreamer::generateCompactUnwindEncodings(MCAsmBackend *MAB) { - for (std::vector::iterator I = FrameInfos.begin(), - E = FrameInfos.end(); I != E; ++I) - I->CompactUnwindEncoding = - (MAB ? MAB->generateCompactUnwindEncoding(I->Instructions) : 0); + for (auto &FI : DwarfFrameInfos) + FI.CompactUnwindEncoding = + (MAB ? 
MAB->generateCompactUnwindEncoding(FI.Instructions) : 0); } void MCStreamer::EmitDwarfSetLineAddr(int64_t LineDelta, @@ -149,8 +148,15 @@ void MCStreamer::EmitValue(const MCExpr *Value, unsigned Size, EmitValueImpl(Value, Size, Loc); } -void MCStreamer::EmitSymbolValue(const MCSymbol *Sym, unsigned Size) { - EmitValueImpl(MCSymbolRefExpr::Create(Sym, getContext()), Size); +void MCStreamer::EmitSymbolValue(const MCSymbol *Sym, unsigned Size, + bool IsSectionRelative) { + assert((!IsSectionRelative || Size == 4) && + "SectionRelative value requires 4-bytes"); + + if (!IsSectionRelative) + EmitValueImpl(MCSymbolRefExpr::Create(Sym, getContext()), Size); + else + EmitCOFFSecRel32(Sym); } void MCStreamer::EmitGPRel64Value(const MCExpr *Value) { @@ -199,14 +205,14 @@ MCSymbol *MCStreamer::getDwarfLineTableSymbol(unsigned CUID) { return Table.getLabel(); } -MCDwarfFrameInfo *MCStreamer::getCurrentFrameInfo() { - if (FrameInfos.empty()) +MCDwarfFrameInfo *MCStreamer::getCurrentDwarfFrameInfo() { + if (DwarfFrameInfos.empty()) return nullptr; - return &FrameInfos.back(); + return &DwarfFrameInfos.back(); } -void MCStreamer::EnsureValidFrame() { - MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo(); +void MCStreamer::EnsureValidDwarfFrame() { + MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo(); if (!CurFrame || CurFrame->End) report_fatal_error("No open frame"); } @@ -234,23 +240,15 @@ void MCStreamer::EmitLabel(MCSymbol *Symbol) { assert(!Symbol->isVariable() && "Cannot emit a variable symbol!"); assert(getCurrentSection().first && "Cannot emit before setting section!"); AssignSection(Symbol, getCurrentSection().first); - LastSymbol = Symbol; MCTargetStreamer *TS = getTargetStreamer(); if (TS) TS->emitLabel(Symbol); } -void MCStreamer::EmitDebugLabel(MCSymbol *Symbol) { - assert(!Symbol->isVariable() && "Cannot emit a variable symbol!"); - assert(getCurrentSection().first && "Cannot emit before setting section!"); - AssignSection(Symbol, getCurrentSection().first); - LastSymbol = Symbol; -} - void MCStreamer::EmitCompactUnwindEncoding(uint32_t CompactUnwindEncoding) { - EnsureValidFrame(); - MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo(); + EnsureValidDwarfFrame(); + MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo(); CurFrame->CompactUnwindEncoding = CompactUnwindEncoding; } @@ -259,7 +257,7 @@ void MCStreamer::EmitCFISections(bool EH, bool Debug) { } void MCStreamer::EmitCFIStartProc(bool IsSimple) { - MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo(); + MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo(); if (CurFrame && !CurFrame->End) report_fatal_error("Starting a frame before finishing the previous one!"); @@ -267,39 +265,26 @@ void MCStreamer::EmitCFIStartProc(bool IsSimple) { Frame.IsSimple = IsSimple; EmitCFIStartProcImpl(Frame); - FrameInfos.push_back(Frame); + DwarfFrameInfos.push_back(Frame); } void MCStreamer::EmitCFIStartProcImpl(MCDwarfFrameInfo &Frame) { } -void MCStreamer::RecordProcStart(MCDwarfFrameInfo &Frame) { - // Report an error if we haven't seen a symbol yet where we'd bind - // .cfi_startproc. - if (!LastSymbol) - report_fatal_error("No symbol to start a frame"); - Frame.Function = LastSymbol; - // We need to create a local symbol to avoid relocations. 
- Frame.Begin = getContext().CreateTempSymbol(); - EmitLabel(Frame.Begin); -} - void MCStreamer::EmitCFIEndProc() { - EnsureValidFrame(); - MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo(); + EnsureValidDwarfFrame(); + MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo(); EmitCFIEndProcImpl(*CurFrame); } void MCStreamer::EmitCFIEndProcImpl(MCDwarfFrameInfo &Frame) { -} - -void MCStreamer::RecordProcEnd(MCDwarfFrameInfo &Frame) { - Frame.End = getContext().CreateTempSymbol(); - EmitLabel(Frame.End); + // Put a dummy non-null value in Frame.End to mark that this frame has been + // closed. + Frame.End = (MCSymbol *) 1; } MCSymbol *MCStreamer::EmitCFICommon() { - EnsureValidFrame(); + EnsureValidDwarfFrame(); MCSymbol *Label = getContext().CreateTempSymbol(); EmitLabel(Label); return Label; @@ -309,7 +294,7 @@ void MCStreamer::EmitCFIDefCfa(int64_t Register, int64_t Offset) { MCSymbol *Label = EmitCFICommon(); MCCFIInstruction Instruction = MCCFIInstruction::createDefCfa(Label, Register, Offset); - MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo(); + MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo(); CurFrame->Instructions.push_back(Instruction); } @@ -317,7 +302,7 @@ void MCStreamer::EmitCFIDefCfaOffset(int64_t Offset) { MCSymbol *Label = EmitCFICommon(); MCCFIInstruction Instruction = MCCFIInstruction::createDefCfaOffset(Label, Offset); - MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo(); + MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo(); CurFrame->Instructions.push_back(Instruction); } @@ -325,7 +310,7 @@ void MCStreamer::EmitCFIAdjustCfaOffset(int64_t Adjustment) { MCSymbol *Label = EmitCFICommon(); MCCFIInstruction Instruction = MCCFIInstruction::createAdjustCfaOffset(Label, Adjustment); - MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo(); + MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo(); CurFrame->Instructions.push_back(Instruction); } @@ -333,7 +318,7 @@ void MCStreamer::EmitCFIDefCfaRegister(int64_t Register) { MCSymbol *Label = EmitCFICommon(); MCCFIInstruction Instruction = MCCFIInstruction::createDefCfaRegister(Label, Register); - MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo(); + MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo(); CurFrame->Instructions.push_back(Instruction); } @@ -341,7 +326,7 @@ void MCStreamer::EmitCFIOffset(int64_t Register, int64_t Offset) { MCSymbol *Label = EmitCFICommon(); MCCFIInstruction Instruction = MCCFIInstruction::createOffset(Label, Register, Offset); - MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo(); + MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo(); CurFrame->Instructions.push_back(Instruction); } @@ -349,21 +334,21 @@ void MCStreamer::EmitCFIRelOffset(int64_t Register, int64_t Offset) { MCSymbol *Label = EmitCFICommon(); MCCFIInstruction Instruction = MCCFIInstruction::createRelOffset(Label, Register, Offset); - MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo(); + MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo(); CurFrame->Instructions.push_back(Instruction); } void MCStreamer::EmitCFIPersonality(const MCSymbol *Sym, unsigned Encoding) { - EnsureValidFrame(); - MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo(); + EnsureValidDwarfFrame(); + MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo(); CurFrame->Personality = Sym; CurFrame->PersonalityEncoding = Encoding; } void MCStreamer::EmitCFILsda(const MCSymbol *Sym, unsigned Encoding) { - EnsureValidFrame(); - MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo(); + EnsureValidDwarfFrame(); + MCDwarfFrameInfo *CurFrame = 
getCurrentDwarfFrameInfo(); CurFrame->Lsda = Sym; CurFrame->LsdaEncoding = Encoding; } @@ -371,7 +356,7 @@ void MCStreamer::EmitCFILsda(const MCSymbol *Sym, unsigned Encoding) { void MCStreamer::EmitCFIRememberState() { MCSymbol *Label = EmitCFICommon(); MCCFIInstruction Instruction = MCCFIInstruction::createRememberState(Label); - MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo(); + MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo(); CurFrame->Instructions.push_back(Instruction); } @@ -379,7 +364,7 @@ void MCStreamer::EmitCFIRestoreState() { // FIXME: Error if there is no matching cfi_remember_state. MCSymbol *Label = EmitCFICommon(); MCCFIInstruction Instruction = MCCFIInstruction::createRestoreState(Label); - MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo(); + MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo(); CurFrame->Instructions.push_back(Instruction); } @@ -387,7 +372,7 @@ void MCStreamer::EmitCFISameValue(int64_t Register) { MCSymbol *Label = EmitCFICommon(); MCCFIInstruction Instruction = MCCFIInstruction::createSameValue(Label, Register); - MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo(); + MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo(); CurFrame->Instructions.push_back(Instruction); } @@ -395,20 +380,20 @@ void MCStreamer::EmitCFIRestore(int64_t Register) { MCSymbol *Label = EmitCFICommon(); MCCFIInstruction Instruction = MCCFIInstruction::createRestore(Label, Register); - MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo(); + MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo(); CurFrame->Instructions.push_back(Instruction); } void MCStreamer::EmitCFIEscape(StringRef Values) { MCSymbol *Label = EmitCFICommon(); MCCFIInstruction Instruction = MCCFIInstruction::createEscape(Label, Values); - MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo(); + MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo(); CurFrame->Instructions.push_back(Instruction); } void MCStreamer::EmitCFISignalFrame() { - EnsureValidFrame(); - MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo(); + EnsureValidDwarfFrame(); + MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo(); CurFrame->IsSignalFrame = true; } @@ -416,7 +401,7 @@ void MCStreamer::EmitCFIUndefined(int64_t Register) { MCSymbol *Label = EmitCFICommon(); MCCFIInstruction Instruction = MCCFIInstruction::createUndefined(Label, Register); - MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo(); + MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo(); CurFrame->Instructions.push_back(Instruction); } @@ -424,7 +409,7 @@ void MCStreamer::EmitCFIRegister(int64_t Register1, int64_t Register2) { MCSymbol *Label = EmitCFICommon(); MCCFIInstruction Instruction = MCCFIInstruction::createRegister(Label, Register1, Register2); - MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo(); + MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo(); CurFrame->Instructions.push_back(Instruction); } @@ -432,168 +417,164 @@ void MCStreamer::EmitCFIWindowSave() { MCSymbol *Label = EmitCFICommon(); MCCFIInstruction Instruction = MCCFIInstruction::createWindowSave(Label); - MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo(); + MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo(); CurFrame->Instructions.push_back(Instruction); } -void MCStreamer::setCurrentW64UnwindInfo(MCWin64EHUnwindInfo *Frame) { - W64UnwindInfos.push_back(Frame); - CurrentW64UnwindInfo = W64UnwindInfos.back(); -} - -void MCStreamer::EnsureValidW64UnwindInfo() { - MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo; - if (!CurFrame || CurFrame->End) +void 
MCStreamer::EnsureValidWinFrameInfo() { + if (!CurrentWinFrameInfo || CurrentWinFrameInfo->End) report_fatal_error("No open Win64 EH frame function!"); } -void MCStreamer::EmitWin64EHStartProc(const MCSymbol *Symbol) { - MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo; - if (CurFrame && !CurFrame->End) +void MCStreamer::EmitWinCFIStartProc(const MCSymbol *Symbol) { + if (CurrentWinFrameInfo && !CurrentWinFrameInfo->End) report_fatal_error("Starting a function before ending the previous one!"); - MCWin64EHUnwindInfo *Frame = new MCWin64EHUnwindInfo; + MCWinFrameInfo *Frame = new MCWinFrameInfo; Frame->Begin = getContext().CreateTempSymbol(); Frame->Function = Symbol; EmitLabel(Frame->Begin); - setCurrentW64UnwindInfo(Frame); + WinFrameInfos.push_back(Frame); + CurrentWinFrameInfo = WinFrameInfos.back(); } -void MCStreamer::EmitWin64EHEndProc() { - EnsureValidW64UnwindInfo(); - MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo; - if (CurFrame->ChainedParent) +void MCStreamer::EmitWinCFIEndProc() { + EnsureValidWinFrameInfo(); + if (CurrentWinFrameInfo->ChainedParent) report_fatal_error("Not all chained regions terminated!"); - CurFrame->End = getContext().CreateTempSymbol(); - EmitLabel(CurFrame->End); + CurrentWinFrameInfo->End = getContext().CreateTempSymbol(); + EmitLabel(CurrentWinFrameInfo->End); } -void MCStreamer::EmitWin64EHStartChained() { - EnsureValidW64UnwindInfo(); - MCWin64EHUnwindInfo *Frame = new MCWin64EHUnwindInfo; - MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo; +void MCStreamer::EmitWinCFIStartChained() { + EnsureValidWinFrameInfo(); + MCWinFrameInfo *Frame = new MCWinFrameInfo; Frame->Begin = getContext().CreateTempSymbol(); - Frame->Function = CurFrame->Function; - Frame->ChainedParent = CurFrame; + Frame->Function = CurrentWinFrameInfo->Function; + Frame->ChainedParent = CurrentWinFrameInfo; EmitLabel(Frame->Begin); - setCurrentW64UnwindInfo(Frame); + WinFrameInfos.push_back(Frame); + CurrentWinFrameInfo = WinFrameInfos.back(); } -void MCStreamer::EmitWin64EHEndChained() { - EnsureValidW64UnwindInfo(); - MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo; - if (!CurFrame->ChainedParent) +void MCStreamer::EmitWinCFIEndChained() { + EnsureValidWinFrameInfo(); + if (!CurrentWinFrameInfo->ChainedParent) report_fatal_error("End of a chained region outside a chained region!"); - CurFrame->End = getContext().CreateTempSymbol(); - EmitLabel(CurFrame->End); - CurrentW64UnwindInfo = CurFrame->ChainedParent; + CurrentWinFrameInfo->End = getContext().CreateTempSymbol(); + EmitLabel(CurrentWinFrameInfo->End); + CurrentWinFrameInfo = CurrentWinFrameInfo->ChainedParent; } -void MCStreamer::EmitWin64EHHandler(const MCSymbol *Sym, bool Unwind, - bool Except) { - EnsureValidW64UnwindInfo(); - MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo; - if (CurFrame->ChainedParent) +void MCStreamer::EmitWinEHHandler(const MCSymbol *Sym, bool Unwind, + bool Except) { + EnsureValidWinFrameInfo(); + if (CurrentWinFrameInfo->ChainedParent) report_fatal_error("Chained unwind areas can't have handlers!"); - CurFrame->ExceptionHandler = Sym; + CurrentWinFrameInfo->ExceptionHandler = Sym; if (!Except && !Unwind) report_fatal_error("Don't know what kind of handler this is!"); if (Unwind) - CurFrame->HandlesUnwind = true; + CurrentWinFrameInfo->HandlesUnwind = true; if (Except) - CurFrame->HandlesExceptions = true; + CurrentWinFrameInfo->HandlesExceptions = true; } -void MCStreamer::EmitWin64EHHandlerData() { - EnsureValidW64UnwindInfo(); - MCWin64EHUnwindInfo *CurFrame = 
CurrentW64UnwindInfo; - if (CurFrame->ChainedParent) +void MCStreamer::EmitWinEHHandlerData() { + EnsureValidWinFrameInfo(); + if (CurrentWinFrameInfo->ChainedParent) report_fatal_error("Chained unwind areas can't have handlers!"); } -void MCStreamer::EmitWin64EHPushReg(unsigned Register) { - EnsureValidW64UnwindInfo(); - MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo; +void MCStreamer::EmitWinCFIPushReg(unsigned Register) { + EnsureValidWinFrameInfo(); + MCSymbol *Label = getContext().CreateTempSymbol(); - MCWin64EHInstruction Inst(Win64EH::UOP_PushNonVol, Label, Register); EmitLabel(Label); - CurFrame->Instructions.push_back(Inst); + + WinEH::Instruction Inst = Win64EH::Instruction::PushNonVol(Label, Register); + CurrentWinFrameInfo->Instructions.push_back(Inst); } -void MCStreamer::EmitWin64EHSetFrame(unsigned Register, unsigned Offset) { - EnsureValidW64UnwindInfo(); - MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo; - if (CurFrame->LastFrameInst >= 0) +void MCStreamer::EmitWinCFISetFrame(unsigned Register, unsigned Offset) { + EnsureValidWinFrameInfo(); + if (CurrentWinFrameInfo->LastFrameInst >= 0) report_fatal_error("Frame register and offset already specified!"); if (Offset & 0x0F) report_fatal_error("Misaligned frame pointer offset!"); + if (Offset > 240) + report_fatal_error("Frame offset must be less than or equal to 240!"); + MCSymbol *Label = getContext().CreateTempSymbol(); - MCWin64EHInstruction Inst(Win64EH::UOP_SetFPReg, Label, Register, Offset); EmitLabel(Label); - CurFrame->LastFrameInst = CurFrame->Instructions.size(); - CurFrame->Instructions.push_back(Inst); + + WinEH::Instruction Inst = + Win64EH::Instruction::SetFPReg(Label, Register, Offset); + CurrentWinFrameInfo->LastFrameInst = CurrentWinFrameInfo->Instructions.size(); + CurrentWinFrameInfo->Instructions.push_back(Inst); } -void MCStreamer::EmitWin64EHAllocStack(unsigned Size) { - EnsureValidW64UnwindInfo(); +void MCStreamer::EmitWinCFIAllocStack(unsigned Size) { + EnsureValidWinFrameInfo(); + if (Size == 0) + report_fatal_error("Allocation size must be non-zero!"); if (Size & 7) report_fatal_error("Misaligned stack allocation!"); - MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo; + MCSymbol *Label = getContext().CreateTempSymbol(); - MCWin64EHInstruction Inst(Label, Size); EmitLabel(Label); - CurFrame->Instructions.push_back(Inst); + + WinEH::Instruction Inst = Win64EH::Instruction::Alloc(Label, Size); + CurrentWinFrameInfo->Instructions.push_back(Inst); } -void MCStreamer::EmitWin64EHSaveReg(unsigned Register, unsigned Offset) { - EnsureValidW64UnwindInfo(); +void MCStreamer::EmitWinCFISaveReg(unsigned Register, unsigned Offset) { + EnsureValidWinFrameInfo(); if (Offset & 7) report_fatal_error("Misaligned saved register offset!"); - MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo; + MCSymbol *Label = getContext().CreateTempSymbol(); - MCWin64EHInstruction Inst( - Offset > 512*1024-8 ? 
Win64EH::UOP_SaveNonVolBig : Win64EH::UOP_SaveNonVol, - Label, Register, Offset); EmitLabel(Label); - CurFrame->Instructions.push_back(Inst); + + WinEH::Instruction Inst = + Win64EH::Instruction::SaveNonVol(Label, Register, Offset); + CurrentWinFrameInfo->Instructions.push_back(Inst); } -void MCStreamer::EmitWin64EHSaveXMM(unsigned Register, unsigned Offset) { - EnsureValidW64UnwindInfo(); +void MCStreamer::EmitWinCFISaveXMM(unsigned Register, unsigned Offset) { + EnsureValidWinFrameInfo(); if (Offset & 0x0F) report_fatal_error("Misaligned saved vector register offset!"); - MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo; + MCSymbol *Label = getContext().CreateTempSymbol(); - MCWin64EHInstruction Inst( - Offset > 512*1024-16 ? Win64EH::UOP_SaveXMM128Big : Win64EH::UOP_SaveXMM128, - Label, Register, Offset); EmitLabel(Label); - CurFrame->Instructions.push_back(Inst); + + WinEH::Instruction Inst = + Win64EH::Instruction::SaveXMM(Label, Register, Offset); + CurrentWinFrameInfo->Instructions.push_back(Inst); } -void MCStreamer::EmitWin64EHPushFrame(bool Code) { - EnsureValidW64UnwindInfo(); - MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo; - if (CurFrame->Instructions.size() > 0) +void MCStreamer::EmitWinCFIPushFrame(bool Code) { + EnsureValidWinFrameInfo(); + if (CurrentWinFrameInfo->Instructions.size() > 0) report_fatal_error("If present, PushMachFrame must be the first UOP"); + MCSymbol *Label = getContext().CreateTempSymbol(); - MCWin64EHInstruction Inst(Win64EH::UOP_PushMachFrame, Label, Code); EmitLabel(Label); - CurFrame->Instructions.push_back(Inst); + + WinEH::Instruction Inst = Win64EH::Instruction::PushMachFrame(Label, Code); + CurrentWinFrameInfo->Instructions.push_back(Inst); } -void MCStreamer::EmitWin64EHEndProlog() { - EnsureValidW64UnwindInfo(); - MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo; - CurFrame->PrologEnd = getContext().CreateTempSymbol(); - EmitLabel(CurFrame->PrologEnd); +void MCStreamer::EmitWinCFIEndProlog() { + EnsureValidWinFrameInfo(); + CurrentWinFrameInfo->PrologEnd = getContext().CreateTempSymbol(); + EmitLabel(CurrentWinFrameInfo->PrologEnd); } void MCStreamer::EmitCOFFSectionIndex(MCSymbol const *Symbol) { - llvm_unreachable("This file format doesn't support this directive"); } void MCStreamer::EmitCOFFSecRel32(MCSymbol const *Symbol) { - llvm_unreachable("This file format doesn't support this directive"); } /// EmitRawText - If this file is backed by an assembly streamer, this dumps @@ -610,15 +591,15 @@ void MCStreamer::EmitRawText(const Twine &T) { EmitRawTextImpl(T.toStringRef(Str)); } -void MCStreamer::EmitW64Tables() { - if (!getNumW64UnwindInfos()) +void MCStreamer::EmitWindowsUnwindTables() { + if (!getNumWinFrameInfos()) return; MCWin64EHUnwindEmitter::Emit(*this); } void MCStreamer::Finish() { - if (!FrameInfos.empty() && !FrameInfos.back().End) + if (!DwarfFrameInfos.empty() && !DwarfFrameInfos.back().End) report_fatal_error("Unfinished frame!"); MCTargetStreamer *TS = getTargetStreamer(); @@ -629,9 +610,82 @@ void MCStreamer::Finish() { } void MCStreamer::EmitAssignment(MCSymbol *Symbol, const MCExpr *Value) { + visitUsedExpr(*Value); Symbol->setVariableValue(Value); MCTargetStreamer *TS = getTargetStreamer(); if (TS) TS->emitAssignment(Symbol, Value); } + +void MCStreamer::visitUsedSymbol(const MCSymbol &Sym) { +} + +void MCStreamer::visitUsedExpr(const MCExpr &Expr) { + switch (Expr.getKind()) { + case MCExpr::Target: + cast(Expr).visitUsedExpr(*this); + break; + + case MCExpr::Constant: + break; + + case MCExpr::Binary: { 
+ const MCBinaryExpr &BE = cast(Expr); + visitUsedExpr(*BE.getLHS()); + visitUsedExpr(*BE.getRHS()); + break; + } + + case MCExpr::SymbolRef: + visitUsedSymbol(cast(Expr).getSymbol()); + break; + + case MCExpr::Unary: + visitUsedExpr(*cast(Expr).getSubExpr()); + break; + } +} + +void MCStreamer::EmitInstruction(const MCInst &Inst, + const MCSubtargetInfo &STI) { + // Scan for values. + for (unsigned i = Inst.getNumOperands(); i--;) + if (Inst.getOperand(i).isExpr()) + visitUsedExpr(*Inst.getOperand(i).getExpr()); +} + +void MCStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) {} +void MCStreamer::EmitThumbFunc(MCSymbol *Func) {} +void MCStreamer::EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) {} +void MCStreamer::BeginCOFFSymbolDef(const MCSymbol *Symbol) {} +void MCStreamer::EndCOFFSymbolDef() {} +void MCStreamer::EmitFileDirective(StringRef Filename) {} +void MCStreamer::EmitCOFFSymbolStorageClass(int StorageClass) {} +void MCStreamer::EmitCOFFSymbolType(int Type) {} +void MCStreamer::EmitELFSize(MCSymbol *Symbol, const MCExpr *Value) {} +void MCStreamer::EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size, + unsigned ByteAlignment) {} +void MCStreamer::EmitTBSSSymbol(const MCSection *Section, MCSymbol *Symbol, + uint64_t Size, unsigned ByteAlignment) {} +void MCStreamer::ChangeSection(const MCSection *, const MCExpr *) {} +void MCStreamer::EmitWeakReference(MCSymbol *Alias, const MCSymbol *Symbol) {} +void MCStreamer::EmitBytes(StringRef Data) {} +void MCStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size, + const SMLoc &Loc) { + visitUsedExpr(*Value); +} +void MCStreamer::EmitULEB128Value(const MCExpr *Value) {} +void MCStreamer::EmitSLEB128Value(const MCExpr *Value) {} +void MCStreamer::EmitValueToAlignment(unsigned ByteAlignment, int64_t Value, + unsigned ValueSize, + unsigned MaxBytesToEmit) {} +void MCStreamer::EmitCodeAlignment(unsigned ByteAlignment, + unsigned MaxBytesToEmit) {} +bool MCStreamer::EmitValueToOffset(const MCExpr *Offset, unsigned char Value) { + return false; +} +void MCStreamer::EmitBundleAlignMode(unsigned AlignPow2) {} +void MCStreamer::EmitBundleLock(bool AlignToEnd) {} +void MCStreamer::FinishImpl() {} +void MCStreamer::EmitBundleUnlock() {} diff --git a/lib/MC/MCTargetOptions.cpp b/lib/MC/MCTargetOptions.cpp index 8e946d57f7fb..efd724a15df6 100644 --- a/lib/MC/MCTargetOptions.cpp +++ b/lib/MC/MCTargetOptions.cpp @@ -14,6 +14,7 @@ namespace llvm { MCTargetOptions::MCTargetOptions() : SanitizeAddress(false), MCRelaxAll(false), MCNoExecStack(false), MCSaveTempLabels(false), MCUseDwarfDirectory(false), - ShowMCEncoding(false), ShowMCInst(false), AsmVerbose(false) {} + ShowMCEncoding(false), ShowMCInst(false), AsmVerbose(false), + DwarfVersion(0) {} } // end namespace llvm diff --git a/lib/MC/MCWin64EH.cpp b/lib/MC/MCWin64EH.cpp index b8b07d3a1808..95e198393729 100644 --- a/lib/MC/MCWin64EH.cpp +++ b/lib/MC/MCWin64EH.cpp @@ -15,114 +15,108 @@ #include "llvm/MC/MCSectionCOFF.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" +#include "llvm/Support/Win64EH.h" namespace llvm { // NOTE: All relocations generated here are 4-byte image-relative. 
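
The CountOfUnwindCodes/EmitUnwindCode rewrite that follows encodes the Win64 rule that each unwind operation occupies one, two, or three 16-bit UNWIND_CODE slots, with UOP_AllocLarge needing a third slot once the allocation exceeds the two-slot limit of 512K-8 bytes. A minimal standalone sketch of that slot arithmetic, using a simplified stand-in struct rather than the real WinEH::Instruction type:

#include <cstdint>
#include <vector>

// Simplified stand-ins for llvm::WinEH::Instruction and the Win64EH UOP_*
// opcodes; only the fields the slot computation needs.
enum UnwindOp {
  UOP_PushNonVol, UOP_AllocLarge, UOP_AllocSmall, UOP_SetFPReg,
  UOP_SaveNonVol, UOP_SaveNonVolBig, UOP_SaveXMM128, UOP_SaveXMM128Big,
  UOP_PushMachFrame
};

struct Inst {
  UnwindOp Operation;
  uint32_t Offset; // allocation size for UOP_AllocLarge
};

// Each unwind op occupies 1-3 16-bit UNWIND_CODE slots; UOP_AllocLarge needs
// a third slot once the size no longer fits in 16 bits scaled by 8.
static unsigned slotsFor(const Inst &I) {
  switch (I.Operation) {
  case UOP_PushNonVol: case UOP_AllocSmall:
  case UOP_SetFPReg:   case UOP_PushMachFrame:
    return 1;
  case UOP_SaveNonVol: case UOP_SaveXMM128:
    return 2;
  case UOP_SaveNonVolBig: case UOP_SaveXMM128Big:
    return 3;
  case UOP_AllocLarge:
    return (I.Offset > 512 * 1024 - 8) ? 3 : 2;
  }
  return 0;
}

unsigned countUnwindCodes(const std::vector<Inst> &Insns) {
  unsigned Count = 0;
  for (const Inst &I : Insns)
    Count += slotsFor(I);
  // The on-disk UNWIND_INFO field is a single byte, which is why the real
  // emitter returns uint8_t and callers must keep this <= 255.
  return Count;
}
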
-static uint8_t CountOfUnwindCodes(std::vector<MCWin64EHInstruction> &instArray){
-  uint8_t count = 0;
-  for (std::vector<MCWin64EHInstruction>::const_iterator I = instArray.begin(),
-       E = instArray.end(); I != E; ++I) {
-    switch (I->getOperation()) {
+static uint8_t CountOfUnwindCodes(std::vector<WinEH::Instruction> &Insns) {
+  uint8_t Count = 0;
+  for (const auto &I : Insns) {
+    switch (static_cast<Win64EH::UnwindOpcodes>(I.Operation)) {
     case Win64EH::UOP_PushNonVol:
     case Win64EH::UOP_AllocSmall:
     case Win64EH::UOP_SetFPReg:
     case Win64EH::UOP_PushMachFrame:
-      count += 1;
+      Count += 1;
       break;
     case Win64EH::UOP_SaveNonVol:
     case Win64EH::UOP_SaveXMM128:
-      count += 2;
+      Count += 2;
       break;
     case Win64EH::UOP_SaveNonVolBig:
     case Win64EH::UOP_SaveXMM128Big:
-      count += 3;
+      Count += 3;
       break;
     case Win64EH::UOP_AllocLarge:
-      if (I->getSize() > 512*1024-8)
-        count += 3;
-      else
-        count += 2;
+      Count += (I.Offset > 512 * 1024 - 8) ? 3 : 2;
       break;
     }
   }
-  return count;
+  return Count;
 }
 
-static void EmitAbsDifference(MCStreamer &streamer, MCSymbol *lhs,
-                              MCSymbol *rhs) {
-  MCContext &context = streamer.getContext();
-  const MCExpr *diff = MCBinaryExpr::CreateSub(MCSymbolRefExpr::Create(
-                                                                  lhs, context),
-                                               MCSymbolRefExpr::Create(
-                                                                  rhs, context),
-                                               context);
-  streamer.EmitAbsValue(diff, 1);
-
+static void EmitAbsDifference(MCStreamer &Streamer, const MCSymbol *LHS,
+                              const MCSymbol *RHS) {
+  MCContext &Context = Streamer.getContext();
+  const MCExpr *Diff =
+      MCBinaryExpr::CreateSub(MCSymbolRefExpr::Create(LHS, Context),
+                              MCSymbolRefExpr::Create(RHS, Context), Context);
+  Streamer.EmitAbsValue(Diff, 1);
 }
 
 static void EmitUnwindCode(MCStreamer &streamer, MCSymbol *begin,
-                           MCWin64EHInstruction &inst) {
+                           WinEH::Instruction &inst) {
   uint8_t b2;
   uint16_t w;
-  b2 = (inst.getOperation() & 0x0F);
-  switch (inst.getOperation()) {
+  b2 = (inst.Operation & 0x0F);
+  switch (static_cast<Win64EH::UnwindOpcodes>(inst.Operation)) {
   case Win64EH::UOP_PushNonVol:
-    EmitAbsDifference(streamer, inst.getLabel(), begin);
-    b2 |= (inst.getRegister() & 0x0F) << 4;
+    EmitAbsDifference(streamer, inst.Label, begin);
+    b2 |= (inst.Register & 0x0F) << 4;
     streamer.EmitIntValue(b2, 1);
     break;
   case Win64EH::UOP_AllocLarge:
-    EmitAbsDifference(streamer, inst.getLabel(), begin);
-    if (inst.getSize() > 512*1024-8) {
+    EmitAbsDifference(streamer, inst.Label, begin);
+    if (inst.Offset > 512 * 1024 - 8) {
       b2 |= 0x10;
       streamer.EmitIntValue(b2, 1);
-      w = inst.getSize() & 0xFFF8;
+      w = inst.Offset & 0xFFF8;
       streamer.EmitIntValue(w, 2);
-      w = inst.getSize() >> 16;
+      w = inst.Offset >> 16;
     } else {
       streamer.EmitIntValue(b2, 1);
-      w = inst.getSize() >> 3;
+      w = inst.Offset >> 3;
     }
     streamer.EmitIntValue(w, 2);
     break;
   case Win64EH::UOP_AllocSmall:
-    b2 |= (((inst.getSize()-8) >> 3) & 0x0F) << 4;
-    EmitAbsDifference(streamer, inst.getLabel(), begin);
+    b2 |= (((inst.Offset - 8) >> 3) & 0x0F) << 4;
+    EmitAbsDifference(streamer, inst.Label, begin);
     streamer.EmitIntValue(b2, 1);
     break;
   case Win64EH::UOP_SetFPReg:
-    EmitAbsDifference(streamer, inst.getLabel(), begin);
+    EmitAbsDifference(streamer, inst.Label, begin);
     streamer.EmitIntValue(b2, 1);
     break;
   case Win64EH::UOP_SaveNonVol:
   case Win64EH::UOP_SaveXMM128:
-    b2 |= (inst.getRegister() & 0x0F) << 4;
-    EmitAbsDifference(streamer, inst.getLabel(), begin);
+    b2 |= (inst.Register & 0x0F) << 4;
+    EmitAbsDifference(streamer, inst.Label, begin);
     streamer.EmitIntValue(b2, 1);
-    w = inst.getOffset() >> 3;
-    if (inst.getOperation() == Win64EH::UOP_SaveXMM128)
+    w = inst.Offset >> 3;
+    if (inst.Operation == Win64EH::UOP_SaveXMM128)
       w >>= 1;
     streamer.EmitIntValue(w, 2);
     break;
   case Win64EH::UOP_SaveNonVolBig:
   case Win64EH::UOP_SaveXMM128Big:
-    b2 |= (inst.getRegister() & 0x0F) << 4;
-    EmitAbsDifference(streamer, inst.getLabel(), begin);
+    b2 |= (inst.Register & 0x0F) << 4;
+    EmitAbsDifference(streamer, inst.Label, begin);
     streamer.EmitIntValue(b2, 1);
-    if (inst.getOperation() == Win64EH::UOP_SaveXMM128Big)
-      w = inst.getOffset() & 0xFFF0;
+    if (inst.Operation == Win64EH::UOP_SaveXMM128Big)
+      w = inst.Offset & 0xFFF0;
     else
-      w = inst.getOffset() & 0xFFF8;
+      w = inst.Offset & 0xFFF8;
     streamer.EmitIntValue(w, 2);
-    w = inst.getOffset() >> 16;
+    w = inst.Offset >> 16;
     streamer.EmitIntValue(w, 2);
     break;
   case Win64EH::UOP_PushMachFrame:
-    if (inst.isPushCodeFrame())
+    if (inst.Offset == 1)
       b2 |= 0x10;
-    EmitAbsDifference(streamer, inst.getLabel(), begin);
+    EmitAbsDifference(streamer, inst.Label, begin);
     streamer.EmitIntValue(b2, 1);
     break;
   }
@@ -142,7 +136,7 @@ static void EmitSymbolRefWithOfs(MCStreamer &streamer,
 }
 
 static void EmitRuntimeFunction(MCStreamer &streamer,
-                                const MCWin64EHUnwindInfo *info) {
+                                const MCWinFrameInfo *info) {
   MCContext &context = streamer.getContext();
 
   streamer.EmitValueToAlignment(4);
@@ -153,7 +147,7 @@ static void EmitRuntimeFunction(MCStreamer &streamer,
                                                       context), 4);
 }
 
-static void EmitUnwindInfo(MCStreamer &streamer, MCWin64EHUnwindInfo *info) {
+static void EmitUnwindInfo(MCStreamer &streamer, MCWinFrameInfo *info) {
   // If this UNWIND_INFO already has a symbol, it's already been emitted.
   if (info->Symbol)
     return;
@@ -184,17 +178,16 @@ static void EmitUnwindInfo(MCStreamer &streamer, MCWinFrameInfo *info) {
   uint8_t frame = 0;
   if (info->LastFrameInst >= 0) {
-    MCWin64EHInstruction &frameInst = info->Instructions[info->LastFrameInst];
-    assert(frameInst.getOperation() == Win64EH::UOP_SetFPReg);
-    frame = (frameInst.getRegister() & 0x0F) |
-            (frameInst.getOffset() & 0xF0);
+    WinEH::Instruction &frameInst = info->Instructions[info->LastFrameInst];
+    assert(frameInst.Operation == Win64EH::UOP_SetFPReg);
+    frame = (frameInst.Register & 0x0F) | (frameInst.Offset & 0xF0);
   }
   streamer.EmitIntValue(frame, 1);
 
   // Emit unwind instructions (in reverse order).
   uint8_t numInst = info->Instructions.size();
   for (uint8_t c = 0; c < numInst; ++c) {
-    MCWin64EHInstruction inst = info->Instructions.back();
+    WinEH::Instruction inst = info->Instructions.back();
     info->Instructions.pop_back();
     EmitUnwindCode(streamer, info->Begin, inst);
   }
@@ -263,7 +256,7 @@ static const MCSection *getWin64EHFuncTableSection(StringRef suffix,
 }
 
 void MCWin64EHUnwindEmitter::EmitUnwindInfo(MCStreamer &streamer,
-                                            MCWin64EHUnwindInfo *info) {
+                                            MCWinFrameInfo *info) {
   // Switch sections (the static function above is meant to be called from
   // here and from Emit()).
   MCContext &context = streamer.getContext();
@@ -274,23 +267,23 @@ void MCWin64EHUnwindEmitter::EmitUnwindInfo(MCStreamer &streamer,
   llvm::EmitUnwindInfo(streamer, info);
 }
 
-void MCWin64EHUnwindEmitter::Emit(MCStreamer &streamer) {
-  MCContext &context = streamer.getContext();
+void MCWin64EHUnwindEmitter::Emit(MCStreamer &Streamer) {
+  MCContext &Context = Streamer.getContext();
+  // Emit the unwind info structs first.
- for (unsigned i = 0; i < streamer.getNumW64UnwindInfos(); ++i) { - MCWin64EHUnwindInfo &info = streamer.getW64UnwindInfo(i); - const MCSection *xdataSect = - getWin64EHTableSection(GetSectionSuffix(info.Function), context); - streamer.SwitchSection(xdataSect); - llvm::EmitUnwindInfo(streamer, &info); + for (const auto &CFI : Streamer.getWinFrameInfos()) { + const MCSection *XData = + getWin64EHTableSection(GetSectionSuffix(CFI->Function), Context); + Streamer.SwitchSection(XData); + EmitUnwindInfo(Streamer, CFI); } + // Now emit RUNTIME_FUNCTION entries. - for (unsigned i = 0; i < streamer.getNumW64UnwindInfos(); ++i) { - MCWin64EHUnwindInfo &info = streamer.getW64UnwindInfo(i); - const MCSection *pdataSect = - getWin64EHFuncTableSection(GetSectionSuffix(info.Function), context); - streamer.SwitchSection(pdataSect); - EmitRuntimeFunction(streamer, &info); + for (const auto &CFI : Streamer.getWinFrameInfos()) { + const MCSection *PData = + getWin64EHFuncTableSection(GetSectionSuffix(CFI->Function), Context); + Streamer.SwitchSection(PData); + EmitRuntimeFunction(Streamer, CFI); } } diff --git a/lib/MC/Makefile b/lib/MC/Makefile index bf8b7c0e7831..a10f17e30be8 100644 --- a/lib/MC/Makefile +++ b/lib/MC/Makefile @@ -10,7 +10,7 @@ LEVEL = ../.. LIBRARYNAME = LLVMMC BUILD_ARCHIVE := 1 -PARALLEL_DIRS := MCParser MCDisassembler +PARALLEL_DIRS := MCAnalysis MCParser MCDisassembler include $(LEVEL)/Makefile.common diff --git a/lib/Object/StringTableBuilder.cpp b/lib/MC/StringTableBuilder.cpp similarity index 96% rename from lib/Object/StringTableBuilder.cpp rename to lib/MC/StringTableBuilder.cpp index 9152834a2966..db58ece5c866 100644 --- a/lib/Object/StringTableBuilder.cpp +++ b/lib/MC/StringTableBuilder.cpp @@ -7,8 +7,8 @@ // //===----------------------------------------------------------------------===// +#include "llvm/MC/StringTableBuilder.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/Object/StringTableBuilder.h" using namespace llvm; diff --git a/lib/MC/WinCOFFObjectWriter.cpp b/lib/MC/WinCOFFObjectWriter.cpp index 2d4b75858a5f..824895be32de 100644 --- a/lib/MC/WinCOFFObjectWriter.cpp +++ b/lib/MC/WinCOFFObjectWriter.cpp @@ -153,7 +153,7 @@ class WinCOFFObjectWriter : public MCObjectWriter { void MakeSymbolReal(COFFSymbol &S, size_t Index); void MakeSectionReal(COFFSection &S, size_t Number); - bool ExportSymbol(MCSymbolData const &SymbolData, MCAssembler &Asm); + bool ExportSymbol(const MCSymbol &Symbol, MCAssembler &Asm); bool IsPhysicalSection(COFFSection *S); @@ -456,10 +456,13 @@ void WinCOFFObjectWriter::DefineSymbol(MCSymbolData const &SymbolData, // If no storage class was specified in the streamer, define it here. if (coff_symbol->Data.StorageClass == 0) { - bool external = ResSymData.isExternal() || !ResSymData.Fragment; + bool IsExternal = + ResSymData.isExternal() || + (!ResSymData.getFragment() && !ResSymData.getSymbol().isVariable()); - coff_symbol->Data.StorageClass = - external ? COFF::IMAGE_SYM_CLASS_EXTERNAL : COFF::IMAGE_SYM_CLASS_STATIC; + coff_symbol->Data.StorageClass = IsExternal + ? COFF::IMAGE_SYM_CLASS_EXTERNAL + : COFF::IMAGE_SYM_CLASS_STATIC; } if (!Base) { @@ -546,16 +549,24 @@ void WinCOFFObjectWriter::MakeSymbolReal(COFFSymbol &S, size_t Index) { S.Index = Index; } -bool WinCOFFObjectWriter::ExportSymbol(MCSymbolData const &SymbolData, +bool WinCOFFObjectWriter::ExportSymbol(const MCSymbol &Symbol, MCAssembler &Asm) { // This doesn't seem to be right. 
Strings referred to from the .data section // need symbols so they can be linked to code in the .text section right? - // return Asm.isSymbolLinkerVisible (&SymbolData); + // return Asm.isSymbolLinkerVisible(Symbol); + + // Non-temporary labels should always be visible to the linker. + if (!Symbol.isTemporary()) + return true; + + // Absolute temporary labels are never visible. + if (!Symbol.isInSection()) + return false; // For now, all non-variable symbols are exported, // the linker will sort the rest out for us. - return SymbolData.isExternal() || !SymbolData.getSymbol().isVariable(); + return !Symbol.isVariable(); } bool WinCOFFObjectWriter::IsPhysicalSection(COFFSection *S) { @@ -689,7 +700,7 @@ void WinCOFFObjectWriter::ExecutePostLayoutBinding(MCAssembler &Asm, DefineSection(Section); for (MCSymbolData &SD : Asm.symbols()) - if (ExportSymbol(SD, Asm)) + if (ExportSymbol(SD.getSymbol(), Asm)) DefineSymbol(SD, Asm, Layout); } diff --git a/lib/MC/WinCOFFStreamer.cpp b/lib/MC/WinCOFFStreamer.cpp index e6df4651a536..d391a3f43956 100644 --- a/lib/MC/WinCOFFStreamer.cpp +++ b/lib/MC/WinCOFFStreamer.cpp @@ -82,10 +82,6 @@ void MCWinCOFFStreamer::EmitLabel(MCSymbol *Symbol) { MCObjectStreamer::EmitLabel(Symbol); } -void MCWinCOFFStreamer::EmitDebugLabel(MCSymbol *Symbol) { - EmitLabel(Symbol); -} - void MCWinCOFFStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) { llvm_unreachable("not implemented"); } @@ -242,7 +238,7 @@ void MCWinCOFFStreamer::EmitIdent(StringRef IdentString) { llvm_unreachable("not implemented"); } -void MCWinCOFFStreamer::EmitWin64EHHandlerData() { +void MCWinCOFFStreamer::EmitWinEHHandlerData() { llvm_unreachable("not implemented"); } diff --git a/lib/Object/YAML.cpp b/lib/MC/YAML.cpp similarity index 78% rename from lib/Object/YAML.cpp rename to lib/MC/YAML.cpp index 61e9da303959..067e91a26d37 100644 --- a/lib/Object/YAML.cpp +++ b/lib/MC/YAML.cpp @@ -12,21 +12,20 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Object/YAML.h" +#include "llvm/MC/YAML.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Support/raw_ostream.h" #include using namespace llvm; -using namespace object::yaml; -void yaml::ScalarTraits::output( - const object::yaml::BinaryRef &Val, void *, llvm::raw_ostream &Out) { +void yaml::ScalarTraits::output( + const yaml::BinaryRef &Val, void *, llvm::raw_ostream &Out) { Val.writeAsHex(Out); } -StringRef yaml::ScalarTraits::input( - StringRef Scalar, void *, object::yaml::BinaryRef &Val) { +StringRef yaml::ScalarTraits::input(StringRef Scalar, void *, + yaml::BinaryRef &Val) { if (Scalar.size() % 2 != 0) return "BinaryRef hex string must contain an even number of nybbles."; // TODO: Can we improve YAMLIO to permit a more accurate diagnostic here? 
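
The BinaryRef scalar traits above reject a hex string unless it has an even number of nybbles and every character is a hex digit; writeAsBinary then decodes two nybbles per output byte. A small self-contained sketch of that validate-then-decode flow (helper names here are illustrative, not the LLVM API):

#include <cctype>
#include <cstdint>
#include <string>
#include <vector>

// Returns an error message on bad input, or an empty string on success,
// mirroring the StringRef-as-error convention of ScalarTraits::input.
std::string validateHex(const std::string &Scalar) {
  if (Scalar.size() % 2 != 0)
    return "BinaryRef hex string must contain an even number of nybbles.";
  for (char C : Scalar)
    if (!isxdigit(static_cast<unsigned char>(C)))
      return "BinaryRef hex string must contain only hex digits.";
  return "";
}

// Decode two nybbles per output byte, as writeAsBinary does for a
// hex-backed BinaryRef. Assumes validateHex() accepted the input.
std::vector<uint8_t> hexToBytes(const std::string &Scalar) {
  auto Nybble = [](char C) -> uint8_t {
    return C <= '9' ? C - '0' : ((C | 0x20) - 'a' + 10);
  };
  std::vector<uint8_t> Out;
  for (size_t I = 0; I + 1 < Scalar.size(); I += 2)
    Out.push_back(static_cast<uint8_t>((Nybble(Scalar[I]) << 4) |
                                       Nybble(Scalar[I + 1])));
  return Out;
}
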
@@ -34,11 +33,11 @@ StringRef yaml::ScalarTraits<object::yaml::BinaryRef>::input(
   for (unsigned I = 0, N = Scalar.size(); I != N; ++I)
     if (!isxdigit(Scalar[I]))
       return "BinaryRef hex string must contain only hex digits.";
-  Val = object::yaml::BinaryRef(Scalar);
+  Val = yaml::BinaryRef(Scalar);
   return StringRef();
 }
 
-void BinaryRef::writeAsBinary(raw_ostream &OS) const {
+void yaml::BinaryRef::writeAsBinary(raw_ostream &OS) const {
   if (!DataIsHexString) {
     OS.write((const char *)Data.data(), Data.size());
     return;
@@ -50,7 +49,7 @@ void BinaryRef::writeAsBinary(raw_ostream &OS) const {
   }
 }
 
-void BinaryRef::writeAsHex(raw_ostream &OS) const {
+void yaml::BinaryRef::writeAsHex(raw_ostream &OS) const {
   if (binary_size() == 0)
     return;
   if (DataIsHexString) {
diff --git a/lib/Object/Archive.cpp b/lib/Object/Archive.cpp
index 05e891384f12..e32bdd5c5bec 100644
--- a/lib/Object/Archive.cpp
+++ b/lib/Object/Archive.cpp
@@ -177,27 +177,26 @@ Archive::Child::getMemoryBuffer(bool FullPath) const {
 ErrorOr<std::unique_ptr<Binary>>
 Archive::Child::getAsBinary(LLVMContext *Context) const {
-  std::unique_ptr<Binary> ret;
   ErrorOr<std::unique_ptr<MemoryBuffer>> BuffOrErr = getMemoryBuffer();
   if (std::error_code EC = BuffOrErr.getError())
     return EC;
-  return createBinary(BuffOrErr.get().release(), Context);
+
+  return createBinary(std::move(*BuffOrErr), Context);
 }
 
-ErrorOr<Archive *> Archive::create(MemoryBuffer *Source) {
+ErrorOr<Archive *> Archive::create(std::unique_ptr<MemoryBuffer> Source) {
   std::error_code EC;
-  std::unique_ptr<Archive> Ret(new Archive(Source, EC));
+  std::unique_ptr<Archive> Ret(new Archive(std::move(Source), EC));
   if (EC)
     return EC;
   return Ret.release();
 }
 
-Archive::Archive(MemoryBuffer *source, std::error_code &ec)
-    : Binary(Binary::ID_Archive, source), SymbolTable(child_end()) {
+Archive::Archive(std::unique_ptr<MemoryBuffer> Source, std::error_code &ec)
+    : Binary(Binary::ID_Archive, std::move(Source)), SymbolTable(child_end()) {
   // Check for sufficient magic.
-  assert(source);
-  if (source->getBufferSize() < 8 ||
-      StringRef(source->getBufferStart(), 8) != Magic) {
+  if (Data->getBufferSize() < 8 ||
+      StringRef(Data->getBufferStart(), 8) != Magic) {
     ec = object_error::invalid_file_type;
     return;
   }
@@ -338,7 +337,14 @@ ErrorOr<Archive::child_iterator> Archive::Symbol::getMember() const {
     Offset = *(reinterpret_cast<const support::ubig32_t *>(Offsets)
               + SymbolIndex);
   } else if (Parent->kind() == K_BSD) {
-    llvm_unreachable("BSD format is not supported");
+    // The SymbolIndex is an index into the ranlib structs that start at
+    // Offsets (the first uint32_t is the number of bytes of the ranlib
+    // structs). The ranlib structs are a pair of uint32_t's the first
+    // being a string table offset and the second being the offset into
+    // the archive of the member that defines the symbol, which is what
+    // is needed here.
+    Offset = *(reinterpret_cast<const support::ulittle32_t *>(Offsets) +
+               (SymbolIndex * 2) + 1);
   } else {
     uint32_t MemberCount = *reinterpret_cast<const support::ulittle32_t *>(Buf);
@@ -376,9 +382,43 @@ ErrorOr<Archive::child_iterator> Archive::Symbol::getMember() const {
 Archive::Symbol Archive::Symbol::getNext() const {
   Symbol t(*this);
-  // Go to one past next null.
-  t.StringIndex =
-      Parent->SymbolTable->getBuffer().find('\0', t.StringIndex) + 1;
+  if (Parent->kind() == K_BSD) {
+    // t.StringIndex is an offset from the start of the __.SYMDEF or
+    // "__.SYMDEF SORTED" member into the string table for the ranlib
+    // struct indexed by t.SymbolIndex. To change t.StringIndex to the
+    // offset in the string table for t.SymbolIndex+1 we subtract
+    // its offset from the start of the string table for t.SymbolIndex
+    // and add the offset of the string table for t.SymbolIndex+1.
+
+    // The __.SYMDEF or "__.SYMDEF SORTED" member starts with a uint32_t
+    // which is the number of bytes of ranlib structs that follow. The ranlib
+    // structs are a pair of uint32_t's the first being a string table offset
+    // and the second being the offset into the archive of the member that
+    // defines the symbol. After that the next uint32_t is the byte count of
+    // the string table followed by the string table.
+    const char *Buf = Parent->SymbolTable->getBuffer().begin();
+    uint32_t RanlibCount = 0;
+    RanlibCount = (*reinterpret_cast<const support::ulittle32_t *>(Buf)) /
+                  (sizeof(uint32_t) * 2);
+    // If t.SymbolIndex + 1 will be past the count of symbols (the RanlibCount)
+    // don't change the t.StringIndex as we don't want to reference a ranlib
+    // past RanlibCount.
+    if (t.SymbolIndex + 1 < RanlibCount) {
+      const char *Ranlibs = Buf + 4;
+      uint32_t CurRanStrx = 0;
+      uint32_t NextRanStrx = 0;
+      CurRanStrx = *(reinterpret_cast<const support::ulittle32_t *>(Ranlibs) +
+                     (t.SymbolIndex * 2));
+      NextRanStrx = *(reinterpret_cast<const support::ulittle32_t *>(Ranlibs) +
+                      ((t.SymbolIndex + 1) * 2));
+      t.StringIndex -= CurRanStrx;
+      t.StringIndex += NextRanStrx;
+    }
+  } else {
+    // Go to one past next null.
+    t.StringIndex =
+        Parent->SymbolTable->getBuffer().find('\0', t.StringIndex) + 1;
+  }
   ++t.SymbolIndex;
   return t;
 }
@@ -393,7 +433,22 @@ Archive::symbol_iterator Archive::symbol_begin() const {
     symbol_count = *reinterpret_cast<const support::ubig32_t *>(buf);
     buf += sizeof(uint32_t) + (symbol_count * (sizeof(uint32_t)));
   } else if (kind() == K_BSD) {
-    llvm_unreachable("BSD archive format is not supported");
+    // The __.SYMDEF or "__.SYMDEF SORTED" member starts with a uint32_t
+    // which is the number of bytes of ranlib structs that follow. The ranlib
+    // structs are a pair of uint32_t's the first being a string table offset
+    // and the second being the offset into the archive of the member that
+    // defines the symbol. After that the next uint32_t is the byte count of
+    // the string table followed by the string table.
+    uint32_t ranlib_count = 0;
+    ranlib_count = (*reinterpret_cast<const support::ulittle32_t *>(buf)) /
+                   (sizeof(uint32_t) * 2);
+    const char *ranlibs = buf + 4;
+    uint32_t ran_strx = 0;
+    ran_strx = *(reinterpret_cast<const support::ulittle32_t *>(ranlibs));
+    buf += sizeof(uint32_t) + (ranlib_count * (2 * (sizeof(uint32_t))));
+    // Skip the byte count of the string table.
+    buf += sizeof(uint32_t);
+    buf += ran_strx;
   } else {
     uint32_t member_count = 0;
     uint32_t symbol_count = 0;
@@ -415,7 +470,8 @@ Archive::symbol_iterator Archive::symbol_end() const {
   if (kind() == K_GNU) {
     symbol_count = *reinterpret_cast<const support::ubig32_t *>(buf);
   } else if (kind() == K_BSD) {
-    llvm_unreachable("BSD archive format is not supported");
+    symbol_count = (*reinterpret_cast<const support::ulittle32_t *>(buf)) /
+                   (sizeof(uint32_t) * 2);
   } else {
     uint32_t member_count = 0;
     member_count = *reinterpret_cast<const support::ulittle32_t *>(buf);
diff --git a/lib/Object/Binary.cpp b/lib/Object/Binary.cpp
index c55ed0c75988..552d5db89c54 100644
--- a/lib/Object/Binary.cpp
+++ b/lib/Object/Binary.cpp
@@ -25,13 +25,10 @@
 using namespace llvm;
 using namespace object;
 
-Binary::~Binary() {
-  if (BufferOwned)
-    delete Data;
-}
+Binary::~Binary() {}
 
-Binary::Binary(unsigned int Type, MemoryBuffer *Source, bool BufferOwned)
-    : TypeID(Type), BufferOwned(BufferOwned), Data(Source) {}
+Binary::Binary(unsigned int Type, std::unique_ptr<MemoryBuffer> Source)
+    : TypeID(Type), Data(std::move(Source)) {}
 
 StringRef Binary::getData() const {
   return Data->getBuffer();
@@ -41,14 +38,13 @@ StringRef Binary::getFileName() const {
   return Data->getBufferIdentifier();
 }
 
-ErrorOr<Binary *> object::createBinary(MemoryBuffer *Source,
+ErrorOr<Binary *> object::createBinary(std::unique_ptr<MemoryBuffer> Buffer,
                                        LLVMContext *Context) {
-  std::unique_ptr<MemoryBuffer> scopedSource(Source);
-  sys::fs::file_magic Type = sys::fs::identify_magic(Source->getBuffer());
+  sys::fs::file_magic Type = sys::fs::identify_magic(Buffer->getBuffer());
 
   switch (Type) {
     case sys::fs::file_magic::archive:
-      return Archive::create(scopedSource.release());
+      return Archive::create(std::move(Buffer));
     case sys::fs::file_magic::elf_relocatable:
     case sys::fs::file_magic::elf_executable:
     case sys::fs::file_magic::elf_shared_object:
@@ -67,10 +63,9 @@ ErrorOr<Binary *> object::createBinary(MemoryBuffer *Source,
     case sys::fs::file_magic::coff_import_library:
     case sys::fs::file_magic::pecoff_executable:
    case sys::fs::file_magic::bitcode:
-      return ObjectFile::createSymbolicFile(scopedSource.release(), true, Type,
-                                            Context);
+      return ObjectFile::createSymbolicFile(Buffer, Type, Context);
     case sys::fs::file_magic::macho_universal_binary:
-      return MachOUniversalBinary::create(scopedSource.release());
+      return MachOUniversalBinary::create(std::move(Buffer));
     case sys::fs::file_magic::unknown:
     case sys::fs::file_magic::windows_resource:
       // Unrecognized object file format.
@@ -80,8 +75,9 @@ ErrorOr<Binary *> object::createBinary(MemoryBuffer *Source,
 }
 
 ErrorOr<Binary *> object::createBinary(StringRef Path) {
-  std::unique_ptr<MemoryBuffer> File;
-  if (std::error_code EC = MemoryBuffer::getFileOrSTDIN(Path, File))
+  ErrorOr<std::unique_ptr<MemoryBuffer>> FileOrErr =
+      MemoryBuffer::getFileOrSTDIN(Path);
+  if (std::error_code EC = FileOrErr.getError())
     return EC;
-  return createBinary(File.release());
+  return createBinary(std::move(*FileOrErr));
 }
diff --git a/lib/Object/CMakeLists.txt b/lib/Object/CMakeLists.txt
index cd8c9efe7b01..5b08e42dd9e6 100644
--- a/lib/Object/CMakeLists.txt
+++ b/lib/Object/CMakeLists.txt
@@ -12,7 +12,6 @@ add_llvm_library(LLVMObject
   MachOUniversal.cpp
   Object.cpp
   ObjectFile.cpp
-  StringTableBuilder.cpp
+  RecordStreamer.cpp
   SymbolicFile.cpp
-  YAML.cpp
   )
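
The BSD (__.SYMDEF) support added to Archive.cpp above walks a symbol table laid out as a uint32_t byte count, followed by pairs of little-endian 32-bit words (string-table offset, member offset), followed by the string table's own byte count and the strings. A hedged sketch of that traversal over a raw buffer, assuming a little-endian host for brevity (the real code goes through llvm::support endian-aware types, and the names here are illustrative):

#include <cstdint>
#include <cstring>
#include <utility>
#include <vector>

// Layout of a BSD __.SYMDEF member (all fields little-endian uint32_t):
//   [0]            byte count of the ranlib pairs that follow
//   [pairs]        (string table offset, member offset) per symbol
//   [after pairs]  byte count of the string table, then the strings
static uint32_t readU32(const char *P) {
  uint32_t V;
  std::memcpy(&V, P, sizeof(V)); // assumes a little-endian host
  return V;
}

std::vector<std::pair<const char *, uint32_t>>
readBsdSymdef(const char *Buf) {
  uint32_t RanlibBytes = readU32(Buf);
  uint32_t Count = RanlibBytes / (2 * sizeof(uint32_t));
  const char *Ranlibs = Buf + 4;
  const char *StrTab = Ranlibs + RanlibBytes + 4; // skip string table size
  std::vector<std::pair<const char *, uint32_t>> Syms;
  for (uint32_t I = 0; I != Count; ++I) {
    uint32_t StrOff = readU32(Ranlibs + (2 * I) * 4);        // name offset
    uint32_t MemberOff = readU32(Ranlibs + (2 * I + 1) * 4); // member offset
    Syms.push_back({StrTab + StrOff, MemberOff});
  }
  return Syms;
}
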
diff --git a/lib/Object/COFFObjectFile.cpp b/lib/Object/COFFObjectFile.cpp
index 186d64bafd44..46ef87d15680 100644
--- a/lib/Object/COFFObjectFile.cpp
+++ b/lib/Object/COFFObjectFile.cpp
@@ -31,9 +31,9 @@ using support::ulittle32_t;
 using support::little16_t;
 
 // Returns false if size is greater than the buffer size, and sets EC.
-static bool checkSize(const MemoryBuffer *M, std::error_code &EC,
+static bool checkSize(const MemoryBuffer &M, std::error_code &EC,
                       uint64_t Size) {
-  if (M->getBufferSize() < Size) {
+  if (M.getBufferSize() < Size) {
     EC = object_error::unexpected_eof;
     return false;
   }
@@ -43,13 +43,12 @@ static bool checkSize(const MemoryBuffer *M, std::error_code &EC,
 // Sets Obj unless any bytes in [addr, addr + size) fall outside of M.
 // Returns unexpected_eof if error.
 template <typename T>
-static std::error_code getObject(const T *&Obj, const MemoryBuffer *M,
+static std::error_code getObject(const T *&Obj, const MemoryBuffer &M,
                                  const uint8_t *Ptr,
                                  const size_t Size = sizeof(T)) {
   uintptr_t Addr = uintptr_t(Ptr);
-  if (Addr + Size < Addr ||
-      Addr + Size < Size ||
-      Addr + Size > uintptr_t(M->getBufferEnd())) {
+  if (Addr + Size < Addr || Addr + Size < Size ||
+      Addr + Size > uintptr_t(M.getBufferEnd())) {
     return object_error::unexpected_eof;
   }
   Obj = reinterpret_cast<const T *>(Addr);
@@ -398,7 +397,7 @@ relocation_iterator COFFObjectFile::section_rel_end(DataRefImpl Ref) const {
 // Initialize the pointer to the symbol table.
 std::error_code COFFObjectFile::initSymbolTablePtr() {
   if (std::error_code EC = getObject(
-          SymbolTable, Data, base() + COFFHeader->PointerToSymbolTable,
+          SymbolTable, *Data, base() + COFFHeader->PointerToSymbolTable,
           COFFHeader->NumberOfSymbols * sizeof(coff_symbol)))
     return EC;
 
@@ -409,11 +408,12 @@ std::error_code COFFObjectFile::initSymbolTablePtr() {
       base() + COFFHeader->PointerToSymbolTable +
       COFFHeader->NumberOfSymbols * sizeof(coff_symbol);
   const ulittle32_t *StringTableSizePtr;
-  if (std::error_code EC = getObject(StringTableSizePtr, Data, StringTableAddr))
+  if (std::error_code EC =
+          getObject(StringTableSizePtr, *Data, StringTableAddr))
     return EC;
   StringTableSize = *StringTableSizePtr;
   if (std::error_code EC =
-          getObject(StringTable, Data, StringTableAddr, StringTableSize))
+          getObject(StringTable, *Data, StringTableAddr, StringTableSize))
     return EC;
 
   // Treat table sizes < 4 as empty because contrary to the PECOFF spec, some
@@ -511,15 +511,16 @@ std::error_code COFFObjectFile::initExportTablePtr() {
   return object_error::success;
 }
 
-COFFObjectFile::COFFObjectFile(MemoryBuffer *Object, std::error_code &EC,
-                               bool BufferOwned)
-    : ObjectFile(Binary::ID_COFF, Object, BufferOwned), COFFHeader(nullptr),
+COFFObjectFile::COFFObjectFile(std::unique_ptr<MemoryBuffer> Object,
+                               std::error_code &EC)
+    : ObjectFile(Binary::ID_COFF, std::move(Object)), COFFHeader(nullptr),
       PE32Header(nullptr), PE32PlusHeader(nullptr), DataDirectory(nullptr),
       SectionTable(nullptr), SymbolTable(nullptr), StringTable(nullptr),
       StringTableSize(0), ImportDirectory(nullptr), NumberOfImportDirectory(0),
       ExportDirectory(nullptr) {
   // Check that we at least have enough room for a header.
-  if (!checkSize(Data, EC, sizeof(coff_file_header))) return;
+  if (!checkSize(*Data, EC, sizeof(coff_file_header)))
+    return;
 
   // The current location in the file where we are looking at.
   uint64_t CurPtr = 0;
@@ -532,7 +533,8 @@ COFFObjectFile::COFFObjectFile(MemoryBuffer *Object, std::error_code &EC,
   if (base()[0] == 0x4d && base()[1] == 0x5a) {
     // PE/COFF, seek through MS-DOS compatibility stub and 4-byte
     // PE signature to find 'normal' COFF header.
-    if (!checkSize(Data, EC, 0x3c + 8)) return;
+    if (!checkSize(*Data, EC, 0x3c + 8))
+      return;
     CurPtr = *reinterpret_cast<const ulittle16_t *>(base() + 0x3c);
     // Check the PE magic bytes.
("PE\0\0") if (std::memcmp(base() + CurPtr, "PE\0\0", 4) != 0) { @@ -543,13 +545,13 @@ COFFObjectFile::COFFObjectFile(MemoryBuffer *Object, std::error_code &EC, HasPEHeader = true; } - if ((EC = getObject(COFFHeader, Data, base() + CurPtr))) + if ((EC = getObject(COFFHeader, *Data, base() + CurPtr))) return; CurPtr += sizeof(coff_file_header); if (HasPEHeader) { const pe32_header *Header; - if ((EC = getObject(Header, Data, base() + CurPtr))) + if ((EC = getObject(Header, *Data, base() + CurPtr))) return; const uint8_t *DataDirAddr; @@ -567,7 +569,7 @@ COFFObjectFile::COFFObjectFile(MemoryBuffer *Object, std::error_code &EC, EC = object_error::parse_failed; return; } - if ((EC = getObject(DataDirectory, Data, DataDirAddr, DataDirSize))) + if ((EC = getObject(DataDirectory, *Data, DataDirAddr, DataDirSize))) return; CurPtr += COFFHeader->SizeOfOptionalHeader; } @@ -575,7 +577,7 @@ COFFObjectFile::COFFObjectFile(MemoryBuffer *Object, std::error_code &EC, if (COFFHeader->isImportLibrary()) return; - if ((EC = getObject(SectionTable, Data, base() + CurPtr, + if ((EC = getObject(SectionTable, *Data, base() + CurPtr, COFFHeader->NumberOfSections * sizeof(coff_section)))) return; @@ -1110,11 +1112,11 @@ ExportDirectoryEntryRef::getSymbolName(StringRef &Result) const { return object_error::success; } -ErrorOr ObjectFile::createCOFFObjectFile(MemoryBuffer *Object, - bool BufferOwned) { +ErrorOr +ObjectFile::createCOFFObjectFile(std::unique_ptr Object) { std::error_code EC; std::unique_ptr Ret( - new COFFObjectFile(Object, EC, BufferOwned)); + new COFFObjectFile(std::move(Object), EC)); if (EC) return EC; return Ret.release(); diff --git a/lib/Object/ELF.cpp b/lib/Object/ELF.cpp index df4dd5ea7c79..d999106ba982 100644 --- a/lib/Object/ELF.cpp +++ b/lib/Object/ELF.cpp @@ -519,6 +519,7 @@ StringRef getELFRelocationTypeName(uint32_t Machine, uint32_t Type) { LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT16_LO); LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT16_HI); LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT16_HA); + LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_PLTREL24); LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_REL32); LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_TLS); LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_DTPMOD32); diff --git a/lib/Object/ELFObjectFile.cpp b/lib/Object/ELFObjectFile.cpp index 0a3e2cb790d0..4f0f60b62428 100644 --- a/lib/Object/ELFObjectFile.cpp +++ b/lib/Object/ELFObjectFile.cpp @@ -17,9 +17,10 @@ namespace llvm { using namespace object; -static ErrorOr createELFObjectFileAux(MemoryBuffer *Obj, - bool BufferOwned) { - std::pair Ident = getElfArchType(Obj); +ErrorOr +ObjectFile::createELFObjectFile(std::unique_ptr &Obj) { + std::pair Ident = + getElfArchType(Obj->getBuffer()); std::size_t MaxAlignment = 1ULL << countTrailingZeros(uintptr_t(Obj->getBufferStart())); @@ -28,49 +29,49 @@ static ErrorOr createELFObjectFileAux(MemoryBuffer *Obj, if (Ident.first == ELF::ELFCLASS32 && Ident.second == ELF::ELFDATA2LSB) #if !LLVM_IS_UNALIGNED_ACCESS_FAST if (MaxAlignment >= 4) - R.reset(new ELFObjectFile >( - Obj, EC, BufferOwned)); + R.reset(new ELFObjectFile>( + std::move(Obj), EC)); else #endif if (MaxAlignment >= 2) - R.reset(new ELFObjectFile >( - Obj, EC, BufferOwned)); + R.reset(new ELFObjectFile>( + std::move(Obj), EC)); else return object_error::parse_failed; else if (Ident.first == ELF::ELFCLASS32 && Ident.second == ELF::ELFDATA2MSB) #if !LLVM_IS_UNALIGNED_ACCESS_FAST if (MaxAlignment >= 4) - R.reset(new ELFObjectFile >(Obj, EC, - BufferOwned)); + R.reset(new ELFObjectFile>(std::move(Obj), + EC)); else #endif if 
(MaxAlignment >= 2) - R.reset(new ELFObjectFile >(Obj, EC, - BufferOwned)); + R.reset(new ELFObjectFile>(std::move(Obj), + EC)); else return object_error::parse_failed; else if (Ident.first == ELF::ELFCLASS64 && Ident.second == ELF::ELFDATA2MSB) #if !LLVM_IS_UNALIGNED_ACCESS_FAST if (MaxAlignment >= 8) - R.reset(new ELFObjectFile >(Obj, EC, - BufferOwned)); + R.reset(new ELFObjectFile>(std::move(Obj), + EC)); else #endif if (MaxAlignment >= 2) - R.reset(new ELFObjectFile >(Obj, EC, - BufferOwned)); + R.reset(new ELFObjectFile>(std::move(Obj), + EC)); else return object_error::parse_failed; else if (Ident.first == ELF::ELFCLASS64 && Ident.second == ELF::ELFDATA2LSB) { #if !LLVM_IS_UNALIGNED_ACCESS_FAST if (MaxAlignment >= 8) - R.reset(new ELFObjectFile >( - Obj, EC, BufferOwned)); + R.reset(new ELFObjectFile>( + std::move(Obj), EC)); else #endif if (MaxAlignment >= 2) - R.reset(new ELFObjectFile >( - Obj, EC, BufferOwned)); + R.reset(new ELFObjectFile>( + std::move(Obj), EC)); else return object_error::parse_failed; } @@ -82,12 +83,4 @@ static ErrorOr createELFObjectFileAux(MemoryBuffer *Obj, return R.release(); } -ErrorOr ObjectFile::createELFObjectFile(MemoryBuffer *Obj, - bool BufferOwned) { - ErrorOr Ret = createELFObjectFileAux(Obj, BufferOwned); - if (BufferOwned && Ret.getError()) - delete Obj; - return Ret; -} - } // end namespace llvm diff --git a/lib/Object/ELFYAML.cpp b/lib/Object/ELFYAML.cpp index dc3d46782503..6340841e5ee6 100644 --- a/lib/Object/ELFYAML.cpp +++ b/lib/Object/ELFYAML.cpp @@ -298,6 +298,8 @@ void ScalarBitSetTraits::bitset(IO &IO, void ScalarEnumerationTraits::enumeration( IO &IO, ELFYAML::ELF_SHT &Value) { + const auto *Object = static_cast(IO.getContext()); + assert(Object && "The IO context is not initialized"); #define ECase(X) IO.enumCase(Value, #X, ELF::X); ECase(SHT_NULL) ECase(SHT_PROGBITS) @@ -325,15 +327,29 @@ void ScalarEnumerationTraits::enumeration( ECase(SHT_GNU_versym) ECase(SHT_HIOS) ECase(SHT_LOPROC) - ECase(SHT_ARM_EXIDX) - ECase(SHT_ARM_PREEMPTMAP) - ECase(SHT_ARM_ATTRIBUTES) - ECase(SHT_ARM_DEBUGOVERLAY) - ECase(SHT_ARM_OVERLAYSECTION) - ECase(SHT_HEX_ORDERED) - ECase(SHT_X86_64_UNWIND) - ECase(SHT_MIPS_REGINFO) - ECase(SHT_MIPS_OPTIONS) + switch (Object->Header.Machine) { + case ELF::EM_ARM: + ECase(SHT_ARM_EXIDX) + ECase(SHT_ARM_PREEMPTMAP) + ECase(SHT_ARM_ATTRIBUTES) + ECase(SHT_ARM_DEBUGOVERLAY) + ECase(SHT_ARM_OVERLAYSECTION) + break; + case ELF::EM_HEXAGON: + ECase(SHT_HEX_ORDERED) + break; + case ELF::EM_X86_64: + ECase(SHT_X86_64_UNWIND) + break; + case ELF::EM_MIPS: + ECase(SHT_MIPS_REGINFO) + ECase(SHT_MIPS_OPTIONS) + ECase(SHT_MIPS_ABIFLAGS) + break; + default: + // Nothing to do. 
+    break;
+  }
 #undef ECase
 }
diff --git a/lib/Object/IRObjectFile.cpp b/lib/Object/IRObjectFile.cpp
index 004d8de065d8..5323d9277eee 100644
--- a/lib/Object/IRObjectFile.cpp
+++ b/lib/Object/IRObjectFile.cpp
@@ -11,35 +11,119 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/Object/IRObjectFile.h"
+#include "RecordStreamer.h"
 #include "llvm/Bitcode/ReaderWriter.h"
 #include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/GVMaterializer.h"
 #include "llvm/IR/Mangler.h"
 #include "llvm/IR/Module.h"
-#include "llvm/Object/IRObjectFile.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCTargetAsmParser.h"
+#include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/raw_ostream.h"
 using namespace llvm;
 using namespace object;
 
-IRObjectFile::IRObjectFile(MemoryBuffer *Object, std::error_code &EC,
-                           LLVMContext &Context, bool BufferOwned)
-    : SymbolicFile(Binary::ID_IR, Object, BufferOwned) {
-  ErrorOr<Module *> MOrErr =
-      getLazyBitcodeModule(Object, Context, /*BufferOwned*/ false);
-  if ((EC = MOrErr.getError()))
-    return;
-
-  M.reset(MOrErr.get());
-
+IRObjectFile::IRObjectFile(std::unique_ptr<MemoryBuffer> Object,
+                           std::unique_ptr<Module> Mod)
+    : SymbolicFile(Binary::ID_IR, std::move(Object)), M(std::move(Mod)) {
   // If we have a DataLayout, setup a mangler.
   const DataLayout *DL = M->getDataLayout();
   if (!DL)
     return;
   Mang.reset(new Mangler(DL));
+
+  const std::string &InlineAsm = M->getModuleInlineAsm();
+  if (InlineAsm.empty())
+    return;
+
+  StringRef Triple = M->getTargetTriple();
+  std::string Err;
+  const Target *T = TargetRegistry::lookupTarget(Triple, Err);
+  if (!T)
+    return;
+
+  std::unique_ptr<MCRegisterInfo> MRI(T->createMCRegInfo(Triple));
+  if (!MRI)
+    return;
+
+  std::unique_ptr<MCAsmInfo> MAI(T->createMCAsmInfo(*MRI, Triple));
+  if (!MAI)
+    return;
+
+  std::unique_ptr<MCSubtargetInfo> STI(
+      T->createMCSubtargetInfo(Triple, "", ""));
+  if (!STI)
+    return;
+
+  std::unique_ptr<MCInstrInfo> MCII(T->createMCInstrInfo());
+  if (!MCII)
+    return;
+
+  MCObjectFileInfo MOFI;
+  MCContext MCCtx(MAI.get(), MRI.get(), &MOFI);
+  MOFI.InitMCObjectFileInfo(Triple, Reloc::Default, CodeModel::Default, MCCtx);
+  std::unique_ptr<RecordStreamer> Streamer(new RecordStreamer(MCCtx));
+
+  std::unique_ptr<MemoryBuffer> Buffer(MemoryBuffer::getMemBuffer(InlineAsm));
+  SourceMgr SrcMgr;
+  SrcMgr.AddNewSourceBuffer(Buffer.release(), SMLoc());
+  std::unique_ptr<MCAsmParser> Parser(
+      createMCAsmParser(SrcMgr, MCCtx, *Streamer, *MAI));
+
+  MCTargetOptions MCOptions;
+  std::unique_ptr<MCTargetAsmParser> TAP(
+      T->createMCAsmParser(*STI, *Parser, *MCII, MCOptions));
+  if (!TAP)
+    return;
+
+  Parser->setTargetParser(*TAP);
+  if (Parser->Run(false))
+    return;
+
+  for (auto &KV : *Streamer) {
+    StringRef Key = KV.first();
+    RecordStreamer::State Value = KV.second;
+    uint32_t Res = BasicSymbolRef::SF_None;
+    switch (Value) {
+    case RecordStreamer::NeverSeen:
+      llvm_unreachable("NeverSeen should have been replaced earlier");
+    case RecordStreamer::DefinedGlobal:
+      Res |= BasicSymbolRef::SF_Global;
+      break;
+    case RecordStreamer::Defined:
+      break;
+    case RecordStreamer::Global:
+    case RecordStreamer::Used:
+      Res |= BasicSymbolRef::SF_Undefined;
+      Res |= BasicSymbolRef::SF_Global;
+      break;
+    }
+    AsmSymbols.push_back(
+        std::make_pair(Key, std::move(Res)));
+  }
 }
 
-static const GlobalValue &getGV(DataRefImpl &Symb) {
-  return *reinterpret_cast<GlobalValue *>(Symb.p & ~uintptr_t(3));
+IRObjectFile::~IRObjectFile() {
+  GVMaterializer *GVM = M->getMaterializer();
+  if (GVM)
+    GVM->releaseBuffer();
+}
+
+static const GlobalValue *getGV(DataRefImpl &Symb) {
+  if ((Symb.p & 3) == 3)
+    return nullptr;
+
+  return reinterpret_cast<GlobalValue *>(Symb.p & ~uintptr_t(3));
 }
 
 static uintptr_t skipEmpty(Module::const_alias_iterator I, const Module &M) {
@@ -63,31 +147,43 @@ static uintptr_t skipEmpty(Module::const_iterator I, const Module &M) {
   return reinterpret_cast<uintptr_t>(GV) | 0;
 }
 
+static unsigned getAsmSymIndex(DataRefImpl Symb) {
+  assert((Symb.p & uintptr_t(3)) == 3);
+  uintptr_t Index = Symb.p & ~uintptr_t(3);
+  Index >>= 2;
+  return Index;
+}
+
 void IRObjectFile::moveSymbolNext(DataRefImpl &Symb) const {
-  const GlobalValue *GV = &getGV(Symb);
-  const Module &M = *GV->getParent();
+  const GlobalValue *GV = getGV(Symb);
   uintptr_t Res;
+
   switch (Symb.p & 3) {
   case 0: {
     Module::const_iterator Iter(static_cast<const Function *>(GV));
     ++Iter;
-    Res = skipEmpty(Iter, M);
+    Res = skipEmpty(Iter, *M);
     break;
   }
   case 1: {
     Module::const_global_iterator Iter(static_cast<const GlobalVariable *>(GV));
     ++Iter;
-    Res = skipEmpty(Iter, M);
+    Res = skipEmpty(Iter, *M);
     break;
   }
   case 2: {
     Module::const_alias_iterator Iter(static_cast<const GlobalAlias *>(GV));
     ++Iter;
-    Res = skipEmpty(Iter, M);
+    Res = skipEmpty(Iter, *M);
+    break;
+  }
+  case 3: {
+    unsigned Index = getAsmSymIndex(Symb);
+    assert(Index < AsmSymbols.size());
+    ++Index;
+    Res = (Index << 2) | 3;
     break;
   }
-  case 3:
-    llvm_unreachable("Invalid symbol reference");
   }
 
   Symb.p = Res;
@@ -95,12 +191,18 @@ void IRObjectFile::moveSymbolNext(DataRefImpl &Symb) const {
 
 std::error_code IRObjectFile::printSymbolName(raw_ostream &OS,
                                               DataRefImpl Symb) const {
-  const GlobalValue &GV = getGV(Symb);
+  const GlobalValue *GV = getGV(Symb);
+  if (!GV) {
+    unsigned Index = getAsmSymIndex(Symb);
+    assert(Index < AsmSymbols.size());
+    OS << AsmSymbols[Index].first;
+    return object_error::success;
+  }
 
   if (Mang)
-    Mang->getNameWithPrefix(OS, &GV, false);
+    Mang->getNameWithPrefix(OS, GV, false);
   else
-    OS << GV.getName();
+    OS << GV->getName();
 
   return object_error::success;
 }
@@ -116,25 +218,38 @@ static bool isDeclaration(const GlobalValue &V) {
 }
 
 uint32_t IRObjectFile::getSymbolFlags(DataRefImpl Symb) const {
-  const GlobalValue &GV = getGV(Symb);
+  const GlobalValue *GV = getGV(Symb);
+
+  if (!GV) {
+    unsigned Index = getAsmSymIndex(Symb);
+    assert(Index < AsmSymbols.size());
+    return AsmSymbols[Index].second;
+  }
 
   uint32_t Res = BasicSymbolRef::SF_None;
-  if (isDeclaration(GV))
+  if (isDeclaration(*GV))
    Res |= BasicSymbolRef::SF_Undefined;
-  if (GV.hasPrivateLinkage())
+  if (GV->hasPrivateLinkage())
     Res |= BasicSymbolRef::SF_FormatSpecific;
-  if (!GV.hasLocalLinkage())
+  if (!GV->hasLocalLinkage())
     Res |= BasicSymbolRef::SF_Global;
-  if (GV.hasCommonLinkage())
+  if (GV->hasCommonLinkage())
     Res |= BasicSymbolRef::SF_Common;
-  if (GV.hasLinkOnceLinkage() || GV.hasWeakLinkage())
+  if (GV->hasLinkOnceLinkage() || GV->hasWeakLinkage())
     Res |= BasicSymbolRef::SF_Weak;
 
+  if (GV->getName().startswith("llvm."))
+    Res |= BasicSymbolRef::SF_FormatSpecific;
+  else if (auto *Var = dyn_cast<GlobalVariable>(GV)) {
+    if (Var->getSection() == StringRef("llvm.metadata"))
+      Res |= BasicSymbolRef::SF_FormatSpecific;
+  }
+
   return Res;
 }
 
-const GlobalValue &IRObjectFile::getSymbolGV(DataRefImpl Symb) const {
-  const GlobalValue &GV = getGV(Symb);
+const GlobalValue *IRObjectFile::getSymbolGV(DataRefImpl Symb) const {
+  const GlobalValue *GV = getGV(Symb);
   return GV;
 }
 
@@ -147,16 +262,18 @@ basic_symbol_iterator IRObjectFile::symbol_begin_impl() const {
basic_symbol_iterator IRObjectFile::symbol_end_impl() const { DataRefImpl Ret; - Ret.p = 3; + uint64_t NumAsm = AsmSymbols.size(); + NumAsm <<= 2; + Ret.p = 3 | NumAsm; return basic_symbol_iterator(BasicSymbolRef(Ret, this)); } -ErrorOr llvm::object::SymbolicFile::createIRObjectFile( - MemoryBuffer *Object, LLVMContext &Context, bool BufferOwned) { - std::error_code EC; - std::unique_ptr Ret( - new IRObjectFile(Object, EC, Context, BufferOwned)); - if (EC) +ErrorOr llvm::object::IRObjectFile::createIRObjectFile( + std::unique_ptr Object, LLVMContext &Context) { + ErrorOr MOrErr = getLazyBitcodeModule(Object.get(), Context); + if (std::error_code EC = MOrErr.getError()) return EC; - return Ret.release(); + + std::unique_ptr M(MOrErr.get()); + return new IRObjectFile(std::move(Object), std::move(M)); } diff --git a/lib/Object/LLVMBuild.txt b/lib/Object/LLVMBuild.txt index 7813832ef7ec..bae578c76f7e 100644 --- a/lib/Object/LLVMBuild.txt +++ b/lib/Object/LLVMBuild.txt @@ -19,4 +19,4 @@ type = Library name = Object parent = Libraries -required_libraries = BitReader Core Support +required_libraries = BitReader Core MC MCParser Support diff --git a/lib/Object/MachOObjectFile.cpp b/lib/Object/MachOObjectFile.cpp index 1e105d3973b7..51c4c332c182 100644 --- a/lib/Object/MachOObjectFile.cpp +++ b/lib/Object/MachOObjectFile.cpp @@ -27,220 +27,19 @@ using namespace llvm; using namespace object; -namespace llvm { - -namespace object { - -struct nlist_base { - uint32_t n_strx; - uint8_t n_type; - uint8_t n_sect; - uint16_t n_desc; -}; - -struct section_base { - char sectname[16]; - char segname[16]; -}; - -template -static void SwapStruct(T &Value); - -template<> -void SwapStruct(MachO::any_relocation_info &H) { - sys::swapByteOrder(H.r_word0); - sys::swapByteOrder(H.r_word1); -} - -template<> -void SwapStruct(MachO::load_command &L) { - sys::swapByteOrder(L.cmd); - sys::swapByteOrder(L.cmdsize); -} - -template<> -void SwapStruct(nlist_base &S) { - sys::swapByteOrder(S.n_strx); - sys::swapByteOrder(S.n_desc); -} - -template<> -void SwapStruct(MachO::section &S) { - sys::swapByteOrder(S.addr); - sys::swapByteOrder(S.size); - sys::swapByteOrder(S.offset); - sys::swapByteOrder(S.align); - sys::swapByteOrder(S.reloff); - sys::swapByteOrder(S.nreloc); - sys::swapByteOrder(S.flags); - sys::swapByteOrder(S.reserved1); - sys::swapByteOrder(S.reserved2); -} - -template<> -void SwapStruct(MachO::section_64 &S) { - sys::swapByteOrder(S.addr); - sys::swapByteOrder(S.size); - sys::swapByteOrder(S.offset); - sys::swapByteOrder(S.align); - sys::swapByteOrder(S.reloff); - sys::swapByteOrder(S.nreloc); - sys::swapByteOrder(S.flags); - sys::swapByteOrder(S.reserved1); - sys::swapByteOrder(S.reserved2); - sys::swapByteOrder(S.reserved3); -} - -template<> -void SwapStruct(MachO::nlist &S) { - sys::swapByteOrder(S.n_strx); - sys::swapByteOrder(S.n_desc); - sys::swapByteOrder(S.n_value); -} - -template<> -void SwapStruct(MachO::nlist_64 &S) { - sys::swapByteOrder(S.n_strx); - sys::swapByteOrder(S.n_desc); - sys::swapByteOrder(S.n_value); -} - -template<> -void SwapStruct(MachO::mach_header &H) { - sys::swapByteOrder(H.magic); - sys::swapByteOrder(H.cputype); - sys::swapByteOrder(H.cpusubtype); - sys::swapByteOrder(H.filetype); - sys::swapByteOrder(H.ncmds); - sys::swapByteOrder(H.sizeofcmds); - sys::swapByteOrder(H.flags); -} - -template<> -void SwapStruct(MachO::mach_header_64 &H) { - sys::swapByteOrder(H.magic); - sys::swapByteOrder(H.cputype); - sys::swapByteOrder(H.cpusubtype); - sys::swapByteOrder(H.filetype); - 
sys::swapByteOrder(H.ncmds); - sys::swapByteOrder(H.sizeofcmds); - sys::swapByteOrder(H.flags); - sys::swapByteOrder(H.reserved); -} - -template<> -void SwapStruct(MachO::symtab_command &C) { - sys::swapByteOrder(C.cmd); - sys::swapByteOrder(C.cmdsize); - sys::swapByteOrder(C.symoff); - sys::swapByteOrder(C.nsyms); - sys::swapByteOrder(C.stroff); - sys::swapByteOrder(C.strsize); -} - -template<> -void SwapStruct(MachO::dysymtab_command &C) { - sys::swapByteOrder(C.cmd); - sys::swapByteOrder(C.cmdsize); - sys::swapByteOrder(C.ilocalsym); - sys::swapByteOrder(C.nlocalsym); - sys::swapByteOrder(C.iextdefsym); - sys::swapByteOrder(C.nextdefsym); - sys::swapByteOrder(C.iundefsym); - sys::swapByteOrder(C.nundefsym); - sys::swapByteOrder(C.tocoff); - sys::swapByteOrder(C.ntoc); - sys::swapByteOrder(C.modtaboff); - sys::swapByteOrder(C.nmodtab); - sys::swapByteOrder(C.extrefsymoff); - sys::swapByteOrder(C.nextrefsyms); - sys::swapByteOrder(C.indirectsymoff); - sys::swapByteOrder(C.nindirectsyms); - sys::swapByteOrder(C.extreloff); - sys::swapByteOrder(C.nextrel); - sys::swapByteOrder(C.locreloff); - sys::swapByteOrder(C.nlocrel); -} - -template<> -void SwapStruct(MachO::linkedit_data_command &C) { - sys::swapByteOrder(C.cmd); - sys::swapByteOrder(C.cmdsize); - sys::swapByteOrder(C.dataoff); - sys::swapByteOrder(C.datasize); -} - -template<> -void SwapStruct(MachO::segment_command &C) { - sys::swapByteOrder(C.cmd); - sys::swapByteOrder(C.cmdsize); - sys::swapByteOrder(C.vmaddr); - sys::swapByteOrder(C.vmsize); - sys::swapByteOrder(C.fileoff); - sys::swapByteOrder(C.filesize); - sys::swapByteOrder(C.maxprot); - sys::swapByteOrder(C.initprot); - sys::swapByteOrder(C.nsects); - sys::swapByteOrder(C.flags); -} - -template<> -void SwapStruct(MachO::segment_command_64 &C) { - sys::swapByteOrder(C.cmd); - sys::swapByteOrder(C.cmdsize); - sys::swapByteOrder(C.vmaddr); - sys::swapByteOrder(C.vmsize); - sys::swapByteOrder(C.fileoff); - sys::swapByteOrder(C.filesize); - sys::swapByteOrder(C.maxprot); - sys::swapByteOrder(C.initprot); - sys::swapByteOrder(C.nsects); - sys::swapByteOrder(C.flags); -} - -template<> -void SwapStruct(uint32_t &C) { - sys::swapByteOrder(C); -} - -template<> -void SwapStruct(MachO::linker_options_command &C) { - sys::swapByteOrder(C.cmd); - sys::swapByteOrder(C.cmdsize); - sys::swapByteOrder(C.count); -} - -template<> -void SwapStruct(MachO::version_min_command&C) { - sys::swapByteOrder(C.cmd); - sys::swapByteOrder(C.cmdsize); - sys::swapByteOrder(C.version); - sys::swapByteOrder(C.reserved); -} - -template<> -void SwapStruct(MachO::dylib_command&C) { - sys::swapByteOrder(C.cmd); - sys::swapByteOrder(C.cmdsize); - sys::swapByteOrder(C.dylib.name); - sys::swapByteOrder(C.dylib.timestamp); - sys::swapByteOrder(C.dylib.current_version); - sys::swapByteOrder(C.dylib.compatibility_version); -} - -template<> -void SwapStruct(MachO::data_in_code_entry &C) { - sys::swapByteOrder(C.offset); - sys::swapByteOrder(C.length); - sys::swapByteOrder(C.kind); +namespace { + struct section_base { + char sectname[16]; + char segname[16]; + }; } template -T getStruct(const MachOObjectFile *O, const char *P) { +static T getStruct(const MachOObjectFile *O, const char *P) { T Cmd; memcpy(&Cmd, P, sizeof(T)); if (O->isLittleEndian() != sys::IsLittleEndianHost) - SwapStruct(Cmd); + MachO::swapStruct(Cmd); return Cmd; } @@ -274,10 +73,10 @@ static const char *getPtr(const MachOObjectFile *O, size_t Offset) { return O->getData().substr(Offset, 1).data(); } -static nlist_base +static MachO::nlist_base 
getSymbolTableEntryBase(const MachOObjectFile *O, DataRefImpl DRI) { const char *P = reinterpret_cast(DRI.p); - return getStruct(O, P); + return getStruct(O, P); } static StringRef parseSegmentOrSectionName(const char *P) { @@ -422,10 +221,10 @@ static uint32_t getSectionFlags(const MachOObjectFile *O, return Sect.flags; } -MachOObjectFile::MachOObjectFile(MemoryBuffer *Object, bool IsLittleEndian, - bool Is64bits, std::error_code &EC, - bool BufferOwned) - : ObjectFile(getMachOType(IsLittleEndian, Is64bits), Object, BufferOwned), +MachOObjectFile::MachOObjectFile(std::unique_ptr Object, + bool IsLittleEndian, bool Is64bits, + std::error_code &EC) + : ObjectFile(getMachOType(IsLittleEndian, Is64bits), std::move(Object)), SymtabLoadCmd(nullptr), DysymtabLoadCmd(nullptr), DataInCodeLoadCmd(nullptr) { uint32_t LoadCommandCount = this->getHeader().ncmds; @@ -474,7 +273,7 @@ void MachOObjectFile::moveSymbolNext(DataRefImpl &Symb) const { std::error_code MachOObjectFile::getSymbolName(DataRefImpl Symb, StringRef &Res) const { StringRef StringTable = getStringTableData(); - nlist_base Entry = getSymbolTableEntryBase(this, Symb); + MachO::nlist_base Entry = getSymbolTableEntryBase(this, Symb); const char *Start = &StringTable.data()[Entry.n_strx]; Res = StringRef(Start); return object_error::success; @@ -528,7 +327,7 @@ std::error_code MachOObjectFile::getSymbolAlignment(DataRefImpl DRI, uint32_t &Result) const { uint32_t flags = getSymbolFlags(DRI); if (flags & SymbolRef::SF_Common) { - nlist_base Entry = getSymbolTableEntryBase(this, DRI); + MachO::nlist_base Entry = getSymbolTableEntryBase(this, DRI); Result = 1 << MachO::GET_COMM_ALIGN(Entry.n_desc); } else { Result = 0; @@ -542,7 +341,7 @@ std::error_code MachOObjectFile::getSymbolSize(DataRefImpl DRI, uint64_t EndOffset = 0; uint8_t SectionIndex; - nlist_base Entry = getSymbolTableEntryBase(this, DRI); + MachO::nlist_base Entry = getSymbolTableEntryBase(this, DRI); uint64_t Value; getSymbolAddress(DRI, Value); if (Value == UnknownAddressOrSize) { @@ -587,7 +386,7 @@ std::error_code MachOObjectFile::getSymbolSize(DataRefImpl DRI, std::error_code MachOObjectFile::getSymbolType(DataRefImpl Symb, SymbolRef::Type &Res) const { - nlist_base Entry = getSymbolTableEntryBase(this, Symb); + MachO::nlist_base Entry = getSymbolTableEntryBase(this, Symb); uint8_t n_type = Entry.n_type; Res = SymbolRef::ST_Other; @@ -610,7 +409,7 @@ std::error_code MachOObjectFile::getSymbolType(DataRefImpl Symb, } uint32_t MachOObjectFile::getSymbolFlags(DataRefImpl DRI) const { - nlist_base Entry = getSymbolTableEntryBase(this, DRI); + MachO::nlist_base Entry = getSymbolTableEntryBase(this, DRI); uint8_t MachOType = Entry.n_type; uint16_t MachOFlags = Entry.n_desc; @@ -647,7 +446,7 @@ uint32_t MachOObjectFile::getSymbolFlags(DataRefImpl DRI) const { std::error_code MachOObjectFile::getSymbolSection(DataRefImpl Symb, section_iterator &Res) const { - nlist_base Entry = getSymbolTableEntryBase(this, Symb); + MachO::nlist_base Entry = getSymbolTableEntryBase(this, Symb); uint8_t index = Entry.n_sect; if (index == 0) { @@ -868,6 +667,9 @@ std::error_code MachOObjectFile::getRelocationOffset(DataRefImpl Rel, symbol_iterator MachOObjectFile::getRelocationSymbol(DataRefImpl Rel) const { MachO::any_relocation_info RE = getRelocation(Rel); + if (isRelocationScattered(RE)) + return symbol_end(); + uint32_t SymbolIdx = getPlainRelocationSymbolNum(RE); bool isExtern = getPlainRelocationExternal(RE); if (!isExtern) @@ -1384,7 +1186,7 @@ std::error_code 
MachOObjectFile::getLibraryShortNameByIndex(unsigned Index,
       LibrariesShortNames.push_back(StringRef());
       continue;
     }
-    char *P = (char *)(Libraries[i]) + D.dylib.name;
+    const char *P = (const char *)(Libraries[i]) + D.dylib.name;
     StringRef Name = StringRef(P);
     StringRef Suffix;
     bool isFramework;
@@ -1511,6 +1313,108 @@
   }
 }
 
+Triple MachOObjectFile::getArch(uint32_t CPUType, uint32_t CPUSubType) {
+  switch (CPUType) {
+  case MachO::CPU_TYPE_I386:
+    switch (CPUSubType & ~MachO::CPU_SUBTYPE_MASK) {
+    case MachO::CPU_SUBTYPE_I386_ALL:
+      return Triple("i386-apple-darwin");
+    default:
+      return Triple();
+    }
+  case MachO::CPU_TYPE_X86_64:
+    switch (CPUSubType & ~MachO::CPU_SUBTYPE_MASK) {
+    case MachO::CPU_SUBTYPE_X86_64_ALL:
+      return Triple("x86_64-apple-darwin");
+    case MachO::CPU_SUBTYPE_X86_64_H:
+      return Triple("x86_64h-apple-darwin");
+    default:
+      return Triple();
+    }
+  case MachO::CPU_TYPE_ARM:
+    switch (CPUSubType & ~MachO::CPU_SUBTYPE_MASK) {
+    case MachO::CPU_SUBTYPE_ARM_V4T:
+      return Triple("armv4t-apple-darwin");
+    case MachO::CPU_SUBTYPE_ARM_V5TEJ:
+      return Triple("armv5e-apple-darwin");
+    case MachO::CPU_SUBTYPE_ARM_V6:
+      return Triple("armv6-apple-darwin");
+    case MachO::CPU_SUBTYPE_ARM_V6M:
+      return Triple("armv6m-apple-darwin");
+    case MachO::CPU_SUBTYPE_ARM_V7EM:
+      return Triple("armv7em-apple-darwin");
+    case MachO::CPU_SUBTYPE_ARM_V7K:
+      return Triple("armv7k-apple-darwin");
+    case MachO::CPU_SUBTYPE_ARM_V7M:
+      return Triple("armv7m-apple-darwin");
+    case MachO::CPU_SUBTYPE_ARM_V7S:
+      return Triple("armv7s-apple-darwin");
+    default:
+      return Triple();
+    }
+  case MachO::CPU_TYPE_ARM64:
+    switch (CPUSubType & ~MachO::CPU_SUBTYPE_MASK) {
+    case MachO::CPU_SUBTYPE_ARM64_ALL:
+      return Triple("arm64-apple-darwin");
+    default:
+      return Triple();
+    }
+  case MachO::CPU_TYPE_POWERPC:
+    switch (CPUSubType & ~MachO::CPU_SUBTYPE_MASK) {
+    case MachO::CPU_SUBTYPE_POWERPC_ALL:
+      return Triple("ppc-apple-darwin");
+    default:
+      return Triple();
+    }
+  case MachO::CPU_TYPE_POWERPC64:
+    switch (CPUSubType & ~MachO::CPU_SUBTYPE_MASK) {
+    case MachO::CPU_SUBTYPE_POWERPC_ALL:
+      return Triple("ppc64-apple-darwin");
+    default:
+      return Triple();
+    }
+  default:
+    return Triple();
+  }
+}
+
+Triple MachOObjectFile::getHostArch() {
+  return Triple(sys::getDefaultTargetTriple());
+}
+
+Triple MachOObjectFile::getArch(StringRef ArchFlag) {
+  if (ArchFlag == "i386")
+    return Triple("i386-apple-darwin");
+  else if (ArchFlag == "x86_64")
+    return Triple("x86_64-apple-darwin");
+  else if (ArchFlag == "x86_64h")
+    return Triple("x86_64h-apple-darwin");
+  else if (ArchFlag == "armv4t" || ArchFlag == "arm")
+    return Triple("armv4t-apple-darwin");
+  else if (ArchFlag == "armv5e")
+    return Triple("armv5e-apple-darwin");
+  else if (ArchFlag == "armv6")
+    return Triple("armv6-apple-darwin");
+  else if (ArchFlag == "armv6m")
+    return Triple("armv6m-apple-darwin");
+  else if (ArchFlag == "armv7em")
+    return Triple("armv7em-apple-darwin");
+  else if (ArchFlag == "armv7k")
+    return Triple("armv7k-apple-darwin");
+  else if (ArchFlag == "armv7m")
+    return Triple("armv7m-apple-darwin");
+  else if (ArchFlag == "armv7s")
+    return Triple("armv7s-apple-darwin");
+  else if (ArchFlag == "arm64")
+    return Triple("arm64-apple-darwin");
+  else if (ArchFlag == "ppc")
+    return Triple("ppc-apple-darwin");
+  else if (ArchFlag == "ppc64")
+    return Triple("ppc64-apple-darwin");
+  else
+    return Triple();
+}
+
 unsigned MachOObjectFile::getArch() const {
   return
getArch(getCPUType(this)); } @@ -1721,6 +1625,12 @@ MachOObjectFile::getVersionMinLoadCommand(const LoadCommandInfo &L) const { return getStruct(this, L.Ptr); } +MachO::dylib_command +MachOObjectFile::getDylibIDLoadCommand(const LoadCommandInfo &L) const { + return getStruct(this, L.Ptr); +} + + MachO::any_relocation_info MachOObjectFile::getRelocation(DataRefImpl Rel) const { DataRefImpl Sec; @@ -1812,28 +1722,24 @@ void MachOObjectFile::ReadULEB128s(uint64_t Index, } } -ErrorOr ObjectFile::createMachOObjectFile(MemoryBuffer *Buffer, - bool BufferOwned) { +ErrorOr +ObjectFile::createMachOObjectFile(std::unique_ptr &Buffer) { StringRef Magic = Buffer->getBuffer().slice(0, 4); std::error_code EC; std::unique_ptr Ret; if (Magic == "\xFE\xED\xFA\xCE") - Ret.reset(new MachOObjectFile(Buffer, false, false, EC, BufferOwned)); + Ret.reset(new MachOObjectFile(std::move(Buffer), false, false, EC)); else if (Magic == "\xCE\xFA\xED\xFE") - Ret.reset(new MachOObjectFile(Buffer, true, false, EC, BufferOwned)); + Ret.reset(new MachOObjectFile(std::move(Buffer), true, false, EC)); else if (Magic == "\xFE\xED\xFA\xCF") - Ret.reset(new MachOObjectFile(Buffer, false, true, EC, BufferOwned)); + Ret.reset(new MachOObjectFile(std::move(Buffer), false, true, EC)); else if (Magic == "\xCF\xFA\xED\xFE") - Ret.reset(new MachOObjectFile(Buffer, true, true, EC, BufferOwned)); - else { - delete Buffer; + Ret.reset(new MachOObjectFile(std::move(Buffer), true, true, EC)); + else return object_error::parse_failed; - } if (EC) return EC; return Ret.release(); } -} // end namespace object -} // end namespace llvm diff --git a/lib/Object/MachOUniversal.cpp b/lib/Object/MachOUniversal.cpp index e414de8bcf17..4ba5d9686497 100644 --- a/lib/Object/MachOUniversal.cpp +++ b/lib/Object/MachOUniversal.cpp @@ -53,7 +53,7 @@ static T getUniversalBinaryStruct(const char *Ptr) { MachOUniversalBinary::ObjectForArch::ObjectForArch( const MachOUniversalBinary *Parent, uint32_t Index) : Parent(Parent), Index(Index) { - if (!Parent || Index > Parent->getNumberOfObjects()) { + if (!Parent || Index >= Parent->getNumberOfObjects()) { clear(); } else { // Parse object header. 
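The four magic values above choose among the big/little-endian, 32/64-bit Mach-O readers before any header field is touched. A minimal caller sketch against the new unique_ptr-based signatures (openMachOFromPath is a hypothetical helper, not part of this patch):

#include "llvm/Object/ObjectFile.h"
#include "llvm/Support/MemoryBuffer.h"
using namespace llvm;
using namespace llvm::object;

// Hypothetical helper: read a file and hand its buffer to the Mach-O reader.
static ErrorOr<ObjectFile *> openMachOFromPath(StringRef Path) {
  ErrorOr<std::unique_ptr<MemoryBuffer>> BufOrErr = MemoryBuffer::getFile(Path);
  if (std::error_code EC = BufOrErr.getError())
    return EC;
  std::unique_ptr<MemoryBuffer> Buf = std::move(BufOrErr.get());
  // The factory inspects the magic bytes to pick endianness and word size.
  return ObjectFile::createMachOObjectFile(Buf);
}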
@@ -67,21 +67,15 @@ MachOUniversalBinary::ObjectForArch::ObjectForArch( } } -std::error_code MachOUniversalBinary::ObjectForArch::getAsObjectFile( - std::unique_ptr &Result) const { +ErrorOr> +MachOUniversalBinary::ObjectForArch::getAsObjectFile() const { if (Parent) { StringRef ParentData = Parent->getData(); StringRef ObjectData = ParentData.substr(Header.offset, Header.size); - std::string ObjectName = - Parent->getFileName().str() + ":" + - Triple::getArchTypeName(MachOObjectFile::getArch(Header.cputype)); - MemoryBuffer *ObjBuffer = MemoryBuffer::getMemBuffer( - ObjectData, ObjectName, false); - ErrorOr Obj = ObjectFile::createMachOObjectFile(ObjBuffer); - if (std::error_code EC = Obj.getError()) - return EC; - Result.reset(Obj.get()); - return object_error::success; + std::string ObjectName = Parent->getFileName().str(); + std::unique_ptr ObjBuffer( + MemoryBuffer::getMemBuffer(ObjectData, ObjectName, false)); + return ObjectFile::createMachOObjectFile(ObjBuffer); } return object_error::parse_failed; } @@ -91,12 +85,10 @@ std::error_code MachOUniversalBinary::ObjectForArch::getAsArchive( if (Parent) { StringRef ParentData = Parent->getData(); StringRef ObjectData = ParentData.substr(Header.offset, Header.size); - std::string ObjectName = - Parent->getFileName().str() + ":" + - Triple::getArchTypeName(MachOObjectFile::getArch(Header.cputype)); - MemoryBuffer *ObjBuffer = MemoryBuffer::getMemBuffer( - ObjectData, ObjectName, false); - ErrorOr Obj = Archive::create(ObjBuffer); + std::string ObjectName = Parent->getFileName().str(); + std::unique_ptr ObjBuffer( + MemoryBuffer::getMemBuffer(ObjectData, ObjectName, false)); + ErrorOr Obj = Archive::create(std::move(ObjBuffer)); if (std::error_code EC = Obj.getError()) return EC; Result.reset(Obj.get()); @@ -108,19 +100,20 @@ std::error_code MachOUniversalBinary::ObjectForArch::getAsArchive( void MachOUniversalBinary::anchor() { } ErrorOr -MachOUniversalBinary::create(MemoryBuffer *Source) { +MachOUniversalBinary::create(std::unique_ptr Source) { std::error_code EC; std::unique_ptr Ret( - new MachOUniversalBinary(Source, EC)); + new MachOUniversalBinary(std::move(Source), EC)); if (EC) return EC; return Ret.release(); } -MachOUniversalBinary::MachOUniversalBinary(MemoryBuffer *Source, +MachOUniversalBinary::MachOUniversalBinary(std::unique_ptr Source, std::error_code &ec) - : Binary(Binary::ID_MachOUniversalBinary, Source), NumberOfObjects(0) { - if (Source->getBufferSize() < sizeof(MachO::fat_header)) { + : Binary(Binary::ID_MachOUniversalBinary, std::move(Source)), + NumberOfObjects(0) { + if (Data->getBufferSize() < sizeof(MachO::fat_header)) { ec = object_error::invalid_file_type; return; } @@ -149,14 +142,14 @@ static bool getCTMForArch(Triple::ArchType Arch, MachO::CPUType &CTM) { } } -std::error_code MachOUniversalBinary::getObjectForArch( - Triple::ArchType Arch, std::unique_ptr &Result) const { +ErrorOr> +MachOUniversalBinary::getObjectForArch(Triple::ArchType Arch) const { MachO::CPUType CTM; if (!getCTMForArch(Arch, CTM)) return object_error::arch_not_found; for (object_iterator I = begin_objects(), E = end_objects(); I != E; ++I) { if (I->getCPUType() == static_cast(CTM)) - return I->getAsObjectFile(Result); + return I->getAsObjectFile(); } return object_error::arch_not_found; } diff --git a/lib/Object/Object.cpp b/lib/Object/Object.cpp index 7282f468b084..567d87f7a0e5 100644 --- a/lib/Object/Object.cpp +++ b/lib/Object/Object.cpp @@ -59,7 +59,9 @@ wrap(const relocation_iterator *SI) { // ObjectFile creation LLVMObjectFileRef 
LLVMCreateObjectFile(LLVMMemoryBufferRef MemBuf) {
-  ErrorOr<ObjectFile *> ObjOrErr(ObjectFile::createObjectFile(unwrap(MemBuf)));
+  std::unique_ptr<MemoryBuffer> Buf(unwrap(MemBuf));
+  ErrorOr<ObjectFile *> ObjOrErr(ObjectFile::createObjectFile(Buf));
+  Buf.release();
   ObjectFile *Obj = ObjOrErr ? ObjOrErr.get() : nullptr;
   return wrap(Obj);
 }
diff --git a/lib/Object/ObjectFile.cpp b/lib/Object/ObjectFile.cpp
index 7666ed53c372..f5488c6d52db 100644
--- a/lib/Object/ObjectFile.cpp
+++ b/lib/Object/ObjectFile.cpp
@@ -23,9 +23,8 @@ using namespace object;
 
 void ObjectFile::anchor() { }
 
-ObjectFile::ObjectFile(unsigned int Type, MemoryBuffer *Source,
-                       bool BufferOwned)
-    : SymbolicFile(Type, Source, BufferOwned) {}
+ObjectFile::ObjectFile(unsigned int Type, std::unique_ptr<MemoryBuffer> Source)
+    : SymbolicFile(Type, std::move(Source)) {}
 
 std::error_code ObjectFile::printSymbolName(raw_ostream &OS,
                                             DataRefImpl Symb) const {
@@ -46,9 +45,9 @@ section_iterator ObjectFile::getRelocatedSection(DataRefImpl Sec) const {
   return section_iterator(SectionRef(Sec, this));
 }
 
-ErrorOr<ObjectFile *> ObjectFile::createObjectFile(MemoryBuffer *Object,
-                                                   bool BufferOwned,
-                                                   sys::fs::file_magic Type) {
+ErrorOr<ObjectFile *>
+ObjectFile::createObjectFile(std::unique_ptr<MemoryBuffer> &Object,
+                             sys::fs::file_magic Type) {
   if (Type == sys::fs::file_magic::unknown)
     Type = sys::fs::identify_magic(Object->getBuffer());
 
@@ -58,14 +57,12 @@ ErrorOr<ObjectFile *> ObjectFile::createObjectFile(MemoryBuffer *Object,
   case sys::fs::file_magic::archive:
   case sys::fs::file_magic::macho_universal_binary:
   case sys::fs::file_magic::windows_resource:
-    if (BufferOwned)
-      delete Object;
     return object_error::invalid_file_type;
   case sys::fs::file_magic::elf_relocatable:
   case sys::fs::file_magic::elf_executable:
   case sys::fs::file_magic::elf_shared_object:
   case sys::fs::file_magic::elf_core:
-    return createELFObjectFile(Object, BufferOwned);
+    return createELFObjectFile(Object);
   case sys::fs::file_magic::macho_object:
   case sys::fs::file_magic::macho_executable:
   case sys::fs::file_magic::macho_fixed_virtual_memory_shared_lib:
@@ -76,18 +73,19 @@ ErrorOr<ObjectFile *> ObjectFile::createObjectFile(MemoryBuffer *Object,
   case sys::fs::file_magic::macho_bundle:
   case sys::fs::file_magic::macho_dynamically_linked_shared_lib_stub:
   case sys::fs::file_magic::macho_dsym_companion:
-    return createMachOObjectFile(Object, BufferOwned);
+    return createMachOObjectFile(Object);
   case sys::fs::file_magic::coff_object:
   case sys::fs::file_magic::coff_import_library:
   case sys::fs::file_magic::pecoff_executable:
-    return createCOFFObjectFile(Object, BufferOwned);
+    return createCOFFObjectFile(std::move(Object));
   }
   llvm_unreachable("Unexpected Object File Type");
 }
 
 ErrorOr<ObjectFile *> ObjectFile::createObjectFile(StringRef ObjectPath) {
-  std::unique_ptr<MemoryBuffer> File;
-  if (std::error_code EC = MemoryBuffer::getFile(ObjectPath, File))
+  ErrorOr<std::unique_ptr<MemoryBuffer>> FileOrErr =
+      MemoryBuffer::getFile(ObjectPath);
+  if (std::error_code EC = FileOrErr.getError())
     return EC;
-  return createObjectFile(File.release());
+  return createObjectFile(FileOrErr.get());
 }
diff --git a/lib/Object/RecordStreamer.cpp b/lib/Object/RecordStreamer.cpp
new file mode 100644
index 000000000000..081faddc214c
--- /dev/null
+++ b/lib/Object/RecordStreamer.cpp
@@ -0,0 +1,100 @@
+//===-- RecordStreamer.cpp - Record asm defined and used symbols ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// +//===----------------------------------------------------------------------===// + +#include "RecordStreamer.h" +#include "llvm/MC/MCSymbol.h" +using namespace llvm; + +void RecordStreamer::markDefined(const MCSymbol &Symbol) { + State &S = Symbols[Symbol.getName()]; + switch (S) { + case DefinedGlobal: + case Global: + S = DefinedGlobal; + break; + case NeverSeen: + case Defined: + case Used: + S = Defined; + break; + } +} + +void RecordStreamer::markGlobal(const MCSymbol &Symbol) { + State &S = Symbols[Symbol.getName()]; + switch (S) { + case DefinedGlobal: + case Defined: + S = DefinedGlobal; + break; + + case NeverSeen: + case Global: + case Used: + S = Global; + break; + } +} + +void RecordStreamer::markUsed(const MCSymbol &Symbol) { + State &S = Symbols[Symbol.getName()]; + switch (S) { + case DefinedGlobal: + case Defined: + case Global: + break; + + case NeverSeen: + case Used: + S = Used; + break; + } +} + +void RecordStreamer::visitUsedSymbol(const MCSymbol &Sym) { markUsed(Sym); } + +RecordStreamer::const_iterator RecordStreamer::begin() { + return Symbols.begin(); +} + +RecordStreamer::const_iterator RecordStreamer::end() { return Symbols.end(); } + +RecordStreamer::RecordStreamer(MCContext &Context) : MCStreamer(Context) {} + +void RecordStreamer::EmitInstruction(const MCInst &Inst, + const MCSubtargetInfo &STI) { + MCStreamer::EmitInstruction(Inst, STI); +} + +void RecordStreamer::EmitLabel(MCSymbol *Symbol) { + MCStreamer::EmitLabel(Symbol); + markDefined(*Symbol); +} + +void RecordStreamer::EmitAssignment(MCSymbol *Symbol, const MCExpr *Value) { + markDefined(*Symbol); + MCStreamer::EmitAssignment(Symbol, Value); +} + +bool RecordStreamer::EmitSymbolAttribute(MCSymbol *Symbol, + MCSymbolAttr Attribute) { + if (Attribute == MCSA_Global) + markGlobal(*Symbol); + return true; +} + +void RecordStreamer::EmitZerofill(const MCSection *Section, MCSymbol *Symbol, + uint64_t Size, unsigned ByteAlignment) { + markDefined(*Symbol); +} + +void RecordStreamer::EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size, + unsigned ByteAlignment) { + markDefined(*Symbol); +} diff --git a/lib/Object/RecordStreamer.h b/lib/Object/RecordStreamer.h new file mode 100644 index 000000000000..10e70ef16e22 --- /dev/null +++ b/lib/Object/RecordStreamer.h @@ -0,0 +1,42 @@ +//===-- RecordStreamer.h - Record asm defined and used symbols ---*- C++ -*===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_OBJECT_RECORD_STREAMER +#define LLVM_OBJECT_RECORD_STREAMER + +#include "llvm/MC/MCStreamer.h" + +namespace llvm { +class RecordStreamer : public MCStreamer { +public: + enum State { NeverSeen, Global, Defined, DefinedGlobal, Used }; + +private: + StringMap Symbols; + void markDefined(const MCSymbol &Symbol); + void markGlobal(const MCSymbol &Symbol); + void markUsed(const MCSymbol &Symbol); + void visitUsedSymbol(const MCSymbol &Sym) override; + +public: + typedef StringMap::const_iterator const_iterator; + const_iterator begin(); + const_iterator end(); + RecordStreamer(MCContext &Context); + void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI) override; + void EmitLabel(MCSymbol *Symbol) override; + void EmitAssignment(MCSymbol *Symbol, const MCExpr *Value) override; + bool EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute) override; + void EmitZerofill(const MCSection *Section, MCSymbol *Symbol, uint64_t Size, + unsigned ByteAlignment) override; + void EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size, + unsigned ByteAlignment) override; +}; +} +#endif diff --git a/lib/Object/SymbolicFile.cpp b/lib/Object/SymbolicFile.cpp index 495f0b62bc5e..30cf1a03f415 100644 --- a/lib/Object/SymbolicFile.cpp +++ b/lib/Object/SymbolicFile.cpp @@ -19,14 +19,14 @@ using namespace llvm; using namespace object; -SymbolicFile::SymbolicFile(unsigned int Type, MemoryBuffer *Source, - bool BufferOwned) - : Binary(Type, Source, BufferOwned) {} +SymbolicFile::SymbolicFile(unsigned int Type, + std::unique_ptr Source) + : Binary(Type, std::move(Source)) {} SymbolicFile::~SymbolicFile() {} ErrorOr -SymbolicFile::createSymbolicFile(MemoryBuffer *Object, bool BufferOwned, +SymbolicFile::createSymbolicFile(std::unique_ptr &Object, sys::fs::file_magic Type, LLVMContext *Context) { if (Type == sys::fs::file_magic::unknown) @@ -35,14 +35,12 @@ SymbolicFile::createSymbolicFile(MemoryBuffer *Object, bool BufferOwned, switch (Type) { case sys::fs::file_magic::bitcode: if (Context) - return IRObjectFile::createIRObjectFile(Object, *Context, BufferOwned); + return IRObjectFile::createIRObjectFile(std::move(Object), *Context); // Fallthrough case sys::fs::file_magic::unknown: case sys::fs::file_magic::archive: case sys::fs::file_magic::macho_universal_binary: case sys::fs::file_magic::windows_resource: - if (BufferOwned) - delete Object; return object_error::invalid_file_type; case sys::fs::file_magic::elf_relocatable: case sys::fs::file_magic::elf_executable: @@ -61,7 +59,7 @@ SymbolicFile::createSymbolicFile(MemoryBuffer *Object, bool BufferOwned, case sys::fs::file_magic::coff_object: case sys::fs::file_magic::coff_import_library: case sys::fs::file_magic::pecoff_executable: - return ObjectFile::createObjectFile(Object, BufferOwned, Type); + return ObjectFile::createObjectFile(Object, Type); } llvm_unreachable("Unexpected Binary File Type"); } diff --git a/lib/Option/ArgList.cpp b/lib/Option/ArgList.cpp index a5ab8d747d87..5848bb11bfa1 100644 --- a/lib/Option/ArgList.cpp +++ b/lib/Option/ArgList.cpp @@ -234,44 +234,40 @@ void ArgList::AddLastArg(ArgStringList &Output, OptSpecifier Id0, void ArgList::AddAllArgs(ArgStringList &Output, OptSpecifier Id0, OptSpecifier Id1, OptSpecifier Id2) const { - for (arg_iterator it = filtered_begin(Id0, Id1, Id2), - ie = filtered_end(); it != ie; ++it) { - (*it)->claim(); - (*it)->render(*this, Output); + for (auto Arg: filtered(Id0, Id1, Id2)) { + Arg->claim(); 
+ Arg->render(*this, Output); } } void ArgList::AddAllArgValues(ArgStringList &Output, OptSpecifier Id0, OptSpecifier Id1, OptSpecifier Id2) const { - for (arg_iterator it = filtered_begin(Id0, Id1, Id2), - ie = filtered_end(); it != ie; ++it) { - (*it)->claim(); - for (unsigned i = 0, e = (*it)->getNumValues(); i != e; ++i) - Output.push_back((*it)->getValue(i)); + for (auto Arg : filtered(Id0, Id1, Id2)) { + Arg->claim(); + for (unsigned i = 0, e = Arg->getNumValues(); i != e; ++i) + Output.push_back(Arg->getValue(i)); } } void ArgList::AddAllArgsTranslated(ArgStringList &Output, OptSpecifier Id0, const char *Translation, bool Joined) const { - for (arg_iterator it = filtered_begin(Id0), - ie = filtered_end(); it != ie; ++it) { - (*it)->claim(); + for (auto Arg: filtered(Id0)) { + Arg->claim(); if (Joined) { Output.push_back(MakeArgString(StringRef(Translation) + - (*it)->getValue(0))); + Arg->getValue(0))); } else { Output.push_back(Translation); - Output.push_back((*it)->getValue(0)); + Output.push_back(Arg->getValue(0)); } } } void ArgList::ClaimAllArgs(OptSpecifier Id0) const { - for (arg_iterator it = filtered_begin(Id0), - ie = filtered_end(); it != ie; ++it) - (*it)->claim(); + for (auto Arg : filtered(Id0)) + Arg->claim(); } void ArgList::ClaimAllArgs() const { @@ -350,30 +346,27 @@ void DerivedArgList::AddSynthesizedArg(Arg *A) { } Arg *DerivedArgList::MakeFlagArg(const Arg *BaseArg, const Option Opt) const { - SynthesizedArgs.push_back(make_unique( - Opt, - ArgList::MakeArgString(Twine(Opt.getPrefix()) + Twine(Opt.getName())), - BaseArgs.MakeIndex(Opt.getName()), BaseArg)); + SynthesizedArgs.push_back( + make_unique(Opt, MakeArgString(Opt.getPrefix() + Opt.getName()), + BaseArgs.MakeIndex(Opt.getName()), BaseArg)); return SynthesizedArgs.back().get(); } Arg *DerivedArgList::MakePositionalArg(const Arg *BaseArg, const Option Opt, StringRef Value) const { unsigned Index = BaseArgs.MakeIndex(Value); - SynthesizedArgs.push_back(make_unique( - Opt, - ArgList::MakeArgString(Twine(Opt.getPrefix()) + Twine(Opt.getName())), - Index, BaseArgs.getArgString(Index), BaseArg)); + SynthesizedArgs.push_back( + make_unique(Opt, MakeArgString(Opt.getPrefix() + Opt.getName()), + Index, BaseArgs.getArgString(Index), BaseArg)); return SynthesizedArgs.back().get(); } Arg *DerivedArgList::MakeSeparateArg(const Arg *BaseArg, const Option Opt, StringRef Value) const { unsigned Index = BaseArgs.MakeIndex(Opt.getName(), Value); - SynthesizedArgs.push_back(make_unique( - Opt, - ArgList::MakeArgString(Twine(Opt.getPrefix()) + Twine(Opt.getName())), - Index, BaseArgs.getArgString(Index + 1), BaseArg)); + SynthesizedArgs.push_back( + make_unique(Opt, MakeArgString(Opt.getPrefix() + Opt.getName()), + Index, BaseArgs.getArgString(Index + 1), BaseArg)); return SynthesizedArgs.back().get(); } @@ -381,8 +374,7 @@ Arg *DerivedArgList::MakeJoinedArg(const Arg *BaseArg, const Option Opt, StringRef Value) const { unsigned Index = BaseArgs.MakeIndex(Opt.getName().str() + Value.str()); SynthesizedArgs.push_back(make_unique( - Opt, - ArgList::MakeArgString(Twine(Opt.getPrefix()) + Twine(Opt.getName())), - Index, BaseArgs.getArgString(Index) + Opt.getName().size(), BaseArg)); + Opt, MakeArgString(Opt.getPrefix() + Opt.getName()), Index, + BaseArgs.getArgString(Index) + Opt.getName().size(), BaseArg)); return SynthesizedArgs.back().get(); } diff --git a/lib/ProfileData/InstrProfReader.cpp b/lib/ProfileData/InstrProfReader.cpp index d0493d34c203..0b367282e149 100644 --- a/lib/ProfileData/InstrProfReader.cpp +++ 
b/lib/ProfileData/InstrProfReader.cpp @@ -23,8 +23,11 @@ using namespace llvm; static std::error_code setupMemoryBuffer(std::string Path, std::unique_ptr &Buffer) { - if (std::error_code EC = MemoryBuffer::getFileOrSTDIN(Path, Buffer)) + ErrorOr> BufferOrErr = + MemoryBuffer::getFileOrSTDIN(Path); + if (std::error_code EC = BufferOrErr.getError()) return EC; + Buffer = std::move(BufferOrErr.get()); // Sanity check the file. if (Buffer->getBufferSize() > std::numeric_limits::max()) diff --git a/lib/Support/Atomic.cpp b/lib/Support/Atomic.cpp index 2ef32b08ef28..ac4ff3eb5c66 100644 --- a/lib/Support/Atomic.cpp +++ b/lib/Support/Atomic.cpp @@ -7,7 +7,7 @@ // //===----------------------------------------------------------------------===// // -// This header file implements atomic operations. +// This file implements atomic operations. // //===----------------------------------------------------------------------===// diff --git a/lib/Support/CMakeLists.txt b/lib/Support/CMakeLists.txt index 2438d729d8de..80b6ab84e3c9 100644 --- a/lib/Support/CMakeLists.txt +++ b/lib/Support/CMakeLists.txt @@ -41,10 +41,13 @@ add_llvm_library(LLVMSupport MD5.cpp PluginLoader.cpp PrettyStackTrace.cpp + RandomNumberGenerator.cpp Regex.cpp + ScaledNumber.cpp SmallPtrSet.cpp SmallVector.cpp SourceMgr.cpp + SpecialCaseList.cpp Statistic.cpp StreamableMemoryObject.cpp StringExtras.cpp @@ -70,7 +73,6 @@ add_llvm_library(LLVMSupport # System Atomic.cpp - Disassembler.cpp DynamicLibrary.cpp Errno.cpp Host.cpp diff --git a/lib/Support/CommandLine.cpp b/lib/Support/CommandLine.cpp index e09d4b6e47b4..586eceae757f 100644 --- a/lib/Support/CommandLine.cpp +++ b/lib/Support/CommandLine.cpp @@ -145,6 +145,7 @@ void OptionCategory::registerCategory() { static void GetOptionInfo(SmallVectorImpl &PositionalOpts, SmallVectorImpl &SinkOpts, StringMap &OptionsMap) { + bool HadErrors = false; SmallVector OptionNames; Option *CAOpt = nullptr; // The ConsumeAfter option if it exists. for (Option *O = RegisteredOptionList; O; O = O->getNextRegisteredOption()) { @@ -158,8 +159,9 @@ static void GetOptionInfo(SmallVectorImpl &PositionalOpts, for (size_t i = 0, e = OptionNames.size(); i != e; ++i) { // Add argument to the argument map! if (OptionsMap.GetOrCreateValue(OptionNames[i], O).second != O) { - errs() << ProgramName << ": CommandLine Error: Argument '" - << OptionNames[i] << "' defined more than once!\n"; + errs() << ProgramName << ": CommandLine Error: Option '" + << OptionNames[i] << "' registered more than once!\n"; + HadErrors = true; } } @@ -171,8 +173,10 @@ static void GetOptionInfo(SmallVectorImpl &PositionalOpts, else if (O->getMiscFlags() & cl::Sink) // Remember sink options SinkOpts.push_back(O); else if (O->getNumOccurrencesFlag() == cl::ConsumeAfter) { - if (CAOpt) + if (CAOpt) { O->error("Cannot specify more than one option with cl::ConsumeAfter!"); + HadErrors = true; + } CAOpt = O; } } @@ -182,6 +186,12 @@ static void GetOptionInfo(SmallVectorImpl &PositionalOpts, // Make sure that they are in order of registration not backwards. std::reverse(PositionalOpts.begin(), PositionalOpts.end()); + + // Fail hard if there were errors. These are strictly unrecoverable and + // indicate serious issues such as conflicting option names or an incorrectly + // linked LLVM distribution. 
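+  // This typically fires when two statically initialized options claim the
+  // same name, e.g. (illustrative only):
+  //   static cl::opt<bool> FastA("fast", cl::desc("..."));  // in one library
+  //   static cl::opt<bool> FastB("fast", cl::desc("..."));  // in another
+  // or when every option is registered twice because both a static archive
+  // and a shared-library copy of LLVM are linked into the same process.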
+ if (HadErrors) + report_fatal_error("inconsistency in registered CommandLine options"); } @@ -621,9 +631,11 @@ void cl::TokenizeWindowsCommandLine(StringRef Src, StringSaver &Saver, static bool ExpandResponseFile(const char *FName, StringSaver &Saver, TokenizerCallback Tokenizer, SmallVectorImpl &NewArgv) { - std::unique_ptr MemBuf; - if (MemoryBuffer::getFile(FName, MemBuf)) + ErrorOr> MemBufOrErr = + MemoryBuffer::getFile(FName); + if (!MemBufOrErr) return false; + std::unique_ptr MemBuf = std::move(MemBufOrErr.get()); StringRef Str(MemBuf->getBufferStart(), MemBuf->getBufferSize()); // If we have a UTF-16 byte order mark, convert to UTF-8 for parsing. @@ -1006,13 +1018,12 @@ void cl::ParseCommandLineOptions(int argc, const char * const *argv, } // Loop over args and make sure all required args are specified! - for (StringMap::iterator I = Opts.begin(), - E = Opts.end(); I != E; ++I) { - switch (I->second->getNumOccurrencesFlag()) { + for (const auto &Opt : Opts) { + switch (Opt.second->getNumOccurrencesFlag()) { case Required: case OneOrMore: - if (I->second->getNumOccurrences() == 0) { - I->second->error("must be specified at least once!"); + if (Opt.second->getNumOccurrences() == 0) { + Opt.second->error("must be specified at least once!"); ErrorParsing = true; } // Fall through diff --git a/lib/Support/CrashRecoveryContext.cpp b/lib/Support/CrashRecoveryContext.cpp index 016c805cc7ab..9b0e44339d84 100644 --- a/lib/Support/CrashRecoveryContext.cpp +++ b/lib/Support/CrashRecoveryContext.cpp @@ -332,12 +332,26 @@ const std::string &CrashRecoveryContext::getBacktrace() const { return CRC->Backtrace; } -// +// FIXME: Portability. +static void setThreadBackgroundPriority() { +#ifdef __APPLE__ + setpriority(PRIO_DARWIN_THREAD, 0, PRIO_DARWIN_BG); +#endif +} + +static bool hasThreadBackgroundPriority() { +#ifdef __APPLE__ + return getpriority(PRIO_DARWIN_THREAD, 0) == 1; +#else + return false; +#endif +} namespace { struct RunSafelyOnThreadInfo { function_ref Fn; CrashRecoveryContext *CRC; + bool UseBackgroundPriority; bool Result; }; } @@ -345,11 +359,16 @@ struct RunSafelyOnThreadInfo { static void RunSafelyOnThread_Dispatch(void *UserData) { RunSafelyOnThreadInfo *Info = reinterpret_cast(UserData); + + if (Info->UseBackgroundPriority) + setThreadBackgroundPriority(); + Info->Result = Info->CRC->RunSafely(Info->Fn); } bool CrashRecoveryContext::RunSafelyOnThread(function_ref Fn, unsigned RequestedStackSize) { - RunSafelyOnThreadInfo Info = { Fn, this, false }; + bool UseBackgroundPriority = hasThreadBackgroundPriority(); + RunSafelyOnThreadInfo Info = { Fn, this, UseBackgroundPriority, false }; llvm_execute_on_thread(RunSafelyOnThread_Dispatch, &Info, RequestedStackSize); if (CrashRecoveryContextImpl *CRC = (CrashRecoveryContextImpl *)Impl) CRC->setSwitchedThread(); diff --git a/lib/Support/Disassembler.cpp b/lib/Support/Disassembler.cpp deleted file mode 100644 index 27df3a9e2cb7..000000000000 --- a/lib/Support/Disassembler.cpp +++ /dev/null @@ -1,74 +0,0 @@ -//===- lib/Support/Disassembler.cpp -----------------------------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements the necessary glue to call external disassembler -// libraries. 
-// -//===----------------------------------------------------------------------===// - -#include "llvm/Support/Disassembler.h" -#include "llvm/Config/config.h" -#include -#include -#include -#include - -#if USE_UDIS86 -#include -#endif - -using namespace llvm; - -bool llvm::sys::hasDisassembler() -{ -#if defined (__i386__) || defined (__amd64__) || defined (__x86_64__) - // We have option to enable udis86 library. -# if USE_UDIS86 - return true; -#else - return false; -#endif -#else - return false; -#endif -} - -std::string llvm::sys::disassembleBuffer(uint8_t* start, size_t length, - uint64_t pc) { -#if (defined (__i386__) || defined (__amd64__) || defined (__x86_64__)) \ - && USE_UDIS86 - std::stringstream res; - - unsigned bits; -# if defined(__i386__) - bits = 32; -# else - bits = 64; -# endif - - ud_t ud_obj; - - ud_init(&ud_obj); - ud_set_input_buffer(&ud_obj, start, length); - ud_set_mode(&ud_obj, bits); - ud_set_pc(&ud_obj, pc); - ud_set_syntax(&ud_obj, UD_SYN_ATT); - - res << std::setbase(16) - << std::setw(bits/4); - - while (ud_disassemble(&ud_obj)) { - res << ud_insn_off(&ud_obj) << ":\t" << ud_insn_asm(&ud_obj) << "\n"; - } - - return res.str(); -#else - return "No disassembler available. See configure help for options.\n"; -#endif -} diff --git a/lib/Support/DynamicLibrary.cpp b/lib/Support/DynamicLibrary.cpp index 82d7c0cc6d19..d2b551e8a0a6 100644 --- a/lib/Support/DynamicLibrary.cpp +++ b/lib/Support/DynamicLibrary.cpp @@ -7,7 +7,7 @@ // //===----------------------------------------------------------------------===// // -// This header file implements the operating system DynamicLibrary concept. +// This file implements the operating system DynamicLibrary concept. // // FIXME: This file leaks ExplicitSymbols and OpenedHandles! // diff --git a/lib/Support/FileOutputBuffer.cpp b/lib/Support/FileOutputBuffer.cpp index 2e740ca04518..94bcdc58a806 100644 --- a/lib/Support/FileOutputBuffer.cpp +++ b/lib/Support/FileOutputBuffer.cpp @@ -87,7 +87,7 @@ FileOutputBuffer::create(StringRef FilePath, size_t Size, std::error_code FileOutputBuffer::commit(int64_t NewSmallerSize) { // Unmap buffer, letting OS flush dirty pages to file on disk. - Region.reset(nullptr); + Region.reset(); // If requested, resize file as part of commit. if ( NewSmallerSize != -1 ) { diff --git a/lib/Support/FileUtilities.cpp b/lib/Support/FileUtilities.cpp index 0d26bafd7714..8a234917827e 100644 --- a/lib/Support/FileUtilities.cpp +++ b/lib/Support/FileUtilities.cpp @@ -176,18 +176,21 @@ int llvm::DiffFilesWithTolerance(StringRef NameA, std::string *Error) { // Now its safe to mmap the files into memory because both files // have a non-zero size. - std::unique_ptr F1; - if (std::error_code ec = MemoryBuffer::getFile(NameA, F1)) { + ErrorOr> F1OrErr = MemoryBuffer::getFile(NameA); + if (std::error_code EC = F1OrErr.getError()) { if (Error) - *Error = ec.message(); + *Error = EC.message(); return 2; } - std::unique_ptr F2; - if (std::error_code ec = MemoryBuffer::getFile(NameB, F2)) { + std::unique_ptr F1 = std::move(F1OrErr.get()); + + ErrorOr> F2OrErr = MemoryBuffer::getFile(NameB); + if (std::error_code EC = F2OrErr.getError()) { if (Error) - *Error = ec.message(); + *Error = EC.message(); return 2; } + std::unique_ptr F2 = std::move(F2OrErr.get()); // Okay, now that we opened the files, scan them for the first difference. 
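  // The ErrorOr-based idiom used above is, schematically (report() is a
  // stand-in for whatever error handling a caller wants):
  //   ErrorOr<std::unique_ptr<MemoryBuffer>> BufOrErr =
  //       MemoryBuffer::getFile(Name);
  //   if (std::error_code EC = BufOrErr.getError())
  //     return report(EC);
  //   std::unique_ptr<MemoryBuffer> Buf = std::move(BufOrErr.get());
  // It replaces the old overloads that filled a unique_ptr out-parameter.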
const char *File1Start = F1->getBufferStart(); diff --git a/lib/Support/Host.cpp b/lib/Support/Host.cpp index ce0a3b6bed77..e2dd6d522bb7 100644 --- a/lib/Support/Host.cpp +++ b/lib/Support/Host.cpp @@ -7,7 +7,7 @@ // //===----------------------------------------------------------------------===// // -// This header file implements the operating system Host concept. +// This file implements the operating system Host concept. // //===----------------------------------------------------------------------===// @@ -570,6 +570,8 @@ StringRef sys::getHostCPUName() { .Case("A2", "a2") .Case("POWER6", "pwr6") .Case("POWER7", "pwr7") + .Case("POWER8", "pwr8") + .Case("POWER8E", "pwr8") .Default(generic); } #elif defined(__linux__) && defined(__arm__) diff --git a/lib/Support/LockFileManager.cpp b/lib/Support/LockFileManager.cpp index 681bae2ba183..3f224e0c588f 100644 --- a/lib/Support/LockFileManager.cpp +++ b/lib/Support/LockFileManager.cpp @@ -33,11 +33,13 @@ Optional > LockFileManager::readLockFile(StringRef LockFileName) { // Read the owning host and PID out of the lock file. If it appears that the // owning process is dead, the lock file is invalid. - std::unique_ptr MB; - if (MemoryBuffer::getFile(LockFileName, MB)) { + ErrorOr> MBOrErr = + MemoryBuffer::getFile(LockFileName); + if (!MBOrErr) { sys::fs::remove(LockFileName); return None; } + std::unique_ptr MB = std::move(MBOrErr.get()); StringRef Hostname; StringRef PIDStr; diff --git a/lib/Support/ManagedStatic.cpp b/lib/Support/ManagedStatic.cpp index 6a1c2a545a8d..b8fb2841e525 100644 --- a/lib/Support/ManagedStatic.cpp +++ b/lib/Support/ManagedStatic.cpp @@ -14,16 +14,26 @@ #include "llvm/Support/ManagedStatic.h" #include "llvm/Config/config.h" #include "llvm/Support/Atomic.h" +#include "llvm/Support/Mutex.h" +#include "llvm/Support/MutexGuard.h" #include using namespace llvm; static const ManagedStaticBase *StaticList = nullptr; +static sys::Mutex& getManagedStaticMutex() { + // We need to use a function local static here, since this can get called + // during a static constructor and we need to guarantee that it's initialized + // correctly. + static sys::Mutex ManagedStaticMutex; + return ManagedStaticMutex; +} + void ManagedStaticBase::RegisterManagedStatic(void *(*Creator)(), void (*Deleter)(void*)) const { assert(Creator); if (llvm_is_multithreaded()) { - llvm_acquire_global_lock(); + MutexGuard Lock(getManagedStaticMutex()); if (!Ptr) { void* tmp = Creator(); @@ -43,8 +53,6 @@ void ManagedStaticBase::RegisterManagedStatic(void *(*Creator)(), Next = StaticList; StaticList = this; } - - llvm_release_global_lock(); } else { assert(!Ptr && !DeleterFn && !Next && "Partially initialized ManagedStatic!?"); @@ -75,8 +83,8 @@ void ManagedStaticBase::destroy() const { /// llvm_shutdown - Deallocate and destroy all ManagedStatic variables. void llvm::llvm_shutdown() { + MutexGuard Lock(getManagedStaticMutex()); + while (StaticList) StaticList->destroy(); - - if (llvm_is_multithreaded()) llvm_stop_multithreaded(); } diff --git a/lib/Support/MemoryBuffer.cpp b/lib/Support/MemoryBuffer.cpp index abf4f60b27a6..5f4b7daae53d 100644 --- a/lib/Support/MemoryBuffer.cpp +++ b/lib/Support/MemoryBuffer.cpp @@ -152,18 +152,11 @@ MemoryBuffer *MemoryBuffer::getNewMemBuffer(size_t Size, StringRef BufferName) { return SB; } - -/// getFileOrSTDIN - Open the specified file as a MemoryBuffer, or open stdin -/// if the Filename is "-". If an error occurs, this returns null and fills -/// in *ErrStr with a reason. 
If stdin is empty, this API (unlike getSTDIN) -/// returns an empty buffer. -std::error_code -MemoryBuffer::getFileOrSTDIN(StringRef Filename, - std::unique_ptr &Result, - int64_t FileSize) { +ErrorOr> +MemoryBuffer::getFileOrSTDIN(StringRef Filename, int64_t FileSize) { if (Filename == "-") - return getSTDIN(Result); - return getFile(Filename, Result, FileSize); + return getSTDIN(); + return getFile(Filename, FileSize); } @@ -212,9 +205,8 @@ class MemoryBufferMMapFile : public MemoryBuffer { }; } -static std::error_code -getMemoryBufferForStream(int FD, StringRef BufferName, - std::unique_ptr &Result) { +static ErrorOr> +getMemoryBufferForStream(int FD, StringRef BufferName) { const ssize_t ChunkSize = 4096*4; SmallString Buffer; ssize_t ReadBytes; @@ -229,48 +221,43 @@ getMemoryBufferForStream(int FD, StringRef BufferName, Buffer.set_size(Buffer.size() + ReadBytes); } while (ReadBytes != 0); - Result.reset(MemoryBuffer::getMemBufferCopy(Buffer, BufferName)); - return std::error_code(); + std::unique_ptr Ret( + MemoryBuffer::getMemBufferCopy(Buffer, BufferName)); + return std::move(Ret); } -static std::error_code getFileAux(const char *Filename, - std::unique_ptr &Result, - int64_t FileSize, bool RequiresNullTerminator, - bool IsVolatileSize); +static ErrorOr> +getFileAux(const char *Filename, int64_t FileSize, bool RequiresNullTerminator, + bool IsVolatileSize); -std::error_code MemoryBuffer::getFile(Twine Filename, - std::unique_ptr &Result, - int64_t FileSize, - bool RequiresNullTerminator, - bool IsVolatileSize) { +ErrorOr> +MemoryBuffer::getFile(Twine Filename, int64_t FileSize, + bool RequiresNullTerminator, bool IsVolatileSize) { // Ensure the path is null terminated. SmallString<256> PathBuf; StringRef NullTerminatedName = Filename.toNullTerminatedStringRef(PathBuf); - return getFileAux(NullTerminatedName.data(), Result, FileSize, - RequiresNullTerminator, IsVolatileSize); + return getFileAux(NullTerminatedName.data(), FileSize, RequiresNullTerminator, + IsVolatileSize); } -static std::error_code getOpenFileImpl(int FD, const char *Filename, - std::unique_ptr &Result, - uint64_t FileSize, uint64_t MapSize, - int64_t Offset, - bool RequiresNullTerminator, - bool IsVolatileSize); - -static std::error_code getFileAux(const char *Filename, - std::unique_ptr &Result, - int64_t FileSize, bool RequiresNullTerminator, - bool IsVolatileSize) { +static ErrorOr> +getOpenFileImpl(int FD, const char *Filename, uint64_t FileSize, + uint64_t MapSize, int64_t Offset, bool RequiresNullTerminator, + bool IsVolatileSize); + +static ErrorOr> +getFileAux(const char *Filename, int64_t FileSize, bool RequiresNullTerminator, + bool IsVolatileSize) { int FD; std::error_code EC = sys::fs::openFileForRead(Filename, FD); if (EC) return EC; - std::error_code ret = - getOpenFileImpl(FD, Filename, Result, FileSize, FileSize, 0, + ErrorOr> Ret = + getOpenFileImpl(FD, Filename, FileSize, FileSize, 0, RequiresNullTerminator, IsVolatileSize); close(FD); - return ret; + return Ret; } static bool shouldUseMmap(int FD, @@ -321,12 +308,10 @@ static bool shouldUseMmap(int FD, return true; } -static std::error_code getOpenFileImpl(int FD, const char *Filename, - std::unique_ptr &Result, - uint64_t FileSize, uint64_t MapSize, - int64_t Offset, - bool RequiresNullTerminator, - bool IsVolatileSize) { +static ErrorOr> +getOpenFileImpl(int FD, const char *Filename, uint64_t FileSize, + uint64_t MapSize, int64_t Offset, bool RequiresNullTerminator, + bool IsVolatileSize) { static int PageSize = 
sys::process::get_self()->page_size(); // Default is to map the full file. @@ -345,7 +330,7 @@ static std::error_code getOpenFileImpl(int FD, const char *Filename, sys::fs::file_type Type = Status.type(); if (Type != sys::fs::file_type::regular_file && Type != sys::fs::file_type::block_file) - return getMemoryBufferForStream(FD, Filename, Result); + return getMemoryBufferForStream(FD, Filename); FileSize = Status.getSize(); } @@ -355,10 +340,11 @@ static std::error_code getOpenFileImpl(int FD, const char *Filename, if (shouldUseMmap(FD, FileSize, MapSize, Offset, RequiresNullTerminator, PageSize, IsVolatileSize)) { std::error_code EC; - Result.reset(new (NamedBufferAlloc(Filename)) MemoryBufferMMapFile( - RequiresNullTerminator, FD, MapSize, Offset, EC)); + std::unique_ptr Result( + new (NamedBufferAlloc(Filename)) + MemoryBufferMMapFile(RequiresNullTerminator, FD, MapSize, Offset, EC)); if (!EC) - return std::error_code(); + return std::move(Result); } MemoryBuffer *Buf = MemoryBuffer::getNewUninitMemBuffer(MapSize, Filename); @@ -397,36 +383,29 @@ static std::error_code getOpenFileImpl(int FD, const char *Filename, BufPtr += NumRead; } - Result.swap(SB); - return std::error_code(); + return std::move(SB); } -std::error_code MemoryBuffer::getOpenFile(int FD, const char *Filename, - std::unique_ptr &Result, - uint64_t FileSize, - bool RequiresNullTerminator, - bool IsVolatileSize) { - return getOpenFileImpl(FD, Filename, Result, FileSize, FileSize, 0, +ErrorOr> +MemoryBuffer::getOpenFile(int FD, const char *Filename, uint64_t FileSize, + bool RequiresNullTerminator, bool IsVolatileSize) { + return getOpenFileImpl(FD, Filename, FileSize, FileSize, 0, RequiresNullTerminator, IsVolatileSize); } -std::error_code MemoryBuffer::getOpenFileSlice( - int FD, const char *Filename, std::unique_ptr &Result, - uint64_t MapSize, int64_t Offset, bool IsVolatileSize) { - return getOpenFileImpl(FD, Filename, Result, -1, MapSize, Offset, false, +ErrorOr> +MemoryBuffer::getOpenFileSlice(int FD, const char *Filename, uint64_t MapSize, + int64_t Offset, bool IsVolatileSize) { + return getOpenFileImpl(FD, Filename, -1, MapSize, Offset, false, IsVolatileSize); } -//===----------------------------------------------------------------------===// -// MemoryBuffer::getSTDIN implementation. -//===----------------------------------------------------------------------===// - -std::error_code MemoryBuffer::getSTDIN(std::unique_ptr &Result) { +ErrorOr> MemoryBuffer::getSTDIN() { // Read in all of the data from stdin, we cannot mmap stdin. // // FIXME: That isn't necessarily true, we should try to mmap stdin and // fallback if it fails. 
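  // Callers normally arrive here through the convenience wrapper, e.g.
  //   ErrorOr<std::unique_ptr<MemoryBuffer>> B =
  //       MemoryBuffer::getFileOrSTDIN("-");
  // which forwards "-" to getSTDIN() and any other name to getFile().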
sys::ChangeStdinToBinary(); - return getMemoryBufferForStream(0, "", Result); + return getMemoryBufferForStream(0, ""); } diff --git a/lib/Support/Path.cpp b/lib/Support/Path.cpp index 15edf0ddbbd8..d5a0ec55c682 100644 --- a/lib/Support/Path.cpp +++ b/lib/Support/Path.cpp @@ -846,6 +846,40 @@ std::error_code create_directories(const Twine &Path, bool IgnoreExisting) { return create_directory(P, IgnoreExisting); } +std::error_code copy_file(const Twine &From, const Twine &To) { + int ReadFD, WriteFD; + if (std::error_code EC = openFileForRead(From, ReadFD)) + return EC; + if (std::error_code EC = openFileForWrite(To, WriteFD, F_None)) { + close(ReadFD); + return EC; + } + + const size_t BufSize = 4096; + char *Buf = new char[BufSize]; + int BytesRead = 0, BytesWritten = 0; + for (;;) { + BytesRead = read(ReadFD, Buf, BufSize); + if (BytesRead <= 0) + break; + while (BytesRead) { + BytesWritten = write(WriteFD, Buf, BytesRead); + if (BytesWritten < 0) + break; + BytesRead -= BytesWritten; + } + if (BytesWritten < 0) + break; + } + close(ReadFD); + close(WriteFD); + delete[] Buf; + + if (BytesRead < 0 || BytesWritten < 0) + return std::error_code(errno, std::generic_category()); + return std::error_code(); +} + bool exists(file_status status) { return status_known(status) && status.type() != file_type::file_not_found; } @@ -893,7 +927,7 @@ void directory_entry::replace_filename(const Twine &filename, file_status st) { } /// @brief Identify the magic in magic. - file_magic identify_magic(StringRef Magic) { +file_magic identify_magic(StringRef Magic) { if (Magic.size() < 4) return file_magic::unknown; switch ((unsigned char)Magic[0]) { @@ -1031,7 +1065,7 @@ std::error_code identify_magic(const Twine &Path, file_magic &Result) { char Buffer[32]; int Length = read(FD, Buffer, sizeof(Buffer)); - if (Length < 0) + if (close(FD) != 0 || Length < 0) return std::error_code(errno, std::generic_category()); Result = identify_magic(StringRef(Buffer, Length)); diff --git a/lib/Support/Process.cpp b/lib/Support/Process.cpp index 0380ed955dd5..0d42e0e35b92 100644 --- a/lib/Support/Process.cpp +++ b/lib/Support/Process.cpp @@ -7,13 +7,16 @@ // //===----------------------------------------------------------------------===// // -// This header file implements the operating system Process concept. +// This file implements the operating system Process concept. 
// //===----------------------------------------------------------------------===// +#include "llvm/ADT/StringExtras.h" #include "llvm/Config/config.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/FileSystem.h" #include "llvm/Support/Process.h" +#include "llvm/Support/Program.h" using namespace llvm; using namespace sys; @@ -66,6 +69,33 @@ TimeValue self_process::get_wall_time() const { return getElapsedWallTime(); } +Optional Process::FindInEnvPath(const std::string& EnvName, + const std::string& FileName) +{ + Optional FoundPath; + Optional OptPath = Process::GetEnv(EnvName); + if (!OptPath.hasValue()) + return FoundPath; + + const char EnvPathSeparatorStr[] = {EnvPathSeparator, '\0'}; + SmallVector Dirs; + SplitString(OptPath.getValue(), Dirs, EnvPathSeparatorStr); + + for (const auto &Dir : Dirs) { + if (Dir.empty()) + continue; + + SmallString<128> FilePath(Dir); + path::append(FilePath, FileName); + if (fs::exists(Twine(FilePath))) { + FoundPath = FilePath.str(); + break; + } + } + + return FoundPath; +} + #define COLOR(FGBG, CODE, BOLD) "\033[0;" BOLD FGBG CODE "m" diff --git a/lib/Support/Program.cpp b/lib/Support/Program.cpp index eb700e3a8591..b84b82b1f10b 100644 --- a/lib/Support/Program.cpp +++ b/lib/Support/Program.cpp @@ -7,7 +7,7 @@ // //===----------------------------------------------------------------------===// // -// This header file implements the operating system Program concept. +// This file implements the operating system Program concept. // //===----------------------------------------------------------------------===// diff --git a/lib/Support/RandomNumberGenerator.cpp b/lib/Support/RandomNumberGenerator.cpp new file mode 100644 index 000000000000..c50e7cb8fbe5 --- /dev/null +++ b/lib/Support/RandomNumberGenerator.cpp @@ -0,0 +1,61 @@ +//===-- RandomNumberGenerator.cpp - Implement RNG class -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements random number generation (RNG). +// The current implementation is NOT cryptographically secure as it uses +// the C++11 facilities. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "rng" +#include "llvm/Support/RandomNumberGenerator.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +// Tracking BUG: 19665 +// http://llvm.org/bugs/show_bug.cgi?id=19665 +// +// Do not change to cl::opt since this silently breaks argument parsing. +static cl::opt +Seed("rng-seed", cl::value_desc("seed"), + cl::desc("Seed for the random number generator"), cl::init(0)); + +RandomNumberGenerator::RandomNumberGenerator(StringRef Salt) { + DEBUG( + if (Seed == 0) + errs() << "Warning! Using unseeded random number generator.\n" + ); + + // Combine seed and salt using std::seed_seq. + // Entropy: Seed-low, Seed-high, Salt... 
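+  // Worked example (illustrative): Seed = 0x0000000100000002 with Salt =
+  // "opt" yields the 32-bit words {0x00000002, 0x00000001, 0x006F7074}:
+  // the low and high halves of the seed, then the salt bytes accumulated
+  // four at a time, with the final partial word flushed after the loop.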
+ std::vector Data; + Data.reserve(2 + Salt.size()/4 + 1); + Data.push_back(Seed); + Data.push_back(Seed >> 32); + + uint32_t Pack = 0; + for (size_t I = 0; I < Salt.size(); ++I) { + Pack <<= 8; + Pack += Salt[I]; + + if (I%4 == 3) + Data.push_back(Pack); + } + Data.push_back(Pack); + + std::seed_seq SeedSeq(Data.begin(), Data.end()); + Generator.seed(SeedSeq); +} + +uint64_t RandomNumberGenerator::next(uint64_t Max) { + std::uniform_int_distribution distribution(0, Max - 1); + return distribution(Generator); +} diff --git a/lib/Support/ScaledNumber.cpp b/lib/Support/ScaledNumber.cpp new file mode 100644 index 000000000000..3fe027ba3316 --- /dev/null +++ b/lib/Support/ScaledNumber.cpp @@ -0,0 +1,319 @@ +//==- lib/Support/ScaledNumber.cpp - Support for scaled numbers -*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Implementation of some scaled number algorithms. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/ScaledNumber.h" + +#include "llvm/ADT/APFloat.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; +using namespace llvm::ScaledNumbers; + +std::pair ScaledNumbers::multiply64(uint64_t LHS, + uint64_t RHS) { + // Separate into two 32-bit digits (U.L). + auto getU = [](uint64_t N) { return N >> 32; }; + auto getL = [](uint64_t N) { return N & UINT32_MAX; }; + uint64_t UL = getU(LHS), LL = getL(LHS), UR = getU(RHS), LR = getL(RHS); + + // Compute cross products. + uint64_t P1 = UL * UR, P2 = UL * LR, P3 = LL * UR, P4 = LL * LR; + + // Sum into two 64-bit digits. + uint64_t Upper = P1, Lower = P4; + auto addWithCarry = [&](uint64_t N) { + uint64_t NewLower = Lower + (getL(N) << 32); + Upper += getU(N) + (NewLower < Lower); + Lower = NewLower; + }; + addWithCarry(P2); + addWithCarry(P3); + + // Check whether the upper digit is empty. + if (!Upper) + return std::make_pair(Lower, 0); + + // Shift as little as possible to maximize precision. + unsigned LeadingZeros = countLeadingZeros(Upper); + int Shift = 64 - LeadingZeros; + if (LeadingZeros) + Upper = Upper << LeadingZeros | Lower >> Shift; + return getRounded(Upper, Shift, + Shift && (Lower & UINT64_C(1) << (Shift - 1))); +} + +static uint64_t getHalf(uint64_t N) { return (N >> 1) + (N & 1); } + +std::pair ScaledNumbers::divide32(uint32_t Dividend, + uint32_t Divisor) { + assert(Dividend && "expected non-zero dividend"); + assert(Divisor && "expected non-zero divisor"); + + // Use 64-bit math and canonicalize the dividend to gain precision. + uint64_t Dividend64 = Dividend; + int Shift = 0; + if (int Zeros = countLeadingZeros(Dividend64)) { + Shift -= Zeros; + Dividend64 <<= Zeros; + } + uint64_t Quotient = Dividend64 / Divisor; + uint64_t Remainder = Dividend64 % Divisor; + + // If Quotient needs to be shifted, leave the rounding to getAdjusted(). + if (Quotient > UINT32_MAX) + return getAdjusted(Quotient, Shift); + + // Round based on the value of the next bit. + return getRounded(Quotient, Shift, Remainder >= getHalf(Divisor)); +} + +std::pair ScaledNumbers::divide64(uint64_t Dividend, + uint64_t Divisor) { + assert(Dividend && "expected non-zero dividend"); + assert(Divisor && "expected non-zero divisor"); + + // Minimize size of divisor. 
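+  // For example, Divisor = 40 = 5 << 3 is reduced to 5 with Shift = -3;
+  // the returned digits are then scaled by 2^-3 to compensate.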
+ int Shift = 0; + if (int Zeros = countTrailingZeros(Divisor)) { + Shift -= Zeros; + Divisor >>= Zeros; + } + + // Check for powers of two. + if (Divisor == 1) + return std::make_pair(Dividend, Shift); + + // Maximize size of dividend. + if (int Zeros = countLeadingZeros(Dividend)) { + Shift -= Zeros; + Dividend <<= Zeros; + } + + // Start with the result of a divide. + uint64_t Quotient = Dividend / Divisor; + Dividend %= Divisor; + + // Continue building the quotient with long division. + while (!(Quotient >> 63) && Dividend) { + // Shift Dividend and check for overflow. + bool IsOverflow = Dividend >> 63; + Dividend <<= 1; + --Shift; + + // Get the next bit of Quotient. + Quotient <<= 1; + if (IsOverflow || Divisor <= Dividend) { + Quotient |= 1; + Dividend -= Divisor; + } + } + + return getRounded(Quotient, Shift, Dividend >= getHalf(Divisor)); +} + +int ScaledNumbers::compareImpl(uint64_t L, uint64_t R, int ScaleDiff) { + assert(ScaleDiff >= 0 && "wrong argument order"); + assert(ScaleDiff < 64 && "numbers too far apart"); + + uint64_t L_adjusted = L >> ScaleDiff; + if (L_adjusted < R) + return -1; + if (L_adjusted > R) + return 1; + + return L > L_adjusted << ScaleDiff ? 1 : 0; +} + +static void appendDigit(std::string &Str, unsigned D) { + assert(D < 10); + Str += '0' + D % 10; +} + +static void appendNumber(std::string &Str, uint64_t N) { + while (N) { + appendDigit(Str, N % 10); + N /= 10; + } +} + +static bool doesRoundUp(char Digit) { + switch (Digit) { + case '5': + case '6': + case '7': + case '8': + case '9': + return true; + default: + return false; + } +} + +static std::string toStringAPFloat(uint64_t D, int E, unsigned Precision) { + assert(E >= ScaledNumbers::MinScale); + assert(E <= ScaledNumbers::MaxScale); + + // Find a new E, but don't let it increase past MaxScale. + int LeadingZeros = ScaledNumberBase::countLeadingZeros64(D); + int NewE = std::min(ScaledNumbers::MaxScale, E + 63 - LeadingZeros); + int Shift = 63 - (NewE - E); + assert(Shift <= LeadingZeros); + assert(Shift == LeadingZeros || NewE == ScaledNumbers::MaxScale); + D <<= Shift; + E = NewE; + + // Check for a denormal. + unsigned AdjustedE = E + 16383; + if (!(D >> 63)) { + assert(E == ScaledNumbers::MaxScale); + AdjustedE = 0; + } + + // Build the float and print it. + uint64_t RawBits[2] = {D, AdjustedE}; + APFloat Float(APFloat::x87DoubleExtended, APInt(80, RawBits)); + SmallVector Chars; + Float.toString(Chars, Precision, 0); + return std::string(Chars.begin(), Chars.end()); +} + +static std::string stripTrailingZeros(const std::string &Float) { + size_t NonZero = Float.find_last_not_of('0'); + assert(NonZero != std::string::npos && "no . in floating point string"); + + if (Float[NonZero] == '.') + ++NonZero; + + return Float.substr(0, NonZero + 1); +} + +std::string ScaledNumberBase::toString(uint64_t D, int16_t E, int Width, + unsigned Precision) { + if (!D) + return "0.0"; + + // Canonicalize exponent and digits. + uint64_t Above0 = 0; + uint64_t Below0 = 0; + uint64_t Extra = 0; + int ExtraShift = 0; + if (E == 0) { + Above0 = D; + } else if (E > 0) { + if (int Shift = std::min(int16_t(countLeadingZeros64(D)), E)) { + D <<= Shift; + E -= Shift; + + if (!E) + Above0 = D; + } + } else if (E > -64) { + Above0 = D >> -E; + Below0 = D << (64 + E); + } else if (E > -120) { + Below0 = D >> (-E - 64); + Extra = D << (128 + E); + ExtraShift = -64 - E; + } + + // Fall back on APFloat for very small and very large numbers. 
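+  // For example, D = 1 with E = -200 cannot be normalized into the
+  // fractional bits tracked in Below0/Extra, so both remain zero and the
+  // number is rendered through APFloat instead.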
+ if (!Above0 && !Below0) + return toStringAPFloat(D, E, Precision); + + // Append the digits before the decimal. + std::string Str; + size_t DigitsOut = 0; + if (Above0) { + appendNumber(Str, Above0); + DigitsOut = Str.size(); + } else + appendDigit(Str, 0); + std::reverse(Str.begin(), Str.end()); + + // Return early if there's nothing after the decimal. + if (!Below0) + return Str + ".0"; + + // Append the decimal and beyond. + Str += '.'; + uint64_t Error = UINT64_C(1) << (64 - Width); + + // We need to shift Below0 to the right to make space for calculating + // digits. Save the precision we're losing in Extra. + Extra = (Below0 & 0xf) << 56 | (Extra >> 8); + Below0 >>= 4; + size_t SinceDot = 0; + size_t AfterDot = Str.size(); + do { + if (ExtraShift) { + --ExtraShift; + Error *= 5; + } else + Error *= 10; + + Below0 *= 10; + Extra *= 10; + Below0 += (Extra >> 60); + Extra = Extra & (UINT64_MAX >> 4); + appendDigit(Str, Below0 >> 60); + Below0 = Below0 & (UINT64_MAX >> 4); + if (DigitsOut || Str.back() != '0') + ++DigitsOut; + ++SinceDot; + } while (Error && (Below0 << 4 | Extra >> 60) >= Error / 2 && + (!Precision || DigitsOut <= Precision || SinceDot < 2)); + + // Return early for maximum precision. + if (!Precision || DigitsOut <= Precision) + return stripTrailingZeros(Str); + + // Find where to truncate. + size_t Truncate = + std::max(Str.size() - (DigitsOut - Precision), AfterDot + 1); + + // Check if there's anything to truncate. + if (Truncate >= Str.size()) + return stripTrailingZeros(Str); + + bool Carry = doesRoundUp(Str[Truncate]); + if (!Carry) + return stripTrailingZeros(Str.substr(0, Truncate)); + + // Round with the first truncated digit. + for (std::string::reverse_iterator I(Str.begin() + Truncate), E = Str.rend(); + I != E; ++I) { + if (*I == '.') + continue; + if (*I == '9') { + *I = '0'; + continue; + } + + ++*I; + Carry = false; + break; + } + + // Add "1" in front if we still need to carry. + return stripTrailingZeros(std::string(Carry, '1') + Str.substr(0, Truncate)); +} + +raw_ostream &ScaledNumberBase::print(raw_ostream &OS, uint64_t D, int16_t E, + int Width, unsigned Precision) { + return OS << toString(D, E, Width, Precision); +} + +void ScaledNumberBase::dump(uint64_t D, int16_t E, int Width) { + print(dbgs(), D, E, Width, 0) << "[" << Width << ":" << D << "*2^" << E + << "]"; } diff --git a/lib/Support/SourceMgr.cpp b/lib/Support/SourceMgr.cpp index 4d00d3baa417..003cb56e6cb5 100644 --- a/lib/Support/SourceMgr.cpp +++ b/lib/Support/SourceMgr.cpp @@ -27,7 +27,7 @@ static const size_t TabStop = 8; namespace { struct LineNoCacheTy { - int LastQueryBufferID; + unsigned LastQueryBufferID; const char *LastQuery; unsigned LineNoOfQuery; }; @@ -49,41 +49,44 @@ SourceMgr::~SourceMgr() { } } -size_t SourceMgr::AddIncludeFile(const std::string &Filename, - SMLoc IncludeLoc, - std::string &IncludedFile) { - std::unique_ptr<MemoryBuffer> NewBuf; +unsigned SourceMgr::AddIncludeFile(const std::string &Filename, + SMLoc IncludeLoc, + std::string &IncludedFile) { IncludedFile = Filename; - MemoryBuffer::getFile(IncludedFile.c_str(), NewBuf); + ErrorOr<std::unique_ptr<MemoryBuffer>> NewBufOrErr = + MemoryBuffer::getFile(IncludedFile.c_str()); // If the file didn't exist directly, see if it's in an include path. 
- for (unsigned i = 0, e = IncludeDirectories.size(); i != e && !NewBuf; ++i) { - IncludedFile = IncludeDirectories[i] + sys::path::get_separator().data() + Filename; - MemoryBuffer::getFile(IncludedFile.c_str(), NewBuf); + for (unsigned i = 0, e = IncludeDirectories.size(); i != e && !NewBufOrErr; + ++i) { + IncludedFile = + IncludeDirectories[i] + sys::path::get_separator().data() + Filename; + NewBufOrErr = MemoryBuffer::getFile(IncludedFile.c_str()); } - if (!NewBuf) return ~0U; + if (!NewBufOrErr) + return 0; - return AddNewSourceBuffer(NewBuf.release(), IncludeLoc); + return AddNewSourceBuffer(NewBufOrErr.get().release(), IncludeLoc); } - -int SourceMgr::FindBufferContainingLoc(SMLoc Loc) const { +unsigned SourceMgr::FindBufferContainingLoc(SMLoc Loc) const { for (unsigned i = 0, e = Buffers.size(); i != e; ++i) if (Loc.getPointer() >= Buffers[i].Buffer->getBufferStart() && // Use <= here so that a pointer to the null at the end of the buffer // is included as part of the buffer. Loc.getPointer() <= Buffers[i].Buffer->getBufferEnd()) - return i; - return -1; + return i + 1; + return 0; } std::pair<unsigned, unsigned> -SourceMgr::getLineAndColumn(SMLoc Loc, int BufferID) const { - if (BufferID == -1) BufferID = FindBufferContainingLoc(Loc); - assert(BufferID != -1 && "Invalid Location!"); +SourceMgr::getLineAndColumn(SMLoc Loc, unsigned BufferID) const { + if (!BufferID) + BufferID = FindBufferContainingLoc(Loc); + assert(BufferID && "Invalid Location!"); - MemoryBuffer *Buff = getBufferInfo(BufferID).Buffer; + const MemoryBuffer *Buff = getMemoryBuffer(BufferID); // Count the number of \n's between the start of the file and the specified // location. @@ -125,8 +128,8 @@ SourceMgr::getLineAndColumn(SMLoc Loc, int BufferID) const { void SourceMgr::PrintIncludeStack(SMLoc IncludeLoc, raw_ostream &OS) const { if (IncludeLoc == SMLoc()) return; // Top of stack. - int CurBuf = FindBufferContainingLoc(IncludeLoc); - assert(CurBuf != -1 && "Invalid or unspecified location!"); + unsigned CurBuf = FindBufferContainingLoc(IncludeLoc); + assert(CurBuf && "Invalid or unspecified location!"); PrintIncludeStack(getBufferInfo(CurBuf).IncludeLoc, OS); @@ -149,10 +152,10 @@ SMDiagnostic SourceMgr::GetMessage(SMLoc Loc, SourceMgr::DiagKind Kind, std::string LineStr; if (Loc.isValid()) { - int CurBuf = FindBufferContainingLoc(Loc); - assert(CurBuf != -1 && "Invalid or unspecified location!"); + unsigned CurBuf = FindBufferContainingLoc(Loc); + assert(CurBuf && "Invalid or unspecified location!"); - MemoryBuffer *CurMB = getBufferInfo(CurBuf).Buffer; + const MemoryBuffer *CurMB = getMemoryBuffer(CurBuf); BufferID = CurMB->getBufferIdentifier(); // Scan backward to find the start of the line. 
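A toy C++ sketch (ours, not the LLVM API) of the buffer-ID convention these SourceMgr hunks adopt: IDs become unsigned and 1-based, so 0 rather than -1 is the invalid value, and callers can simply test the ID for truth.

#include <vector>

struct Buffer { const char *Start, *End; };

// Returns the 1-based ID of the buffer containing Ptr, or 0 if none does.
unsigned findBufferContaining(const std::vector<Buffer> &Buffers,
                              const char *Ptr) {
  for (unsigned I = 0, E = Buffers.size(); I != E; ++I)
    if (Ptr >= Buffers[I].Start && Ptr <= Buffers[I].End)
      return I + 1; // IDs are offset by one so that 0 stays invalid.
  return 0;         // Not found.
}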
@@ -208,8 +211,8 @@ void SourceMgr::PrintMessage(raw_ostream &OS, const SMDiagnostic &Diagnostic, } if (Diagnostic.getLoc().isValid()) { - int CurBuf = FindBufferContainingLoc(Diagnostic.getLoc()); - assert(CurBuf != -1 && "Invalid or unspecified location!"); + unsigned CurBuf = FindBufferContainingLoc(Diagnostic.getLoc()); + assert(CurBuf && "Invalid or unspecified location!"); PrintIncludeStack(getBufferInfo(CurBuf).IncludeLoc, OS); } diff --git a/lib/Transforms/Utils/SpecialCaseList.cpp b/lib/Support/SpecialCaseList.cpp similarity index 68% rename from lib/Transforms/Utils/SpecialCaseList.cpp rename to lib/Support/SpecialCaseList.cpp index 45a2b618a710..21e43c5e7035 100644 --- a/lib/Transforms/Utils/SpecialCaseList.cpp +++ b/lib/Support/SpecialCaseList.cpp @@ -14,15 +14,11 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Transforms/Utils/SpecialCaseList.h" +#include "llvm/Support/SpecialCaseList.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringSet.h" -#include "llvm/IR/DerivedTypes.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/GlobalVariable.h" -#include "llvm/IR/Module.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Regex.h" #include "llvm/Support/raw_ostream.h" @@ -38,10 +34,12 @@ namespace llvm { /// reason for doing so is efficiency; StringSet is much faster at matching /// literal strings than Regex. struct SpecialCaseList::Entry { - StringSet<> Strings; - Regex *RegEx; + Entry() {} + Entry(Entry &&Other) + : Strings(std::move(Other.Strings)), RegEx(std::move(Other.RegEx)) {} - Entry() : RegEx(nullptr) {} + StringSet<> Strings; + std::unique_ptr<Regex> RegEx; bool match(StringRef Query) const { return Strings.count(Query) || (RegEx && RegEx->match(Query)); @@ -54,12 +52,13 @@ SpecialCaseList *SpecialCaseList::create( const StringRef Path, std::string &Error) { if (Path.empty()) return new SpecialCaseList(); - std::unique_ptr<MemoryBuffer> File; - if (std::error_code EC = MemoryBuffer::getFile(Path, File)) { + ErrorOr<std::unique_ptr<MemoryBuffer>> FileOrErr = + MemoryBuffer::getFile(Path); + if (std::error_code EC = FileOrErr.getError()) { Error = (Twine("Can't open file '") + Path + "': " + EC.message()).str(); return nullptr; } - return create(File.get(), Error); + return create(FileOrErr.get().get(), Error); } SpecialCaseList *SpecialCaseList::create( @@ -150,66 +149,16 @@ bool SpecialCaseList::parse(const MemoryBuffer *MB, std::string &Error) { for (StringMap<std::string>::const_iterator II = I->second.begin(), IE = I->second.end(); II != IE; ++II) { - Entries[I->getKey()][II->getKey()].RegEx = new Regex(II->getValue()); + Entries[I->getKey()][II->getKey()].RegEx.reset(new Regex(II->getValue())); } } return true; } -SpecialCaseList::~SpecialCaseList() { - for (StringMap<StringMap<Entry> >::iterator I = Entries.begin(), - E = Entries.end(); - I != E; ++I) { - for (StringMap<Entry>::const_iterator II = I->second.begin(), - IE = I->second.end(); - II != IE; ++II) { - delete II->second.RegEx; - } - } -} - -bool SpecialCaseList::isIn(const Function& F, const StringRef Category) const { - return isIn(*F.getParent(), Category) || - inSectionCategory("fun", F.getName(), Category); -} - -static StringRef GetGlobalTypeString(const GlobalValue &G) { - // Types of GlobalVariables are always pointer types. - Type *GType = G.getType()->getElementType(); - // For now we support blacklisting struct types only. 
- if (StructType *SGType = dyn_cast<StructType>(GType)) { - if (!SGType->isLiteral()) - return SGType->getName(); - } - return ""; -} - -bool SpecialCaseList::isIn(const GlobalVariable &G, - const StringRef Category) const { - return isIn(*G.getParent(), Category) || - inSectionCategory("global", G.getName(), Category) || - inSectionCategory("type", GetGlobalTypeString(G), Category); -} - -bool SpecialCaseList::isIn(const GlobalAlias &GA, - const StringRef Category) const { - if (isIn(*GA.getParent(), Category)) - return true; - - if (isa<FunctionType>(GA.getType()->getElementType())) - return inSectionCategory("fun", GA.getName(), Category); - - return inSectionCategory("global", GA.getName(), Category) || - inSectionCategory("type", GetGlobalTypeString(GA), Category); -} - -bool SpecialCaseList::isIn(const Module &M, const StringRef Category) const { - return inSectionCategory("src", M.getModuleIdentifier(), Category); -} +SpecialCaseList::~SpecialCaseList() {} -bool SpecialCaseList::inSectionCategory(const StringRef Section, - const StringRef Query, - const StringRef Category) const { +bool SpecialCaseList::inSection(const StringRef Section, const StringRef Query, + const StringRef Category) const { StringMap<StringMap<Entry> >::const_iterator I = Entries.find(Section); if (I == Entries.end()) return false; StringMap<Entry>::const_iterator II = I->second.find(Category); diff --git a/lib/Support/StringMap.cpp b/lib/Support/StringMap.cpp index 72a6d822d2b6..ddb73494ff5d 100644 --- a/lib/Support/StringMap.cpp +++ b/lib/Support/StringMap.cpp @@ -181,7 +181,7 @@ StringMapEntryBase *StringMapImpl::RemoveKey(StringRef Key) { /// RehashTable - Grow the table, redistributing values into the buckets with /// the appropriate mod-of-hashtable-size. -void StringMapImpl::RehashTable() { +unsigned StringMapImpl::RehashTable(unsigned BucketNo) { unsigned NewSize; unsigned *HashTable = (unsigned *)(TheTable + NumBuckets + 1); @@ -193,9 +193,10 @@ } else if (NumBuckets-(NumItems+NumTombstones) <= NumBuckets/8) { NewSize = NumBuckets; } else { - return; + return BucketNo; } + unsigned NewBucketNo = BucketNo; // Allocate one extra bucket which will always be non-empty. This allows the // iterators to stop at end. StringMapEntryBase **NewTableArray = @@ -215,6 +216,8 @@ if (!NewTableArray[NewBucket]) { NewTableArray[FullHash & (NewSize-1)] = Bucket; NewHashArray[FullHash & (NewSize-1)] = FullHash; + if (I == BucketNo) + NewBucketNo = NewBucket; continue; } @@ -227,6 +230,8 @@ // Finally found a slot. Fill it in. 
NewTableArray[NewBucket] = Bucket; NewHashArray[NewBucket] = FullHash; + if (I == BucketNo) + NewBucketNo = NewBucket; } } @@ -235,4 +240,5 @@ void StringMapImpl::RehashTable() { TheTable = NewTableArray; NumBuckets = NewSize; NumTombstones = 0; + return NewBucketNo; } diff --git a/lib/Support/TargetRegistry.cpp b/lib/Support/TargetRegistry.cpp index a008831eb6be..f6918835f74b 100644 --- a/lib/Support/TargetRegistry.cpp +++ b/lib/Support/TargetRegistry.cpp @@ -116,17 +116,6 @@ void TargetRegistry::RegisterTarget(Target &T, T.HasJIT = HasJIT; } -const Target *TargetRegistry::getClosestTargetForJIT(std::string &Error) { - const Target *TheTarget = lookupTarget(sys::getDefaultTargetTriple(), Error); - - if (TheTarget && !TheTarget->hasJIT()) { - Error = "No JIT compatible target available for this host"; - return nullptr; - } - - return TheTarget; -} - static int TargetArraySortFn(const std::pair<StringRef, const Target *> *LHS, const std::pair<StringRef, const Target *> *RHS) { return LHS->first.compare(RHS->first); diff --git a/lib/Support/Threading.cpp b/lib/Support/Threading.cpp index 1acfa79b11d5..ca7f3f64aa37 100644 --- a/lib/Support/Threading.cpp +++ b/lib/Support/Threading.cpp @@ -7,7 +7,8 @@ // //===----------------------------------------------------------------------===// // -// This file implements llvm_start_multithreaded() and friends. +// This file defines helper functions for running LLVM in a multi-threaded +// environment. // //===----------------------------------------------------------------------===// @@ -19,50 +20,14 @@ using namespace llvm; -static bool multithreaded_mode = false; - -static sys::Mutex* global_lock = nullptr; - -bool llvm::llvm_start_multithreaded() { +bool llvm::llvm_is_multithreaded() { #if LLVM_ENABLE_THREADS != 0 - assert(!multithreaded_mode && "Already multithreaded!"); - multithreaded_mode = true; - global_lock = new sys::Mutex(true); - - // We fence here to ensure that all initialization is complete BEFORE we - // return from llvm_start_multithreaded(). - sys::MemoryFence(); return true; #else return false; #endif } -void llvm::llvm_stop_multithreaded() { -#if LLVM_ENABLE_THREADS != 0 - assert(multithreaded_mode && "Not currently multithreaded!"); - - // We fence here to insure that all threaded operations are complete BEFORE we - // return from llvm_stop_multithreaded(). 
- sys::MemoryFence(); - - multithreaded_mode = false; - delete global_lock; -#endif -} - -bool llvm::llvm_is_multithreaded() { - return multithreaded_mode; -} - -void llvm::llvm_acquire_global_lock() { - if (multithreaded_mode) global_lock->acquire(); -} - -void llvm::llvm_release_global_lock() { - if (multithreaded_mode) global_lock->release(); -} - #if LLVM_ENABLE_THREADS != 0 && defined(HAVE_PTHREAD_H) #include <pthread.h> diff --git a/lib/Support/TimeValue.cpp b/lib/Support/TimeValue.cpp index bd8af174bcd0..4a7079794204 100644 --- a/lib/Support/TimeValue.cpp +++ b/lib/Support/TimeValue.cpp @@ -53,7 +53,7 @@ TimeValue::normalize( void ) { } -/// Include the platform specific portion of TimeValue class +/// Include the platform-specific portion of TimeValue class #ifdef LLVM_ON_UNIX #include "Unix/TimeValue.inc" #endif diff --git a/lib/Support/Timer.cpp b/lib/Support/Timer.cpp index 61465ae5e8be..210bda754e74 100644 --- a/lib/Support/Timer.cpp +++ b/lib/Support/Timer.cpp @@ -19,6 +19,7 @@ #include "llvm/Support/Format.h" #include "llvm/Support/ManagedStatic.h" #include "llvm/Support/Mutex.h" +#include "llvm/Support/MutexGuard.h" #include "llvm/Support/Process.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -84,14 +85,13 @@ static TimerGroup *getDefaultTimerGroup() { sys::MemoryFence(); if (tmp) return tmp; - llvm_acquire_global_lock(); + sys::SmartScopedLock<true> Lock(*TimerLock); tmp = DefaultTimerGroup; if (!tmp) { tmp = new TimerGroup("Miscellaneous Ungrouped Timers"); sys::MemoryFence(); DefaultTimerGroup = tmp; } - llvm_release_global_lock(); return tmp; } diff --git a/lib/Support/Triple.cpp b/lib/Support/Triple.cpp index b3d48fb537af..714d9e8f2964 100644 --- a/lib/Support/Triple.cpp +++ b/lib/Support/Triple.cpp @@ -50,6 +50,7 @@ const char *Triple::getArchTypeName(ArchType Kind) { case amdil: return "amdil"; case spir: return "spir"; case spir64: return "spir64"; + case kalimba: return "kalimba"; } llvm_unreachable("Invalid ArchType!"); @@ -60,6 +61,8 @@ const char *Triple::getArchTypePrefix(ArchType Kind) { default: return nullptr; + case arm64: + case arm64_be: case aarch64: case aarch64_be: return "aarch64"; @@ -68,9 +71,6 @@ case thumb: case thumbeb: return "arm"; - case arm64: - case arm64_be: return "arm64"; - case ppc64: case ppc64le: case ppc: return "ppc"; @@ -101,6 +101,7 @@ case amdil: return "amdil"; case spir: return "spir"; case spir64: return "spir"; + case kalimba: return "kalimba"; } } @@ -115,7 +116,10 @@ const char *Triple::getVendorTypeName(VendorType Kind) { case BGQ: return "bgq"; case Freescale: return "fsl"; case IBM: return "ibm"; + case ImaginationTechnologies: return "img"; + case MipsTechnologies: return "mti"; case NVIDIA: return "nvidia"; + case CSR: return "csr"; } llvm_unreachable("Invalid VendorType!"); @@ -207,6 +211,7 @@ Triple::ArchType Triple::getArchTypeForLLVMName(StringRef Name) { .Case("amdil", amdil) .Case("spir", spir) .Case("spir64", spir64) + .Case("kalimba", kalimba) .Default(UnknownArch); } @@ -280,6 +285,7 @@ static Triple::ArchType parseArch(StringRef ArchName) { .Case("amdil", Triple::amdil) .Case("spir", Triple::spir) .Case("spir64", Triple::spir64) + .Case("kalimba", Triple::kalimba) .Default(Triple::UnknownArch); } @@ -292,7 +298,10 @@ static Triple::VendorType parseVendor(StringRef VendorName) { .Case("bgq", Triple::BGQ) .Case("fsl", Triple::Freescale) .Case("ibm", Triple::IBM) + .Case("img", 
Triple::ImaginationTechnologies) + .Case("mti", Triple::MipsTechnologies) .Case("nvidia", Triple::NVIDIA) + .Case("csr", Triple::CSR) .Default(Triple::UnknownVendor); } @@ -350,6 +359,28 @@ static Triple::ObjectFormatType parseFormat(StringRef EnvironmentName) { .Default(Triple::UnknownObjectFormat); } +static Triple::SubArchType parseSubArch(StringRef SubArchName) { + return StringSwitch<Triple::SubArchType>(SubArchName) + .EndsWith("v8", Triple::ARMSubArch_v8) + .EndsWith("v8a", Triple::ARMSubArch_v8) + .EndsWith("v7", Triple::ARMSubArch_v7) + .EndsWith("v7a", Triple::ARMSubArch_v7) + .EndsWith("v7em", Triple::ARMSubArch_v7em) + .EndsWith("v7l", Triple::ARMSubArch_v7) + .EndsWith("v7m", Triple::ARMSubArch_v7m) + .EndsWith("v7r", Triple::ARMSubArch_v7) + .EndsWith("v7s", Triple::ARMSubArch_v7s) + .EndsWith("v6", Triple::ARMSubArch_v6) + .EndsWith("v6m", Triple::ARMSubArch_v6m) + .EndsWith("v6t2", Triple::ARMSubArch_v6t2) + .EndsWith("v5", Triple::ARMSubArch_v5) + .EndsWith("v5e", Triple::ARMSubArch_v5) + .EndsWith("v5t", Triple::ARMSubArch_v5) + .EndsWith("v5te", Triple::ARMSubArch_v5te) + .EndsWith("v4t", Triple::ARMSubArch_v4t) + .Default(Triple::NoSubArch); +} + static const char *getObjectFormatTypeName(Triple::ObjectFormatType Kind) { switch (Kind) { case Triple::UnknownObjectFormat: return ""; @@ -375,6 +406,7 @@ static Triple::ObjectFormatType getDefaultFormat(const Triple &T) { Triple::Triple(const Twine &Str) : Data(Str.str()), Arch(parseArch(getArchName())), + SubArch(parseSubArch(getArchName())), Vendor(parseVendor(getVendorName())), OS(parseOS(getOSName())), Environment(parseEnvironment(getEnvironmentName())), @@ -392,6 +424,7 @@ Triple::Triple(const Twine &Str) Triple::Triple(const Twine &ArchStr, const Twine &VendorStr, const Twine &OSStr) : Data((ArchStr + Twine('-') + VendorStr + Twine('-') + OSStr).str()), Arch(parseArch(ArchStr.str())), + SubArch(parseSubArch(ArchStr.str())), Vendor(parseVendor(VendorStr.str())), OS(parseOS(OSStr.str())), Environment(), ObjectFormat(Triple::UnknownObjectFormat) { @@ -408,6 +441,7 @@ Triple::Triple(const Twine &ArchStr, const Twine &VendorStr, const Twine &OSStr, : Data((ArchStr + Twine('-') + VendorStr + Twine('-') + OSStr + Twine('-') + EnvironmentStr).str()), Arch(parseArch(ArchStr.str())), + SubArch(parseSubArch(ArchStr.str())), Vendor(parseVendor(VendorStr.str())), OS(parseOS(OSStr.str())), Environment(parseEnvironment(EnvironmentStr.str())), @@ -737,9 +771,8 @@ void Triple::setObjectFormat(ObjectFormatType Kind) { if (Environment == UnknownEnvironment) return setEnvironmentName(getObjectFormatTypeName(Kind)); - Twine Env = getEnvironmentTypeName(Environment) + Twine("-") + - getObjectFormatTypeName(Kind); - setEnvironmentName(Env.str()); + setEnvironmentName((getEnvironmentTypeName(Environment) + Twine("-") + + getObjectFormatTypeName(Kind)).str()); } void Triple::setArchName(StringRef Str) { @@ -799,6 +832,7 @@ static unsigned getArchPointerBitWidth(llvm::Triple::ArchType Arch) { case llvm::Triple::x86: case llvm::Triple::xcore: case llvm::Triple::spir: + case llvm::Triple::kalimba: return 32; case llvm::Triple::arm64: @@ -850,6 +884,7 @@ Triple Triple::get32BitArchVariant() const { case Triple::arm: case Triple::armeb: case Triple::hexagon: + case Triple::kalimba: case Triple::le32: case Triple::mips: case Triple::mipsel: @@ -884,6 +919,7 @@ Triple Triple::get64BitArchVariant() const { case Triple::arm: case Triple::armeb: case Triple::hexagon: + case Triple::kalimba: case Triple::le32: case Triple::msp430: case Triple::r600: @@ -920,3 +956,85 @@ 
Triple Triple::get64BitArchVariant() const { } return T; } + +// FIXME: tblgen this. +const char *Triple::getARMCPUForArch(StringRef MArch) const { + if (MArch.empty()) + MArch = getArchName(); + + switch (getOS()) { + case llvm::Triple::NetBSD: + if (MArch == "armv6") + return "arm1176jzf-s"; + break; + case llvm::Triple::Win32: + // FIXME: this is invalid for WindowsCE + return "cortex-a9"; + default: + break; + } + + const char *result = nullptr; + size_t offset = StringRef::npos; + if (MArch.startswith("arm")) + offset = 3; + if (MArch.startswith("thumb")) + offset = 5; + if (offset != StringRef::npos && MArch.substr(offset, 2) == "eb") + offset += 2; + if (offset != StringRef::npos) + result = llvm::StringSwitch<const char *>(MArch.substr(offset)) + .Cases("v2", "v2a", "arm2") + .Case("v3", "arm6") + .Case("v3m", "arm7m") + .Case("v4", "strongarm") + .Case("v4t", "arm7tdmi") + .Cases("v5", "v5t", "arm10tdmi") + .Cases("v5e", "v5te", "arm1022e") + .Case("v5tej", "arm926ej-s") + .Cases("v6", "v6k", "arm1136jf-s") + .Case("v6j", "arm1136j-s") + .Cases("v6z", "v6zk", "arm1176jzf-s") + .Case("v6t2", "arm1156t2-s") + .Cases("v6m", "v6-m", "cortex-m0") + .Cases("v7", "v7a", "v7-a", "v7l", "v7-l", "cortex-a8") + .Cases("v7s", "v7-s", "swift") + .Cases("v7r", "v7-r", "cortex-r4") + .Cases("v7m", "v7-m", "cortex-m3") + .Cases("v7em", "v7e-m", "cortex-m4") + .Cases("v8", "v8a", "v8-a", "cortex-a53") + .Default(nullptr); + else + result = llvm::StringSwitch<const char *>(MArch) + .Case("ep9312", "ep9312") + .Case("iwmmxt", "iwmmxt") + .Case("xscale", "xscale") + .Default(nullptr); + + if (result) + return result; + + // If all else failed, return the most base CPU with thumb interworking + // supported by LLVM. + // FIXME: Should warn once that we're falling back. + switch (getOS()) { + case llvm::Triple::NetBSD: + switch (getEnvironment()) { + case llvm::Triple::GNUEABIHF: + case llvm::Triple::GNUEABI: + case llvm::Triple::EABIHF: + case llvm::Triple::EABI: + return "arm926ej-s"; + default: + return "strongarm"; + } + default: + switch (getEnvironment()) { + case llvm::Triple::EABIHF: + case llvm::Triple::GNUEABIHF: + return "arm1176jzf-s"; + default: + return "arm7tdmi"; + } + } +} diff --git a/lib/Support/Unix/Path.inc b/lib/Support/Unix/Path.inc index c9fae4298689..623547a95ed5 100644 --- a/lib/Support/Unix/Path.inc +++ b/lib/Support/Unix/Path.inc @@ -468,7 +468,7 @@ std::error_code setLastModificationAndAccessTime(int FD, TimeValue Time) { return std::error_code(); #else #warning Missing futimes() and futimens() - return make_error_code(errc::not_supported); + return make_error_code(errc::function_not_supported); #endif } diff --git a/lib/Support/Unix/Program.inc b/lib/Support/Unix/Program.inc index 50f973a850b6..06a33cd7790f 100644 --- a/lib/Support/Unix/Program.inc +++ b/lib/Support/Unix/Program.inc @@ -350,7 +350,11 @@ ProcessInfo sys::Wait(const ProcessInfo &PI, unsigned SecondsToWait, // Parent process: Wait for the child process to terminate. int status; ProcessInfo WaitResult; - WaitResult.Pid = waitpid(ChildPid, &status, WaitPidOptions); + + do { + WaitResult.Pid = waitpid(ChildPid, &status, WaitPidOptions); + } while (WaitUntilTerminates && WaitResult.Pid == -1 && errno == EINTR); + if (WaitResult.Pid != PI.Pid) { if (WaitResult.Pid == 0) { // Non-blocking wait. 
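The waitpid() hunk just above adopts a standard retry idiom: a blocking wait can be interrupted by a signal before the child exits, in which case waitpid() returns -1 with errno == EINTR, and retrying preserves the wait-until-termination semantics. A self-contained POSIX sketch of the same idiom (the helper name is ours, not LLVM's):

#include <cerrno>
#include <sys/types.h>
#include <sys/wait.h>

// Block until the child terminates, retrying if a signal interrupts the wait.
pid_t waitForChild(pid_t ChildPid, int &Status) {
  pid_t Result;
  do {
    Result = waitpid(ChildPid, &Status, 0); // 0 = block until termination
  } while (Result == -1 && errno == EINTR);
  return Result; // ChildPid on success, -1 on a real error.
}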
diff --git a/lib/Support/Windows/Process.inc b/lib/Support/Windows/Process.inc index 81aee0e1b6ac..9eeca62d9d66 100644 --- a/lib/Support/Windows/Process.inc +++ b/lib/Support/Windows/Process.inc @@ -183,37 +183,89 @@ static std::error_code windows_error(DWORD E) { return mapWindowsError(E); } +static void AllocateAndPush(const SmallVectorImpl<char> &S, + SmallVectorImpl<const char *> &Vector, + SpecificBumpPtrAllocator<char> &Allocator) { + char *Buffer = Allocator.Allocate(S.size() + 1); + ::memcpy(Buffer, S.data(), S.size()); + Buffer[S.size()] = '\0'; + Vector.push_back(Buffer); +} + +/// Convert Arg from UTF-16 to UTF-8 and push it onto Args. +static std::error_code +ConvertAndPushArg(const wchar_t *Arg, SmallVectorImpl<const char *> &Args, + SpecificBumpPtrAllocator<char> &Allocator) { + SmallVector<char, MAX_PATH> ArgString; + if (std::error_code ec = windows::UTF16ToUTF8(Arg, wcslen(Arg), ArgString)) + return ec; + AllocateAndPush(ArgString, Args, Allocator); + return std::error_code(); +} + +/// \brief Perform wildcard expansion of Arg, or just push it into Args if it +/// doesn't have wildcards or doesn't match any files. +static std::error_code +WildcardExpand(const wchar_t *Arg, SmallVectorImpl<const char *> &Args, + SpecificBumpPtrAllocator<char> &Allocator) { + if (!wcspbrk(Arg, L"*?")) { + // Arg does not contain any wildcard characters. This is the common case. + return ConvertAndPushArg(Arg, Args, Allocator); + } + + // Extract any directory part of the argument. + SmallVector<char, MAX_PATH> Dir; + if (std::error_code ec = windows::UTF16ToUTF8(Arg, wcslen(Arg), Dir)) + return ec; + sys::path::remove_filename(Dir); + const int DirSize = Dir.size(); + + // Search for matching files. + WIN32_FIND_DATAW FileData; + HANDLE FindHandle = FindFirstFileW(Arg, &FileData); + if (FindHandle == INVALID_HANDLE_VALUE) { + return ConvertAndPushArg(Arg, Args, Allocator); + } + + std::error_code ec; + do { + SmallVector<char, MAX_PATH> FileName; + ec = windows::UTF16ToUTF8(FileData.cFileName, wcslen(FileData.cFileName), + FileName); + if (ec) + break; + + // Push the filename onto Dir, and remove it afterwards. 
+ llvm::sys::path::append(Dir, StringRef(FileName.data(), FileName.size())); + AllocateAndPush(Dir, Args, Allocator); + Dir.resize(DirSize); + } while (FindNextFileW(FindHandle, &FileData)); + + FindClose(FindHandle); + return ec; +} + std::error_code Process::GetArgumentVector(SmallVectorImpl<const char *> &Args, ArrayRef<const char *>, SpecificBumpPtrAllocator<char> &ArgAllocator) { - int NewArgCount; - std::error_code ec; - - wchar_t **UnicodeCommandLine = CommandLineToArgvW(GetCommandLineW(), - &NewArgCount); + int ArgCount; + wchar_t **UnicodeCommandLine = + CommandLineToArgvW(GetCommandLineW(), &ArgCount); if (!UnicodeCommandLine) return windows_error(::GetLastError()); - Args.reserve(NewArgCount); + Args.reserve(ArgCount); + std::error_code ec; - for (int i = 0; i < NewArgCount; ++i) { - SmallVector<char, MAX_PATH> NewArgString; - ec = windows::UTF16ToUTF8(UnicodeCommandLine[i], - wcslen(UnicodeCommandLine[i]), - NewArgString); + for (int i = 0; i < ArgCount; ++i) { + ec = WildcardExpand(UnicodeCommandLine[i], Args, ArgAllocator); if (ec) break; - - char *Buffer = ArgAllocator.Allocate(NewArgString.size() + 1); - ::memcpy(Buffer, NewArgString.data(), NewArgString.size() + 1); - Args.push_back(Buffer); } - LocalFree(UnicodeCommandLine); - if (ec) - return ec; - return std::error_code(); + LocalFree(UnicodeCommandLine); + return ec; } bool Process::StandardInIsUserInput() { diff --git a/lib/Support/Windows/ThreadLocal.inc b/lib/Support/Windows/ThreadLocal.inc index 3914cf72fa95..14ce61933cba 100644 --- a/lib/Support/Windows/ThreadLocal.inc +++ b/lib/Support/Windows/ThreadLocal.inc @@ -23,7 +23,7 @@ namespace llvm { using namespace sys; ThreadLocalImpl::ThreadLocalImpl() : data() { - typedef int SIZE_TOO_BIG[sizeof(DWORD) <= sizeof(data) ? 1 : -1]; + static_assert(sizeof(DWORD) <= sizeof(data), "size too big"); DWORD* tls = reinterpret_cast<DWORD*>(&data); *tls = TlsAlloc(); assert(*tls != TLS_OUT_OF_INDEXES); diff --git a/lib/Support/raw_ostream.cpp b/lib/Support/raw_ostream.cpp index f7c213ac2b85..0790be5305e2 100644 --- a/lib/Support/raw_ostream.cpp +++ b/lib/Support/raw_ostream.cpp @@ -660,7 +660,7 @@ bool raw_fd_ostream::has_colors() const { /// Use it like: outs() << "foo" << "bar"; raw_ostream &llvm::outs() { // Set buffer settings to model stdout behavior. - // Delete the file descriptor when the program exists, forcing error + // Delete the file descriptor when the program exits, forcing error // detection. If you don't want this behavior, don't use outs(). static raw_fd_ostream S(STDOUT_FILENO, true); return S; } @@ -729,24 +729,17 @@ void raw_svector_ostream::resync() { } void raw_svector_ostream::write_impl(const char *Ptr, size_t Size) { - // If we're writing bytes from the end of the buffer into the smallvector, we - // don't need to copy the bytes, just commit the bytes because they are - // already in the right place. if (Ptr == OS.end()) { - assert(OS.size() + Size <= OS.capacity() && "Invalid write_impl() call!"); - OS.set_size(OS.size() + Size); + // Grow the buffer to include the scratch area without copying. + size_t NewSize = OS.size() + Size; + assert(NewSize <= OS.capacity() && "Invalid write_impl() call!"); + OS.set_size(NewSize); } else { - assert(GetNumBytesInBuffer() == 0 && - "Should be writing from buffer if some bytes in it"); - // Otherwise, do copy the bytes. - OS.append(Ptr, Ptr+Size); + assert(!GetNumBytesInBuffer()); + OS.append(Ptr, Ptr + Size); } - // Grow the vector if necessary. - if (OS.capacity() - OS.size() < 64) - OS.reserve(OS.capacity() * 2); - - // Update the buffer position. 
+ OS.reserve(OS.size() + 64); SetBuffer(OS.end(), OS.capacity() - OS.size()); } diff --git a/lib/TableGen/Main.cpp b/lib/TableGen/Main.cpp index 191307a66bc2..e317fbfa373d 100644 --- a/lib/TableGen/Main.cpp +++ b/lib/TableGen/Main.cpp @@ -81,13 +81,14 @@ int TableGenMain(char *argv0, TableGenMainFn *MainFn) { RecordKeeper Records; // Parse the input file. - std::unique_ptr<MemoryBuffer> File; - if (std::error_code ec = MemoryBuffer::getFileOrSTDIN(InputFilename, File)) { - errs() << "Could not open input file '" << InputFilename << "': " - << ec.message() <<"\n"; + ErrorOr<std::unique_ptr<MemoryBuffer>> FileOrErr = + MemoryBuffer::getFileOrSTDIN(InputFilename); + if (std::error_code EC = FileOrErr.getError()) { + errs() << "Could not open input file '" << InputFilename + << "': " << EC.message() << "\n"; return 1; } - MemoryBuffer *F = File.release(); + MemoryBuffer *F = FileOrErr.get().release(); // Tell SrcMgr about this buffer, which is what TGParser will pick up. SrcMgr.AddNewSourceBuffer(F, SMLoc()); diff --git a/lib/TableGen/Record.cpp b/lib/TableGen/Record.cpp index c553a21c261e..0f40904ae919 100644 --- a/lib/TableGen/Record.cpp +++ b/lib/TableGen/Record.cpp @@ -811,20 +811,14 @@ Init *UnOpInit::Fold(Record *CurRec, MultiClass *CurMultiClass) const { } case HEAD: { if (ListInit *LHSl = dyn_cast<ListInit>(LHS)) { - if (LHSl->getSize() == 0) { - assert(0 && "Empty list in car"); - return nullptr; - } + assert(LHSl->getSize() != 0 && "Empty list in car"); return LHSl->getElement(0); } break; } case TAIL: { if (ListInit *LHSl = dyn_cast<ListInit>(LHS)) { - if (LHSl->getSize() == 0) { - assert(0 && "Empty list in cdr"); - return nullptr; - } + assert(LHSl->getSize() != 0 && "Empty list in cdr"); // Note the +1. We can't just pass the result of getValues() // directly. ArrayRef<Init *>::iterator begin = LHSl->getValues().begin()+1; @@ -961,8 +955,10 @@ Init *BinOpInit::Fold(Record *CurRec, MultiClass *CurMultiClass) const { case SHL: case SRA: case SRL: { - IntInit *LHSi = dyn_cast<IntInit>(LHS); - IntInit *RHSi = dyn_cast<IntInit>(RHS); + IntInit *LHSi = + dyn_cast_or_null<IntInit>(LHS->convertInitializerTo(IntRecTy::get())); + IntInit *RHSi = + dyn_cast_or_null<IntInit>(RHS->convertInitializerTo(IntRecTy::get())); if (LHSi && RHSi) { int64_t LHSv = LHSi->getValue(), RHSv = RHSi->getValue(); int64_t Result; diff --git a/lib/TableGen/TGLexer.cpp b/lib/TableGen/TGLexer.cpp index 1ec2eea67d94..fc1d3ca3392f 100644 --- a/lib/TableGen/TGLexer.cpp +++ b/lib/TableGen/TGLexer.cpp @@ -27,9 +27,9 @@ using namespace llvm; TGLexer::TGLexer(SourceMgr &SM) : SrcMgr(SM) { - CurBuffer = 0; - CurBuf = SrcMgr.getMemoryBuffer(CurBuffer); - CurPtr = CurBuf->getBufferStart(); + CurBuffer = SrcMgr.getMainFileID(); + CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer(); + CurPtr = CurBuf.begin(); TokStart = nullptr; } @@ -52,7 +52,7 @@ int TGLexer::getNextChar() { case 0: { // A nul character in the stream is either the end of the current buffer or // a random nul in the file. Disambiguate that here. - if (CurPtr-1 != CurBuf->getBufferEnd()) + if (CurPtr-1 != CurBuf.end()) return 0; // Just whitespace. 
// If this is the end of an included file, pop the parent file off the @@ -60,7 +60,7 @@ SMLoc ParentIncludeLoc = SrcMgr.getParentIncludeLoc(CurBuffer); if (ParentIncludeLoc != SMLoc()) { CurBuffer = SrcMgr.FindBufferContainingLoc(ParentIncludeLoc); - CurBuf = SrcMgr.getMemoryBuffer(CurBuffer); + CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer(); CurPtr = ParentIncludeLoc.getPointer(); return getNextChar(); } @@ -187,7 +187,7 @@ tgtok::TokKind TGLexer::LexString() { while (*CurPtr != '"') { // If we hit the end of the buffer, report an error. - if (*CurPtr == 0 && CurPtr == CurBuf->getBufferEnd()) + if (*CurPtr == 0 && CurPtr == CurBuf.end()) return ReturnError(StrStart, "End of file in string literal"); if (*CurPtr == '\n' || *CurPtr == '\r') @@ -220,7 +220,7 @@ // If we hit the end of the buffer, report an error. case '\0': - if (CurPtr == CurBuf->getBufferEnd()) + if (CurPtr == CurBuf.end()) return ReturnError(StrStart, "End of file in string literal"); // FALL THROUGH default: @@ -304,7 +304,7 @@ bool TGLexer::LexInclude() { CurBuffer = SrcMgr.AddIncludeFile(Filename, SMLoc::getFromPointer(CurPtr), IncludedFile); - if (CurBuffer == -1) { + if (!CurBuffer) { PrintError(getLoc(), "Could not find include file '" + Filename + "'"); return true; } @@ -319,8 +319,8 @@ } Dependencies.insert(std::make_pair(IncludedFile, getLoc())); // Save the line number and lex buffer of the includer. - CurBuf = SrcMgr.getMemoryBuffer(CurBuffer); - CurPtr = CurBuf->getBufferStart(); + CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer(); + CurPtr = CurBuf.begin(); return false; } @@ -333,7 +333,7 @@ void TGLexer::SkipBCPLComment() { return; // Newline is end of comment. case 0: // If this is the end of the buffer, end the comment. - if (CurPtr == CurBuf->getBufferEnd()) + if (CurPtr == CurBuf.end()) return; break; } diff --git a/lib/TableGen/TGLexer.h b/lib/TableGen/TGLexer.h index 1e599f84b067..a2c95ca833df 100644 --- a/lib/TableGen/TGLexer.h +++ b/lib/TableGen/TGLexer.h @@ -14,6 +14,7 @@ #ifndef TGLEXER_H #define TGLEXER_H +#include "llvm/ADT/StringRef.h" #include "llvm/Support/DataTypes.h" #include "llvm/Support/SMLoc.h" #include <cassert> @@ -21,7 +22,6 @@ #include <string> namespace llvm { -class MemoryBuffer; class SourceMgr; class SMLoc; class Twine; @@ -63,7 +63,7 @@ class TGLexer { SourceMgr &SrcMgr; const char *CurPtr; - const MemoryBuffer *CurBuf; + StringRef CurBuf; // Information about the current token. const char *TokStart; @@ -73,7 +73,7 @@ /// CurBuffer - This is the current buffer index we're lexing from as managed /// by the SourceMgr object. - int CurBuffer; + unsigned CurBuffer; public: typedef std::map<std::string, SMLoc> DependenciesMapTy; diff --git a/lib/Target/AArch64/AArch64AddressTypePromotion.cpp b/lib/Target/AArch64/AArch64AddressTypePromotion.cpp index 04906f6078f8..ab2c4b73e67c 100644 --- a/lib/Target/AArch64/AArch64AddressTypePromotion.cpp +++ b/lib/Target/AArch64/AArch64AddressTypePromotion.cpp @@ -214,8 +214,8 @@ AArch64AddressTypePromotion::shouldConsiderSExt(const Instruction *SExt) const { if (SExt->getType() != ConsideredSExtType) return false; - for (const Use &U : SExt->uses()) { - if (isa<GetElementPtrInst>(*U)) + for (const User *U : SExt->users()) { + if (isa<GetElementPtrInst>(U)) return true; } @@ -267,8 +267,7 @@ AArch64AddressTypePromotion::propagateSignExtension(Instructions &SExtInsts) { } // Now try to get through the chain of definitions. 
- while (isa<Instruction>(SExt->getOperand(0))) { - Instruction *Inst = dyn_cast<Instruction>(SExt->getOperand(0)); + while (auto *Inst = dyn_cast<Instruction>(SExt->getOperand(0))) { DEBUG(dbgs() << "Try to get through:\n" << *Inst << '\n'); if (!canGetThrough(Inst) || !shouldGetThrough(Inst)) { // We cannot get through something that is not an Instruction @@ -285,10 +284,10 @@ // assertion on the type as all involved sext operation may have not // been moved yet. while (!Inst->use_empty()) { - Value::use_iterator UseIt = Inst->use_begin(); - Instruction *UseInst = dyn_cast<Instruction>(*UseIt); - assert(UseInst && "Use of sext is not an Instruction!"); - UseInst->setOperand(UseIt->getOperandNo(), SExt); + Use &U = *Inst->use_begin(); + Instruction *User = dyn_cast<Instruction>(U.getUser()); + assert(User && "User of sext is not an Instruction!"); + User->setOperand(U.getOperandNo(), SExt); } ToRemove.insert(Inst); SExt->setOperand(0, Inst->getOperand(0)); @@ -385,11 +384,11 @@ void AArch64AddressTypePromotion::mergeSExts(ValueToInsts &ValToSExtendedUses, if (ToRemove.count(Inst)) continue; bool inserted = false; - for (auto Pt : CurPts) { + for (auto &Pt : CurPts) { if (DT.dominates(Inst, Pt)) { DEBUG(dbgs() << "Replace all uses of:\n" << *Pt << "\nwith:\n" << *Inst << '\n'); - (Pt)->replaceAllUsesWith(Inst); + Pt->replaceAllUsesWith(Inst); ToRemove.insert(Pt); Pt = Inst; inserted = true; @@ -436,7 +435,7 @@ void AArch64AddressTypePromotion::analyzeSExtension(Instructions &SExtInsts) { bool insert = false; // #1. - for (const Use &U : SExt->uses()) { + for (const User *U : SExt->users()) { const Instruction *Inst = dyn_cast<Instruction>(U); if (Inst && Inst->getNumOperands() > 2) { DEBUG(dbgs() << "Interesting use in GetElementPtrInst\n" << *Inst diff --git a/lib/Target/AArch64/AArch64CallingConvention.td b/lib/Target/AArch64/AArch64CallingConvention.td index 8e8bd3d0bcd8..1fe5138b529d 100644 --- a/lib/Target/AArch64/AArch64CallingConvention.td +++ b/lib/Target/AArch64/AArch64CallingConvention.td @@ -54,6 +54,8 @@ def CC_AArch64_AAPCS : CallingConv<[ CCIfType<[i64], CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6, X7], [W0, W1, W2, W3, W4, W5, W6, W7]>>, + CCIfType<[f16], CCAssignToRegWithShadow<[H0, H1, H2, H3, H4, H5, H6, H7], + [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7], [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], @@ -65,7 +67,7 @@ def CC_AArch64_AAPCS : CallingConv<[ CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, // If more than will fit in registers, pass them on the stack instead. 
- CCIfType<[i1, i8, i16], CCAssignToStack<8, 8>>, + CCIfType<[i1, i8, i16, f16], CCAssignToStack<8, 8>>, CCIfType<[i32, f32], CCAssignToStack<8, 8>>, CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8], CCAssignToStack<8, 8>>, @@ -88,6 +90,8 @@ def RetCC_AArch64_AAPCS : CallingConv<[ [X0, X1, X2, X3, X4, X5, X6, X7]>>, CCIfType<[i64], CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6, X7], [W0, W1, W2, W3, W4, W5, W6, W7]>>, + CCIfType<[f16], CCAssignToRegWithShadow<[H0, H1, H2, H3, H4, H5, H6, H7], + [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7], [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], @@ -129,6 +133,8 @@ def CC_AArch64_DarwinPCS : CallingConv<[ CCIfType<[i64], CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6, X7], [W0, W1, W2, W3, W4, W5, W6, W7]>>, + CCIfType<[f16], CCAssignToRegWithShadow<[H0, H1, H2, H3, H4, H5, H6, H7], + [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7], [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], @@ -141,7 +147,7 @@ def CC_AArch64_DarwinPCS : CallingConv<[ // If more than will fit in registers, pass them on the stack instead. CCIf<"ValVT == MVT::i1 || ValVT == MVT::i8", CCAssignToStack<1, 1>>, - CCIf<"ValVT == MVT::i16", CCAssignToStack<2, 2>>, + CCIf<"ValVT == MVT::i16 || ValVT == MVT::f16", CCAssignToStack<2, 2>>, CCIfType<[i32, f32], CCAssignToStack<4, 4>>, CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8], CCAssignToStack<8, 8>>, @@ -154,7 +160,7 @@ def CC_AArch64_DarwinPCS_VarArg : CallingConv<[ // Handle all scalar types as either i64 or f64. CCIfType<[i8, i16, i32], CCPromoteToType<i64>>, - CCIfType<[f32], CCPromoteToType<f64>>, + CCIfType<[f16, f32], CCPromoteToType<f64>>, // Everything is on the stack. // i128 is split to two i64s, and its stack alignment is 16 bytes. diff --git a/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index a76fd76e5ed4..8839085c4a80 100644 --- a/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -634,19 +634,6 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, return true; } - case AArch64::FCVTSHpseudo: { - MachineOperand Src = MI.getOperand(1); - Src.setImplicit(); - unsigned SrcH = - TII->getRegisterInfo().getSubReg(Src.getReg(), AArch64::hsub); - auto MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::FCVTSHr)) - .addOperand(MI.getOperand(0)) - .addReg(SrcH, RegState::Undef) - .addOperand(Src); - transferImpOps(MI, MIB, MIB); - MI.eraseFromParent(); - return true; - } case AArch64::LOADgot: { // Expand into ADRP + LDR. 
unsigned DstReg = MI.getOperand(0).getReg(); diff --git a/lib/Target/AArch64/AArch64FastISel.cpp b/lib/Target/AArch64/AArch64FastISel.cpp index cc2a70ee8a2e..2164d77b7900 100644 --- a/lib/Target/AArch64/AArch64FastISel.cpp +++ b/lib/Target/AArch64/AArch64FastISel.cpp @@ -1750,6 +1750,17 @@ unsigned AArch64FastISel::Emiti1Ext(unsigned SrcReg, MVT DestVT, bool isZExt) { unsigned AArch64FastISel::EmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, bool isZExt) { assert(DestVT != MVT::i1 && "ZeroExt/SignExt an i1?"); + + // FastISel does not have plumbing to deal with extensions where the SrcVT or + // DestVT are odd things, so test to make sure that they are both types we can + // handle (i1/i8/i16/i32 for SrcVT and i8/i16/i32/i64 for DestVT), otherwise + // bail out to SelectionDAG. + if (((DestVT != MVT::i8) && (DestVT != MVT::i16) && + (DestVT != MVT::i32) && (DestVT != MVT::i64)) || + ((SrcVT != MVT::i1) && (SrcVT != MVT::i8) && + (SrcVT != MVT::i16) && (SrcVT != MVT::i32))) + return 0; + unsigned Opc; unsigned Imm = 0; @@ -1896,6 +1907,7 @@ bool AArch64FastISel::SelectMul(const Instruction *I) { case MVT::i32: ZReg = AArch64::WZR; Opc = AArch64::MADDWrrr; + SrcVT = MVT::i32; break; case MVT::i64: ZReg = AArch64::XZR; diff --git a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index 98609760a73a..3f49fabfb58e 100644 --- a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -593,8 +593,9 @@ bool AArch64DAGToDAGISel::SelectAddrModeIndexed(SDValue N, unsigned Size, const GlobalValue *GV = GAN->getGlobal(); unsigned Alignment = GV->getAlignment(); const DataLayout *DL = TLI->getDataLayout(); - if (Alignment == 0 && !Subtarget->isTargetDarwin()) - Alignment = DL->getABITypeAlignment(GV->getType()->getElementType()); + Type *Ty = GV->getType()->getElementType(); + if (Alignment == 0 && Ty->isSized() && !Subtarget->isTargetDarwin()) + Alignment = DL->getABITypeAlignment(Ty); if (Alignment >= Size) return true; diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index e45ca4dbc0d9..4921826034e9 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -305,6 +305,7 @@ AArch64TargetLowering::AArch64TargetLowering(TargetMachine &TM) // AArch64 does not have floating-point extending loads, i1 sign-extending // load, floating-point truncating stores, or v2i32->v2i16 truncating store. + setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::f64, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::f80, Expand); @@ -316,6 +317,10 @@ AArch64TargetLowering::AArch64TargetLowering(TargetMachine &TM) setTruncStoreAction(MVT::f128, MVT::f64, Expand); setTruncStoreAction(MVT::f128, MVT::f32, Expand); setTruncStoreAction(MVT::f128, MVT::f16, Expand); + + setOperationAction(ISD::BITCAST, MVT::i16, Custom); + setOperationAction(ISD::BITCAST, MVT::f16, Custom); + // Indexed loads and stores are supported. for (unsigned im = (unsigned)ISD::PRE_INC; im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { @@ -627,7 +632,7 @@ MVT AArch64TargetLowering::getScalarShiftAmountTy(EVT LHSTy) const { unsigned AArch64TargetLowering::getMaximalGlobalOffset() const { // FIXME: On AArch64, this depends on the type. - // Basically, the addressable offsets are o to 4095 * Ty.getSizeInBytes(). + // Basically, the addressable offsets are up to 4095 * Ty.getSizeInBytes(). 
// and the offset has to be a multiple of the related size in bytes. return 4095; } @@ -823,8 +828,7 @@ AArch64TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, #ifndef NDEBUG MI->dump(); #endif - assert(0 && "Unexpected instruction for custom inserter!"); - break; + llvm_unreachable("Unexpected instruction for custom inserter!"); case AArch64::F128CSEL: return EmitF128CSEL(MI, BB); @@ -833,7 +837,6 @@ AArch64TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case TargetOpcode::PATCHPOINT: return emitPatchPoint(MI, BB); } - llvm_unreachable("Unexpected instruction for custom inserter!"); } //===----------------------------------------------------------------------===// @@ -1505,18 +1508,36 @@ SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op, StructType *RetTy = StructType::get(ArgTy, ArgTy, NULL); TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl).setChain(DAG.getEntryNode()) - .setCallee(CallingConv::Fast, RetTy, Callee, &Args, 0); + .setCallee(CallingConv::Fast, RetTy, Callee, std::move(Args), 0); std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); return CallResult.first; } +static SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) { + if (Op.getValueType() != MVT::f16) + return SDValue(); + + assert(Op.getOperand(0).getValueType() == MVT::i16); + SDLoc DL(Op); + + Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0)); + Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op); + return SDValue( + DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, MVT::f16, Op, + DAG.getTargetConstant(AArch64::hsub, MVT::i32)), + 0); +} + + SDValue AArch64TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { default: llvm_unreachable("unimplemented operand"); return SDValue(); + case ISD::BITCAST: + return LowerBITCAST(Op, DAG); case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); case ISD::GlobalTLSAddress: @@ -1713,6 +1734,8 @@ SDValue AArch64TargetLowering::LowerFormalArguments( RC = &AArch64::GPR32RegClass; else if (RegVT == MVT::i64) RC = &AArch64::GPR64RegClass; + else if (RegVT == MVT::f16) + RC = &AArch64::FPR16RegClass; else if (RegVT == MVT::f32) RC = &AArch64::FPR32RegClass; else if (RegVT == MVT::f64 || RegVT.is64BitVector()) @@ -5183,11 +5206,37 @@ SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op, return Op; } +// Normalize the operands of BUILD_VECTOR. The value of constant operands will +// be truncated to fit element width. 
+static SDValue NormalizeBuildVector(SDValue Op, + SelectionDAG &DAG) { + assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!"); + SDLoc dl(Op); + EVT VT = Op.getValueType(); + EVT EltTy= VT.getVectorElementType(); + + if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16) + return Op; + + SmallVector<SDValue, 16> Ops; + for (unsigned I = 0, E = VT.getVectorNumElements(); I != E; ++I) { + SDValue Lane = Op.getOperand(I); + if (Lane.getOpcode() == ISD::Constant) { + APInt LowBits(EltTy.getSizeInBits(), + cast<ConstantSDNode>(Lane)->getZExtValue()); + Lane = DAG.getConstant(LowBits.getZExtValue(), MVT::i32); + } + Ops.push_back(Lane); + } + return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); +} + SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { - BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode()); SDLoc dl(Op); EVT VT = Op.getValueType(); + Op = NormalizeBuildVector(Op, DAG); + BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode()); APInt CnstBits(VT.getSizeInBits(), 0); APInt UndefBits(VT.getSizeInBits(), 0); @@ -5558,11 +5607,12 @@ SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!"); - // Check for non-constant lane. - if (!isa<ConstantSDNode>(Op.getOperand(2))) + // Check for non-constant or out of range lane. + EVT VT = Op.getOperand(0).getValueType(); + ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(2)); + if (!CI || CI->getZExtValue() >= VT.getVectorNumElements()) return SDValue(); - EVT VT = Op.getOperand(0).getValueType(); // Insertion/extraction are legal for V128 types. if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 || @@ -5590,11 +5640,12 @@ AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!"); - // Check for non-constant lane. - if (!isa<ConstantSDNode>(Op.getOperand(1))) + // Check for non-constant or out of range lane. + EVT VT = Op.getOperand(0).getValueType(); + ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(1)); + if (!CI || CI->getZExtValue() >= VT.getVectorNumElements()) return SDValue(); - EVT VT = Op.getOperand(0).getValueType(); // Insertion/extraction are legal for V128 types. if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 || @@ -6345,32 +6396,105 @@ static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG, if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1))) { APInt Value = C->getAPIntValue(); EVT VT = N->getValueType(0); - APInt VM1 = Value - 1; - if (VM1.isPowerOf2()) { - // Multiplying by one more than a power of two, replace with a shift - // and an add. - SDValue ShiftedVal = - DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0), - DAG.getConstant(VM1.logBase2(), MVT::i64)); - return DAG.getNode(ISD::ADD, SDLoc(N), VT, ShiftedVal, N->getOperand(0)); - } - APInt VP1 = Value + 1; - if (VP1.isPowerOf2()) { - // Multiplying by one less than a power of two, replace with a shift - // and a subtract. 
- SDValue ShiftedVal = - DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0), - DAG.getConstant(VP1.logBase2(), MVT::i64)); - return DAG.getNode(ISD::SUB, SDLoc(N), VT, ShiftedVal, N->getOperand(0)); + if (Value.isNonNegative()) { + // (mul x, 2^N + 1) => (add (shl x, N), x) + APInt VM1 = Value - 1; + if (VM1.isPowerOf2()) { + SDValue ShiftedVal = + DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0), + DAG.getConstant(VM1.logBase2(), MVT::i64)); + return DAG.getNode(ISD::ADD, SDLoc(N), VT, ShiftedVal, + N->getOperand(0)); + } + // (mul x, 2^N - 1) => (sub (shl x, N), x) + APInt VP1 = Value + 1; + if (VP1.isPowerOf2()) { + SDValue ShiftedVal = + DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0), + DAG.getConstant(VP1.logBase2(), MVT::i64)); + return DAG.getNode(ISD::SUB, SDLoc(N), VT, ShiftedVal, + N->getOperand(0)); + } + } else { + // (mul x, -(2^N + 1)) => - (add (shl x, N), x) + APInt VNM1 = -Value - 1; + if (VNM1.isPowerOf2()) { + SDValue ShiftedVal = + DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0), + DAG.getConstant(VNM1.logBase2(), MVT::i64)); + SDValue Add = + DAG.getNode(ISD::ADD, SDLoc(N), VT, ShiftedVal, N->getOperand(0)); + return DAG.getNode(ISD::SUB, SDLoc(N), VT, DAG.getConstant(0, VT), Add); + } + // (mul x, -(2^N - 1)) => (sub x, (shl x, N)) + APInt VNP1 = -Value + 1; + if (VNP1.isPowerOf2()) { + SDValue ShiftedVal = + DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0), + DAG.getConstant(VNP1.logBase2(), MVT::i64)); + return DAG.getNode(ISD::SUB, SDLoc(N), VT, N->getOperand(0), + ShiftedVal); + } + } } return SDValue(); } +static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N, + SelectionDAG &DAG) { + // Take advantage of vector comparisons producing 0 or -1 in each lane to + // optimize away operation when it's from a constant. + // + // The general transformation is: + // UNARYOP(AND(VECTOR_CMP(x,y), constant)) --> + // AND(VECTOR_CMP(x,y), constant2) + // constant2 = UNARYOP(constant) + + // Early exit if this isn't a vector operation or if the operand of the + // unary operation isn't a bitwise AND. + EVT VT = N->getValueType(0); + if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND || + N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC) + return SDValue(); + + // Now check that the other operand of the AND is a constant splat. We could + // make the transformation for non-constant splats as well, but it's unclear + // that would be a benefit as it would not eliminate any operations, just + // perform one more step in scalar code before moving to the vector unit. + if (BuildVectorSDNode *BV = + dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) { + // Bail out if the vector isn't a constant splat. + if (!BV->getConstantSplatNode()) + return SDValue(); + + // Everything checks out. Build up the new and improved node. + SDLoc DL(N); + EVT IntVT = BV->getValueType(0); + // Create a new constant of the appropriate type for the transformed + // DAG. + SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0)); + // The AND node needs bitcasts to/from an integer vector type around it. + SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst); + SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, + N->getOperand(0)->getOperand(0), MaskConst); + SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd); + return Res; + } + + return SDValue(); +} + static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG) { + // First try to optimize away the conversion when it's conditionally from + // a constant. 
Vectors only. + SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG); + if (Res != SDValue()) + return Res; + EVT VT = N->getValueType(0); if (VT != MVT::f32 && VT != MVT::f64) return SDValue(); + // Only optimize when the source and destination types have the same width. if (VT.getSizeInBits() != N->getOperand(0).getValueType().getSizeInBits()) return SDValue(); @@ -7840,11 +7964,32 @@ bool AArch64TargetLowering::getPostIndexedAddressParts( return true; } +static void ReplaceBITCASTResults(SDNode *N, SmallVectorImpl<SDValue> &Results, + SelectionDAG &DAG) { + if (N->getValueType(0) != MVT::i16) + return; + + SDLoc DL(N); + SDValue Op = N->getOperand(0); + assert(Op.getValueType() == MVT::f16 && + "Inconsistent bitcast? Only 16-bit types should be i16 or f16"); + Op = SDValue( + DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f32, + DAG.getUNDEF(MVT::i32), Op, + DAG.getTargetConstant(AArch64::hsub, MVT::i32)), + 0); + Op = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op); + Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op)); +} + void AArch64TargetLowering::ReplaceNodeResults( SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const { switch (N->getOpcode()) { default: llvm_unreachable("Don't know how to custom expand this"); + case ISD::BITCAST: + ReplaceBITCASTResults(N, Results, DAG); + return; case ISD::FP_TO_UINT: case ISD::FP_TO_SINT: assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion"); @@ -7866,6 +8011,18 @@ bool AArch64TargetLowering::shouldExpandAtomicInIR(Instruction *Inst) const { return Inst->getType()->getPrimitiveSizeInBits() <= 128; } +TargetLoweringBase::LegalizeTypeAction +AArch64TargetLowering::getPreferredVectorAction(EVT VT) const { + MVT SVT = VT.getSimpleVT(); + // During type legalization, we prefer to widen v1i8, v1i16, v1i32 to v8i8, + // v4i16, v2i32 instead of to promote. + if (SVT == MVT::v1i8 || SVT == MVT::v1i16 || SVT == MVT::v1i32 + || SVT == MVT::v1f32) + return TypeWidenVector; + + return TargetLoweringBase::getPreferredVectorAction(VT); +} + Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, AtomicOrdering Ord) const { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h index 139217d34402..cb0b9ef261dc 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.h +++ b/lib/Target/AArch64/AArch64ISelLowering.h @@ -324,6 +324,9 @@ class AArch64TargetLowering : public TargetLowering { bool shouldExpandAtomicInIR(Instruction *Inst) const override; + TargetLoweringBase::LegalizeTypeAction + getPreferredVectorAction(EVT VT) const override; + private: /// Subtarget - Keep a pointer to the AArch64Subtarget around so that we can /// make the right decision when generating code for different targets. 
diff --git a/lib/Target/AArch64/AArch64InstrFormats.td b/lib/Target/AArch64/AArch64InstrFormats.td
index 446149b4fb0d..e88c0c038c33 100644
--- a/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/lib/Target/AArch64/AArch64InstrFormats.td
@@ -448,13 +448,19 @@ def logical_imm64_XFORM : SDNodeXForm<imm, [{
   return CurDAG->getTargetConstant(enc, MVT::i32);
 }]>;

-def LogicalImm32Operand : AsmOperandClass {
-  let Name = "LogicalImm32";
-  let DiagnosticType = "LogicalSecondSource";
-}
-def LogicalImm64Operand : AsmOperandClass {
-  let Name = "LogicalImm64";
-  let DiagnosticType = "LogicalSecondSource";
+let DiagnosticType = "LogicalSecondSource" in {
+  def LogicalImm32Operand : AsmOperandClass {
+    let Name = "LogicalImm32";
+  }
+  def LogicalImm64Operand : AsmOperandClass {
+    let Name = "LogicalImm64";
+  }
+  def LogicalImm32NotOperand : AsmOperandClass {
+    let Name = "LogicalImm32Not";
+  }
+  def LogicalImm64NotOperand : AsmOperandClass {
+    let Name = "LogicalImm64Not";
+  }
 }
 def logical_imm32 : Operand<i32>, PatLeaf<(imm), [{
   return AArch64_AM::isLogicalImmediate(N->getZExtValue(), 32);
@@ -468,6 +474,12 @@ def logical_imm64 : Operand<i64>, PatLeaf<(imm), [{
   let PrintMethod = "printLogicalImm64";
   let ParserMatchClass = LogicalImm64Operand;
 }
+def logical_imm32_not : Operand<i32> {
+  let ParserMatchClass = LogicalImm32NotOperand;
+}
+def logical_imm64_not : Operand<i64> {
+  let ParserMatchClass = LogicalImm64NotOperand;
+}

 // imm0_65535 predicate - True if the immediate is in the range [0,65535].
 def Imm0_65535Operand : AsmImmRange<0, 65535>;
@@ -527,6 +539,11 @@ def imm0_7 : Operand<i64>, ImmLeaf<i64, [{

+// imm32_0_15 predicate - True if the 32-bit immediate is in the range [0,15]
+def imm32_0_15 : Operand<i32>, ImmLeaf<i32, [{
+  return ((uint32_t)Imm) < 16;
+}]>;
+
 // An arithmetic shifter operand:
 //  {7-6} - shift type: 00 = lsl, 01 = lsr, 10 = asr
 //  {5-0} - imm6
@@ -764,15 +781,17 @@ def simdimmtype10 : Operand<i32>,

 // Base encoding for system instruction operands.
 let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in
-class BaseSystemI<bit L, dag oops, dag iops, string asm, string operands>
-    : I<oops, iops, asm, operands, "", []> {
+class BaseSystemI<bit L, dag oops, dag iops, string asm, string operands,
+                  list<dag> pattern = []>
+    : I<oops, iops, asm, operands, "", pattern> {
   let Inst{31-22} = 0b1101010100;
   let Inst{21}    = L;
 }

 // System instructions which do not have an Rt register.
-class SimpleSystemI<bit L, dag iops, string asm, string operands>
-    : BaseSystemI<L, (outs), iops, asm, operands> {
+class SimpleSystemI<bit L, dag iops, string asm, string operands,
+                    list<dag> pattern = []>
+    : BaseSystemI<L, (outs), iops, asm, operands, pattern> {
   let Inst{4-0} = 0b11111;
 }

@@ -785,13 +804,17 @@ class RtSystemI<bit L, dag oops, dag iops, string asm, string operands>
 }

 // Hint instructions that take both a CRm and a 3-bit immediate.
-class HintI<string mnemonic>
-    : SimpleSystemI<0, (ins imm0_127:$imm), mnemonic#" $imm", "">,
-      Sched<[WriteHint]> {
-  bits <7> imm;
-  let Inst{20-12} = 0b000110010;
-  let Inst{11-5} = imm;
-}
+// NOTE: ideally, this would have mayStore = 0, mayLoad = 0, but we cannot
+// model patterns with sufficiently fine granularity
+let mayStore = 1, mayLoad = 1, hasSideEffects = 1 in
+  class HintI<string mnemonic>
+      : SimpleSystemI<0, (ins imm0_127:$imm), mnemonic#" $imm", "",
+                      [(int_aarch64_hint imm0_127:$imm)]>,
+        Sched<[WriteHint]> {
+    bits <7> imm;
+    let Inst{20-12} = 0b000110010;
+    let Inst{11-5} = imm;
+  }

 // System instructions taking a single literal operand which encodes into
 // CRm. op2 differentiates the opcodes.
@@ -803,8 +826,9 @@ def barrier_op : Operand<i32> {
   let PrintMethod = "printBarrierOption";
   let ParserMatchClass = BarrierAsmOperand;
 }
-class CRmSystemI<Operand crmtype, bits<3> opc, string asm>
-  : SimpleSystemI<0, (ins crmtype:$CRm), asm, "\t$CRm">,
+class CRmSystemI<Operand crmtype, bits<3> opc, string asm,
+                 list<dag> pattern = []>
+  : SimpleSystemI<0, (ins crmtype:$CRm), asm, "\t$CRm", pattern>,
     Sched<[WriteBarrier]> {
   bits<4> CRm;
   let Inst{20-12} = 0b000110011;
@@ -1935,22 +1959,32 @@ class LogicalRegAlias<string asm, Instruction inst, RegisterClass regtype>
   : InstAlias<asm # "\t$dst, $src1, $src2",
               (inst regtype:$dst, regtype:$src1, regtype:$src2)>;

-let AddedComplexity = 6 in
-multiclass LogicalImm<bits<2> opc, string mnemonic, SDNode OpNode> {
+multiclass LogicalImm<bits<2> opc, string mnemonic, SDNode OpNode,
+                      string Alias> {
+  let AddedComplexity = 6 in
   def Wri : BaseLogicalImm<opc, GPR32sp, GPR32, logical_imm32, mnemonic,
                            [(set GPR32sp:$Rd, (OpNode GPR32:$Rn,
                                                logical_imm32:$imm))]> {
     let Inst{31} = 0;
     let Inst{22} = 0; // 64-bit version has an additional bit of immediate.
   }
+  let AddedComplexity = 6 in
   def Xri : BaseLogicalImm<opc, GPR64sp, GPR64, logical_imm64, mnemonic,
                            [(set GPR64sp:$Rd, (OpNode GPR64:$Rn,
                                                logical_imm64:$imm))]> {
     let Inst{31} = 1;
   }
+
+  def : InstAlias<Alias # "\t$Rd, $Rn, $imm",
+                  (!cast<Instruction>(NAME # "Wri") GPR32sp:$Rd, GPR32:$Rn,
+                   logical_imm32_not:$imm), 0>;
+  def : InstAlias<Alias # "\t$Rd, $Rn, $imm",
+                  (!cast<Instruction>(NAME # "Xri") GPR64sp:$Rd, GPR64:$Rn,
+                   logical_imm64_not:$imm), 0>;
 }

-multiclass LogicalImmS<bits<2> opc, string mnemonic, SDNode OpNode> {
+multiclass LogicalImmS<bits<2> opc, string mnemonic, SDNode OpNode,
+                       string Alias> {
   let isCompare = 1, Defs = [NZCV] in {
   def Wri : BaseLogicalImm<opc, GPR32, GPR32, logical_imm32, mnemonic,
@@ -1962,6 +1996,13 @@ multiclass LogicalImmS<bits<2> opc, string mnemonic, SDNode OpNode> {
     let Inst{31} = 1;
   }
   } // end Defs = [NZCV]
+
+  def : InstAlias<Alias # "\t$Rd, $Rn, $imm",
+                  (!cast<Instruction>(NAME # "Wri") GPR32:$Rd, GPR32:$Rn,
+                   logical_imm32_not:$imm), 0>;
+  def : InstAlias<Alias # "\t$Rd, $Rn, $imm",
+                  (!cast<Instruction>(NAME # "Xri") GPR64:$Rd, GPR64:$Rn,
+                   logical_imm64_not:$imm), 0>;
 }

 class BaseLogicalRegPseudo<RegisterClass regtype, SDPatternOperator OpNode>
diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td
index b1e8fa64c2da..0ba069e99a86 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/lib/Target/AArch64/AArch64InstrInfo.td
@@ -323,7 +323,7 @@ def : Pat<(AArch64LOADgot tconstpool:$addr),
 // System instructions.
 //===----------------------------------------------------------------------===//

-def HINT : HintI<"hint">;
+def HINT : HintI<"hint">;
 def : InstAlias<"nop", (HINT 0b000)>;
 def : InstAlias<"yield",(HINT 0b001)>;
 def : InstAlias<"wfe", (HINT 0b010)>;
@@ -331,13 +331,23 @@ def : InstAlias<"wfi", (HINT 0b011)>;
 def : InstAlias<"sev", (HINT 0b100)>;
 def : InstAlias<"sevl", (HINT 0b101)>;

- // As far as LLVM is concerned this writes to the system's exclusive monitors.
+// As far as LLVM is concerned this writes to the system's exclusive monitors.
 let mayLoad = 1, mayStore = 1 in
 def CLREX : CRmSystemI<imm0_15, 0b010, "clrex">;

-def DMB : CRmSystemI<barrier_op, 0b101, "dmb">;
-def DSB : CRmSystemI<barrier_op, 0b100, "dsb">;
-def ISB : CRmSystemI<barrier_op, 0b110, "isb">;
+// NOTE: ideally, this would have mayStore = 0, mayLoad = 0, but we cannot
+// model patterns with sufficiently fine granularity.
+let mayLoad = ?, mayStore = ? in {
+def DMB : CRmSystemI<barrier_op, 0b101, "dmb",
+                     [(int_aarch64_dmb (i32 imm32_0_15:$CRm))]>;
+
+def DSB : CRmSystemI<barrier_op, 0b100, "dsb",
+                     [(int_aarch64_dsb (i32 imm32_0_15:$CRm))]>;
+
+def ISB : CRmSystemI<barrier_op, 0b110, "isb",
+                     [(int_aarch64_isb (i32 imm32_0_15:$CRm))]>;
+}
+
 def : InstAlias<"clrex", (CLREX 0xf)>;
 def : InstAlias<"isb", (ISB 0xf)>;

@@ -671,10 +681,10 @@ def CRC32CXrr : BaseCRC32<1, 0b11, 1, GPR64, int_aarch64_crc32cx, "crc32cx">;
 //===----------------------------------------------------------------------===//

 // (immediate)
-defm ANDS : LogicalImmS<0b11, "ands", AArch64and_flag>;
-defm AND  : LogicalImm<0b00, "and", and>;
-defm EOR  : LogicalImm<0b10, "eor", xor>;
-defm ORR  : LogicalImm<0b01, "orr", or>;
+defm ANDS : LogicalImmS<0b11, "ands", AArch64and_flag, "bics">;
+defm AND  : LogicalImm<0b00, "and", and, "bic">;
+defm EOR  : LogicalImm<0b10, "eor", xor, "eon">;
+defm ORR  : LogicalImm<0b01, "orr", or, "orn">;

 // FIXME: these aliases *are* canonical sometimes (when movz can't be
 // used). Actually, it seems to be working right now, but putting logical_immXX
 // here is a bit dodgy on the AsmParser side too.
@@ -2234,14 +2244,6 @@ def : Pat<(f64 (fpimm0)), (FMOVXDr XZR)>, Requires<[NoZCZ]>;

 defm FCVT : FPConversion<"fcvt">;

-def : Pat<(f32_to_f16 FPR32:$Rn),
-          (i32 (COPY_TO_REGCLASS
-                   (f32 (SUBREG_TO_REG (i32 0), (FCVTHSr FPR32:$Rn), hsub)),
-                   GPR32))>;
-
-def FCVTSHpseudo : Pseudo<(outs FPR32:$Rd), (ins FPR32:$Rn),
-                          [(set (f32 FPR32:$Rd), (f16_to_f32 i32:$Rn))]>;
-
 //===----------------------------------------------------------------------===//
 // Floating point single operand instructions.
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
index 19c9e6451fe1..1bf64fc4310b 100644
--- a/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
+++ b/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
@@ -51,7 +51,7 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset(
     TargetLowering::CallLoweringInfo CLI(DAG);
     CLI.setDebugLoc(dl).setChain(Chain)
       .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
-                 DAG.getExternalSymbol(bzeroEntry, IntPtr), &Args, 0)
+                 DAG.getExternalSymbol(bzeroEntry, IntPtr), std::move(Args), 0)
       .setDiscardResult();
     std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
     return CallResult.second;
diff --git a/lib/Target/AArch64/AArch64TargetMachine.h b/lib/Target/AArch64/AArch64TargetMachine.h
index 6a159e91e22f..852cb3f8d2e9 100644
--- a/lib/Target/AArch64/AArch64TargetMachine.h
+++ b/lib/Target/AArch64/AArch64TargetMachine.h
@@ -15,13 +15,9 @@
 #define AArch64TARGETMACHINE_H

 #include "AArch64InstrInfo.h"
-#include "AArch64ISelLowering.h"
 #include "AArch64Subtarget.h"
-#include "AArch64FrameLowering.h"
-#include "AArch64SelectionDAGInfo.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/Target/TargetMachine.h"
-#include "llvm/MC/MCStreamer.h"

 namespace llvm {

diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
index f861df0bf99f..37e92961ff07 100644
--- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
+++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -25,6 +25,7 @@
 #include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/APInt.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/STLExtras.h"
@@ -43,6 +44,14 @@ class AArch64AsmParser : public MCTargetAsmParser {
   MCSubtargetInfo &STI;
   MCAsmParser &Parser;

+  // Map of register aliases registered via the .req directive.
+  StringMap<std::pair<bool, unsigned> > RegisterReqs;
+
+  AArch64TargetStreamer &getTargetStreamer() {
+    MCTargetStreamer &TS = *getParser().getStreamer().getTargetStreamer();
+    return static_cast<AArch64TargetStreamer &>(TS);
+  }
+
   MCAsmParser &getParser() const { return Parser; }
   MCAsmLexer &getLexer() const { return Parser.getLexer(); }

@@ -51,6 +60,7 @@ class AArch64AsmParser : public MCTargetAsmParser {
   bool parseSysAlias(StringRef Name, SMLoc NameLoc, OperandVector &Operands);
   AArch64CC::CondCode parseCondCodeString(StringRef Cond);
   bool parseCondCode(OperandVector &Operands, bool invertCondCode);
+  unsigned matchRegisterNameAlias(StringRef Name, bool isVector);
   int tryParseRegister();
   int tryMatchVectorRegister(StringRef &Kind, bool expected);
   bool parseRegister(OperandVector &Operands);
@@ -67,6 +77,10 @@ class AArch64AsmParser : public MCTargetAsmParser {
   bool parseDirectiveTLSDescCall(SMLoc L);

   bool parseDirectiveLOH(StringRef LOH, SMLoc L);
+  bool parseDirectiveLtorg(SMLoc L);
+
+  bool parseDirectiveReq(StringRef Name, SMLoc L);
+  bool parseDirectiveUnreq(SMLoc L);

   bool validateInstruction(MCInst &Inst, SmallVectorImpl<SMLoc> &Loc);
   bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
@@ -105,6 +119,8 @@ class AArch64AsmParser : public MCTargetAsmParser {
                    const MCTargetOptions &Options)
       : MCTargetAsmParser(), STI(_STI), Parser(_Parser) {
     MCAsmParserExtension::Initialize(_Parser);
+    if (Parser.getStreamer().getTargetStreamer() == nullptr)
+      new AArch64TargetStreamer(Parser.getStreamer());

     // Initialize the set of available features.
     setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
@@ -604,7 +620,11 @@ class AArch64Operand : public MCParsedAsmOperand {
     const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
     if (!MCE)
       return false;
-    return AArch64_AM::isLogicalImmediate(MCE->getValue(), 32);
+    int64_t Val = MCE->getValue();
+    if (Val >> 32 != 0 && Val >> 32 != ~0LL)
+      return false;
+    Val &= 0xFFFFFFFF;
+    return AArch64_AM::isLogicalImmediate(Val, 32);
   }
   bool isLogicalImm64() const {
     if (!isImm())
@@ -614,6 +634,23 @@ class AArch64Operand : public MCParsedAsmOperand {
       return false;
     return AArch64_AM::isLogicalImmediate(MCE->getValue(), 64);
   }
+  bool isLogicalImm32Not() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return false;
+    int64_t Val = ~MCE->getValue() & 0xFFFFFFFF;
+    return AArch64_AM::isLogicalImmediate(Val, 32);
+  }
+  bool isLogicalImm64Not() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return false;
+    return AArch64_AM::isLogicalImmediate(~MCE->getValue(), 64);
+  }
   bool isShiftedImm() const { return Kind == k_ShiftedImm; }
   bool isAddSubImm() const {
     if (!isShiftedImm() && !isImm())
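The isLogicalImm32 change above is easy to misread: the parser always sees a 64-bit constant, and a 32-bit logical immediate is accepted only when the upper 32 bits are all zero or all one (a zero or sign extension of bit 31). A small self-contained illustration of just that guard; the real code then defers to AArch64_AM::isLogicalImmediate for the bit-pattern check:

#include <cassert>
#include <cstdint>

// Models only the top-half guard from isLogicalImm32 above.
bool topHalfIsExtension(int64_t Val) {
  if (Val >> 32 != 0 && Val >> 32 != ~0LL)
    return false; // upper bits are neither zero nor a sign extension
  return true;    // real code: isLogicalImmediate(Val & 0xFFFFFFFF, 32)
}

int main() {
  assert(topHalfIsExtension(0x00000000FFFF0000LL)); // zero-extended
  assert(topHalfIsExtension(-65536));               // 0xFFFFFFFFFFFF0000, sign-extended
  assert(!topHalfIsExtension(0x123456789ABCDEF0LL)); // genuinely 64-bit
  return 0;
}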
@@ -1217,150 +1254,147 @@ class AArch64Operand : public MCParsedAsmOperand {

   void addSImm9Operands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    assert(MCE && "Invalid constant immediate operand!");
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
     Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
   }

   void addSImm7s4Operands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    assert(MCE && "Invalid constant immediate operand!");
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
     Inst.addOperand(MCOperand::CreateImm(MCE->getValue() / 4));
   }

   void addSImm7s8Operands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    assert(MCE && "Invalid constant immediate operand!");
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
     Inst.addOperand(MCOperand::CreateImm(MCE->getValue() / 8));
   }

   void addSImm7s16Operands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    assert(MCE && "Invalid constant immediate operand!");
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
     Inst.addOperand(MCOperand::CreateImm(MCE->getValue() / 16));
   }

   void addImm0_7Operands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    assert(MCE && "Invalid constant immediate operand!");
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
     Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
   }

   void addImm1_8Operands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    assert(MCE && "Invalid constant immediate operand!");
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
     Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
   }

   void addImm0_15Operands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    assert(MCE && "Invalid constant immediate operand!");
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
     Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
   }

   void addImm1_16Operands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
     assert(MCE && "Invalid constant immediate operand!");
     Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
   }

   void addImm0_31Operands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    assert(MCE && "Invalid constant immediate operand!");
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
     Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
   }

   void addImm1_31Operands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    assert(MCE && "Invalid constant immediate operand!");
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
     Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
   }

   void addImm1_32Operands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    assert(MCE && "Invalid constant immediate operand!");
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
     Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
   }

   void addImm0_63Operands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    assert(MCE && "Invalid constant immediate operand!");
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
     Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
   }

   void addImm1_63Operands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    assert(MCE && "Invalid constant immediate operand!");
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
     Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
   }
&& "Invalid constant immediate operand!"); + const MCConstantExpr *MCE = cast(getImm()); Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); } void addImm0_127Operands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - assert(MCE && "Invalid constant immediate operand!"); + const MCConstantExpr *MCE = cast(getImm()); Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); } void addImm0_255Operands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - assert(MCE && "Invalid constant immediate operand!"); + const MCConstantExpr *MCE = cast(getImm()); Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); } void addImm0_65535Operands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - assert(MCE && "Invalid constant immediate operand!"); + const MCConstantExpr *MCE = cast(getImm()); Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); } void addImm32_63Operands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - assert(MCE && "Invalid constant immediate operand!"); + const MCConstantExpr *MCE = cast(getImm()); Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); } void addLogicalImm32Operands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - assert(MCE && "Invalid logical immediate operand!"); - uint64_t encoding = AArch64_AM::encodeLogicalImmediate(MCE->getValue(), 32); + const MCConstantExpr *MCE = cast(getImm()); + uint64_t encoding = + AArch64_AM::encodeLogicalImmediate(MCE->getValue() & 0xFFFFFFFF, 32); Inst.addOperand(MCOperand::CreateImm(encoding)); } void addLogicalImm64Operands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - assert(MCE && "Invalid logical immediate operand!"); + const MCConstantExpr *MCE = cast(getImm()); uint64_t encoding = AArch64_AM::encodeLogicalImmediate(MCE->getValue(), 64); Inst.addOperand(MCOperand::CreateImm(encoding)); } + void addLogicalImm32NotOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = cast(getImm()); + int64_t Val = ~MCE->getValue() & 0xFFFFFFFF; + uint64_t encoding = AArch64_AM::encodeLogicalImmediate(Val, 32); + Inst.addOperand(MCOperand::CreateImm(encoding)); + } + + void addLogicalImm64NotOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = cast(getImm()); + uint64_t encoding = + AArch64_AM::encodeLogicalImmediate(~MCE->getValue(), 64); + Inst.addOperand(MCOperand::CreateImm(encoding)); + } + void addSIMDImmType10Operands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - assert(MCE && "Invalid immediate operand!"); + const MCConstantExpr *MCE = cast(getImm()); uint64_t encoding = AArch64_AM::encodeAdvSIMDModImmType10(MCE->getValue()); Inst.addOperand(MCOperand::CreateImm(encoding)); } @@ -1817,6 +1851,26 @@ bool AArch64AsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, return (RegNo == (unsigned)-1); } +// Matches a register name or register alias previously defined by '.req' +unsigned 
+// Matches a register name or register alias previously defined by '.req'
+unsigned AArch64AsmParser::matchRegisterNameAlias(StringRef Name,
+                                                  bool isVector) {
+  unsigned RegNum = isVector ? matchVectorRegName(Name)
+                             : MatchRegisterName(Name);
+
+  if (RegNum == 0) {
+    // Check for aliases registered via .req. Canonicalize to lower case.
+    // That's more consistent since register names are case insensitive, and
+    // it's how the original entry was passed in from MC/MCParser/AsmParser.
+    auto Entry = RegisterReqs.find(Name.lower());
+    if (Entry == RegisterReqs.end())
+      return 0;
+    // set RegNum if the match is the right kind of register
+    if (isVector == Entry->getValue().first)
+      RegNum = Entry->getValue().second;
+  }
+  return RegNum;
+}
+
 /// tryParseRegister - Try to parse a register name. The token must be an
 /// Identifier when called, and if it is a register name the token is eaten and
 /// the register is added to the operand list.
@@ -1825,7 +1879,7 @@ int AArch64AsmParser::tryParseRegister() {
   assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier");

   std::string lowerCase = Tok.getString().lower();
-  unsigned RegNum = MatchRegisterName(lowerCase);
+  unsigned RegNum = matchRegisterNameAlias(lowerCase, false);
   // Also handle a few aliases of registers.
   if (RegNum == 0)
     RegNum = StringSwitch<unsigned>(lowerCase)
@@ -1855,7 +1909,8 @@ int AArch64AsmParser::tryMatchVectorRegister(StringRef &Kind, bool expected) {
   // a '.'.
   size_t Start = 0, Next = Name.find('.');
   StringRef Head = Name.slice(Start, Next);
-  unsigned RegNum = matchVectorRegName(Head);
+  unsigned RegNum = matchRegisterNameAlias(Head, true);
+
   if (RegNum) {
     if (Next != StringRef::npos) {
       Kind = Name.slice(Next, StringRef::npos);
@@ -2853,7 +2908,7 @@ AArch64AsmParser::tryParseGPR64sp0Operand(OperandVector &Operands) {
   if (!Tok.is(AsmToken::Identifier))
     return MatchOperand_NoMatch;

-  unsigned RegNum = MatchRegisterName(Tok.getString().lower());
+  unsigned RegNum = matchRegisterNameAlias(Tok.getString().lower(), false);

   MCContext &Ctx = getContext();
   const MCRegisterInfo *RI = Ctx.getRegisterInfo();
@@ -3004,6 +3059,53 @@ bool AArch64AsmParser::parseOperand(OperandVector &Operands, bool isCondCode,
     Operands.push_back(AArch64Operand::CreateImm(ImmVal, S, E, getContext()));
     return false;
   }
+  case AsmToken::Equal: {
+    SMLoc Loc = Parser.getTok().getLoc();
+    if (Mnemonic != "ldr") // only parse for ldr pseudo (e.g. ldr r0, =val)
+      return Error(Loc, "unexpected token in operand");
+    Parser.Lex(); // Eat '='
+    const MCExpr *SubExprVal;
+    if (getParser().parseExpression(SubExprVal))
+      return true;
+
+    if (Operands.size() < 2 ||
+        !static_cast<AArch64Operand &>(*Operands[1]).isReg())
+      return true;
+
+    bool IsXReg =
+        AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains(
+            Operands[1]->getReg());
+
+    MCContext& Ctx = getContext();
+    E = SMLoc::getFromPointer(Loc.getPointer() - 1);
+    // If the op is an imm and can be fit into a mov, then replace ldr with mov.
+    if (isa<MCConstantExpr>(SubExprVal)) {
+      uint64_t Imm = (cast<MCConstantExpr>(SubExprVal))->getValue();
+      uint32_t ShiftAmt = 0, MaxShiftAmt = IsXReg ? 48 : 16;
+      while(Imm > 0xFFFF && countTrailingZeros(Imm) >= 16) {
+        ShiftAmt += 16;
+        Imm >>= 16;
+      }
+      if (ShiftAmt <= MaxShiftAmt && Imm <= 0xFFFF) {
+        Operands[0] = AArch64Operand::CreateToken("movz", false, Loc, Ctx);
+        Operands.push_back(AArch64Operand::CreateImm(
+            MCConstantExpr::Create(Imm, Ctx), S, E, Ctx));
+        if (ShiftAmt)
+          Operands.push_back(AArch64Operand::CreateShiftExtend(AArch64_AM::LSL,
+              ShiftAmt, true, S, E, Ctx));
+        return false;
+      }
+      APInt Simm = APInt(64, Imm << ShiftAmt);
+      // check if the immediate is an unsigned or signed 32-bit int for W regs
+      if (!IsXReg && !(Simm.isIntN(32) || Simm.isSignedIntN(32)))
+        return Error(Loc, "Immediate too large for register");
+    }
+    // If it is a label or an imm that cannot fit in a movz, put it into CP.
+    const MCExpr *CPLoc =
+        getTargetStreamer().addConstantPoolEntry(SubExprVal, IsXReg ? 8 : 4);
+    Operands.push_back(AArch64Operand::CreateImm(CPLoc, S, E, Ctx));
+    return false;
+  }
   }
 }

@@ -3033,6 +3135,15 @@ bool AArch64AsmParser::ParseInstruction(ParseInstructionInfo &Info,
                     .Case("bnv", "b.nv")
                     .Default(Name);

+  // First check for the AArch64-specific .req directive.
+  if (Parser.getTok().is(AsmToken::Identifier) &&
+      Parser.getTok().getIdentifier() == ".req") {
+    parseDirectiveReq(Name, NameLoc);
+    // We always return 'error' for this, as we're done with this
+    // statement and don't need to match the instruction.
+    return true;
+  }
+
   // Create the leading tokens for the mnemonic, split by '.' characters.
   size_t Start = 0, Next = Name.find('.');
   StringRef Head = Name.slice(Start, Next);
@@ -3447,8 +3558,7 @@ bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode) {
   case Match_MnemonicFail:
     return Error(Loc, "unrecognized instruction mnemonic");
   default:
-    assert(0 && "unexpected error code!");
-    return Error(Loc, "invalid instruction format");
+    llvm_unreachable("unexpected error code!");
   }
 }

@@ -3811,6 +3921,10 @@ bool AArch64AsmParser::ParseDirective(AsmToken DirectiveID) {
     return parseDirectiveWord(8, Loc);
   if (IDVal == ".tlsdesccall")
     return parseDirectiveTLSDescCall(Loc);
+  if (IDVal == ".ltorg" || IDVal == ".pool")
+    return parseDirectiveLtorg(Loc);
+  if (IDVal == ".unreq")
+    return parseDirectiveUnreq(DirectiveID.getLoc());

   return parseDirectiveLOH(IDVal, Loc);
 }
@@ -3912,6 +4026,66 @@ bool AArch64AsmParser::parseDirectiveLOH(StringRef IDVal, SMLoc Loc) {
   return false;
 }

+/// parseDirectiveLtorg
+///  ::= .ltorg | .pool
+bool AArch64AsmParser::parseDirectiveLtorg(SMLoc L) {
+  getTargetStreamer().emitCurrentConstantPool();
+  return false;
+}
+
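The '=' operand handling above turns "ldr rX, =imm" into a movz with an optional LSL shift whenever the constant is a 16-bit value shifted by a multiple of 16, and otherwise falls back to a literal-pool load via addConstantPoolEntry. A compact model of the shift normalization; fitsMovz is a hypothetical helper, and note that countTrailingZeros(Imm) >= 16 is the same test as the low 16 bits being zero once Imm > 0xFFFF:

#include <cassert>
#include <cstdint>

// Returns true if Imm is expressible as "movz reg, #imm16, lsl #Shift".
bool fitsMovz(uint64_t Imm, bool IsXReg, uint64_t &Imm16, uint32_t &Shift) {
  uint32_t MaxShift = IsXReg ? 48 : 16;
  Shift = 0;
  while (Imm > 0xFFFF && (Imm & 0xFFFF) == 0) { // low 16 bits all zero
    Shift += 16;
    Imm >>= 16;
  }
  Imm16 = Imm;
  return Shift <= MaxShift && Imm <= 0xFFFF;
}

int main() {
  uint64_t Imm16; uint32_t Shift;
  assert(fitsMovz(0x12340000, false, Imm16, Shift) &&
         Imm16 == 0x1234 && Shift == 16);           // ldr w0, =0x12340000
  assert(!fitsMovz(0x12345678, false, Imm16, Shift)); // needs the literal pool
  return 0;
}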
+/// parseDirectiveReq
+///  ::= name .req registername
+bool AArch64AsmParser::parseDirectiveReq(StringRef Name, SMLoc L) {
+  Parser.Lex(); // Eat the '.req' token.
+  SMLoc SRegLoc = getLoc();
+  unsigned RegNum = tryParseRegister();
+  bool IsVector = false;
+
+  if (RegNum == static_cast<unsigned>(-1)) {
+    StringRef Kind;
+    RegNum = tryMatchVectorRegister(Kind, false);
+    if (!Kind.empty()) {
+      Error(SRegLoc, "vector register without type specifier expected");
+      return false;
+    }
+    IsVector = true;
+  }
+
+  if (RegNum == static_cast<unsigned>(-1)) {
+    Parser.eatToEndOfStatement();
+    Error(SRegLoc, "register name or alias expected");
+    return false;
+  }
+
+  // Shouldn't be anything else.
+  if (Parser.getTok().isNot(AsmToken::EndOfStatement)) {
+    Error(Parser.getTok().getLoc(), "unexpected input in .req directive");
+    Parser.eatToEndOfStatement();
+    return false;
+  }
+
+  Parser.Lex(); // Consume the EndOfStatement
+
+  auto pair = std::make_pair(IsVector, RegNum);
+  if (RegisterReqs.GetOrCreateValue(Name, pair).getValue() != pair)
+    Warning(L, "ignoring redefinition of register alias '" + Name + "'");
+
+  return true;
+}
+
+/// parseDirectiveUnreq
+///  ::= .unreq registername
+bool AArch64AsmParser::parseDirectiveUnreq(SMLoc L) {
+  if (Parser.getTok().isNot(AsmToken::Identifier)) {
+    Error(Parser.getTok().getLoc(), "unexpected input in .unreq directive.");
+    Parser.eatToEndOfStatement();
+    return false;
+  }
+  RegisterReqs.erase(Parser.getTok().getIdentifier().lower());
+  Parser.Lex(); // Eat the identifier.
+  return false;
+}
+
 bool
 AArch64AsmParser::classifySymbolRef(const MCExpr *Expr,
                                     AArch64MCExpr::VariantKind &ELFRefKind,
diff --git a/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp b/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp
index 24663684a3fd..2057c51346af 100644
--- a/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp
+++ b/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp
@@ -37,8 +37,7 @@ getVariant(uint64_t LLVMDisassembler_VariantKind) {
   case LLVMDisassembler_VariantKind_ARM64_TLVP:
   case LLVMDisassembler_VariantKind_ARM64_TLVOFF:
   default:
-    assert(0 && "bad LLVMDisassembler_VariantKind");
-    return MCSymbolRefExpr::VK_None;
+    llvm_unreachable("bad LLVMDisassembler_VariantKind");
   }
 }

diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
index 464a18cdbc04..f0513575edb4 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
@@ -218,13 +218,9 @@ AArch64MCCodeEmitter::getMachineOpValue(const MCInst &MI, const MCOperand &MO,
                                         const MCSubtargetInfo &STI) const {
   if (MO.isReg())
     return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg());
-  else {
-    assert(MO.isImm() && "did not expect relocated expression");
-    return static_cast<unsigned>(MO.getImm());
-  }

-  assert(0 && "Unable to encode MCOperand!");
-  return 0;
+  assert(MO.isImm() && "did not expect relocated expression");
+  return static_cast<unsigned>(MO.getImm());
 }

 template<unsigned FixupKind> uint32_t
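A toy model of the .req/.unreq bookkeeping added above may help: the parser keeps a map from alias to (is-vector, register id), a second ".req" for the same name only warns instead of overwriting (mirroring the GetOrCreateValue semantics), and ".unreq" erases the entry. std::map stands in for llvm::StringMap here; names are stored lower-case because register names are case-insensitive.

#include <cassert>
#include <map>
#include <string>
#include <utility>

int main() {
  std::map<std::string, std::pair<bool, unsigned>> RegisterReqs;

  // "foo .req x5"
  auto First = RegisterReqs.emplace("foo", std::make_pair(false, 5u));
  assert(First.second); // newly inserted

  // A second "foo .req x6" does not overwrite; the parser only warns.
  auto Again = RegisterReqs.emplace("foo", std::make_pair(false, 6u));
  assert(!Again.second && Again.first->second.second == 5u);

  // ".unreq foo" removes the alias entirely.
  RegisterReqs.erase("foo");
  assert(RegisterReqs.find("foo") == RegisterReqs.end());
  return 0;
}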
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
index 85c3ec7a55f1..42a6787da48e 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
@@ -81,37 +81,8 @@ void AArch64MCExpr::PrintImpl(raw_ostream &OS) const {
     OS << *Expr;
 }

-// FIXME: This basically copies MCObjectStreamer::AddValueSymbols. Perhaps
-// that method should be made public?
-// FIXME: really do above: now that two backends are using it.
-static void AddValueSymbolsImpl(const MCExpr *Value, MCAssembler *Asm) {
-  switch (Value->getKind()) {
-  case MCExpr::Target:
-    llvm_unreachable("Can't handle nested target expr!");
-    break;
-
-  case MCExpr::Constant:
-    break;
-
-  case MCExpr::Binary: {
-    const MCBinaryExpr *BE = cast<MCBinaryExpr>(Value);
-    AddValueSymbolsImpl(BE->getLHS(), Asm);
-    AddValueSymbolsImpl(BE->getRHS(), Asm);
-    break;
-  }
-
-  case MCExpr::SymbolRef:
-    Asm->getOrCreateSymbolData(cast<MCSymbolRefExpr>(Value)->getSymbol());
-    break;
-
-  case MCExpr::Unary:
-    AddValueSymbolsImpl(cast<MCUnaryExpr>(Value)->getSubExpr(), Asm);
-    break;
-  }
-}
-
-void AArch64MCExpr::AddValueSymbols(MCAssembler *Asm) const {
-  AddValueSymbolsImpl(getSubExpr(), Asm);
+void AArch64MCExpr::visitUsedExpr(MCStreamer &Streamer) const {
+  Streamer.visitUsedExpr(*getSubExpr());
 }

 const MCSection *AArch64MCExpr::FindAssociatedSection() const {
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
index e869ed0a26a4..5422f9d7067e 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
@@ -147,7 +147,7 @@ class AArch64MCExpr : public MCTargetExpr {

   void PrintImpl(raw_ostream &OS) const override;

-  void AddValueSymbols(MCAssembler *) const override;
+  void visitUsedExpr(MCStreamer &Streamer) const override;

   const MCSection *FindAssociatedSection() const override;

diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp
new file mode 100644
index 000000000000..dcc1a3c9f05e
--- /dev/null
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp
@@ -0,0 +1,41 @@
+//===- AArch64TargetStreamer.cpp - AArch64TargetStreamer class --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the AArch64TargetStreamer class.
+//
+//===----------------------------------------------------------------------===//
+#include "llvm/ADT/MapVector.h"
+#include "llvm/MC/ConstantPools.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCStreamer.h"
+
+using namespace llvm;
+
+//
+// AArch64TargetStreamer Implementation
+//
+AArch64TargetStreamer::AArch64TargetStreamer(MCStreamer &S)
+    : MCTargetStreamer(S), ConstantPools(new AssemblerConstantPools()) {}
+
+AArch64TargetStreamer::~AArch64TargetStreamer() {}
+
+// The constant pool handling is shared by all AArch64TargetStreamer
+// implementations.
+const MCExpr *AArch64TargetStreamer::addConstantPoolEntry(const MCExpr *Expr,
+                                                          unsigned Size) {
+  return ConstantPools->addEntry(Streamer, Expr, Size);
+}
+
+void AArch64TargetStreamer::emitCurrentConstantPool() {
+  ConstantPools->emitForCurrentSection(Streamer);
+}
+
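Before the new file's last function, finish(), appears below, here is a toy model of the constant-pool mechanism this streamer wraps: one pool per section, addEntry hands back a label standing in for the returned MCExpr, and ".ltorg" flushes only the current section's pool (finish flushes them all). Names and the label format are illustrative, not the ConstantPools API.

#include <cassert>
#include <cstdint>
#include <map>
#include <string>
#include <vector>

struct ToyConstantPools {
  std::map<std::string, std::vector<uint64_t>> Pools; // section -> entries
  std::string addEntry(const std::string &Section, uint64_t Value) {
    Pools[Section].push_back(Value);
    return Section + "$CP" + std::to_string(Pools[Section].size() - 1);
  }
  size_t emitForSection(const std::string &Section) {
    size_t N = Pools[Section].size(); // entries that would be emitted here
    Pools[Section].clear();
    return N;
  }
};

int main() {
  ToyConstantPools CP;
  CP.addEntry(".text", 0x1234567890ULL); // e.g. from "ldr x0, =0x1234567890"
  CP.addEntry(".text", 0xdeadbeefULL);
  assert(CP.emitForSection(".text") == 2); // ".ltorg" dumps both entries
  assert(CP.emitForSection(".text") == 0); // pool is now empty
  return 0;
}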
+// finish() - write out any non-empty assembler constant pools.
+void AArch64TargetStreamer::finish() { ConstantPools->emitAll(Streamer); }
diff --git a/lib/Target/AArch64/MCTargetDesc/CMakeLists.txt b/lib/Target/AArch64/MCTargetDesc/CMakeLists.txt
index 7d5bced17a6a..6d8be5e63fbb 100644
--- a/lib/Target/AArch64/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/AArch64/MCTargetDesc/CMakeLists.txt
@@ -7,6 +7,7 @@ add_llvm_library(LLVMAArch64Desc
   AArch64MCExpr.cpp
   AArch64MCTargetDesc.cpp
   AArch64MachObjectWriter.cpp
+  AArch64TargetStreamer.cpp
   )
 add_dependencies(LLVMAArch64Desc AArch64CommonTableGen)

diff --git a/lib/Target/ARM/A15SDOptimizer.cpp b/lib/Target/ARM/A15SDOptimizer.cpp
index 94faf6f60db3..92eaf9e1c9b3 100644
--- a/lib/Target/ARM/A15SDOptimizer.cpp
+++ b/lib/Target/ARM/A15SDOptimizer.cpp
@@ -321,8 +321,7 @@ unsigned A15SDOptimizer::optimizeSDPattern(MachineInstr *MI) {
     return optimizeAllLanesPattern(MI, MI->getOperand(0).getReg());
   }

-  assert(0 && "Unhandled update pattern!");
-  return 0;
+  llvm_unreachable("Unhandled update pattern!");
 }

 // Return true if this MachineInstr inserts a scalar (SPR) value into
diff --git a/lib/Target/ARM/ARMAsmPrinter.cpp b/lib/Target/ARM/ARMAsmPrinter.cpp
index 55e9fe5f5c57..28d2610c3903 100644
--- a/lib/Target/ARM/ARMAsmPrinter.cpp
+++ b/lib/Target/ARM/ARMAsmPrinter.cpp
@@ -82,7 +82,8 @@ void ARMAsmPrinter::EmitXXStructor(const Constant *CV) {
   const GlobalValue *GV = dyn_cast<GlobalValue>(CV->stripPointerCasts());
   assert(GV && "C++ constructor pointer was not a GlobalValue!");

-  const MCExpr *E = MCSymbolRefExpr::Create(getSymbol(GV),
+  const MCExpr *E = MCSymbolRefExpr::Create(GetARMGVSymbol(GV,
+                                                           ARMII::MO_NO_FLAG),
                                             (Subtarget->isTargetELF()
                                              ? MCSymbolRefExpr::VK_ARM_TARGET1
                                              : MCSymbolRefExpr::VK_None),
@@ -164,7 +165,7 @@ void ARMAsmPrinter::printOperand(const MachineInstr *MI, int OpNum,
     else if ((Modifier && strcmp(Modifier, "hi16") == 0) ||
              (TF & ARMII::MO_HI16))
       O << ":upper16:";
-    O << *getSymbol(GV);
+    O << *GetARMGVSymbol(GV, TF);

     printOffset(MO.getOffset(), O);
     if (TF == ARMII::MO_PLT)
@@ -730,6 +731,32 @@ void ARMAsmPrinter::emitAttributes() {
   if (Subtarget->hasDivideInARMMode() && !Subtarget->hasV8Ops())
     ATS.emitAttribute(ARMBuildAttrs::DIV_use, ARMBuildAttrs::AllowDIVExt);

+  if (MMI) {
+    if (const Module *SourceModule = MMI->getModule()) {
+      // ABI_PCS_wchar_t to indicate wchar_t width
+      // FIXME: There is no way to emit value 0 (wchar_t prohibited).
+      if (auto WCharWidthValue = cast_or_null<ConstantInt>(
+              SourceModule->getModuleFlag("wchar_size"))) {
+        int WCharWidth = WCharWidthValue->getZExtValue();
+        assert((WCharWidth == 2 || WCharWidth == 4) &&
+               "wchar_t width must be 2 or 4 bytes");
+        ATS.emitAttribute(ARMBuildAttrs::ABI_PCS_wchar_t, WCharWidth);
+      }
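The hunk continues below with the matching enum-size attribute. Before it does, a compact sketch of the value mapping this code emits, assuming the ARM build-attribute encodings described by the surrounding comments: ABI_PCS_wchar_t records the byte width directly, while ABI_enum_size uses 1 for packed ("smallest container") enums and 2 for regular 32-bit enums.

#include <cassert>

int wcharAttr(int WCharWidth) {
  assert(WCharWidth == 2 || WCharWidth == 4);
  return WCharWidth; // emitted verbatim
}
int enumAttr(int EnumWidth) {
  assert(EnumWidth == 1 || EnumWidth == 4);
  return EnumWidth == 1 ? 1 : 2;
}

int main() {
  assert(wcharAttr(4) == 4); // 4-byte wchar_t
  assert(enumAttr(4) == 2);  // regular 32-bit enums
  assert(enumAttr(1) == 1);  // short enums
  return 0;
}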
+
+      // ABI_enum_size to indicate enum width
+      // FIXME: There is no way to emit value 0 (enums prohibited) or value 3
+      //        (all enums contain a value needing 32 bits to encode).
+      if (auto EnumWidthValue = cast_or_null<ConstantInt>(
+              SourceModule->getModuleFlag("min_enum_size"))) {
+        int EnumWidth = EnumWidthValue->getZExtValue();
+        assert((EnumWidth == 1 || EnumWidth == 4) &&
+               "Minimum enum width must be 1 or 4 bytes");
+        int EnumBuildAttr = EnumWidth == 1 ? 1 : 2;
+        ATS.emitAttribute(ARMBuildAttrs::ABI_enum_size, EnumBuildAttr);
+      }
+    }
+  }
+
   if (Subtarget->hasTrustZone() && Subtarget->hasVirtualization())
     ATS.emitAttribute(ARMBuildAttrs::Virtualization_use,
                       ARMBuildAttrs::AllowTZVirtualization);
@@ -768,23 +795,41 @@ getModifierVariantKind(ARMCP::ARMCPModifier Modifier) {

 MCSymbol *ARMAsmPrinter::GetARMGVSymbol(const GlobalValue *GV,
                                         unsigned char TargetFlags) {
-  bool isIndirect = Subtarget->isTargetMachO() &&
-    (TargetFlags & ARMII::MO_NONLAZY) &&
-    Subtarget->GVIsIndirectSymbol(GV, TM.getRelocationModel());
-  if (!isIndirect)
-    return getSymbol(GV);
+  if (Subtarget->isTargetMachO()) {
+    bool IsIndirect = (TargetFlags & ARMII::MO_NONLAZY) &&
+      Subtarget->GVIsIndirectSymbol(GV, TM.getRelocationModel());
+
+    if (!IsIndirect)
+      return getSymbol(GV);

-  // FIXME: Remove this when Darwin transition to @GOT like syntax.
-  MCSymbol *MCSym = getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
-  MachineModuleInfoMachO &MMIMachO =
-    MMI->getObjFileInfo<MachineModuleInfoMachO>();
-  MachineModuleInfoImpl::StubValueTy &StubSym =
-    GV->hasHiddenVisibility() ? MMIMachO.getHiddenGVStubEntry(MCSym) :
-    MMIMachO.getGVStubEntry(MCSym);
-  if (!StubSym.getPointer())
-    StubSym = MachineModuleInfoImpl::
-      StubValueTy(getSymbol(GV), !GV->hasInternalLinkage());
-  return MCSym;
+    // FIXME: Remove this when Darwin transition to @GOT like syntax.
+    MCSymbol *MCSym = getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
+    MachineModuleInfoMachO &MMIMachO =
+        MMI->getObjFileInfo<MachineModuleInfoMachO>();
+    MachineModuleInfoImpl::StubValueTy &StubSym =
+        GV->hasHiddenVisibility() ? MMIMachO.getHiddenGVStubEntry(MCSym)
+                                  : MMIMachO.getGVStubEntry(MCSym);
+    if (!StubSym.getPointer())
+      StubSym = MachineModuleInfoImpl::StubValueTy(getSymbol(GV),
+                                                   !GV->hasInternalLinkage());
+    return MCSym;
+  } else if (Subtarget->isTargetCOFF()) {
+    assert(Subtarget->isTargetWindows() &&
+           "Windows is the only supported COFF target");
+
+    bool IsIndirect = (TargetFlags & ARMII::MO_DLLIMPORT);
+    if (!IsIndirect)
+      return getSymbol(GV);
+
+    SmallString<128> Name;
+    Name = "__imp_";
+    getNameWithPrefix(Name, GV);
+
+    return OutContext.GetOrCreateSymbol(Name);
+  } else if (Subtarget->isTargetELF()) {
+    return getSymbol(GV);
+  }
+  llvm_unreachable("unexpected target");
 }

 void ARMAsmPrinter::
@@ -928,7 +973,7 @@ void ARMAsmPrinter::EmitJump2Table(const MachineInstr *MI) {
   for (unsigned i = 0, e = JTBBs.size(); i != e; ++i) {
     MachineBasicBlock *MBB = JTBBs[i];
     const MCExpr *MBBSymbolExpr = MCSymbolRefExpr::Create(MBB->getSymbol(),
-                                                      OutContext);
+                                                          OutContext);
     // If this isn't a TBB or TBH, the entries are direct branch instructions.
     if (OffsetWidth == 4) {
       EmitToStreamer(OutStreamer, MCInstBuilder(ARM::t2B)
@@ -1225,8 +1270,10 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
       // Add 's' bit operand (always reg0 for this)
       .addReg(0));

-    const GlobalValue *GV = MI->getOperand(0).getGlobal();
-    MCSymbol *GVSym = getSymbol(GV);
+    const MachineOperand &Op = MI->getOperand(0);
+    const GlobalValue *GV = Op.getGlobal();
+    const unsigned TF = Op.getTargetFlags();
+    MCSymbol *GVSym = GetARMGVSymbol(GV, TF);
     const MCExpr *GVSymExpr = MCSymbolRefExpr::Create(GVSym, OutContext);
     EmitToStreamer(OutStreamer, MCInstBuilder(ARM::Bcc)
       .addExpr(GVSymExpr)
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp
index eec5d14100c2..0288db91dcbf 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -1887,7 +1887,8 @@ bool llvm::tryFoldSPUpdateIntoPushPop(const ARMSubtarget &Subtarget,
                                       unsigned NumBytes) {
   // This optimisation potentially adds lots of load and store
   // micro-operations, it's only really a great benefit to code-size.
-  if (!Subtarget.isMinSize())
+  if (!MF.getFunction()->getAttributes().hasAttribute(
+          AttributeSet::FunctionIndex, Attribute::MinSize))
     return false;

   // If only one register is pushed/popped, LLVM can use an LDR/STR
diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
index cdd91c7a7036..32b5f4aa2942 100644
--- a/lib/Target/ARM/ARMBaseRegisterInfo.cpp
+++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
@@ -38,6 +38,8 @@
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"

+#define DEBUG_TYPE "arm-register-info"
+
 #define GET_REGINFO_TARGET_DESC
 #include "ARMGenRegisterInfo.inc"

@@ -775,3 +777,60 @@ ARMBaseRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
     MI.getOperand(FIOperandNum).ChangeToRegister(ScratchReg, false, false,true);
   }
 }
+
+bool ARMBaseRegisterInfo::shouldCoalesce(MachineInstr *MI,
+                                  const TargetRegisterClass *SrcRC,
+                                  unsigned SubReg,
+                                  const TargetRegisterClass *DstRC,
+                                  unsigned DstSubReg,
+                                  const TargetRegisterClass *NewRC) const {
+  auto MBB = MI->getParent();
+  auto MF = MBB->getParent();
+  const MachineRegisterInfo &MRI = MF->getRegInfo();
+  // If not copying into a sub-register this should be ok because we shouldn't
+  // need to split the reg.
+  if (!DstSubReg)
+    return true;
+  // Small registers don't frequently cause a problem, so we can coalesce them.
+  if (NewRC->getSize() < 32 && DstRC->getSize() < 32 && SrcRC->getSize() < 32)
+    return true;
+
+  auto NewRCWeight =
+      MRI.getTargetRegisterInfo()->getRegClassWeight(NewRC);
+  auto SrcRCWeight =
+      MRI.getTargetRegisterInfo()->getRegClassWeight(SrcRC);
+  auto DstRCWeight =
+      MRI.getTargetRegisterInfo()->getRegClassWeight(DstRC);
+  // If the source register class is more expensive than the destination, the
+  // coalescing is probably profitable.
+  if (SrcRCWeight.RegWeight > NewRCWeight.RegWeight)
+    return true;
+  if (DstRCWeight.RegWeight > NewRCWeight.RegWeight)
+    return true;
+
+  // If the register allocator isn't constrained, we can always allow
+  // coalescing; unfortunately we don't know yet if we will be constrained.
+  // The goal of this heuristic is to restrict how many expensive registers
+  // we allow to coalesce in a given basic block.
+  auto AFI = MF->getInfo<ARMFunctionInfo>();
+  auto It = AFI->getCoalescedWeight(MBB);
+
+  DEBUG(dbgs() << "\tARM::shouldCoalesce - Coalesced Weight: "
+               << It->second << "\n");
+  DEBUG(dbgs() << "\tARM::shouldCoalesce - Reg Weight: "
+               << NewRCWeight.RegWeight << "\n");
+
+  // This number is the largest round number that meets the criteria:
+  //  (1) addresses PR18825
+  //  (2) generates better code in some test cases (like vldm-shed-a9.ll)
+  //  (3) Doesn't regress any test cases (in-tree, test-suite, and SPEC)
+  // In practice the SizeMultiplier will only factor in for straight line code
+  // that uses a lot of NEON vectors, which isn't terribly common.
+  unsigned SizeMultiplier = MBB->size()/100;
+  SizeMultiplier = SizeMultiplier ? SizeMultiplier : 1;
+  if (It->second < NewRCWeight.WeightLimit * SizeMultiplier) {
+    It->second += NewRCWeight.RegWeight;
+    return true;
+  }
+  return false;
+}
diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.h b/lib/Target/ARM/ARMBaseRegisterInfo.h
index 91df565a27d8..833d3f218480 100644
--- a/lib/Target/ARM/ARMBaseRegisterInfo.h
+++ b/lib/Target/ARM/ARMBaseRegisterInfo.h
@@ -187,6 +187,14 @@ class ARMBaseRegisterInfo : public ARMGenRegisterInfo {
   void eliminateFrameIndex(MachineBasicBlock::iterator II,
                            int SPAdj, unsigned FIOperandNum,
                            RegScavenger *RS = nullptr) const override;
+
+  /// \brief SrcRC and DstRC will be morphed into NewRC if this returns true
+  bool shouldCoalesce(MachineInstr *MI,
+                      const TargetRegisterClass *SrcRC,
+                      unsigned SubReg,
+                      const TargetRegisterClass *DstRC,
+                      unsigned DstSubReg,
+                      const TargetRegisterClass *NewRC) const override;
 };

 } // end namespace llvm
diff --git a/lib/Target/ARM/ARMCodeEmitter.cpp b/lib/Target/ARM/ARMCodeEmitter.cpp
index 2fd7eddd8748..5fb6ebfeaae1 100644
--- a/lib/Target/ARM/ARMCodeEmitter.cpp
+++ b/lib/Target/ARM/ARMCodeEmitter.cpp
@@ -15,6 +15,7 @@
 #include "ARM.h"
 #include "ARMBaseInstrInfo.h"
 #include "ARMConstantPoolValue.h"
+#include "ARMMachineFunctionInfo.h"
 #include "ARMRelocations.h"
 #include "ARMSubtarget.h"
 #include "ARMTargetMachine.h"
diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
index 6045738e2e34..51d3dbb5bd8e 100644
--- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -927,10 +927,16 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
     }
     case ARM::tTPsoft:
     case ARM::TPsoft: {
-      MachineInstrBuilder MIB =
-          BuildMI(MBB, MBBI, MI.getDebugLoc(),
-                  TII->get(Opcode == ARM::tTPsoft ? ARM::tBL : ARM::BL))
-              .addExternalSymbol("__aeabi_read_tp", 0);
+      MachineInstrBuilder MIB;
+      if (Opcode == ARM::tTPsoft)
+        MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(),
+                      TII->get( ARM::tBL))
+              .addImm((unsigned)ARMCC::AL).addReg(0)
+              .addExternalSymbol("__aeabi_read_tp", 0);
+      else
+        MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(),
+                      TII->get( ARM::BL))
+              .addExternalSymbol("__aeabi_read_tp", 0);

       MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
       TransferImpOps(MI, MIB, MIB);
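The shouldCoalesce change above is a budget heuristic: each coalesce into a large register class charges its weight against a per-block allowance that scales with block size. A toy model, with illustrative numbers rather than the tuned LLVM values:

#include <cassert>

struct CoalesceBudget {
  unsigned Used = 0; // per-basic-block accumulated weight
  bool tryCoalesce(unsigned RegWeight, unsigned WeightLimit,
                   unsigned BlockSize) {
    unsigned SizeMultiplier = BlockSize / 100; // bigger blocks get more budget
    if (SizeMultiplier == 0)
      SizeMultiplier = 1;
    if (Used < WeightLimit * SizeMultiplier) {
      Used += RegWeight; // accept and charge the budget
      return true;
    }
    return false; // over budget: keep the copy rather than grow pressure
  }
};

int main() {
  CoalesceBudget B;
  // With limit 8 and a small block, two weight-4 coalesces fit; a third won't.
  assert(B.tryCoalesce(4, 8, 50));
  assert(B.tryCoalesce(4, 8, 50));
  assert(!B.tryCoalesce(4, 8, 50));
  return 0;
}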
diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp
index 6f8fb1a3521c..e2d90cd9306c 100644
--- a/lib/Target/ARM/ARMFastISel.cpp
+++ b/lib/Target/ARM/ARMFastISel.cpp
@@ -590,7 +590,7 @@ unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, MVT VT) {

   // Use movw+movt when possible, it avoids constant pool entries.
   // Non-darwin targets only support static movt relocations in FastISel.
-  if (Subtarget->useMovt() &&
+  if (Subtarget->useMovt(*FuncInfo.MF) &&
       (Subtarget->isTargetMachO() || RelocM == Reloc::Static)) {
     unsigned Opc;
     unsigned char TF = 0;
diff --git a/lib/Target/ARM/ARMFrameLowering.cpp b/lib/Target/ARM/ARMFrameLowering.cpp
index 6888ae994c58..a67b3600c4fe 100644
--- a/lib/Target/ARM/ARMFrameLowering.cpp
+++ b/lib/Target/ARM/ARMFrameLowering.cpp
@@ -39,6 +39,10 @@ static MachineBasicBlock::iterator
 skipAlignedDPRCS2Spills(MachineBasicBlock::iterator MI,
                         unsigned NumAlignedDPRCS2Regs);

+ARMFrameLowering::ARMFrameLowering(const ARMSubtarget &sti)
+    : TargetFrameLowering(StackGrowsDown, sti.getStackAlignment(), 0, 4),
+      STI(sti) {}
+
 /// hasFP - Return true if the specified function should have a dedicated frame
 /// pointer register. This is true if the function has variable sized allocas
 /// or if frame pointer elimination is disabled.
diff --git a/lib/Target/ARM/ARMFrameLowering.h b/lib/Target/ARM/ARMFrameLowering.h
index 981d3209710e..709afbcdc681 100644
--- a/lib/Target/ARM/ARMFrameLowering.h
+++ b/lib/Target/ARM/ARMFrameLowering.h
@@ -14,7 +14,6 @@
 #ifndef ARM_FRAMEINFO_H
 #define ARM_FRAMEINFO_H

-#include "ARMSubtarget.h"
 #include "llvm/Target/TargetFrameLowering.h"

 namespace llvm {
@@ -25,10 +24,7 @@ class ARMFrameLowering : public TargetFrameLowering {
   const ARMSubtarget &STI;

 public:
-  explicit ARMFrameLowering(const ARMSubtarget &sti)
-    : TargetFrameLowering(StackGrowsDown, sti.getStackAlignment(), 0, 4),
-      STI(sti) {
-  }
+  explicit ARMFrameLowering(const ARMSubtarget &sti);

   /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
   /// the function.
diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp
index 08d598d7c5a6..38547cfae2e0 100644
--- a/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -60,22 +60,17 @@ enum AddrMode2Type {
 };

 class ARMDAGToDAGISel : public SelectionDAGISel {
-  ARMBaseTargetMachine &TM;
-
   /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can
   /// make the right decision when generating code for different targets.
   const ARMSubtarget *Subtarget;

 public:
-  explicit ARMDAGToDAGISel(ARMBaseTargetMachine &tm,
-                           CodeGenOpt::Level OptLevel)
-    : SelectionDAGISel(tm, OptLevel), TM(tm),
-      Subtarget(&TM.getSubtarget<ARMSubtarget>()) {
-  }
+  explicit ARMDAGToDAGISel(ARMBaseTargetMachine &tm, CodeGenOpt::Level OptLevel)
+      : SelectionDAGISel(tm, OptLevel) {}

   bool runOnMachineFunction(MachineFunction &MF) override {
     // Reset the subtarget each time through.
-    Subtarget = &TM.getSubtarget<ARMSubtarget>();
+    Subtarget = &MF.getTarget().getSubtarget<ARMSubtarget>();
     SelectionDAGISel::runOnMachineFunction(MF);
     return true;
   }
@@ -429,8 +424,8 @@ bool ARMDAGToDAGISel::hasNoVMLxHazardUse(SDNode *N) const {
   if (Use->getOpcode() == ISD::CopyToReg)
     return true;
   if (Use->isMachineOpcode()) {
-    const ARMBaseInstrInfo *TII =
-      static_cast<const ARMBaseInstrInfo *>(TM.getInstrInfo());
+    const ARMBaseInstrInfo *TII = static_cast<const ARMBaseInstrInfo *>(
+        CurDAG->getTarget().getInstrInfo());

     const MCInstrDesc &MCID = TII->get(Use->getMachineOpcode());
     if (MCID.mayStore())
@@ -2444,7 +2439,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
   case ISD::Constant: {
     unsigned Val = cast<ConstantSDNode>(N)->getZExtValue();
     bool UseCP = true;
-    if (Subtarget->useMovt())
+    if (Subtarget->useMovt(*MF))
       // Thumb2-aware targets have the MOVT instruction, so all immediates can
       // be done with MOV + MOVT, at worst.
       UseCP = false;
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 4a762ce6d3bd..5c14ed6641a3 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -396,8 +396,6 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
       addRegisterClass(MVT::f32, &ARM::SPRRegClass);
       if (!Subtarget->isFPOnlySP())
         addRegisterClass(MVT::f64, &ARM::DPRRegClass);
-
-      setTruncStoreAction(MVT::f64, MVT::f32, Expand);
     }

     for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
@@ -582,8 +580,14 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)

   computeRegisterProperties();

-  // ARM does not have f32 extending load.
+  // ARM does not have floating-point extending loads.
   setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand);
+
+  // ... or truncating stores
+  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
+  setTruncStoreAction(MVT::f64, MVT::f16, Expand);

   // ARM does not have i1 sign extending load.
   setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
@@ -825,10 +829,17 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
       setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
       setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
     }
-    // Special handling for half-precision FP.
+
+    // v8 adds f64 <-> f16 conversion. Before that it should be expanded.
+    if (!Subtarget->hasV8Ops()) {
+      setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
+      setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
+    }
+
+    // fp16 is a special v7 extension that adds f16 <-> f32 conversions.
     if (!Subtarget->hasFP16()) {
-      setOperationAction(ISD::FP16_TO_FP32, MVT::f32, Expand);
-      setOperationAction(ISD::FP32_TO_FP16, MVT::i32, Expand);
+      setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
+      setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
     }
   }

@@ -1654,6 +1665,19 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
       assert(Subtarget->isTargetMachO() && "WrapperPIC use on non-MachO?");
       Callee = DAG.getNode(ARMISD::WrapperPIC, dl, getPointerTy(),
                            DAG.getTargetGlobalAddress(GV, dl, getPointerTy()));
+    } else if (Subtarget->isTargetCOFF()) {
+      assert(Subtarget->isTargetWindows() &&
+             "Windows is the only supported COFF target");
+      unsigned TargetFlags = GV->hasDLLImportStorageClass()
+                                 ? ARMII::MO_DLLIMPORT
+                                 : ARMII::MO_NO_FLAG;
+      Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), /*Offset=*/0,
+                                          TargetFlags);
+      if (GV->hasDLLImportStorageClass())
+        Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
+                             DAG.getNode(ARMISD::Wrapper, dl, getPointerTy(),
+                                         Callee), MachinePointerInfo::getGOT(),
+                             false, false, false, 0);
     } else {
       // On ELF targets for PIC code, direct calls should go through the PLT
       unsigned OpFlags = 0;
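The COFF branch above, and the LowerGlobalAddressWindows hunk below, both apply the same Windows rule: a dllimport global is reached through its import-thunk pointer, named by prefixing "__imp_", so the real address costs one extra load. A small sketch of just the naming side; coffSymbolName is a hypothetical helper, not the LLVM API:

#include <cassert>
#include <string>

std::string coffSymbolName(const std::string &Name, bool IsDLLImport) {
  return IsDLLImport ? "__imp_" + Name : Name;
}

int main() {
  assert(coffSymbolName("func", false) == "func");
  // dllimport: the referenced symbol is the pointer slot, so codegen emits
  // movw/movt of __imp_func and then loads through it.
  assert(coffSymbolName("func", true) == "__imp_func");
  return 0;
}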
@@ -1695,7 +1719,8 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   // FIXME: handle tail calls differently.
   unsigned CallOpc;
-  bool HasMinSizeAttr = Subtarget->isMinSize();
+  bool HasMinSizeAttr = MF.getFunction()->getAttributes().hasAttribute(
+      AttributeSet::FunctionIndex, Attribute::MinSize);
   if (Subtarget->isThumb()) {
     if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps())
       CallOpc = ARMISD::CALL_NOLINK;
@@ -2333,7 +2358,8 @@ ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
   TargetLowering::CallLoweringInfo CLI(DAG);
   CLI.setDebugLoc(dl).setChain(Chain)
     .setCallee(CallingConv::C, Type::getInt32Ty(*DAG.getContext()),
-               DAG.getExternalSymbol("__tls_get_addr", PtrVT), &Args, 0);
+               DAG.getExternalSymbol("__tls_get_addr", PtrVT), std::move(Args),
+               0);

   std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
   return CallResult.first;
@@ -2441,7 +2467,7 @@ SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,

   // If we have T2 ops, we can materialize the address directly via movt/movw
   // pair. This is always cheaper.
-  if (Subtarget->useMovt()) {
+  if (Subtarget->useMovt(DAG.getMachineFunction())) {
     ++NumMovwMovt;
     // FIXME: Once remat is capable of dealing with instructions with register
     // operands, expand this into two nodes.
@@ -2463,7 +2489,7 @@ SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
   const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
   Reloc::Model RelocM = getTargetMachine().getRelocationModel();

-  if (Subtarget->useMovt())
+  if (Subtarget->useMovt(DAG.getMachineFunction()))
     ++NumMovwMovt;

   // FIXME: Once remat is capable of dealing with instructions with register
@@ -2483,18 +2509,27 @@ SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
 SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op,
                                                      SelectionDAG &DAG) const {
   assert(Subtarget->isTargetWindows() && "non-Windows COFF is not supported");
-  assert(Subtarget->useMovt() && "Windows on ARM expects to use movw/movt");
+  assert(Subtarget->useMovt(DAG.getMachineFunction()) &&
+         "Windows on ARM expects to use movw/movt");

   const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+  const ARMII::TOF TargetFlags =
+    (GV->hasDLLImportStorageClass() ? ARMII::MO_DLLIMPORT : ARMII::MO_NO_FLAG);
   EVT PtrVT = getPointerTy();
+  SDValue Result;
   SDLoc DL(Op);

   ++NumMovwMovt;

   // FIXME: Once remat is capable of dealing with instructions with register
   // operands, expand this into two nodes.
-  return DAG.getNode(ARMISD::Wrapper, DL, PtrVT,
-                     DAG.getTargetGlobalAddress(GV, DL, PtrVT));
+  Result = DAG.getNode(ARMISD::Wrapper, DL, PtrVT,
+                       DAG.getTargetGlobalAddress(GV, DL, PtrVT, /*Offset=*/0,
+                                                  TargetFlags));
+  if (GV->hasDLLImportStorageClass())
+    Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
+                         MachinePointerInfo::getGOT(), false, false, false, 0);
+  return Result;
 }

 SDValue ARMTargetLowering::LowerGLOBAL_OFFSET_TABLE(SDValue Op,
@@ -4504,6 +4539,11 @@ static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
       BitMask <<= 8;
       ImmMask <<= 1;
     }
+
+    if (DAG.getTargetLoweringInfo().isBigEndian())
+      // swap higher and lower 32 bit word
+      Imm = ((Imm & 0xf) << 4) | ((Imm & 0xf0) >> 4);
+
     // Op=1, Cmode=1110.
     OpCmode = 0x1e;
     VT = is128Bits ?
MVT::v2i64 : MVT::v1i64; @@ -6090,7 +6130,7 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl).setChain(DAG.getEntryNode()) .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()), Callee, - &Args, 0) + std::move(Args), 0) .setDiscardResult(); std::pair CallResult = LowerCallTo(CLI); @@ -7147,7 +7187,7 @@ ARMTargetLowering::EmitLowered__chkstk(MachineInstr *MI, // thumb-2 environment, so there is no interworking required. As a result, we // do not expect a veneer to be emitted by the linker, clobbering IP. // - // Each module recieves its own copy of __chkstk, so no import thunk is + // Each module receives its own copy of __chkstk, so no import thunk is // required, again, ensuring that IP is not clobbered. // // Finally, although some linkers may theoretically provide a trampoline for @@ -7187,8 +7227,7 @@ ARMTargetLowering::EmitLowered__chkstk(MachineInstr *MI, AddDefaultCC(AddDefaultPred(BuildMI(*MBB, MI, DL, TII.get(ARM::t2SUBrr), ARM::SP) - .addReg(ARM::SP, RegState::Define) - .addReg(ARM::R4, RegState::Kill))); + .addReg(ARM::SP).addReg(ARM::R4))); MI->eraseFromParent(); return MBB; @@ -10559,7 +10598,7 @@ SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl).setChain(InChain) - .setCallee(getLibcallCallingConv(LC), RetTy, Callee, &Args, 0) + .setCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args), 0) .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned); std::pair CallInfo = LowerCallTo(CLI); @@ -10582,7 +10621,7 @@ ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const Chain = DAG.getCopyToReg(Chain, DL, ARM::R4, Words, Flag); Flag = Chain.getValue(1); - SDVTList NodeTys = DAG.getVTList(MVT::i32, MVT::Glue); + SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); Chain = DAG.getNode(ARMISD::WIN__CHKSTK, DL, NodeTys, Chain, Flag); SDValue NewSP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32); diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td index 03eac2e0c84f..f0e145a394b2 100644 --- a/lib/Target/ARM/ARMInstrInfo.td +++ b/lib/Target/ARM/ARMInstrInfo.td @@ -270,8 +270,8 @@ def UseNaClTrap : Predicate<"Subtarget->useNaClTrap()">, def DontUseNaClTrap : Predicate<"!Subtarget->useNaClTrap()">; // FIXME: Eventually this will be just "hasV6T2Ops". -def UseMovt : Predicate<"Subtarget->useMovt()">; -def DontUseMovt : Predicate<"!Subtarget->useMovt()">; +def UseMovt : Predicate<"Subtarget->useMovt(*MF)">; +def DontUseMovt : Predicate<"!Subtarget->useMovt(*MF)">; def UseFPVMLx : Predicate<"Subtarget->useFPVMLx()">; def UseMulOps : Predicate<"Subtarget->useMulOps()">; @@ -594,7 +594,7 @@ def so_imm2part : PatLeaf<(imm), [{ /// arm_i32imm - True for +V6T2, or true only if so_imm2part is true. 
/// def arm_i32imm : PatLeaf<(imm), [{ - if (Subtarget->useMovt()) + if (Subtarget->useMovt(*MF)) return true; return ARM_AM::isSOImmTwoPartVal((unsigned)N->getZExtValue()); }]>; @@ -2708,7 +2708,8 @@ multiclass AI2_stridx { + opc, "\t$Rt, $addr!", + "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []> { bits<17> addr; let Inst{25} = 0; let Inst{23} = addr{12}; // U (add = ('U' == 1)) @@ -2720,7 +2721,8 @@ multiclass AI2_stridx { + opc, "\t$Rt, $addr!", + "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []> { bits<17> addr; let Inst{25} = 1; let Inst{23} = addr{12}; // U (add = ('U' == 1)) @@ -2733,7 +2735,7 @@ multiclass AI2_stridx { + "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []> { // {12} isAdd // {11-0} imm12/Rm bits<14> offset; @@ -2751,7 +2753,7 @@ multiclass AI2_stridx { + "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []> { // {12} isAdd // {11-0} imm12/Rm bits<14> offset; @@ -3334,8 +3336,8 @@ def SBFX : I<(outs GPRnopc:$Rd), let Inst{3-0} = Rn; } -def UBFX : I<(outs GPR:$Rd), - (ins GPR:$Rn, imm0_31:$lsb, imm1_32:$width), +def UBFX : I<(outs GPRnopc:$Rd), + (ins GPRnopc:$Rn, imm0_31:$lsb, imm1_32:$width), AddrMode1, 4, IndexModeNone, DPFrm, IIC_iUNAsi, "ubfx", "\t$Rd, $Rn, $lsb, $width", "", []>, Requires<[IsARM, HasV6T2]> { @@ -4443,7 +4445,7 @@ def instsyncb_opt : Operand { let DecoderMethod = "DecodeInstSyncBarrierOption"; } -// memory barriers protect the atomic sequences +// Memory barriers protect the atomic sequences let hasSideEffects = 1 in { def DMB : AInoP<(outs), (ins memb_opt:$opt), MiscFrm, NoItinerary, "dmb", "\t$opt", [(int_arm_dmb (i32 imm0_15:$opt))]>, @@ -4452,7 +4454,6 @@ def DMB : AInoP<(outs), (ins memb_opt:$opt), MiscFrm, NoItinerary, let Inst{31-4} = 0xf57ff05; let Inst{3-0} = opt; } -} def DSB : AInoP<(outs), (ins memb_opt:$opt), MiscFrm, NoItinerary, "dsb", "\t$opt", [(int_arm_dsb (i32 imm0_15:$opt))]>, @@ -4464,12 +4465,13 @@ def DSB : AInoP<(outs), (ins memb_opt:$opt), MiscFrm, NoItinerary, // ISB has only full system option def ISB : AInoP<(outs), (ins instsyncb_opt:$opt), MiscFrm, NoItinerary, - "isb", "\t$opt", []>, + "isb", "\t$opt", [(int_arm_isb (i32 imm0_15:$opt))]>, Requires<[IsARM, HasDB]> { bits<4> opt; let Inst{31-4} = 0xf57ff06; let Inst{3-0} = opt; } +} let usesCustomInserter = 1, Defs = [CPSR] in { @@ -5113,9 +5115,11 @@ let usesCustomInserter = 1, Uses = [R4], Defs = [R4, SP] in // __aeabi_read_tp preserves the registers r1-r3. // This is a pseudo inst so that we can get the encoding right, // complete with fixup for the aeabi_read_tp function. +// TPsoft is valid for ARM mode only, in case of Thumb mode a tTPsoft pattern +// is defined in "ARMInstrThumb.td". let isCall = 1, Defs = [R0, R12, LR, CPSR], Uses = [SP] in { - def TPsoft : PseudoInst<(outs), (ins), IIC_Br, + def TPsoft : ARMPseudoInst<(outs), (ins), 4, IIC_Br, [(set R0, ARMthread_pointer)]>, Sched<[WriteBr]>; } diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td index b32b5d24af19..c02bb3b55f5b 100644 --- a/lib/Target/ARM/ARMInstrNEON.td +++ b/lib/Target/ARM/ARMInstrNEON.td @@ -6372,6 +6372,32 @@ multiclass Lengthen_HalfSingle; } +// The following class definition is basically a copy of the +// Lengthen_HalfSingle definition above, however with an additional parameter +// "RevLanes" to select the correct VREV32dXX instruction. This is to convert +// data loaded by VLD1LN into proper vector format in big endian mode. 
+multiclass Lengthen_HalfSingle_Big_Endian { + def _Any : Pat<(!cast("v" # DestLanes # DestTy) + (!cast("extloadv" # SrcTy) addrmode6oneL32:$addr)), + (EXTRACT_SUBREG (!cast("VMOVLuv" # InsnLanes # InsnTy) + (!cast("VREV32d" # RevLanes) + (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))), + dsub_0)>; + def _Z : Pat<(!cast("v" # DestLanes # DestTy) + (!cast("zextloadv" # SrcTy) addrmode6oneL32:$addr)), + (EXTRACT_SUBREG (!cast("VMOVLuv" # InsnLanes # InsnTy) + (!cast("VREV32d" # RevLanes) + (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))), + dsub_0)>; + def _S : Pat<(!cast("v" # DestLanes # DestTy) + (!cast("sextloadv" # SrcTy) addrmode6oneL32:$addr)), + (EXTRACT_SUBREG (!cast("VMOVLsv" # InsnLanes # InsnTy) + (!cast("VREV32d" # RevLanes) + (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))), + dsub_0)>; +} + // extload, zextload and sextload for a lengthening load followed by another // lengthening load, to quadruple the initial length. // @@ -6406,6 +6432,36 @@ multiclass Lengthen_Double; } +// The following class definition is basically a copy of the +// Lengthen_Double definition above, however with an additional parameter +// "RevLanes" to select the correct VREV32dXX instruction. This is to convert +// data loaded by VLD1LN into proper vector format in big endian mode. +multiclass Lengthen_Double_Big_Endian { + def _Any : Pat<(!cast("v" # DestLanes # DestTy) + (!cast("extloadv" # SrcTy) addrmode6oneL32:$addr)), + (!cast("VMOVLuv" # Insn2Lanes # Insn2Ty) + (EXTRACT_SUBREG (!cast("VMOVLuv" # Insn1Lanes # Insn1Ty) + (!cast("VREV32d" # RevLanes) + (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))), + dsub_0))>; + def _Z : Pat<(!cast("v" # DestLanes # DestTy) + (!cast("zextloadv" # SrcTy) addrmode6oneL32:$addr)), + (!cast("VMOVLuv" # Insn2Lanes # Insn2Ty) + (EXTRACT_SUBREG (!cast("VMOVLuv" # Insn1Lanes # Insn1Ty) + (!cast("VREV32d" # RevLanes) + (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))), + dsub_0))>; + def _S : Pat<(!cast("v" # DestLanes # DestTy) + (!cast("sextloadv" # SrcTy) addrmode6oneL32:$addr)), + (!cast("VMOVLsv" # Insn2Lanes # Insn2Ty) + (EXTRACT_SUBREG (!cast("VMOVLsv" # Insn1Lanes # Insn1Ty) + (!cast("VREV32d" # RevLanes) + (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))), + dsub_0))>; +} + // extload, zextload and sextload for a lengthening load followed by another // lengthening load, to quadruple the initial length, but which ends up only // requiring half the available lanes (a 64-bit outcome instead of a 128-bit). @@ -6443,33 +6499,102 @@ multiclass Lengthen_HalfDouble; } +// The following class definition is basically a copy of the +// Lengthen_HalfDouble definition above, however with an additional VREV16d8 +// instruction to convert data loaded by VLD1LN into proper vector format +// in big endian mode. 
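Aside: the `_Big_Endian` multiclasses insert a VREV32 between the lane load and the widening VMOVL moves. As a rough scalar model (an illustration written for this note, not code from the tree) of what VREV32.8 does to a 64-bit d-register viewed as bytes:

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>

// Scalar model of NEON VREV32.8: reverse the bytes inside each 32-bit
// word of a 64-bit d-register. After a big-endian VLD1LN, this puts the
// elements back into the lane order the little-endian patterns expect.
void vrev32_8(uint8_t d[8]) {
  std::reverse(d, d + 4);     // first 32-bit word
  std::reverse(d + 4, d + 8); // second 32-bit word
}

int main() {
  uint8_t d[8] = {0, 1, 2, 3, 4, 5, 6, 7};
  vrev32_8(d);
  for (uint8_t b : d)
    std::printf("%d ", b); // prints: 3 2 1 0 7 6 5 4
  std::printf("\n");
}
```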
+multiclass Lengthen_HalfDouble_Big_Endian { + def _Any : Pat<(!cast("v" # DestLanes # DestTy) + (!cast("extloadv" # SrcTy) addrmode6:$addr)), + (EXTRACT_SUBREG (!cast("VMOVLuv" # Insn2Lanes # Insn2Ty) + (EXTRACT_SUBREG (!cast("VMOVLuv" # Insn1Lanes # Insn1Ty) + (!cast("VREV16d8") + (VLD1LNd16 addrmode6:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))), + dsub_0)), + dsub_0)>; + def _Z : Pat<(!cast("v" # DestLanes # DestTy) + (!cast("zextloadv" # SrcTy) addrmode6:$addr)), + (EXTRACT_SUBREG (!cast("VMOVLuv" # Insn2Lanes # Insn2Ty) + (EXTRACT_SUBREG (!cast("VMOVLuv" # Insn1Lanes # Insn1Ty) + (!cast("VREV16d8") + (VLD1LNd16 addrmode6:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))), + dsub_0)), + dsub_0)>; + def _S : Pat<(!cast("v" # DestLanes # DestTy) + (!cast("sextloadv" # SrcTy) addrmode6:$addr)), + (EXTRACT_SUBREG (!cast("VMOVLsv" # Insn2Lanes # Insn2Ty) + (EXTRACT_SUBREG (!cast("VMOVLsv" # Insn1Lanes # Insn1Ty) + (!cast("VREV16d8") + (VLD1LNd16 addrmode6:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))), + dsub_0)), + dsub_0)>; +} + defm : Lengthen_Single<"8", "i16", "8">; // v8i8 -> v8i16 defm : Lengthen_Single<"4", "i32", "16">; // v4i16 -> v4i32 defm : Lengthen_Single<"2", "i64", "32">; // v2i32 -> v2i64 -defm : Lengthen_HalfSingle<"4", "i16", "i8", "8", "i16">; // v4i8 -> v4i16 -defm : Lengthen_HalfSingle<"2", "i32", "i16", "4", "i32">; // v2i16 -> v2i32 +let Predicates = [IsLE] in { + defm : Lengthen_HalfSingle<"4", "i16", "i8", "8", "i16">; // v4i8 -> v4i16 + defm : Lengthen_HalfSingle<"2", "i32", "i16", "4", "i32">; // v2i16 -> v2i32 -// Double lengthening - v4i8 -> v4i16 -> v4i32 -defm : Lengthen_Double<"4", "i32", "i8", "8", "i16", "4", "i32">; -// v2i8 -> v2i16 -> v2i32 -defm : Lengthen_HalfDouble<"2", "i32", "i8", "8", "i16", "4", "i32">; -// v2i16 -> v2i32 -> v2i64 -defm : Lengthen_Double<"2", "i64", "i16", "4", "i32", "2", "i64">; + // Double lengthening - v4i8 -> v4i16 -> v4i32 + defm : Lengthen_Double<"4", "i32", "i8", "8", "i16", "4", "i32">; + // v2i8 -> v2i16 -> v2i32 + defm : Lengthen_HalfDouble<"2", "i32", "i8", "8", "i16", "4", "i32">; + // v2i16 -> v2i32 -> v2i64 + defm : Lengthen_Double<"2", "i64", "i16", "4", "i32", "2", "i64">; +} + +let Predicates = [IsBE] in { + defm : Lengthen_HalfSingle_Big_Endian<"4", "i16", "i8", "8", "i16", "8">; // v4i8 -> v4i16 + defm : Lengthen_HalfSingle_Big_Endian<"2", "i32", "i16", "4", "i32", "16">; // v2i16 -> v2i32 + + // Double lengthening - v4i8 -> v4i16 -> v4i32 + defm : Lengthen_Double_Big_Endian<"4", "i32", "i8", "8", "i16", "4", "i32", "8">; + // v2i8 -> v2i16 -> v2i32 + defm : Lengthen_HalfDouble_Big_Endian<"2", "i32", "i8", "8", "i16", "4", "i32">; + // v2i16 -> v2i32 -> v2i64 + defm : Lengthen_Double_Big_Endian<"2", "i64", "i16", "4", "i32", "2", "i64", "16">; +} // Triple lengthening - v2i8 -> v2i16 -> v2i32 -> v2i64 -def : Pat<(v2i64 (extloadvi8 addrmode6:$addr)), - (VMOVLuv2i64 (EXTRACT_SUBREG (VMOVLuv4i32 (EXTRACT_SUBREG (VMOVLuv8i16 - (VLD1LNd16 addrmode6:$addr, - (f64 (IMPLICIT_DEF)), (i32 0))), dsub_0)), dsub_0))>; -def : Pat<(v2i64 (zextloadvi8 addrmode6:$addr)), - (VMOVLuv2i64 (EXTRACT_SUBREG (VMOVLuv4i32 (EXTRACT_SUBREG (VMOVLuv8i16 - (VLD1LNd16 addrmode6:$addr, - (f64 (IMPLICIT_DEF)), (i32 0))), dsub_0)), dsub_0))>; -def : Pat<(v2i64 (sextloadvi8 addrmode6:$addr)), - (VMOVLsv2i64 (EXTRACT_SUBREG (VMOVLsv4i32 (EXTRACT_SUBREG (VMOVLsv8i16 - (VLD1LNd16 addrmode6:$addr, - (f64 (IMPLICIT_DEF)), (i32 0))), dsub_0)), dsub_0))>; +let Predicates = [IsLE] in { + def : Pat<(v2i64 (extloadvi8 addrmode6:$addr)), + (VMOVLuv2i64 (EXTRACT_SUBREG 
(VMOVLuv4i32 (EXTRACT_SUBREG (VMOVLuv8i16 + (VLD1LNd16 addrmode6:$addr, + (f64 (IMPLICIT_DEF)), (i32 0))), dsub_0)), dsub_0))>; + def : Pat<(v2i64 (zextloadvi8 addrmode6:$addr)), + (VMOVLuv2i64 (EXTRACT_SUBREG (VMOVLuv4i32 (EXTRACT_SUBREG (VMOVLuv8i16 + (VLD1LNd16 addrmode6:$addr, + (f64 (IMPLICIT_DEF)), (i32 0))), dsub_0)), dsub_0))>; + def : Pat<(v2i64 (sextloadvi8 addrmode6:$addr)), + (VMOVLsv2i64 (EXTRACT_SUBREG (VMOVLsv4i32 (EXTRACT_SUBREG (VMOVLsv8i16 + (VLD1LNd16 addrmode6:$addr, + (f64 (IMPLICIT_DEF)), (i32 0))), dsub_0)), dsub_0))>; +} +// The following patterns are basically a copy of the patterns above, +// however with an additional VREV16d instruction to convert data +// loaded by VLD1LN into proper vector format in big endian mode. +let Predicates = [IsBE] in { + def : Pat<(v2i64 (extloadvi8 addrmode6:$addr)), + (VMOVLuv2i64 (EXTRACT_SUBREG (VMOVLuv4i32 (EXTRACT_SUBREG (VMOVLuv8i16 + (!cast("VREV16d8") + (VLD1LNd16 addrmode6:$addr, + (f64 (IMPLICIT_DEF)), (i32 0)))), dsub_0)), dsub_0))>; + def : Pat<(v2i64 (zextloadvi8 addrmode6:$addr)), + (VMOVLuv2i64 (EXTRACT_SUBREG (VMOVLuv4i32 (EXTRACT_SUBREG (VMOVLuv8i16 + (!cast("VREV16d8") + (VLD1LNd16 addrmode6:$addr, + (f64 (IMPLICIT_DEF)), (i32 0)))), dsub_0)), dsub_0))>; + def : Pat<(v2i64 (sextloadvi8 addrmode6:$addr)), + (VMOVLsv2i64 (EXTRACT_SUBREG (VMOVLsv4i32 (EXTRACT_SUBREG (VMOVLsv8i16 + (!cast("VREV16d8") + (VLD1LNd16 addrmode6:$addr, + (f64 (IMPLICIT_DEF)), (i32 0)))), dsub_0)), dsub_0))>; +} //===----------------------------------------------------------------------===// // Assembler aliases diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td index c30d6abbb299..85e93516807b 100644 --- a/lib/Target/ARM/ARMInstrThumb2.td +++ b/lib/Target/ARM/ARMInstrThumb2.td @@ -3209,27 +3209,28 @@ def t2MOVCCi32imm let hasSideEffects = 1 in { def t2DMB : T2I<(outs), (ins memb_opt:$opt), NoItinerary, "dmb", "\t$opt", [(int_arm_dmb (i32 imm0_15:$opt))]>, - Requires<[HasDB]> { + Requires<[IsThumb, HasDB]> { bits<4> opt; let Inst{31-4} = 0xf3bf8f5; let Inst{3-0} = opt; } -} def t2DSB : T2I<(outs), (ins memb_opt:$opt), NoItinerary, "dsb", "\t$opt", [(int_arm_dsb (i32 imm0_15:$opt))]>, - Requires<[HasDB]> { + Requires<[IsThumb, HasDB]> { bits<4> opt; let Inst{31-4} = 0xf3bf8f4; let Inst{3-0} = opt; } def t2ISB : T2I<(outs), (ins instsyncb_opt:$opt), NoItinerary, - "isb", "\t$opt", []>, Requires<[HasDB]> { + "isb", "\t$opt", [(int_arm_isb (i32 imm0_15:$opt))]>, + Requires<[IsThumb, HasDB]> { bits<4> opt; let Inst{31-4} = 0xf3bf8f6; let Inst{3-0} = opt; } +} class T2I_ldrex opcod, dag oops, dag iops, AddrMode am, int sz, InstrItinClass itin, string opc, string asm, string cstr, diff --git a/lib/Target/ARM/ARMInstrVFP.td b/lib/Target/ARM/ARMInstrVFP.td index 1d7802a2c7fb..55a6efcb4c04 100644 --- a/lib/Target/ARM/ARMInstrVFP.td +++ b/lib/Target/ARM/ARMInstrVFP.td @@ -551,12 +551,6 @@ def VCVTBSH: ASuI<0b11101, 0b11, 0b0011, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm), /* FIXME */ IIC_fpCVTHS, "vcvtb", ".f16.f32\t$Sd, $Sm", [/* For disassembly only; pattern left blank */]>; -def : Pat<(f32_to_f16 SPR:$a), - (i32 (COPY_TO_REGCLASS (VCVTBSH SPR:$a), GPR))>; - -def : Pat<(f16_to_f32 GPR:$a), - (VCVTBHS (COPY_TO_REGCLASS GPR:$a, SPR))>; - def VCVTTHS: ASuI<0b11101, 0b11, 0b0010, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sm), /* FIXME */ IIC_fpCVTSH, "vcvtt", ".f32.f16\t$Sd, $Sm", [/* For disassembly only; pattern left blank */]>; @@ -619,6 +613,19 @@ def VCVTTDH : ADuI<0b11101, 0b11, 0b0011, 0b11, 0, let Inst{5} = Dm{4}; } +def : 
Pat<(fp_to_f16 SPR:$a), + (i32 (COPY_TO_REGCLASS (VCVTBSH SPR:$a), GPR))>; + +def : Pat<(fp_to_f16 (f64 DPR:$a)), + (i32 (COPY_TO_REGCLASS (VCVTBDH DPR:$a), GPR))>; + +def : Pat<(f16_to_fp GPR:$a), + (VCVTBHS (COPY_TO_REGCLASS GPR:$a, SPR))>; + +def : Pat<(f64 (f16_to_fp GPR:$a)), + (VCVTBHD (COPY_TO_REGCLASS GPR:$a, SPR))>; + + multiclass vcvt_inst<string opc, bits<2> rm> { let PostEncoderMethod = "", DecoderNamespace = "VFPV8" in { def SS : ASuInp<0b11101, 0b11, 0b1100, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sm), diff --git a/lib/Target/ARM/ARMJITInfo.cpp b/lib/Target/ARM/ARMJITInfo.cpp index 8821c2dd09f8..6d1114d51aab 100644 --- a/lib/Target/ARM/ARMJITInfo.cpp +++ b/lib/Target/ARM/ARMJITInfo.cpp @@ -13,6 +13,7 @@ #include "ARMJITInfo.h" #include "ARMConstantPoolValue.h" +#include "ARMMachineFunctionInfo.h" #include "ARMRelocations.h" #include "MCTargetDesc/ARMBaseInfo.h" #include "llvm/CodeGen/JITCodeEmitter.h" @@ -334,3 +335,10 @@ void ARMJITInfo::relocate(void *Function, MachineRelocation *MR, } } } + +void ARMJITInfo::Initialize(const MachineFunction &MF, bool isPIC) { + const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + ConstPoolId2AddrMap.resize(AFI->getNumPICLabels()); + JumpTableId2AddrMap.resize(AFI->getNumJumpTables()); + IsPIC = isPIC; +} diff --git a/lib/Target/ARM/ARMJITInfo.h b/lib/Target/ARM/ARMJITInfo.h index ee4c863543e7..27e2a2013404 100644 --- a/lib/Target/ARM/ARMJITInfo.h +++ b/lib/Target/ARM/ARMJITInfo.h @@ -14,7 +14,6 @@ #ifndef ARMJITINFO_H #define ARMJITINFO_H -#include "ARMMachineFunctionInfo.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/MachineConstantPool.h" @@ -103,12 +102,7 @@ namespace llvm { /// Resize constant pool ids to CONSTPOOL_ENTRY addresses map; resize /// jump table ids to jump table bases map; remember if codegen relocation /// model is PIC. - void Initialize(const MachineFunction &MF, bool isPIC) { - const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); - ConstPoolId2AddrMap.resize(AFI->getNumPICLabels()); - JumpTableId2AddrMap.resize(AFI->getNumJumpTables()); - IsPIC = isPIC; - } + void Initialize(const MachineFunction &MF, bool isPIC); /// getConstantPoolEntryAddr - The ARM target puts all constant /// pool entries into constant islands.
This returns the address of the diff --git a/lib/Target/ARM/ARMMCInstLower.cpp b/lib/Target/ARM/ARMMCInstLower.cpp index 48141b14e137..023f5f8e37a0 100644 --- a/lib/Target/ARM/ARMMCInstLower.cpp +++ b/lib/Target/ARM/ARMMCInstLower.cpp @@ -34,7 +34,7 @@ MCOperand ARMAsmPrinter::GetSymbolRef(const MachineOperand &MO, OutContext); switch (Option) { default: llvm_unreachable("Unknown target flag on symbol operand"); - case 0: + case ARMII::MO_NO_FLAG: break; case ARMII::MO_LO16: Expr = MCSymbolRefExpr::Create(Symbol, MCSymbolRefExpr::VK_None, diff --git a/lib/Target/ARM/ARMMachineFunctionInfo.cpp b/lib/Target/ARM/ARMMachineFunctionInfo.cpp index af445e2f35aa..892b269fc181 100644 --- a/lib/Target/ARM/ARMMachineFunctionInfo.cpp +++ b/lib/Target/ARM/ARMMachineFunctionInfo.cpp @@ -12,3 +12,13 @@ using namespace llvm; void ARMFunctionInfo::anchor() { } + +ARMFunctionInfo::ARMFunctionInfo(MachineFunction &MF) + : isThumb(MF.getTarget().getSubtarget<ARMSubtarget>().isThumb()), + hasThumb2(MF.getTarget().getSubtarget<ARMSubtarget>().hasThumb2()), + StByValParamsPadding(0), ArgRegsSaveSize(0), HasStackFrame(false), + RestoreSPFromFP(false), LRSpilledForFarJump(false), + FramePtrSpillOffset(0), GPRCS1Offset(0), GPRCS2Offset(0), DPRCSOffset(0), + GPRCS1Size(0), GPRCS2Size(0), DPRCSSize(0), JumpTableUId(0), + PICLabelUId(0), VarArgsFrameIndex(0), HasITBlocks(false), + GlobalBaseReg(0) {} diff --git a/lib/Target/ARM/ARMMachineFunctionInfo.h b/lib/Target/ARM/ARMMachineFunctionInfo.h index 7934761af25b..d3fabc3ebb04 100644 --- a/lib/Target/ARM/ARMMachineFunctionInfo.h +++ b/lib/Target/ARM/ARMMachineFunctionInfo.h @@ -19,6 +19,7 @@ #include "llvm/CodeGen/MachineFunction.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/ADT/DenseMap.h" namespace llvm { @@ -118,6 +119,10 @@ class ARMFunctionInfo : public MachineFunctionInfo { /// being passed on the stack unsigned ArgumentStackSize; + /// CoalescedWeights - mapping of basic blocks to the rolling counter of + /// coalesced weights.
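Aside: the accessor added a few lines below (`getCoalescedWeight`) is the classic find-or-insert idiom over this map. A self-contained sketch of the same idiom, with `std::unordered_map` standing in for `llvm::DenseMap`:

```cpp
#include <cstdio>
#include <unordered_map>

struct Block; // stand-in for MachineBasicBlock

using WeightMap = std::unordered_map<const Block *, unsigned>;

// Find the entry for MBB, creating it with weight 0 on first use, and
// hand back an iterator so the caller can update the rolling counter.
WeightMap::iterator getCoalescedWeight(WeightMap &Weights, const Block *MBB) {
  auto It = Weights.find(MBB);
  if (It == Weights.end())
    It = Weights.insert({MBB, 0u}).first;
  return It;
}

int main() {
  Block *B = nullptr; // a null key is fine for the sketch
  WeightMap Weights;
  auto It = getCoalescedWeight(Weights, B);
  It->second += 5; // bump the rolling counter
  std::printf("weight = %u\n", getCoalescedWeight(Weights, B)->second);
}
```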
+ DenseMap CoalescedWeights; + public: ARMFunctionInfo() : isThumb(false), @@ -130,16 +135,7 @@ class ARMFunctionInfo : public MachineFunctionInfo { JumpTableUId(0), PICLabelUId(0), VarArgsFrameIndex(0), HasITBlocks(false), GlobalBaseReg(0) {} - explicit ARMFunctionInfo(MachineFunction &MF) : - isThumb(MF.getTarget().getSubtarget().isThumb()), - hasThumb2(MF.getTarget().getSubtarget().hasThumb2()), - StByValParamsPadding(0), - ArgRegsSaveSize(0), HasStackFrame(false), RestoreSPFromFP(false), - LRSpilledForFarJump(false), - FramePtrSpillOffset(0), GPRCS1Offset(0), GPRCS2Offset(0), DPRCSOffset(0), - GPRCS1Size(0), GPRCS2Size(0), DPRCSSize(0), - JumpTableUId(0), PICLabelUId(0), - VarArgsFrameIndex(0), HasITBlocks(false), GlobalBaseReg(0) {} + explicit ARMFunctionInfo(MachineFunction &MF); bool isThumbFunction() const { return isThumb; } bool isThumb1OnlyFunction() const { return isThumb && !hasThumb2; } @@ -230,6 +226,15 @@ class ARMFunctionInfo : public MachineFunctionInfo { else return -1U; } + + DenseMap::iterator getCoalescedWeight( + MachineBasicBlock* MBB) { + auto It = CoalescedWeights.find(MBB); + if (It == CoalescedWeights.end()) { + It = CoalescedWeights.insert(std::make_pair(MBB, 0)).first; + } + return It; + } }; } // End llvm namespace diff --git a/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/lib/Target/ARM/ARMSelectionDAGInfo.cpp index c3a9131fb767..3dcc0df349d2 100644 --- a/lib/Target/ARM/ARMSelectionDAGInfo.cpp +++ b/lib/Target/ARM/ARMSelectionDAGInfo.cpp @@ -191,7 +191,7 @@ EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, .setCallee(TLI.getLibcallCallingConv(RTLIB::MEMSET), Type::getVoidTy(*DAG.getContext()), DAG.getExternalSymbol(TLI.getLibcallName(RTLIB::MEMSET), - TLI.getPointerTy()), &Args, 0) + TLI.getPointerTy()), std::move(Args), 0) .setDiscardResult(); std::pair CallResult = TLI.LowerCallTo(CLI); diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp index fc842512ef00..c1b4562f4118 100644 --- a/lib/Target/ARM/ARMSubtarget.cpp +++ b/lib/Target/ARM/ARMSubtarget.cpp @@ -12,14 +12,24 @@ //===----------------------------------------------------------------------===// #include "ARMSubtarget.h" -#include "ARMBaseInstrInfo.h" -#include "ARMBaseRegisterInfo.h" +#include "ARMFrameLowering.h" +#include "ARMISelLowering.h" +#include "ARMInstrInfo.h" +#include "ARMJITInfo.h" +#include "ARMSelectionDAGInfo.h" +#include "ARMSubtarget.h" +#include "ARMMachineFunctionInfo.h" +#include "Thumb1FrameLowering.h" +#include "Thumb1InstrInfo.h" +#include "Thumb2InstrInfo.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalValue.h" #include "llvm/Support/CommandLine.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetOptions.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" using namespace llvm; @@ -142,13 +152,22 @@ ARMSubtarget &ARMSubtarget::initializeSubtargetDependencies(StringRef CPU, } ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &CPU, - const std::string &FS, bool IsLittle, - const TargetOptions &Options) + const std::string &FS, TargetMachine &TM, + bool IsLittle, const TargetOptions &Options) : ARMGenSubtargetInfo(TT, CPU, FS), ARMProcFamily(Others), ARMProcClass(None), stackAlignment(4), CPUString(CPU), IsLittle(IsLittle), TargetTriple(TT), Options(Options), TargetABI(ARM_ABI_UNKNOWN), DL(computeDataLayout(initializeSubtargetDependencies(CPU, FS))), - TSInfo(DL) {} + TSInfo(DL), JITInfo(), + InstrInfo(isThumb1Only() + ? 
(ARMBaseInstrInfo *)new Thumb1InstrInfo(*this) + : !isThumb() + ? (ARMBaseInstrInfo *)new ARMInstrInfo(*this) + : (ARMBaseInstrInfo *)new Thumb2InstrInfo(*this)), + TLInfo(TM), + FrameLowering(!isThumb1Only() + ? new ARMFrameLowering(*this) + : (ARMFrameLowering *)new Thumb1FrameLowering(*this)) {} void ARMSubtarget::initializeEnvironment() { HasV4TOps = false; @@ -164,7 +183,6 @@ void ARMSubtarget::initializeEnvironment() { HasVFPv4 = false; HasFPARMv8 = false; HasNEON = false; - MinSize = false; UseNEONForSinglePrecisionFP = false; UseMulOps = UseFusedMulOps; SlowFPVMLx = false; @@ -173,7 +191,6 @@ void ARMSubtarget::initializeEnvironment() { InThumbMode = false; HasThumb2 = false; NoARM = false; - PostRAScheduler = false; IsR9Reserved = ReserveR9; UseMovt = false; SupportsTailCall = false; @@ -215,9 +232,6 @@ void ARMSubtarget::resetSubtargetFeatures(const MachineFunction *MF) { initializeEnvironment(); resetSubtargetFeatures(CPU, FS); } - - MinSize = - FnAttrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize); } void ARMSubtarget::resetSubtargetFeatures(StringRef CPU, StringRef FS) { @@ -293,9 +307,6 @@ void ARMSubtarget::resetSubtargetFeatures(StringRef CPU, StringRef FS) { SupportsTailCall = !isThumb1Only(); } - if (!isThumb() || hasThumb2()) - PostRAScheduler = true; - switch (Align) { case DefaultAlign: // Assume pre-ARMv6 doesn't support unaligned accesses. @@ -410,17 +421,20 @@ bool ARMSubtarget::hasSinCos() const { !getTargetTriple().isOSVersionLT(7, 0); } -// Enable the PostMachineScheduler if the target selects it instead of -// PostRAScheduler. Currently only available on the command line via -// -misched-postra. +// This overrides the PostRAScheduler bit in the SchedModel for any CPU. bool ARMSubtarget::enablePostMachineScheduler() const { - return PostRAScheduler; + return (!isThumb() || hasThumb2()); +} + +bool ARMSubtarget::enableAtomicExpandLoadLinked() const { + return hasAnyDataBarrier() && !isThumb1Only(); } -bool ARMSubtarget::enablePostRAScheduler( - CodeGenOpt::Level OptLevel, - TargetSubtargetInfo::AntiDepBreakMode& Mode, - RegClassVector& CriticalPathRCs) const { - Mode = TargetSubtargetInfo::ANTIDEP_NONE; - return PostRAScheduler && OptLevel >= CodeGenOpt::Default; +bool ARMSubtarget::useMovt(const MachineFunction &MF) const { + // NOTE Windows on ARM needs to use mov.w/mov.t pairs to materialise 32-bit + // immediates as it is inherently position independent, and may be out of + // range otherwise. + return UseMovt && (isTargetWindows() || + !MF.getFunction()->getAttributes().hasAttribute( + AttributeSet::FunctionIndex, Attribute::MinSize)); } diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h index 7da80ec0d494..f8283b08d485 100644 --- a/lib/Target/ARM/ARMSubtarget.h +++ b/lib/Target/ARM/ARMSubtarget.h @@ -14,7 +14,17 @@ #ifndef ARMSUBTARGET_H #define ARMSUBTARGET_H + +#include "ARMFrameLowering.h" +#include "ARMISelLowering.h" +#include "ARMInstrInfo.h" +#include "ARMJITInfo.h" #include "ARMSelectionDAGInfo.h" +#include "ARMSubtarget.h" +#include "Thumb1FrameLowering.h" +#include "Thumb1InstrInfo.h" +#include "Thumb2InstrInfo.h" +#include "ARMJITInfo.h" #include "MCTargetDesc/ARMMCTargetDesc.h" #include "llvm/ADT/Triple.h" #include "llvm/IR/DataLayout.h" @@ -66,10 +76,6 @@ class ARMSubtarget : public ARMGenSubtargetInfo { bool HasFPARMv8; bool HasNEON; - /// MinSize - True if the function being compiled has the "minsize" attribute - /// and should be optimised for size at the expense of speed. 
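Aside: `useMovt` previously read a `MinSize` flag cached on the subtarget; the definition above consults the current function's attributes at query time, so a single subtarget can serve both minsize and ordinary functions. A simplified model of that shift (hypothetical stand-in types, not the real MachineFunction API):

```cpp
#include <cstdio>

// Hypothetical stand-in for a function plus its attribute list.
struct Function {
  bool HasMinSize; // models Attribute::MinSize on the function
};

struct Subtarget {
  bool UseMovt = true;
  bool IsWindows = false;

  // Queried per call site with the function in hand, instead of caching
  // a MinSize bit at subtarget-construction time. Windows always needs
  // movw/movt pairs, so MinSize is ignored there.
  bool useMovt(const Function &F) const {
    return UseMovt && (IsWindows || !F.HasMinSize);
  }
};

int main() {
  Subtarget ST;
  Function Hot{/*HasMinSize=*/false}, Tiny{/*HasMinSize=*/true};
  // One subtarget now answers differently per function: prints 1, 0.
  std::printf("hot: %d, tiny: %d\n", ST.useMovt(Hot), ST.useMovt(Tiny));
}
```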
- bool MinSize; - /// UseNEONForSinglePrecisionFP - if the NEONFP attribute has been /// specified. Use the method useNEONForSinglePrecisionFP() to /// determine if NEON should actually be used. @@ -99,9 +105,6 @@ class ARMSubtarget : public ARMGenSubtargetInfo { /// NoARM - True if subtarget does not support ARM mode execution. bool NoARM; - /// PostRAScheduler - True if using post-register-allocation scheduler. - bool PostRAScheduler; - /// IsR9Reserved - True if R9 is a not available as general purpose register. bool IsR9Reserved; @@ -235,7 +238,7 @@ class ARMSubtarget : public ARMGenSubtargetInfo { /// of the specified triple. /// ARMSubtarget(const std::string &TT, const std::string &CPU, - const std::string &FS, bool IsLittle, + const std::string &FS, TargetMachine &TM, bool IsLittle, const TargetOptions &Options); /// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size @@ -256,10 +259,23 @@ class ARMSubtarget : public ARMGenSubtargetInfo { const DataLayout *getDataLayout() const { return &DL; } const ARMSelectionDAGInfo *getSelectionDAGInfo() const { return &TSInfo; } + ARMJITInfo *getJITInfo() { return &JITInfo; } + const ARMBaseInstrInfo *getInstrInfo() const { return InstrInfo.get(); } + const ARMTargetLowering *getTargetLowering() const { return &TLInfo; } + const ARMFrameLowering *getFrameLowering() const { return FrameLowering.get(); } + const ARMBaseRegisterInfo *getRegisterInfo() const { + return &InstrInfo->getRegisterInfo(); + } private: const DataLayout DL; ARMSelectionDAGInfo TSInfo; + ARMJITInfo JITInfo; + // Either Thumb1InstrInfo or Thumb2InstrInfo. + std::unique_ptr InstrInfo; + ARMTargetLowering TLInfo; + // Either Thumb1FrameLowering or ARMFrameLowering. + std::unique_ptr FrameLowering; void initializeEnvironment(); void resetSubtargetFeatures(StringRef CPU, StringRef FS); @@ -296,7 +312,6 @@ class ARMSubtarget : public ARMGenSubtargetInfo { bool hasCrypto() const { return HasCrypto; } bool hasCRC() const { return HasCRC; } bool hasVirtualization() const { return HasVirtualization; } - bool isMinSize() const { return MinSize; } bool useNEONForSinglePrecisionFP() const { return hasNEON() && UseNEONForSinglePrecisionFP; } @@ -392,12 +407,8 @@ class ARMSubtarget : public ARMGenSubtargetInfo { bool isR9Reserved() const { return IsR9Reserved; } - bool useMovt() const { - // NOTE Windows on ARM needs to use mov.w/mov.t pairs to materialise 32-bit - // immediates as it is inherently position independent, and may be out of - // range otherwise. - return UseMovt && (isTargetWindows() || !isMinSize()); - } + bool useMovt(const MachineFunction &MF) const; + bool supportsTailCall() const { return SupportsTailCall; } bool allowsUnalignedMem() const { return AllowsUnalignedMem; } @@ -415,12 +426,10 @@ class ARMSubtarget : public ARMGenSubtargetInfo { bool hasSinCos() const; /// True for some subtargets at > -O0. - bool enablePostMachineScheduler() const; + bool enablePostMachineScheduler() const override; - /// enablePostRAScheduler - True at 'More' optimization. - bool enablePostRAScheduler(CodeGenOpt::Level OptLevel, - TargetSubtargetInfo::AntiDepBreakMode& Mode, - RegClassVector& CriticalPathRCs) const override; + // enableAtomicExpandLoadLinked - True if we need to expand our atomics. + bool enableAtomicExpandLoadLinked() const override; /// getInstrItins - Return the instruction itineraies based on subtarget /// selection. 
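Aside: with these header changes the subtarget, not the target machine, owns the mode-specific objects, picking the Thumb1/Thumb2/ARM variants once in its constructor and holding them behind `unique_ptr`. A compact sketch of that ownership pattern with invented class names:

```cpp
#include <cstdio>
#include <memory>

struct InstrInfoBase {
  virtual ~InstrInfoBase() = default;
  virtual const char *name() const = 0;
};
struct ARMInstrInfo : InstrInfoBase {
  const char *name() const override { return "ARM"; }
};
struct Thumb2InstrInfo : InstrInfoBase {
  const char *name() const override { return "Thumb2"; }
};

class Subtarget {
  std::unique_ptr<InstrInfoBase> II; // owned; picked once at construction

public:
  explicit Subtarget(bool IsThumb)
      : II(IsThumb ? std::unique_ptr<InstrInfoBase>(new Thumb2InstrInfo())
                   : std::unique_ptr<InstrInfoBase>(new ARMInstrInfo())) {}

  // Accessors hand out the base-class view; the target machine simply
  // delegates here instead of owning a second copy.
  const InstrInfoBase *getInstrInfo() const { return II.get(); }
};

int main() {
  Subtarget Thumb(true), Arm(false);
  std::printf("%s / %s\n", Thumb.getInstrInfo()->name(),
              Arm.getInstrInfo()->name());
}
```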
@@ -434,6 +443,7 @@ class ARMSubtarget : public ARMGenSubtargetInfo { /// GVIsIndirectSymbol - true if the GV will be accessed via an indirect /// symbol. bool GVIsIndirectSymbol(const GlobalValue *GV, Reloc::Model RelocM) const; + }; } // End llvm namespace diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp index a93824230d70..d85194b75ecb 100644 --- a/lib/Target/ARM/ARMTargetMachine.cpp +++ b/lib/Target/ARM/ARMTargetMachine.cpp @@ -49,10 +49,9 @@ ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL, - bool isLittle) - : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), - Subtarget(TT, CPU, FS, isLittle, Options), JITInfo() { + CodeGenOpt::Level OL, bool isLittle) + : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), + Subtarget(TT, CPU, FS, *this, isLittle, Options) { // Default to triple-appropriate float ABI if (Options.FloatABIType == FloatABI::Default) @@ -71,16 +70,11 @@ void ARMBaseTargetMachine::addAnalysisPasses(PassManagerBase &PM) { void ARMTargetMachine::anchor() { } -ARMTargetMachine::ARMTargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, - const TargetOptions &Options, +ARMTargetMachine::ARMTargetMachine(const Target &T, StringRef TT, StringRef CPU, + StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL, - bool isLittle) - : ARMBaseTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, isLittle), - InstrInfo(Subtarget), - TLInfo(*this), - FrameLowering(Subtarget) { + CodeGenOpt::Level OL, bool isLittle) + : ARMBaseTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, isLittle) { initAsmInfo(); if (!Subtarget.hasARMOps()) report_fatal_error("CPU: '" + Subtarget.getCPUString() + "' does not " @@ -89,21 +83,21 @@ ARMTargetMachine::ARMTargetMachine(const Target &T, StringRef TT, void ARMLETargetMachine::anchor() { } -ARMLETargetMachine:: -ARMLETargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, const TargetOptions &Options, - Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL) - : ARMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {} +ARMLETargetMachine::ARMLETargetMachine(const Target &T, StringRef TT, + StringRef CPU, StringRef FS, + const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL) + : ARMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {} void ARMBETargetMachine::anchor() { } -ARMBETargetMachine:: -ARMBETargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, const TargetOptions &Options, - Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL) - : ARMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {} +ARMBETargetMachine::ARMBETargetMachine(const Target &T, StringRef TT, + StringRef CPU, StringRef FS, + const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL) + : ARMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {} void ThumbTargetMachine::anchor() { } @@ -111,36 +105,29 @@ ThumbTargetMachine::ThumbTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL, - bool isLittle) - : ARMBaseTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, isLittle), - InstrInfo(Subtarget.hasThumb2() - ? 
((ARMBaseInstrInfo*)new Thumb2InstrInfo(Subtarget)) - : ((ARMBaseInstrInfo*)new Thumb1InstrInfo(Subtarget))), - TLInfo(*this), - FrameLowering(Subtarget.hasThumb2() - ? new ARMFrameLowering(Subtarget) - : (ARMFrameLowering*)new Thumb1FrameLowering(Subtarget)) { + CodeGenOpt::Level OL, bool isLittle) + : ARMBaseTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, + isLittle) { initAsmInfo(); } void ThumbLETargetMachine::anchor() { } -ThumbLETargetMachine:: -ThumbLETargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, const TargetOptions &Options, - Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL) - : ThumbTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {} +ThumbLETargetMachine::ThumbLETargetMachine(const Target &T, StringRef TT, + StringRef CPU, StringRef FS, + const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL) + : ThumbTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {} void ThumbBETargetMachine::anchor() { } -ThumbBETargetMachine:: -ThumbBETargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, const TargetOptions &Options, - Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL) - : ThumbTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {} +ThumbBETargetMachine::ThumbBETargetMachine(const Target &T, StringRef TT, + StringRef CPU, StringRef FS, + const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL) + : ThumbTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {} namespace { /// ARM Code Generator Pass Configuration Options. @@ -171,16 +158,15 @@ TargetPassConfig *ARMBaseTargetMachine::createPassConfig(PassManagerBase &PM) { } void ARMPassConfig::addIRPasses() { - const ARMSubtarget *Subtarget = &getARMSubtarget(); - if (Subtarget->hasAnyDataBarrier() && !Subtarget->isThumb1Only()) { - addPass(createAtomicExpandLoadLinkedPass(TM)); + addPass(createAtomicExpandLoadLinkedPass(TM)); - // Cmpxchg instructions are often used with a subsequent comparison to - // determine whether it succeeded. We can exploit existing control-flow in - // ldrex/strex loops to simplify this, but it needs tidying up. + // Cmpxchg instructions are often used with a subsequent comparison to + // determine whether it succeeded. We can exploit existing control-flow in + // ldrex/strex loops to simplify this, but it needs tidying up. 
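Aside: in the `addIRPasses` hunk that resumes below, the atomic-expansion pass is now scheduled unconditionally — presumably relying on the new `enableAtomicExpandLoadLinked()` hook to make it a no-op where unsupported — and only the cmpxchg cleanup stays gated. A toy pipeline showing that gating shape:

```cpp
#include <cstdio>
#include <functional>
#include <vector>

// Toy pass pipeline: each pass is just a named callback here.
struct Pipeline {
  std::vector<std::function<void()>> Passes;
  void addPass(std::function<void()> P) { Passes.push_back(std::move(P)); }
  void run() {
    for (auto &P : Passes)
      P();
  }
};

int main() {
  bool HasDataBarrier = true, Thumb1Only = false;
  int OptLevel = 2;
  bool EnableAtomicTidy = true;

  Pipeline PM;
  // Always added; the pass queries the subtarget itself and becomes a
  // no-op where load-linked expansion is unsupported.
  PM.addPass([] { std::puts("AtomicExpandLoadLinked"); });

  // Only the CFG-simplify cleanup stays behind the target checks.
  if (HasDataBarrier && !Thumb1Only && OptLevel > 0 && EnableAtomicTidy)
    PM.addPass([] { std::puts("CFGSimplification"); });

  PM.run();
}
```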
+ const ARMSubtarget *Subtarget = &getARMSubtarget(); + if (Subtarget->hasAnyDataBarrier() && !Subtarget->isThumb1Only()) if (TM->getOptLevel() != CodeGenOpt::None && EnableAtomicTidy) addPass(createCFGSimplificationPass()); - } TargetPassConfig::addIRPasses(); } diff --git a/lib/Target/ARM/ARMTargetMachine.h b/lib/Target/ARM/ARMTargetMachine.h index 154927786082..b72b1df4af83 100644 --- a/lib/Target/ARM/ARMTargetMachine.h +++ b/lib/Target/ARM/ARMTargetMachine.h @@ -14,17 +14,9 @@ #ifndef ARMTARGETMACHINE_H #define ARMTARGETMACHINE_H -#include "ARMFrameLowering.h" -#include "ARMISelLowering.h" #include "ARMInstrInfo.h" -#include "ARMJITInfo.h" -#include "ARMSelectionDAGInfo.h" #include "ARMSubtarget.h" -#include "Thumb1FrameLowering.h" -#include "Thumb1InstrInfo.h" -#include "Thumb2InstrInfo.h" #include "llvm/IR/DataLayout.h" -#include "llvm/MC/MCStreamer.h" #include "llvm/Target/TargetMachine.h" namespace llvm { @@ -32,10 +24,6 @@ namespace llvm { class ARMBaseTargetMachine : public LLVMTargetMachine { protected: ARMSubtarget Subtarget; - -private: - ARMJITInfo JITInfo; - public: ARMBaseTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, @@ -44,11 +32,21 @@ class ARMBaseTargetMachine : public LLVMTargetMachine { CodeGenOpt::Level OL, bool isLittle); - ARMJITInfo *getJITInfo() override { return &JITInfo; } const ARMSubtarget *getSubtargetImpl() const override { return &Subtarget; } + const ARMBaseRegisterInfo *getRegisterInfo() const override { + return getSubtargetImpl()->getRegisterInfo(); + } const ARMTargetLowering *getTargetLowering() const override { - // Implemented by derived classes - llvm_unreachable("getTargetLowering not implemented"); + return getSubtargetImpl()->getTargetLowering(); + } + const ARMSelectionDAGInfo *getSelectionDAGInfo() const override { + return getSubtargetImpl()->getSelectionDAGInfo(); + } + const ARMBaseInstrInfo *getInstrInfo() const override { + return getSubtargetImpl()->getInstrInfo(); + } + const ARMFrameLowering *getFrameLowering() const override { + return getSubtargetImpl()->getFrameLowering(); } const InstrItineraryData *getInstrItineraryData() const override { return &getSubtargetImpl()->getInstrItineraryData(); @@ -56,6 +54,8 @@ class ARMBaseTargetMachine : public LLVMTargetMachine { const DataLayout *getDataLayout() const override { return getSubtargetImpl()->getDataLayout(); } + ARMJITInfo *getJITInfo() override { return Subtarget.getJITInfo(); } + /// \brief Register ARM analysis passes with a pass manager. 
void addAnalysisPasses(PassManagerBase &PM) override; @@ -69,32 +69,10 @@ class ARMBaseTargetMachine : public LLVMTargetMachine { /// class ARMTargetMachine : public ARMBaseTargetMachine { virtual void anchor(); - ARMInstrInfo InstrInfo; - ARMTargetLowering TLInfo; - ARMFrameLowering FrameLowering; public: - ARMTargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, - const TargetOptions &Options, - Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL, - bool isLittle); - - const ARMRegisterInfo *getRegisterInfo() const override { - return &InstrInfo.getRegisterInfo(); - } - - const ARMTargetLowering *getTargetLowering() const override { - return &TLInfo; - } - - const ARMSelectionDAGInfo *getSelectionDAGInfo() const override { - return getSubtargetImpl()->getSelectionDAGInfo(); - } - const ARMFrameLowering *getFrameLowering() const override { - return &FrameLowering; - } - const ARMInstrInfo *getInstrInfo() const override { return &InstrInfo; } + ARMTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, + const TargetOptions &Options, Reloc::Model RM, + CodeModel::Model CM, CodeGenOpt::Level OL, bool isLittle); }; /// ARMLETargetMachine - ARM little endian target machine. @@ -113,10 +91,9 @@ class ARMLETargetMachine : public ARMTargetMachine { class ARMBETargetMachine : public ARMTargetMachine { void anchor() override; public: - ARMBETargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, const TargetOptions &Options, - Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL); + ARMBETargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, + const TargetOptions &Options, Reloc::Model RM, + CodeModel::Model CM, CodeGenOpt::Level OL); }; /// ThumbTargetMachine - Thumb target machine. @@ -125,40 +102,10 @@ class ARMBETargetMachine : public ARMTargetMachine { /// class ThumbTargetMachine : public ARMBaseTargetMachine { virtual void anchor(); - // Either Thumb1InstrInfo or Thumb2InstrInfo. - std::unique_ptr InstrInfo; - ARMTargetLowering TLInfo; - // Either Thumb1FrameLowering or ARMFrameLowering. - std::unique_ptr FrameLowering; public: - ThumbTargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, - const TargetOptions &Options, - Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL, - bool isLittle); - - /// returns either Thumb1RegisterInfo or Thumb2RegisterInfo - const ARMBaseRegisterInfo *getRegisterInfo() const override { - return &InstrInfo->getRegisterInfo(); - } - - const ARMTargetLowering *getTargetLowering() const override { - return &TLInfo; - } - - const ARMSelectionDAGInfo *getSelectionDAGInfo() const override { - return getSubtargetImpl()->getSelectionDAGInfo(); - } - - /// returns either Thumb1InstrInfo or Thumb2InstrInfo - const ARMBaseInstrInfo *getInstrInfo() const override { - return InstrInfo.get(); - } - /// returns either Thumb1FrameLowering or ARMFrameLowering - const ARMFrameLowering *getFrameLowering() const override { - return FrameLowering.get(); - } + ThumbTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, + const TargetOptions &Options, Reloc::Model RM, + CodeModel::Model CM, CodeGenOpt::Level OL, bool isLittle); }; /// ThumbLETargetMachine - Thumb little endian target machine. 
@@ -166,10 +113,10 @@ class ThumbTargetMachine : public ARMBaseTargetMachine { class ThumbLETargetMachine : public ThumbTargetMachine { void anchor() override; public: - ThumbLETargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, const TargetOptions &Options, - Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL); + ThumbLETargetMachine(const Target &T, StringRef TT, StringRef CPU, + StringRef FS, const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL); }; /// ThumbBETargetMachine - Thumb big endian target machine. diff --git a/lib/Target/ARM/ARMTargetTransformInfo.cpp b/lib/Target/ARM/ARMTargetTransformInfo.cpp index 57df7da7f310..a2ace629baa1 100644 --- a/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -443,31 +443,58 @@ unsigned ARMTTI::getAddressComputationCost(Type *Ty, bool IsComplex) const { unsigned ARMTTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index, Type *SubTp) const { - // We only handle costs of reverse shuffles for now. - if (Kind != SK_Reverse) + // We only handle costs of reverse and alternate shuffles for now. + if (Kind != SK_Reverse && Kind != SK_Alternate) return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp); - static const CostTblEntry NEONShuffleTbl[] = { - // Reverse shuffle cost one instruction if we are shuffling within a double - // word (vrev) or two if we shuffle a quad word (vrev, vext). - { ISD::VECTOR_SHUFFLE, MVT::v2i32, 1 }, - { ISD::VECTOR_SHUFFLE, MVT::v2f32, 1 }, - { ISD::VECTOR_SHUFFLE, MVT::v2i64, 1 }, - { ISD::VECTOR_SHUFFLE, MVT::v2f64, 1 }, - - { ISD::VECTOR_SHUFFLE, MVT::v4i32, 2 }, - { ISD::VECTOR_SHUFFLE, MVT::v4f32, 2 }, - { ISD::VECTOR_SHUFFLE, MVT::v8i16, 2 }, - { ISD::VECTOR_SHUFFLE, MVT::v16i8, 2 } - }; + if (Kind == SK_Reverse) { + static const CostTblEntry NEONShuffleTbl[] = { + // Reverse shuffle cost one instruction if we are shuffling within a + // double word (vrev) or two if we shuffle a quad word (vrev, vext). + {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, - std::pair LT = TLI->getTypeLegalizationCost(Tp); + {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2}, + {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2}, + {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2}, + {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}}; - int Idx = CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second); - if (Idx == -1) - return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp); + std::pair LT = TLI->getTypeLegalizationCost(Tp); + + int Idx = CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second); + if (Idx == -1) + return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp); - return LT.first * NEONShuffleTbl[Idx].Cost; + return LT.first * NEONShuffleTbl[Idx].Cost; + } + if (Kind == SK_Alternate) { + static const CostTblEntry NEONAltShuffleTbl[] = { + // Alt shuffle cost table for ARM. Cost is the number of instructions + // required to create the shuffled vector. 
+ + {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1}, + + {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2}, + {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2}, + {ISD::VECTOR_SHUFFLE, MVT::v4i16, 2}, + + {ISD::VECTOR_SHUFFLE, MVT::v8i16, 16}, + + {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}}; + + std::pair LT = TLI->getTypeLegalizationCost(Tp); + int Idx = + CostTableLookup(NEONAltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second); + if (Idx == -1) + return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp); + return LT.first * NEONAltShuffleTbl[Idx].Cost; + } + return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp); } unsigned ARMTTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty, diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index 6bb41e4bbe69..b62706c45fbf 100644 --- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -3082,17 +3082,25 @@ bool ARMAsmParser::tryParseRegisterWithWriteBack(OperandVector &Operands) { } /// MatchCoprocessorOperandName - Try to parse an coprocessor related -/// instruction with a symbolic operand name. Example: "p1", "p7", "c3", -/// "c5", ... +/// instruction with a symbolic operand name. +/// We accept "crN" syntax for GAS compatibility. +/// ::= +/// If CoprocOp is 'c', then: +/// ::= c | cr +/// If CoprocOp is 'p', then : +/// ::= p +/// ::= integer in range [0, 15] static int MatchCoprocessorOperandName(StringRef Name, char CoprocOp) { // Use the same layout as the tablegen'erated register name matcher. Ugly, // but efficient. + if (Name.size() < 2 || Name[0] != CoprocOp) + return -1; + Name = (Name[1] == 'r') ? Name.drop_front(2) : Name.drop_front(); + switch (Name.size()) { default: return -1; - case 2: - if (Name[0] != CoprocOp) - return -1; - switch (Name[1]) { + case 1: + switch (Name[0]) { default: return -1; case '0': return 0; case '1': return 1; @@ -3105,10 +3113,10 @@ static int MatchCoprocessorOperandName(StringRef Name, char CoprocOp) { case '8': return 8; case '9': return 9; } - case 3: - if (Name[0] != CoprocOp || Name[1] != '1') + case 2: + if (Name[0] != '1') return -1; - switch (Name[2]) { + switch (Name[1]) { default: return -1; // p10 and p11 are invalid for coproc instructions (reserved for FP/NEON) case '0': return CoprocOp == 'p'? -1: 10; diff --git a/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h b/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h index 42a1cbb8c222..1686d76b8e1b 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h +++ b/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h @@ -295,7 +295,12 @@ namespace ARMII { /// MO_OPTION_MASK - Most flags are mutually exclusive; this mask selects /// just that part of the flag set. - MO_OPTION_MASK = 0x7f, + MO_OPTION_MASK = 0x3f, + + /// MO_DLLIMPORT - On a symbol operand, this represents that the reference + /// to the symbol is for an import stub. This is used for DLL import + /// storage class indication on Windows. 
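Aside: the `MatchCoprocessorOperandName` rewrite above first strips the `c`/`p` prefix (plus an optional `r` for GAS-style `crN`) and then matches the remaining digits. Roughly, condensed into plain C++ — this omits the reserved p10/p11 special case and is not the exact matcher:

```cpp
#include <cstdio>
#include <string_view>

// Parse "pN"/"cN" (and "crN" for GAS compatibility) with N in [0, 15];
// return -1 on anything else, mirroring the matcher's contract.
int matchCoprocName(std::string_view Name, char Op) {
  if (Name.size() < 2 || Name[0] != Op)
    return -1;
  Name.remove_prefix(Name[1] == 'r' ? 2 : 1); // accept GAS "crN"
  if (Name.empty() || Name.size() > 2)
    return -1;
  int Val = 0;
  for (char C : Name) {
    if (C < '0' || C > '9')
      return -1;
    Val = Val * 10 + (C - '0');
  }
  return Val <= 15 ? Val : -1;
}

int main() {
  std::printf("%d %d %d %d\n", matchCoprocName("p7", 'p'),
              matchCoprocName("cr12", 'c'), matchCoprocName("c15", 'c'),
              matchCoprocName("p16", 'p')); // 7 12 15 -1
}
```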
+ MO_DLLIMPORT = 0x40, /// MO_NONLAZY - This is an independent flag, on a symbol operand "FOO" it /// represents a symbol which, if indirect, will get special Darwin mangling diff --git a/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp index 1c842633f43f..a86601b6bb5b 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp @@ -37,7 +37,8 @@ namespace { unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup, bool IsPCRel) const override; - bool needsRelocateWithSymbol(unsigned Type) const override; + bool needsRelocateWithSymbol(const MCSymbolData &SD, + unsigned Type) const override; }; } @@ -48,7 +49,8 @@ ARMELFObjectWriter::ARMELFObjectWriter(uint8_t OSABI) ARMELFObjectWriter::~ARMELFObjectWriter() {} -bool ARMELFObjectWriter::needsRelocateWithSymbol(unsigned Type) const { +bool ARMELFObjectWriter::needsRelocateWithSymbol(const MCSymbolData &SD, + unsigned Type) const { // FIXME: This is extremelly conservative. This really needs to use a // whitelist with a clear explanation for why each realocation needs to // point to the symbol, not to the section. diff --git a/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp index a4d13edd3ac7..7b5d8b01dfe6 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp @@ -992,7 +992,8 @@ void ARMTargetELFStreamer::emitLabel(MCSymbol *Symbol) { return; const MCSymbolData &SD = Streamer.getOrCreateSymbolData(Symbol); - if (MCELF::GetType(SD) & (ELF::STT_FUNC << ELF_STT_Shift)) + unsigned Type = MCELF::GetType(SD); + if (Type == ELF_STT_Func || Type == ELF_STT_GnuIFunc) Streamer.EmitThumbFunc(Symbol); } @@ -1160,7 +1161,7 @@ void ARMELFStreamer::EmitPersonalityFixup(StringRef Name) { const MCSymbolRefExpr *PersonalityRef = MCSymbolRefExpr::Create( PersonalitySym, MCSymbolRefExpr::VK_ARM_NONE, getContext()); - AddValueSymbols(PersonalityRef); + visitUsedExpr(*PersonalityRef); MCDataFragment *DF = getOrCreateDataFragment(); DF->getFixups().push_back(MCFixup::Create(DF->getContents().size(), PersonalityRef, @@ -1332,6 +1333,12 @@ MCStreamer *createMCAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS, return S; } +MCStreamer *createARMNullStreamer(MCContext &Ctx) { + MCStreamer *S = llvm::createNullStreamer(Ctx); + new ARMTargetStreamer(*S); + return S; +} + MCELFStreamer* createARMELFStreamer(MCContext &Context, MCAsmBackend &TAB, raw_ostream &OS, MCCodeEmitter *Emitter, bool RelaxAll, bool NoExecStack, diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp index 5b51a52f828a..b8ee55574972 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp @@ -1047,8 +1047,7 @@ ARMMCCodeEmitter::getHiLo16ImmOpValue(const MCInst &MI, unsigned OpIdx, // we have a movt or a movw, but that led to misleadingly results. // This is now disallowed in the the AsmParser in validateInstruction() // so this should never happen. 
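Aside: shrinking `MO_OPTION_MASK` from 0x7f to 0x3f frees bit 6 for the new independent `MO_DLLIMPORT` flag: mutually exclusive options live in the low six bits, standalone flags sit above them. A tiny demonstration of that layout (the option values here are illustrative, not the full ARMII set):

```cpp
#include <cassert>
#include <cstdio>

// Low 6 bits: mutually exclusive "option" values (pick exactly one).
// Bits 6+: independent flags that can be OR'd onto any option.
enum : unsigned {
  MO_NO_FLAG = 0,
  MO_LO16 = 1,
  MO_HI16 = 2,
  MO_OPTION_MASK = 0x3f,
  MO_DLLIMPORT = 0x40, // independent flag, outside the option mask
};

int main() {
  unsigned Flags = MO_LO16 | MO_DLLIMPORT;
  unsigned Option = Flags & MO_OPTION_MASK; // extracts MO_LO16
  bool IsImport = (Flags & MO_DLLIMPORT) != 0;
  assert(Option == MO_LO16 && IsImport);
  std::printf("option=%u dllimport=%d\n", Option, IsImport);
}
```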
- assert(0 && "expression without :upper16: or :lower16:"); - return 0; + llvm_unreachable("expression without :upper16: or :lower16:"); } uint32_t ARMMCCodeEmitter:: diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp index 87ea8751944a..e545e3c2f301 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp @@ -41,33 +41,6 @@ ARMMCExpr::EvaluateAsRelocatableImpl(MCValue &Res, return false; } -// FIXME: This basically copies MCObjectStreamer::AddValueSymbols. Perhaps -// that method should be made public? -static void AddValueSymbols_(const MCExpr *Value, MCAssembler *Asm) { - switch (Value->getKind()) { - case MCExpr::Target: - llvm_unreachable("Can't handle nested target expr!"); - - case MCExpr::Constant: - break; - - case MCExpr::Binary: { - const MCBinaryExpr *BE = cast(Value); - AddValueSymbols_(BE->getLHS(), Asm); - AddValueSymbols_(BE->getRHS(), Asm); - break; - } - - case MCExpr::SymbolRef: - Asm->getOrCreateSymbolData(cast(Value)->getSymbol()); - break; - - case MCExpr::Unary: - AddValueSymbols_(cast(Value)->getSubExpr(), Asm); - break; - } -} - -void ARMMCExpr::AddValueSymbols(MCAssembler *Asm) const { - AddValueSymbols_(getSubExpr(), Asm); +void ARMMCExpr::visitUsedExpr(MCStreamer &Streamer) const { + Streamer.visitUsedExpr(*getSubExpr()); } diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h b/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h index d8191395c5ec..c5c0b10f8ad9 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h +++ b/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h @@ -59,7 +59,7 @@ class ARMMCExpr : public MCTargetExpr { void PrintImpl(raw_ostream &OS) const override; bool EvaluateAsRelocatableImpl(MCValue &Res, const MCAsmLayout *Layout) const override; - void AddValueSymbols(MCAssembler *) const override; + void visitUsedExpr(MCStreamer &Streamer) const override; const MCSection *FindAssociatedSection() const override { return getSubExpr()->FindAssociatedSection(); } diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp index 04d63a7e6d46..6a3ec8fcc486 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp @@ -84,93 +84,87 @@ static bool getITDeprecationInfo(MCInst &MI, MCSubtargetInfo &STI, std::string ARM_MC::ParseARMTriple(StringRef TT, StringRef CPU) { Triple triple(TT); - // Set the boolean corresponding to the current target triple, or the default - // if one cannot be determined, to true. - unsigned Len = TT.size(); - unsigned Idx = 0; - - // FIXME: Enhance Triple helper class to extract ARM version. 
bool isThumb = triple.getArch() == Triple::thumb || triple.getArch() == Triple::thumbeb; - if (Len >= 5 && TT.substr(0, 4) == "armv") - Idx = 4; - else if (Len >= 7 && TT.substr(0, 6) == "armebv") - Idx = 6; - else if (Len >= 7 && TT.substr(0, 6) == "thumbv") - Idx = 6; - else if (Len >= 9 && TT.substr(0, 8) == "thumbebv") - Idx = 8; bool NoCPU = CPU == "generic" || CPU.empty(); std::string ARMArchFeature; - if (Idx) { - unsigned SubVer = TT[Idx]; - if (SubVer == '8') { - if (NoCPU) - // v8a: FeatureDB, FeatureFPARMv8, FeatureNEON, FeatureDSPThumb2, - // FeatureMP, FeatureHWDiv, FeatureHWDivARM, FeatureTrustZone, - // FeatureT2XtPk, FeatureCrypto, FeatureCRC - ARMArchFeature = "+v8,+db,+fp-armv8,+neon,+t2dsp,+mp,+hwdiv,+hwdiv-arm," - "+trustzone,+t2xtpk,+crypto,+crc"; - else - // Use CPU to figure out the exact features - ARMArchFeature = "+v8"; - } else if (SubVer == '7') { - if (Len >= Idx+2 && TT[Idx+1] == 'm') { - isThumb = true; - if (NoCPU) - // v7m: FeatureNoARM, FeatureDB, FeatureHWDiv, FeatureMClass - ARMArchFeature = "+v7,+noarm,+db,+hwdiv,+mclass"; - else - // Use CPU to figure out the exact features. - ARMArchFeature = "+v7"; - } else if (Len >= Idx+3 && TT[Idx+1] == 'e'&& TT[Idx+2] == 'm') { - if (NoCPU) - // v7em: FeatureNoARM, FeatureDB, FeatureHWDiv, FeatureDSPThumb2, - // FeatureT2XtPk, FeatureMClass - ARMArchFeature = "+v7,+noarm,+db,+hwdiv,+t2dsp,t2xtpk,+mclass"; - else - // Use CPU to figure out the exact features. - ARMArchFeature = "+v7"; - } else if (Len >= Idx+2 && TT[Idx+1] == 's') { - if (NoCPU) - // v7s: FeatureNEON, FeatureDB, FeatureDSPThumb2, FeatureHasRAS - // Swift - ARMArchFeature = "+v7,+swift,+neon,+db,+t2dsp,+ras"; - else - // Use CPU to figure out the exact features. - ARMArchFeature = "+v7"; - } else { - // v7 CPUs have lots of different feature sets. If no CPU is specified, - // then assume v7a (e.g. cortex-a8) feature set. Otherwise, return - // the "minimum" feature set and use CPU string to figure out the exact - // features. - if (NoCPU) - // v7a: FeatureNEON, FeatureDB, FeatureDSPThumb2, FeatureT2XtPk - ARMArchFeature = "+v7,+neon,+db,+t2dsp,+t2xtpk"; - else - // Use CPU to figure out the exact features. - ARMArchFeature = "+v7"; - } - } else if (SubVer == '6') { - if (Len >= Idx+3 && TT[Idx+1] == 't' && TT[Idx+2] == '2') - ARMArchFeature = "+v6t2"; - else if (Len >= Idx+2 && TT[Idx+1] == 'm') { - isThumb = true; - if (NoCPU) - // v6m: FeatureNoARM, FeatureMClass - ARMArchFeature = "+v6m,+noarm,+mclass"; - else - ARMArchFeature = "+v6"; - } else - ARMArchFeature = "+v6"; - } else if (SubVer == '5') { - if (Len >= Idx+3 && TT[Idx+1] == 't' && TT[Idx+2] == 'e') - ARMArchFeature = "+v5te"; - else - ARMArchFeature = "+v5t"; - } else if (SubVer == '4' && Len >= Idx+2 && TT[Idx+1] == 't') - ARMArchFeature = "+v4t"; + switch (triple.getSubArch()) { + case Triple::ARMSubArch_v8: + if (NoCPU) + // v8a: FeatureDB, FeatureFPARMv8, FeatureNEON, FeatureDSPThumb2, + // FeatureMP, FeatureHWDiv, FeatureHWDivARM, FeatureTrustZone, + // FeatureT2XtPk, FeatureCrypto, FeatureCRC + ARMArchFeature = "+v8,+db,+fp-armv8,+neon,+t2dsp,+mp,+hwdiv,+hwdiv-arm," + "+trustzone,+t2xtpk,+crypto,+crc"; + else + // Use CPU to figure out the exact features + ARMArchFeature = "+v8"; + break; + case Triple::ARMSubArch_v7m: + isThumb = true; + if (NoCPU) + // v7m: FeatureNoARM, FeatureDB, FeatureHWDiv, FeatureMClass + ARMArchFeature = "+v7,+noarm,+db,+hwdiv,+mclass"; + else + // Use CPU to figure out the exact features. 
+ ARMArchFeature = "+v7"; + break; + case Triple::ARMSubArch_v7em: + if (NoCPU) + // v7em: FeatureNoARM, FeatureDB, FeatureHWDiv, FeatureDSPThumb2, + // FeatureT2XtPk, FeatureMClass + ARMArchFeature = "+v7,+noarm,+db,+hwdiv,+t2dsp,t2xtpk,+mclass"; + else + // Use CPU to figure out the exact features. + ARMArchFeature = "+v7"; + break; + case Triple::ARMSubArch_v7s: + if (NoCPU) + // v7s: FeatureNEON, FeatureDB, FeatureDSPThumb2, FeatureHasRAS + // Swift + ARMArchFeature = "+v7,+swift,+neon,+db,+t2dsp,+ras"; + else + // Use CPU to figure out the exact features. + ARMArchFeature = "+v7"; + break; + case Triple::ARMSubArch_v7: + // v7 CPUs have lots of different feature sets. If no CPU is specified, + // then assume v7a (e.g. cortex-a8) feature set. Otherwise, return + // the "minimum" feature set and use CPU string to figure out the exact + // features. + if (NoCPU) + // v7a: FeatureNEON, FeatureDB, FeatureDSPThumb2, FeatureT2XtPk + ARMArchFeature = "+v7,+neon,+db,+t2dsp,+t2xtpk"; + else + // Use CPU to figure out the exact features. + ARMArchFeature = "+v7"; + break; + case Triple::ARMSubArch_v6t2: + ARMArchFeature = "+v6t2"; + break; + case Triple::ARMSubArch_v6m: + isThumb = true; + if (NoCPU) + // v6m: FeatureNoARM, FeatureMClass + ARMArchFeature = "+v6m,+noarm,+mclass"; + else + ARMArchFeature = "+v6"; + break; + case Triple::ARMSubArch_v6: + ARMArchFeature = "+v6"; + break; + case Triple::ARMSubArch_v5te: + ARMArchFeature = "+v5te"; + break; + case Triple::ARMSubArch_v5: + ARMArchFeature = "+v5t"; + break; + case Triple::ARMSubArch_v4t: + ARMArchFeature = "+v4t"; + break; + case Triple::NoSubArch: + break; } if (isThumb) { @@ -427,6 +421,12 @@ extern "C" void LLVMInitializeARMTargetMC() { TargetRegistry::RegisterAsmStreamer(TheThumbLETarget, createMCAsmStreamer); TargetRegistry::RegisterAsmStreamer(TheThumbBETarget, createMCAsmStreamer); + // Register the null streamer. + TargetRegistry::RegisterNullStreamer(TheARMLETarget, createARMNullStreamer); + TargetRegistry::RegisterNullStreamer(TheARMBETarget, createARMNullStreamer); + TargetRegistry::RegisterNullStreamer(TheThumbLETarget, createARMNullStreamer); + TargetRegistry::RegisterNullStreamer(TheThumbBETarget, createARMNullStreamer); + // Register the MCInstPrinter. 
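Aside: the `ParseARMTriple` rewrite above replaces hand-rolled `armv`/`thumbv` substring scanning with a switch over `Triple::getSubArch()`. The shape of that refactor, with a made-up miniature of the subarch enum:

```cpp
#include <cstdio>
#include <string>

// Made-up miniature of Triple::SubArchType, for illustration only.
enum class SubArch { NoSubArch, V4T, V5TE, V6, V6T2, V7, V8 };

// One switch replaces a ladder of length checks and substr compares;
// adding a new architecture is now one enumerator plus one case.
std::string archFeature(SubArch SA) {
  switch (SA) {
  case SubArch::V8:   return "+v8";
  case SubArch::V7:   return "+v7";
  case SubArch::V6T2: return "+v6t2";
  case SubArch::V6:   return "+v6";
  case SubArch::V5TE: return "+v5te";
  case SubArch::V4T:  return "+v4t";
  case SubArch::NoSubArch: return "";
  }
  return "";
}

int main() {
  std::printf("%s\n", archFeature(SubArch::V7).c_str()); // +v7
}
```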
TargetRegistry::RegisterMCInstPrinter(TheARMLETarget, createARMMCInstPrinter); TargetRegistry::RegisterMCInstPrinter(TheARMBETarget, createARMMCInstPrinter); diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h index 8853a8c69c50..5326e564f363 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h +++ b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h @@ -51,6 +51,8 @@ MCStreamer *createMCAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS, MCInstPrinter *InstPrint, MCCodeEmitter *CE, MCAsmBackend *TAB, bool ShowInst); +MCStreamer *createARMNullStreamer(MCContext &Ctx); + MCCodeEmitter *createARMLEMCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, const MCSubtargetInfo &STI, diff --git a/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp index ecfa4e54b26d..186776a1944f 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp @@ -32,6 +32,7 @@ class ARMMachObjectWriter : public MCMachObjectTargetWriter { const MCFragment *Fragment, const MCFixup &Fixup, MCValue Target, + unsigned Type, unsigned Log2Size, uint64_t &FixedValue); void RecordARMScatteredHalfRelocation(MachObjectWriter *Writer, @@ -251,11 +252,11 @@ void ARMMachObjectWriter::RecordARMScatteredRelocation(MachObjectWriter *Writer, const MCFragment *Fragment, const MCFixup &Fixup, MCValue Target, + unsigned Type, unsigned Log2Size, uint64_t &FixedValue) { uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset(); unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind()); - unsigned Type = MachO::ARM_RELOC_VANILLA; // See . const MCSymbol *A = &Target.getSymA()->getSymbol(); @@ -272,6 +273,7 @@ void ARMMachObjectWriter::RecordARMScatteredRelocation(MachObjectWriter *Writer, uint32_t Value2 = 0; if (const MCSymbolRefExpr *B = Target.getSymB()) { + assert(Type == MachO::ARM_RELOC_VANILLA && "invalid reloc for 2 symbols"); const MCSymbolData *B_SD = &Asm.getSymbolData(B->getSymbol()); if (!B_SD->getFragment()) @@ -374,7 +376,8 @@ void ARMMachObjectWriter::RecordRelocation(MachObjectWriter *Writer, return RecordARMScatteredHalfRelocation(Writer, Asm, Layout, Fragment, Fixup, Target, FixedValue); return RecordARMScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup, - Target, Log2Size, FixedValue); + Target, RelocType, Log2Size, + FixedValue); } // Get the symbol data, if any. @@ -392,7 +395,8 @@ void ARMMachObjectWriter::RecordRelocation(MachObjectWriter *Writer, Offset += 1 << Log2Size; if (Offset && SD && !Writer->doesSymbolRequireExternRelocation(SD)) return RecordARMScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup, - Target, Log2Size, FixedValue); + Target, RelocType, Log2Size, + FixedValue); // See . uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset(); diff --git a/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp b/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp index 0cb795ba3ccc..8acd7aff6bca 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp @@ -28,7 +28,7 @@ ARMTargetStreamer::~ARMTargetStreamer() {} // The constant pool handling is shared by all ARMTargetStreamer // implementations. 
 const MCExpr *ARMTargetStreamer::addConstantPoolEntry(const MCExpr *Expr) {
-  return ConstantPools->addEntry(Streamer, Expr);
+  return ConstantPools->addEntry(Streamer, Expr, 4);
 }

 void ARMTargetStreamer::emitCurrentConstantPool() {
@@ -40,78 +40,34 @@ void ARMTargetStreamer::finish() { ConstantPools->emitAll(Streamer); }

 // The remaining callbacks should be handled separately by each
 // streamer.
-void ARMTargetStreamer::emitFnStart() {
-  llvm_unreachable("unimplemented");
-}
-void ARMTargetStreamer::emitFnEnd() {
-  llvm_unreachable("unimplemented");
-}
-void ARMTargetStreamer::emitCantUnwind() {
-  llvm_unreachable("unimplemented");
-}
-void ARMTargetStreamer::emitPersonality(const MCSymbol *Personality) {
-  llvm_unreachable("unimplemented");
-}
-void ARMTargetStreamer::emitPersonalityIndex(unsigned Index) {
-  llvm_unreachable("unimplemented");
-}
-void ARMTargetStreamer::emitHandlerData() {
-  llvm_unreachable("unimplemented");
-}
+void ARMTargetStreamer::emitFnStart() {}
+void ARMTargetStreamer::emitFnEnd() {}
+void ARMTargetStreamer::emitCantUnwind() {}
+void ARMTargetStreamer::emitPersonality(const MCSymbol *Personality) {}
+void ARMTargetStreamer::emitPersonalityIndex(unsigned Index) {}
+void ARMTargetStreamer::emitHandlerData() {}
 void ARMTargetStreamer::emitSetFP(unsigned FpReg, unsigned SpReg,
-                                  int64_t Offset) {
-  llvm_unreachable("unimplemented");
-}
-void ARMTargetStreamer::emitMovSP(unsigned Reg, int64_t Offset) {
-  llvm_unreachable("unimplemented");
-}
-void ARMTargetStreamer::emitPad(int64_t Offset) {
-  llvm_unreachable("unimplemented");
-}
-void
-ARMTargetStreamer::emitRegSave(const SmallVectorImpl<unsigned> &RegList,
-                               bool isVector) {
-  llvm_unreachable("unimplemented");
-}
-void ARMTargetStreamer::emitUnwindRaw(
-    int64_t StackOffset, const SmallVectorImpl<uint8_t> &Opcodes) {
-  llvm_unreachable("unimplemented");
-}
-void ARMTargetStreamer::switchVendor(StringRef Vendor) {
-  llvm_unreachable("unimplemented");
-}
-void ARMTargetStreamer::emitAttribute(unsigned Attribute, unsigned Value) {
-  llvm_unreachable("unimplemented");
-}
+                                  int64_t Offset) {}
+void ARMTargetStreamer::emitMovSP(unsigned Reg, int64_t Offset) {}
+void ARMTargetStreamer::emitPad(int64_t Offset) {}
+void ARMTargetStreamer::emitRegSave(const SmallVectorImpl<unsigned> &RegList,
+                                    bool isVector) {}
+void ARMTargetStreamer::emitUnwindRaw(int64_t StackOffset,
+                                      const SmallVectorImpl<uint8_t> &Opcodes) {
+}
+void ARMTargetStreamer::switchVendor(StringRef Vendor) {}
+void ARMTargetStreamer::emitAttribute(unsigned Attribute, unsigned Value) {}
 void ARMTargetStreamer::emitTextAttribute(unsigned Attribute,
-                                          StringRef String) {
-  llvm_unreachable("unimplemented");
-}
+                                          StringRef String) {}
 void ARMTargetStreamer::emitIntTextAttribute(unsigned Attribute,
-                                             unsigned IntValue,
-                                             StringRef StringValue) {
-  llvm_unreachable("unimplemented");
-}
-void ARMTargetStreamer::emitArch(unsigned Arch) {
-  llvm_unreachable("unimplemented");
-}
-void ARMTargetStreamer::emitObjectArch(unsigned Arch) {
-  llvm_unreachable("unimplemented");
-}
-void ARMTargetStreamer::emitFPU(unsigned FPU) {
-  llvm_unreachable("unimplemented");
-}
-void ARMTargetStreamer::finishAttributeSection() {
-  llvm_unreachable("unimplemented");
-}
-void ARMTargetStreamer::emitInst(uint32_t Inst, char Suffix) {
-  llvm_unreachable("unimplemented");
-}
-void ARMTargetStreamer::AnnotateTLSDescriptorSequence(
-    const MCSymbolRefExpr *SRE) {
-  llvm_unreachable("unimplemented");
-}
+                                             unsigned IntValue,
+                                             StringRef StringValue) {}
+void ARMTargetStreamer::emitArch(unsigned Arch) {}
+void
+ARMTargetStreamer::emitObjectArch(unsigned Arch) {}
+void ARMTargetStreamer::emitFPU(unsigned FPU) {}
+void ARMTargetStreamer::finishAttributeSection() {}
+void ARMTargetStreamer::emitInst(uint32_t Inst, char Suffix) {}
+void
+ARMTargetStreamer::AnnotateTLSDescriptorSequence(const MCSymbolRefExpr *SRE) {}

-void ARMTargetStreamer::emitThumbSet(MCSymbol *Symbol, const MCExpr *Value) {
-  llvm_unreachable("unimplemented");
-}
+void ARMTargetStreamer::emitThumbSet(MCSymbol *Symbol, const MCExpr *Value) {}
diff --git a/lib/Target/ARM/Thumb1FrameLowering.cpp b/lib/Target/ARM/Thumb1FrameLowering.cpp
index be29dc5c28d1..baa97a7c479a 100644
--- a/lib/Target/ARM/Thumb1FrameLowering.cpp
+++ b/lib/Target/ARM/Thumb1FrameLowering.cpp
@@ -21,6 +21,9 @@

 using namespace llvm;

+Thumb1FrameLowering::Thumb1FrameLowering(const ARMSubtarget &sti)
+    : ARMFrameLowering(sti) {}
+
 bool Thumb1FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const{
   const MachineFrameInfo *FFI = MF.getFrameInfo();
   unsigned CFSize = FFI->getMaxCallFrameSize();
diff --git a/lib/Target/ARM/Thumb1FrameLowering.h b/lib/Target/ARM/Thumb1FrameLowering.h
index 5916954850ae..a227f8ece73f 100644
--- a/lib/Target/ARM/Thumb1FrameLowering.h
+++ b/lib/Target/ARM/Thumb1FrameLowering.h
@@ -15,7 +15,6 @@
 #define LLVM_ARM_THUMB1FRAMELOWERING_H

 #include "ARMFrameLowering.h"
-#include "ARMSubtarget.h"
 #include "Thumb1InstrInfo.h"
 #include "Thumb1RegisterInfo.h"
 #include "llvm/Target/TargetFrameLowering.h"
@@ -24,9 +23,7 @@ namespace llvm {

 class Thumb1FrameLowering : public ARMFrameLowering {
 public:
-  explicit Thumb1FrameLowering(const ARMSubtarget &sti)
-    : ARMFrameLowering(sti) {
-  }
+  explicit Thumb1FrameLowering(const ARMSubtarget &sti);

   /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
   /// the function.
diff --git a/lib/Target/ARM/Thumb2SizeReduction.cpp b/lib/Target/ARM/Thumb2SizeReduction.cpp
index 6267ecf53c7d..09debe76f1b6 100644
--- a/lib/Target/ARM/Thumb2SizeReduction.cpp
+++ b/lib/Target/ARM/Thumb2SizeReduction.cpp
@@ -1010,7 +1010,8 @@ bool Thumb2SizeReduce::runOnMachineFunction(MachineFunction &MF) {
   AttributeSet FnAttrs = MF.getFunction()->getAttributes();
   OptimizeSize = FnAttrs.hasAttribute(AttributeSet::FunctionIndex,
                                       Attribute::OptimizeForSize);
-  MinimizeSize = STI->isMinSize();
+  MinimizeSize =
+      FnAttrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize);

   BlockInfo.clear();
   BlockInfo.resize(MF.getNumBlockIDs());
diff --git a/lib/Target/Hexagon/HexagonFrameLowering.cpp b/lib/Target/Hexagon/HexagonFrameLowering.cpp
index d551ca9dc7f5..21df12faefae 100644
--- a/lib/Target/Hexagon/HexagonFrameLowering.cpp
+++ b/lib/Target/Hexagon/HexagonFrameLowering.cpp
@@ -165,8 +165,8 @@ void HexagonFrameLowering::emitEpilogue(MachineFunction &MF,
   }
   // Replace 'jumpr r31' instruction with dealloc_return for V4 and higher
   // versions.
-  if (STI.hasV4TOps() && MBBI->getOpcode() == Hexagon::JMPret
-      && !DisableDeallocRet) {
+  if (MF.getTarget().getSubtarget<HexagonSubtarget>().hasV4TOps() &&
+      MBBI->getOpcode() == Hexagon::JMPret && !DisableDeallocRet) {
     // Check for RESTORE_DEALLOC_RET_JMP_V4 call. Don't emit an extra DEALLOC
     // instruction if we encounter it.
     MachineBasicBlock::iterator BeforeJMPR =
diff --git a/lib/Target/Hexagon/HexagonFrameLowering.h b/lib/Target/Hexagon/HexagonFrameLowering.h
index 446af161aa75..2d4b0b9d7eb3 100644
--- a/lib/Target/Hexagon/HexagonFrameLowering.h
+++ b/lib/Target/Hexagon/HexagonFrameLowering.h
@@ -11,20 +11,16 @@
 #define HEXAGON_FRAMEINFO_H

 #include "Hexagon.h"
-#include "HexagonSubtarget.h"
 #include "llvm/Target/TargetFrameLowering.h"

 namespace llvm {

 class HexagonFrameLowering : public TargetFrameLowering {
 private:
-  const HexagonSubtarget &STI;
   void determineFrameLayout(MachineFunction &MF) const;

 public:
-  explicit HexagonFrameLowering(const HexagonSubtarget &sti)
-    : TargetFrameLowering(StackGrowsDown, 8, 0), STI(sti) {
-  }
+  explicit HexagonFrameLowering() : TargetFrameLowering(StackGrowsDown, 8, 0) {}

   /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
   /// the function.
diff --git a/lib/Target/Hexagon/HexagonISelLowering.cpp b/lib/Target/Hexagon/HexagonISelLowering.cpp
index 17a6674cf577..a460ea4f3420 100644
--- a/lib/Target/Hexagon/HexagonISelLowering.cpp
+++ b/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -463,9 +463,10 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   SmallVector<std::pair<unsigned, SDValue>, 16> RegsToPass;
   SmallVector<SDValue, 8> MemOpChains;

+  const HexagonRegisterInfo *QRI = static_cast<const HexagonRegisterInfo *>(
+      DAG.getTarget().getRegisterInfo());
   SDValue StackPtr =
-      DAG.getCopyFromReg(Chain, dl, TM.getRegisterInfo()->getStackRegister(),
-                         getPointerTy());
+      DAG.getCopyFromReg(Chain, dl, QRI->getStackRegister(), getPointerTy());

   // Walk the register/memloc assignments, inserting copies/loads.
   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
@@ -720,7 +721,10 @@ SDValue HexagonTargetLowering::LowerINLINEASM(SDValue Op,
         cast<RegisterSDNode>(Node->getOperand(i))->getReg();

       // Check it to be lr
-      if (Reg == TM.getRegisterInfo()->getRARegister()) {
+      const HexagonRegisterInfo *QRI =
+          static_cast<const HexagonRegisterInfo *>(
+              DAG.getTarget().getRegisterInfo());
+      if (Reg == QRI->getRARegister()) {
         FuncInfo->setHasClobberLR(true);
         break;
       }
@@ -812,9 +816,9 @@ HexagonTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,

   // The Sub result contains the new stack start address, so it
   // must be placed in the stack pointer register.
-  SDValue CopyChain = DAG.getCopyToReg(Chain, dl,
-                                       TM.getRegisterInfo()->getStackRegister(),
-                                       Sub);
+  const HexagonRegisterInfo *QRI = static_cast<const HexagonRegisterInfo *>(
+      DAG.getTarget().getRegisterInfo());
+  SDValue CopyChain = DAG.getCopyToReg(Chain, dl, QRI->getStackRegister(), Sub);

   SDValue Ops[2] = { ArgAdjust, CopyChain };
   return DAG.getMergeValues(Ops, dl);
@@ -960,7 +964,7 @@ HexagonTargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {

 SDValue
 HexagonTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
-  const TargetRegisterInfo *TRI = TM.getRegisterInfo();
+  const TargetRegisterInfo *TRI = DAG.getTarget().getRegisterInfo();
   MachineFunction &MF = DAG.getMachineFunction();
   MachineFrameInfo *MFI = MF.getFrameInfo();
   MFI->setReturnAddressIsTaken(true);
@@ -986,7 +990,8 @@ HexagonTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {

 SDValue
 HexagonTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
-  const HexagonRegisterInfo *TRI = TM.getRegisterInfo();
+  const HexagonRegisterInfo *TRI =
+      static_cast<const HexagonRegisterInfo *>(DAG.getTarget().getRegisterInfo());
   MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
   MFI->setFrameAddressIsTaken(true);
@@ -1038,424 +1043,422 @@ HexagonTargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
 // TargetLowering Implementation
 //===----------------------------------------------------------------------===//

-HexagonTargetLowering::HexagonTargetLowering(HexagonTargetMachine
-                                             &targetmachine)
-    : TargetLowering(targetmachine, new HexagonTargetObjectFile()),
-      TM(targetmachine) {
+HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &targetmachine)
+    : TargetLowering(targetmachine, new HexagonTargetObjectFile()),
+      TM(targetmachine) {

-  const HexagonRegisterInfo* QRI = TM.getRegisterInfo();
+  const HexagonSubtarget &Subtarget = TM.getSubtarget<HexagonSubtarget>();

-  // Set up the register classes.
-  addRegisterClass(MVT::i32, &Hexagon::IntRegsRegClass);
-  addRegisterClass(MVT::i64, &Hexagon::DoubleRegsRegClass);
+  // Set up the register classes.
+ addRegisterClass(MVT::i32, &Hexagon::IntRegsRegClass); + addRegisterClass(MVT::i64, &Hexagon::DoubleRegsRegClass); - if (QRI->Subtarget.hasV5TOps()) { - addRegisterClass(MVT::f32, &Hexagon::IntRegsRegClass); - addRegisterClass(MVT::f64, &Hexagon::DoubleRegsRegClass); - } - - addRegisterClass(MVT::i1, &Hexagon::PredRegsRegClass); + if (Subtarget.hasV5TOps()) { + addRegisterClass(MVT::f32, &Hexagon::IntRegsRegClass); + addRegisterClass(MVT::f64, &Hexagon::DoubleRegsRegClass); + } - computeRegisterProperties(); + addRegisterClass(MVT::i1, &Hexagon::PredRegsRegClass); - // Align loop entry - setPrefLoopAlignment(4); + computeRegisterProperties(); - // Limits for inline expansion of memcpy/memmove - MaxStoresPerMemcpy = 6; - MaxStoresPerMemmove = 6; + // Align loop entry + setPrefLoopAlignment(4); - // - // Library calls for unsupported operations - // + // Limits for inline expansion of memcpy/memmove + MaxStoresPerMemcpy = 6; + MaxStoresPerMemmove = 6; - setLibcallName(RTLIB::SINTTOFP_I128_F64, "__hexagon_floattidf"); - setLibcallName(RTLIB::SINTTOFP_I128_F32, "__hexagon_floattisf"); - - setLibcallName(RTLIB::FPTOUINT_F32_I128, "__hexagon_fixunssfti"); - setLibcallName(RTLIB::FPTOUINT_F64_I128, "__hexagon_fixunsdfti"); - - setLibcallName(RTLIB::FPTOSINT_F32_I128, "__hexagon_fixsfti"); - setLibcallName(RTLIB::FPTOSINT_F64_I128, "__hexagon_fixdfti"); - - setLibcallName(RTLIB::SDIV_I32, "__hexagon_divsi3"); - setOperationAction(ISD::SDIV, MVT::i32, Expand); - setLibcallName(RTLIB::SREM_I32, "__hexagon_umodsi3"); - setOperationAction(ISD::SREM, MVT::i32, Expand); - - setLibcallName(RTLIB::SDIV_I64, "__hexagon_divdi3"); - setOperationAction(ISD::SDIV, MVT::i64, Expand); - setLibcallName(RTLIB::SREM_I64, "__hexagon_moddi3"); - setOperationAction(ISD::SREM, MVT::i64, Expand); - - setLibcallName(RTLIB::UDIV_I32, "__hexagon_udivsi3"); - setOperationAction(ISD::UDIV, MVT::i32, Expand); - - setLibcallName(RTLIB::UDIV_I64, "__hexagon_udivdi3"); - setOperationAction(ISD::UDIV, MVT::i64, Expand); - - setLibcallName(RTLIB::UREM_I32, "__hexagon_umodsi3"); - setOperationAction(ISD::UREM, MVT::i32, Expand); - - setLibcallName(RTLIB::UREM_I64, "__hexagon_umoddi3"); - setOperationAction(ISD::UREM, MVT::i64, Expand); - - setLibcallName(RTLIB::DIV_F32, "__hexagon_divsf3"); - setOperationAction(ISD::FDIV, MVT::f32, Expand); - - setLibcallName(RTLIB::DIV_F64, "__hexagon_divdf3"); - setOperationAction(ISD::FDIV, MVT::f64, Expand); - - setOperationAction(ISD::FSQRT, MVT::f32, Expand); - setOperationAction(ISD::FSQRT, MVT::f64, Expand); - setOperationAction(ISD::FSIN, MVT::f32, Expand); - setOperationAction(ISD::FSIN, MVT::f64, Expand); - - if (QRI->Subtarget.hasV5TOps()) { - // Hexagon V5 Support. 
- setOperationAction(ISD::FADD, MVT::f32, Legal); - setOperationAction(ISD::FADD, MVT::f64, Legal); - setOperationAction(ISD::FP_EXTEND, MVT::f32, Legal); - setCondCodeAction(ISD::SETOEQ, MVT::f32, Legal); - setCondCodeAction(ISD::SETOEQ, MVT::f64, Legal); - setCondCodeAction(ISD::SETUEQ, MVT::f32, Legal); - setCondCodeAction(ISD::SETUEQ, MVT::f64, Legal); - - setCondCodeAction(ISD::SETOGE, MVT::f32, Legal); - setCondCodeAction(ISD::SETOGE, MVT::f64, Legal); - setCondCodeAction(ISD::SETUGE, MVT::f32, Legal); - setCondCodeAction(ISD::SETUGE, MVT::f64, Legal); - - setCondCodeAction(ISD::SETOGT, MVT::f32, Legal); - setCondCodeAction(ISD::SETOGT, MVT::f64, Legal); - setCondCodeAction(ISD::SETUGT, MVT::f32, Legal); - setCondCodeAction(ISD::SETUGT, MVT::f64, Legal); - - setCondCodeAction(ISD::SETOLE, MVT::f32, Legal); - setCondCodeAction(ISD::SETOLE, MVT::f64, Legal); - setCondCodeAction(ISD::SETOLT, MVT::f32, Legal); - setCondCodeAction(ISD::SETOLT, MVT::f64, Legal); - - setOperationAction(ISD::ConstantFP, MVT::f32, Legal); - setOperationAction(ISD::ConstantFP, MVT::f64, Legal); - - setOperationAction(ISD::FP_TO_UINT, MVT::i1, Promote); - setOperationAction(ISD::FP_TO_SINT, MVT::i1, Promote); - setOperationAction(ISD::UINT_TO_FP, MVT::i1, Promote); - setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote); - - setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote); - setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote); - setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote); - setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote); - - setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote); - setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote); - setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote); - setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote); - - setOperationAction(ISD::FP_TO_UINT, MVT::i32, Legal); - setOperationAction(ISD::FP_TO_SINT, MVT::i32, Legal); - setOperationAction(ISD::UINT_TO_FP, MVT::i32, Legal); - setOperationAction(ISD::SINT_TO_FP, MVT::i32, Legal); - - setOperationAction(ISD::FP_TO_UINT, MVT::i64, Legal); - setOperationAction(ISD::FP_TO_SINT, MVT::i64, Legal); - setOperationAction(ISD::UINT_TO_FP, MVT::i64, Legal); - setOperationAction(ISD::SINT_TO_FP, MVT::i64, Legal); - - setOperationAction(ISD::FABS, MVT::f32, Legal); - setOperationAction(ISD::FABS, MVT::f64, Expand); - - setOperationAction(ISD::FNEG, MVT::f32, Legal); - setOperationAction(ISD::FNEG, MVT::f64, Expand); - } else { + // + // Library calls for unsupported operations + // - // Expand fp<->uint. 
- setOperationAction(ISD::FP_TO_SINT, MVT::i32, Expand); - setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand); + setLibcallName(RTLIB::SINTTOFP_I128_F64, "__hexagon_floattidf"); + setLibcallName(RTLIB::SINTTOFP_I128_F32, "__hexagon_floattisf"); + + setLibcallName(RTLIB::FPTOUINT_F32_I128, "__hexagon_fixunssfti"); + setLibcallName(RTLIB::FPTOUINT_F64_I128, "__hexagon_fixunsdfti"); + + setLibcallName(RTLIB::FPTOSINT_F32_I128, "__hexagon_fixsfti"); + setLibcallName(RTLIB::FPTOSINT_F64_I128, "__hexagon_fixdfti"); + + setLibcallName(RTLIB::SDIV_I32, "__hexagon_divsi3"); + setOperationAction(ISD::SDIV, MVT::i32, Expand); + setLibcallName(RTLIB::SREM_I32, "__hexagon_umodsi3"); + setOperationAction(ISD::SREM, MVT::i32, Expand); + + setLibcallName(RTLIB::SDIV_I64, "__hexagon_divdi3"); + setOperationAction(ISD::SDIV, MVT::i64, Expand); + setLibcallName(RTLIB::SREM_I64, "__hexagon_moddi3"); + setOperationAction(ISD::SREM, MVT::i64, Expand); + + setLibcallName(RTLIB::UDIV_I32, "__hexagon_udivsi3"); + setOperationAction(ISD::UDIV, MVT::i32, Expand); + + setLibcallName(RTLIB::UDIV_I64, "__hexagon_udivdi3"); + setOperationAction(ISD::UDIV, MVT::i64, Expand); + + setLibcallName(RTLIB::UREM_I32, "__hexagon_umodsi3"); + setOperationAction(ISD::UREM, MVT::i32, Expand); + + setLibcallName(RTLIB::UREM_I64, "__hexagon_umoddi3"); + setOperationAction(ISD::UREM, MVT::i64, Expand); + + setLibcallName(RTLIB::DIV_F32, "__hexagon_divsf3"); + setOperationAction(ISD::FDIV, MVT::f32, Expand); + + setLibcallName(RTLIB::DIV_F64, "__hexagon_divdf3"); + setOperationAction(ISD::FDIV, MVT::f64, Expand); + + setOperationAction(ISD::FSQRT, MVT::f32, Expand); + setOperationAction(ISD::FSQRT, MVT::f64, Expand); + setOperationAction(ISD::FSIN, MVT::f32, Expand); + setOperationAction(ISD::FSIN, MVT::f64, Expand); + + if (Subtarget.hasV5TOps()) { + // Hexagon V5 Support. 
+ setOperationAction(ISD::FADD, MVT::f32, Legal); + setOperationAction(ISD::FADD, MVT::f64, Legal); + setOperationAction(ISD::FP_EXTEND, MVT::f32, Legal); + setCondCodeAction(ISD::SETOEQ, MVT::f32, Legal); + setCondCodeAction(ISD::SETOEQ, MVT::f64, Legal); + setCondCodeAction(ISD::SETUEQ, MVT::f32, Legal); + setCondCodeAction(ISD::SETUEQ, MVT::f64, Legal); + + setCondCodeAction(ISD::SETOGE, MVT::f32, Legal); + setCondCodeAction(ISD::SETOGE, MVT::f64, Legal); + setCondCodeAction(ISD::SETUGE, MVT::f32, Legal); + setCondCodeAction(ISD::SETUGE, MVT::f64, Legal); + + setCondCodeAction(ISD::SETOGT, MVT::f32, Legal); + setCondCodeAction(ISD::SETOGT, MVT::f64, Legal); + setCondCodeAction(ISD::SETUGT, MVT::f32, Legal); + setCondCodeAction(ISD::SETUGT, MVT::f64, Legal); + + setCondCodeAction(ISD::SETOLE, MVT::f32, Legal); + setCondCodeAction(ISD::SETOLE, MVT::f64, Legal); + setCondCodeAction(ISD::SETOLT, MVT::f32, Legal); + setCondCodeAction(ISD::SETOLT, MVT::f64, Legal); + + setOperationAction(ISD::ConstantFP, MVT::f32, Legal); + setOperationAction(ISD::ConstantFP, MVT::f64, Legal); + + setOperationAction(ISD::FP_TO_UINT, MVT::i1, Promote); + setOperationAction(ISD::FP_TO_SINT, MVT::i1, Promote); + setOperationAction(ISD::UINT_TO_FP, MVT::i1, Promote); + setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote); + + setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote); + setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote); + setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote); + setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote); + + setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote); + setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote); + setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote); + setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote); + + setOperationAction(ISD::FP_TO_UINT, MVT::i32, Legal); + setOperationAction(ISD::FP_TO_SINT, MVT::i32, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::i32, Legal); + setOperationAction(ISD::SINT_TO_FP, MVT::i32, Legal); + + setOperationAction(ISD::FP_TO_UINT, MVT::i64, Legal); + setOperationAction(ISD::FP_TO_SINT, MVT::i64, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::i64, Legal); + setOperationAction(ISD::SINT_TO_FP, MVT::i64, Legal); + + setOperationAction(ISD::FABS, MVT::f32, Legal); + setOperationAction(ISD::FABS, MVT::f64, Expand); + + setOperationAction(ISD::FNEG, MVT::f32, Legal); + setOperationAction(ISD::FNEG, MVT::f64, Expand); + } else { - setOperationAction(ISD::SINT_TO_FP, MVT::i32, Expand); - setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand); + // Expand fp<->uint. 
+ setOperationAction(ISD::FP_TO_SINT, MVT::i32, Expand); + setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand); - setLibcallName(RTLIB::SINTTOFP_I64_F32, "__hexagon_floatdisf"); - setLibcallName(RTLIB::UINTTOFP_I64_F32, "__hexagon_floatundisf"); + setOperationAction(ISD::SINT_TO_FP, MVT::i32, Expand); + setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand); - setLibcallName(RTLIB::UINTTOFP_I32_F32, "__hexagon_floatunsisf"); - setLibcallName(RTLIB::SINTTOFP_I32_F32, "__hexagon_floatsisf"); + setLibcallName(RTLIB::SINTTOFP_I64_F32, "__hexagon_floatdisf"); + setLibcallName(RTLIB::UINTTOFP_I64_F32, "__hexagon_floatundisf"); - setLibcallName(RTLIB::SINTTOFP_I64_F64, "__hexagon_floatdidf"); - setLibcallName(RTLIB::UINTTOFP_I64_F64, "__hexagon_floatundidf"); + setLibcallName(RTLIB::UINTTOFP_I32_F32, "__hexagon_floatunsisf"); + setLibcallName(RTLIB::SINTTOFP_I32_F32, "__hexagon_floatsisf"); - setLibcallName(RTLIB::UINTTOFP_I32_F64, "__hexagon_floatunsidf"); - setLibcallName(RTLIB::SINTTOFP_I32_F64, "__hexagon_floatsidf"); + setLibcallName(RTLIB::SINTTOFP_I64_F64, "__hexagon_floatdidf"); + setLibcallName(RTLIB::UINTTOFP_I64_F64, "__hexagon_floatundidf"); - setLibcallName(RTLIB::FPTOUINT_F32_I32, "__hexagon_fixunssfsi"); - setLibcallName(RTLIB::FPTOUINT_F32_I64, "__hexagon_fixunssfdi"); + setLibcallName(RTLIB::UINTTOFP_I32_F64, "__hexagon_floatunsidf"); + setLibcallName(RTLIB::SINTTOFP_I32_F64, "__hexagon_floatsidf"); - setLibcallName(RTLIB::FPTOSINT_F64_I64, "__hexagon_fixdfdi"); - setLibcallName(RTLIB::FPTOSINT_F32_I64, "__hexagon_fixsfdi"); + setLibcallName(RTLIB::FPTOUINT_F32_I32, "__hexagon_fixunssfsi"); + setLibcallName(RTLIB::FPTOUINT_F32_I64, "__hexagon_fixunssfdi"); - setLibcallName(RTLIB::FPTOUINT_F64_I32, "__hexagon_fixunsdfsi"); - setLibcallName(RTLIB::FPTOUINT_F64_I64, "__hexagon_fixunsdfdi"); + setLibcallName(RTLIB::FPTOSINT_F64_I64, "__hexagon_fixdfdi"); + setLibcallName(RTLIB::FPTOSINT_F32_I64, "__hexagon_fixsfdi"); - setLibcallName(RTLIB::ADD_F64, "__hexagon_adddf3"); - setOperationAction(ISD::FADD, MVT::f64, Expand); + setLibcallName(RTLIB::FPTOUINT_F64_I32, "__hexagon_fixunsdfsi"); + setLibcallName(RTLIB::FPTOUINT_F64_I64, "__hexagon_fixunsdfdi"); - setLibcallName(RTLIB::ADD_F32, "__hexagon_addsf3"); - setOperationAction(ISD::FADD, MVT::f32, Expand); + setLibcallName(RTLIB::ADD_F64, "__hexagon_adddf3"); + setOperationAction(ISD::FADD, MVT::f64, Expand); - setLibcallName(RTLIB::FPEXT_F32_F64, "__hexagon_extendsfdf2"); - setOperationAction(ISD::FP_EXTEND, MVT::f32, Expand); + setLibcallName(RTLIB::ADD_F32, "__hexagon_addsf3"); + setOperationAction(ISD::FADD, MVT::f32, Expand); - setLibcallName(RTLIB::OEQ_F32, "__hexagon_eqsf2"); - setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand); + setLibcallName(RTLIB::FPEXT_F32_F64, "__hexagon_extendsfdf2"); + setOperationAction(ISD::FP_EXTEND, MVT::f32, Expand); - setLibcallName(RTLIB::OEQ_F64, "__hexagon_eqdf2"); - setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand); + setLibcallName(RTLIB::OEQ_F32, "__hexagon_eqsf2"); + setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand); - setLibcallName(RTLIB::OGE_F32, "__hexagon_gesf2"); - setCondCodeAction(ISD::SETOGE, MVT::f32, Expand); + setLibcallName(RTLIB::OEQ_F64, "__hexagon_eqdf2"); + setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand); - setLibcallName(RTLIB::OGE_F64, "__hexagon_gedf2"); - setCondCodeAction(ISD::SETOGE, MVT::f64, Expand); + setLibcallName(RTLIB::OGE_F32, "__hexagon_gesf2"); + setCondCodeAction(ISD::SETOGE, MVT::f32, Expand); - setLibcallName(RTLIB::OGT_F32, "__hexagon_gtsf2"); - 
setCondCodeAction(ISD::SETOGT, MVT::f32, Expand); + setLibcallName(RTLIB::OGE_F64, "__hexagon_gedf2"); + setCondCodeAction(ISD::SETOGE, MVT::f64, Expand); - setLibcallName(RTLIB::OGT_F64, "__hexagon_gtdf2"); - setCondCodeAction(ISD::SETOGT, MVT::f64, Expand); + setLibcallName(RTLIB::OGT_F32, "__hexagon_gtsf2"); + setCondCodeAction(ISD::SETOGT, MVT::f32, Expand); - setLibcallName(RTLIB::FPTOSINT_F64_I32, "__hexagon_fixdfsi"); - setOperationAction(ISD::FP_TO_SINT, MVT::f64, Expand); + setLibcallName(RTLIB::OGT_F64, "__hexagon_gtdf2"); + setCondCodeAction(ISD::SETOGT, MVT::f64, Expand); - setLibcallName(RTLIB::FPTOSINT_F32_I32, "__hexagon_fixsfsi"); - setOperationAction(ISD::FP_TO_SINT, MVT::f32, Expand); + setLibcallName(RTLIB::FPTOSINT_F64_I32, "__hexagon_fixdfsi"); + setOperationAction(ISD::FP_TO_SINT, MVT::f64, Expand); - setLibcallName(RTLIB::OLE_F64, "__hexagon_ledf2"); - setCondCodeAction(ISD::SETOLE, MVT::f64, Expand); + setLibcallName(RTLIB::FPTOSINT_F32_I32, "__hexagon_fixsfsi"); + setOperationAction(ISD::FP_TO_SINT, MVT::f32, Expand); - setLibcallName(RTLIB::OLE_F32, "__hexagon_lesf2"); - setCondCodeAction(ISD::SETOLE, MVT::f32, Expand); + setLibcallName(RTLIB::OLE_F64, "__hexagon_ledf2"); + setCondCodeAction(ISD::SETOLE, MVT::f64, Expand); - setLibcallName(RTLIB::OLT_F64, "__hexagon_ltdf2"); - setCondCodeAction(ISD::SETOLT, MVT::f64, Expand); + setLibcallName(RTLIB::OLE_F32, "__hexagon_lesf2"); + setCondCodeAction(ISD::SETOLE, MVT::f32, Expand); - setLibcallName(RTLIB::OLT_F32, "__hexagon_ltsf2"); - setCondCodeAction(ISD::SETOLT, MVT::f32, Expand); + setLibcallName(RTLIB::OLT_F64, "__hexagon_ltdf2"); + setCondCodeAction(ISD::SETOLT, MVT::f64, Expand); - setLibcallName(RTLIB::MUL_F64, "__hexagon_muldf3"); - setOperationAction(ISD::FMUL, MVT::f64, Expand); + setLibcallName(RTLIB::OLT_F32, "__hexagon_ltsf2"); + setCondCodeAction(ISD::SETOLT, MVT::f32, Expand); - setLibcallName(RTLIB::MUL_F32, "__hexagon_mulsf3"); - setOperationAction(ISD::MUL, MVT::f32, Expand); + setLibcallName(RTLIB::MUL_F64, "__hexagon_muldf3"); + setOperationAction(ISD::FMUL, MVT::f64, Expand); - setLibcallName(RTLIB::UNE_F64, "__hexagon_nedf2"); - setCondCodeAction(ISD::SETUNE, MVT::f64, Expand); + setLibcallName(RTLIB::MUL_F32, "__hexagon_mulsf3"); + setOperationAction(ISD::MUL, MVT::f32, Expand); - setLibcallName(RTLIB::UNE_F32, "__hexagon_nesf2"); + setLibcallName(RTLIB::UNE_F64, "__hexagon_nedf2"); + setCondCodeAction(ISD::SETUNE, MVT::f64, Expand); - setLibcallName(RTLIB::SUB_F64, "__hexagon_subdf3"); - setOperationAction(ISD::SUB, MVT::f64, Expand); + setLibcallName(RTLIB::UNE_F32, "__hexagon_nesf2"); - setLibcallName(RTLIB::SUB_F32, "__hexagon_subsf3"); - setOperationAction(ISD::SUB, MVT::f32, Expand); + setLibcallName(RTLIB::SUB_F64, "__hexagon_subdf3"); + setOperationAction(ISD::SUB, MVT::f64, Expand); - setLibcallName(RTLIB::FPROUND_F64_F32, "__hexagon_truncdfsf2"); - setOperationAction(ISD::FP_ROUND, MVT::f64, Expand); + setLibcallName(RTLIB::SUB_F32, "__hexagon_subsf3"); + setOperationAction(ISD::SUB, MVT::f32, Expand); - setLibcallName(RTLIB::UO_F64, "__hexagon_unorddf2"); - setCondCodeAction(ISD::SETUO, MVT::f64, Expand); + setLibcallName(RTLIB::FPROUND_F64_F32, "__hexagon_truncdfsf2"); + setOperationAction(ISD::FP_ROUND, MVT::f64, Expand); - setLibcallName(RTLIB::O_F64, "__hexagon_unorddf2"); - setCondCodeAction(ISD::SETO, MVT::f64, Expand); + setLibcallName(RTLIB::UO_F64, "__hexagon_unorddf2"); + setCondCodeAction(ISD::SETUO, MVT::f64, Expand); - setLibcallName(RTLIB::O_F32, 
"__hexagon_unordsf2"); - setCondCodeAction(ISD::SETO, MVT::f32, Expand); + setLibcallName(RTLIB::O_F64, "__hexagon_unorddf2"); + setCondCodeAction(ISD::SETO, MVT::f64, Expand); - setLibcallName(RTLIB::UO_F32, "__hexagon_unordsf2"); - setCondCodeAction(ISD::SETUO, MVT::f32, Expand); + setLibcallName(RTLIB::O_F32, "__hexagon_unordsf2"); + setCondCodeAction(ISD::SETO, MVT::f32, Expand); - setOperationAction(ISD::FABS, MVT::f32, Expand); - setOperationAction(ISD::FABS, MVT::f64, Expand); - setOperationAction(ISD::FNEG, MVT::f32, Expand); - setOperationAction(ISD::FNEG, MVT::f64, Expand); - } + setLibcallName(RTLIB::UO_F32, "__hexagon_unordsf2"); + setCondCodeAction(ISD::SETUO, MVT::f32, Expand); - setLibcallName(RTLIB::SREM_I32, "__hexagon_modsi3"); - setOperationAction(ISD::SREM, MVT::i32, Expand); - - setIndexedLoadAction(ISD::POST_INC, MVT::i8, Legal); - setIndexedLoadAction(ISD::POST_INC, MVT::i16, Legal); - setIndexedLoadAction(ISD::POST_INC, MVT::i32, Legal); - setIndexedLoadAction(ISD::POST_INC, MVT::i64, Legal); - - setIndexedStoreAction(ISD::POST_INC, MVT::i8, Legal); - setIndexedStoreAction(ISD::POST_INC, MVT::i16, Legal); - setIndexedStoreAction(ISD::POST_INC, MVT::i32, Legal); - setIndexedStoreAction(ISD::POST_INC, MVT::i64, Legal); - - setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand); - - // Turn FP extload into load/fextend. - setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand); - // Hexagon has a i1 sign extending load. - setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Expand); - // Turn FP truncstore into trunc + store. - setTruncStoreAction(MVT::f64, MVT::f32, Expand); - - // Custom legalize GlobalAddress nodes into CONST32. - setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); - setOperationAction(ISD::GlobalAddress, MVT::i8, Custom); - setOperationAction(ISD::BlockAddress, MVT::i32, Custom); - // Truncate action? - setOperationAction(ISD::TRUNCATE, MVT::i64, Expand); - - // Hexagon doesn't have sext_inreg, replace them with shl/sra. - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand); - - // Hexagon has no REM or DIVREM operations. - setOperationAction(ISD::UREM, MVT::i32, Expand); - setOperationAction(ISD::SREM, MVT::i32, Expand); - setOperationAction(ISD::SDIVREM, MVT::i32, Expand); - setOperationAction(ISD::UDIVREM, MVT::i32, Expand); - setOperationAction(ISD::SREM, MVT::i64, Expand); - setOperationAction(ISD::SDIVREM, MVT::i64, Expand); - setOperationAction(ISD::UDIVREM, MVT::i64, Expand); - - setOperationAction(ISD::BSWAP, MVT::i64, Expand); - - // Lower SELECT_CC to SETCC and SELECT. - setOperationAction(ISD::SELECT_CC, MVT::i1, Expand); - setOperationAction(ISD::SELECT_CC, MVT::i32, Expand); - setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); - - if (QRI->Subtarget.hasV5TOps()) { - - // We need to make the operation type of SELECT node to be Custom, - // such that we don't go into the infinite loop of - // select -> setcc -> select_cc -> select loop. 
- setOperationAction(ISD::SELECT, MVT::f32, Custom); - setOperationAction(ISD::SELECT, MVT::f64, Custom); - - setOperationAction(ISD::SELECT_CC, MVT::f32, Expand); - setOperationAction(ISD::SELECT_CC, MVT::f64, Expand); + setOperationAction(ISD::FABS, MVT::f32, Expand); + setOperationAction(ISD::FABS, MVT::f64, Expand); + setOperationAction(ISD::FNEG, MVT::f32, Expand); + setOperationAction(ISD::FNEG, MVT::f64, Expand); + } - } else { + setLibcallName(RTLIB::SREM_I32, "__hexagon_modsi3"); + setOperationAction(ISD::SREM, MVT::i32, Expand); + + setIndexedLoadAction(ISD::POST_INC, MVT::i8, Legal); + setIndexedLoadAction(ISD::POST_INC, MVT::i16, Legal); + setIndexedLoadAction(ISD::POST_INC, MVT::i32, Legal); + setIndexedLoadAction(ISD::POST_INC, MVT::i64, Legal); + + setIndexedStoreAction(ISD::POST_INC, MVT::i8, Legal); + setIndexedStoreAction(ISD::POST_INC, MVT::i16, Legal); + setIndexedStoreAction(ISD::POST_INC, MVT::i32, Legal); + setIndexedStoreAction(ISD::POST_INC, MVT::i64, Legal); + + setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand); + + // Turn FP extload into load/fextend. + setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand); + // Hexagon has a i1 sign extending load. + setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Expand); + // Turn FP truncstore into trunc + store. + setTruncStoreAction(MVT::f64, MVT::f32, Expand); + + // Custom legalize GlobalAddress nodes into CONST32. + setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); + setOperationAction(ISD::GlobalAddress, MVT::i8, Custom); + setOperationAction(ISD::BlockAddress, MVT::i32, Custom); + // Truncate action? + setOperationAction(ISD::TRUNCATE, MVT::i64, Expand); + + // Hexagon doesn't have sext_inreg, replace them with shl/sra. + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); + + // Hexagon has no REM or DIVREM operations. + setOperationAction(ISD::UREM, MVT::i32, Expand); + setOperationAction(ISD::SREM, MVT::i32, Expand); + setOperationAction(ISD::SDIVREM, MVT::i32, Expand); + setOperationAction(ISD::UDIVREM, MVT::i32, Expand); + setOperationAction(ISD::SREM, MVT::i64, Expand); + setOperationAction(ISD::SDIVREM, MVT::i64, Expand); + setOperationAction(ISD::UDIVREM, MVT::i64, Expand); + + setOperationAction(ISD::BSWAP, MVT::i64, Expand); + + // Lower SELECT_CC to SETCC and SELECT. + setOperationAction(ISD::SELECT_CC, MVT::i1, Expand); + setOperationAction(ISD::SELECT_CC, MVT::i32, Expand); + setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); + + if (Subtarget.hasV5TOps()) { + + // We need to make the operation type of SELECT node to be Custom, + // such that we don't go into the infinite loop of + // select -> setcc -> select_cc -> select loop. + setOperationAction(ISD::SELECT, MVT::f32, Custom); + setOperationAction(ISD::SELECT, MVT::f64, Custom); + + setOperationAction(ISD::SELECT_CC, MVT::f32, Expand); + setOperationAction(ISD::SELECT_CC, MVT::f64, Expand); - // Hexagon has no select or setcc: expand to SELECT_CC. - setOperationAction(ISD::SELECT, MVT::f32, Expand); - setOperationAction(ISD::SELECT, MVT::f64, Expand); - } + } else { - if (EmitJumpTables) { - setOperationAction(ISD::BR_JT, MVT::Other, Custom); - } else { - setOperationAction(ISD::BR_JT, MVT::Other, Expand); - } - // Increase jump tables cutover to 5, was 4. 
- setMinimumJumpTableEntries(5); - - setOperationAction(ISD::BR_CC, MVT::f32, Expand); - setOperationAction(ISD::BR_CC, MVT::f64, Expand); - setOperationAction(ISD::BR_CC, MVT::i1, Expand); - setOperationAction(ISD::BR_CC, MVT::i32, Expand); - setOperationAction(ISD::BR_CC, MVT::i64, Expand); - - setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom); - - setOperationAction(ISD::FSIN , MVT::f64, Expand); - setOperationAction(ISD::FCOS , MVT::f64, Expand); - setOperationAction(ISD::FREM , MVT::f64, Expand); - setOperationAction(ISD::FSIN , MVT::f32, Expand); - setOperationAction(ISD::FCOS , MVT::f32, Expand); - setOperationAction(ISD::FREM , MVT::f32, Expand); - setOperationAction(ISD::FSINCOS, MVT::f64, Expand); - setOperationAction(ISD::FSINCOS, MVT::f32, Expand); - - // In V4, we have double word add/sub with carry. The problem with - // modelling this instruction is that it produces 2 results - Rdd and Px. - // To model update of Px, we will have to use Defs[p0..p3] which will - // cause any predicate live range to spill. So, we pretend we dont't - // have these instructions. - setOperationAction(ISD::ADDE, MVT::i8, Expand); - setOperationAction(ISD::ADDE, MVT::i16, Expand); - setOperationAction(ISD::ADDE, MVT::i32, Expand); - setOperationAction(ISD::ADDE, MVT::i64, Expand); - setOperationAction(ISD::SUBE, MVT::i8, Expand); - setOperationAction(ISD::SUBE, MVT::i16, Expand); - setOperationAction(ISD::SUBE, MVT::i32, Expand); - setOperationAction(ISD::SUBE, MVT::i64, Expand); - setOperationAction(ISD::ADDC, MVT::i8, Expand); - setOperationAction(ISD::ADDC, MVT::i16, Expand); - setOperationAction(ISD::ADDC, MVT::i32, Expand); - setOperationAction(ISD::ADDC, MVT::i64, Expand); - setOperationAction(ISD::SUBC, MVT::i8, Expand); - setOperationAction(ISD::SUBC, MVT::i16, Expand); - setOperationAction(ISD::SUBC, MVT::i32, Expand); - setOperationAction(ISD::SUBC, MVT::i64, Expand); - - setOperationAction(ISD::CTPOP, MVT::i32, Expand); - setOperationAction(ISD::CTPOP, MVT::i64, Expand); - setOperationAction(ISD::CTTZ , MVT::i32, Expand); - setOperationAction(ISD::CTTZ , MVT::i64, Expand); - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand); - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand); - setOperationAction(ISD::CTLZ , MVT::i32, Expand); - setOperationAction(ISD::CTLZ , MVT::i64, Expand); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand); - setOperationAction(ISD::ROTL , MVT::i32, Expand); - setOperationAction(ISD::ROTR , MVT::i32, Expand); - setOperationAction(ISD::BSWAP, MVT::i32, Expand); - setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); - setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); - setOperationAction(ISD::FPOW , MVT::f64, Expand); - setOperationAction(ISD::FPOW , MVT::f32, Expand); - - setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand); - setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand); - setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand); - - setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand); - setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand); - - setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand); - setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand); - - setOperationAction(ISD::EH_RETURN, MVT::Other, Custom); - - if (TM.getSubtargetImpl()->isSubtargetV2()) { - setExceptionPointerRegister(Hexagon::R20); - setExceptionSelectorRegister(Hexagon::R21); - } else { - setExceptionPointerRegister(Hexagon::R0); - setExceptionSelectorRegister(Hexagon::R1); - } + // 
Hexagon has no select or setcc: expand to SELECT_CC.
+    setOperationAction(ISD::SELECT, MVT::f32, Expand);
+    setOperationAction(ISD::SELECT, MVT::f64, Expand);
+  }

-  // VASTART needs to be custom lowered to use the VarArgsFrameIndex.
-  setOperationAction(ISD::VASTART , MVT::Other, Custom);
+  if (EmitJumpTables) {
+    setOperationAction(ISD::BR_JT, MVT::Other, Custom);
+  } else {
+    setOperationAction(ISD::BR_JT, MVT::Other, Expand);
+  }
+  // Increase jump tables cutover to 5, was 4.
+  setMinimumJumpTableEntries(5);
+
+  setOperationAction(ISD::BR_CC, MVT::f32, Expand);
+  setOperationAction(ISD::BR_CC, MVT::f64, Expand);
+  setOperationAction(ISD::BR_CC, MVT::i1, Expand);
+  setOperationAction(ISD::BR_CC, MVT::i32, Expand);
+  setOperationAction(ISD::BR_CC, MVT::i64, Expand);
+
+  setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
+
+  setOperationAction(ISD::FSIN, MVT::f64, Expand);
+  setOperationAction(ISD::FCOS, MVT::f64, Expand);
+  setOperationAction(ISD::FREM, MVT::f64, Expand);
+  setOperationAction(ISD::FSIN, MVT::f32, Expand);
+  setOperationAction(ISD::FCOS, MVT::f32, Expand);
+  setOperationAction(ISD::FREM, MVT::f32, Expand);
+  setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
+  setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
+
+  // In V4, we have double word add/sub with carry. The problem with
+  // modelling this instruction is that it produces 2 results - Rdd and Px.
+  // To model update of Px, we will have to use Defs[p0..p3] which will
+  // cause any predicate live range to spill. So, we pretend we don't
+  // have these instructions.
+  setOperationAction(ISD::ADDE, MVT::i8, Expand);
+  setOperationAction(ISD::ADDE, MVT::i16, Expand);
+  setOperationAction(ISD::ADDE, MVT::i32, Expand);
+  setOperationAction(ISD::ADDE, MVT::i64, Expand);
+  setOperationAction(ISD::SUBE, MVT::i8, Expand);
+  setOperationAction(ISD::SUBE, MVT::i16, Expand);
+  setOperationAction(ISD::SUBE, MVT::i32, Expand);
+  setOperationAction(ISD::SUBE, MVT::i64, Expand);
+  setOperationAction(ISD::ADDC, MVT::i8, Expand);
+  setOperationAction(ISD::ADDC, MVT::i16, Expand);
+  setOperationAction(ISD::ADDC, MVT::i32, Expand);
+  setOperationAction(ISD::ADDC, MVT::i64, Expand);
+  setOperationAction(ISD::SUBC, MVT::i8, Expand);
+  setOperationAction(ISD::SUBC, MVT::i16, Expand);
+  setOperationAction(ISD::SUBC, MVT::i32, Expand);
+  setOperationAction(ISD::SUBC, MVT::i64, Expand);
+
+  setOperationAction(ISD::CTPOP, MVT::i32, Expand);
+  setOperationAction(ISD::CTPOP, MVT::i64, Expand);
+  setOperationAction(ISD::CTTZ, MVT::i32, Expand);
+  setOperationAction(ISD::CTTZ, MVT::i64, Expand);
+  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand);
+  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
+  setOperationAction(ISD::CTLZ, MVT::i32, Expand);
+  setOperationAction(ISD::CTLZ, MVT::i64, Expand);
+  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand);
+  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
+  setOperationAction(ISD::ROTL, MVT::i32, Expand);
+  setOperationAction(ISD::ROTR, MVT::i32, Expand);
+  setOperationAction(ISD::BSWAP, MVT::i32, Expand);
+  setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
+  setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
+  setOperationAction(ISD::FPOW, MVT::f64, Expand);
+  setOperationAction(ISD::FPOW, MVT::f32, Expand);
+
+  setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand);
+  setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand);
+  setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand);
+
+  setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
+
+  setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
+
+  setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
+  setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
+
+  setOperationAction(ISD::EH_RETURN, MVT::Other, Custom);
+
+  if (Subtarget.isSubtargetV2()) {
+    setExceptionPointerRegister(Hexagon::R20);
+    setExceptionSelectorRegister(Hexagon::R21);
+  } else {
+    setExceptionPointerRegister(Hexagon::R0);
+    setExceptionSelectorRegister(Hexagon::R1);
+  }

-  // VASTART needs to be custom lowered to use the VarArgsFrameIndex.
-  setOperationAction(ISD::VASTART , MVT::Other, Custom);
+  // VASTART needs to be custom lowered to use the VarArgsFrameIndex.
+  setOperationAction(ISD::VASTART, MVT::Other, Custom);
+  // Use the default implementation.
+  setOperationAction(ISD::VAARG, MVT::Other, Expand);
+  setOperationAction(ISD::VACOPY, MVT::Other, Expand);
+  setOperationAction(ISD::VAEND, MVT::Other, Expand);
+  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
+  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);

-  // Use the default implementation.
-  setOperationAction(ISD::VAARG , MVT::Other, Expand);
-  setOperationAction(ISD::VACOPY , MVT::Other, Expand);
-  setOperationAction(ISD::VAEND , MVT::Other, Expand);
-  setOperationAction(ISD::STACKSAVE , MVT::Other, Expand);
-  setOperationAction(ISD::STACKRESTORE , MVT::Other, Expand);
-  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32 , Custom);
-  setOperationAction(ISD::INLINEASM , MVT::Other, Custom);
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
+  setOperationAction(ISD::INLINEASM, MVT::Other, Custom);

-  setMinFunctionAlignment(2);
+  setMinFunctionAlignment(2);

-  // Needed for DYNAMIC_STACKALLOC expansion.
-  unsigned StackRegister = TM.getRegisterInfo()->getStackRegister();
-  setStackPointerRegisterToSaveRestore(StackRegister);
-  setSchedulingPreference(Sched::VLIW);
+  // Needed for DYNAMIC_STACKALLOC expansion.
+  const HexagonRegisterInfo *QRI =
+      static_cast<const HexagonRegisterInfo *>(TM.getRegisterInfo());
+  setStackPointerRegisterToSaveRestore(QRI->getStackRegister());
+  setSchedulingPreference(Sched::VLIW);
 }

-
 const char* HexagonTargetLowering::getTargetNodeName(unsigned Opcode) const {
   switch (Opcode) {
@@ -1620,8 +1623,7 @@ HexagonTargetLowering::getRegForInlineAsmConstraint(const
 /// specified FP immediate natively. If false, the legalizer will
 /// materialize the FP immediate as a load from a constant pool.
 bool HexagonTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
-  const HexagonRegisterInfo* QRI = TM.getRegisterInfo();
-  return QRI->Subtarget.hasV5TOps();
+  return TM.getSubtarget<HexagonSubtarget>().hasV5TOps();
 }

 /// isLegalAddressingMode - Return true if the addressing mode represented by
diff --git a/lib/Target/Hexagon/HexagonISelLowering.h b/lib/Target/Hexagon/HexagonISelLowering.h
index 0ddaf846b31d..ec16cc8f894b 100644
--- a/lib/Target/Hexagon/HexagonISelLowering.h
+++ b/lib/Target/Hexagon/HexagonISelLowering.h
@@ -74,8 +74,8 @@ namespace llvm {
                     unsigned& RetSize) const;

   public:
-    HexagonTargetMachine &TM;
-    explicit HexagonTargetLowering(HexagonTargetMachine &targetmachine);
+    const TargetMachine &TM;
+    explicit HexagonTargetLowering(const TargetMachine &targetmachine);

     /// IsEligibleForTailCallOptimization - Check whether the call is eligible
     /// for tail call optimization. Targets which want to do tail call
diff --git a/lib/Target/Hexagon/HexagonInstrInfo.cpp b/lib/Target/Hexagon/HexagonInstrInfo.cpp
index ea6367a89bce..1c95e06c8923 100644
--- a/lib/Target/Hexagon/HexagonInstrInfo.cpp
+++ b/lib/Target/Hexagon/HexagonInstrInfo.cpp
@@ -1538,14 +1538,13 @@ int HexagonInstrInfo::GetDotOldOp(const int opc) const {
   int NewOp = opc;
   if (isPredicated(NewOp) && isPredicatedNew(NewOp)) { // Get predicate old form
     NewOp = Hexagon::getPredOldOpcode(NewOp);
-    if (NewOp < 0)
-      assert(0 && "Couldn't change predicate new instruction to its old form.");
+    assert(NewOp >= 0 &&
+           "Couldn't change predicate new instruction to its old form.");
   }

   if (isNewValueStore(NewOp)) { // Convert into non-new-value format
     NewOp = Hexagon::getNonNVStore(NewOp);
-    if (NewOp < 0)
-      assert(0 && "Couldn't change new-value store to its old form.");
+    assert(NewOp >= 0 && "Couldn't change new-value store to its old form.");
   }
   return NewOp;
 }
diff --git a/lib/Target/Hexagon/HexagonMachineScheduler.cpp b/lib/Target/Hexagon/HexagonMachineScheduler.cpp
index 7dd6e956ccb6..6fcaa2057404 100644
--- a/lib/Target/Hexagon/HexagonMachineScheduler.cpp
+++ b/lib/Target/Hexagon/HexagonMachineScheduler.cpp
@@ -20,7 +20,7 @@ using namespace llvm;

 #define DEBUG_TYPE "misched"

-/// Platform specific modifications to DAG.
+/// Platform-specific modifications to DAG.
 void VLIWMachineScheduler::postprocessDAG() {
   SUnit* LastSequentialCall = nullptr;
   // Currently we only catch the situation when compare gets scheduled
@@ -150,7 +150,7 @@ void VLIWMachineScheduler::schedule() {

   buildDAGWithRegPressure();

-  // Postprocess the DAG to add platform specific artificial dependencies.
+  // Postprocess the DAG to add platform-specific artificial dependencies.
   postprocessDAG();

   SmallVector<SUnit*, 8> TopRoots, BotRoots;
diff --git a/lib/Target/Hexagon/HexagonMachineScheduler.h b/lib/Target/Hexagon/HexagonMachineScheduler.h
index 99100a141e6e..8c4108609606 100644
--- a/lib/Target/Hexagon/HexagonMachineScheduler.h
+++ b/lib/Target/Hexagon/HexagonMachineScheduler.h
@@ -100,7 +100,7 @@ class VLIWMachineScheduler : public ScheduleDAGMILive {
   /// Schedule - This is called back from ScheduleDAGInstrs::Run() when it's
   /// time to do some work.
   virtual void schedule() override;
-  /// Perform platform specific DAG postprocessing.
+  /// Perform platform-specific DAG postprocessing.
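(The "artificial dependencies" this hook adds are ordinary scheduling-only
edges. A sketch of the kind of edge involved, assuming the SDep API used by
the scheduler -- the exact heuristic is the one in postprocessDAG() above,
which keeps a compare close to the preceding call:

  if (SU->getInstr()->isCompare() && LastSequentialCall)
    SU->addPred(SDep(LastSequentialCall, SDep::Barrier));
)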
   void postprocessDAG();
 };

diff --git a/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp b/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp
index f66ffd284a93..b5db997eb1b8 100644
--- a/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp
+++ b/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp
@@ -18,8 +18,8 @@ using namespace llvm;

 bool llvm::flag_aligned_memcpy;

-HexagonSelectionDAGInfo::HexagonSelectionDAGInfo(const HexagonTargetMachine &TM)
-    : TargetSelectionDAGInfo(TM.getDataLayout()) {}
+HexagonSelectionDAGInfo::HexagonSelectionDAGInfo(const DataLayout &DL)
+    : TargetSelectionDAGInfo(&DL) {}

 HexagonSelectionDAGInfo::~HexagonSelectionDAGInfo() {
 }
diff --git a/lib/Target/Hexagon/HexagonSelectionDAGInfo.h b/lib/Target/Hexagon/HexagonSelectionDAGInfo.h
index 8ba6108bdfad..b40b30320cf1 100644
--- a/lib/Target/Hexagon/HexagonSelectionDAGInfo.h
+++ b/lib/Target/Hexagon/HexagonSelectionDAGInfo.h
@@ -18,11 +18,9 @@

 namespace llvm {

-class HexagonTargetMachine;
-
 class HexagonSelectionDAGInfo : public TargetSelectionDAGInfo {
 public:
-  explicit HexagonSelectionDAGInfo(const HexagonTargetMachine &TM);
+  explicit HexagonSelectionDAGInfo(const DataLayout &DL);
   ~HexagonSelectionDAGInfo();

   SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
diff --git a/lib/Target/Hexagon/HexagonSubtarget.cpp b/lib/Target/Hexagon/HexagonSubtarget.cpp
index 70c87fa19d1a..657893f32fee 100644
--- a/lib/Target/Hexagon/HexagonSubtarget.cpp
+++ b/lib/Target/Hexagon/HexagonSubtarget.cpp
@@ -48,10 +48,8 @@ EnableIEEERndNear(
   cl::Hidden, cl::ZeroOrMore, cl::init(false),
   cl::desc("Generate non-chopped conversion from fp to int."));

-HexagonSubtarget::HexagonSubtarget(StringRef TT, StringRef CPU, StringRef FS):
-  HexagonGenSubtargetInfo(TT, CPU, FS),
-  CPUString(CPU.str()) {
-
+HexagonSubtarget &
+HexagonSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) {
   // If the programmer has not specified a Hexagon version, default to -mv4.
   if (CPUString.empty())
     CPUString = "hexagonv4";
@@ -70,6 +68,15 @@ HexagonSubtarget::HexagonSubtarget(StringRef TT, StringRef CPU, StringRef FS):
   }

   ParseSubtargetFeatures(CPUString, FS);
+  return *this;
+}
+
+HexagonSubtarget::HexagonSubtarget(StringRef TT, StringRef CPU, StringRef FS,
+                                   const TargetMachine &TM)
+    : HexagonGenSubtargetInfo(TT, CPU, FS), CPUString(CPU.str()),
+      DL("e-m:e-p:32:32-i1:32-i64:64-a:0-n32"),
+      InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM),
+      TSInfo(DL), FrameLowering() {

   // Initialize scheduling itinerary for the specified CPU.
   InstrItins = getInstrItineraryForCPU(CPUString);
diff --git a/lib/Target/Hexagon/HexagonSubtarget.h b/lib/Target/Hexagon/HexagonSubtarget.h
index 690bef0d7296..b184e62b4d0d 100644
--- a/lib/Target/Hexagon/HexagonSubtarget.h
+++ b/lib/Target/Hexagon/HexagonSubtarget.h
@@ -14,6 +14,11 @@
 #ifndef Hexagon_SUBTARGET_H
 #define Hexagon_SUBTARGET_H

+#include "HexagonFrameLowering.h"
+#include "HexagonInstrInfo.h"
+#include "HexagonISelLowering.h"
+#include "HexagonSelectionDAGInfo.h"
+#include "llvm/IR/DataLayout.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 #include <string>
@@ -28,6 +33,7 @@ namespace llvm {

 class HexagonSubtarget : public HexagonGenSubtargetInfo {
   virtual void anchor();
+
   bool UseMemOps;
   bool ModeIEEERndNear;

@@ -37,16 +43,35 @@ class HexagonSubtarget : public HexagonGenSubtargetInfo {
   };

   HexagonArchEnum HexagonArchVersion;
+private:
   std::string CPUString;
+  const DataLayout DL; // Calculates type size & alignment.
+  HexagonInstrInfo InstrInfo;
+  HexagonTargetLowering TLInfo;
+  HexagonSelectionDAGInfo TSInfo;
+  HexagonFrameLowering FrameLowering;
   InstrItineraryData InstrItins;

 public:
-  HexagonSubtarget(StringRef TT, StringRef CPU, StringRef FS);
+  HexagonSubtarget(StringRef TT, StringRef CPU, StringRef FS,
+                   const TargetMachine &TM);

   /// getInstrItins - Return the instruction itineraries based on subtarget
   /// selection.
   const InstrItineraryData &getInstrItineraryData() const { return InstrItins; }
+  const HexagonInstrInfo *getInstrInfo() const { return &InstrInfo; }
+  const HexagonRegisterInfo *getRegisterInfo() const {
+    return &InstrInfo.getRegisterInfo();
+  }
+  const HexagonTargetLowering *getTargetLowering() const { return &TLInfo; }
+  const HexagonFrameLowering *getFrameLowering() const {
+    return &FrameLowering;
+  }
+  const HexagonSelectionDAGInfo *getSelectionDAGInfo() const { return &TSInfo; }
+  const DataLayout *getDataLayout() const { return &DL; }
+  HexagonSubtarget &initializeSubtargetDependencies(StringRef CPU,
+                                                    StringRef FS);

   /// ParseSubtargetFeatures - Parses features string setting specified
   /// subtarget options. Definition of function is auto generated by tblgen.
diff --git a/lib/Target/Hexagon/HexagonTargetMachine.cpp b/lib/Target/Hexagon/HexagonTargetMachine.cpp
index b9237647ff4a..78314100d18a 100644
--- a/lib/Target/Hexagon/HexagonTargetMachine.cpp
+++ b/lib/Target/Hexagon/HexagonTargetMachine.cpp
@@ -67,15 +67,10 @@ SchedCustomRegistry("hexagon", "Run Hexagon's custom scheduler",
 HexagonTargetMachine::HexagonTargetMachine(const Target &T, StringRef TT,
                                            StringRef CPU, StringRef FS,
                                            const TargetOptions &Options,
-                                           Reloc::Model RM,
-                                           CodeModel::Model CM,
+                                           Reloc::Model RM, CodeModel::Model CM,
                                            CodeGenOpt::Level OL)
-    : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
-      DL("e-m:e-p:32:32-i1:32-i64:64-a:0-n32") ,
-      Subtarget(TT, CPU, FS), InstrInfo(Subtarget), TLInfo(*this),
-      TSInfo(*this),
-      FrameLowering(Subtarget),
-      InstrItins(&Subtarget.getInstrItineraryData()) {
+    : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
+      Subtarget(TT, CPU, FS, *this) {
   initAsmInfo();
 }

diff --git a/lib/Target/Hexagon/HexagonTargetMachine.h b/lib/Target/Hexagon/HexagonTargetMachine.h
index 70b835e61eef..d88178e052e7 100644
--- a/lib/Target/Hexagon/HexagonTargetMachine.h
+++ b/lib/Target/Hexagon/HexagonTargetMachine.h
@@ -14,12 +14,8 @@
 #ifndef HexagonTARGETMACHINE_H
 #define HexagonTARGETMACHINE_H

-#include "HexagonFrameLowering.h"
-#include "HexagonISelLowering.h"
 #include "HexagonInstrInfo.h"
-#include "HexagonSelectionDAGInfo.h"
 #include "HexagonSubtarget.h"
-#include "llvm/IR/DataLayout.h"
 #include "llvm/Target/TargetMachine.h"

 namespace llvm {
@@ -27,13 +23,7 @@ namespace llvm {
 class Module;

 class HexagonTargetMachine : public LLVMTargetMachine {
-  const DataLayout DL; // Calculates type size & alignment.
HexagonSubtarget Subtarget; - HexagonInstrInfo InstrInfo; - HexagonTargetLowering TLInfo; - HexagonSelectionDAGInfo TSInfo; - HexagonFrameLowering FrameLowering; - const InstrItineraryData* InstrItins; public: HexagonTargetMachine(const Target &T, StringRef TT,StringRef CPU, @@ -42,33 +32,29 @@ class HexagonTargetMachine : public LLVMTargetMachine { CodeGenOpt::Level OL); const HexagonInstrInfo *getInstrInfo() const override { - return &InstrInfo; + return getSubtargetImpl()->getInstrInfo(); } const HexagonSubtarget *getSubtargetImpl() const override { return &Subtarget; } const HexagonRegisterInfo *getRegisterInfo() const override { - return &InstrInfo.getRegisterInfo(); + return getSubtargetImpl()->getRegisterInfo(); } - const InstrItineraryData* getInstrItineraryData() const override { - return InstrItins; + return &getSubtargetImpl()->getInstrItineraryData(); } - - const HexagonTargetLowering* getTargetLowering() const override { - return &TLInfo; + return getSubtargetImpl()->getTargetLowering(); } - const HexagonFrameLowering* getFrameLowering() const override { - return &FrameLowering; + return getSubtargetImpl()->getFrameLowering(); } - const HexagonSelectionDAGInfo* getSelectionDAGInfo() const override { - return &TSInfo; + return getSubtargetImpl()->getSelectionDAGInfo(); + } + const DataLayout *getDataLayout() const override { + return getSubtargetImpl()->getDataLayout(); } - - const DataLayout *getDataLayout() const override { return &DL; } static unsigned getModuleMatchQuality(const Module &M); TargetPassConfig *createPassConfig(PassManagerBase &PM) override; diff --git a/lib/Target/Hexagon/LLVMBuild.txt b/lib/Target/Hexagon/LLVMBuild.txt index 0cf9a062b659..a436b6e0454e 100644 --- a/lib/Target/Hexagon/LLVMBuild.txt +++ b/lib/Target/Hexagon/LLVMBuild.txt @@ -28,5 +28,5 @@ has_asmprinter = 1 type = Library name = HexagonCodeGen parent = Hexagon -required_libraries = Analysis AsmPrinter CodeGen Core HexagonAsmPrinter HexagonDesc HexagonInfo MC Scalar SelectionDAG Support Target TransformUtils +required_libraries = Analysis AsmPrinter CodeGen Core HexagonAsmPrinter HexagonDesc HexagonInfo MC SelectionDAG Support Target add_to_library_groups = Hexagon diff --git a/lib/Target/MSP430/MSP430FrameLowering.h b/lib/Target/MSP430/MSP430FrameLowering.h index d464dd99698a..fadfeedd1853 100644 --- a/lib/Target/MSP430/MSP430FrameLowering.h +++ b/lib/Target/MSP430/MSP430FrameLowering.h @@ -15,20 +15,15 @@ #define MSP430_FRAMEINFO_H #include "MSP430.h" -#include "MSP430Subtarget.h" #include "llvm/Target/TargetFrameLowering.h" namespace llvm { - class MSP430Subtarget; - class MSP430FrameLowering : public TargetFrameLowering { protected: - const MSP430Subtarget &STI; public: - explicit MSP430FrameLowering(const MSP430Subtarget &sti) - : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 2, -2, 2), - STI(sti) {} + explicit MSP430FrameLowering() + : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 2, -2, 2) {} /// emitProlog/emitEpilog - These methods insert prolog and epilog code into /// the function. 
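(The MSP430 hunks that follow repeat the refactor just applied to Hexagon: the
Subtarget becomes the owner of the frame lowering, instruction info, target
lowering, and selection-DAG info, and the TargetMachine getters simply forward
to getSubtargetImpl(). A condensed sketch of the resulting shape -- using
hypothetical Foo names, with constructors and the remaining members omitted:

  class FooSubtarget : public TargetSubtargetInfo {
    FooFrameLowering FrameLowering;
    FooInstrInfo InstrInfo;
  public:
    const FooFrameLowering *getFrameLowering() const { return &FrameLowering; }
    const FooInstrInfo *getInstrInfo() const { return &InstrInfo; }
  };

  class FooTargetMachine : public LLVMTargetMachine {
    FooSubtarget Subtarget; // single owner of the per-target objects
  public:
    const FooSubtarget *getSubtargetImpl() const override { return &Subtarget; }
    const FooInstrInfo *getInstrInfo() const override {
      return getSubtargetImpl()->getInstrInfo();
    }
  };
)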
diff --git a/lib/Target/MSP430/MSP430ISelLowering.cpp b/lib/Target/MSP430/MSP430ISelLowering.cpp index c5901bcd6e47..3d3ee92d4bcb 100644 --- a/lib/Target/MSP430/MSP430ISelLowering.cpp +++ b/lib/Target/MSP430/MSP430ISelLowering.cpp @@ -57,11 +57,8 @@ HWMultMode("msp430-hwmult-mode", cl::Hidden, "Assume hardware multiplier cannot be used inside interrupts"), clEnumValEnd)); -MSP430TargetLowering::MSP430TargetLowering(MSP430TargetMachine &tm) : - TargetLowering(tm, new TargetLoweringObjectFileELF()), - Subtarget(*tm.getSubtargetImpl()) { - - TD = getDataLayout(); +MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM) + : TargetLowering(TM, new TargetLoweringObjectFileELF()) { // Set up the register classes. addRegisterClass(MVT::i8, &MSP430::GR8RegClass); @@ -1032,7 +1029,7 @@ MSP430TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const { if (ReturnAddrIndex == 0) { // Set up a frame object for the return address. - uint64_t SlotSize = TD->getPointerSize(); + uint64_t SlotSize = getDataLayout()->getPointerSize(); ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize, true); FuncInfo->setRAIndex(ReturnAddrIndex); @@ -1055,7 +1052,7 @@ SDValue MSP430TargetLowering::LowerRETURNADDR(SDValue Op, if (Depth > 0) { SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); SDValue Offset = - DAG.getConstant(TD->getPointerSize(), MVT::i16); + DAG.getConstant(getDataLayout()->getPointerSize(), MVT::i16); return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), DAG.getNode(ISD::ADD, dl, getPointerTy(), FrameAddr, Offset), diff --git a/lib/Target/MSP430/MSP430ISelLowering.h b/lib/Target/MSP430/MSP430ISelLowering.h index 3ced61de3f4f..3e2f344aeb5a 100644 --- a/lib/Target/MSP430/MSP430ISelLowering.h +++ b/lib/Target/MSP430/MSP430ISelLowering.h @@ -66,12 +66,9 @@ namespace llvm { }; } - class MSP430Subtarget; - class MSP430TargetMachine; - class MSP430TargetLowering : public TargetLowering { public: - explicit MSP430TargetLowering(MSP430TargetMachine &TM); + explicit MSP430TargetLowering(const TargetMachine &TM); MVT getScalarShiftAmountTy(EVT LHSTy) const override { return MVT::i8; } @@ -170,9 +167,6 @@ namespace llvm { SDValue &Offset, ISD::MemIndexedMode &AM, SelectionDAG &DAG) const override; - - const MSP430Subtarget &Subtarget; - const DataLayout *TD; }; } // namespace llvm diff --git a/lib/Target/MSP430/MSP430InstrInfo.cpp b/lib/Target/MSP430/MSP430InstrInfo.cpp index 0c04ddbb110b..ccb6c09e3f41 100644 --- a/lib/Target/MSP430/MSP430InstrInfo.cpp +++ b/lib/Target/MSP430/MSP430InstrInfo.cpp @@ -30,9 +30,9 @@ using namespace llvm; // Pin the vtable to this file. void MSP430InstrInfo::anchor() {} -MSP430InstrInfo::MSP430InstrInfo(MSP430TargetMachine &tm) +MSP430InstrInfo::MSP430InstrInfo(MSP430Subtarget &STI) : MSP430GenInstrInfo(MSP430::ADJCALLSTACKDOWN, MSP430::ADJCALLSTACKUP), - RI(tm) {} + RI() {} void MSP430InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, diff --git a/lib/Target/MSP430/MSP430InstrInfo.h b/lib/Target/MSP430/MSP430InstrInfo.h index 1ffcebb01543..e6baaefe4842 100644 --- a/lib/Target/MSP430/MSP430InstrInfo.h +++ b/lib/Target/MSP430/MSP430InstrInfo.h @@ -22,7 +22,7 @@ namespace llvm { -class MSP430TargetMachine; +class MSP430Subtarget; /// MSP430II - This namespace holds all of the target specific flags that /// instruction info tracks. 
@@ -44,7 +44,7 @@ class MSP430InstrInfo : public MSP430GenInstrInfo { const MSP430RegisterInfo RI; virtual void anchor(); public: - explicit MSP430InstrInfo(MSP430TargetMachine &TM); + explicit MSP430InstrInfo(MSP430Subtarget &STI); /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As /// such, whenever a client has an instance of instruction info, it should diff --git a/lib/Target/MSP430/MSP430RegisterInfo.cpp b/lib/Target/MSP430/MSP430RegisterInfo.cpp index 341fb64b8fcb..691bceef0de0 100644 --- a/lib/Target/MSP430/MSP430RegisterInfo.cpp +++ b/lib/Target/MSP430/MSP430RegisterInfo.cpp @@ -32,10 +32,8 @@ using namespace llvm; #include "MSP430GenRegisterInfo.inc" // FIXME: Provide proper call frame setup / destroy opcodes. -MSP430RegisterInfo::MSP430RegisterInfo(MSP430TargetMachine &tm) - : MSP430GenRegisterInfo(MSP430::PCW), TM(tm) { - StackAlign = TM.getFrameLowering()->getStackAlignment(); -} +MSP430RegisterInfo::MSP430RegisterInfo() + : MSP430GenRegisterInfo(MSP430::PCW) {} const MCPhysReg* MSP430RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { diff --git a/lib/Target/MSP430/MSP430RegisterInfo.h b/lib/Target/MSP430/MSP430RegisterInfo.h index a607528fd118..cb0196134a8d 100644 --- a/lib/Target/MSP430/MSP430RegisterInfo.h +++ b/lib/Target/MSP430/MSP430RegisterInfo.h @@ -21,18 +21,9 @@ namespace llvm { -class TargetInstrInfo; -class MSP430TargetMachine; - struct MSP430RegisterInfo : public MSP430GenRegisterInfo { -private: - MSP430TargetMachine &TM; - - /// StackAlign - Default stack alignment. - /// - unsigned StackAlign; public: - MSP430RegisterInfo(MSP430TargetMachine &tm); + MSP430RegisterInfo(); /// Code Generation virtual methods... const MCPhysReg * diff --git a/lib/Target/MSP430/MSP430SelectionDAGInfo.cpp b/lib/Target/MSP430/MSP430SelectionDAGInfo.cpp index 6ebddaffd30a..3897ef684d4d 100644 --- a/lib/Target/MSP430/MSP430SelectionDAGInfo.cpp +++ b/lib/Target/MSP430/MSP430SelectionDAGInfo.cpp @@ -16,8 +16,8 @@ using namespace llvm; #define DEBUG_TYPE "msp430-selectiondag-info" -MSP430SelectionDAGInfo::MSP430SelectionDAGInfo(const MSP430TargetMachine &TM) - : TargetSelectionDAGInfo(TM.getDataLayout()) {} +MSP430SelectionDAGInfo::MSP430SelectionDAGInfo(const DataLayout &DL) + : TargetSelectionDAGInfo(&DL) {} MSP430SelectionDAGInfo::~MSP430SelectionDAGInfo() { } diff --git a/lib/Target/MSP430/MSP430SelectionDAGInfo.h b/lib/Target/MSP430/MSP430SelectionDAGInfo.h index fa8194830ff8..cb04adc0de1e 100644 --- a/lib/Target/MSP430/MSP430SelectionDAGInfo.h +++ b/lib/Target/MSP430/MSP430SelectionDAGInfo.h @@ -22,7 +22,7 @@ class MSP430TargetMachine; class MSP430SelectionDAGInfo : public TargetSelectionDAGInfo { public: - explicit MSP430SelectionDAGInfo(const MSP430TargetMachine &TM); + explicit MSP430SelectionDAGInfo(const DataLayout &DL); ~MSP430SelectionDAGInfo(); }; diff --git a/lib/Target/MSP430/MSP430Subtarget.cpp b/lib/Target/MSP430/MSP430Subtarget.cpp index 68ad0911aff4..dbddc5243db0 100644 --- a/lib/Target/MSP430/MSP430Subtarget.cpp +++ b/lib/Target/MSP430/MSP430Subtarget.cpp @@ -25,12 +25,15 @@ using namespace llvm; void MSP430Subtarget::anchor() { } -MSP430Subtarget::MSP430Subtarget(const std::string &TT, - const std::string &CPU, - const std::string &FS) : - MSP430GenSubtargetInfo(TT, CPU, FS) { - std::string CPUName = "generic"; - - // Parse features string. 
- ParseSubtargetFeatures(CPUName, FS);
+MSP430Subtarget &MSP430Subtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) {
+  ParseSubtargetFeatures("generic", FS);
+  return *this;
 }
+
+MSP430Subtarget::MSP430Subtarget(const std::string &TT, const std::string &CPU,
+                                 const std::string &FS, const TargetMachine &TM)
+    : MSP430GenSubtargetInfo(TT, CPU, FS),
+      // FIXME: Check DataLayout string.
+      DL("e-m:e-p:16:16-i32:16:32-n8:16"), FrameLowering(),
+      InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM),
+      TSInfo(DL) {}
diff --git a/lib/Target/MSP430/MSP430Subtarget.h b/lib/Target/MSP430/MSP430Subtarget.h
index 4d8792eede7f..0152ad19dfcd 100644
--- a/lib/Target/MSP430/MSP430Subtarget.h
+++ b/lib/Target/MSP430/MSP430Subtarget.h
@@ -14,6 +14,12 @@
 #ifndef LLVM_TARGET_MSP430_SUBTARGET_H
 #define LLVM_TARGET_MSP430_SUBTARGET_H

+#include "MSP430FrameLowering.h"
+#include "MSP430InstrInfo.h"
+#include "MSP430ISelLowering.h"
+#include "MSP430RegisterInfo.h"
+#include "MSP430SelectionDAGInfo.h"
+#include "llvm/IR/DataLayout.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 #include <string>

@@ -26,16 +32,33 @@ class StringRef;

 class MSP430Subtarget : public MSP430GenSubtargetInfo {
   virtual void anchor();
   bool ExtendedInsts;
+  const DataLayout DL; // Calculates type size & alignment
+  MSP430FrameLowering FrameLowering;
+  MSP430InstrInfo InstrInfo;
+  MSP430TargetLowering TLInfo;
+  MSP430SelectionDAGInfo TSInfo;
+
 public:
   /// This constructor initializes the data members to match that
   /// of the specified triple.
   ///
   MSP430Subtarget(const std::string &TT, const std::string &CPU,
-                  const std::string &FS);
+                  const std::string &FS, const TargetMachine &TM);
+
+  MSP430Subtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS);

   /// ParseSubtargetFeatures - Parses features string setting specified
   /// subtarget options.  Definition of function is auto generated by tblgen.
   void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+
+  const TargetFrameLowering *getFrameLowering() const { return &FrameLowering; }
+  const MSP430InstrInfo *getInstrInfo() const { return &InstrInfo; }
+  const DataLayout *getDataLayout() const { return &DL; }
+  const TargetRegisterInfo *getRegisterInfo() const {
+    return &InstrInfo.getRegisterInfo();
+  }
+  const MSP430TargetLowering *getTargetLowering() const { return &TLInfo; }
+  const MSP430SelectionDAGInfo *getSelectionDAGInfo() const { return &TSInfo; }
 };
 } // End llvm namespace
diff --git a/lib/Target/MSP430/MSP430TargetMachine.cpp b/lib/Target/MSP430/MSP430TargetMachine.cpp
index 50be2be3007b..5ca36f2e4e77 100644
--- a/lib/Target/MSP430/MSP430TargetMachine.cpp
+++ b/lib/Target/MSP430/MSP430TargetMachine.cpp
@@ -24,19 +24,13 @@ extern "C" void LLVMInitializeMSP430Target() {
   RegisterTargetMachine<MSP430TargetMachine> X(TheMSP430Target);
 }

-MSP430TargetMachine::MSP430TargetMachine(const Target &T,
-                                         StringRef TT,
-                                         StringRef CPU,
-                                         StringRef FS,
+MSP430TargetMachine::MSP430TargetMachine(const Target &T, StringRef TT,
+                                         StringRef CPU, StringRef FS,
                                          const TargetOptions &Options,
                                          Reloc::Model RM, CodeModel::Model CM,
                                          CodeGenOpt::Level OL)
-    : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
-      Subtarget(TT, CPU, FS),
-      // FIXME: Check DataLayout string.
- DL("e-m:e-p:16:16-i32:16:32-n8:16"), - InstrInfo(*this), TLInfo(*this), TSInfo(*this), - FrameLowering(Subtarget) { + : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), + Subtarget(TT, CPU, FS, *this) { initAsmInfo(); } diff --git a/lib/Target/MSP430/MSP430TargetMachine.h b/lib/Target/MSP430/MSP430TargetMachine.h index ea5d4073700b..efa84039ff6b 100644 --- a/lib/Target/MSP430/MSP430TargetMachine.h +++ b/lib/Target/MSP430/MSP430TargetMachine.h @@ -15,13 +15,7 @@ #ifndef LLVM_TARGET_MSP430_TARGETMACHINE_H #define LLVM_TARGET_MSP430_TARGETMACHINE_H -#include "MSP430FrameLowering.h" -#include "MSP430ISelLowering.h" -#include "MSP430InstrInfo.h" -#include "MSP430RegisterInfo.h" -#include "MSP430SelectionDAGInfo.h" #include "MSP430Subtarget.h" -#include "llvm/IR/DataLayout.h" #include "llvm/Target/TargetFrameLowering.h" #include "llvm/Target/TargetMachine.h" @@ -31,11 +25,6 @@ namespace llvm { /// class MSP430TargetMachine : public LLVMTargetMachine { MSP430Subtarget Subtarget; - const DataLayout DL; // Calculates type size & alignment - MSP430InstrInfo InstrInfo; - MSP430TargetLowering TLInfo; - MSP430SelectionDAGInfo TSInfo; - MSP430FrameLowering FrameLowering; public: MSP430TargetMachine(const Target &T, StringRef TT, @@ -44,22 +33,25 @@ class MSP430TargetMachine : public LLVMTargetMachine { CodeGenOpt::Level OL); const TargetFrameLowering *getFrameLowering() const override { - return &FrameLowering; + return getSubtargetImpl()->getFrameLowering(); + } + const MSP430InstrInfo *getInstrInfo() const override { + return getSubtargetImpl()->getInstrInfo(); + } + const DataLayout *getDataLayout() const override { + return getSubtargetImpl()->getDataLayout(); + } + const MSP430Subtarget *getSubtargetImpl() const override { + return &Subtarget; } - const MSP430InstrInfo *getInstrInfo() const override { return &InstrInfo; } - const DataLayout *getDataLayout() const override { return &DL;} - const MSP430Subtarget *getSubtargetImpl() const override { return &Subtarget; } - const TargetRegisterInfo *getRegisterInfo() const override { - return &InstrInfo.getRegisterInfo(); + return getSubtargetImpl()->getRegisterInfo(); } - const MSP430TargetLowering *getTargetLowering() const override { - return &TLInfo; + return getSubtargetImpl()->getTargetLowering(); } - - const MSP430SelectionDAGInfo* getSelectionDAGInfo() const override { - return &TSInfo; + const MSP430SelectionDAGInfo *getSelectionDAGInfo() const override { + return getSubtargetImpl()->getSelectionDAGInfo(); } TargetPassConfig *createPassConfig(PassManagerBase &PM) override; }; // MSP430TargetMachine. diff --git a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp index dd2b857789ca..53b30f9210b0 100644 --- a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp +++ b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp @@ -116,19 +116,25 @@ class MipsAsmParser : public MCTargetAsmParser { bool needsExpansion(MCInst &Inst); - void expandInstruction(MCInst &Inst, SMLoc IDLoc, + // Expands assembly pseudo instructions. + // Returns false on success, true otherwise. 
+  bool expandInstruction(MCInst &Inst, SMLoc IDLoc,
                          SmallVectorImpl<MCInst> &Instructions);
-  void expandLoadImm(MCInst &Inst, SMLoc IDLoc,
+
+  bool expandLoadImm(MCInst &Inst, SMLoc IDLoc,
                      SmallVectorImpl<MCInst> &Instructions);
-  void expandLoadAddressImm(MCInst &Inst, SMLoc IDLoc,
+
+  bool expandLoadAddressImm(MCInst &Inst, SMLoc IDLoc,
                             SmallVectorImpl<MCInst> &Instructions);
-  void expandLoadAddressReg(MCInst &Inst, SMLoc IDLoc,
+
+  bool expandLoadAddressReg(MCInst &Inst, SMLoc IDLoc,
                             SmallVectorImpl<MCInst> &Instructions);
+
   void expandMemInst(MCInst &Inst, SMLoc IDLoc,
                      SmallVectorImpl<MCInst> &Instructions, bool isLoad,
                      bool isImmOpnd);
-  bool reportParseError(StringRef ErrorMsg);
-  bool reportParseError(SMLoc Loc, StringRef ErrorMsg);
+  bool reportParseError(Twine ErrorMsg);
+  bool reportParseError(SMLoc Loc, Twine ErrorMsg);

   bool parseMemOffset(const MCExpr *&Res, bool isParenExpr);
   bool parseRelocOperand(const MCExpr *&Res);
@@ -150,39 +156,20 @@
   bool parseSetReorderDirective();
   bool parseSetNoReorderDirective();
   bool parseSetNoMips16Directive();
+  bool parseSetFpDirective();
   bool parseSetAssignment();

   bool parseDataDirective(unsigned Size, SMLoc L);
   bool parseDirectiveGpWord();
   bool parseDirectiveGpDWord();
+  bool parseDirectiveModule();
+  bool parseDirectiveModuleFP();
+  bool parseFpABIValue(MipsABIFlagsSection::FpABIKind &FpABI,
+                       StringRef Directive);

   MCSymbolRefExpr::VariantKind getVariantKind(StringRef Symbol);

-  bool isGP64() const {
-    return (STI.getFeatureBits() & Mips::FeatureGP64Bit) != 0;
-  }
-
-  bool isFP64() const {
-    return (STI.getFeatureBits() & Mips::FeatureFP64Bit) != 0;
-  }
-
-  bool isN32() const { return STI.getFeatureBits() & Mips::FeatureN32; }
-  bool isN64() const { return STI.getFeatureBits() & Mips::FeatureN64; }
-
-  bool isMicroMips() const {
-    return STI.getFeatureBits() & Mips::FeatureMicroMips;
-  }
-
-  bool hasMips4() const { return STI.getFeatureBits() & Mips::FeatureMips4; }
-  bool hasMips32() const { return STI.getFeatureBits() & Mips::FeatureMips32; }
-  bool hasMips32r6() const {
-    return STI.getFeatureBits() & Mips::FeatureMips32r6;
-  }
-  bool hasMips64r6() const {
-    return STI.getFeatureBits() & Mips::FeatureMips64r6;
-  }
-
   bool eatComma(StringRef ErrorStr);

   int matchCPURegisterName(StringRef Symbol);
@@ -237,17 +224,21 @@
   };

   MipsAsmParser(MCSubtargetInfo &sti, MCAsmParser &parser,
-                const MCInstrInfo &MII,
-                const MCTargetOptions &Options)
+                const MCInstrInfo &MII, const MCTargetOptions &Options)
       : MCTargetAsmParser(), STI(sti), Parser(parser) {
     // Initialize the set of available features.
     setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));

+    getTargetStreamer().updateABIInfo(*this);
+
     // Assert exactly one ABI was chosen.
     assert((((STI.getFeatureBits() & Mips::FeatureO32) != 0) +
            ((STI.getFeatureBits() & Mips::FeatureEABI) != 0) +
            ((STI.getFeatureBits() & Mips::FeatureN32) != 0) +
            ((STI.getFeatureBits() & Mips::FeatureN64) != 0)) == 1);
+
+    if (!isABI_O32() && !useOddSPReg() != 0)
+      report_fatal_error("-mno-odd-spreg requires the O32 ABI");
   }

   MCAsmParser &getParser() const { return Parser; }
@@ -256,6 +247,53 @@

   /// True if all of $fcc0 - $fcc7 exist for the current ISA.
bool hasEightFccRegisters() const { return hasMips4() || hasMips32(); } + bool isGP64bit() const { return STI.getFeatureBits() & Mips::FeatureGP64Bit; } + bool isFP64bit() const { return STI.getFeatureBits() & Mips::FeatureFP64Bit; } + bool isABI_N32() const { return STI.getFeatureBits() & Mips::FeatureN32; } + bool isABI_N64() const { return STI.getFeatureBits() & Mips::FeatureN64; } + bool isABI_O32() const { return STI.getFeatureBits() & Mips::FeatureO32; } + bool isABI_FPXX() const { return STI.getFeatureBits() & Mips::FeatureFPXX; } + + bool useOddSPReg() const { + return !(STI.getFeatureBits() & Mips::FeatureNoOddSPReg); + } + + bool inMicroMipsMode() const { + return STI.getFeatureBits() & Mips::FeatureMicroMips; + } + bool hasMips1() const { return STI.getFeatureBits() & Mips::FeatureMips1; } + bool hasMips2() const { return STI.getFeatureBits() & Mips::FeatureMips2; } + bool hasMips3() const { return STI.getFeatureBits() & Mips::FeatureMips3; } + bool hasMips4() const { return STI.getFeatureBits() & Mips::FeatureMips4; } + bool hasMips5() const { return STI.getFeatureBits() & Mips::FeatureMips5; } + bool hasMips32() const { + return (STI.getFeatureBits() & Mips::FeatureMips32); + } + bool hasMips64() const { + return (STI.getFeatureBits() & Mips::FeatureMips64); + } + bool hasMips32r2() const { + return (STI.getFeatureBits() & Mips::FeatureMips32r2); + } + bool hasMips64r2() const { + return (STI.getFeatureBits() & Mips::FeatureMips64r2); + } + bool hasMips32r6() const { + return (STI.getFeatureBits() & Mips::FeatureMips32r6); + } + bool hasMips64r6() const { + return (STI.getFeatureBits() & Mips::FeatureMips64r6); + } + bool hasDSP() const { return (STI.getFeatureBits() & Mips::FeatureDSP); } + bool hasDSPR2() const { return (STI.getFeatureBits() & Mips::FeatureDSPR2); } + bool hasMSA() const { return (STI.getFeatureBits() & Mips::FeatureMSA); } + + bool inMips16Mode() const { + return STI.getFeatureBits() & Mips::FeatureMips16; + } + // TODO: see how can we get this info. + bool abiUsesSoftFloat() const { return false; } + /// Warn if RegNo is the current assembler temporary. void WarnIfAssemblerTemporary(int RegNo, SMLoc Loc); }; @@ -270,9 +308,9 @@ class MipsOperand : public MCParsedAsmOperand { /// Broad categories of register classes /// The exact class is finalized by the render method. enum RegKind { - RegKind_GPR = 1, /// GPR32 and GPR64 (depending on isGP64()) + RegKind_GPR = 1, /// GPR32 and GPR64 (depending on isGP64bit()) RegKind_FGR = 2, /// FGR32, FGR64, AFGR64 (depending on context and - /// isFP64()) + /// isFP64bit()) RegKind_FCC = 4, /// FCC RegKind_MSA128 = 8, /// MSA128[BHWD] (makes no difference which) RegKind_MSACtrl = 16, /// MSA control registers @@ -533,6 +571,10 @@ class MipsOperand : public MCParsedAsmOperand { void addFGR32AsmRegOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); Inst.addOperand(MCOperand::CreateReg(getFGR32Reg())); + // FIXME: We ought to do this for -integrated-as without -via-file-asm too. + if (!AsmParser.useOddSPReg() && RegIdx.Index & 1) + AsmParser.Error(StartLoc, "-mno-odd-spreg prohibits the use of odd FPU " + "registers"); } void addFGRH32AsmRegOperands(MCInst &Inst, unsigned N) const { @@ -876,9 +918,10 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, Offset = Inst.getOperand(2); if (!Offset.isImm()) break; // We'll deal with this situation later on when applying fixups. - if (!isIntN(isMicroMips() ? 17 : 18, Offset.getImm())) + if (!isIntN(inMicroMipsMode() ? 
17 : 18, Offset.getImm()))
       return Error(IDLoc, "branch target out of range");
-    if (OffsetToAlignment(Offset.getImm(), 1LL << (isMicroMips() ? 1 : 2)))
+    if (OffsetToAlignment(Offset.getImm(),
+                          1LL << (inMicroMipsMode() ? 1 : 2)))
       return Error(IDLoc, "branch to misaligned address");
     break;
   case Mips::BGEZ:
@@ -901,9 +944,10 @@
     Offset = Inst.getOperand(1);
     if (!Offset.isImm())
       break; // We'll deal with this situation later on when applying fixups.
-    if (!isIntN(isMicroMips() ? 17 : 18, Offset.getImm()))
+    if (!isIntN(inMicroMipsMode() ? 17 : 18, Offset.getImm()))
       return Error(IDLoc, "branch target out of range");
-    if (OffsetToAlignment(Offset.getImm(), 1LL << (isMicroMips() ? 1 : 2)))
+    if (OffsetToAlignment(Offset.getImm(),
+                          1LL << (inMicroMipsMode() ? 1 : 2)))
       return Error(IDLoc, "branch to misaligned address");
     break;
   }
@@ -965,7 +1009,7 @@
   } // if load/store

   if (needsExpansion(Inst))
-    expandInstruction(Inst, IDLoc, Instructions);
+    return expandInstruction(Inst, IDLoc, Instructions);
   else
     Instructions.push_back(Inst);

@@ -978,17 +1022,27 @@
   case Mips::LoadImm32Reg:
   case Mips::LoadAddr32Imm:
   case Mips::LoadAddr32Reg:
+  case Mips::LoadImm64Reg:
     return true;
   default:
     return false;
   }
 }

-void MipsAsmParser::expandInstruction(MCInst &Inst, SMLoc IDLoc,
+bool MipsAsmParser::expandInstruction(MCInst &Inst, SMLoc IDLoc,
                                       SmallVectorImpl<MCInst> &Instructions) {
   switch (Inst.getOpcode()) {
+  default:
+    assert(0 && "unimplemented expansion");
+    return true;
   case Mips::LoadImm32Reg:
     return expandLoadImm(Inst, IDLoc, Instructions);
+  case Mips::LoadImm64Reg:
+    if (!isGP64bit()) {
+      Error(IDLoc, "instruction requires a CPU feature not currently enabled");
+      return true;
+    }
+    return expandLoadImm(Inst, IDLoc, Instructions);
   case Mips::LoadAddr32Imm:
     return expandLoadAddressImm(Inst, IDLoc, Instructions);
   case Mips::LoadAddr32Reg:
@@ -996,7 +1050,31 @@
   }
 }

-void MipsAsmParser::expandLoadImm(MCInst &Inst, SMLoc IDLoc,
+namespace {
+template <int Shift, bool PerformShift>
+void createShiftOr(int64_t Value, unsigned RegNo, SMLoc IDLoc,
+                   SmallVectorImpl<MCInst> &Instructions) {
+  MCInst tmpInst;
+  if (PerformShift) {
+    tmpInst.setOpcode(Mips::DSLL);
+    tmpInst.addOperand(MCOperand::CreateReg(RegNo));
+    tmpInst.addOperand(MCOperand::CreateReg(RegNo));
+    tmpInst.addOperand(MCOperand::CreateImm(16));
+    tmpInst.setLoc(IDLoc);
+    Instructions.push_back(tmpInst);
+    tmpInst.clear();
+  }
+  tmpInst.setOpcode(Mips::ORi);
+  tmpInst.addOperand(MCOperand::CreateReg(RegNo));
+  tmpInst.addOperand(MCOperand::CreateReg(RegNo));
+  tmpInst.addOperand(
+      MCOperand::CreateImm(((Value & (0xffffLL << Shift)) >> Shift)));
+  tmpInst.setLoc(IDLoc);
+  Instructions.push_back(tmpInst);
+}
+}
+
+bool MipsAsmParser::expandLoadImm(MCInst &Inst, SMLoc IDLoc,
                                   SmallVectorImpl<MCInst> &Instructions) {
   MCInst tmpInst;
   const MCOperand &ImmOp = Inst.getOperand(1);
@@ -1004,8 +1082,10 @@
   const MCOperand &RegOp = Inst.getOperand(0);
   assert(RegOp.isReg() && "expected register operand kind");

-  int ImmValue = ImmOp.getImm();
+  int64_t ImmValue = ImmOp.getImm();
   tmpInst.setLoc(IDLoc);
+  // FIXME: gas has a special case for values that are 000...1111, which
+  // becomes a li -1 and then a dsrl
   if (0 <= ImmValue && ImmValue <= 65535) {
     // For 0 <= j <= 65535.
     // li d,j => ori d,$zero,j
@@ -1022,25 +1102,76 @@
     tmpInst.addOperand(MCOperand::CreateReg(Mips::ZERO));
     tmpInst.addOperand(MCOperand::CreateImm(ImmValue));
     Instructions.push_back(tmpInst);
-  } else {
-    // For any other value of j that is representable as a 32-bit integer.
+  } else if ((ImmValue & 0xffffffff) == ImmValue) {
+    // For any value of j that is representable as a 32-bit integer, create
+    // a sequence of:
     // li d,j => lui d,hi16(j)
     //           ori d,d,lo16(j)
     tmpInst.setOpcode(Mips::LUi);
     tmpInst.addOperand(MCOperand::CreateReg(RegOp.getReg()));
     tmpInst.addOperand(MCOperand::CreateImm((ImmValue & 0xffff0000) >> 16));
     Instructions.push_back(tmpInst);
-    tmpInst.clear();
-    tmpInst.setOpcode(Mips::ORi);
+    createShiftOr<0, false>(ImmValue, RegOp.getReg(), IDLoc, Instructions);
+  } else if ((ImmValue & (0xffffLL << 48)) == 0) {
+    if (!isGP64bit()) {
+      Error(IDLoc, "instruction requires a CPU feature not currently enabled");
+      return true;
+    }
+
+    //            <------- lo32 ------>
+    // <------- hi32 ------>
+    // <- hi16 ->            <- lo16 ->
+    //  _________________________________
+    // |          |          |          |
+    // | 16 bits  | 16 bits  | 16 bits  |
+    // |__________|__________|__________|
+    //
+    // For any value of j that is representable as a 48-bit integer, create
+    // a sequence of:
+    // li d,j => lui d,hi16(j)
+    //           ori d,d,hi16(lo32(j))
+    //           dsll d,d,16
+    //           ori d,d,lo16(lo32(j))
+    tmpInst.setOpcode(Mips::LUi);
     tmpInst.addOperand(MCOperand::CreateReg(RegOp.getReg()));
+    tmpInst.addOperand(
+        MCOperand::CreateImm((ImmValue & (0xffffLL << 32)) >> 32));
+    Instructions.push_back(tmpInst);
+    createShiftOr<16, false>(ImmValue, RegOp.getReg(), IDLoc, Instructions);
+    createShiftOr<0, true>(ImmValue, RegOp.getReg(), IDLoc, Instructions);
+  } else {
+    if (!isGP64bit()) {
+      Error(IDLoc, "instruction requires a CPU feature not currently enabled");
+      return true;
+    }
+
+    // <------- hi32 ------> <------- lo32 ------>
+    // <- hi16 ->  <- lo16 ->
+    //  ___________________________________________
+    // |          |          |          |          |
+    // | 16 bits  | 16 bits  | 16 bits  | 16 bits  |
+    // |__________|__________|__________|__________|
+    //
+    // For any value of j that isn't representable as a 48-bit integer.
+ // li d,j => lui d,hi16(j) + // ori d,d,lo16(hi32(j)) + // dsll d,d,16 + // ori d,d,hi16(lo32(j)) + // dsll d,d,16 + // ori d,d,lo16(lo32(j)) + tmpInst.setOpcode(Mips::LUi); tmpInst.addOperand(MCOperand::CreateReg(RegOp.getReg())); - tmpInst.addOperand(MCOperand::CreateImm(ImmValue & 0xffff)); - tmpInst.setLoc(IDLoc); + tmpInst.addOperand( + MCOperand::CreateImm((ImmValue & (0xffffLL << 48)) >> 48)); Instructions.push_back(tmpInst); + createShiftOr<32, false>(ImmValue, RegOp.getReg(), IDLoc, Instructions); + createShiftOr<16, true>(ImmValue, RegOp.getReg(), IDLoc, Instructions); + createShiftOr<0, true>(ImmValue, RegOp.getReg(), IDLoc, Instructions); } + return false; } -void +bool MipsAsmParser::expandLoadAddressReg(MCInst &Inst, SMLoc IDLoc, SmallVectorImpl &Instructions) { MCInst tmpInst; @@ -1081,9 +1212,10 @@ MipsAsmParser::expandLoadAddressReg(MCInst &Inst, SMLoc IDLoc, tmpInst.addOperand(MCOperand::CreateReg(SrcRegOp.getReg())); Instructions.push_back(tmpInst); } + return false; } -void +bool MipsAsmParser::expandLoadAddressImm(MCInst &Inst, SMLoc IDLoc, SmallVectorImpl &Instructions) { MCInst tmpInst; @@ -1115,6 +1247,7 @@ MipsAsmParser::expandLoadAddressImm(MCInst &Inst, SMLoc IDLoc, tmpInst.addOperand(MCOperand::CreateImm(ImmValue & 0xffff)); Instructions.push_back(tmpInst); } + return false; } void MipsAsmParser::expandMemInst(MCInst &Inst, SMLoc IDLoc, @@ -1180,8 +1313,8 @@ void MipsAsmParser::expandMemInst(MCInst &Inst, SMLoc IDLoc, // not available. if (!AT) return; - TmpRegNum = - getReg((isGP64()) ? Mips::GPR64RegClassID : Mips::GPR32RegClassID, AT); + TmpRegNum = getReg( + (isGP64bit()) ? Mips::GPR64RegClassID : Mips::GPR32RegClassID, AT); } TempInst.setOpcode(Mips::LUi); @@ -1339,7 +1472,7 @@ int MipsAsmParser::matchCPURegisterName(StringRef Name) { .Case("t9", 25) .Default(-1); - if (isN32() || isN64()) { + if (isABI_N32() || isABI_N64()) { // Although SGI documentation just cuts out t0-t3 for n32/n64, // GNU pushes the values of t0-t3 to override the o32/o64 values for t4-t7 // We are supporting both cases, so for t0-t3 we'll just push them to t4-t7. @@ -1452,7 +1585,7 @@ unsigned MipsAsmParser::getReg(int RC, int RegNo) { } unsigned MipsAsmParser::getGPR(int RegNo) { - return getReg(isGP64() ? Mips::GPR64RegClassID : Mips::GPR32RegClassID, + return getReg(isGP64bit() ? Mips::GPR64RegClassID : Mips::GPR32RegClassID, RegNo); } @@ -1677,7 +1810,7 @@ bool MipsAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, // register is a parse error. if (Operand.isGPRAsmReg()) { // Resolve to GPR32 or GPR64 appropriately. - RegNo = isGP64() ? Operand.getGPR64Reg() : Operand.getGPR32Reg(); + RegNo = isGP64bit() ? Operand.getGPR64Reg() : Operand.getGPR32Reg(); } return (RegNo == (unsigned)-1); @@ -2133,6 +2266,9 @@ bool MipsAsmParser::ParseBracketSuffix(StringRef Name, bool MipsAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, OperandVector &Operands) { DEBUG(dbgs() << "ParseInstruction\n"); + // We have reached first instruction, module directive after + // this is forbidden. 
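+  // For example, a ".module fp=64" line is accepted at the top of a file
+  // but rejected once any mnemonic has been parsed; see parseDirectiveModule
+  // and its getCanHaveModuleDir() check below.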
+ getTargetStreamer().setCanHaveModuleDir(false); // Check if we have valid mnemonic if (!mnemonicIsValid(Name, 0)) { Parser.eatToEndOfStatement(); @@ -2179,13 +2315,13 @@ bool MipsAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, return false; } -bool MipsAsmParser::reportParseError(StringRef ErrorMsg) { +bool MipsAsmParser::reportParseError(Twine ErrorMsg) { SMLoc Loc = getLexer().getLoc(); Parser.eatToEndOfStatement(); return Error(Loc, ErrorMsg); } -bool MipsAsmParser::reportParseError(SMLoc Loc, StringRef ErrorMsg) { +bool MipsAsmParser::reportParseError(SMLoc Loc, Twine ErrorMsg) { return Error(Loc, ErrorMsg); } @@ -2319,6 +2455,32 @@ bool MipsAsmParser::parseSetNoMips16Directive() { return false; } +bool MipsAsmParser::parseSetFpDirective() { + MipsABIFlagsSection::FpABIKind FpAbiVal; + // Line can be: .set fp=32 + // .set fp=xx + // .set fp=64 + Parser.Lex(); // Eat fp token + AsmToken Tok = Parser.getTok(); + if (Tok.isNot(AsmToken::Equal)) { + reportParseError("unexpected token in statement"); + return false; + } + Parser.Lex(); // Eat '=' token. + Tok = Parser.getTok(); + + if (!parseFpABIValue(FpAbiVal, ".set")) + return false; + + if (getLexer().isNot(AsmToken::EndOfStatement)) { + reportParseError("unexpected token in statement"); + return false; + } + getTargetStreamer().emitDirectiveSetFp(FpAbiVal); + Parser.Lex(); // Consume the EndOfStatement. + return false; +} + bool MipsAsmParser::parseSetAssignment() { StringRef Name; const MCExpr *Value; @@ -2500,6 +2662,8 @@ bool MipsAsmParser::parseDirectiveSet() { return parseSetNoAtDirective(); } else if (Tok.getString() == "at") { return parseSetAtDirective(); + } else if (Tok.getString() == "fp") { + return parseSetFpDirective(); } else if (Tok.getString() == "reorder") { return parseSetReorderDirective(); } else if (Tok.getString() == "noreorder") { @@ -2632,6 +2796,134 @@ bool MipsAsmParser::parseDirectiveOption() { return false; } +/// parseDirectiveModule +/// ::= .module oddspreg +/// ::= .module nooddspreg +/// ::= .module fp=value +bool MipsAsmParser::parseDirectiveModule() { + MCAsmLexer &Lexer = getLexer(); + SMLoc L = Lexer.getLoc(); + + if (!getTargetStreamer().getCanHaveModuleDir()) { + // TODO : get a better message. 
+ reportParseError(".module directive must appear before any code"); + return false; + } + + if (Lexer.is(AsmToken::Identifier)) { + StringRef Option = Parser.getTok().getString(); + Parser.Lex(); + + if (Option == "oddspreg") { + getTargetStreamer().emitDirectiveModuleOddSPReg(true, isABI_O32()); + clearFeatureBits(Mips::FeatureNoOddSPReg, "nooddspreg"); + + if (getLexer().isNot(AsmToken::EndOfStatement)) { + reportParseError("Expected end of statement"); + return false; + } + + return false; + } else if (Option == "nooddspreg") { + if (!isABI_O32()) { + Error(L, "'.module nooddspreg' requires the O32 ABI"); + return false; + } + + getTargetStreamer().emitDirectiveModuleOddSPReg(false, isABI_O32()); + setFeatureBits(Mips::FeatureNoOddSPReg, "nooddspreg"); + + if (getLexer().isNot(AsmToken::EndOfStatement)) { + reportParseError("Expected end of statement"); + return false; + } + + return false; + } else if (Option == "fp") { + return parseDirectiveModuleFP(); + } + + return Error(L, "'" + Twine(Option) + "' is not a valid .module option."); + } + + return false; +} + +/// parseDirectiveModuleFP +/// ::= =32 +/// ::= =xx +/// ::= =64 +bool MipsAsmParser::parseDirectiveModuleFP() { + MCAsmLexer &Lexer = getLexer(); + + if (Lexer.isNot(AsmToken::Equal)) { + reportParseError("unexpected token in statement"); + return false; + } + Parser.Lex(); // Eat '=' token. + + MipsABIFlagsSection::FpABIKind FpABI; + if (!parseFpABIValue(FpABI, ".module")) + return false; + + if (getLexer().isNot(AsmToken::EndOfStatement)) { + reportParseError("unexpected token in statement"); + return false; + } + + // Emit appropriate flags. + getTargetStreamer().emitDirectiveModuleFP(FpABI, isABI_O32()); + Parser.Lex(); // Consume the EndOfStatement. + return false; +} + +bool MipsAsmParser::parseFpABIValue(MipsABIFlagsSection::FpABIKind &FpABI, + StringRef Directive) { + MCAsmLexer &Lexer = getLexer(); + + if (Lexer.is(AsmToken::Identifier)) { + StringRef Value = Parser.getTok().getString(); + Parser.Lex(); + + if (Value != "xx") { + reportParseError("unsupported value, expected 'xx', '32' or '64'"); + return false; + } + + if (!isABI_O32()) { + reportParseError("'" + Directive + " fp=xx' requires the O32 ABI"); + return false; + } + + FpABI = MipsABIFlagsSection::FpABIKind::XX; + return true; + } + + if (Lexer.is(AsmToken::Integer)) { + unsigned Value = Parser.getTok().getIntVal(); + Parser.Lex(); + + if (Value != 32 && Value != 64) { + reportParseError("unsupported value, expected 'xx', '32' or '64'"); + return false; + } + + if (Value == 32) { + if (!isABI_O32()) { + reportParseError("'" + Directive + " fp=32' requires the O32 ABI"); + return false; + } + + FpABI = MipsABIFlagsSection::FpABIKind::S32; + } else + FpABI = MipsABIFlagsSection::FpABIKind::S64; + + return true; + } + + return false; +} + bool MipsAsmParser::ParseDirective(AsmToken DirectiveID) { StringRef IDVal = DirectiveID.getString(); @@ -2710,6 +3002,9 @@ bool MipsAsmParser::ParseDirective(AsmToken DirectiveID) { if (IDVal == ".cpsetup") return parseDirectiveCPSetup(); + if (IDVal == ".module") + return parseDirectiveModule(); + return true; } diff --git a/lib/Target/Mips/Disassembler/MipsDisassembler.cpp b/lib/Target/Mips/Disassembler/MipsDisassembler.cpp index 902b87759dc0..f35a8deefcff 100644 --- a/lib/Target/Mips/Disassembler/MipsDisassembler.cpp +++ b/lib/Target/Mips/Disassembler/MipsDisassembler.cpp @@ -142,11 +142,6 @@ static DecodeStatus DecodeFGR32RegisterClass(MCInst &Inst, uint64_t Address, const void *Decoder); -static DecodeStatus 
DecodeFGRH32RegisterClass(MCInst &Inst, - unsigned RegNo, - uint64_t Address, - const void *Decoder); - static DecodeStatus DecodeCCRRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, @@ -914,18 +909,6 @@ static DecodeStatus DecodeFGR32RegisterClass(MCInst &Inst, return MCDisassembler::Success; } -static DecodeStatus DecodeFGRH32RegisterClass(MCInst &Inst, - unsigned RegNo, - uint64_t Address, - const void *Decoder) { - if (RegNo > 31) - return MCDisassembler::Fail; - - unsigned Reg = getReg(Decoder, Mips::FGRH32RegClassID, RegNo); - Inst.addOperand(MCOperand::CreateReg(Reg)); - return MCDisassembler::Success; -} - static DecodeStatus DecodeCCRRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, diff --git a/lib/Target/Mips/MCTargetDesc/CMakeLists.txt b/lib/Target/Mips/MCTargetDesc/CMakeLists.txt index d3e2fd75498c..6b3788ca515c 100644 --- a/lib/Target/Mips/MCTargetDesc/CMakeLists.txt +++ b/lib/Target/Mips/MCTargetDesc/CMakeLists.txt @@ -1,4 +1,5 @@ add_llvm_library(LLVMMipsDesc + MipsABIFlagsSection.cpp MipsAsmBackend.cpp MipsELFObjectWriter.cpp MipsELFStreamer.cpp @@ -7,5 +8,6 @@ add_llvm_library(LLVMMipsDesc MipsMCExpr.cpp MipsMCTargetDesc.cpp MipsNaClELFStreamer.cpp + MipsOptionRecord.cpp MipsTargetStreamer.cpp ) diff --git a/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.cpp b/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.cpp new file mode 100644 index 000000000000..5b0f950b076e --- /dev/null +++ b/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.cpp @@ -0,0 +1,66 @@ +//===-- MipsABIFlagsSection.cpp - Mips ELF ABI Flags Section ---*- C++ -*--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "MipsABIFlagsSection.h" + +using namespace llvm; + +uint8_t MipsABIFlagsSection::getFpABIValue() { + switch (FpABI) { + case FpABIKind::ANY: + return Val_GNU_MIPS_ABI_FP_ANY; + case FpABIKind::XX: + return Val_GNU_MIPS_ABI_FP_XX; + case FpABIKind::S32: + return Val_GNU_MIPS_ABI_FP_DOUBLE; + case FpABIKind::S64: + if (Is32BitABI) + return OddSPReg ? 
Val_GNU_MIPS_ABI_FP_64 : Val_GNU_MIPS_ABI_FP_64A; + return Val_GNU_MIPS_ABI_FP_DOUBLE; + } + + llvm_unreachable("unexpected fp abi value"); +} + +StringRef MipsABIFlagsSection::getFpABIString(FpABIKind Value) { + switch (Value) { + case FpABIKind::XX: + return "xx"; + case FpABIKind::S32: + return "32"; + case FpABIKind::S64: + return "64"; + default: + llvm_unreachable("unsupported fp abi value"); + } +} + +uint8_t MipsABIFlagsSection::getCPR1SizeValue() { + if (FpABI == FpABIKind::XX) + return (uint8_t)AFL_REG_32; + return (uint8_t)CPR1Size; +} + +namespace llvm { +MCStreamer &operator<<(MCStreamer &OS, MipsABIFlagsSection &ABIFlagsSection) { + // Write out a Elf_Internal_ABIFlags_v0 struct + OS.EmitIntValue(ABIFlagsSection.getVersionValue(), 2); // version + OS.EmitIntValue(ABIFlagsSection.getISALevelValue(), 1); // isa_level + OS.EmitIntValue(ABIFlagsSection.getISARevisionValue(), 1); // isa_rev + OS.EmitIntValue(ABIFlagsSection.getGPRSizeValue(), 1); // gpr_size + OS.EmitIntValue(ABIFlagsSection.getCPR1SizeValue(), 1); // cpr1_size + OS.EmitIntValue(ABIFlagsSection.getCPR2SizeValue(), 1); // cpr2_size + OS.EmitIntValue(ABIFlagsSection.getFpABIValue(), 1); // fp_abi + OS.EmitIntValue(ABIFlagsSection.getISAExtensionSetValue(), 4); // isa_ext + OS.EmitIntValue(ABIFlagsSection.getASESetValue(), 4); // ases + OS.EmitIntValue(ABIFlagsSection.getFlags1Value(), 4); // flags1 + OS.EmitIntValue(ABIFlagsSection.getFlags2Value(), 4); // flags2 + return OS; +} +} diff --git a/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h b/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h new file mode 100644 index 000000000000..ea5bc12b0740 --- /dev/null +++ b/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h @@ -0,0 +1,238 @@ +//===-- MipsABIFlagsSection.h - Mips ELF ABI Flags Section -----*- C++ -*--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef MIPSABIFLAGSSECTION_H +#define MIPSABIFLAGSSECTION_H + +#include "llvm/MC/MCStreamer.h" + +namespace llvm { + +class MCStreamer; + +struct MipsABIFlagsSection { + // Values for the xxx_size bytes of an ABI flags structure. + enum AFL_REG { + AFL_REG_NONE = 0x00, // No registers. + AFL_REG_32 = 0x01, // 32-bit registers. + AFL_REG_64 = 0x02, // 64-bit registers. + AFL_REG_128 = 0x03 // 128-bit registers. + }; + + // Masks for the ases word of an ABI flags structure. + enum AFL_ASE { + AFL_ASE_DSP = 0x00000001, // DSP ASE. + AFL_ASE_DSPR2 = 0x00000002, // DSP R2 ASE. + AFL_ASE_EVA = 0x00000004, // Enhanced VA Scheme. + AFL_ASE_MCU = 0x00000008, // MCU (MicroController) ASE. + AFL_ASE_MDMX = 0x00000010, // MDMX ASE. + AFL_ASE_MIPS3D = 0x00000020, // MIPS-3D ASE. + AFL_ASE_MT = 0x00000040, // MT ASE. + AFL_ASE_SMARTMIPS = 0x00000080, // SmartMIPS ASE. + AFL_ASE_VIRT = 0x00000100, // VZ ASE. + AFL_ASE_MSA = 0x00000200, // MSA ASE. + AFL_ASE_MIPS16 = 0x00000400, // MIPS16 ASE. + AFL_ASE_MICROMIPS = 0x00000800, // MICROMIPS ASE. + AFL_ASE_XPA = 0x00001000 // XPA ASE. + }; + + // Values for the isa_ext word of an ABI flags structure. + enum AFL_EXT { + AFL_EXT_XLR = 1, // RMI Xlr instruction. + AFL_EXT_OCTEON2 = 2, // Cavium Networks Octeon2. + AFL_EXT_OCTEONP = 3, // Cavium Networks OcteonP. + AFL_EXT_LOONGSON_3A = 4, // Loongson 3A. + AFL_EXT_OCTEON = 5, // Cavium Networks Octeon. + AFL_EXT_5900 = 6, // MIPS R5900 instruction. 
+ AFL_EXT_4650 = 7, // MIPS R4650 instruction. + AFL_EXT_4010 = 8, // LSI R4010 instruction. + AFL_EXT_4100 = 9, // NEC VR4100 instruction. + AFL_EXT_3900 = 10, // Toshiba R3900 instruction. + AFL_EXT_10000 = 11, // MIPS R10000 instruction. + AFL_EXT_SB1 = 12, // Broadcom SB-1 instruction. + AFL_EXT_4111 = 13, // NEC VR4111/VR4181 instruction. + AFL_EXT_4120 = 14, // NEC VR4120 instruction. + AFL_EXT_5400 = 15, // NEC VR5400 instruction. + AFL_EXT_5500 = 16, // NEC VR5500 instruction. + AFL_EXT_LOONGSON_2E = 17, // ST Microelectronics Loongson 2E. + AFL_EXT_LOONGSON_2F = 18 // ST Microelectronics Loongson 2F. + }; + + // Values for the fp_abi word of an ABI flags structure. + enum Val_GNU_MIPS_ABI { + Val_GNU_MIPS_ABI_FP_ANY = 0, + Val_GNU_MIPS_ABI_FP_DOUBLE = 1, + Val_GNU_MIPS_ABI_FP_XX = 5, + Val_GNU_MIPS_ABI_FP_64 = 6, + Val_GNU_MIPS_ABI_FP_64A = 7 + }; + + enum AFL_FLAGS1 { + AFL_FLAGS1_ODDSPREG = 1 + }; + + // Internal representation of the values used in .module fp=value + enum class FpABIKind { ANY, XX, S32, S64 }; + + // Version of flags structure. + uint16_t Version; + // The level of the ISA: 1-5, 32, 64. + uint8_t ISALevel; + // The revision of ISA: 0 for MIPS V and below, 1-n otherwise. + uint8_t ISARevision; + // The size of general purpose registers. + AFL_REG GPRSize; + // The size of co-processor 1 registers. + AFL_REG CPR1Size; + // The size of co-processor 2 registers. + AFL_REG CPR2Size; + // Processor-specific extension. + uint32_t ISAExtensionSet; + // Mask of ASEs used. + uint32_t ASESet; + + bool OddSPReg; + + bool Is32BitABI; + +protected: + // The floating-point ABI. + FpABIKind FpABI; + +public: + MipsABIFlagsSection() + : Version(0), ISALevel(0), ISARevision(0), GPRSize(AFL_REG_NONE), + CPR1Size(AFL_REG_NONE), CPR2Size(AFL_REG_NONE), ISAExtensionSet(0), + ASESet(0), OddSPReg(false), Is32BitABI(false), FpABI(FpABIKind::ANY) {} + + uint16_t getVersionValue() { return (uint16_t)Version; } + uint8_t getISALevelValue() { return (uint8_t)ISALevel; } + uint8_t getISARevisionValue() { return (uint8_t)ISARevision; } + uint8_t getGPRSizeValue() { return (uint8_t)GPRSize; } + uint8_t getCPR1SizeValue(); + uint8_t getCPR2SizeValue() { return (uint8_t)CPR2Size; } + uint8_t getFpABIValue(); + uint32_t getISAExtensionSetValue() { return (uint32_t)ISAExtensionSet; } + uint32_t getASESetValue() { return (uint32_t)ASESet; } + + uint32_t getFlags1Value() { + uint32_t Value = 0; + + if (OddSPReg) + Value |= (uint32_t)AFL_FLAGS1_ODDSPREG; + + return Value; + } + + uint32_t getFlags2Value() { return 0; } + + FpABIKind getFpABI() { return FpABI; } + void setFpABI(FpABIKind Value, bool IsABI32Bit) { + FpABI = Value; + Is32BitABI = IsABI32Bit; + } + StringRef getFpABIString(FpABIKind Value); + + template + void setISALevelAndRevisionFromPredicates(const PredicateLibrary &P) { + if (P.hasMips64()) { + ISALevel = 64; + if (P.hasMips64r6()) + ISARevision = 6; + else if (P.hasMips64r2()) + ISARevision = 2; + else + ISARevision = 1; + } else if (P.hasMips32()) { + ISALevel = 32; + if (P.hasMips32r6()) + ISARevision = 6; + else if (P.hasMips32r2()) + ISARevision = 2; + else + ISARevision = 1; + } else { + ISARevision = 0; + if (P.hasMips5()) + ISALevel = 5; + else if (P.hasMips4()) + ISALevel = 4; + else if (P.hasMips3()) + ISALevel = 3; + else if (P.hasMips2()) + ISALevel = 2; + else if (P.hasMips1()) + ISALevel = 1; + else + llvm_unreachable("Unknown ISA level!"); + } + } + + template + void setGPRSizeFromPredicates(const PredicateLibrary &P) { + GPRSize = P.isGP64bit() ? 
AFL_REG_64 : AFL_REG_32;
+  }
+
+  template <class PredicateLibrary>
+  void setCPR1SizeFromPredicates(const PredicateLibrary &P) {
+    if (P.abiUsesSoftFloat())
+      CPR1Size = AFL_REG_NONE;
+    else if (P.hasMSA())
+      CPR1Size = AFL_REG_128;
+    else
+      CPR1Size = P.isFP64bit() ? AFL_REG_64 : AFL_REG_32;
+  }
+
+  template <class PredicateLibrary>
+  void setASESetFromPredicates(const PredicateLibrary &P) {
+    ASESet = 0;
+    if (P.hasDSP())
+      ASESet |= AFL_ASE_DSP;
+    if (P.hasDSPR2())
+      ASESet |= AFL_ASE_DSPR2;
+    if (P.hasMSA())
+      ASESet |= AFL_ASE_MSA;
+    if (P.inMicroMipsMode())
+      ASESet |= AFL_ASE_MICROMIPS;
+    if (P.inMips16Mode())
+      ASESet |= AFL_ASE_MIPS16;
+  }
+
+  template <class PredicateLibrary>
+  void setFpAbiFromPredicates(const PredicateLibrary &P) {
+    Is32BitABI = P.isABI_O32();
+
+    FpABI = FpABIKind::ANY;
+    if (P.isABI_N32() || P.isABI_N64())
+      FpABI = FpABIKind::S64;
+    else if (P.isABI_O32()) {
+      if (P.isABI_FPXX())
+        FpABI = FpABIKind::XX;
+      else if (P.isFP64bit())
+        FpABI = FpABIKind::S64;
+      else
+        FpABI = FpABIKind::S32;
+    }
+  }
+
+  template <class PredicateLibrary>
+  void setAllFromPredicates(const PredicateLibrary &P) {
+    setISALevelAndRevisionFromPredicates(P);
+    setGPRSizeFromPredicates(P);
+    setCPR1SizeFromPredicates(P);
+    setASESetFromPredicates(P);
+    setFpAbiFromPredicates(P);
+    OddSPReg = P.useOddSPReg();
+  }
+};
+
+MCStreamer &operator<<(MCStreamer &OS, MipsABIFlagsSection &ABIFlagsSection);
+}
+
+#endif
diff --git a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
index bc695e6d4f74..d5c3dbc47880 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
@@ -65,7 +65,7 @@ class MipsAsmBackend : public MCAsmBackend {
                             const MCRelaxableFragment *DF,
                             const MCAsmLayout &Layout) const override {
     // FIXME.
-    assert(0 && "RelaxInstruction() unimplemented");
+    llvm_unreachable("RelaxInstruction() unimplemented");
     return false;
   }

diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp b/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
index 49ac25690b98..4ea7846f83f6 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
@@ -30,7 +30,8 @@ namespace {
     unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup,
                           bool IsPCRel) const override;
-    bool needsRelocateWithSymbol(unsigned Type) const override;
+    bool needsRelocateWithSymbol(const MCSymbolData &SD,
+                                 unsigned Type) const override;
   };
 }

@@ -216,7 +217,8 @@ unsigned MipsELFObjectWriter::GetRelocType(const MCValue &Target,
 }

 bool
-MipsELFObjectWriter::needsRelocateWithSymbol(unsigned Type) const {
+MipsELFObjectWriter::needsRelocateWithSymbol(const MCSymbolData &SD,
+                                             unsigned Type) const {
   // FIXME: This is extremely conservative. This really needs to use a
   // whitelist with a clear explanation for why each relocation needs to
   // point to the symbol, not to the section.
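MipsABIFlagsSection fills itself in through the setXFromPredicates templates above, which accept any "predicate library" object exposing the right query methods; in this patch both the subtarget and the assembler parser play that role, with no common base class. A small self-contained sketch of the duck-typed pattern, where MiniABIFlags and FakePredicates are illustrative stand-ins rather than LLVM classes:

    #include <cstdio>

    // Mini version of the predicate-library pattern: the setter is a
    // template, so any object with the needed queries can drive it.
    struct MiniABIFlags {
      unsigned GPRSize = 0;

      template <class PredicateLibrary>
      void setGPRSizeFromPredicates(const PredicateLibrary &P) {
        GPRSize = P.isGP64bit() ? 64 : 32;
      }
    };

    // Hypothetical predicate provider; in-tree this role is played by
    // MipsSubtarget (codegen path) and MipsAsmParser (assembler path).
    struct FakePredicates {
      bool isGP64bit() const { return true; }
    };

    int main() {
      MiniABIFlags Flags;
      Flags.setGPRSizeFromPredicates(FakePredicates());
      std::printf("gpr size: %u\n", Flags.GPRSize);
      return 0;
    }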
diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp
index fe378292befb..803ab85657dc 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp
@@ -8,6 +8,30 @@
 //===----------------------------------------------------------------------===//

 #include "MipsELFStreamer.h"
+#include "llvm/MC/MCInst.h"
+
+void MipsELFStreamer::EmitInstruction(const MCInst &Inst,
+                                      const MCSubtargetInfo &STI) {
+  MCELFStreamer::EmitInstruction(Inst, STI);
+
+  MCContext &Context = getContext();
+  const MCRegisterInfo *MCRegInfo = Context.getRegisterInfo();
+
+  for (unsigned OpIndex = 0; OpIndex < Inst.getNumOperands(); ++OpIndex) {
+    const MCOperand &Op = Inst.getOperand(OpIndex);
+
+    if (!Op.isReg())
+      continue;
+
+    unsigned Reg = Op.getReg();
+    RegInfoRecord->SetPhysRegUsed(Reg, MCRegInfo);
+  }
+}
+
+void MipsELFStreamer::EmitMipsOptionRecords() {
+  for (const auto &I : MipsOptionRecords)
+    I->EmitMipsOptionRecord();
+}

 namespace llvm {
 MCELFStreamer *createMipsELFStreamer(MCContext &Context, MCAsmBackend &MAB,
diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h
index 641f8cf7af25..58863be9cc23 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h
@@ -15,8 +15,10 @@
 #ifndef MIPSELFSTREAMER_H
 #define MIPSELFSTREAMER_H

+#include "MipsOptionRecord.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/MC/MCELFStreamer.h"
-#include "llvm/Support/raw_ostream.h"
+#include <memory>

 namespace llvm {
 class MCAsmBackend;
@@ -25,13 +27,27 @@ class MCContext;
 class MCSubtargetInfo;

 class MipsELFStreamer : public MCELFStreamer {
+  SmallVector<std::unique_ptr<MipsOptionRecord>, 8> MipsOptionRecords;
+  MipsRegInfoRecord *RegInfoRecord;
 public:
   MipsELFStreamer(MCContext &Context, MCAsmBackend &MAB, raw_ostream &OS,
                   MCCodeEmitter *Emitter, const MCSubtargetInfo &STI)
-      : MCELFStreamer(Context, MAB, OS, Emitter) {}
+      : MCELFStreamer(Context, MAB, OS, Emitter) {

-  virtual ~MipsELFStreamer() {}
+    RegInfoRecord = new MipsRegInfoRecord(this, Context, STI);
+    MipsOptionRecords.push_back(
+        std::unique_ptr<MipsOptionRecord>(RegInfoRecord));
+  }
+
+  /// Overriding this function allows us to add arbitrary behaviour before the
+  /// \p Inst is actually emitted. For example, we can inspect the operands and
+  /// gather sufficient information that allows us to reason about the register
+  /// usage for the translation unit.
+  void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI) override;
+
+  /// Emits all the option records stored up until the point it's called.
+  void EmitMipsOptionRecords();
 };

 MCELFStreamer *createMipsELFStreamer(MCContext &Context, MCAsmBackend &MAB,
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
index 6aa3c762d9db..e415412ab6cb 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
@@ -38,7 +38,7 @@ MipsMCAsmInfo::MipsMCAsmInfo(StringRef TT) {
   ZeroDirective = "\t.space\t";
   GPRel32Directive = "\t.gpword\t";
   GPRel64Directive = "\t.gpdword\t";
-  DebugLabelSuffix = "=.";
+  UseAssignmentForEHBegin = true;
   SupportsDebugInformation = true;
   ExceptionsType = ExceptionHandling::DwarfCFI;
   HasLEB128 = true;
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
index 21ccc3c58b34..5bba3e5b7ae2 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
@@ -11,6 +11,7 @@
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCObjectStreamer.h"

 using namespace llvm;

@@ -83,33 +84,6 @@ MipsMCExpr::EvaluateAsRelocatableImpl(MCValue &Res,
   return getSubExpr()->EvaluateAsRelocatable(Res, Layout);
 }

-// FIXME: This basically copies MCObjectStreamer::AddValueSymbols. Perhaps
-// that method should be made public?
-static void AddValueSymbolsImpl(const MCExpr *Value, MCAssembler *Asm) {
-  switch (Value->getKind()) {
-  case MCExpr::Target:
-    llvm_unreachable("Can't handle nested target expr!");
-
-  case MCExpr::Constant:
-    break;
-
-  case MCExpr::Binary: {
-    const MCBinaryExpr *BE = cast<MCBinaryExpr>(Value);
-    AddValueSymbolsImpl(BE->getLHS(), Asm);
-    AddValueSymbolsImpl(BE->getRHS(), Asm);
-    break;
-  }
-
-  case MCExpr::SymbolRef:
-    Asm->getOrCreateSymbolData(cast<MCSymbolRefExpr>(Value)->getSymbol());
-    break;
-
-  case MCExpr::Unary:
-    AddValueSymbolsImpl(cast<MCUnaryExpr>(Value)->getSubExpr(), Asm);
-    break;
-  }
-}
-
-void MipsMCExpr::AddValueSymbols(MCAssembler *Asm) const {
-  AddValueSymbolsImpl(getSubExpr(), Asm);
+void MipsMCExpr::visitUsedExpr(MCStreamer &Streamer) const {
+  Streamer.visitUsedExpr(*getSubExpr());
 }
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h b/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h
index 8d7aacde31a1..f193dc9b9d50 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h
@@ -49,7 +49,7 @@ class MipsMCExpr : public MCTargetExpr {
   void PrintImpl(raw_ostream &OS) const override;
   bool EvaluateAsRelocatableImpl(MCValue &Res,
                                  const MCAsmLayout *Layout) const override;
-  void AddValueSymbols(MCAssembler *) const override;
+  void visitUsedExpr(MCStreamer &Streamer) const override;
   const MCSection *FindAssociatedSection() const override {
     return getSubExpr()->FindAssociatedSection();
   }
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
index 660e5a7be1b8..d2b929bea334 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
@@ -133,6 +133,12 @@ createMCAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS,
   return S;
 }

+static MCStreamer *createMipsNullStreamer(MCContext &Ctx) {
+  MCStreamer *S = llvm::createNullStreamer(Ctx);
+  new MipsTargetStreamer(*S);
+  return S;
+}
+
 extern "C" void LLVMInitializeMipsTargetMC() {
   // Register the MC asm info.
RegisterMCAsmInfoFn X(TheMipsTarget, createMipsMCAsmInfo); @@ -187,6 +193,12 @@ extern "C" void LLVMInitializeMipsTargetMC() { TargetRegistry::RegisterAsmStreamer(TheMips64Target, createMCAsmStreamer); TargetRegistry::RegisterAsmStreamer(TheMips64elTarget, createMCAsmStreamer); + TargetRegistry::RegisterNullStreamer(TheMipsTarget, createMipsNullStreamer); + TargetRegistry::RegisterNullStreamer(TheMipselTarget, createMipsNullStreamer); + TargetRegistry::RegisterNullStreamer(TheMips64Target, createMipsNullStreamer); + TargetRegistry::RegisterNullStreamer(TheMips64elTarget, + createMipsNullStreamer); + // Register the asm backend. TargetRegistry::RegisterMCAsmBackend(TheMipsTarget, createMipsAsmBackendEB32); diff --git a/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp b/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp index ce4b9a8f9e98..6cde8f9ae3e4 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp @@ -48,7 +48,13 @@ class MipsNaClELFStreamer : public MipsELFStreamer { bool PendingCall; bool isIndirectJump(const MCInst &MI) { - return MI.getOpcode() == Mips::JR || MI.getOpcode() == Mips::RET; + if (MI.getOpcode() == Mips::JALR) { + // MIPS32r6/MIPS64r6 doesn't have a JR instruction and uses JALR instead. + // JALR is an indirect branch if the link register is $0. + assert(MI.getOperand(0).isReg()); + return MI.getOperand(0).getReg() == Mips::ZERO; + } + return MI.getOpcode() == Mips::JR; } bool isStackPointerFirstOperand(const MCInst &MI) { @@ -56,7 +62,9 @@ class MipsNaClELFStreamer : public MipsELFStreamer { && MI.getOperand(0).getReg() == Mips::SP); } - bool isCall(unsigned Opcode, bool *IsIndirectCall) { + bool isCall(const MCInst &MI, bool *IsIndirectCall) { + unsigned Opcode = MI.getOpcode(); + *IsIndirectCall = false; switch (Opcode) { @@ -71,6 +79,12 @@ class MipsNaClELFStreamer : public MipsELFStreamer { return true; case Mips::JALR: + // JALR is only a call if the link register is not $0. Otherwise it's an + // indirect branch. + assert(MI.getOperand(0).isReg()); + if (MI.getOperand(0).getReg() == Mips::ZERO) + return false; + *IsIndirectCall = true; return true; } @@ -154,7 +168,7 @@ class MipsNaClELFStreamer : public MipsELFStreamer { // Sandbox calls by aligning call and branch delay to the bundle end. // For indirect calls, emit the mask before the call. bool IsIndirectCall; - if (isCall(Inst.getOpcode(), &IsIndirectCall)) { + if (isCall(Inst, &IsIndirectCall)) { if (PendingCall) report_fatal_error("Dangerous instruction in branch delay slot!"); diff --git a/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp b/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp new file mode 100644 index 000000000000..0ef220821320 --- /dev/null +++ b/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp @@ -0,0 +1,92 @@ +//===-- MipsOptionRecord.cpp - Abstraction for storing information --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "MipsOptionRecord.h" +#include "MipsELFStreamer.h" +#include "llvm/MC/MCSectionELF.h" + +using namespace llvm; + +void MipsRegInfoRecord::EmitMipsOptionRecord() { + MCAssembler &MCA = Streamer->getAssembler(); + Triple T(STI.getTargetTriple()); + uint64_t Features = STI.getFeatureBits(); + + Streamer->PushSection(); + + // We need to distinguish between N64 and the rest because at the moment + // we don't emit .Mips.options for other ELFs other than N64. + // Since .reginfo has the same information as .Mips.options (ODK_REGINFO), + // we can use the same abstraction (MipsRegInfoRecord class) to handle both. + if (Features & Mips::FeatureN64) { + // The EntrySize value of 1 seems strange since the records are neither + // 1-byte long nor fixed length but it matches the value GAS emits. + const MCSectionELF *Sec = + Context.getELFSection(".MIPS.options", ELF::SHT_MIPS_OPTIONS, + ELF::SHF_ALLOC | ELF::SHF_MIPS_NOSTRIP, + SectionKind::getMetadata(), 1, ""); + MCA.getOrCreateSectionData(*Sec).setAlignment(8); + Streamer->SwitchSection(Sec); + + Streamer->EmitIntValue(1, 1); // kind + Streamer->EmitIntValue(40, 1); // size + Streamer->EmitIntValue(0, 2); // section + Streamer->EmitIntValue(0, 4); // info + Streamer->EmitIntValue(ri_gprmask, 4); + Streamer->EmitIntValue(0, 4); // pad + Streamer->EmitIntValue(ri_cprmask[0], 4); + Streamer->EmitIntValue(ri_cprmask[1], 4); + Streamer->EmitIntValue(ri_cprmask[2], 4); + Streamer->EmitIntValue(ri_cprmask[3], 4); + Streamer->EmitIntValue(ri_gp_value, 8); + } else { + const MCSectionELF *Sec = + Context.getELFSection(".reginfo", ELF::SHT_MIPS_REGINFO, ELF::SHF_ALLOC, + SectionKind::getMetadata(), 24, ""); + MCA.getOrCreateSectionData(*Sec) + .setAlignment(Features & Mips::FeatureN32 ? 
8 : 4); + Streamer->SwitchSection(Sec); + + Streamer->EmitIntValue(ri_gprmask, 4); + Streamer->EmitIntValue(ri_cprmask[0], 4); + Streamer->EmitIntValue(ri_cprmask[1], 4); + Streamer->EmitIntValue(ri_cprmask[2], 4); + Streamer->EmitIntValue(ri_cprmask[3], 4); + assert((ri_gp_value & 0xffffffff) == ri_gp_value); + Streamer->EmitIntValue(ri_gp_value, 4); + } + + Streamer->PopSection(); +} + +void MipsRegInfoRecord::SetPhysRegUsed(unsigned Reg, + const MCRegisterInfo *MCRegInfo) { + unsigned Value = 0; + + for (MCSubRegIterator SubRegIt(Reg, MCRegInfo, true); SubRegIt.isValid(); + ++SubRegIt) { + unsigned CurrentSubReg = *SubRegIt; + + unsigned EncVal = MCRegInfo->getEncodingValue(CurrentSubReg); + Value |= 1 << EncVal; + + if (GPR32RegClass->contains(CurrentSubReg) || + GPR64RegClass->contains(CurrentSubReg)) + ri_gprmask |= Value; + else if (FGR32RegClass->contains(CurrentSubReg) || + FGR64RegClass->contains(CurrentSubReg) || + AFGR64RegClass->contains(CurrentSubReg) || + MSA128BRegClass->contains(CurrentSubReg)) + ri_cprmask[1] |= Value; + else if (COP2RegClass->contains(CurrentSubReg)) + ri_cprmask[2] |= Value; + else if (COP3RegClass->contains(CurrentSubReg)) + ri_cprmask[3] |= Value; + } +} diff --git a/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp index a8fa27244d94..4a178e2df7a9 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "InstPrinter/MipsInstPrinter.h" +#include "MipsELFStreamer.h" #include "MipsMCTargetDesc.h" #include "MipsTargetObjectFile.h" #include "MipsTargetStreamer.h" @@ -27,10 +28,43 @@ using namespace llvm; -// Pin vtable to this file. 
-void MipsTargetStreamer::anchor() {} - -MipsTargetStreamer::MipsTargetStreamer(MCStreamer &S) : MCTargetStreamer(S) {} +MipsTargetStreamer::MipsTargetStreamer(MCStreamer &S) + : MCTargetStreamer(S), canHaveModuleDirective(true) {} +void MipsTargetStreamer::emitDirectiveSetMicroMips() {} +void MipsTargetStreamer::emitDirectiveSetNoMicroMips() {} +void MipsTargetStreamer::emitDirectiveSetMips16() {} +void MipsTargetStreamer::emitDirectiveSetNoMips16() {} +void MipsTargetStreamer::emitDirectiveSetReorder() {} +void MipsTargetStreamer::emitDirectiveSetNoReorder() {} +void MipsTargetStreamer::emitDirectiveSetMacro() {} +void MipsTargetStreamer::emitDirectiveSetNoMacro() {} +void MipsTargetStreamer::emitDirectiveSetAt() {} +void MipsTargetStreamer::emitDirectiveSetNoAt() {} +void MipsTargetStreamer::emitDirectiveEnd(StringRef Name) {} +void MipsTargetStreamer::emitDirectiveEnt(const MCSymbol &Symbol) {} +void MipsTargetStreamer::emitDirectiveAbiCalls() {} +void MipsTargetStreamer::emitDirectiveNaN2008() {} +void MipsTargetStreamer::emitDirectiveNaNLegacy() {} +void MipsTargetStreamer::emitDirectiveOptionPic0() {} +void MipsTargetStreamer::emitDirectiveOptionPic2() {} +void MipsTargetStreamer::emitFrame(unsigned StackReg, unsigned StackSize, + unsigned ReturnReg) {} +void MipsTargetStreamer::emitMask(unsigned CPUBitmask, int CPUTopSavedRegOff) {} +void MipsTargetStreamer::emitFMask(unsigned FPUBitmask, int FPUTopSavedRegOff) { +} +void MipsTargetStreamer::emitDirectiveSetMips32R2() {} +void MipsTargetStreamer::emitDirectiveSetMips64() {} +void MipsTargetStreamer::emitDirectiveSetMips64R2() {} +void MipsTargetStreamer::emitDirectiveSetDsp() {} +void MipsTargetStreamer::emitDirectiveCpload(unsigned RegNo) {} +void MipsTargetStreamer::emitDirectiveCpsetup(unsigned RegNo, int RegOrOffset, + const MCSymbol &Sym, bool IsReg) { +} +void MipsTargetStreamer::emitDirectiveModuleOddSPReg(bool Enabled, + bool IsO32ABI) { + if (!Enabled && !IsO32ABI) + report_fatal_error("+nooddspreg is only valid for O32"); +} MipsTargetAsmStreamer::MipsTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS) @@ -38,42 +72,52 @@ MipsTargetAsmStreamer::MipsTargetAsmStreamer(MCStreamer &S, void MipsTargetAsmStreamer::emitDirectiveSetMicroMips() { OS << "\t.set\tmicromips\n"; + setCanHaveModuleDir(false); } void MipsTargetAsmStreamer::emitDirectiveSetNoMicroMips() { OS << "\t.set\tnomicromips\n"; + setCanHaveModuleDir(false); } void MipsTargetAsmStreamer::emitDirectiveSetMips16() { OS << "\t.set\tmips16\n"; + setCanHaveModuleDir(false); } void MipsTargetAsmStreamer::emitDirectiveSetNoMips16() { OS << "\t.set\tnomips16\n"; + setCanHaveModuleDir(false); } void MipsTargetAsmStreamer::emitDirectiveSetReorder() { OS << "\t.set\treorder\n"; + setCanHaveModuleDir(false); } void MipsTargetAsmStreamer::emitDirectiveSetNoReorder() { OS << "\t.set\tnoreorder\n"; + setCanHaveModuleDir(false); } void MipsTargetAsmStreamer::emitDirectiveSetMacro() { OS << "\t.set\tmacro\n"; + setCanHaveModuleDir(false); } void MipsTargetAsmStreamer::emitDirectiveSetNoMacro() { OS << "\t.set\tnomacro\n"; + setCanHaveModuleDir(false); } void MipsTargetAsmStreamer::emitDirectiveSetAt() { OS << "\t.set\tat\n"; + setCanHaveModuleDir(false); } void MipsTargetAsmStreamer::emitDirectiveSetNoAt() { OS << "\t.set\tnoat\n"; + setCanHaveModuleDir(false); } void MipsTargetAsmStreamer::emitDirectiveEnd(StringRef Name) { @@ -110,24 +154,28 @@ void MipsTargetAsmStreamer::emitFrame(unsigned StackReg, unsigned StackSize, void 
MipsTargetAsmStreamer::emitDirectiveSetMips32R2() { OS << "\t.set\tmips32r2\n"; + setCanHaveModuleDir(false); } void MipsTargetAsmStreamer::emitDirectiveSetMips64() { OS << "\t.set\tmips64\n"; + setCanHaveModuleDir(false); } void MipsTargetAsmStreamer::emitDirectiveSetMips64R2() { OS << "\t.set\tmips64r2\n"; + setCanHaveModuleDir(false); } void MipsTargetAsmStreamer::emitDirectiveSetDsp() { OS << "\t.set\tdsp\n"; + setCanHaveModuleDir(false); } // Print a 32 bit hex number with all numbers. static void printHex32(unsigned Value, raw_ostream &OS) { OS << "0x"; for (int i = 7; i >= 0; i--) - OS.write_hex((Value & (0xF << (i*4))) >> (i*4)); + OS.write_hex((Value & (0xF << (i * 4))) >> (i * 4)); } void MipsTargetAsmStreamer::emitMask(unsigned CPUBitmask, @@ -147,6 +195,7 @@ void MipsTargetAsmStreamer::emitFMask(unsigned FPUBitmask, void MipsTargetAsmStreamer::emitDirectiveCpload(unsigned RegNo) { OS << "\t.cpload\t$" << StringRef(MipsInstPrinter::getRegisterName(RegNo)).lower() << "\n"; + setCanHaveModuleDir(false); } void MipsTargetAsmStreamer::emitDirectiveCpsetup(unsigned RegNo, @@ -165,6 +214,34 @@ void MipsTargetAsmStreamer::emitDirectiveCpsetup(unsigned RegNo, OS << ", "; OS << Sym.getName() << "\n"; + setCanHaveModuleDir(false); +} + +void MipsTargetAsmStreamer::emitDirectiveModuleFP( + MipsABIFlagsSection::FpABIKind Value, bool Is32BitABI) { + MipsTargetStreamer::emitDirectiveModuleFP(Value, Is32BitABI); + + StringRef ModuleValue; + OS << "\t.module\tfp="; + OS << ABIFlagsSection.getFpABIString(Value) << "\n"; +} + +void MipsTargetAsmStreamer::emitDirectiveSetFp( + MipsABIFlagsSection::FpABIKind Value) { + StringRef ModuleValue; + OS << "\t.set\tfp="; + OS << ABIFlagsSection.getFpABIString(Value) << "\n"; +} + +void MipsTargetAsmStreamer::emitMipsAbiFlags() { + // No action required for text output. +} + +void MipsTargetAsmStreamer::emitDirectiveModuleOddSPReg(bool Enabled, + bool IsO32ABI) { + MipsTargetStreamer::emitDirectiveModuleOddSPReg(Enabled, IsO32ABI); + + OS << "\t.module\t" << (Enabled ? "" : "no") << "oddspreg\n"; } // This part is for ELF object output. @@ -174,7 +251,7 @@ MipsTargetELFStreamer::MipsTargetELFStreamer(MCStreamer &S, MCAssembler &MCA = getStreamer().getAssembler(); uint64_t Features = STI.getFeatureBits(); Triple T(STI.getTargetTriple()); - Pic = (MCA.getContext().getObjectFileInfo()->getRelocM() == Reloc::PIC_) + Pic = (MCA.getContext().getObjectFileInfo()->getRelocM() == Reloc::PIC_) ? 
true : false; @@ -182,38 +259,52 @@ MipsTargetELFStreamer::MipsTargetELFStreamer(MCStreamer &S, unsigned EFlags = 0; // Architecture - if (Features & Mips::FeatureMips64r2) + if (Features & Mips::FeatureMips64r6) + EFlags |= ELF::EF_MIPS_ARCH_64R6; + else if (Features & Mips::FeatureMips64r2) EFlags |= ELF::EF_MIPS_ARCH_64R2; else if (Features & Mips::FeatureMips64) EFlags |= ELF::EF_MIPS_ARCH_64; + else if (Features & Mips::FeatureMips5) + EFlags |= ELF::EF_MIPS_ARCH_5; else if (Features & Mips::FeatureMips4) EFlags |= ELF::EF_MIPS_ARCH_4; + else if (Features & Mips::FeatureMips3) + EFlags |= ELF::EF_MIPS_ARCH_3; + else if (Features & Mips::FeatureMips32r6) + EFlags |= ELF::EF_MIPS_ARCH_32R6; else if (Features & Mips::FeatureMips32r2) EFlags |= ELF::EF_MIPS_ARCH_32R2; else if (Features & Mips::FeatureMips32) EFlags |= ELF::EF_MIPS_ARCH_32; + else if (Features & Mips::FeatureMips2) + EFlags |= ELF::EF_MIPS_ARCH_2; + else + EFlags |= ELF::EF_MIPS_ARCH_1; - if (T.isArch64Bit()) { - if (Features & Mips::FeatureN32) - EFlags |= ELF::EF_MIPS_ABI2; - else if (Features & Mips::FeatureO32) { - EFlags |= ELF::EF_MIPS_ABI_O32; - EFlags |= ELF::EF_MIPS_32BITMODE; /* Compatibility Mode */ - } - // No need to set any bit for N64 which is the default ABI at the moment - // for 64-bit Mips architectures. - } else { - if (Features & Mips::FeatureMips64r2 || Features & Mips::FeatureMips64) - EFlags |= ELF::EF_MIPS_32BITMODE; - - // ABI + // ABI + // N64 does not require any ABI bits. + if (Features & Mips::FeatureO32) EFlags |= ELF::EF_MIPS_ABI_O32; - } + else if (Features & Mips::FeatureN32) + EFlags |= ELF::EF_MIPS_ABI2; + + if (Features & Mips::FeatureGP64Bit) { + if (Features & Mips::FeatureO32) + EFlags |= ELF::EF_MIPS_32BITMODE; /* Compatibility Mode */ + } else if (Features & Mips::FeatureMips64r2 || Features & Mips::FeatureMips64) + EFlags |= ELF::EF_MIPS_32BITMODE; // Other options. if (Features & Mips::FeatureNaN2008) EFlags |= ELF::EF_MIPS_NAN2008; + // -mabicalls and -mplt are not implemented but we should act as if they were + // given. 
+ EFlags |= ELF::EF_MIPS_CPIC; + if (Features & Mips::FeatureN64) + EFlags |= ELF::EF_MIPS_PIC; + MCA.setELFHeaderEFlags(EFlags); } @@ -233,41 +324,27 @@ void MipsTargetELFStreamer::emitLabel(MCSymbol *Symbol) { void MipsTargetELFStreamer::finish() { MCAssembler &MCA = getStreamer().getAssembler(); - MCContext &Context = MCA.getContext(); - MCStreamer &OS = getStreamer(); - Triple T(STI.getTargetTriple()); - uint64_t Features = STI.getFeatureBits(); + const MCObjectFileInfo &OFI = *MCA.getContext().getObjectFileInfo(); - if (T.isArch64Bit() && (Features & Mips::FeatureN64)) { - const MCSectionELF *Sec = Context.getELFSection( - ".MIPS.options", ELF::SHT_MIPS_OPTIONS, - ELF::SHF_ALLOC | ELF::SHF_MIPS_NOSTRIP, SectionKind::getMetadata()); - OS.SwitchSection(Sec); - - OS.EmitIntValue(1, 1); // kind - OS.EmitIntValue(40, 1); // size - OS.EmitIntValue(0, 2); // section - OS.EmitIntValue(0, 4); // info - OS.EmitIntValue(0, 4); // ri_gprmask - OS.EmitIntValue(0, 4); // pad - OS.EmitIntValue(0, 4); // ri_cpr[0]mask - OS.EmitIntValue(0, 4); // ri_cpr[1]mask - OS.EmitIntValue(0, 4); // ri_cpr[2]mask - OS.EmitIntValue(0, 4); // ri_cpr[3]mask - OS.EmitIntValue(0, 8); // ri_gp_value - } else { - const MCSectionELF *Sec = - Context.getELFSection(".reginfo", ELF::SHT_MIPS_REGINFO, ELF::SHF_ALLOC, - SectionKind::getMetadata()); - OS.SwitchSection(Sec); - - OS.EmitIntValue(0, 4); // ri_gprmask - OS.EmitIntValue(0, 4); // ri_cpr[0]mask - OS.EmitIntValue(0, 4); // ri_cpr[1]mask - OS.EmitIntValue(0, 4); // ri_cpr[2]mask - OS.EmitIntValue(0, 4); // ri_cpr[3]mask - OS.EmitIntValue(0, 4); // ri_gp_value - } + // .bss, .text and .data are always at least 16-byte aligned. + MCSectionData &TextSectionData = + MCA.getOrCreateSectionData(*OFI.getTextSection()); + MCSectionData &DataSectionData = + MCA.getOrCreateSectionData(*OFI.getDataSection()); + MCSectionData &BSSSectionData = + MCA.getOrCreateSectionData(*OFI.getBSSSection()); + + TextSectionData.setAlignment(std::max(16u, TextSectionData.getAlignment())); + DataSectionData.setAlignment(std::max(16u, DataSectionData.getAlignment())); + BSSSectionData.setAlignment(std::max(16u, BSSSectionData.getAlignment())); + + // Emit all the option records. + // At the moment we are only emitting .Mips.options (ODK_REGINFO) and + // .reginfo. 
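For reference, both records emitted here follow fixed SVR4 MIPS ABI layouts. The mirror structs below are illustrative only (they are not LLVM API), but they account for the sizes seen in this code: 24 bytes for a .reginfo entry, and 40 for the ODK_REGINFO option record (an 8-byte option header plus a 32-byte 64-bit register-info body):

#include <cstdint>

// 32-bit .reginfo payload: the six 4-byte EmitIntValue calls, in order.
struct Elf32_RegInfo {
  uint32_t ri_gprmask;    // general registers used
  uint32_t ri_cprmask[4]; // coprocessor registers used
  int32_t  ri_gp_value;   // initial $gp value
};
static_assert(sizeof(Elf32_RegInfo) == 24, "fixed on-disk layout");

// 64-bit variant used inside .MIPS.options (ODK_REGINFO): a pad word is added
// and ri_gp_value widens to 8 bytes, giving 32 bytes; together with the
// 8-byte record header (kind, size, section, info) that is the size of 40
// written by the emission code.
struct Elf64_RegInfo {
  uint32_t ri_gprmask;
  uint32_t ri_pad;
  uint32_t ri_cprmask[4];
  int64_t  ri_gp_value;
};
static_assert(sizeof(Elf64_RegInfo) == 32, "fixed on-disk layout");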
+ MipsELFStreamer &MEF = static_cast(Streamer); + MEF.EmitMipsOptionRecords(); + + emitMipsAbiFlags(); } void MipsTargetELFStreamer::emitAssignment(MCSymbol *Symbol, @@ -276,11 +353,11 @@ void MipsTargetELFStreamer::emitAssignment(MCSymbol *Symbol, if (Value->getKind() != MCExpr::SymbolRef) return; const MCSymbol &RhsSym = - static_cast(Value)->getSymbol(); + static_cast(Value)->getSymbol(); MCSymbolData &Data = getStreamer().getOrCreateSymbolData(&RhsSym); uint8_t Type = MCELF::GetType(Data); - if ((Type != ELF::STT_FUNC) - || !(MCELF::getOther(Data) & (ELF::STO_MIPS_MICROMIPS >> 2))) + if ((Type != ELF::STT_FUNC) || + !(MCELF::getOther(Data) & (ELF::STO_MIPS_MICROMIPS >> 2))) return; MCSymbolData &SymbolData = getStreamer().getOrCreateSymbolData(Symbol); @@ -305,6 +382,7 @@ void MipsTargetELFStreamer::emitDirectiveSetMicroMips() { void MipsTargetELFStreamer::emitDirectiveSetNoMicroMips() { MicroMipsEnabled = false; + setCanHaveModuleDir(false); } void MipsTargetELFStreamer::emitDirectiveSetMips16() { @@ -312,14 +390,17 @@ void MipsTargetELFStreamer::emitDirectiveSetMips16() { unsigned Flags = MCA.getELFHeaderEFlags(); Flags |= ELF::EF_MIPS_ARCH_ASE_M16; MCA.setELFHeaderEFlags(Flags); + setCanHaveModuleDir(false); } void MipsTargetELFStreamer::emitDirectiveSetNoMips16() { // FIXME: implement. + setCanHaveModuleDir(false); } void MipsTargetELFStreamer::emitDirectiveSetReorder() { // FIXME: implement. + setCanHaveModuleDir(false); } void MipsTargetELFStreamer::emitDirectiveSetNoReorder() { @@ -327,22 +408,27 @@ void MipsTargetELFStreamer::emitDirectiveSetNoReorder() { unsigned Flags = MCA.getELFHeaderEFlags(); Flags |= ELF::EF_MIPS_NOREORDER; MCA.setELFHeaderEFlags(Flags); + setCanHaveModuleDir(false); } void MipsTargetELFStreamer::emitDirectiveSetMacro() { // FIXME: implement. + setCanHaveModuleDir(false); } void MipsTargetELFStreamer::emitDirectiveSetNoMacro() { // FIXME: implement. + setCanHaveModuleDir(false); } void MipsTargetELFStreamer::emitDirectiveSetAt() { // FIXME: implement. + setCanHaveModuleDir(false); } void MipsTargetELFStreamer::emitDirectiveSetNoAt() { // FIXME: implement. + setCanHaveModuleDir(false); } void MipsTargetELFStreamer::emitDirectiveEnd(StringRef Name) { @@ -411,19 +497,19 @@ void MipsTargetELFStreamer::emitFMask(unsigned FPUBitmask, } void MipsTargetELFStreamer::emitDirectiveSetMips32R2() { - // No action required for ELF output. + setCanHaveModuleDir(false); } void MipsTargetELFStreamer::emitDirectiveSetMips64() { - // No action required for ELF output. + setCanHaveModuleDir(false); } void MipsTargetELFStreamer::emitDirectiveSetMips64R2() { - // No action required for ELF output. + setCanHaveModuleDir(false); } void MipsTargetELFStreamer::emitDirectiveSetDsp() { - // No action required for ELF output. 
+ setCanHaveModuleDir(false); } void MipsTargetELFStreamer::emitDirectiveCpload(unsigned RegNo) { @@ -473,6 +559,8 @@ void MipsTargetELFStreamer::emitDirectiveCpload(unsigned RegNo) { TmpInst.addOperand(MCOperand::CreateReg(Mips::GP)); TmpInst.addOperand(MCOperand::CreateReg(RegNo)); getStreamer().EmitInstruction(TmpInst, STI); + + setCanHaveModuleDir(false); } void MipsTargetELFStreamer::emitDirectiveCpsetup(unsigned RegNo, @@ -528,4 +616,27 @@ void MipsTargetELFStreamer::emitDirectiveCpsetup(unsigned RegNo, Inst.addOperand(MCOperand::CreateReg(Mips::GP)); Inst.addOperand(MCOperand::CreateReg(RegNo)); getStreamer().EmitInstruction(Inst, STI); + + setCanHaveModuleDir(false); +} + +void MipsTargetELFStreamer::emitMipsAbiFlags() { + MCAssembler &MCA = getStreamer().getAssembler(); + MCContext &Context = MCA.getContext(); + MCStreamer &OS = getStreamer(); + const MCSectionELF *Sec = + Context.getELFSection(".MIPS.abiflags", ELF::SHT_MIPS_ABIFLAGS, + ELF::SHF_ALLOC, SectionKind::getMetadata(), 24, ""); + MCSectionData &ABIShndxSD = MCA.getOrCreateSectionData(*Sec); + ABIShndxSD.setAlignment(8); + OS.SwitchSection(Sec); + + OS << ABIFlagsSection; +} + +void MipsTargetELFStreamer::emitDirectiveModuleOddSPReg(bool Enabled, + bool IsO32ABI) { + MipsTargetStreamer::emitDirectiveModuleOddSPReg(Enabled, IsO32ABI); + + ABIFlagsSection.OddSPReg = Enabled; } diff --git a/lib/Target/Mips/MicroMipsInstrInfo.td b/lib/Target/Mips/MicroMipsInstrInfo.td index 9904bc690c23..87a3a3e29ca2 100644 --- a/lib/Target/Mips/MicroMipsInstrInfo.td +++ b/lib/Target/Mips/MicroMipsInstrInfo.td @@ -246,7 +246,6 @@ let DecoderNamespace = "MicroMips", Predicates = [InMicroMips] in { } def JR_MM : MMRel, IndirectBranch<"jr", GPR32Opnd>, JR_FM_MM<0x3c>; def JALR_MM : JumpLinkReg<"jalr", GPR32Opnd>, JALR_FM_MM<0x03c>; - def RET_MM : MMRel, RetBase<"ret", GPR32Opnd>, JR_FM_MM<0x3c>; /// Branch Instructions def BEQ_MM : MMRel, CBranch<"beq", brtarget_mm, seteq, GPR32Opnd>, diff --git a/lib/Target/Mips/Mips.td b/lib/Target/Mips/Mips.td index ea16331f71d8..dd3bc9b08fc8 100644 --- a/lib/Target/Mips/Mips.td +++ b/lib/Target/Mips/Mips.td @@ -61,6 +61,8 @@ def FeatureGP64Bit : SubtargetFeature<"gp64", "IsGP64bit", "true", "General Purpose Registers are 64-bit wide.">; def FeatureFP64Bit : SubtargetFeature<"fp64", "IsFP64bit", "true", "Support 64-bit FP registers.">; +def FeatureFPXX : SubtargetFeature<"fpxx", "IsFPXX", "true", + "Support for FPXX.">; def FeatureNaN2008 : SubtargetFeature<"nan2008", "IsNaN2008bit", "true", "IEEE 754-2008 NaN encoding.">; def FeatureSingleFloat : SubtargetFeature<"single-float", "IsSingleFloat", @@ -73,6 +75,9 @@ def FeatureN64 : SubtargetFeature<"n64", "MipsABI", "N64", "Enable n64 ABI">; def FeatureEABI : SubtargetFeature<"eabi", "MipsABI", "EABI", "Enable eabi ABI">; +def FeatureNoOddSPReg : SubtargetFeature<"nooddspreg", "UseOddSPReg", "false", + "Disable odd numbered single-precision " + "registers">; def FeatureVFPU : SubtargetFeature<"vfpu", "HasVFPU", "true", "Enable vector FPU instructions.">; def FeatureMips1 : SubtargetFeature<"mips1", "MipsArchVersion", "Mips1", diff --git a/lib/Target/Mips/Mips16FrameLowering.cpp b/lib/Target/Mips/Mips16FrameLowering.cpp index c01d03a1ddf6..93706c2c3fcc 100644 --- a/lib/Target/Mips/Mips16FrameLowering.cpp +++ b/lib/Target/Mips/Mips16FrameLowering.cpp @@ -16,6 +16,7 @@ #include "Mips16InstrInfo.h" #include "MipsInstrInfo.h" #include "MipsRegisterInfo.h" +#include "MipsSubtarget.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include 
"llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -28,6 +29,9 @@ using namespace llvm; +Mips16FrameLowering::Mips16FrameLowering(const MipsSubtarget &STI) + : MipsFrameLowering(STI, STI.stackAlignment()) {} + void Mips16FrameLowering::emitPrologue(MachineFunction &MF) const { MachineBasicBlock &MBB = MF.front(); MachineFrameInfo *MFI = MF.getFrameInfo(); diff --git a/lib/Target/Mips/Mips16FrameLowering.h b/lib/Target/Mips/Mips16FrameLowering.h index 3f7829dd6b3f..1fb7eda0e915 100644 --- a/lib/Target/Mips/Mips16FrameLowering.h +++ b/lib/Target/Mips/Mips16FrameLowering.h @@ -19,8 +19,7 @@ namespace llvm { class Mips16FrameLowering : public MipsFrameLowering { public: - explicit Mips16FrameLowering(const MipsSubtarget &STI) - : MipsFrameLowering(STI, STI.stackAlignment()) {} + explicit Mips16FrameLowering(const MipsSubtarget &STI); /// emitProlog/emitEpilog - These methods insert prolog and epilog code into /// the function. diff --git a/lib/Target/Mips/Mips16ISelDAGToDAG.cpp b/lib/Target/Mips/Mips16ISelDAGToDAG.cpp index 4e86a27e323c..7b0584255542 100644 --- a/lib/Target/Mips/Mips16ISelDAGToDAG.cpp +++ b/lib/Target/Mips/Mips16ISelDAGToDAG.cpp @@ -37,7 +37,8 @@ using namespace llvm; #define DEBUG_TYPE "mips-isel" bool Mips16DAGToDAGISel::runOnMachineFunction(MachineFunction &MF) { - if (!Subtarget.inMips16Mode()) + Subtarget = &TM.getSubtarget(); + if (!Subtarget->inMips16Mode()) return false; return MipsDAGToDAGISel::runOnMachineFunction(MF); } @@ -226,9 +227,9 @@ bool Mips16DAGToDAGISel::selectAddr16( const LSBaseSDNode *LS = dyn_cast(Parent); if (LS) { - if (LS->getMemoryVT() == MVT::f32 && Subtarget.hasMips4_32r2()) + if (LS->getMemoryVT() == MVT::f32 && Subtarget->hasMips4_32r2()) return false; - if (LS->getMemoryVT() == MVT::f64 && Subtarget.hasMips4_32r2()) + if (LS->getMemoryVT() == MVT::f64 && Subtarget->hasMips4_32r2()) return false; } } diff --git a/lib/Target/Mips/Mips16ISelLowering.cpp b/lib/Target/Mips/Mips16ISelLowering.cpp index 9102450dc533..587925df946b 100644 --- a/lib/Target/Mips/Mips16ISelLowering.cpp +++ b/lib/Target/Mips/Mips16ISelLowering.cpp @@ -118,20 +118,14 @@ static const Mips16IntrinsicHelperType Mips16IntrinsicHelper[] = { {"truncf", "__mips16_call_stub_sf_1"}, }; -Mips16TargetLowering::Mips16TargetLowering(MipsTargetMachine &TM) - : MipsTargetLowering(TM) { - // - // set up as if mips32 and then revert so we can test the mechanism - // for switching - addRegisterClass(MVT::i32, &Mips::GPR32RegClass); - addRegisterClass(MVT::f32, &Mips::FGR32RegClass); - computeRegisterProperties(); - clearRegisterClasses(); +Mips16TargetLowering::Mips16TargetLowering(MipsTargetMachine &TM, + const MipsSubtarget &STI) + : MipsTargetLowering(TM, STI) { // Set up the register classes addRegisterClass(MVT::i32, &Mips::CPU16RegsRegClass); - if (Subtarget->inMips16HardFloat()) + if (!TM.Options.UseSoftFloat) setMips16HardFloatLibCalls(); setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Expand); @@ -157,8 +151,9 @@ Mips16TargetLowering::Mips16TargetLowering(MipsTargetMachine &TM) } const MipsTargetLowering * -llvm::createMips16TargetLowering(MipsTargetMachine &TM) { - return new Mips16TargetLowering(TM); +llvm::createMips16TargetLowering(MipsTargetMachine &TM, + const MipsSubtarget &STI) { + return new Mips16TargetLowering(TM, STI); } bool @@ -434,7 +429,7 @@ getOpndList(SmallVectorImpl &Ops, const char* Mips16HelperFunction = nullptr; bool NeedMips16Helper = false; - if (Subtarget->inMips16HardFloat()) { + if (Subtarget.inMips16HardFloat()) { // // 
currently we don't have symbols tagged with the mips16 or mips32 // qualifier so we will assume that we don't know what kind it is. diff --git a/lib/Target/Mips/Mips16ISelLowering.h b/lib/Target/Mips/Mips16ISelLowering.h index df883339b26b..e7e4d7f651d4 100644 --- a/lib/Target/Mips/Mips16ISelLowering.h +++ b/lib/Target/Mips/Mips16ISelLowering.h @@ -11,15 +11,16 @@ // //===----------------------------------------------------------------------===// -#ifndef Mips16ISELLOWERING_H -#define Mips16ISELLOWERING_H +#ifndef MIPS16ISELLOWERING_H +#define MIPS16ISELLOWERING_H #include "MipsISelLowering.h" namespace llvm { class Mips16TargetLowering : public MipsTargetLowering { public: - explicit Mips16TargetLowering(MipsTargetMachine &TM); + explicit Mips16TargetLowering(MipsTargetMachine &TM, + const MipsSubtarget &STI); bool allowsUnalignedMemoryAccesses(EVT VT, unsigned AddrSpace, bool *Fast) const override; diff --git a/lib/Target/Mips/Mips16InstrInfo.cpp b/lib/Target/Mips/Mips16InstrInfo.cpp index 79607de3397e..4dd9af24968d 100644 --- a/lib/Target/Mips/Mips16InstrInfo.cpp +++ b/lib/Target/Mips/Mips16InstrInfo.cpp @@ -31,9 +31,8 @@ using namespace llvm; #define DEBUG_TYPE "mips16-instrinfo" -Mips16InstrInfo::Mips16InstrInfo(MipsTargetMachine &tm) - : MipsInstrInfo(tm, Mips::Bimm16), - RI(*tm.getSubtargetImpl()) {} +Mips16InstrInfo::Mips16InstrInfo(const MipsSubtarget &STI) + : MipsInstrInfo(STI, Mips::Bimm16), RI(STI) {} const MipsRegisterInfo &Mips16InstrInfo::getRegisterInfo() const { return RI; @@ -44,9 +43,8 @@ const MipsRegisterInfo &Mips16InstrInfo::getRegisterInfo() const { /// the destination along with the FrameIndex of the loaded stack slot. If /// not, return 0. This predicate must return 0 if the instruction has /// any side effects other than loading from the stack slot. -unsigned Mips16InstrInfo:: -isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const -{ +unsigned Mips16InstrInfo::isLoadFromStackSlot(const MachineInstr *MI, + int &FrameIndex) const { return 0; } @@ -55,9 +53,8 @@ isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const /// the source reg along with the FrameIndex of the loaded stack slot. If /// not, return 0. This predicate must return 0 if the instruction has /// any side effects other than storing to the stack slot. 
-unsigned Mips16InstrInfo:: -isStoreToStackSlot(const MachineInstr *MI, int &FrameIndex) const -{ +unsigned Mips16InstrInfo::isStoreToStackSlot(const MachineInstr *MI, + int &FrameIndex) const { return 0; } @@ -93,11 +90,12 @@ void Mips16InstrInfo::copyPhysReg(MachineBasicBlock &MBB, MIB.addReg(SrcReg, getKillRegState(KillSrc)); } -void Mips16InstrInfo:: -storeRegToStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, - unsigned SrcReg, bool isKill, int FI, - const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - int64_t Offset) const { +void Mips16InstrInfo::storeRegToStack(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + unsigned SrcReg, bool isKill, int FI, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI, + int64_t Offset) const { DebugLoc DL; if (I != MBB.end()) DL = I->getDebugLoc(); MachineMemOperand *MMO = GetMemOperand(MBB, FI, MachineMemOperand::MOStore); @@ -110,10 +108,12 @@ storeRegToStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, .addMemOperand(MMO); } -void Mips16InstrInfo:: -loadRegFromStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, - unsigned DestReg, int FI, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, int64_t Offset) const { +void Mips16InstrInfo::loadRegFromStack(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + unsigned DestReg, int FI, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI, + int64_t Offset) const { DebugLoc DL; if (I != MBB.end()) DL = I->getDebugLoc(); MachineMemOperand *MMO = GetMemOperand(MBB, FI, MachineMemOperand::MOLoad); @@ -171,7 +171,8 @@ unsigned Mips16InstrInfo::getOppositeBranchOpc(unsigned Opc) const { } static void addSaveRestoreRegs(MachineInstrBuilder &MIB, - const std::vector &CSI, unsigned Flags=0) { + const std::vector &CSI, + unsigned Flags = 0) { for (unsigned i = 0, e = CSI.size(); i != e; ++i) { // Add the callee-saved register as live-in. Do not add if the register is // RA and return address is taken, because it has already been added in @@ -195,8 +196,8 @@ static void addSaveRestoreRegs(MachineInstrBuilder &MIB, } // Adjust SP by FrameSize bytes. Save RA, S0, S1 void Mips16InstrInfo::makeFrame(unsigned SP, int64_t FrameSize, - MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) const { + MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const { DebugLoc DL = I != MBB.end() ? I->getDebugLoc() : DebugLoc(); MachineFunction &MF = *MBB.getParent(); MachineFrameInfo *MFI = MF.getFrameInfo(); @@ -265,9 +266,6 @@ void Mips16InstrInfo::adjustStackPtrBig(unsigned SP, int64_t Amount, MachineBasicBlock::iterator I, unsigned Reg1, unsigned Reg2) const { DebugLoc DL = I != MBB.end() ? 
I->getDebugLoc() : DebugLoc(); -// MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo(); -// unsigned Reg1 = RegInfo.createVirtualRegister(&Mips::CPU16RegsRegClass); -// unsigned Reg2 = RegInfo.createVirtualRegister(&Mips::CPU16RegsRegClass); // // li reg1, constant // move reg2, sp @@ -287,9 +285,9 @@ void Mips16InstrInfo::adjustStackPtrBig(unsigned SP, int64_t Amount, MIB4.addReg(Reg1, RegState::Kill); } -void Mips16InstrInfo::adjustStackPtrBigUnrestricted(unsigned SP, int64_t Amount, - MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) const { +void Mips16InstrInfo::adjustStackPtrBigUnrestricted( + unsigned SP, int64_t Amount, MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const { assert(false && "adjust stack pointer amount exceeded"); } @@ -305,11 +303,10 @@ void Mips16InstrInfo::adjustStackPtr(unsigned SP, int64_t Amount, /// This function generates the sequence of instructions needed to get the /// result of adding register REG and immediate IMM. -unsigned -Mips16InstrInfo::loadImmediate(unsigned FrameReg, - int64_t Imm, MachineBasicBlock &MBB, - MachineBasicBlock::iterator II, DebugLoc DL, - unsigned &NewImm) const { +unsigned Mips16InstrInfo::loadImmediate(unsigned FrameReg, int64_t Imm, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator II, + DebugLoc DL, unsigned &NewImm) const { // // given original instruction is: // Instr rx, T[offset] where offset is too big. @@ -345,7 +342,7 @@ Mips16InstrInfo::loadImmediate(unsigned FrameReg, !TargetRegisterInfo::isVirtualRegister(MO.getReg())) Candidates.reset(MO.getReg()); } - // + // If the same register was used and defined in an instruction, then // it will not be in the list of candidates. // @@ -354,7 +351,6 @@ Mips16InstrInfo::loadImmediate(unsigned FrameReg, // present as an operand of the instruction. this tells // whether the register is live before the instruction. if it's not // then we don't need to save it in case there are no free registers. - // int DefReg = 0; for (unsigned i = 0, e = II->getNumOperands(); i != e; ++i) { MachineOperand &MO = II->getOperand(i); @@ -363,9 +359,8 @@ Mips16InstrInfo::loadImmediate(unsigned FrameReg, break; } } - // - BitVector Available = rs.getRegsAvailable(&Mips::CPU16RegsRegClass); + BitVector Available = rs.getRegsAvailable(&Mips::CPU16RegsRegClass); Available &= Candidates; // // we use T0 for the first register, if we need to save something away. @@ -374,7 +369,6 @@ Mips16InstrInfo::loadImmediate(unsigned FrameReg, unsigned FirstRegSaved =0, SecondRegSaved=0; unsigned FirstRegSavedTo = 0, SecondRegSavedTo = 0; - Reg = Available.find_first(); if (Reg == -1) { @@ -442,7 +436,6 @@ void Mips16InstrInfo::ExpandRetRA16(MachineBasicBlock &MBB, BuildMI(MBB, I, I->getDebugLoc(), get(Opc)); } - const MCInstrDesc &Mips16InstrInfo::AddiuSpImm(int64_t Imm) const { if (validSpImm8(Imm)) return get(Mips::AddiuSpImm16); @@ -456,8 +449,8 @@ void Mips16InstrInfo::BuildAddiuSpImm BuildMI(MBB, I, DL, AddiuSpImm(Imm)).addImm(Imm); } -const MipsInstrInfo *llvm::createMips16InstrInfo(MipsTargetMachine &TM) { - return new Mips16InstrInfo(TM); +const MipsInstrInfo *llvm::createMips16InstrInfo(const MipsSubtarget &STI) { + return new Mips16InstrInfo(STI); } bool Mips16InstrInfo::validImmediate(unsigned Opcode, unsigned Reg, @@ -497,7 +490,6 @@ bool Mips16InstrInfo::validImmediate(unsigned Opcode, unsigned Reg, unsigned Mips16InstrInfo::getInlineAsmLength(const char *Str, const MCAsmInfo &MAI) const { - // Count the number of instructions in the asm. 
bool atInsnStart = true; unsigned Length = 0; diff --git a/lib/Target/Mips/Mips16InstrInfo.h b/lib/Target/Mips/Mips16InstrInfo.h index 0dc00461ccfc..a004c567e6c9 100644 --- a/lib/Target/Mips/Mips16InstrInfo.h +++ b/lib/Target/Mips/Mips16InstrInfo.h @@ -23,7 +23,7 @@ class Mips16InstrInfo : public MipsInstrInfo { const Mips16RegisterInfo RI; public: - explicit Mips16InstrInfo(MipsTargetMachine &TM); + explicit Mips16InstrInfo(const MipsSubtarget &STI); const MipsRegisterInfo &getRegisterInfo() const override; diff --git a/lib/Target/Mips/Mips16InstrInfo.td b/lib/Target/Mips/Mips16InstrInfo.td index 11166c45a880..5e4eebb62c12 100644 --- a/lib/Target/Mips/Mips16InstrInfo.td +++ b/lib/Target/Mips/Mips16InstrInfo.td @@ -1370,9 +1370,11 @@ def : Mips16Pat<(MipsJmpLink (i32 texternalsym:$dst)), (Jal16 texternalsym:$dst)>; // Indirect branch -def: Mips16Pat< - (brind CPU16Regs:$rs), - (JrcRx16 CPU16Regs:$rs)>; +def: Mips16Pat<(brind CPU16Regs:$rs), (JrcRx16 CPU16Regs:$rs)> { + // Ensure that the addition of MIPS32r6/MIPS64r6 support does not change + // MIPS16's behaviour. + let AddedComplexity = 1; +} // Jump and Link (Call) let isCall=1, hasDelaySlot=0 in diff --git a/lib/Target/Mips/Mips32r6InstrFormats.td b/lib/Target/Mips/Mips32r6InstrFormats.td index 051db7525f6d..e4ec96a92f5b 100644 --- a/lib/Target/Mips/Mips32r6InstrFormats.td +++ b/lib/Target/Mips/Mips32r6InstrFormats.td @@ -95,6 +95,9 @@ def OPCODE6_CLO : OPCODE6<0b010001>; def OPCODE6_CLZ : OPCODE6<0b010000>; def OPCODE6_DCLO : OPCODE6<0b010011>; def OPCODE6_DCLZ : OPCODE6<0b010010>; +def OPCODE6_LSA : OPCODE6<0b000101>; +def OPCODE6_DLSA : OPCODE6<0b010101>; +def OPCODE6_SDBBP : OPCODE6<0b001110>; class FIELD_FMT Val> { bits<5> Value = Val; @@ -105,22 +108,23 @@ def FIELD_FMT_D : FIELD_FMT<0b10001>; class FIELD_CMP_COND Val> { bits<5> Value = Val; } -def FIELD_CMP_COND_F : FIELD_CMP_COND<0b00000>; +// Note: The CMP_COND_FMT names differ from the C_COND_FMT names. 
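The renaming that follows tracks the MIPS32r6 cmp.cond.fmt encoding table. Below is a hedged C++ summary of the condition values being defined; the enumerator names mirror the FIELD_CMP_COND_* defs:

// Bit 3 (0b01000) selects the signaling variant of the same predicate,
// so every S* condition equals its quiet counterpart plus 8.
enum Mips32r6CmpCond : unsigned {
  AF = 0, UN, EQ, UEQ, LT, ULT, LE, ULE,        // 0b00000 .. 0b00111
  SAF = 8, SUN, SEQ, SUEQ, SLT, SULT, SLE, SULE // 0b01000 .. 0b01111
};
static_assert(SAF == AF + 8 && SULE == ULE + 8, "signaling bit is bit 3");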
+def FIELD_CMP_COND_AF : FIELD_CMP_COND<0b00000>; def FIELD_CMP_COND_UN : FIELD_CMP_COND<0b00001>; def FIELD_CMP_COND_EQ : FIELD_CMP_COND<0b00010>; def FIELD_CMP_COND_UEQ : FIELD_CMP_COND<0b00011>; -def FIELD_CMP_COND_OLT : FIELD_CMP_COND<0b00100>; +def FIELD_CMP_COND_LT : FIELD_CMP_COND<0b00100>; def FIELD_CMP_COND_ULT : FIELD_CMP_COND<0b00101>; -def FIELD_CMP_COND_OLE : FIELD_CMP_COND<0b00110>; +def FIELD_CMP_COND_LE : FIELD_CMP_COND<0b00110>; def FIELD_CMP_COND_ULE : FIELD_CMP_COND<0b00111>; -def FIELD_CMP_COND_SF : FIELD_CMP_COND<0b01000>; -def FIELD_CMP_COND_NGLE : FIELD_CMP_COND<0b01001>; +def FIELD_CMP_COND_SAF : FIELD_CMP_COND<0b01000>; +def FIELD_CMP_COND_SUN : FIELD_CMP_COND<0b01001>; def FIELD_CMP_COND_SEQ : FIELD_CMP_COND<0b01010>; -def FIELD_CMP_COND_NGL : FIELD_CMP_COND<0b01011>; -def FIELD_CMP_COND_LT : FIELD_CMP_COND<0b01100>; -def FIELD_CMP_COND_NGE : FIELD_CMP_COND<0b01101>; -def FIELD_CMP_COND_LE : FIELD_CMP_COND<0b01110>; -def FIELD_CMP_COND_NGT : FIELD_CMP_COND<0b01111>; +def FIELD_CMP_COND_SUEQ : FIELD_CMP_COND<0b01011>; +def FIELD_CMP_COND_SLT : FIELD_CMP_COND<0b01100>; +def FIELD_CMP_COND_SULT : FIELD_CMP_COND<0b01101>; +def FIELD_CMP_COND_SLE : FIELD_CMP_COND<0b01110>; +def FIELD_CMP_COND_SULE : FIELD_CMP_COND<0b01111>; class FIELD_CMP_FORMAT Val> { bits<5> Value = Val; @@ -326,6 +330,16 @@ class SPECIAL_3R_FM mulop, bits<6> funct> : MipsR6Inst { let Inst{5-0} = funct; } +class SPECIAL_SDBBP_FM : MipsR6Inst { + bits<20> code_; + + bits<32> Inst; + + let Inst{31-26} = OPGROUP_SPECIAL.Value; + let Inst{25-6} = code_; + let Inst{5-0} = OPCODE6_SDBBP.Value; +} + // This class is ambiguous with other branches: // BEQC/BNEC require that rs > rt class CMP_BRANCH_2R_OFF16_FM : MipsR6Inst { @@ -453,6 +467,23 @@ class SPECIAL3_LL_SC_FM : MipsR6Inst { string DecoderMethod = "DecodeSpecial3LlSc"; } +class SPECIAL_LSA_FM : MipsR6Inst { + bits<5> rd; + bits<5> rs; + bits<5> rt; + bits<2> imm2; + + bits<32> Inst; + + let Inst{31-26} = OPGROUP_SPECIAL.Value; + let Inst{25-21} = rs; + let Inst{20-16} = rt; + let Inst{15-11} = rd; + let Inst{10-8} = 0b000; + let Inst{7-6} = imm2; + let Inst{5-0} = Operation.Value; +} + class REGIMM_FM : MipsR6Inst { bits<5> rs; bits<16> imm; diff --git a/lib/Target/Mips/Mips32r6InstrInfo.td b/lib/Target/Mips/Mips32r6InstrInfo.td index 52e12fcf9462..6d6735b3aae6 100644 --- a/lib/Target/Mips/Mips32r6InstrInfo.td +++ b/lib/Target/Mips/Mips32r6InstrInfo.td @@ -16,7 +16,6 @@ include "Mips32r6InstrFormats.td" // Notes about removals/changes from MIPS32r6: // Reencoded: jr -> jalr // Reencoded: jr.hb -> jalr.hb -// Reencoded: sdbbp def brtarget21 : Operand { let EncoderMethod = "getBranchTarget21OpValue"; @@ -160,12 +159,22 @@ class LWC2_R6_ENC : COP2LDST_FM; class SDC2_R6_ENC : COP2LDST_FM; class SWC2_R6_ENC : COP2LDST_FM; +class LSA_R6_ENC : SPECIAL_LSA_FM; + class LL_R6_ENC : SPECIAL3_LL_SC_FM; class SC_R6_ENC : SPECIAL3_LL_SC_FM; class CLO_R6_ENC : SPECIAL_2R_FM; class CLZ_R6_ENC : SPECIAL_2R_FM; +class SDBBP_R6_ENC : SPECIAL_SDBBP_FM; + +//===----------------------------------------------------------------------===// +// +// Instruction Multiclasses +// +//===----------------------------------------------------------------------===// + class CMP_CONDN_DESC_BASE { @@ -175,16 +184,10 @@ class CMP_CONDN_DESC_BASE Pattern = [(set FGRCCOpnd:$fd, (Op FGROpnd:$fs, FGROpnd:$ft))]; } -//===----------------------------------------------------------------------===// -// -// Instruction Multiclasses -// 
-//===----------------------------------------------------------------------===// - multiclass CMP_CC_M { - def CMP_F_#NAME : COP1_CMP_CONDN_FM, - CMP_CONDN_DESC_BASE<"f", Typestr, FGROpnd>, + def CMP_F_#NAME : COP1_CMP_CONDN_FM, + CMP_CONDN_DESC_BASE<"af", Typestr, FGROpnd>, ISA_MIPS32R6; def CMP_UN_#NAME : COP1_CMP_CONDN_FM, CMP_CONDN_DESC_BASE<"un", Typestr, FGROpnd, setuo>, @@ -195,42 +198,42 @@ multiclass CMP_CC_M , CMP_CONDN_DESC_BASE<"ueq", Typestr, FGROpnd, setueq>, ISA_MIPS32R6; - def CMP_OLT_#NAME : COP1_CMP_CONDN_FM, - CMP_CONDN_DESC_BASE<"olt", Typestr, FGROpnd, setolt>, - ISA_MIPS32R6; + def CMP_LT_#NAME : COP1_CMP_CONDN_FM, + CMP_CONDN_DESC_BASE<"lt", Typestr, FGROpnd, setolt>, + ISA_MIPS32R6; def CMP_ULT_#NAME : COP1_CMP_CONDN_FM, CMP_CONDN_DESC_BASE<"ult", Typestr, FGROpnd, setult>, ISA_MIPS32R6; - def CMP_OLE_#NAME : COP1_CMP_CONDN_FM, - CMP_CONDN_DESC_BASE<"ole", Typestr, FGROpnd, setole>, - ISA_MIPS32R6; + def CMP_LE_#NAME : COP1_CMP_CONDN_FM, + CMP_CONDN_DESC_BASE<"le", Typestr, FGROpnd, setole>, + ISA_MIPS32R6; def CMP_ULE_#NAME : COP1_CMP_CONDN_FM, CMP_CONDN_DESC_BASE<"ule", Typestr, FGROpnd, setule>, ISA_MIPS32R6; - def CMP_SF_#NAME : COP1_CMP_CONDN_FM, - CMP_CONDN_DESC_BASE<"sf", Typestr, FGROpnd>, - ISA_MIPS32R6; - def CMP_NGLE_#NAME : COP1_CMP_CONDN_FM, - CMP_CONDN_DESC_BASE<"ngle", Typestr, FGROpnd>, - ISA_MIPS32R6; + def CMP_SAF_#NAME : COP1_CMP_CONDN_FM, + CMP_CONDN_DESC_BASE<"saf", Typestr, FGROpnd>, + ISA_MIPS32R6; + def CMP_SUN_#NAME : COP1_CMP_CONDN_FM, + CMP_CONDN_DESC_BASE<"sun", Typestr, FGROpnd>, + ISA_MIPS32R6; def CMP_SEQ_#NAME : COP1_CMP_CONDN_FM, CMP_CONDN_DESC_BASE<"seq", Typestr, FGROpnd>, ISA_MIPS32R6; - def CMP_NGL_#NAME : COP1_CMP_CONDN_FM, - CMP_CONDN_DESC_BASE<"ngl", Typestr, FGROpnd>, - ISA_MIPS32R6; - def CMP_LT_#NAME : COP1_CMP_CONDN_FM, - CMP_CONDN_DESC_BASE<"lt", Typestr, FGROpnd>, - ISA_MIPS32R6; - def CMP_NGE_#NAME : COP1_CMP_CONDN_FM, - CMP_CONDN_DESC_BASE<"nge", Typestr, FGROpnd>, + def CMP_SUEQ_#NAME : COP1_CMP_CONDN_FM, + CMP_CONDN_DESC_BASE<"sueq", Typestr, FGROpnd>, + ISA_MIPS32R6; + def CMP_SLT_#NAME : COP1_CMP_CONDN_FM, + CMP_CONDN_DESC_BASE<"slt", Typestr, FGROpnd>, ISA_MIPS32R6; - def CMP_LE_#NAME : COP1_CMP_CONDN_FM, - CMP_CONDN_DESC_BASE<"le", Typestr, FGROpnd>, - ISA_MIPS32R6; - def CMP_NGT_#NAME : COP1_CMP_CONDN_FM, - CMP_CONDN_DESC_BASE<"ngt", Typestr, FGROpnd>, + def CMP_SULT_#NAME : COP1_CMP_CONDN_FM, + CMP_CONDN_DESC_BASE<"sult", Typestr, FGROpnd>, + ISA_MIPS32R6; + def CMP_SLE_#NAME : COP1_CMP_CONDN_FM, + CMP_CONDN_DESC_BASE<"sle", Typestr, FGROpnd>, ISA_MIPS32R6; + def CMP_SULE_#NAME : COP1_CMP_CONDN_FM, + CMP_CONDN_DESC_BASE<"sule", Typestr, FGROpnd>, + ISA_MIPS32R6; } //===----------------------------------------------------------------------===// @@ -574,6 +577,16 @@ class COP2ST_DESC_BASE { class SDC2_R6_DESC : COP2ST_DESC_BASE<"sdc2", COP2Opnd>; class SWC2_R6_DESC : COP2ST_DESC_BASE<"swc2", COP2Opnd>; +class LSA_R6_DESC_BASE { + dag OutOperandList = (outs GPROpnd:$rd); + dag InOperandList = (ins GPROpnd:$rs, GPROpnd:$rt, ImmOpnd:$imm2); + string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt, $imm2"); + list Pattern = []; +} + +class LSA_R6_DESC : LSA_R6_DESC_BASE<"lsa", GPR32Opnd, uimm2>; + class LL_R6_DESC_BASE { dag OutOperandList = (outs GPROpnd:$rt); dag InOperandList = (ins mem_simm9:$addr); @@ -614,6 +627,13 @@ class CLZ_R6_DESC_BASE : class CLO_R6_DESC : CLO_R6_DESC_BASE<"clo", GPR32Opnd>; class CLZ_R6_DESC : CLZ_R6_DESC_BASE<"clz", GPR32Opnd>; +class SDBBP_R6_DESC { + dag OutOperandList = (outs); + 
dag InOperandList = (ins uimm20:$code_); + string AsmString = "sdbbp\t$code_"; + list Pattern = []; +} + //===----------------------------------------------------------------------===// // // Instruction Definitions @@ -667,7 +687,7 @@ def JIC : JIC_ENC, JIC_DESC, ISA_MIPS32R6; def JR_HB_R6 : JR_HB_R6_ENC, JR_HB_R6_DESC, ISA_MIPS32R6; def LDC2_R6 : LDC2_R6_ENC, LDC2_R6_DESC, ISA_MIPS32R6; def LL_R6 : LL_R6_ENC, LL_R6_DESC, ISA_MIPS32R6; -// def LSA; // See MSA +def LSA_R6 : LSA_R6_ENC, LSA_R6_DESC, ISA_MIPS32R6; def LWC2_R6 : LWC2_R6_ENC, LWC2_R6_DESC, ISA_MIPS32R6; def LWPC : LWPC_ENC, LWPC_DESC, ISA_MIPS32R6; def LWUPC : LWUPC_ENC, LWUPC_DESC, ISA_MIPS32R6; @@ -694,6 +714,7 @@ def PREF_R6 : PREF_ENC, PREF_DESC, ISA_MIPS32R6; def RINT_D : RINT_D_ENC, RINT_D_DESC, ISA_MIPS32R6; def RINT_S : RINT_S_ENC, RINT_S_DESC, ISA_MIPS32R6; def SC_R6 : SC_R6_ENC, SC_R6_DESC, ISA_MIPS32R6; +def SDBBP_R6 : SDBBP_R6_ENC, SDBBP_R6_DESC, ISA_MIPS32R6; def SDC2_R6 : SDC2_R6_ENC, SDC2_R6_DESC, ISA_MIPS32R6; def SELEQZ : SELEQZ_ENC, SELEQZ_DESC, ISA_MIPS32R6, GPR_32; def SELEQZ_D : SELEQZ_D_ENC, SELEQZ_D_DESC, ISA_MIPS32R6; @@ -705,6 +726,15 @@ def SEL_D : SEL_D_ENC, SEL_D_DESC, ISA_MIPS32R6; def SEL_S : SEL_S_ENC, SEL_S_DESC, ISA_MIPS32R6; def SWC2_R6 : SWC2_R6_ENC, SWC2_R6_DESC, ISA_MIPS32R6; +//===----------------------------------------------------------------------===// +// +// Instruction Aliases +// +//===----------------------------------------------------------------------===// + +def : MipsInstAlias<"sdbbp", (SDBBP_R6 0)>, ISA_MIPS32R6; +def : MipsInstAlias<"jr $rs", (JALR ZERO, GPR32Opnd:$rs), 1>, ISA_MIPS32R6; + //===----------------------------------------------------------------------===// // // Patterns and Pseudo Instructions @@ -724,9 +754,9 @@ def : MipsPat<(setgt f32:$lhs, f32:$rhs), (CMP_LE_S f32:$rhs, f32:$lhs)>, ISA_MIPS32R6; def : MipsPat<(setge f32:$lhs, f32:$rhs), (CMP_LT_S f32:$rhs, f32:$lhs)>, ISA_MIPS32R6; -def : MipsPat<(setlt f32:$lhs, f32:$rhs), (CMP_OLT_S f32:$lhs, f32:$rhs)>, +def : MipsPat<(setlt f32:$lhs, f32:$rhs), (CMP_LT_S f32:$lhs, f32:$rhs)>, ISA_MIPS32R6; -def : MipsPat<(setlt f32:$lhs, f32:$rhs), (CMP_OLE_S f32:$lhs, f32:$rhs)>, +def : MipsPat<(setlt f32:$lhs, f32:$rhs), (CMP_LE_S f32:$lhs, f32:$rhs)>, ISA_MIPS32R6; def : MipsPat<(setne f32:$lhs, f32:$rhs), (NOR (CMP_EQ_S f32:$lhs, f32:$rhs), ZERO)>, ISA_MIPS32R6; @@ -744,9 +774,9 @@ def : MipsPat<(setgt f64:$lhs, f64:$rhs), (CMP_LE_D f64:$rhs, f64:$lhs)>, ISA_MIPS32R6; def : MipsPat<(setge f64:$lhs, f64:$rhs), (CMP_LT_D f64:$rhs, f64:$lhs)>, ISA_MIPS32R6; -def : MipsPat<(setlt f64:$lhs, f64:$rhs), (CMP_OLT_D f64:$lhs, f64:$rhs)>, +def : MipsPat<(setlt f64:$lhs, f64:$rhs), (CMP_LT_D f64:$lhs, f64:$rhs)>, ISA_MIPS32R6; -def : MipsPat<(setlt f64:$lhs, f64:$rhs), (CMP_OLE_D f64:$lhs, f64:$rhs)>, +def : MipsPat<(setlt f64:$lhs, f64:$rhs), (CMP_LE_D f64:$lhs, f64:$rhs)>, ISA_MIPS32R6; def : MipsPat<(setne f64:$lhs, f64:$rhs), (NOR (CMP_EQ_D f64:$lhs, f64:$rhs), ZERO)>, ISA_MIPS32R6; @@ -756,28 +786,28 @@ def : MipsPat<(select i32:$cond, i32:$t, i32:$f), (OR (SELNEZ i32:$t, i32:$cond), (SELEQZ i32:$f, i32:$cond))>, ISA_MIPS32R6; def : MipsPat<(select (i32 (seteq i32:$cond, immz)), i32:$t, i32:$f), - (OR (SELNEZ i32:$t, i32:$cond), (SELEQZ i32:$f, i32:$cond))>, + (OR (SELEQZ i32:$t, i32:$cond), (SELNEZ i32:$f, i32:$cond))>, ISA_MIPS32R6; def : MipsPat<(select (i32 (setne i32:$cond, immz)), i32:$t, i32:$f), - (OR (SELNEZ i32:$f, i32:$cond), (SELEQZ i32:$t, i32:$cond))>, + (OR (SELNEZ i32:$t, i32:$cond), (SELEQZ i32:$f, 
                  i32:$cond))>,
               ISA_MIPS32R6;
 def : MipsPat<(select (i32 (seteq i32:$cond, immZExt16:$imm)), i32:$t, i32:$f),
-              (OR (SELNEZ i32:$t, (XORi i32:$cond, immZExt16:$imm)),
-                  (SELEQZ i32:$f, (XORi i32:$cond, immZExt16:$imm)))>,
+              (OR (SELEQZ i32:$t, (XORi i32:$cond, immZExt16:$imm)),
+                  (SELNEZ i32:$f, (XORi i32:$cond, immZExt16:$imm)))>,
               ISA_MIPS32R6;
 def : MipsPat<(select (i32 (setne i32:$cond, immZExt16:$imm)), i32:$t, i32:$f),
-              (OR (SELNEZ i32:$f, (XORi i32:$cond, immZExt16:$imm)),
-                  (SELEQZ i32:$t, (XORi i32:$cond, immZExt16:$imm)))>,
+              (OR (SELNEZ i32:$t, (XORi i32:$cond, immZExt16:$imm)),
+                  (SELEQZ i32:$f, (XORi i32:$cond, immZExt16:$imm)))>,
               ISA_MIPS32R6;
 def : MipsPat<(select (i32 (setgt i32:$cond, immSExt16Plus1:$imm)), i32:$t,
                i32:$f),
-              (OR (SELNEZ i32:$t, (SLTi i32:$cond, (Plus1 imm:$imm))),
-                  (SELEQZ i32:$f, (SLTi i32:$cond, (Plus1 imm:$imm))))>,
+              (OR (SELEQZ i32:$t, (SLTi i32:$cond, (Plus1 imm:$imm))),
+                  (SELNEZ i32:$f, (SLTi i32:$cond, (Plus1 imm:$imm))))>,
               ISA_MIPS32R6;
 def : MipsPat<(select (i32 (setugt i32:$cond, immSExt16Plus1:$imm)), i32:$t,
                i32:$f),
-              (OR (SELNEZ i32:$t, (SLTiu i32:$cond, (Plus1 imm:$imm))),
-                  (SELEQZ i32:$f, (SLTiu i32:$cond, (Plus1 imm:$imm))))>,
+              (OR (SELEQZ i32:$t, (SLTiu i32:$cond, (Plus1 imm:$imm))),
+                  (SELNEZ i32:$f, (SLTiu i32:$cond, (Plus1 imm:$imm))))>,
               ISA_MIPS32R6;
 
 def : MipsPat<(select i32:$cond, i32:$t, immz),
diff --git a/lib/Target/Mips/Mips64InstrInfo.td b/lib/Target/Mips/Mips64InstrInfo.td
index 88422ce19dea..f0b6814e37cc 100644
--- a/lib/Target/Mips/Mips64InstrInfo.td
+++ b/lib/Target/Mips/Mips64InstrInfo.td
@@ -23,6 +23,8 @@ def uimm16_64 : Operand<i64> {
 // Signed Operand
 def simm10_64 : Operand<i64>;
 
+def imm64: Operand<i64>;
+
 // Transformation Function - get Imm - 32.
 def Subtract32 : SDNodeXForm<imm, [{
   return getImm(N, (unsigned)N->getZExtValue() - 32);
@@ -172,18 +174,21 @@ def SCD : SCBase<"scd", GPR64Opnd>, LW_FM<0x3c>, ISA_MIPS3_NOT_32R6_64R6;
 
 /// Jump and Branch Instructions
 let isCodeGenOnly = 1 in {
-def JR64 : IndirectBranch<"jr", GPR64Opnd>, MTLO_FM<8>;
-def BEQ64 : CBranch<"beq", brtarget, seteq, GPR64Opnd>, BEQ_FM<4>;
-def BNE64 : CBranch<"bne", brtarget, setne, GPR64Opnd>, BEQ_FM<5>;
-def BGEZ64 : CBranchZero<"bgez", brtarget, setge, GPR64Opnd>, BGEZ_FM<1, 1>;
-def BGTZ64 : CBranchZero<"bgtz", brtarget, setgt, GPR64Opnd>, BGEZ_FM<7, 0>;
-def BLEZ64 : CBranchZero<"blez", brtarget, setle, GPR64Opnd>, BGEZ_FM<6, 0>;
-def BLTZ64 : CBranchZero<"bltz", brtarget, setlt, GPR64Opnd>, BGEZ_FM<1, 0>;
-def JALR64 : JumpLinkReg<"jalr", GPR64Opnd>, JALR_FM;
-def JALR64Pseudo : JumpLinkRegPseudo<GPR64Opnd, JALR, RA, GPR32Opnd>;
-def TAILCALL64_R : TailCallReg<GPR64Opnd, JR, GPR32Opnd>;
+  def JR64 : IndirectBranch<"jr", GPR64Opnd>, MTLO_FM<8>;
+  def BEQ64 : CBranch<"beq", brtarget, seteq, GPR64Opnd>, BEQ_FM<4>;
+  def BNE64 : CBranch<"bne", brtarget, setne, GPR64Opnd>, BEQ_FM<5>;
+  def BGEZ64 : CBranchZero<"bgez", brtarget, setge, GPR64Opnd>, BGEZ_FM<1, 1>;
+  def BGTZ64 : CBranchZero<"bgtz", brtarget, setgt, GPR64Opnd>, BGEZ_FM<7, 0>;
+  def BLEZ64 : CBranchZero<"blez", brtarget, setle, GPR64Opnd>, BGEZ_FM<6, 0>;
+  def BLTZ64 : CBranchZero<"bltz", brtarget, setlt, GPR64Opnd>, BGEZ_FM<1, 0>;
+  def JALR64 : JumpLinkReg<"jalr", GPR64Opnd>, JALR_FM;
+  def JALR64Pseudo : JumpLinkRegPseudo<GPR64Opnd, JALR, RA, GPR32Opnd>;
+  def TAILCALL64_R : TailCallReg<GPR64Opnd, JR, GPR32Opnd>;
 }
 
+def PseudoReturn64 : PseudoReturnBase<GPR64Opnd>;
+def PseudoIndirectBranch64 : PseudoIndirectBranchBase<GPR64Opnd>;
+
 /// Multiply and Divide Instructions.
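The SELEQZ/SELNEZ swaps above are the substantive fix in these patterns: the leg carrying the true value must be the one whose condition actually holds. A framework-free C++ sketch of the branchless select the patterns expand to:

#include <cassert>
#include <cstdint>

// selnez d, t, c  ->  d = (c != 0) ? t : 0
// seleqz d, f, c  ->  d = (c == 0) ? f : 0
static uint32_t selnez(uint32_t t, uint32_t c) { return c != 0 ? t : 0; }
static uint32_t seleqz(uint32_t f, uint32_t c) { return c == 0 ? f : 0; }

// OR-ing the two legs yields (c ? t : f) with no branch.
static uint32_t select32(uint32_t c, uint32_t t, uint32_t f) {
  return selnez(t, c) | seleqz(f, c);
}

int main() {
  assert(select32(1, 0xAA, 0xBB) == 0xAA); // cond != 0 -> true value
  assert(select32(0, 0xAA, 0xBB) == 0xBB); // cond == 0 -> false value
  // The bug fixed above: for (select (seteq cond, 0), t, f) the 't' leg must
  // use SELEQZ (keep t when cond is zero), paired with SELNEZ for 'f'.
  return 0;
}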
def DMULT : Mult<"dmult", II_DMULT, GPR64Opnd, [HI0_64, LO0_64]>, MULT_FM<0, 0x1c>, ISA_MIPS3_NOT_32R6_64R6; @@ -486,6 +491,11 @@ def : MipsInstAlias<"dsrl $rd, $rt, $rs", (DSRLV GPR64Opnd:$rd, GPR64Opnd:$rt, GPR32Opnd:$rs), 0>, ISA_MIPS3; +class LoadImm64< string instr_asm, Operand Od, RegisterOperand RO> : + MipsAsmPseudoInst<(outs RO:$rt), (ins Od:$imm64), + !strconcat(instr_asm, "\t$rt, $imm64")> ; +def LoadImm64Reg : LoadImm64<"dli", imm64, GPR64Opnd>; + /// Move between CPU and coprocessor registers let DecoderNamespace = "Mips64", Predicates = [HasMips64] in { def DMFC0 : MFC3OP<"dmfc0", GPR64Opnd>, MFC3OP_FM<0x10, 1>; diff --git a/lib/Target/Mips/Mips64r6InstrInfo.td b/lib/Target/Mips/Mips64r6InstrInfo.td index dbfb9fbbf9bc..6b546e864bd3 100644 --- a/lib/Target/Mips/Mips64r6InstrInfo.td +++ b/lib/Target/Mips/Mips64r6InstrInfo.td @@ -29,12 +29,13 @@ class DCLO_R6_ENC : SPECIAL_2R_FM; class DCLZ_R6_ENC : SPECIAL_2R_FM; class DDIV_ENC : SPECIAL_3R_FM<0b00010, 0b011110>; class DDIVU_ENC : SPECIAL_3R_FM<0b00010, 0b011111>; +class DLSA_R6_ENC : SPECIAL_LSA_FM; class DMOD_ENC : SPECIAL_3R_FM<0b00011, 0b011110>; class DMODU_ENC : SPECIAL_3R_FM<0b00011, 0b011111>; -class DMUH_ENC : SPECIAL_3R_FM<0b00011, 0b111000>; -class DMUHU_ENC : SPECIAL_3R_FM<0b00011, 0b111001>; -class DMUL_R6_ENC : SPECIAL_3R_FM<0b00010, 0b111000>; -class DMULU_ENC : SPECIAL_3R_FM<0b00010, 0b111001>; +class DMUH_ENC : SPECIAL_3R_FM<0b00011, 0b011100>; +class DMUHU_ENC : SPECIAL_3R_FM<0b00011, 0b011101>; +class DMUL_R6_ENC : SPECIAL_3R_FM<0b00010, 0b011100>; +class DMULU_ENC : SPECIAL_3R_FM<0b00010, 0b011101>; class LDPC_ENC : PCREL18_FM; class LLD_R6_ENC : SPECIAL3_LL_SC_FM; class SCD_R6_ENC : SPECIAL3_LL_SC_FM; @@ -61,6 +62,7 @@ class DCLO_R6_DESC : CLO_R6_DESC_BASE<"dclo", GPR64Opnd>; class DCLZ_R6_DESC : CLZ_R6_DESC_BASE<"dclz", GPR64Opnd>; class DDIV_DESC : DIVMOD_DESC_BASE<"ddiv", GPR64Opnd, sdiv>; class DDIVU_DESC : DIVMOD_DESC_BASE<"ddivu", GPR64Opnd, udiv>; +class DLSA_R6_DESC : LSA_R6_DESC_BASE<"dlsa", GPR64Opnd, uimm2>; class DMOD_DESC : DIVMOD_DESC_BASE<"dmod", GPR64Opnd, srem>; class DMODU_DESC : DIVMOD_DESC_BASE<"dmodu", GPR64Opnd, urem>; class DMUH_DESC : MUL_R6_DESC_BASE<"dmuh", GPR64Opnd, mulhs>; @@ -88,7 +90,7 @@ def DCLO_R6 : DCLO_R6_ENC, DCLO_R6_DESC, ISA_MIPS64R6; def DCLZ_R6 : DCLZ_R6_ENC, DCLZ_R6_DESC, ISA_MIPS64R6; def DDIV : DDIV_ENC, DDIV_DESC, ISA_MIPS64R6; def DDIVU : DDIVU_ENC, DDIVU_DESC, ISA_MIPS64R6; -// def DLSA; // See MSA +def DLSA_R6 : DLSA_R6_ENC, DLSA_R6_DESC, ISA_MIPS64R6; def DMOD : DMOD_ENC, DMOD_DESC, ISA_MIPS64R6; def DMODU : DMODU_ENC, DMODU_DESC, ISA_MIPS64R6; def DMUH: DMUH_ENC, DMUH_DESC, ISA_MIPS64R6; @@ -103,6 +105,14 @@ let DecoderNamespace = "Mips32r6_64r6_GP64" in { def SELNEZ64 : SELNEZ_ENC, SELNEZ64_DESC, ISA_MIPS32R6, GPR_64; } +//===----------------------------------------------------------------------===// +// +// Instruction Aliases +// +//===----------------------------------------------------------------------===// + +def : MipsInstAlias<"jr $rs", (JALR64 ZERO_64, GPR64Opnd:$rs), 1>, ISA_MIPS64R6; + //===----------------------------------------------------------------------===// // // Patterns and Pseudo Instructions @@ -115,36 +125,36 @@ def : MipsPat<(select i64:$cond, i64:$t, i64:$f), (SELEQZ64 i64:$f, i64:$cond))>, ISA_MIPS64R6; def : MipsPat<(select (i32 (seteq i64:$cond, immz)), i64:$t, i64:$f), - (OR64 (SELNEZ64 i64:$t, i64:$cond), - (SELEQZ64 i64:$f, i64:$cond))>, + (OR64 (SELEQZ64 i64:$t, i64:$cond), + (SELNEZ64 i64:$f, i64:$cond))>, ISA_MIPS64R6; def 
: MipsPat<(select (i32 (setne i64:$cond, immz)), i64:$t, i64:$f), - (OR64 (SELNEZ64 i64:$f, i64:$cond), - (SELEQZ64 i64:$t, i64:$cond))>, + (OR64 (SELNEZ64 i64:$t, i64:$cond), + (SELEQZ64 i64:$f, i64:$cond))>, ISA_MIPS64R6; def : MipsPat<(select (i32 (seteq i64:$cond, immZExt16_64:$imm)), i64:$t, i64:$f), - (OR64 (SELNEZ64 i64:$t, (XORi64 i64:$cond, immZExt16_64:$imm)), - (SELEQZ64 i64:$f, (XORi64 i64:$cond, immZExt16_64:$imm)))>, + (OR64 (SELEQZ64 i64:$t, (XORi64 i64:$cond, immZExt16_64:$imm)), + (SELNEZ64 i64:$f, (XORi64 i64:$cond, immZExt16_64:$imm)))>, ISA_MIPS64R6; def : MipsPat<(select (i32 (setne i64:$cond, immZExt16_64:$imm)), i64:$t, i64:$f), - (OR64 (SELNEZ64 i64:$f, (XORi64 i64:$cond, immZExt16_64:$imm)), - (SELEQZ64 i64:$t, (XORi64 i64:$cond, immZExt16_64:$imm)))>, + (OR64 (SELNEZ64 i64:$t, (XORi64 i64:$cond, immZExt16_64:$imm)), + (SELEQZ64 i64:$f, (XORi64 i64:$cond, immZExt16_64:$imm)))>, ISA_MIPS64R6; def : MipsPat< (select (i32 (setgt i64:$cond, immSExt16Plus1:$imm)), i64:$t, i64:$f), - (OR64 (SELNEZ64 i64:$t, + (OR64 (SELEQZ64 i64:$t, (SUBREG_TO_REG (i64 0), (SLTi64 i64:$cond, (Plus1 imm:$imm)), sub_32)), - (SELEQZ64 i64:$f, + (SELNEZ64 i64:$f, (SUBREG_TO_REG (i64 0), (SLTi64 i64:$cond, (Plus1 imm:$imm)), sub_32)))>, ISA_MIPS64R6; def : MipsPat< (select (i32 (setugt i64:$cond, immSExt16Plus1:$imm)), i64:$t, i64:$f), - (OR64 (SELNEZ64 i64:$t, + (OR64 (SELEQZ64 i64:$t, (SUBREG_TO_REG (i64 0), (SLTiu64 i64:$cond, (Plus1 imm:$imm)), sub_32)), - (SELEQZ64 i64:$f, + (SELNEZ64 i64:$f, (SUBREG_TO_REG (i64 0), (SLTiu64 i64:$cond, (Plus1 imm:$imm)), sub_32)))>, ISA_MIPS64R6; @@ -167,23 +177,23 @@ def : MipsPat<(select i32:$cond, i64:$t, i64:$f), (SELEQZ64 i64:$f, (SLL64_32 i32:$cond)))>, ISA_MIPS64R6; def : MipsPat<(select (i32 (seteq i32:$cond, immz)), i64:$t, i64:$f), - (OR64 (SELNEZ64 i64:$t, (SLL64_32 i32:$cond)), - (SELEQZ64 i64:$f, (SLL64_32 i32:$cond)))>, + (OR64 (SELEQZ64 i64:$t, (SLL64_32 i32:$cond)), + (SELNEZ64 i64:$f, (SLL64_32 i32:$cond)))>, ISA_MIPS64R6; def : MipsPat<(select (i32 (setne i32:$cond, immz)), i64:$t, i64:$f), - (OR64 (SELNEZ64 i64:$f, (SLL64_32 i32:$cond)), - (SELEQZ64 i64:$t, (SLL64_32 i32:$cond)))>, + (OR64 (SELNEZ64 i64:$t, (SLL64_32 i32:$cond)), + (SELEQZ64 i64:$f, (SLL64_32 i32:$cond)))>, ISA_MIPS64R6; def : MipsPat<(select (i32 (seteq i32:$cond, immZExt16:$imm)), i64:$t, i64:$f), - (OR64 (SELNEZ64 i64:$t, (SLL64_32 (XORi i32:$cond, + (OR64 (SELEQZ64 i64:$t, (SLL64_32 (XORi i32:$cond, immZExt16:$imm))), - (SELEQZ64 i64:$f, (SLL64_32 (XORi i32:$cond, + (SELNEZ64 i64:$f, (SLL64_32 (XORi i32:$cond, immZExt16:$imm))))>, ISA_MIPS64R6; def : MipsPat<(select (i32 (setne i32:$cond, immZExt16:$imm)), i64:$t, i64:$f), - (OR64 (SELNEZ64 i64:$f, (SLL64_32 (XORi i32:$cond, + (OR64 (SELNEZ64 i64:$t, (SLL64_32 (XORi i32:$cond, immZExt16:$imm))), - (SELEQZ64 i64:$t, (SLL64_32 (XORi i32:$cond, + (SELEQZ64 i64:$f, (SLL64_32 (XORi i32:$cond, immZExt16:$imm))))>, ISA_MIPS64R6; diff --git a/lib/Target/Mips/MipsAsmPrinter.cpp b/lib/Target/Mips/MipsAsmPrinter.cpp index 6df90aa75a11..7f21d68bdd1d 100644 --- a/lib/Target/Mips/MipsAsmPrinter.cpp +++ b/lib/Target/Mips/MipsAsmPrinter.cpp @@ -58,10 +58,12 @@ MipsTargetStreamer &MipsAsmPrinter::getTargetStreamer() { } bool MipsAsmPrinter::runOnMachineFunction(MachineFunction &MF) { + Subtarget = &TM.getSubtarget(); + // Initialize TargetLoweringObjectFile. 
- if (Subtarget->allowMixed16_32()) - const_cast(getObjFileLowering()) + const_cast(getObjFileLowering()) .Initialize(OutContext, TM); + MipsFI = MF.getInfo(); if (Subtarget->inMips16Mode()) for (std::map< @@ -91,7 +93,46 @@ bool MipsAsmPrinter::lowerOperand(const MachineOperand &MO, MCOperand &MCOp) { #include "MipsGenMCPseudoLowering.inc" +// Lower PseudoReturn/PseudoIndirectBranch/PseudoIndirectBranch64 to JR, JR_MM, +// JALR, or JALR64 as appropriate for the target +void MipsAsmPrinter::emitPseudoIndirectBranch(MCStreamer &OutStreamer, + const MachineInstr *MI) { + bool HasLinkReg = false; + MCInst TmpInst0; + + if (Subtarget->hasMips64r6()) { + // MIPS64r6 should use (JALR64 ZERO_64, $rs) + TmpInst0.setOpcode(Mips::JALR64); + HasLinkReg = true; + } else if (Subtarget->hasMips32r6()) { + // MIPS32r6 should use (JALR ZERO, $rs) + TmpInst0.setOpcode(Mips::JALR); + HasLinkReg = true; + } else if (Subtarget->inMicroMipsMode()) + // microMIPS should use (JR_MM $rs) + TmpInst0.setOpcode(Mips::JR_MM); + else { + // Everything else should use (JR $rs) + TmpInst0.setOpcode(Mips::JR); + } + + MCOperand MCOp; + + if (HasLinkReg) { + unsigned ZeroReg = Subtarget->isGP64bit() ? Mips::ZERO_64 : Mips::ZERO; + TmpInst0.addOperand(MCOperand::CreateReg(ZeroReg)); + } + + lowerOperand(MI->getOperand(0), MCOp); + TmpInst0.addOperand(MCOp); + + EmitToStreamer(OutStreamer, TmpInst0); +} + void MipsAsmPrinter::EmitInstruction(const MachineInstr *MI) { + MipsTargetStreamer &TS = getTargetStreamer(); + TS.setCanHaveModuleDir(false); + if (MI->isDebugValue()) { SmallString<128> Str; raw_svector_ostream OS(Str); @@ -141,6 +182,14 @@ void MipsAsmPrinter::EmitInstruction(const MachineInstr *MI) { if (emitPseudoExpansionLowering(OutStreamer, &*I)) continue; + if (I->getOpcode() == Mips::PseudoReturn || + I->getOpcode() == Mips::PseudoReturn64 || + I->getOpcode() == Mips::PseudoIndirectBranch || + I->getOpcode() == Mips::PseudoIndirectBranch64) { + emitPseudoIndirectBranch(OutStreamer, &*I); + continue; + } + // The inMips16Mode() test is not permanent. // Some instructions are marked as pseudo right now which // would make the test fail for the wrong reason but @@ -625,7 +674,7 @@ void MipsAsmPrinter::EmitStartOfAsmFile(Module &M) { bool IsABICalls = true; if (IsABICalls) { getTargetStreamer().emitDirectiveAbiCalls(); - Reloc::Model RM = Subtarget->getRelocationModel(); + Reloc::Model RM = TM.getRelocationModel(); // FIXME: This condition should be a lot more complicated that it is here. // Ideally it should test for properties of the ABI and not the ABI // itself. @@ -657,6 +706,23 @@ void MipsAsmPrinter::EmitStartOfAsmFile(Module &M) { OutContext.getELFSection(".gcc_compiled_long64", ELF::SHT_PROGBITS, 0, SectionKind::getDataRel())); } + + getTargetStreamer().updateABIInfo(*Subtarget); + + // We should always emit a '.module fp=...' but binutils 2.24 does not accept + // it. We therefore emit it when it contradicts the ABI defaults (-mfpxx or + // -mfp64) and omit it otherwise. + if (Subtarget->isABI_O32() && (Subtarget->isABI_FPXX() || + Subtarget->isFP64bit())) + getTargetStreamer().emitDirectiveModuleFP(); + + // We should always emit a '.module [no]oddspreg' but binutils 2.24 does not + // accept it. We therefore emit it when it contradicts the default or an + // option has changed the default (i.e. FPXX) and omit it otherwise. 
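Read together, the fp= case above and the oddspreg case below follow one policy: emit a '.module' directive only when it would contradict what binutils 2.24 assumes by default. A sketch of both decisions (function and parameter names are illustrative, not LLVM API):

bool shouldEmitModuleFP(bool IsO32, bool IsFPXX, bool IsFP64) {
  return IsO32 && (IsFPXX || IsFP64); // fp=xx / fp=64 contradict the fp=32 default
}

bool shouldEmitModuleOddSPReg(bool IsO32, bool UseOddSPReg, bool IsFPXX) {
  return IsO32 && (!UseOddSPReg || IsFPXX);
}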
+ if (Subtarget->isABI_O32() && (!Subtarget->useOddSPReg() || + Subtarget->isABI_FPXX())) + getTargetStreamer().emitDirectiveModuleOddSPReg(Subtarget->useOddSPReg(), + Subtarget->isABI_O32()); } void MipsAsmPrinter::EmitJal(MCSymbol *Symbol) { @@ -852,7 +918,7 @@ void MipsAsmPrinter::EmitFPCallStub( TS.emitDirectiveSetNoMicroMips(); // // .ent __call_stub_fp_xxxx - // .type __call_stub_fp_xxxx,@function + // .type __call_stub_fp_xxxx,@function // __call_stub_fp_xxxx: // std::string x = "__call_stub_fp_" + std::string(Symbol); diff --git a/lib/Target/Mips/MipsAsmPrinter.h b/lib/Target/Mips/MipsAsmPrinter.h index e82b145925d7..abbd39b571a5 100644 --- a/lib/Target/Mips/MipsAsmPrinter.h +++ b/lib/Target/Mips/MipsAsmPrinter.h @@ -40,6 +40,12 @@ class LLVM_LIBRARY_VISIBILITY MipsAsmPrinter : public AsmPrinter { bool emitPseudoExpansionLowering(MCStreamer &OutStreamer, const MachineInstr *MI); + // Emit PseudoReturn, PseudoReturn64, PseudoIndirectBranch, + // and PseudoIndirectBranch64 as a JR, JR_MM, JALR, or JALR64 as appropriate + // for the target. + void emitPseudoIndirectBranch(MCStreamer &OutStreamer, + const MachineInstr *MI); + // lowerOperand - Convert a MachineOperand into the equivalent MCOperand. bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp); @@ -83,11 +89,14 @@ class LLVM_LIBRARY_VISIBILITY MipsAsmPrinter : public AsmPrinter { const MipsFunctionInfo *MipsFI; MipsMCInstLower MCInstLowering; - explicit MipsAsmPrinter(TargetMachine &TM, MCStreamer &Streamer) - : AsmPrinter(TM, Streamer), MCP(nullptr), InConstantPool(false), - MCInstLowering(*this) { - Subtarget = &TM.getSubtarget(); - } + // We initialize the subtarget here and in runOnMachineFunction + // since there are certain target specific flags (ABI) that could + // reside on the TargetMachine, but are on the subtarget currently + // and we need them for the beginning of file output before we've + // seen a single function. + explicit MipsAsmPrinter(TargetMachine &TM, MCStreamer &Streamer) + : AsmPrinter(TM, Streamer), MCP(nullptr), InConstantPool(false), + Subtarget(&TM.getSubtarget()), MCInstLowering(*this) {} const char *getPassName() const override { return "Mips Assembly Printer"; diff --git a/lib/Target/Mips/MipsCallingConv.td b/lib/Target/Mips/MipsCallingConv.td index c83d880cdb2f..85704b9917a3 100644 --- a/lib/Target/Mips/MipsCallingConv.td +++ b/lib/Target/Mips/MipsCallingConv.td @@ -26,9 +26,9 @@ def RetCC_MipsO32 : CallingConv<[ // f32 are returned in registers F0, F2 CCIfType<[f32], CCAssignToReg<[F0, F2]>>, - // f64 arguments are returned in D0_64 and D1_64 in FP64bit mode or + // f64 arguments are returned in D0_64 and D2_64 in FP64bit mode or // in D0 and D1 in FP32bit mode. 
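The substantive change in this calling-convention hunk is D1_64 becoming D2_64. Under O32 the two f64 return values live in $f0 and $f2; with 32-bit FPRs (FR=0) those are the pairs D0=(F0,F1) and D1=(F2,F3), while with 64-bit FPRs (FR=1) each D*_64 maps to a single $f register, so the second slot must be D2_64 ($f2) rather than D1_64 ($f1, which is not a return register). An illustrative mapping:

struct F64RetSlot { const char *fp32; const char *fp64; };
constexpr F64RetSlot F64RetSlots[2] = {
    {"D0 = (F0,F1)", "D0_64 = F0"}, // first f64 return value
    {"D1 = (F2,F3)", "D2_64 = F2"}, // second f64 return value
};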
- CCIfType<[f64], CCIfSubtarget<"isFP64bit()", CCAssignToReg<[D0_64, D1_64]>>>, + CCIfType<[f64], CCIfSubtarget<"isFP64bit()", CCAssignToReg<[D0_64, D2_64]>>>, CCIfType<[f64], CCIfSubtarget<"isNotFP64bit()", CCAssignToReg<[D0, D1]>>> ]>; @@ -239,11 +239,17 @@ def RetCC_Mips : CallingConv<[ def CSR_SingleFloatOnly : CalleeSavedRegs<(add (sequence "F%u", 31, 20), RA, FP, (sequence "S%u", 7, 0))>; +def CSR_O32_FPXX : CalleeSavedRegs<(add (sequence "D%u", 15, 10), RA, FP, + (sequence "S%u", 7, 0))> { + let OtherPreserved = (add (decimate (sequence "F%u", 30, 20), 2)); +} + def CSR_O32 : CalleeSavedRegs<(add (sequence "D%u", 15, 10), RA, FP, (sequence "S%u", 7, 0))>; -def CSR_O32_FP64 : CalleeSavedRegs<(add (sequence "D%u_64", 31, 20), RA, FP, - (sequence "S%u", 7, 0))>; +def CSR_O32_FP64 : + CalleeSavedRegs<(add (decimate (sequence "D%u_64", 30, 20), 2), RA, FP, + (sequence "S%u", 7, 0))>; def CSR_N32 : CalleeSavedRegs<(add D20_64, D22_64, D24_64, D26_64, D28_64, D30_64, RA_64, FP_64, GP_64, diff --git a/lib/Target/Mips/MipsCodeEmitter.cpp b/lib/Target/Mips/MipsCodeEmitter.cpp index 151ef134e1da..794c71898fa3 100644 --- a/lib/Target/Mips/MipsCodeEmitter.cpp +++ b/lib/Target/Mips/MipsCodeEmitter.cpp @@ -130,6 +130,9 @@ class MipsCodeEmitter : public MachineFunctionPass { void expandACCInstr(MachineBasicBlock::instr_iterator MI, MachineBasicBlock &MBB, unsigned Opc) const; + void expandPseudoIndirectBranch(MachineBasicBlock::instr_iterator MI, + MachineBasicBlock &MBB) const; + /// \brief Expand pseudo instruction. Return true if MI was expanded. bool expandPseudos(MachineBasicBlock::instr_iterator &MI, MachineBasicBlock &MBB) const; @@ -373,9 +376,44 @@ void MipsCodeEmitter::expandACCInstr(MachineBasicBlock::instr_iterator MI, .addReg(MI->getOperand(1).getReg()).addReg(MI->getOperand(2).getReg()); } +void MipsCodeEmitter::expandPseudoIndirectBranch( + MachineBasicBlock::instr_iterator MI, MachineBasicBlock &MBB) const { + // This logic is duplicated from MipsAsmPrinter::emitPseudoIndirectBranch() + bool HasLinkReg = false; + unsigned Opcode = 0; + + if (Subtarget->hasMips64r6()) { + // MIPS64r6 should use (JALR64 ZERO_64, $rs) + Opcode = Mips::JALR64; + HasLinkReg = true; + } else if (Subtarget->hasMips32r6()) { + // MIPS32r6 should use (JALR ZERO, $rs) + Opcode = Mips::JALR; + HasLinkReg = true; + } else if (Subtarget->inMicroMipsMode()) + // microMIPS should use (JR_MM $rs) + Opcode = Mips::JR_MM; + else { + // Everything else should use (JR $rs) + Opcode = Mips::JR; + } + + auto MIB = BuildMI(MBB, &*MI, MI->getDebugLoc(), II->get(Opcode)); + + if (HasLinkReg) { + unsigned ZeroReg = Subtarget->isGP64bit() ? 
Mips::ZERO_64 : Mips::ZERO; + MIB.addReg(ZeroReg); + } + + MIB.addReg(MI->getOperand(0).getReg()); +} + bool MipsCodeEmitter::expandPseudos(MachineBasicBlock::instr_iterator &MI, MachineBasicBlock &MBB) const { switch (MI->getOpcode()) { + default: + llvm_unreachable("Unhandled pseudo"); + return false; case Mips::NOP: BuildMI(MBB, &*MI, MI->getDebugLoc(), II->get(Mips::SLL), Mips::ZERO) .addReg(Mips::ZERO).addImm(0); @@ -416,8 +454,17 @@ bool MipsCodeEmitter::expandPseudos(MachineBasicBlock::instr_iterator &MI, case Mips::PseudoMSUBU: expandACCInstr(MI, MBB, Mips::MSUBU); break; - default: - return false; + case Mips::PseudoReturn: + case Mips::PseudoReturn64: + case Mips::PseudoIndirectBranch: + case Mips::PseudoIndirectBranch64: + expandPseudoIndirectBranch(MI, MBB); + break; + case TargetOpcode::CFI_INSTRUCTION: + case TargetOpcode::IMPLICIT_DEF: + case TargetOpcode::KILL: + // Do nothing + return false; } (MI--)->eraseFromBundle(); diff --git a/lib/Target/Mips/MipsConstantIslandPass.cpp b/lib/Target/Mips/MipsConstantIslandPass.cpp index a37062f3b267..80bf573a9625 100644 --- a/lib/Target/Mips/MipsConstantIslandPass.cpp +++ b/lib/Target/Mips/MipsConstantIslandPass.cpp @@ -365,11 +365,10 @@ namespace { public: static char ID; MipsConstantIslands(TargetMachine &tm) - : MachineFunctionPass(ID), TM(tm), - IsPIC(TM.getRelocationModel() == Reloc::PIC_), - ABI(TM.getSubtarget().getTargetABI()), - STI(&TM.getSubtarget()), MF(nullptr), MCP(nullptr), - PrescannedForConstants(false){} + : MachineFunctionPass(ID), TM(tm), + IsPIC(TM.getRelocationModel() == Reloc::PIC_), + ABI(TM.getSubtarget().getTargetABI()), STI(nullptr), + MF(nullptr), MCP(nullptr), PrescannedForConstants(false) {} const char *getPassName() const override { return "Mips Constant Islands"; @@ -450,9 +449,9 @@ bool MipsConstantIslands::runOnMachineFunction(MachineFunction &mf) { // FIXME: MF = &mf; MCP = mf.getConstantPool(); + STI = &mf.getTarget().getSubtarget(); DEBUG(dbgs() << "constant island machine function " << "\n"); - if (!TM.getSubtarget().inMips16Mode() || - !MipsSubtarget::useConstantIslands()) { + if (!STI->inMips16Mode() || !MipsSubtarget::useConstantIslands()) { return false; } TII = (const Mips16InstrInfo*)MF->getTarget().getInstrInfo(); diff --git a/lib/Target/Mips/MipsDSPInstrFormats.td b/lib/Target/Mips/MipsDSPInstrFormats.td index cf09113cd8ad..b5d52ced9d3d 100644 --- a/lib/Target/Mips/MipsDSPInstrFormats.td +++ b/lib/Target/Mips/MipsDSPInstrFormats.td @@ -7,9 +7,9 @@ // //===----------------------------------------------------------------------===// -def HasDSP : Predicate<"Subtarget.hasDSP()">, +def HasDSP : Predicate<"Subtarget->hasDSP()">, AssemblerPredicate<"FeatureDSP">; -def HasDSPR2 : Predicate<"Subtarget.hasDSPR2()">, +def HasDSPR2 : Predicate<"Subtarget->hasDSPR2()">, AssemblerPredicate<"FeatureDSPR2">; // Fields. diff --git a/lib/Target/Mips/MipsFastISel.cpp b/lib/Target/Mips/MipsFastISel.cpp index 375d9b205542..617801ba8c67 100644 --- a/lib/Target/Mips/MipsFastISel.cpp +++ b/lib/Target/Mips/MipsFastISel.cpp @@ -41,7 +41,7 @@ class MipsFastISel final : public FastISel { const TargetMachine &TM; const TargetInstrInfo &TII; const TargetLowering &TLI; - const MipsSubtarget &Subtarget; + const MipsSubtarget *Subtarget; MipsFunctionInfo *MFI; // Convenience variables to avoid some queries. 
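A pattern recurs across this change (MipsCodeEmitter, MipsConstantIslands, MipsFastISel, and the ISel classes below): the MipsSubtarget stops being cached at construction time and is re-queried per machine function, since with mixed mips16/mips32 compilation the effective subtarget is only known once a function is seen. A framework-free sketch of the idiom, with all types as stand-ins:

// Stand-in types; the real ones live in the LLVM target headers.
struct SubtargetStub {
  bool InMips16Mode = false;
  bool inMips16Mode() const { return InMips16Mode; }
};
struct MachineFunctionStub {
  const SubtargetStub *ST = nullptr;
  const SubtargetStub &getSubtarget() const { return *ST; }
};

class ExamplePass {
  const SubtargetStub *Subtarget = nullptr; // no longer bound in the ctor
public:
  bool runOnMachineFunction(MachineFunctionStub &MF) {
    Subtarget = &MF.getSubtarget();   // re-bind on every function
    return Subtarget->inMips16Mode(); // per-function work would go here
  }
};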
@@ -56,11 +56,11 @@ class MipsFastISel final : public FastISel { M(const_cast(*funcInfo.Fn->getParent())), TM(funcInfo.MF->getTarget()), TII(*TM.getInstrInfo()), TLI(*TM.getTargetLowering()), - Subtarget(TM.getSubtarget()) { + Subtarget(&TM.getSubtarget()) { MFI = funcInfo.MF->getInfo(); Context = &funcInfo.Fn->getContext(); - TargetSupported = ((Subtarget.getRelocationModel() == Reloc::PIC_) && - (Subtarget.hasMips32r2() && (Subtarget.isABI_O32()))); + TargetSupported = ((Subtarget->getRelocationModel() == Reloc::PIC_) && + (Subtarget->hasMips32r2() && (Subtarget->isABI_O32()))); } bool TargetSelectInstruction(const Instruction *I) override; diff --git a/lib/Target/Mips/MipsFrameLowering.cpp b/lib/Target/Mips/MipsFrameLowering.cpp index 8ba35fa02b82..61afe179df57 100644 --- a/lib/Target/Mips/MipsFrameLowering.cpp +++ b/lib/Target/Mips/MipsFrameLowering.cpp @@ -82,9 +82,8 @@ using namespace llvm; // //===----------------------------------------------------------------------===// -const MipsFrameLowering *MipsFrameLowering::create(MipsTargetMachine &TM, - const MipsSubtarget &ST) { - if (TM.getSubtargetImpl()->inMips16Mode()) +const MipsFrameLowering *MipsFrameLowering::create(const MipsSubtarget &ST) { + if (ST.inMips16Mode()) return llvm::createMips16FrameLowering(ST); return llvm::createMipsSEFrameLowering(ST); diff --git a/lib/Target/Mips/MipsFrameLowering.h b/lib/Target/Mips/MipsFrameLowering.h index e10a3a551f9f..9d593091e018 100644 --- a/lib/Target/Mips/MipsFrameLowering.h +++ b/lib/Target/Mips/MipsFrameLowering.h @@ -15,7 +15,6 @@ #define MIPS_FRAMEINFO_H #include "Mips.h" -#include "MipsSubtarget.h" #include "llvm/Target/TargetFrameLowering.h" namespace llvm { @@ -29,8 +28,7 @@ class MipsFrameLowering : public TargetFrameLowering { explicit MipsFrameLowering(const MipsSubtarget &sti, unsigned Alignment) : TargetFrameLowering(StackGrowsDown, Alignment, 0, Alignment), STI(sti) {} - static const MipsFrameLowering *create(MipsTargetMachine &TM, - const MipsSubtarget &ST); + static const MipsFrameLowering *create(const MipsSubtarget &ST); bool hasFP(const MachineFunction &MF) const override; diff --git a/lib/Target/Mips/MipsISelDAGToDAG.cpp b/lib/Target/Mips/MipsISelDAGToDAG.cpp index 90cff631931f..0bdabf37f63b 100644 --- a/lib/Target/Mips/MipsISelDAGToDAG.cpp +++ b/lib/Target/Mips/MipsISelDAGToDAG.cpp @@ -47,6 +47,7 @@ using namespace llvm; //===----------------------------------------------------------------------===// bool MipsDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) { + Subtarget = &TM.getSubtarget(); bool Ret = SelectionDAGISel::runOnMachineFunction(MF); processFunctionAfterISel(MF); @@ -202,7 +203,7 @@ SDNode* MipsDAGToDAGISel::Select(SDNode *Node) { #ifndef NDEBUG case ISD::LOAD: case ISD::STORE: - assert((Subtarget.systemSupportsUnalignedAccess() || + assert((Subtarget->systemSupportsUnalignedAccess() || cast(Node)->getMemoryVT().getSizeInBits() / 8 <= cast(Node)->getAlignment()) && "Unexpected unaligned loads/stores."); diff --git a/lib/Target/Mips/MipsISelDAGToDAG.h b/lib/Target/Mips/MipsISelDAGToDAG.h index 13becb6b5bb9..52f4c0d3b9ea 100644 --- a/lib/Target/Mips/MipsISelDAGToDAG.h +++ b/lib/Target/Mips/MipsISelDAGToDAG.h @@ -32,7 +32,7 @@ namespace llvm { class MipsDAGToDAGISel : public SelectionDAGISel { public: explicit MipsDAGToDAGISel(MipsTargetMachine &TM) - : SelectionDAGISel(TM), Subtarget(TM.getSubtarget()) {} + : SelectionDAGISel(TM), Subtarget(nullptr) {} // Pass Name const char *getPassName() const override { @@ -46,7 +46,7 @@ class MipsDAGToDAGISel 
: public SelectionDAGISel { /// Keep a pointer to the MipsSubtarget around so that we can make the right /// decision when generating code for different targets. - const MipsSubtarget &Subtarget; + const MipsSubtarget *Subtarget; private: // Include the pieces autogenerated from the target description. diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp index 605236fee55d..12b339188b14 100644 --- a/lib/Target/Mips/MipsISelLowering.cpp +++ b/lib/Target/Mips/MipsISelLowering.cpp @@ -208,13 +208,18 @@ const char *MipsTargetLowering::getTargetNodeName(unsigned Opcode) const { } } -MipsTargetLowering::MipsTargetLowering(MipsTargetMachine &TM) - : TargetLowering(TM, new MipsTargetObjectFile()), - Subtarget(&TM.getSubtarget()) { +MipsTargetLowering::MipsTargetLowering(MipsTargetMachine &TM, + const MipsSubtarget &STI) + : TargetLowering(TM, new MipsTargetObjectFile()), Subtarget(STI) { // Mips does not have i1 type, so use i32 for // setcc operations results (slt, sgt, ...). setBooleanContents(ZeroOrOneBooleanContent); setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); + // The cmp.cond.fmt instruction in MIPS32r6/MIPS64r6 uses 0 and -1 like MSA + // does. Integer booleans still use 0 and 1. + if (Subtarget.hasMips32r6()) + setBooleanContents(ZeroOrOneBooleanContent, + ZeroOrNegativeOneBooleanContent); // Load extented operations for i1 types must be promoted setLoadExtAction(ISD::EXTLOAD, MVT::i1, Promote); @@ -251,7 +256,7 @@ MipsTargetLowering::MipsTargetLowering(MipsTargetMachine &TM) setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); - if (isGP64bit()) { + if (Subtarget.isGP64bit()) { setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); setOperationAction(ISD::BlockAddress, MVT::i64, Custom); setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom); @@ -263,14 +268,14 @@ MipsTargetLowering::MipsTargetLowering(MipsTargetMachine &TM) setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); } - if (!isGP64bit()) { + if (!Subtarget.isGP64bit()) { setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom); setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom); setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom); } setOperationAction(ISD::ADD, MVT::i32, Custom); - if (isGP64bit()) + if (Subtarget.isGP64bit()) setOperationAction(ISD::ADD, MVT::i64, Custom); setOperationAction(ISD::SDIV, MVT::i32, Expand); @@ -294,7 +299,7 @@ MipsTargetLowering::MipsTargetLowering(MipsTargetMachine &TM) setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand); setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); - if (Subtarget->hasCnMips()) { + if (Subtarget.hasCnMips()) { setOperationAction(ISD::CTPOP, MVT::i32, Legal); setOperationAction(ISD::CTPOP, MVT::i64, Legal); } else { @@ -312,10 +317,10 @@ MipsTargetLowering::MipsTargetLowering(MipsTargetMachine &TM) setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand); setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand); - if (!Subtarget->hasMips32r2()) + if (!Subtarget.hasMips32r2()) setOperationAction(ISD::ROTR, MVT::i32, Expand); - if (!Subtarget->hasMips64r2()) + if (!Subtarget.hasMips64r2()) setOperationAction(ISD::ROTR, MVT::i64, Expand); setOperationAction(ISD::FSIN, MVT::f32, Expand); @@ -353,23 +358,23 @@ MipsTargetLowering::MipsTargetLowering(MipsTargetMachine &TM) setInsertFencesForAtomic(true); - if (!Subtarget->hasMips32r2()) { + if (!Subtarget.hasMips32r2()) { 
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand); } // MIPS16 lacks MIPS32's clz and clo instructions. - if (!Subtarget->hasMips32() || Subtarget->inMips16Mode()) + if (!Subtarget.hasMips32() || Subtarget.inMips16Mode()) setOperationAction(ISD::CTLZ, MVT::i32, Expand); - if (!Subtarget->hasMips64()) + if (!Subtarget.hasMips64()) setOperationAction(ISD::CTLZ, MVT::i64, Expand); - if (!Subtarget->hasMips32r2()) + if (!Subtarget.hasMips32r2()) setOperationAction(ISD::BSWAP, MVT::i32, Expand); - if (!Subtarget->hasMips64r2()) + if (!Subtarget.hasMips64r2()) setOperationAction(ISD::BSWAP, MVT::i64, Expand); - if (isGP64bit()) { + if (Subtarget.isGP64bit()) { setLoadExtAction(ISD::SEXTLOAD, MVT::i32, Custom); setLoadExtAction(ISD::ZEXTLOAD, MVT::i32, Custom); setLoadExtAction(ISD::EXTLOAD, MVT::i32, Custom); @@ -385,23 +390,25 @@ MipsTargetLowering::MipsTargetLowering(MipsTargetMachine &TM) setTargetDAGCombine(ISD::OR); setTargetDAGCombine(ISD::ADD); - setMinFunctionAlignment(isGP64bit() ? 3 : 2); + setMinFunctionAlignment(Subtarget.isGP64bit() ? 3 : 2); - setStackPointerRegisterToSaveRestore(isN64() ? Mips::SP_64 : Mips::SP); + setStackPointerRegisterToSaveRestore(Subtarget.isABI_N64() ? Mips::SP_64 + : Mips::SP); - setExceptionPointerRegister(isN64() ? Mips::A0_64 : Mips::A0); - setExceptionSelectorRegister(isN64() ? Mips::A1_64 : Mips::A1); + setExceptionPointerRegister(Subtarget.isABI_N64() ? Mips::A0_64 : Mips::A0); + setExceptionSelectorRegister(Subtarget.isABI_N64() ? Mips::A1_64 : Mips::A1); MaxStoresPerMemcpy = 16; - isMicroMips = Subtarget->inMicroMipsMode(); + isMicroMips = Subtarget.inMicroMipsMode(); } -const MipsTargetLowering *MipsTargetLowering::create(MipsTargetMachine &TM) { - if (TM.getSubtargetImpl()->inMips16Mode()) - return llvm::createMips16TargetLowering(TM); +const MipsTargetLowering *MipsTargetLowering::create(MipsTargetMachine &TM, + const MipsSubtarget &STI) { + if (STI.inMips16Mode()) + return llvm::createMips16TargetLowering(TM, STI); - return llvm::createMipsSETargetLowering(TM); + return llvm::createMipsSETargetLowering(TM, STI); } // Create a fast isel object. @@ -421,7 +428,7 @@ EVT MipsTargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const { static SDValue performDivRemCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, - const MipsSubtarget *Subtarget) { + const MipsSubtarget &Subtarget) { if (DCI.isBeforeLegalizeOps()) return SDValue(); @@ -531,7 +538,7 @@ static SDValue createCMovFP(SelectionDAG &DAG, SDValue Cond, SDValue True, static SDValue performSELECTCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, - const MipsSubtarget *Subtarget) { + const MipsSubtarget &Subtarget) { if (DCI.isBeforeLegalizeOps()) return SDValue(); @@ -610,11 +617,11 @@ static SDValue performSELECTCombine(SDNode *N, SelectionDAG &DAG, static SDValue performANDCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, - const MipsSubtarget *Subtarget) { + const MipsSubtarget &Subtarget) { // Pattern match EXT. 
// $dst = and ((sra or srl) $src , pos), (2**size - 1) // => ext $dst, $src, size, pos - if (DCI.isBeforeLegalizeOps() || !Subtarget->hasExtractInsert()) + if (DCI.isBeforeLegalizeOps() || !Subtarget.hasExtractInsert()) return SDValue(); SDValue ShiftRight = N->getOperand(0), Mask = N->getOperand(1); @@ -650,12 +657,12 @@ static SDValue performANDCombine(SDNode *N, SelectionDAG &DAG, static SDValue performORCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, - const MipsSubtarget *Subtarget) { + const MipsSubtarget &Subtarget) { // Pattern match INS. // $dst = or (and $src1 , mask0), (and (shl $src, pos), mask1), // where mask1 = (2**size - 1) << pos, mask0 = ~mask1 // => ins $dst, $src, size, pos, $src1 - if (DCI.isBeforeLegalizeOps() || !Subtarget->hasExtractInsert()) + if (DCI.isBeforeLegalizeOps() || !Subtarget.hasExtractInsert()) return SDValue(); SDValue And0 = N->getOperand(0), And1 = N->getOperand(1); @@ -704,7 +711,7 @@ static SDValue performORCombine(SDNode *N, SelectionDAG &DAG, static SDValue performADDCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, - const MipsSubtarget *Subtarget) { + const MipsSubtarget &Subtarget) { // (add v0, (add v1, abs_lo(tjt))) => (add (add v0, v1), abs_lo(tjt)) if (DCI.isBeforeLegalizeOps()) @@ -962,16 +969,16 @@ MipsTargetLowering::emitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB, LL = Mips::LL_MM; SC = Mips::SC_MM; } else { - LL = Subtarget->hasMips32r6() ? Mips::LL_R6 : Mips::LL; - SC = Subtarget->hasMips32r6() ? Mips::SC_R6 : Mips::SC; + LL = Subtarget.hasMips32r6() ? Mips::LL_R6 : Mips::LL; + SC = Subtarget.hasMips32r6() ? Mips::SC_R6 : Mips::SC; } AND = Mips::AND; NOR = Mips::NOR; ZERO = Mips::ZERO; BEQ = Mips::BEQ; } else { - LL = Subtarget->hasMips64r6() ? Mips::LLD_R6 : Mips::LLD; - SC = Subtarget->hasMips64r6() ? Mips::SCD_R6 : Mips::SCD; + LL = Subtarget.hasMips64r6() ? Mips::LLD_R6 : Mips::LLD; + SC = Subtarget.hasMips64r6() ? Mips::SCD_R6 : Mips::SCD;
AND = Mips::AND64; NOR = Mips::NOR64; ZERO = Mips::ZERO_64; @@ -1039,12 +1046,12 @@ MachineBasicBlock *MipsTargetLowering::emitSignExtendToI32InReg( const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); DebugLoc DL = MI->getDebugLoc(); - if (Subtarget->hasMips32r2() && Size == 1) { + if (Subtarget.hasMips32r2() && Size == 1) { BuildMI(BB, DL, TII->get(Mips::SEB), DstReg).addReg(SrcReg); return BB; } - if (Subtarget->hasMips32r2() && Size == 2) { + if (Subtarget.hasMips32r2() && Size == 2) { BuildMI(BB, DL, TII->get(Mips::SEH), DstReg).addReg(SrcReg); return BB; } @@ -1134,7 +1141,7 @@ MachineBasicBlock *MipsTargetLowering::emitAtomicBinaryPartword( BuildMI(BB, DL, TII->get(Mips::AND), AlignedAddr) .addReg(Ptr).addReg(MaskLSB2); BuildMI(BB, DL, TII->get(Mips::ANDi), PtrLSB2).addReg(Ptr).addImm(3); - if (Subtarget->isLittle()) { + if (Subtarget.isLittle()) { BuildMI(BB, DL, TII->get(Mips::SLL), ShiftAmt).addReg(PtrLSB2).addImm(3); } else { unsigned Off = RegInfo.createVirtualRegister(RC); @@ -1374,7 +1381,7 @@ MipsTargetLowering::emitAtomicCmpSwapPartword(MachineInstr *MI, BuildMI(BB, DL, TII->get(Mips::AND), AlignedAddr) .addReg(Ptr).addReg(MaskLSB2); BuildMI(BB, DL, TII->get(Mips::ANDi), PtrLSB2).addReg(Ptr).addImm(3); - if (Subtarget->isLittle()) { + if (Subtarget.isLittle()) { BuildMI(BB, DL, TII->get(Mips::SLL), ShiftAmt).addReg(PtrLSB2).addImm(3); } else { unsigned Off = RegInfo.createVirtualRegister(RC); @@ -1484,7 +1491,8 @@ SDValue MipsTargetLowering::lowerBR_JT(SDValue Op, SelectionDAG &DAG) const { 0); Chain = Addr.getValue(1); - if ((getTargetMachine().getRelocationModel() == Reloc::PIC_) || isN64()) { + if ((getTargetMachine().getRelocationModel() == Reloc::PIC_) || + Subtarget.isABI_N64()) { // For PIC, the sequence is: // BRIND(load(Jumptable + index) + RelocBase) // RelocBase can be JumpTable, GOT or some sort of global base. @@ -1502,7 +1510,7 @@ SDValue MipsTargetLowering::lowerBRCOND(SDValue Op, SelectionDAG &DAG) const { SDValue Dest = Op.getOperand(2); SDLoc DL(Op); - assert(!Subtarget->hasMips32r6() && !Subtarget->hasMips64r6()); + assert(!Subtarget.hasMips32r6() && !Subtarget.hasMips64r6()); SDValue CondRes = createFPCmp(DAG, Op.getOperand(1)); // Return if flag is not set by a floating point comparison. @@ -1522,7 +1530,7 @@ SDValue MipsTargetLowering::lowerBRCOND(SDValue Op, SelectionDAG &DAG) const { SDValue MipsTargetLowering:: lowerSELECT(SDValue Op, SelectionDAG &DAG) const { - assert(!Subtarget->hasMips32r6() && !Subtarget->hasMips64r6()); + assert(!Subtarget.hasMips32r6() && !Subtarget.hasMips64r6()); SDValue Cond = createFPCmp(DAG, Op.getOperand(0)); // Return if flag is not set by a floating point comparison.
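The emitAtomicBinary hunk above selects the load-linked/store-conditional opcodes by subtarget feature (microMIPS has its own encodings, and MIPS32r6/MIPS64r6 re-encoded ll/sc as the _R6 variants). A standalone sketch of that selection shape (Opcode and Features are hypothetical stand-ins for the Mips::* opcode enum and the MipsSubtarget queries):

#include <cstdio>

enum class Opcode { LL, LL_R6, LL_MM, LLD, LLD_R6 };
struct Features { bool MicroMips, Mips32r6, Mips64r6; };

// Word-sized load-linked: microMIPS first, then the r6 re-encoding,
// otherwise the classic LL.
Opcode chooseLL32(const Features &F) {
  if (F.MicroMips)
    return Opcode::LL_MM;
  return F.Mips32r6 ? Opcode::LL_R6 : Opcode::LL;
}

// Doubleword load-linked only varies along the r6 axis.
Opcode chooseLL64(const Features &F) {
  return F.Mips64r6 ? Opcode::LLD_R6 : Opcode::LLD;
}

int main() {
  Features R6{false, true, true}, Classic{false, false, false};
  std::printf("%d %d %d\n", (int)chooseLL32(R6), (int)chooseLL32(Classic),
              (int)chooseLL64(R6)); // 1 0 4
}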
@@ -1548,7 +1556,7 @@ lowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const } SDValue MipsTargetLowering::lowerSETCC(SDValue Op, SelectionDAG &DAG) const { - assert(!Subtarget->hasMips32r6() && !Subtarget->hasMips64r6()); + assert(!Subtarget.hasMips32r6() && !Subtarget.hasMips64r6()); SDValue Cond = createFPCmp(DAG, Op); assert(Cond.getOpcode() == MipsISD::FPCmp && @@ -1568,7 +1576,8 @@ SDValue MipsTargetLowering::lowerGlobalAddress(SDValue Op, GlobalAddressSDNode *N = cast(Op); const GlobalValue *GV = N->getGlobal(); - if (getTargetMachine().getRelocationModel() != Reloc::PIC_ && !isN64()) { + if (getTargetMachine().getRelocationModel() != Reloc::PIC_ && + !Subtarget.isABI_N64()) { const MipsTargetObjectFile &TLOF = (const MipsTargetObjectFile&)getObjFileLowering(); @@ -1587,15 +1596,18 @@ SDValue MipsTargetLowering::lowerGlobalAddress(SDValue Op, } if (GV->hasInternalLinkage() || (GV->hasLocalLinkage() && !isa(GV))) - return getAddrLocal(N, Ty, DAG, isN32() || isN64()); + return getAddrLocal(N, Ty, DAG, + Subtarget.isABI_N32() || Subtarget.isABI_N64()); if (LargeGOT) return getAddrGlobalLargeGOT(N, Ty, DAG, MipsII::MO_GOT_HI16, MipsII::MO_GOT_LO16, DAG.getEntryNode(), MachinePointerInfo::getGOT()); - return getAddrGlobal(N, Ty, DAG, (isN32() || isN64()) ? MipsII::MO_GOT_DISP - : MipsII::MO_GOT16, + return getAddrGlobal(N, Ty, DAG, + (Subtarget.isABI_N32() || Subtarget.isABI_N64()) + ? MipsII::MO_GOT_DISP + : MipsII::MO_GOT16, DAG.getEntryNode(), MachinePointerInfo::getGOT()); } @@ -1604,10 +1616,12 @@ SDValue MipsTargetLowering::lowerBlockAddress(SDValue Op, BlockAddressSDNode *N = cast(Op); EVT Ty = Op.getValueType(); - if (getTargetMachine().getRelocationModel() != Reloc::PIC_ && !isN64()) + if (getTargetMachine().getRelocationModel() != Reloc::PIC_ && + !Subtarget.isABI_N64()) return getAddrNonPIC(N, Ty, DAG); - return getAddrLocal(N, Ty, DAG, isN32() || isN64()); + return getAddrLocal(N, Ty, DAG, + Subtarget.isABI_N32() || Subtarget.isABI_N64()); } SDValue MipsTargetLowering:: @@ -1645,7 +1659,7 @@ lowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(DL).setChain(DAG.getEntryNode()) - .setCallee(CallingConv::C, PtrTy, TlsGetAddr, &Args, 0); + .setCallee(CallingConv::C, PtrTy, TlsGetAddr, std::move(Args), 0); std::pair CallResult = LowerCallTo(CLI); SDValue Ret = CallResult.first; @@ -1695,10 +1709,12 @@ lowerJumpTable(SDValue Op, SelectionDAG &DAG) const JumpTableSDNode *N = cast(Op); EVT Ty = Op.getValueType(); - if (getTargetMachine().getRelocationModel() != Reloc::PIC_ && !isN64()) + if (getTargetMachine().getRelocationModel() != Reloc::PIC_ && + !Subtarget.isABI_N64()) return getAddrNonPIC(N, Ty, DAG); - return getAddrLocal(N, Ty, DAG, isN32() || isN64()); + return getAddrLocal(N, Ty, DAG, + Subtarget.isABI_N32() || Subtarget.isABI_N64()); } SDValue MipsTargetLowering:: @@ -1716,10 +1732,12 @@ lowerConstantPool(SDValue Op, SelectionDAG &DAG) const ConstantPoolSDNode *N = cast(Op); EVT Ty = Op.getValueType(); - if (getTargetMachine().getRelocationModel() != Reloc::PIC_ && !isN64()) + if (getTargetMachine().getRelocationModel() != Reloc::PIC_ && + !Subtarget.isABI_N64()) return getAddrNonPIC(N, Ty, DAG); - return getAddrLocal(N, Ty, DAG, isN32() || isN64()); + return getAddrLocal(N, Ty, DAG, + Subtarget.isABI_N32() || Subtarget.isABI_N64()); } SDValue MipsTargetLowering::lowerVASTART(SDValue Op, SelectionDAG &DAG) const { @@ -1834,10 +1852,10 @@ static SDValue lowerFCOPYSIGN64(SDValue Op, SelectionDAG &DAG, SDValue 
MipsTargetLowering::lowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { - if (Subtarget->isGP64bit()) - return lowerFCOPYSIGN64(Op, DAG, Subtarget->hasExtractInsert()); + if (Subtarget.isGP64bit()) + return lowerFCOPYSIGN64(Op, DAG, Subtarget.hasExtractInsert()); - return lowerFCOPYSIGN32(Op, DAG, Subtarget->hasExtractInsert()); + return lowerFCOPYSIGN32(Op, DAG, Subtarget.hasExtractInsert()); } SDValue MipsTargetLowering:: @@ -1850,8 +1868,9 @@ lowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { MFI->setFrameAddressIsTaken(true); EVT VT = Op.getValueType(); SDLoc DL(Op); - SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), DL, - isN64() ? Mips::FP_64 : Mips::FP, VT); + SDValue FrameAddr = + DAG.getCopyFromReg(DAG.getEntryNode(), DL, + Subtarget.isABI_N64() ? Mips::FP_64 : Mips::FP, VT); return FrameAddr; } @@ -1867,7 +1886,7 @@ SDValue MipsTargetLowering::lowerRETURNADDR(SDValue Op, MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); MVT VT = Op.getSimpleValueType(); - unsigned RA = isN64() ? Mips::RA_64 : Mips::RA; + unsigned RA = Subtarget.isABI_N64() ? Mips::RA_64 : Mips::RA; MFI->setReturnAddressIsTaken(true); // Return RA, which contains the return address. Mark it an implicit live-in. @@ -1889,12 +1908,12 @@ SDValue MipsTargetLowering::lowerEH_RETURN(SDValue Op, SelectionDAG &DAG) SDValue Offset = Op.getOperand(1); SDValue Handler = Op.getOperand(2); SDLoc DL(Op); - EVT Ty = isN64() ? MVT::i64 : MVT::i32; + EVT Ty = Subtarget.isABI_N64() ? MVT::i64 : MVT::i32; // Store stack offset in V1, store jump target in V0. Glue CopyToReg and // EH_RETURN nodes, so that instructions are emitted back-to-back. - unsigned OffsetReg = isN64() ? Mips::V1_64 : Mips::V1; - unsigned AddrReg = isN64() ? Mips::V0_64 : Mips::V0; + unsigned OffsetReg = Subtarget.isABI_N64() ? Mips::V1_64 : Mips::V1; + unsigned AddrReg = Subtarget.isABI_N64() ? Mips::V0_64 : Mips::V0; Chain = DAG.getCopyToReg(Chain, DL, OffsetReg, Offset, SDValue()); Chain = DAG.getCopyToReg(Chain, DL, AddrReg, Handler, Chain.getValue(1)); return DAG.getNode(MipsISD::EH_RETURN, DL, MVT::Other, Chain, @@ -2007,7 +2026,7 @@ SDValue MipsTargetLowering::lowerLOAD(SDValue Op, SelectionDAG &DAG) const { LoadSDNode *LD = cast(Op); EVT MemVT = LD->getMemoryVT(); - if (Subtarget->systemSupportsUnalignedAccess()) + if (Subtarget.systemSupportsUnalignedAccess()) return Op; // Return if load is aligned or if MemVT is neither i32 nor i64. @@ -2015,7 +2034,7 @@ SDValue MipsTargetLowering::lowerLOAD(SDValue Op, SelectionDAG &DAG) const { ((MemVT != MVT::i32) && (MemVT != MVT::i64))) return SDValue(); - bool IsLittle = Subtarget->isLittle(); + bool IsLittle = Subtarget.isLittle(); EVT VT = Op.getValueType(); ISD::LoadExtType ExtType = LD->getExtensionType(); SDValue Chain = LD->getChain(), Undef = DAG.getUNDEF(VT); @@ -2133,10 +2152,10 @@ SDValue MipsTargetLowering::lowerSTORE(SDValue Op, SelectionDAG &DAG) const { EVT MemVT = SD->getMemoryVT(); // Lower unaligned integer stores. - if (!Subtarget->systemSupportsUnalignedAccess() && + if (!Subtarget.systemSupportsUnalignedAccess() && (SD->getAlignment() < MemVT.getSizeInBits() / 8) && ((MemVT == MVT::i32) || (MemVT == MVT::i64))) - return lowerUnalignedIntStore(SD, DAG, Subtarget->isLittle()); + return lowerUnalignedIntStore(SD, DAG, Subtarget.isLittle()); return lowerFP_TO_SINT_STORE(SD, DAG); } @@ -2322,8 +2341,8 @@ getOpndList(SmallVectorImpl &Ops, // in PIC mode) allow symbols to be resolved via lazy binding. 
// The lazy binding stub requires GP to point to the GOT. if (IsPICCall && !InternalLinkage) { - unsigned GPReg = isN64() ? Mips::GP_64 : Mips::GP; - EVT Ty = isN64() ? MVT::i64 : MVT::i32; + unsigned GPReg = Subtarget.isABI_N64() ? Mips::GP_64 : Mips::GP; + EVT Ty = Subtarget.isABI_N64() ? MVT::i64 : MVT::i32; RegsToPass.push_back(std::make_pair(GPReg, getGlobalReg(CLI.DAG, Ty))); } @@ -2349,7 +2368,7 @@ getOpndList(SmallVectorImpl &Ops, const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); const uint32_t *Mask = TRI->getCallPreservedMask(CLI.CallConv); assert(Mask && "Missing call preserved mask for calling convention"); - if (Subtarget->inMips16HardFloat()) { + if (Subtarget.inMips16HardFloat()) { if (GlobalAddressSDNode *G = dyn_cast(CLI.Callee)) { llvm::StringRef Sym = G->getGlobal()->getName(); Function *F = G->getGlobal()->getParent()->getFunction(Sym); @@ -2392,11 +2411,11 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, getTargetMachine(), ArgLocs, *DAG.getContext()); MipsCC::SpecialCallingConvType SpecialCallingConv = getSpecialCallingConv(Callee); - MipsCC MipsCCInfo(CallConv, isO32(), Subtarget->isFP64bit(), CCInfo, - SpecialCallingConv); + MipsCC MipsCCInfo(CallConv, Subtarget.isABI_O32(), Subtarget.isFP64bit(), + CCInfo, SpecialCallingConv); MipsCCInfo.analyzeCallOperands(Outs, IsVarArg, - Subtarget->mipsSEUsesSoftFloat(), + Subtarget.abiUsesSoftFloat(), Callee.getNode(), CLI.getArgs()); // Get a count of how many bytes are to be pushed on the stack. @@ -2426,7 +2445,8 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, Chain = DAG.getCALLSEQ_START(Chain, NextStackOffsetVal, DL); SDValue StackPtr = DAG.getCopyFromReg( - Chain, DL, isN64() ? Mips::SP_64 : Mips::SP, getPointerTy()); + Chain, DL, Subtarget.isABI_N64() ? Mips::SP_64 : Mips::SP, + getPointerTy()); // With EABI is it possible to have 16 args on registers. std::deque< std::pair > RegsToPass; @@ -2448,7 +2468,7 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, assert(!IsTailCall && "Do not tail-call optimize if there is a byval argument."); passByValArg(Chain, DL, RegsToPass, MemOpChains, StackPtr, MFI, DAG, Arg, - MipsCCInfo, *ByValArg, Flags, Subtarget->isLittle()); + MipsCCInfo, *ByValArg, Flags, Subtarget.isLittle()); ++ByValArg; continue; } @@ -2467,7 +2487,7 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, Arg, DAG.getConstant(0, MVT::i32)); SDValue Hi = DAG.getNode(MipsISD::ExtractElementF64, DL, MVT::i32, Arg, DAG.getConstant(1, MVT::i32)); - if (!Subtarget->isLittle()) + if (!Subtarget.isLittle()) std::swap(Lo, Hi); unsigned LocRegLo = VA.getLocReg(); unsigned LocRegHigh = getNextIntArgReg(LocRegLo); @@ -2512,8 +2532,9 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol // node so that legalize doesn't hack it. 
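getOpndList, shown above, keys the global-pointer register and its value type off the ABI. A standalone sketch of that ABI-keyed choice (ABI, Reg and VT are hypothetical stand-ins, not LLVM types):

#include <cstdio>

enum class ABI { O32, N32, N64 };
enum class Reg { GP, GP_64 };
enum class VT { i32, i64 };

struct GPChoice { Reg R; VT Ty; };

// Lazy-binding stubs need $gp to point at the GOT; only N64 uses the
// 64-bit register view with an i64 value type.
GPChoice pickGlobalPointer(ABI Abi) {
  bool IsN64 = (Abi == ABI::N64);
  return {IsN64 ? Reg::GP_64 : Reg::GP, IsN64 ? VT::i64 : VT::i32};
}

int main() {
  GPChoice C = pickGlobalPointer(ABI::N64);
  std::printf("%d %d\n", (int)C.R, (int)C.Ty); // 1 1 -> GP_64 with i64
}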
- bool IsPICCall = (isN64() || IsPIC); // true if calls are translated to - // jalr $25 + bool IsPICCall = + (Subtarget.isABI_N64() || IsPIC); // true if calls are translated to + // jalr $25 bool GlobalOrExternal = false, InternalLinkage = false; SDValue CalleeLo; EVT Ty = Callee.getValueType(); @@ -2524,7 +2545,8 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, InternalLinkage = Val->hasInternalLinkage(); if (InternalLinkage) - Callee = getAddrLocal(G, Ty, DAG, isN32() || isN64()); + Callee = getAddrLocal(G, Ty, DAG, + Subtarget.isABI_N32() || Subtarget.isABI_N64()); else if (LargeGOT) Callee = getAddrGlobalLargeGOT(G, Ty, DAG, MipsII::MO_CALL_HI16, MipsII::MO_CALL_LO16, Chain, @@ -2540,7 +2562,7 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, else if (ExternalSymbolSDNode *S = dyn_cast(Callee)) { const char *Sym = S->getSymbol(); - if (!isN64() && !IsPIC) // !N64 && static + if (!Subtarget.isABI_N64() && !IsPIC) // !N64 && static Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy(), MipsII::MO_NO_FLAG); else if (LargeGOT) @@ -2591,9 +2613,10 @@ MipsTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, SmallVector RVLocs; CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), getTargetMachine(), RVLocs, *DAG.getContext()); - MipsCC MipsCCInfo(CallConv, isO32(), Subtarget->isFP64bit(), CCInfo); + MipsCC MipsCCInfo(CallConv, Subtarget.isABI_O32(), Subtarget.isFP64bit(), + CCInfo); - MipsCCInfo.analyzeCallResult(Ins, Subtarget->mipsSEUsesSoftFloat(), + MipsCCInfo.analyzeCallResult(Ins, Subtarget.abiUsesSoftFloat(), CallNode, RetTy); // Copy all of the result registers out of their specified physreg. @@ -2638,10 +2661,11 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain, SmallVector ArgLocs; CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), getTargetMachine(), ArgLocs, *DAG.getContext()); - MipsCC MipsCCInfo(CallConv, isO32(), Subtarget->isFP64bit(), CCInfo); + MipsCC MipsCCInfo(CallConv, Subtarget.isABI_O32(), Subtarget.isFP64bit(), + CCInfo); Function::const_arg_iterator FuncArg = DAG.getMachineFunction().getFunction()->arg_begin(); - bool UseSoftFloat = Subtarget->mipsSEUsesSoftFloat(); + bool UseSoftFloat = Subtarget.abiUsesSoftFloat(); MipsCCInfo.analyzeFormalArguments(Ins, UseSoftFloat, FuncArg); MipsFI->setFormalArgInfo(CCInfo.getNextStackOffset(), @@ -2700,11 +2724,12 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain, (RegVT == MVT::i64 && ValVT == MVT::f64) || (RegVT == MVT::f64 && ValVT == MVT::i64)) ArgValue = DAG.getNode(ISD::BITCAST, DL, ValVT, ArgValue); - else if (isO32() && RegVT == MVT::i32 && ValVT == MVT::f64) { + else if (Subtarget.isABI_O32() && RegVT == MVT::i32 && + ValVT == MVT::f64) { unsigned Reg2 = addLiveIn(DAG.getMachineFunction(), getNextIntArgReg(ArgReg), RC); SDValue ArgValue2 = DAG.getCopyFromReg(Chain, DL, Reg2, RegVT); - if (!Subtarget->isLittle()) + if (!Subtarget.isLittle()) std::swap(ArgValue, ArgValue2); ArgValue = DAG.getNode(MipsISD::BuildPairF64, DL, MVT::f64, ArgValue, ArgValue2); @@ -2738,7 +2763,7 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain, unsigned Reg = MipsFI->getSRetReturnReg(); if (!Reg) { Reg = MF.getRegInfo().createVirtualRegister( - getRegClassFor(isN64() ? MVT::i64 : MVT::i32)); + getRegClassFor(Subtarget.isABI_N64() ? 
MVT::i64 : MVT::i32)); MipsFI->setSRetReturnReg(Reg); } SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[i]); @@ -2789,10 +2814,11 @@ MipsTargetLowering::LowerReturn(SDValue Chain, // CCState - Info about the registers and stack slot. CCState CCInfo(CallConv, IsVarArg, MF, getTargetMachine(), RVLocs, *DAG.getContext()); - MipsCC MipsCCInfo(CallConv, isO32(), Subtarget->isFP64bit(), CCInfo); + MipsCC MipsCCInfo(CallConv, Subtarget.isABI_O32(), Subtarget.isFP64bit(), + CCInfo); // Analyze return values. - MipsCCInfo.analyzeReturn(Outs, Subtarget->mipsSEUsesSoftFloat(), + MipsCCInfo.analyzeReturn(Outs, Subtarget.abiUsesSoftFloat(), MF.getFunction()->getReturnType()); SDValue Flag; @@ -2825,7 +2851,7 @@ MipsTargetLowering::LowerReturn(SDValue Chain, if (!Reg) llvm_unreachable("sret virtual register not created in the entry block"); SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy()); - unsigned V0 = isN64() ? Mips::V0_64 : Mips::V0; + unsigned V0 = Subtarget.isABI_N64() ? Mips::V0_64 : Mips::V0; Chain = DAG.getCopyToReg(Chain, DL, V0, Val, Flag); Flag = Chain.getValue(1); @@ -2903,7 +2929,7 @@ MipsTargetLowering::getSingleConstraintMatchWeight( weight = CW_Register; break; case 'f': // FPU or MSA register - if (Subtarget->hasMSA() && type->isVectorTy() && + if (Subtarget.hasMSA() && type->isVectorTy() && cast(type)->getBitWidth() == 128) weight = CW_Register; else if (type->isFloatTy()) @@ -3009,7 +3035,7 @@ parseRegForInlineAsmConstraint(const StringRef &C, MVT VT) const { // If the size of FP registers is 64-bit or Reg is an even number, select // the 64-bit register class. Otherwise, select the 32-bit register class. if (VT == MVT::Other) - VT = (Subtarget->isFP64bit() || !(Reg % 2)) ? MVT::f64 : MVT::f32; + VT = (Subtarget.isFP64bit() || !(Reg % 2)) ? MVT::f64 : MVT::f32; RC = getRegClassFor(VT); @@ -3042,13 +3068,13 @@ getRegForInlineAsmConstraint(const std::string &Constraint, MVT VT) const case 'y': // Same as 'r'. Exists for compatibility. 
case 'r': if (VT == MVT::i32 || VT == MVT::i16 || VT == MVT::i8) { - if (Subtarget->inMips16Mode()) + if (Subtarget.inMips16Mode()) return std::make_pair(0U, &Mips::CPU16RegsRegClass); return std::make_pair(0U, &Mips::GPR32RegClass); } - if (VT == MVT::i64 && !isGP64bit()) + if (VT == MVT::i64 && !Subtarget.isGP64bit()) return std::make_pair(0U, &Mips::GPR32RegClass); - if (VT == MVT::i64 && isGP64bit()) + if (VT == MVT::i64 && Subtarget.isGP64bit()) return std::make_pair(0U, &Mips::GPR64RegClass); // This will generate an error message return std::make_pair(0U, nullptr); @@ -3063,8 +3089,8 @@ getRegForInlineAsmConstraint(const std::string &Constraint, MVT VT) const return std::make_pair(0U, &Mips::MSA128DRegClass); else if (VT == MVT::f32) return std::make_pair(0U, &Mips::FGR32RegClass); - else if ((VT == MVT::f64) && (!Subtarget->isSingleFloat())) { - if (Subtarget->isFP64bit()) + else if ((VT == MVT::f64) && (!Subtarget.isSingleFloat())) { + if (Subtarget.isFP64bit()) return std::make_pair(0U, &Mips::FGR64RegClass); return std::make_pair(0U, &Mips::AFGR64RegClass); } @@ -3220,7 +3246,7 @@ EVT MipsTargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, MachineFunction &MF) const { - if (Subtarget->hasMips64()) + if (Subtarget.hasMips64()) return MVT::i64; return MVT::i32; @@ -3235,7 +3261,7 @@ bool MipsTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { } unsigned MipsTargetLowering::getJumpTableEncoding() const { - if (isN64()) + if (Subtarget.isABI_N64()) return MachineJumpTableInfo::EK_GPRel64BlockAddress; return TargetLowering::getJumpTableEncoding(); @@ -3285,7 +3311,7 @@ MipsTargetLowering::MipsCC::SpecialCallingConvType MipsTargetLowering::getSpecialCallingConv(SDValue Callee) const { MipsCC::SpecialCallingConvType SpecialCallingConv = MipsCC::NoSpecialCallingConv; - if (Subtarget->inMips16HardFloat()) { + if (Subtarget.inMips16HardFloat()) { if (GlobalAddressSDNode *G = dyn_cast(Callee)) { llvm::StringRef Sym = G->getGlobal()->getName(); Function *F = G->getGlobal()->getParent()->getFunction(Sym); @@ -3682,7 +3708,8 @@ void MipsTargetLowering::writeVarArgRegs(std::vector &OutChains, SDValue PtrOff = DAG.getFrameIndex(FI, getPointerTy()); SDValue Store = DAG.getStore(Chain, DL, ArgValue, PtrOff, MachinePointerInfo(), false, false, 0); - cast(Store.getNode())->getMemOperand()->setValue((Value*)nullptr); + cast(Store.getNode())->getMemOperand()->setValue( + (Value *)nullptr); OutChains.push_back(Store); } } diff --git a/lib/Target/Mips/MipsISelLowering.h b/lib/Target/Mips/MipsISelLowering.h index b006aba380ea..10e4e0b48644 100644 --- a/lib/Target/Mips/MipsISelLowering.h +++ b/lib/Target/Mips/MipsISelLowering.h @@ -17,7 +17,6 @@ #include "MCTargetDesc/MipsBaseInfo.h" #include "Mips.h" -#include "MipsSubtarget.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/IR/Function.h" @@ -210,13 +209,16 @@ namespace llvm { // TargetLowering Implementation //===--------------------------------------------------------------------===// class MipsFunctionInfo; + class MipsSubtarget; class MipsTargetLowering : public TargetLowering { bool isMicroMips; public: - explicit MipsTargetLowering(MipsTargetMachine &TM); + explicit MipsTargetLowering(MipsTargetMachine &TM, + const MipsSubtarget &STI); - static const MipsTargetLowering *create(MipsTargetMachine &TM); + static const MipsTargetLowering *create(MipsTargetMachine &TM, + const MipsSubtarget &STI); /// createFastISel - This 
method returns a target specific FastISel object, /// or null if the target does not support "fast" ISel. @@ -436,13 +438,7 @@ namespace llvm { SDValue lowerSTORE(SDValue Op, SelectionDAG &DAG) const; // Subtarget Info - const MipsSubtarget *Subtarget; - - bool hasMips64() const { return Subtarget->hasMips64(); } - bool isGP64bit() const { return Subtarget->isGP64bit(); } - bool isO32() const { return Subtarget->isABI_O32(); } - bool isN32() const { return Subtarget->isABI_N32(); } - bool isN64() const { return Subtarget->isABI_N64(); } + const MipsSubtarget &Subtarget; private: // Create a TargetGlobalAddress node. @@ -617,8 +613,10 @@ namespace llvm { }; /// Create MipsTargetLowering objects. - const MipsTargetLowering *createMips16TargetLowering(MipsTargetMachine &TM); - const MipsTargetLowering *createMipsSETargetLowering(MipsTargetMachine &TM); + const MipsTargetLowering * + createMips16TargetLowering(MipsTargetMachine &TM, const MipsSubtarget &STI); + const MipsTargetLowering * + createMipsSETargetLowering(MipsTargetMachine &TM, const MipsSubtarget &STI); namespace Mips { FastISel *createFastISel(FunctionLoweringInfo &funcInfo, diff --git a/lib/Target/Mips/MipsInstrFPU.td b/lib/Target/Mips/MipsInstrFPU.td index aa8f053327f3..29d8e30be483 100644 --- a/lib/Target/Mips/MipsInstrFPU.td +++ b/lib/Target/Mips/MipsInstrFPU.td @@ -57,13 +57,13 @@ let PrintMethod = "printFCCOperand", DecoderMethod = "DecodeCondCode" in // Feature predicates. //===----------------------------------------------------------------------===// -def IsFP64bit : Predicate<"Subtarget.isFP64bit()">, +def IsFP64bit : Predicate<"Subtarget->isFP64bit()">, AssemblerPredicate<"FeatureFP64Bit">; -def NotFP64bit : Predicate<"!Subtarget.isFP64bit()">, +def NotFP64bit : Predicate<"!Subtarget->isFP64bit()">, AssemblerPredicate<"!FeatureFP64Bit">; -def IsSingleFloat : Predicate<"Subtarget.isSingleFloat()">, +def IsSingleFloat : Predicate<"Subtarget->isSingleFloat()">, AssemblerPredicate<"FeatureSingleFloat">; -def IsNotSingleFloat : Predicate<"!Subtarget.isSingleFloat()">, +def IsNotSingleFloat : Predicate<"!Subtarget->isSingleFloat()">, AssemblerPredicate<"!FeatureSingleFloat">; //===----------------------------------------------------------------------===// @@ -362,11 +362,15 @@ def MFC1 : MMRel, MFC1_FT<"mfc1", GPR32Opnd, FGR32Opnd, II_MFC1, bitconvert>, MFC1_FM<0>; def MTC1 : MMRel, MTC1_FT<"mtc1", FGR32Opnd, GPR32Opnd, II_MTC1, bitconvert>, MFC1_FM<4>; -def MFHC1 : MMRel, MFC1_FT<"mfhc1", GPR32Opnd, FGRH32Opnd, II_MFHC1>, - MFC1_FM<3>, ISA_MIPS32R2; -def MTHC1_D32 : MMRel, MTC1_64_FT<"mthc1", FGR64Opnd, GPR32Opnd, II_MTHC1>, +def MFHC1_D32 : MMRel, MFC1_FT<"mfhc1", GPR32Opnd, AFGR64Opnd, II_MFHC1>, + MFC1_FM<3>, ISA_MIPS32R2, AdditionalRequires<[NotFP64bit]>; +def MFHC1_D64 : MFC1_FT<"mfhc1", GPR32Opnd, FGR64Opnd, II_MFHC1>, + MFC1_FM<3>, ISA_MIPS32R2, AdditionalRequires<[IsFP64bit]> { + let DecoderNamespace = "Mips64"; +} +def MTHC1_D32 : MMRel, MTC1_64_FT<"mthc1", AFGR64Opnd, GPR32Opnd, II_MTHC1>, MFC1_FM<7>, ISA_MIPS32R2, AdditionalRequires<[NotFP64bit]>; -def MTHC1_D64 : MTC1_64_FT<"mthc1", AFGR64Opnd, GPR32Opnd, II_MTHC1>, +def MTHC1_D64 : MTC1_64_FT<"mthc1", FGR64Opnd, GPR32Opnd, II_MTHC1>, MFC1_FM<7>, ISA_MIPS32R2, AdditionalRequires<[IsFP64bit]> { let DecoderNamespace = "Mips64"; } diff --git a/lib/Target/Mips/MipsInstrFormats.td b/lib/Target/Mips/MipsInstrFormats.td index a708e307f3f2..6a01ae560f3a 100644 --- a/lib/Target/Mips/MipsInstrFormats.td +++ b/lib/Target/Mips/MipsInstrFormats.td @@ -844,6 +844,16 @@ class 
BARRIER_FM op> : StdArch { let Inst{5-0} = 0; // SLL } +class SDBBP_FM : StdArch { + bits<20> code_; + + bits<32> Inst; + + let Inst{31-26} = 0b011100; // SPECIAL2 + let Inst{25-6} = code_; + let Inst{5-0} = 0b111111; // SDBBP +} + class JR_HB_FM op> : StdArch{ bits<5> rs; diff --git a/lib/Target/Mips/MipsInstrInfo.cpp b/lib/Target/Mips/MipsInstrInfo.cpp index d6da6c6b1723..dcc0e24e080b 100644 --- a/lib/Target/Mips/MipsInstrInfo.cpp +++ b/lib/Target/Mips/MipsInstrInfo.cpp @@ -30,15 +30,15 @@ using namespace llvm; // Pin the vtable to this file. void MipsInstrInfo::anchor() {} -MipsInstrInfo::MipsInstrInfo(MipsTargetMachine &tm, unsigned UncondBr) - : MipsGenInstrInfo(Mips::ADJCALLSTACKDOWN, Mips::ADJCALLSTACKUP), - TM(tm), UncondBrOpc(UncondBr) {} +MipsInstrInfo::MipsInstrInfo(const MipsSubtarget &STI, unsigned UncondBr) + : MipsGenInstrInfo(Mips::ADJCALLSTACKDOWN, Mips::ADJCALLSTACKUP), + Subtarget(STI), UncondBrOpc(UncondBr) {} -const MipsInstrInfo *MipsInstrInfo::create(MipsTargetMachine &TM) { - if (TM.getSubtargetImpl()->inMips16Mode()) - return llvm::createMips16InstrInfo(TM); +const MipsInstrInfo *MipsInstrInfo::create(MipsSubtarget &STI) { + if (STI.inMips16Mode()) + return llvm::createMips16InstrInfo(STI); - return llvm::createMipsSEInstrInfo(TM); + return llvm::createMipsSEInstrInfo(STI); } bool MipsInstrInfo::isZeroImm(const MachineOperand &op) const { @@ -94,10 +94,10 @@ bool MipsInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, return (BT == BT_None) || (BT == BT_Indirect); } -void MipsInstrInfo::BuildCondBr(MachineBasicBlock &MBB, - MachineBasicBlock *TBB, DebugLoc DL, - const SmallVectorImpl& Cond) - const { +void +MipsInstrInfo::BuildCondBr(MachineBasicBlock &MBB, MachineBasicBlock *TBB, + DebugLoc DL, + const SmallVectorImpl &Cond) const { unsigned Opc = Cond[0].getImm(); const MCInstrDesc &MCID = get(Opc); MachineInstrBuilder MIB = BuildMI(&MBB, DL, MCID); @@ -113,11 +113,9 @@ void MipsInstrInfo::BuildCondBr(MachineBasicBlock &MBB, MIB.addMBB(TBB); } -unsigned MipsInstrInfo:: -InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, - MachineBasicBlock *FBB, - const SmallVectorImpl &Cond, - DebugLoc DL) const { +unsigned MipsInstrInfo::InsertBranch( + MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, + const SmallVectorImpl &Cond, DebugLoc DL) const { // Shouldn't be a fall through. assert(TBB && "InsertBranch must not be told to insert a fallthrough"); @@ -145,9 +143,7 @@ InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, return 1; } -unsigned MipsInstrInfo:: -RemoveBranch(MachineBasicBlock &MBB) const -{ +unsigned MipsInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { MachineBasicBlock::reverse_iterator I = MBB.rbegin(), REnd = MBB.rend(); MachineBasicBlock::reverse_iterator FirstBr; unsigned removed; @@ -160,7 +156,7 @@ RemoveBranch(MachineBasicBlock &MBB) const // Up to 2 branches are removed. // Note that indirect branches are not removed. - for(removed = 0; I != REnd && removed < 2; ++I, ++removed) + for (removed = 0; I != REnd && removed < 2; ++I, ++removed) if (!getAnalyzableBrOpc(I->getOpcode())) break; @@ -171,20 +167,18 @@ RemoveBranch(MachineBasicBlock &MBB) const /// ReverseBranchCondition - Return the inverse opcode of the /// specified Branch instruction. 
-bool MipsInstrInfo:: -ReverseBranchCondition(SmallVectorImpl &Cond) const -{ +bool MipsInstrInfo::ReverseBranchCondition( + SmallVectorImpl &Cond) const { assert( (Cond.size() && Cond.size() <= 3) && "Invalid Mips branch condition!"); Cond[0].setImm(getOppositeBranchOpc(Cond[0].getImm())); return false; } -MipsInstrInfo::BranchType MipsInstrInfo:: -AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, - MachineBasicBlock *&FBB, SmallVectorImpl &Cond, - bool AllowModify, - SmallVectorImpl &BranchInstrs) const { +MipsInstrInfo::BranchType MipsInstrInfo::AnalyzeBranch( + MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, + SmallVectorImpl &Cond, bool AllowModify, + SmallVectorImpl &BranchInstrs) const { MachineBasicBlock::reverse_iterator I = MBB.rbegin(), REnd = MBB.rend(); diff --git a/lib/Target/Mips/MipsInstrInfo.h b/lib/Target/Mips/MipsInstrInfo.h index 742193f712c1..bdf2fd37ed8c 100644 --- a/lib/Target/Mips/MipsInstrInfo.h +++ b/lib/Target/Mips/MipsInstrInfo.h @@ -33,7 +33,7 @@ namespace llvm { class MipsInstrInfo : public MipsGenInstrInfo { virtual void anchor(); protected: - MipsTargetMachine &TM; + const MipsSubtarget &Subtarget; unsigned UncondBrOpc; public: @@ -46,9 +46,9 @@ class MipsInstrInfo : public MipsGenInstrInfo { BT_Indirect // One indirct branch. }; - explicit MipsInstrInfo(MipsTargetMachine &TM, unsigned UncondBrOpc); + explicit MipsInstrInfo(const MipsSubtarget &STI, unsigned UncondBrOpc); - static const MipsInstrInfo *create(MipsTargetMachine &TM); + static const MipsInstrInfo *create(MipsSubtarget &STI); /// Branch Analysis bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, @@ -140,8 +140,8 @@ class MipsInstrInfo : public MipsGenInstrInfo { }; /// Create MipsInstrInfo objects. -const MipsInstrInfo *createMips16InstrInfo(MipsTargetMachine &TM); -const MipsInstrInfo *createMipsSEInstrInfo(MipsTargetMachine &TM); +const MipsInstrInfo *createMips16InstrInfo(const MipsSubtarget &STI); +const MipsInstrInfo *createMipsSEInstrInfo(const MipsSubtarget &STI); } diff --git a/lib/Target/Mips/MipsInstrInfo.td b/lib/Target/Mips/MipsInstrInfo.td index b1b455769477..8e9472c2d9d8 100644 --- a/lib/Target/Mips/MipsInstrInfo.td +++ b/lib/Target/Mips/MipsInstrInfo.td @@ -146,61 +146,61 @@ def MipsSDR : SDNode<"MipsISD::SDR", SDTStore, //===----------------------------------------------------------------------===// // Mips Instruction Predicate Definitions. 
//===----------------------------------------------------------------------===// -def HasMips2 : Predicate<"Subtarget.hasMips2()">, +def HasMips2 : Predicate<"Subtarget->hasMips2()">, AssemblerPredicate<"FeatureMips2">; -def HasMips3_32 : Predicate<"Subtarget.hasMips3_32()">, +def HasMips3_32 : Predicate<"Subtarget->hasMips3_32()">, AssemblerPredicate<"FeatureMips3_32">; -def HasMips3_32r2 : Predicate<"Subtarget.hasMips3_32r2()">, +def HasMips3_32r2 : Predicate<"Subtarget->hasMips3_32r2()">, AssemblerPredicate<"FeatureMips3_32r2">; -def HasMips3 : Predicate<"Subtarget.hasMips3()">, +def HasMips3 : Predicate<"Subtarget->hasMips3()">, AssemblerPredicate<"FeatureMips3">; -def HasMips4_32 : Predicate<"Subtarget.hasMips4_32()">, +def HasMips4_32 : Predicate<"Subtarget->hasMips4_32()">, AssemblerPredicate<"FeatureMips4_32">; -def HasMips4_32r2 : Predicate<"Subtarget.hasMips4_32r2()">, +def HasMips4_32r2 : Predicate<"Subtarget->hasMips4_32r2()">, AssemblerPredicate<"FeatureMips4_32r2">; -def HasMips5_32r2 : Predicate<"Subtarget.hasMips5_32r2()">, +def HasMips5_32r2 : Predicate<"Subtarget->hasMips5_32r2()">, AssemblerPredicate<"FeatureMips5_32r2">; -def HasMips32 : Predicate<"Subtarget.hasMips32()">, +def HasMips32 : Predicate<"Subtarget->hasMips32()">, AssemblerPredicate<"FeatureMips32">; -def HasMips32r2 : Predicate<"Subtarget.hasMips32r2()">, +def HasMips32r2 : Predicate<"Subtarget->hasMips32r2()">, AssemblerPredicate<"FeatureMips32r2">; -def HasMips32r6 : Predicate<"Subtarget.hasMips32r6()">, +def HasMips32r6 : Predicate<"Subtarget->hasMips32r6()">, AssemblerPredicate<"FeatureMips32r6">; -def NotMips32r6 : Predicate<"!Subtarget.hasMips32r6()">, +def NotMips32r6 : Predicate<"!Subtarget->hasMips32r6()">, AssemblerPredicate<"!FeatureMips32r6">; -def IsGP64bit : Predicate<"Subtarget.isGP64bit()">, +def IsGP64bit : Predicate<"Subtarget->isGP64bit()">, AssemblerPredicate<"FeatureGP64Bit">; -def IsGP32bit : Predicate<"!Subtarget.isGP64bit()">, +def IsGP32bit : Predicate<"!Subtarget->isGP64bit()">, AssemblerPredicate<"!FeatureGP64Bit">; -def HasMips64 : Predicate<"Subtarget.hasMips64()">, +def HasMips64 : Predicate<"Subtarget->hasMips64()">, AssemblerPredicate<"FeatureMips64">; -def HasMips64r2 : Predicate<"Subtarget.hasMips64r2()">, +def HasMips64r2 : Predicate<"Subtarget->hasMips64r2()">, AssemblerPredicate<"FeatureMips64r2">; -def HasMips64r6 : Predicate<"Subtarget.hasMips64r6()">, +def HasMips64r6 : Predicate<"Subtarget->hasMips64r6()">, AssemblerPredicate<"FeatureMips64r6">; -def NotMips64r6 : Predicate<"!Subtarget.hasMips64r6()">, +def NotMips64r6 : Predicate<"!Subtarget->hasMips64r6()">, AssemblerPredicate<"!FeatureMips64r6">; -def IsN64 : Predicate<"Subtarget.isABI_N64()">, +def IsN64 : Predicate<"Subtarget->isABI_N64()">, AssemblerPredicate<"FeatureN64">; -def InMips16Mode : Predicate<"Subtarget.inMips16Mode()">, +def InMips16Mode : Predicate<"Subtarget->inMips16Mode()">, AssemblerPredicate<"FeatureMips16">; -def HasCnMips : Predicate<"Subtarget.hasCnMips()">, +def HasCnMips : Predicate<"Subtarget->hasCnMips()">, AssemblerPredicate<"FeatureCnMips">; def RelocStatic : Predicate<"TM.getRelocationModel() == Reloc::Static">, AssemblerPredicate<"FeatureMips32">; def RelocPIC : Predicate<"TM.getRelocationModel() == Reloc::PIC_">, AssemblerPredicate<"FeatureMips32">; def NoNaNsFPMath : Predicate<"TM.Options.NoNaNsFPMath">; -def HasStdEnc : Predicate<"Subtarget.hasStandardEncoding()">, +def HasStdEnc : Predicate<"Subtarget->hasStandardEncoding()">, AssemblerPredicate<"!FeatureMips16">; -def NotDSP : 
Predicate<"!Subtarget.hasDSP()">; -def InMicroMips : Predicate<"Subtarget.inMicroMipsMode()">, +def NotDSP : Predicate<"!Subtarget->hasDSP()">; +def InMicroMips : Predicate<"Subtarget->inMicroMipsMode()">, AssemblerPredicate<"FeatureMicroMips">; -def NotInMicroMips : Predicate<"!Subtarget.inMicroMipsMode()">, +def NotInMicroMips : Predicate<"!Subtarget->inMicroMipsMode()">, AssemblerPredicate<"!FeatureMicroMips">; -def IsLE : Predicate<"Subtarget.isLittle()">; -def IsBE : Predicate<"!Subtarget.isLittle()">; -def IsNotNaCl : Predicate<"!Subtarget.isTargetNaCl()">; +def IsLE : Predicate<"Subtarget->isLittle()">; +def IsBE : Predicate<"!Subtarget->isLittle()">; +def IsNotNaCl : Predicate<"!Subtarget->isTargetNaCl()">; //===----------------------------------------------------------------------===// // Mips GPR size adjectives. @@ -743,20 +743,11 @@ class JumpFR; // Indirect branch -class IndirectBranch : - JumpFR { +class IndirectBranch : JumpFR { let isBranch = 1; let isIndirectBranch = 1; } -// Return instruction -class RetBase<string opstr, RegisterOperand RO>: JumpFR<opstr, RO> { - let isReturn = 1; - let isCodeGenOnly = 1; - let hasCtrlDep = 1; - let hasExtraSrcRegAllocReq = 1; -} - // Jump and Link (Call) let isCall=1, hasDelaySlot=1, Defs = [RA] in { class JumpLink : @@ -1180,6 +1171,7 @@ def TNEI : MMRel, TEQI_FT<"tnei", GPR32Opnd>, TEQI_FM<0xe>, def BREAK : MMRel, BRK_FT<"break">, BRK_FM<0xd>; def SYSCALL : MMRel, SYS_FT<"syscall">, SYS_FM<0xc>; def TRAP : TrapBase; +def SDBBP : SYS_FT<"sdbbp">, SDBBP_FM, ISA_MIPS32_NOT_32R6_64R6; def ERET : MMRel, ER_FT<"eret">, ER_FM<0x18>, INSN_MIPS3_32; def DERET : MMRel, ER_FT<"deret">, ER_FM<0x1f>, ISA_MIPS32; @@ -1228,7 +1220,35 @@ def BAL_BR : BAL_BR_Pseudo; def TAILCALL : TailCall; def TAILCALL_R : TailCallReg; -def RET : MMRel, RetBase<"ret", GPR32Opnd>, MTLO_FM<8>; +// Indirect branches are matched as PseudoIndirectBranch/PseudoIndirectBranch64 +// then are expanded to JR, JR64, JALR, or JALR64 depending on the ISA. +class PseudoIndirectBranchBase<RegisterOperand RO> : + MipsPseudo<(outs), (ins RO:$rs), [(brind RO:$rs)], IIBranch> { + let isTerminator=1; + let isBarrier=1; + let hasDelaySlot = 1; + let isBranch = 1; + let isIndirectBranch = 1; +} + +def PseudoIndirectBranch : PseudoIndirectBranchBase<GPR32Opnd>; + +// Return instructions are matched as a RetRA instruction, then are expanded +// into PseudoReturn/PseudoReturn64 after register allocation. Finally, +// MipsAsmPrinter expands this into JR, JR64, JALR, or JALR64 depending on the +// ISA. +class PseudoReturnBase<RegisterOperand RO> : MipsPseudo<(outs), (ins RO:$rs), + [], IIBranch> { + let isTerminator = 1; + let isBarrier = 1; + let hasDelaySlot = 1; + let isReturn = 1; + let isCodeGenOnly = 1; + let hasCtrlDep = 1; + let hasExtraSrcRegAllocReq = 1; +} + +def PseudoReturn : PseudoReturnBase<GPR32Opnd>; // Exception handling related node and instructions.
// The conversion sequence is: @@ -1488,6 +1508,7 @@ def : MipsInstAlias<"sra $rd, $rt, $rs", (SRAV GPR32Opnd:$rd, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>; def : MipsInstAlias<"srl $rd, $rt, $rs", (SRLV GPR32Opnd:$rd, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>; +def : MipsInstAlias<"sdbbp", (SDBBP 0)>, ISA_MIPS32_NOT_32R6_64R6; def : MipsInstAlias<"sync", (SYNC 0), 1>, ISA_MIPS2; //===----------------------------------------------------------------------===// diff --git a/lib/Target/Mips/MipsLongBranch.cpp b/lib/Target/Mips/MipsLongBranch.cpp index c6838a37be28..27110b6e870e 100644 --- a/lib/Target/Mips/MipsLongBranch.cpp +++ b/lib/Target/Mips/MipsLongBranch.cpp @@ -449,7 +449,8 @@ bool MipsLongBranch::runOnMachineFunction(MachineFunction &F) { const MipsInstrInfo *TII = static_cast(TM.getInstrInfo()); - if (TM.getSubtarget().inMips16Mode()) + const MipsSubtarget &STI = TM.getSubtarget(); + if (STI.inMips16Mode() || !STI.enableLongBranchPass()) return false; if ((TM.getRelocationModel() == Reloc::PIC_) && TM.getSubtarget().isABI_O32() && diff --git a/lib/Target/Mips/MipsMSAInstrFormats.td b/lib/Target/Mips/MipsMSAInstrFormats.td index 6bd0366b52e7..bff2d0fab1ec 100644 --- a/lib/Target/Mips/MipsMSAInstrFormats.td +++ b/lib/Target/Mips/MipsMSAInstrFormats.td @@ -7,7 +7,7 @@ // //===----------------------------------------------------------------------===// -def HasMSA : Predicate<"Subtarget.hasMSA()">, +def HasMSA : Predicate<"Subtarget->hasMSA()">, AssemblerPredicate<"FeatureMSA">; class MSAInst : MipsInst<(outs), (ins), "", [], NoItinerary, FrmOther> { diff --git a/lib/Target/Mips/MipsMachineFunction.cpp b/lib/Target/Mips/MipsMachineFunction.cpp index e30302e0afdd..bc896be4e1de 100644 --- a/lib/Target/Mips/MipsMachineFunction.cpp +++ b/lib/Target/Mips/MipsMachineFunction.cpp @@ -137,4 +137,12 @@ MachinePointerInfo MipsFunctionInfo::callPtrInfo(const GlobalValue *Val) { return MachinePointerInfo(E); } +int MipsFunctionInfo::getMoveF64ViaSpillFI(const TargetRegisterClass *RC) { + if (MoveF64ViaSpillFI == -1) { + MoveF64ViaSpillFI = MF.getFrameInfo()->CreateStackObject( + RC->getSize(), RC->getAlignment(), false); + } + return MoveF64ViaSpillFI; +} + void MipsFunctionInfo::anchor() { } diff --git a/lib/Target/Mips/MipsMachineFunction.h b/lib/Target/Mips/MipsMachineFunction.h index e9101cc77c0d..61260e578159 100644 --- a/lib/Target/Mips/MipsMachineFunction.h +++ b/lib/Target/Mips/MipsMachineFunction.h @@ -15,7 +15,6 @@ #define MIPS_MACHINE_FUNCTION_INFO_H #include "Mips16HardFloatInfo.h" -#include "MipsSubtarget.h" #include "llvm/ADT/StringMap.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -55,7 +54,8 @@ class MipsFunctionInfo : public MachineFunctionInfo { public: MipsFunctionInfo(MachineFunction &MF) : MF(MF), SRetReturnReg(0), GlobalBaseReg(0), Mips16SPAliasReg(0), - VarArgsFrameIndex(0), CallsEhReturn(false), SaveS2(false) {} + VarArgsFrameIndex(0), CallsEhReturn(false), SaveS2(false), + MoveF64ViaSpillFI(-1) {} ~MipsFunctionInfo(); @@ -97,6 +97,8 @@ class MipsFunctionInfo : public MachineFunctionInfo { void setSaveS2() { SaveS2 = true; } bool hasSaveS2() const { return SaveS2; } + int getMoveF64ViaSpillFI(const TargetRegisterClass *RC); + std::map StubsNeeded; @@ -137,6 +139,10 @@ class MipsFunctionInfo : public MachineFunctionInfo { // saveS2 bool SaveS2; + /// FrameIndex for expanding BuildPairF64 nodes to spill and reload when the + /// O32 FPXX ABI is enabled. -1 is used to denote invalid index. + int MoveF64ViaSpillFI; + /// MipsCallEntry maps. 
StringMap<const MipsCallEntry *> ExternalCallEntries; ValueMap<const GlobalValue *, const MipsCallEntry *> GlobalCallEntries; diff --git a/lib/Target/Mips/MipsModuleISelDAGToDAG.cpp b/lib/Target/Mips/MipsModuleISelDAGToDAG.cpp index 03c76eaf1996..b011e8fcd8b7 100644 --- a/lib/Target/Mips/MipsModuleISelDAGToDAG.cpp +++ b/lib/Target/Mips/MipsModuleISelDAGToDAG.cpp @@ -20,7 +20,7 @@ namespace llvm { bool MipsModuleDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) { DEBUG(errs() << "In MipsModuleDAGToDAGISel::runMachineFunction\n"); - const_cast<MipsSubtarget &>(Subtarget).resetSubtarget(&MF); + TM.resetSubtarget(&MF); return false; } diff --git a/lib/Target/Mips/MipsModuleISelDAGToDAG.h b/lib/Target/Mips/MipsModuleISelDAGToDAG.h index a96862a6a11a..f7a03104880b 100644 --- a/lib/Target/Mips/MipsModuleISelDAGToDAG.h +++ b/lib/Target/Mips/MipsModuleISelDAGToDAG.h @@ -37,8 +37,7 @@ class MipsModuleDAGToDAGISel : public MachineFunctionPass { static char ID; explicit MipsModuleDAGToDAGISel(MipsTargetMachine &TM_) - : MachineFunctionPass(ID), - TM(TM_), Subtarget(TM.getSubtarget<MipsSubtarget>()) {} + : MachineFunctionPass(ID), TM(TM_) {} // Pass Name const char *getPassName() const override { @@ -48,10 +47,7 @@ class MipsModuleDAGToDAGISel : public MachineFunctionPass { bool runOnMachineFunction(MachineFunction &MF) override; protected: - /// Keep a pointer to the MipsSubtarget around so that we can make the right - /// decision when generating code for different targets. - const TargetMachine &TM; - const MipsSubtarget &Subtarget; + MipsTargetMachine &TM; }; /// createMipsISelDag - This pass converts a legalized DAG into a diff --git a/lib/Target/Mips/MipsOptionRecord.h b/lib/Target/Mips/MipsOptionRecord.h new file mode 100644 index 000000000000..c0abce3eaddb --- /dev/null +++ b/lib/Target/Mips/MipsOptionRecord.h @@ -0,0 +1,80 @@ +//===-- MipsOptionRecord.h - Abstraction for storing information ----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// MipsOptionRecord - Abstraction for storing arbitrary information in +// ELF files. Arbitrary information (e.g. register usage) can be stored in Mips +// specific ELF sections like .Mips.options. Specific records should subclass +// MipsOptionRecord and provide an implementation to EmitMipsOptionRecord which +// basically just dumps the information into an ELF section. More information +// about .Mips.options can be found in the SysV ABI and the 64-bit ELF Object +// specification.
+// +//===----------------------------------------------------------------------===// + +#ifndef MIPSOPTIONRECORD_H +#define MIPSOPTIONRECORD_H + +#include "MipsMCTargetDesc.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCRegisterInfo.h" + +using namespace llvm; + +namespace llvm { +class MipsELFStreamer; +class MCSubtargetInfo; +} + +class MipsOptionRecord { +public: + virtual ~MipsOptionRecord(){}; + virtual void EmitMipsOptionRecord() = 0; +}; + +class MipsRegInfoRecord : public MipsOptionRecord { +public: + MipsRegInfoRecord(MipsELFStreamer *S, MCContext &Context, + const MCSubtargetInfo &STI) + : Streamer(S), Context(Context), STI(STI) { + ri_gprmask = 0; + ri_cprmask[0] = ri_cprmask[1] = ri_cprmask[2] = ri_cprmask[3] = 0; + ri_gp_value = 0; + + const MCRegisterInfo *TRI = Context.getRegisterInfo(); + GPR32RegClass = &(TRI->getRegClass(Mips::GPR32RegClassID)); + GPR64RegClass = &(TRI->getRegClass(Mips::GPR64RegClassID)); + FGR32RegClass = &(TRI->getRegClass(Mips::FGR32RegClassID)); + FGR64RegClass = &(TRI->getRegClass(Mips::FGR64RegClassID)); + AFGR64RegClass = &(TRI->getRegClass(Mips::AFGR64RegClassID)); + MSA128BRegClass = &(TRI->getRegClass(Mips::MSA128BRegClassID)); + COP2RegClass = &(TRI->getRegClass(Mips::COP2RegClassID)); + COP3RegClass = &(TRI->getRegClass(Mips::COP3RegClassID)); + } + ~MipsRegInfoRecord() {} + + void EmitMipsOptionRecord(); + void SetPhysRegUsed(unsigned Reg, const MCRegisterInfo *MCRegInfo); + +private: + MipsELFStreamer *Streamer; + MCContext &Context; + const MCSubtargetInfo &STI; + const MCRegisterClass *GPR32RegClass; + const MCRegisterClass *GPR64RegClass; + const MCRegisterClass *FGR32RegClass; + const MCRegisterClass *FGR64RegClass; + const MCRegisterClass *AFGR64RegClass; + const MCRegisterClass *MSA128BRegClass; + const MCRegisterClass *COP2RegClass; + const MCRegisterClass *COP3RegClass; + uint32_t ri_gprmask; + uint32_t ri_cprmask[4]; + int64_t ri_gp_value; +}; +#endif diff --git a/lib/Target/Mips/MipsRegisterInfo.cpp b/lib/Target/Mips/MipsRegisterInfo.cpp index 83d25ab469c3..084449bba59c 100644 --- a/lib/Target/Mips/MipsRegisterInfo.cpp +++ b/lib/Target/Mips/MipsRegisterInfo.cpp @@ -93,6 +93,9 @@ MipsRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { if (Subtarget.isFP64bit()) return CSR_O32_FP64_SaveList; + if (Subtarget.isFPXX()) + return CSR_O32_FPXX_SaveList; + return CSR_O32_SaveList; } @@ -110,6 +113,9 @@ MipsRegisterInfo::getCallPreservedMask(CallingConv::ID) const { if (Subtarget.isFP64bit()) return CSR_O32_FP64_RegMask; + if (Subtarget.isFPXX()) + return CSR_O32_FPXX_RegMask; + return CSR_O32_RegMask; } @@ -201,6 +207,11 @@ getReservedRegs(const MachineFunction &MF) const { Reserved.set(Mips::GP_64); } + if (Subtarget.isABI_O32() && !Subtarget.useOddSPReg()) { + for (const auto &Reg : Mips::OddSPRegClass) + Reserved.set(Reg); + } + return Reserved; } diff --git a/lib/Target/Mips/MipsRegisterInfo.td b/lib/Target/Mips/MipsRegisterInfo.td index b5897af52cdc..74dfa4fe7d9c 100644 --- a/lib/Target/Mips/MipsRegisterInfo.td +++ b/lib/Target/Mips/MipsRegisterInfo.td @@ -340,6 +340,15 @@ def AFGR64 : RegisterClass<"Mips", [f64], 64, (add def FGR64 : RegisterClass<"Mips", [f64], 64, (sequence "D%u_64", 0, 31)>; +// Used to reserve odd registers when given -mattr=+nooddspreg +// FIXME: Remove double precision registers from this set. 
+def OddSP : RegisterClass<"Mips", [f32], 32, + (add (decimate (sequence "F%u", 1, 31), 2), + (decimate (sequence "F_HI%u", 1, 31), 2), + (decimate (sequence "D%u", 1, 15), 2), + (decimate (sequence "D%u_64", 1, 31), 2))>, + Unallocatable; + // FP control registers. def CCR : RegisterClass<"Mips", [i32], 32, (sequence "FCR%u", 0, 31)>, Unallocatable; diff --git a/lib/Target/Mips/MipsSEFrameLowering.cpp b/lib/Target/Mips/MipsSEFrameLowering.cpp index 6ad5821571d7..d0a17cd834a0 100644 --- a/lib/Target/Mips/MipsSEFrameLowering.cpp +++ b/lib/Target/Mips/MipsSEFrameLowering.cpp @@ -16,6 +16,7 @@ #include "MipsAnalyzeImmediate.h" #include "MipsMachineFunction.h" #include "MipsSEInstrInfo.h" +#include "MipsSubtarget.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -63,6 +64,10 @@ class ExpandPseudo { bool expandCopy(MachineBasicBlock &MBB, Iter I); bool expandCopyACC(MachineBasicBlock &MBB, Iter I, unsigned MFHiOpc, unsigned MFLoOpc); + bool expandBuildPairF64(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, bool FP64) const; + bool expandExtractElementF64(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, bool FP64) const; MachineFunction &MF; MachineRegisterInfo &MRI; @@ -107,6 +112,22 @@ bool ExpandPseudo::expandInstr(MachineBasicBlock &MBB, Iter I) { case Mips::STORE_ACC128: expandStoreACC(MBB, I, Mips::PseudoMFHI64, Mips::PseudoMFLO64, 8); break; + case Mips::BuildPairF64: + if (expandBuildPairF64(MBB, I, false)) + MBB.erase(I); + return false; + case Mips::BuildPairF64_64: + if (expandBuildPairF64(MBB, I, true)) + MBB.erase(I); + return false; + case Mips::ExtractElementF64: + if (expandExtractElementF64(MBB, I, false)) + MBB.erase(I); + return false; + case Mips::ExtractElementF64_64: + if (expandExtractElementF64(MBB, I, true)) + MBB.erase(I); + return false; case TargetOpcode::COPY: if (!expandCopy(MBB, I)) return false; @@ -257,6 +278,123 @@ bool ExpandPseudo::expandCopyACC(MachineBasicBlock &MBB, Iter I, return true; } +/// This method expands the same instruction that MipsSEInstrInfo:: +/// expandBuildPairF64 does, for the case when ABI is fpxx and mthc1 is not +/// available and the case where the ABI is FP64A. It is implemented here +/// because frame indexes are eliminated before MipsSEInstrInfo:: +/// expandBuildPairF64 is called. +bool ExpandPseudo::expandBuildPairF64(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + bool FP64) const { + // For fpxx and when mthc1 is not available, use: + // spill + reload via ldc1 + // + // The case where dmtc1 is available doesn't need to be handled here + // because it never creates a BuildPairF64 node. + // + // The FP64A ABI (fp64 with nooddspreg) must also use a spill/reload sequence + // for odd-numbered double precision values (because the lower 32-bits is + // transferred with mtc1 which is redirected to the upper half of the even + // register). Unfortunately, we have to make this decision before register + // allocation so for now we use a spill/reload sequence for all + // double-precision values in regardless of being an odd/even register. 
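+  //
+  // Under those constraints the expansion below amounts to (O32, little
+  // endian; the exact slot offset comes from getMoveF64ViaSpillFI):
+  //   sw   $lo, 0(slot)
+  //   sw   $hi, 4(slot)
+  //   ldc1 $fd, 0(slot)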
+
+  const TargetMachine &TM = MF.getTarget();
+  const MipsSubtarget &Subtarget = TM.getSubtarget<MipsSubtarget>();
+  if ((Subtarget.isABI_FPXX() && !Subtarget.hasMTHC1()) ||
+      (FP64 && !Subtarget.useOddSPReg())) {
+    const MipsSEInstrInfo &TII =
+        *static_cast<const MipsSEInstrInfo *>(TM.getInstrInfo());
+    const MipsRegisterInfo &TRI =
+        *static_cast<const MipsRegisterInfo *>(TM.getRegisterInfo());
+
+    unsigned DstReg = I->getOperand(0).getReg();
+    unsigned LoReg = I->getOperand(1).getReg();
+    unsigned HiReg = I->getOperand(2).getReg();
+
+    // It should be impossible to have FGR64 on MIPS-II or MIPS32r1 (which are
+    // the cases where mthc1 is not available). 64-bit architectures and
+    // MIPS32r2 or later can use FGR64 though.
+    assert(Subtarget.isGP64bit() || Subtarget.hasMTHC1() ||
+           !Subtarget.isFP64bit());
+
+    const TargetRegisterClass *RC = &Mips::GPR32RegClass;
+    const TargetRegisterClass *RC2 =
+        FP64 ? &Mips::FGR64RegClass : &Mips::AFGR64RegClass;
+
+    // We re-use the same spill slot each time so that the stack frame doesn't
+    // grow too much in functions with a large number of moves.
+    int FI = MF.getInfo<MipsFunctionInfo>()->getMoveF64ViaSpillFI(RC2);
+    TII.storeRegToStack(MBB, I, LoReg, I->getOperand(1).isKill(), FI, RC,
+                        &TRI, 0);
+    TII.storeRegToStack(MBB, I, HiReg, I->getOperand(2).isKill(), FI, RC,
+                        &TRI, 4);
+    TII.loadRegFromStack(MBB, I, DstReg, FI, RC2, &TRI, 0);
+    return true;
+  }
+
+  return false;
+}
+
+/// This method expands the same instruction that MipsSEInstrInfo::
+/// expandExtractElementF64 does, for the case where the ABI is FPXX and mfhc1
+/// is not available, and the case where the ABI is FP64A. It is implemented
+/// here because frame indexes are eliminated before MipsSEInstrInfo::
+/// expandExtractElementF64 is called.
+bool ExpandPseudo::expandExtractElementF64(MachineBasicBlock &MBB,
+                                           MachineBasicBlock::iterator I,
+                                           bool FP64) const {
+  // For FPXX, and when mfhc1 is not available, use:
+  //   spill via sdc1 + reload via lw
+  //
+  // The case where dmfc1 is available doesn't need to be handled here
+  // because it never creates an ExtractElementF64 node.
+  //
+  // The FP64A ABI (fp64 with nooddspreg) must also use a spill/reload sequence
+  // for odd-numbered double precision values (because the lower 32-bits is
+  // transferred with mfc1 which is redirected to the upper half of the even
+  // register). Unfortunately, we have to make this decision before register
+  // allocation so for now we use a spill/reload sequence for all
+  // double-precision values regardless of being an odd/even register.
+
+  const TargetMachine &TM = MF.getTarget();
+  const MipsSubtarget &Subtarget = TM.getSubtarget<MipsSubtarget>();
+  if ((Subtarget.isABI_FPXX() && !Subtarget.hasMTHC1()) ||
+      (FP64 && !Subtarget.useOddSPReg())) {
+    const MipsSEInstrInfo &TII =
+        *static_cast<const MipsSEInstrInfo *>(TM.getInstrInfo());
+    const MipsRegisterInfo &TRI =
+        *static_cast<const MipsRegisterInfo *>(TM.getRegisterInfo());
+
+    unsigned DstReg = I->getOperand(0).getReg();
+    unsigned SrcReg = I->getOperand(1).getReg();
+    unsigned N = I->getOperand(2).getImm();
+
+    // It should be impossible to have FGR64 on MIPS-II or MIPS32r1 (which are
+    // the cases where mfhc1 is not available). 64-bit architectures and
+    // MIPS32r2 or later can use FGR64 though.
+    assert(Subtarget.isGP64bit() || Subtarget.hasMTHC1() ||
+           !Subtarget.isFP64bit());
+
+    const TargetRegisterClass *RC =
+        FP64 ? &Mips::FGR64RegClass : &Mips::AFGR64RegClass;
+    const TargetRegisterClass *RC2 = &Mips::GPR32RegClass;
+
+    // We re-use the same spill slot each time so that the stack frame doesn't
+    // grow too much in functions with a large number of moves.
+    int FI = MF.getInfo<MipsFunctionInfo>()->getMoveF64ViaSpillFI(RC);
+    TII.storeRegToStack(MBB, I, SrcReg, I->getOperand(1).isKill(), FI, RC,
+                        &TRI, 0);
+    TII.loadRegFromStack(MBB, I, DstReg, FI, RC2, &TRI, N * 4);
+    return true;
+  }
+
+  return false;
+}
+
+MipsSEFrameLowering::MipsSEFrameLowering(const MipsSubtarget &STI)
+    : MipsFrameLowering(STI, STI.stackAlignment()) {}
+
 unsigned MipsSEFrameLowering::ehDataReg(unsigned I) const {
   static const unsigned EhDataReg[] = {
     Mips::A0, Mips::A1, Mips::A2, Mips::A3
@@ -339,6 +477,22 @@ void MipsSEFrameLowering::emitPrologue(MachineFunction &MF) const {
             MCCFIInstruction::createOffset(nullptr, Reg1, Offset + 4));
         BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
             .addCFIIndex(CFIIndex);
+      } else if (Mips::FGR64RegClass.contains(Reg)) {
+        unsigned Reg0 = MRI->getDwarfRegNum(Reg, true);
+        unsigned Reg1 = MRI->getDwarfRegNum(Reg, true) + 1;
+
+        if (!STI.isLittle())
+          std::swap(Reg0, Reg1);
+
+        unsigned CFIIndex = MMI.addFrameInst(
+            MCCFIInstruction::createOffset(nullptr, Reg0, Offset));
+        BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+            .addCFIIndex(CFIIndex);
+
+        CFIIndex = MMI.addFrameInst(
+            MCCFIInstruction::createOffset(nullptr, Reg1, Offset + 4));
+        BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+            .addCFIIndex(CFIIndex);
       } else {
         // Reg is either in GPR32 or FGR32.
         unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset(
diff --git a/lib/Target/Mips/MipsSEFrameLowering.h b/lib/Target/Mips/MipsSEFrameLowering.h
index 5d2801ffb213..e832848754dc 100644
--- a/lib/Target/Mips/MipsSEFrameLowering.h
+++ b/lib/Target/Mips/MipsSEFrameLowering.h
@@ -20,8 +20,7 @@ namespace llvm {
 class MipsSEFrameLowering : public MipsFrameLowering {
 public:
-  explicit MipsSEFrameLowering(const MipsSubtarget &STI)
-    : MipsFrameLowering(STI, STI.stackAlignment()) {}
+  explicit MipsSEFrameLowering(const MipsSubtarget &STI);
 
   /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
   /// the function.
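The two ExpandPseudo hooks above gate their spill/reload path on the same
subtarget predicate. A minimal sketch of that predicate, factored into a free
function purely for illustration (the helper name is hypothetical; the patch
itself keeps the condition inline in both expansions):

    // Illustrative only; mirrors the guard in ExpandPseudo::expandBuildPairF64
    // and ExpandPseudo::expandExtractElementF64 above.
    static bool needsSpillReloadF64Move(const MipsSubtarget &ST, bool FP64) {
      // FPXX without mthc1 (MIPS-II / MIPS32r1): the upper 32 bits of a double
      // cannot be moved between a GPR and an FPR directly.
      if (ST.isABI_FPXX() && !ST.hasMTHC1())
        return true;
      // FP64A (fp64 plus nooddspreg): mtc1/mfc1 on an odd-numbered single is
      // redirected to the upper half of the even register, so direct moves
      // would read or write the wrong bits.
      if (FP64 && !ST.useOddSPReg())
        return true;
      return false;
    }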
diff --git a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
index d5385be7ddb0..47e193191c3e 100644
--- a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
+++ b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
@@ -37,7 +37,8 @@ using namespace llvm;
 #define DEBUG_TYPE "mips-isel"
 
 bool MipsSEDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
-  if (Subtarget.inMips16Mode())
+  Subtarget = &TM.getSubtarget<MipsSubtarget>();
+  if (Subtarget->inMips16Mode())
     return false;
   return MipsDAGToDAGISel::runOnMachineFunction(MF);
 }
@@ -134,7 +135,7 @@ void MipsSEDAGToDAGISel::initGlobalBaseReg(MachineFunction &MF) {
   unsigned V0, V1, GlobalBaseReg = MipsFI->getGlobalBaseReg();
   const TargetRegisterClass *RC;
 
-  if (Subtarget.isABI_N64())
+  if (Subtarget->isABI_N64())
     RC = (const TargetRegisterClass*)&Mips::GPR64RegClass;
   else
     RC = (const TargetRegisterClass*)&Mips::GPR32RegClass;
@@ -142,7 +143,7 @@ void MipsSEDAGToDAGISel::initGlobalBaseReg(MachineFunction &MF) {
   V0 = RegInfo.createVirtualRegister(RC);
   V1 = RegInfo.createVirtualRegister(RC);
 
-  if (Subtarget.isABI_N64()) {
+  if (Subtarget->isABI_N64()) {
     MF.getRegInfo().addLiveIn(Mips::T9_64);
     MBB.addLiveIn(Mips::T9_64);
 
@@ -174,7 +175,7 @@ void MipsSEDAGToDAGISel::initGlobalBaseReg(MachineFunction &MF) {
     MF.getRegInfo().addLiveIn(Mips::T9);
     MBB.addLiveIn(Mips::T9);
 
-    if (Subtarget.isABI_N32()) {
+    if (Subtarget->isABI_N32()) {
       // lui $v0, %hi(%neg(%gp_rel(fname)))
       // addu $v1, $v0, $t9
       // addiu $globalbasereg, $v1, %lo(%neg(%gp_rel(fname)))
@@ -187,7 +188,7 @@ void MipsSEDAGToDAGISel::initGlobalBaseReg(MachineFunction &MF) {
     return;
   }
 
-  assert(Subtarget.isABI_O32());
+  assert(Subtarget->isABI_O32());
 
   // For O32 ABI, the following instruction sequence is emitted to initialize
   // the global base register:
@@ -408,7 +409,7 @@ bool MipsSEDAGToDAGISel::selectIntAddrMSA(SDValue Addr, SDValue &Base,
 // * MSA is enabled
 // * N is a ISD::BUILD_VECTOR representing a constant splat
 bool MipsSEDAGToDAGISel::selectVSplat(SDNode *N, APInt &Imm) const {
-  if (!Subtarget.hasMSA())
+  if (!Subtarget->hasMSA())
     return false;
 
   BuildVectorSDNode *Node = dyn_cast<BuildVectorSDNode>(N);
@@ -422,7 +423,7 @@ bool MipsSEDAGToDAGISel::selectVSplat(SDNode *N, APInt &Imm) const {
 
   if (!Node->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
                              HasAnyUndefs, 8,
-                             !Subtarget.isLittle()))
+                             !Subtarget->isLittle()))
     return false;
 
   Imm = SplatValue;
@@ -648,7 +649,7 @@ std::pair<bool, SDNode*> MipsSEDAGToDAGISel::selectNode(SDNode *Node) {
   }
 
   case ISD::ADDE: {
-    if (Subtarget.hasDSP()) // Select DSP instructions, ADDSC and ADDWC.
+    if (Subtarget->hasDSP()) // Select DSP instructions, ADDSC and ADDWC.
       break;
     SDValue InFlag = Node->getOperand(2);
     Result = selectAddESubE(Mips::ADDu, InFlag, InFlag.getValue(0), DL, Node);
@@ -658,11 +659,11 @@ std::pair<bool, SDNode*> MipsSEDAGToDAGISel::selectNode(SDNode *Node) {
   case ISD::ConstantFP: {
     ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(Node);
     if (Node->getValueType(0) == MVT::f64 && CN->isExactlyValue(+0.0)) {
-      if (Subtarget.isGP64bit()) {
+      if (Subtarget->isGP64bit()) {
         SDValue Zero = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL,
                                               Mips::ZERO_64, MVT::i64);
         Result = CurDAG->getMachineNode(Mips::DMTC1, DL, MVT::f64, Zero);
-      } else if (Subtarget.isFP64bit()) {
+      } else if (Subtarget->isFP64bit()) {
         SDValue Zero = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL,
                                               Mips::ZERO, MVT::i32);
         Result = CurDAG->getMachineNode(Mips::BuildPairF64_64, DL, MVT::f64,
@@ -813,12 +814,12 @@ std::pair<bool, SDNode*> MipsSEDAGToDAGISel::selectNode(SDNode *Node) {
     EVT ResVecTy = BVN->getValueType(0);
     EVT ViaVecTy;
 
-    if (!Subtarget.hasMSA() || !BVN->getValueType(0).is128BitVector())
+    if (!Subtarget->hasMSA() || !BVN->getValueType(0).is128BitVector())
       return std::make_pair(false, nullptr);
 
     if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
                               HasAnyUndefs, 8,
-                              !Subtarget.isLittle()))
+                              !Subtarget->isLittle()))
       return std::make_pair(false, nullptr);
 
     switch (SplatBitSize) {
diff --git a/lib/Target/Mips/MipsSEISelLowering.cpp b/lib/Target/Mips/MipsSEISelLowering.cpp
index cc7ed714beb5..8173615cdcb5 100644
--- a/lib/Target/Mips/MipsSEISelLowering.cpp
+++ b/lib/Target/Mips/MipsSEISelLowering.cpp
@@ -34,15 +34,16 @@ static cl::opt<bool>
 NoDPLoadStore("mno-ldc1-sdc1", cl::init(false),
               cl::desc("Expand double precision loads and "
                        "stores to their single precision "
                        "counterparts"));
 
-MipsSETargetLowering::MipsSETargetLowering(MipsTargetMachine &TM)
-    : MipsTargetLowering(TM) {
+MipsSETargetLowering::MipsSETargetLowering(MipsTargetMachine &TM,
+                                           const MipsSubtarget &STI)
+    : MipsTargetLowering(TM, STI) {
   // Set up the register classes
   addRegisterClass(MVT::i32, &Mips::GPR32RegClass);
 
-  if (isGP64bit())
+  if (Subtarget.isGP64bit())
     addRegisterClass(MVT::i64, &Mips::GPR64RegClass);
 
-  if (Subtarget->hasDSP() || Subtarget->hasMSA()) {
+  if (Subtarget.hasDSP() || Subtarget.hasMSA()) {
     // Expand all truncating stores and extending loads.
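     // (For every pair of vector value types the loop below marks the
     // truncating store and the sign/zero/any-extending loads as Expand, so
     // only full-width vector loads and stores stay legal.)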
unsigned FirstVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; unsigned LastVT = (unsigned)MVT::LAST_VECTOR_VALUETYPE; @@ -58,7 +59,7 @@ MipsSETargetLowering::MipsSETargetLowering(MipsTargetMachine &TM) } } - if (Subtarget->hasDSP()) { + if (Subtarget.hasDSP()) { MVT::SimpleValueType VecTys[2] = {MVT::v2i16, MVT::v4i8}; for (unsigned i = 0; i < array_lengthof(VecTys); ++i) { @@ -82,10 +83,10 @@ MipsSETargetLowering::MipsSETargetLowering(MipsTargetMachine &TM) setTargetDAGCombine(ISD::VSELECT); } - if (Subtarget->hasDSPR2()) + if (Subtarget.hasDSPR2()) setOperationAction(ISD::MUL, MVT::v2i16, Legal); - if (Subtarget->hasMSA()) { + if (Subtarget.hasMSA()) { addMSAIntType(MVT::v16i8, &Mips::MSA128BRegClass); addMSAIntType(MVT::v8i16, &Mips::MSA128HRegClass); addMSAIntType(MVT::v4i32, &Mips::MSA128WRegClass); @@ -101,12 +102,12 @@ MipsSETargetLowering::MipsSETargetLowering(MipsTargetMachine &TM) setTargetDAGCombine(ISD::XOR); } - if (!Subtarget->mipsSEUsesSoftFloat()) { + if (!Subtarget.abiUsesSoftFloat()) { addRegisterClass(MVT::f32, &Mips::FGR32RegClass); // When dealing with single precision only, use libcalls - if (!Subtarget->isSingleFloat()) { - if (Subtarget->isFP64bit()) + if (!Subtarget.isSingleFloat()) { + if (Subtarget.isFP64bit()) addRegisterClass(MVT::f64, &Mips::FGR64RegClass); else addRegisterClass(MVT::f64, &Mips::AFGR64RegClass); @@ -118,12 +119,12 @@ MipsSETargetLowering::MipsSETargetLowering(MipsTargetMachine &TM) setOperationAction(ISD::MULHS, MVT::i32, Custom); setOperationAction(ISD::MULHU, MVT::i32, Custom); - if (Subtarget->hasCnMips()) + if (Subtarget.hasCnMips()) setOperationAction(ISD::MUL, MVT::i64, Legal); - else if (isGP64bit()) + else if (Subtarget.isGP64bit()) setOperationAction(ISD::MUL, MVT::i64, Custom); - if (isGP64bit()) { + if (Subtarget.isGP64bit()) { setOperationAction(ISD::MULHS, MVT::i64, Custom); setOperationAction(ISD::MULHU, MVT::i64, Custom); } @@ -152,7 +153,7 @@ MipsSETargetLowering::MipsSETargetLowering(MipsTargetMachine &TM) setOperationAction(ISD::STORE, MVT::f64, Custom); } - if (Subtarget->hasMips32r6()) { + if (Subtarget.hasMips32r6()) { // MIPS32r6 replaces the accumulator-based multiplies with a three register // instruction setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand); @@ -180,7 +181,7 @@ MipsSETargetLowering::MipsSETargetLowering(MipsTargetMachine &TM) setOperationAction(ISD::SELECT, MVT::f32, Legal); setOperationAction(ISD::SELECT_CC, MVT::f32, Expand); - assert(Subtarget->isFP64bit() && "FR=1 is required for MIPS32r6"); + assert(Subtarget.isFP64bit() && "FR=1 is required for MIPS32r6"); setOperationAction(ISD::SETCC, MVT::f64, Legal); setOperationAction(ISD::SELECT, MVT::f64, Legal); setOperationAction(ISD::SELECT_CC, MVT::f64, Expand); @@ -199,7 +200,7 @@ MipsSETargetLowering::MipsSETargetLowering(MipsTargetMachine &TM) setCondCodeAction(ISD::SETUGT, MVT::f64, Expand); } - if (Subtarget->hasMips64r6()) { + if (Subtarget.hasMips64r6()) { // MIPS64r6 replaces the accumulator-based multiplies with a three register // instruction setOperationAction(ISD::MUL, MVT::i64, Legal); @@ -226,8 +227,17 @@ MipsSETargetLowering::MipsSETargetLowering(MipsTargetMachine &TM) } const MipsTargetLowering * -llvm::createMipsSETargetLowering(MipsTargetMachine &TM) { - return new MipsSETargetLowering(TM); +llvm::createMipsSETargetLowering(MipsTargetMachine &TM, + const MipsSubtarget &STI) { + return new MipsSETargetLowering(TM, STI); +} + +const TargetRegisterClass * +MipsSETargetLowering::getRepRegClassFor(MVT VT) const { + if (VT == MVT::Untyped) + 
return Subtarget.hasDSP() ? &Mips::ACC64DSPRegClass : &Mips::ACC64RegClass; + + return TargetLowering::getRepRegClassFor(VT); } // Enable MSA support for the given integer type and Register class. @@ -324,7 +334,7 @@ MipsSETargetLowering::allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const { MVT::SimpleValueType SVT = VT.getSimpleVT().SimpleTy; - if (Subtarget->systemSupportsUnalignedAccess()) { + if (Subtarget.systemSupportsUnalignedAccess()) { // MIPS32r6/MIPS64r6 is required to support unaligned access. It's // implementation defined whether this is handled by hardware, software, or // a hybrid of the two but it's expected that most implementations will @@ -515,11 +525,11 @@ static bool selectMSUB(SDNode *SUBENode, SelectionDAG *CurDAG) { static SDValue performADDECombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, - const MipsSubtarget *Subtarget) { + const MipsSubtarget &Subtarget) { if (DCI.isBeforeLegalize()) return SDValue(); - if (Subtarget->hasMips32() && !Subtarget->hasMips32r6() && + if (Subtarget.hasMips32() && !Subtarget.hasMips32r6() && N->getValueType(0) == MVT::i32 && selectMADD(N, &DAG)) return SDValue(N, 0); @@ -535,8 +545,8 @@ static SDValue performADDECombine(SDNode *N, SelectionDAG &DAG, // - Removes redundant zero extensions performed by an ISD::AND. static SDValue performANDCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, - const MipsSubtarget *Subtarget) { - if (!Subtarget->hasMSA()) + const MipsSubtarget &Subtarget) { + if (!Subtarget.hasMSA()) return SDValue(); SDValue Op0 = N->getOperand(0); @@ -651,8 +661,8 @@ static bool isBitwiseInverse(SDValue N, SDValue OfNode) { // vector type. static SDValue performORCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, - const MipsSubtarget *Subtarget) { - if (!Subtarget->hasMSA()) + const MipsSubtarget &Subtarget) { + if (!Subtarget.hasMSA()) return SDValue(); EVT Ty = N->getValueType(0); @@ -668,7 +678,7 @@ static SDValue performORCombine(SDNode *N, SelectionDAG &DAG, SDValue Op0Op1 = Op0->getOperand(1); SDValue Op1Op0 = Op1->getOperand(0); SDValue Op1Op1 = Op1->getOperand(1); - bool IsLittleEndian = !Subtarget->isLittle(); + bool IsLittleEndian = !Subtarget.isLittle(); SDValue IfSet, IfClr, Cond; bool IsConstantMask = false; @@ -771,11 +781,11 @@ static SDValue performORCombine(SDNode *N, SelectionDAG &DAG, static SDValue performSUBECombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, - const MipsSubtarget *Subtarget) { + const MipsSubtarget &Subtarget) { if (DCI.isBeforeLegalize()) return SDValue(); - if (Subtarget->hasMips32() && N->getValueType(0) == MVT::i32 && + if (Subtarget.hasMips32() && N->getValueType(0) == MVT::i32 && selectMSUB(N, &DAG)) return SDValue(N, 0); @@ -835,7 +845,7 @@ static SDValue performMULCombine(SDNode *N, SelectionDAG &DAG, static SDValue performDSPShiftCombine(unsigned Opc, SDNode *N, EVT Ty, SelectionDAG &DAG, - const MipsSubtarget *Subtarget) { + const MipsSubtarget &Subtarget) { // See if this is a vector splat immediate node. 
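  // (A splat here is a BUILD_VECTOR of one repeated constant; its value
  // becomes the uniform shift amount that performDSPShiftCombine validates
  // below.)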
APInt SplatValue, SplatUndef; unsigned SplatBitSize; @@ -843,12 +853,12 @@ static SDValue performDSPShiftCombine(unsigned Opc, SDNode *N, EVT Ty, unsigned EltSize = Ty.getVectorElementType().getSizeInBits(); BuildVectorSDNode *BV = dyn_cast(N->getOperand(1)); - if (!Subtarget->hasDSP()) + if (!Subtarget.hasDSP()) return SDValue(); if (!BV || !BV->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, HasAnyUndefs, - EltSize, !Subtarget->isLittle()) || + EltSize, !Subtarget.isLittle()) || (SplatBitSize != EltSize) || (SplatValue.getZExtValue() >= EltSize)) return SDValue(); @@ -859,7 +869,7 @@ static SDValue performDSPShiftCombine(unsigned Opc, SDNode *N, EVT Ty, static SDValue performSHLCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, - const MipsSubtarget *Subtarget) { + const MipsSubtarget &Subtarget) { EVT Ty = N->getValueType(0); if ((Ty != MVT::v2i16) && (Ty != MVT::v4i8)) @@ -882,10 +892,10 @@ static SDValue performSHLCombine(SDNode *N, SelectionDAG &DAG, // used for DSPr2. static SDValue performSRACombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, - const MipsSubtarget *Subtarget) { + const MipsSubtarget &Subtarget) { EVT Ty = N->getValueType(0); - if (Subtarget->hasMSA()) { + if (Subtarget.hasMSA()) { SDValue Op0 = N->getOperand(0); SDValue Op1 = N->getOperand(1); @@ -920,7 +930,7 @@ static SDValue performSRACombine(SDNode *N, SelectionDAG &DAG, } } - if ((Ty != MVT::v2i16) && ((Ty != MVT::v4i8) || !Subtarget->hasDSPR2())) + if ((Ty != MVT::v2i16) && ((Ty != MVT::v4i8) || !Subtarget.hasDSPR2())) return SDValue(); return performDSPShiftCombine(MipsISD::SHRA_DSP, N, Ty, DAG, Subtarget); @@ -929,10 +939,10 @@ static SDValue performSRACombine(SDNode *N, SelectionDAG &DAG, static SDValue performSRLCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, - const MipsSubtarget *Subtarget) { + const MipsSubtarget &Subtarget) { EVT Ty = N->getValueType(0); - if (((Ty != MVT::v2i16) || !Subtarget->hasDSPR2()) && (Ty != MVT::v4i8)) + if (((Ty != MVT::v2i16) || !Subtarget.hasDSPR2()) && (Ty != MVT::v4i8)) return SDValue(); return performDSPShiftCombine(MipsISD::SHRL_DSP, N, Ty, DAG, Subtarget); @@ -1026,10 +1036,10 @@ static SDValue performVSELECTCombine(SDNode *N, SelectionDAG &DAG) { } static SDValue performXORCombine(SDNode *N, SelectionDAG &DAG, - const MipsSubtarget *Subtarget) { + const MipsSubtarget &Subtarget) { EVT Ty = N->getValueType(0); - if (Subtarget->hasMSA() && Ty.is128BitVector() && Ty.isInteger()) { + if (Subtarget.hasMSA() && Ty.is128BitVector() && Ty.isInteger()) { // Try the following combines: // (xor (or $a, $b), (build_vector allones)) // (xor (or $a, $b), (bitcast (build_vector allones))) @@ -1207,7 +1217,7 @@ SDValue MipsSETargetLowering::lowerLOAD(SDValue Op, SelectionDAG &DAG) const { Nd.isNonTemporal(), Nd.isInvariant(), std::min(Nd.getAlignment(), 4U)); - if (!Subtarget->isLittle()) + if (!Subtarget.isLittle()) std::swap(Lo, Hi); SDValue BP = DAG.getNode(MipsISD::BuildPairF64, DL, MVT::f64, Lo, Hi); @@ -1230,7 +1240,7 @@ SDValue MipsSETargetLowering::lowerSTORE(SDValue Op, SelectionDAG &DAG) const { SDValue Hi = DAG.getNode(MipsISD::ExtractElementF64, DL, MVT::i32, Val, DAG.getConstant(1, MVT::i32)); - if (!Subtarget->isLittle()) + if (!Subtarget.isLittle()) std::swap(Lo, Hi); // i32 store to lower address. 
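The lowerLOAD/lowerSTORE hunks above split a 64-bit FP access into two i32
halves when -mno-ldc1-sdc1 is in effect, swapping the halves on big-endian
targets. A small host-side model of that endian rule (illustrative only; the
function name and the little-endian-host assumption are mine, not the
patch's):

    #include <cstdint>
    #include <cstring>

    // Decide which 32-bit word of a double goes to the lower address. This
    // mirrors the std::swap(Lo, Hi) in lowerSTORE: little-endian targets put
    // the low word at the lower address, big-endian targets the high word.
    // Assumes a little-endian host so W[0] is the low word after the memcpy.
    static void storeF64AsWords(double V, uint32_t &LoAddrWord,
                                uint32_t &HiAddrWord, bool TargetIsLittle) {
      uint32_t W[2];
      std::memcpy(W, &V, sizeof V);
      LoAddrWord = TargetIsLittle ? W[0] : W[1];
      HiAddrWord = TargetIsLittle ? W[1] : W[0];
    }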
@@ -1249,7 +1259,7 @@ SDValue MipsSETargetLowering::lowerMulDiv(SDValue Op, unsigned NewOpc, bool HasLo, bool HasHi, SelectionDAG &DAG) const { // MIPS32r6/MIPS64r6 removed accumulator based multiplies. - assert(!Subtarget->hasMips32r6()); + assert(!Subtarget.hasMips32r6()); EVT Ty = Op.getOperand(0).getValueType(); SDLoc DL(Op); @@ -1613,7 +1623,7 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::mips_bnegi_w: case Intrinsic::mips_bnegi_d: return lowerMSABinaryBitImmIntr(Op, DAG, ISD::XOR, Op->getOperand(2), - !Subtarget->isLittle()); + !Subtarget.isLittle()); case Intrinsic::mips_bnz_b: case Intrinsic::mips_bnz_h: case Intrinsic::mips_bnz_w: @@ -1649,7 +1659,7 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::mips_bseti_w: case Intrinsic::mips_bseti_d: return lowerMSABinaryBitImmIntr(Op, DAG, ISD::OR, Op->getOperand(2), - !Subtarget->isLittle()); + !Subtarget.isLittle()); case Intrinsic::mips_bz_b: case Intrinsic::mips_bz_h: case Intrinsic::mips_bz_w: @@ -1724,7 +1734,7 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::mips_copy_s_w: return lowerMSACopyIntr(Op, DAG, MipsISD::VEXTRACT_SEXT_ELT); case Intrinsic::mips_copy_s_d: - if (hasMips64()) + if (Subtarget.hasMips64()) // Lower directly into VEXTRACT_SEXT_ELT since i64 is legal on Mips64. return lowerMSACopyIntr(Op, DAG, MipsISD::VEXTRACT_SEXT_ELT); else { @@ -1739,7 +1749,7 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::mips_copy_u_w: return lowerMSACopyIntr(Op, DAG, MipsISD::VEXTRACT_ZEXT_ELT); case Intrinsic::mips_copy_u_d: - if (hasMips64()) + if (Subtarget.hasMips64()) // Lower directly into VEXTRACT_ZEXT_ELT since i64 is legal on Mips64. return lowerMSACopyIntr(Op, DAG, MipsISD::VEXTRACT_ZEXT_ELT); else { @@ -2316,12 +2326,12 @@ SDValue MipsSETargetLowering::lowerBUILD_VECTOR(SDValue Op, unsigned SplatBitSize; bool HasAnyUndefs; - if (!Subtarget->hasMSA() || !ResTy.is128BitVector()) + if (!Subtarget.hasMSA() || !ResTy.is128BitVector()) return SDValue(); if (Node->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, HasAnyUndefs, 8, - !Subtarget->isLittle()) && SplatBitSize <= 64) { + !Subtarget.isLittle()) && SplatBitSize <= 64) { // We can only cope with 8, 16, 32, or 64-bit elements if (SplatBitSize != 8 && SplatBitSize != 16 && SplatBitSize != 32 && SplatBitSize != 64) @@ -2894,7 +2904,7 @@ emitCOPY_FW(MachineInstr *MI, MachineBasicBlock *BB) const{ // valid because FR=1 mode which is the only supported mode in MSA. MachineBasicBlock * MipsSETargetLowering:: emitCOPY_FD(MachineInstr *MI, MachineBasicBlock *BB) const{ - assert(Subtarget->isFP64bit()); + assert(Subtarget.isFP64bit()); const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo(); @@ -2957,7 +2967,7 @@ MipsSETargetLowering::emitINSERT_FW(MachineInstr *MI, MachineBasicBlock * MipsSETargetLowering::emitINSERT_FD(MachineInstr *MI, MachineBasicBlock *BB) const { - assert(Subtarget->isFP64bit()); + assert(Subtarget.isFP64bit()); const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo(); @@ -3016,8 +3026,8 @@ MipsSETargetLowering::emitINSERT_DF_VIDX(MachineInstr *MI, unsigned SrcValReg = MI->getOperand(3).getReg(); const TargetRegisterClass *VecRC = nullptr; - const TargetRegisterClass *GPRRC = isGP64bit() ? 
&Mips::GPR64RegClass - : &Mips::GPR32RegClass; + const TargetRegisterClass *GPRRC = + Subtarget.isGP64bit() ? &Mips::GPR64RegClass : &Mips::GPR32RegClass; unsigned EltLog2Size; unsigned InsertOp = 0; unsigned InsveOp = 0; @@ -3146,7 +3156,7 @@ MipsSETargetLowering::emitFILL_FW(MachineInstr *MI, MachineBasicBlock * MipsSETargetLowering::emitFILL_FD(MachineInstr *MI, MachineBasicBlock *BB) const { - assert(Subtarget->isFP64bit()); + assert(Subtarget.isFP64bit()); const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo(); diff --git a/lib/Target/Mips/MipsSEISelLowering.h b/lib/Target/Mips/MipsSEISelLowering.h index 03a20ef6741c..00d86834be0a 100644 --- a/lib/Target/Mips/MipsSEISelLowering.h +++ b/lib/Target/Mips/MipsSEISelLowering.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef MipsSEISELLOWERING_H -#define MipsSEISELLOWERING_H +#ifndef MIPSSEISELLOWERING_H +#define MIPSSEISELLOWERING_H #include "MipsISelLowering.h" #include "MipsRegisterInfo.h" @@ -20,7 +20,8 @@ namespace llvm { class MipsSETargetLowering : public MipsTargetLowering { public: - explicit MipsSETargetLowering(MipsTargetMachine &TM); + explicit MipsSETargetLowering(MipsTargetMachine &TM, + const MipsSubtarget &STI); /// \brief Enable MSA support for the given integer type and Register /// class. @@ -46,13 +47,7 @@ namespace llvm { return false; } - const TargetRegisterClass *getRepRegClassFor(MVT VT) const override { - if (VT == MVT::Untyped) - return Subtarget->hasDSP() ? &Mips::ACC64DSPRegClass : - &Mips::ACC64RegClass; - - return TargetLowering::getRepRegClassFor(VT); - } + const TargetRegisterClass *getRepRegClassFor(MVT VT) const override; private: bool isEligibleForTailCallOptimization(const MipsCC &MipsCCInfo, diff --git a/lib/Target/Mips/MipsSEInstrInfo.cpp b/lib/Target/Mips/MipsSEInstrInfo.cpp index e82c8cff7225..69cb74cb1e49 100644 --- a/lib/Target/Mips/MipsSEInstrInfo.cpp +++ b/lib/Target/Mips/MipsSEInstrInfo.cpp @@ -24,11 +24,10 @@ using namespace llvm; -MipsSEInstrInfo::MipsSEInstrInfo(MipsTargetMachine &tm) - : MipsInstrInfo(tm, - tm.getRelocationModel() == Reloc::PIC_ ? Mips::B : Mips::J), - RI(*tm.getSubtargetImpl()), - IsN64(tm.getSubtarget().isABI_N64()) {} +MipsSEInstrInfo::MipsSEInstrInfo(const MipsSubtarget &STI) + : MipsInstrInfo(STI, STI.getRelocationModel() == Reloc::PIC_ ? Mips::B + : Mips::J), + RI(STI), IsN64(STI.isABI_N64()) {} const MipsRegisterInfo &MipsSEInstrInfo::getRegisterInfo() const { return RI; @@ -84,7 +83,7 @@ void MipsSEInstrInfo::copyPhysReg(MachineBasicBlock &MBB, unsigned DestReg, unsigned SrcReg, bool KillSrc) const { unsigned Opc = 0, ZeroReg = 0; - bool isMicroMips = TM.getSubtarget().inMicroMipsMode(); + bool isMicroMips = Subtarget.inMicroMipsMode(); if (Mips::GPR32RegClass.contains(DestReg)) { // Copy to CPU Reg. if (Mips::GPR32RegClass.contains(SrcReg)) { @@ -265,14 +264,14 @@ loadRegFromStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, bool MipsSEInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { MachineBasicBlock &MBB = *MI->getParent(); - bool isMicroMips = TM.getSubtarget().inMicroMipsMode(); + bool isMicroMips = Subtarget.inMicroMipsMode(); unsigned Opc; switch(MI->getDesc().getOpcode()) { default: return false; case Mips::RetRA: - expandRetRA(MBB, MI, Mips::RET); + expandRetRA(MBB, MI); break; case Mips::PseudoMFHI: Opc = isMicroMips ? 
Mips::MFHI16_MM : Mips::MFHI; @@ -360,7 +359,7 @@ unsigned MipsSEInstrInfo::getOppositeBranchOpc(unsigned Opc) const { void MipsSEInstrInfo::adjustStackPtr(unsigned SP, int64_t Amount, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const { - const MipsSubtarget &STI = TM.getSubtarget(); + const MipsSubtarget &STI = Subtarget; DebugLoc DL = I != MBB.end() ? I->getDebugLoc() : DebugLoc(); unsigned ADDu = STI.isABI_N64() ? Mips::DADDu : Mips::ADDu; unsigned ADDiu = STI.isABI_N64() ? Mips::DADDiu : Mips::ADDiu; @@ -380,7 +379,7 @@ MipsSEInstrInfo::loadImmediate(int64_t Imm, MachineBasicBlock &MBB, MachineBasicBlock::iterator II, DebugLoc DL, unsigned *NewImm) const { MipsAnalyzeImmediate AnalyzeImm; - const MipsSubtarget &STI = TM.getSubtarget(); + const MipsSubtarget &STI = Subtarget; MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo(); unsigned Size = STI.isABI_N64() ? 64 : 32; unsigned LUi = STI.isABI_N64() ? Mips::LUi64 : Mips::LUi; @@ -428,9 +427,12 @@ unsigned MipsSEInstrInfo::getAnalyzableBrOpc(unsigned Opc) const { } void MipsSEInstrInfo::expandRetRA(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - unsigned Opc) const { - BuildMI(MBB, I, I->getDebugLoc(), get(Opc)).addReg(Mips::RA); + MachineBasicBlock::iterator I) const { + if (Subtarget.isGP64bit()) + BuildMI(MBB, I, I->getDebugLoc(), get(Mips::PseudoReturn64)) + .addReg(Mips::RA_64); + else + BuildMI(MBB, I, I->getDebugLoc(), get(Mips::PseudoReturn)).addReg(Mips::RA); } std::pair @@ -516,8 +518,17 @@ void MipsSEInstrInfo::expandExtractElementF64(MachineBasicBlock &MBB, unsigned SubIdx = N ? Mips::sub_hi : Mips::sub_lo; unsigned SubReg = getRegisterInfo().getSubReg(SrcReg, SubIdx); - if (SubIdx == Mips::sub_hi && FP64) { - // FIXME: The .addReg(SrcReg, RegState::Implicit) is a white lie used to + // FPXX on MIPS-II or MIPS32r1 should have been handled with a spill/reload + // in MipsSEFrameLowering.cpp. + assert(!(Subtarget.isABI_FPXX() && !Subtarget.hasMips32r2())); + + // FP64A (FP64 with nooddspreg) should have been handled with a spill/reload + // in MipsSEFrameLowering.cpp. + assert(!(Subtarget.isFP64bit() && !Subtarget.useOddSPReg())); + + if (SubIdx == Mips::sub_hi && Subtarget.hasMTHC1()) { + // FIXME: Strictly speaking MFHC1 only reads the top 32-bits however, we + // claim to read the whole 64-bits as part of a white lie used to // temporarily work around a widespread bug in the -mfp64 support. // The problem is that none of the 32-bit fpu ops mention the fact // that they clobber the upper 32-bits of the 64-bit FPR. Fixing that @@ -528,8 +539,8 @@ void MipsSEInstrInfo::expandExtractElementF64(MachineBasicBlock &MBB, // We therefore pretend that it reads the bottom 32-bits to // artificially create a dependency and prevent the scheduler // changing the behaviour of the code. - BuildMI(MBB, I, dl, get(Mips::MFHC1), DstReg).addReg(SubReg).addReg( - SrcReg, RegState::Implicit); + BuildMI(MBB, I, dl, get(FP64 ? 
Mips::MFHC1_D64 : Mips::MFHC1_D32), DstReg) + .addReg(SrcReg); } else BuildMI(MBB, I, dl, get(Mips::MFC1), DstReg).addReg(SubReg); } @@ -542,29 +553,34 @@ void MipsSEInstrInfo::expandBuildPairF64(MachineBasicBlock &MBB, const MCInstrDesc& Mtc1Tdd = get(Mips::MTC1); DebugLoc dl = I->getDebugLoc(); const TargetRegisterInfo &TRI = getRegisterInfo(); - bool HasMTHC1 = TM.getSubtarget().hasMips32r2() || - TM.getSubtarget().hasMips32r6(); // When mthc1 is available, use: // mtc1 Lo, $fp // mthc1 Hi, $fp // - // Otherwise, for FP64: + // Otherwise, for O32 FPXX ABI: // spill + reload via ldc1 - // This has not been implemented since FP64 on MIPS32 and earlier is not - // supported. + // This case is handled by the frame lowering code. // // Otherwise, for FP32: // mtc1 Lo, $fp // mtc1 Hi, $fp + 1 + // + // The case where dmtc1 is available doesn't need to be handled here + // because it never creates a BuildPairF64 node. + + // FPXX on MIPS-II or MIPS32r1 should have been handled with a spill/reload + // in MipsSEFrameLowering.cpp. + assert(!(Subtarget.isABI_FPXX() && !Subtarget.hasMips32r2())); + + // FP64A (FP64 with nooddspreg) should have been handled with a spill/reload + // in MipsSEFrameLowering.cpp. + assert(!(Subtarget.isFP64bit() && !Subtarget.useOddSPReg())); BuildMI(MBB, I, dl, Mtc1Tdd, TRI.getSubReg(DstReg, Mips::sub_lo)) .addReg(LoReg); - if (HasMTHC1 || FP64) { - assert(TM.getSubtarget().hasMips32r2() && - "MTHC1 requires MIPS32r2"); - + if (Subtarget.hasMTHC1()) { // FIXME: The .addReg(DstReg) is a white lie used to temporarily work // around a widespread bug in the -mfp64 support. // The problem is that none of the 32-bit fpu ops mention the fact @@ -579,7 +595,9 @@ void MipsSEInstrInfo::expandBuildPairF64(MachineBasicBlock &MBB, BuildMI(MBB, I, dl, get(FP64 ? Mips::MTHC1_D64 : Mips::MTHC1_D32), DstReg) .addReg(DstReg) .addReg(HiReg); - } else + } else if (Subtarget.isABI_FPXX()) + llvm_unreachable("BuildPairF64 not expanded in frame lowering code!"); + else BuildMI(MBB, I, dl, Mtc1Tdd, TRI.getSubReg(DstReg, Mips::sub_hi)) .addReg(HiReg); } @@ -589,29 +607,31 @@ void MipsSEInstrInfo::expandEhReturn(MachineBasicBlock &MBB, // This pseudo instruction is generated as part of the lowering of // ISD::EH_RETURN. We convert it to a stack increment by OffsetReg, and // indirect jump to TargetReg - const MipsSubtarget &STI = TM.getSubtarget(); - unsigned ADDU = STI.isABI_N64() ? Mips::DADDu : Mips::ADDu; - unsigned JR = STI.isABI_N64() ? Mips::JR64 : Mips::JR; - unsigned SP = STI.isABI_N64() ? Mips::SP_64 : Mips::SP; - unsigned RA = STI.isABI_N64() ? Mips::RA_64 : Mips::RA; - unsigned T9 = STI.isABI_N64() ? Mips::T9_64 : Mips::T9; - unsigned ZERO = STI.isABI_N64() ? Mips::ZERO_64 : Mips::ZERO; + unsigned ADDU = Subtarget.isABI_N64() ? Mips::DADDu : Mips::ADDu; + unsigned SP = Subtarget.isGP64bit() ? Mips::SP_64 : Mips::SP; + unsigned RA = Subtarget.isGP64bit() ? Mips::RA_64 : Mips::RA; + unsigned T9 = Subtarget.isGP64bit() ? Mips::T9_64 : Mips::T9; + unsigned ZERO = Subtarget.isGP64bit() ? 
Mips::ZERO_64 : Mips::ZERO; unsigned OffsetReg = I->getOperand(0).getReg(); unsigned TargetReg = I->getOperand(1).getReg(); // addu $ra, $v0, $zero // addu $sp, $sp, $v1 - // jr $ra + // jr $ra (via RetRA) + const TargetMachine &TM = MBB.getParent()->getTarget(); if (TM.getRelocationModel() == Reloc::PIC_) BuildMI(MBB, I, I->getDebugLoc(), TM.getInstrInfo()->get(ADDU), T9) - .addReg(TargetReg).addReg(ZERO); + .addReg(TargetReg) + .addReg(ZERO); BuildMI(MBB, I, I->getDebugLoc(), TM.getInstrInfo()->get(ADDU), RA) - .addReg(TargetReg).addReg(ZERO); + .addReg(TargetReg) + .addReg(ZERO); BuildMI(MBB, I, I->getDebugLoc(), TM.getInstrInfo()->get(ADDU), SP) - .addReg(SP).addReg(OffsetReg); - BuildMI(MBB, I, I->getDebugLoc(), TM.getInstrInfo()->get(JR)).addReg(RA); + .addReg(SP) + .addReg(OffsetReg); + expandRetRA(MBB, I); } -const MipsInstrInfo *llvm::createMipsSEInstrInfo(MipsTargetMachine &TM) { - return new MipsSEInstrInfo(TM); +const MipsInstrInfo *llvm::createMipsSEInstrInfo(const MipsSubtarget &STI) { + return new MipsSEInstrInfo(STI); } diff --git a/lib/Target/Mips/MipsSEInstrInfo.h b/lib/Target/Mips/MipsSEInstrInfo.h index aa68552066c3..9576fef1bd92 100644 --- a/lib/Target/Mips/MipsSEInstrInfo.h +++ b/lib/Target/Mips/MipsSEInstrInfo.h @@ -24,7 +24,7 @@ class MipsSEInstrInfo : public MipsInstrInfo { bool IsN64; public: - explicit MipsSEInstrInfo(MipsTargetMachine &TM); + explicit MipsSEInstrInfo(const MipsSubtarget &STI); const MipsRegisterInfo &getRegisterInfo() const override; @@ -81,8 +81,7 @@ class MipsSEInstrInfo : public MipsInstrInfo { private: unsigned getAnalyzableBrOpc(unsigned Opc) const override; - void expandRetRA(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, - unsigned Opc) const; + void expandRetRA(MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const; std::pair compareOpndSize(unsigned Opc, const MachineFunction &MF) const; diff --git a/lib/Target/Mips/MipsSelectionDAGInfo.cpp b/lib/Target/Mips/MipsSelectionDAGInfo.cpp index c8e995a991da..edd8f670707f 100644 --- a/lib/Target/Mips/MipsSelectionDAGInfo.cpp +++ b/lib/Target/Mips/MipsSelectionDAGInfo.cpp @@ -16,8 +16,8 @@ using namespace llvm; #define DEBUG_TYPE "mips-selectiondag-info" -MipsSelectionDAGInfo::MipsSelectionDAGInfo(const MipsTargetMachine &TM) - : TargetSelectionDAGInfo(TM.getDataLayout()) {} +MipsSelectionDAGInfo::MipsSelectionDAGInfo(const DataLayout &DL) + : TargetSelectionDAGInfo(&DL) {} MipsSelectionDAGInfo::~MipsSelectionDAGInfo() { } diff --git a/lib/Target/Mips/MipsSelectionDAGInfo.h b/lib/Target/Mips/MipsSelectionDAGInfo.h index 6cafb558b35a..2b3d527fe6ff 100644 --- a/lib/Target/Mips/MipsSelectionDAGInfo.h +++ b/lib/Target/Mips/MipsSelectionDAGInfo.h @@ -22,7 +22,7 @@ class MipsTargetMachine; class MipsSelectionDAGInfo : public TargetSelectionDAGInfo { public: - explicit MipsSelectionDAGInfo(const MipsTargetMachine &TM); + explicit MipsSelectionDAGInfo(const DataLayout &DL); ~MipsSelectionDAGInfo(); }; diff --git a/lib/Target/Mips/MipsSubtarget.cpp b/lib/Target/Mips/MipsSubtarget.cpp index 74ec06479c4e..5bf875daea99 100644 --- a/lib/Target/Mips/MipsSubtarget.cpp +++ b/lib/Target/Mips/MipsSubtarget.cpp @@ -60,11 +60,9 @@ Mips16ConstantIslands( /// Select the Mips CPU for the given triple and cpu name. 
/// FIXME: Merge with the copy in MipsMCTargetDesc.cpp -static inline StringRef selectMipsCPU(StringRef TT, StringRef CPU) { +static StringRef selectMipsCPU(Triple TT, StringRef CPU) { if (CPU.empty() || CPU == "generic") { - Triple TheTriple(TT); - if (TheTriple.getArch() == Triple::mips || - TheTriple.getArch() == Triple::mipsel) + if (TT.getArch() == Triple::mips || TT.getArch() == Triple::mipsel) CPU = "mips32"; else CPU = "mips64"; @@ -74,39 +72,55 @@ static inline StringRef selectMipsCPU(StringRef TT, StringRef CPU) { void MipsSubtarget::anchor() { } +static std::string computeDataLayout(const MipsSubtarget &ST) { + std::string Ret = ""; + + // There are both little and big endian mips. + if (ST.isLittle()) + Ret += "e"; + else + Ret += "E"; + + Ret += "-m:m"; + + // Pointers are 32 bit on some ABIs. + if (!ST.isABI_N64()) + Ret += "-p:32:32"; + + // 8 and 16 bit integers only need no have natural alignment, but try to + // align them to 32 bits. 64 bit integers have natural alignment. + Ret += "-i8:8:32-i16:16:32-i64:64"; + + // 32 bit registers are always available and the stack is at least 64 bit + // aligned. On N64 64 bit registers are also available and the stack is + // 128 bit aligned. + if (ST.isABI_N64() || ST.isABI_N32()) + Ret += "-n32:64-S128"; + else + Ret += "-n32-S64"; + + return Ret; +} + MipsSubtarget::MipsSubtarget(const std::string &TT, const std::string &CPU, const std::string &FS, bool little, - Reloc::Model _RM, MipsTargetMachine *_TM) + MipsTargetMachine *_TM) : MipsGenSubtargetInfo(TT, CPU, FS), MipsArchVersion(Mips32), MipsABI(UnknownABI), IsLittle(little), IsSingleFloat(false), - IsFP64bit(false), IsNaN2008bit(false), IsGP64bit(false), HasVFPU(false), - HasCnMips(false), IsLinux(true), HasMips3_32(false), HasMips3_32r2(false), - HasMips4_32(false), HasMips4_32r2(false), HasMips5_32r2(false), - InMips16Mode(false), InMips16HardFloat(Mips16HardFloat), - InMicroMipsMode(false), HasDSP(false), HasDSPR2(false), - AllowMixed16_32(Mixed16_32 | Mips_Os16), Os16(Mips_Os16), HasMSA(false), - RM(_RM), OverrideMode(NoOverride), TM(_TM), TargetTriple(TT) { - std::string CPUName = CPU; - CPUName = selectMipsCPU(TT, CPUName); - - // Parse features string. - ParseSubtargetFeatures(CPUName, FS); - - if (InMips16Mode && !TM->Options.UseSoftFloat) { - // Hard float for mips16 means essentially to compile as soft float - // but to use a runtime library for soft float that is written with - // native mips32 floating point instructions (those runtime routines - // run in mips32 hard float mode). - TM->Options.UseSoftFloat = true; - TM->Options.FloatABIType = FloatABI::Soft; - InMips16HardFloat = true; - } + IsFPXX(false), IsFP64bit(false), UseOddSPReg(true), IsNaN2008bit(false), + IsGP64bit(false), HasVFPU(false), HasCnMips(false), IsLinux(true), + HasMips3_32(false), HasMips3_32r2(false), HasMips4_32(false), + HasMips4_32r2(false), HasMips5_32r2(false), InMips16Mode(false), + InMips16HardFloat(Mips16HardFloat), InMicroMipsMode(false), HasDSP(false), + HasDSPR2(false), AllowMixed16_32(Mixed16_32 | Mips_Os16), Os16(Mips_Os16), + HasMSA(false), TM(_TM), TargetTriple(TT), + DL(computeDataLayout(initializeSubtargetDependencies(CPU, FS, TM))), + TSInfo(DL), JITInfo(), InstrInfo(MipsInstrInfo::create(*this)), + FrameLowering(MipsFrameLowering::create(*this)), + TLInfo(MipsTargetLowering::create(*TM, *this)) { PreviousInMips16Mode = InMips16Mode; - // Initialize scheduling itinerary for the specified CPU. 
- InstrItins = getInstrItineraryForCPU(CPUName); - // Don't even attempt to generate code for MIPS-I, MIPS-II, MIPS-III, and // MIPS-V. They have not been tested and currently exist for the integrated // assembler only. @@ -137,6 +151,12 @@ MipsSubtarget::MipsSubtarget(const std::string &TT, const std::string &CPU, "See -mattr=+fp64.", false); + if (!isABI_O32() && !useOddSPReg()) + report_fatal_error("-mattr=+nooddspreg requires the O32 ABI.", false); + + if (IsFPXX && (isABI_N32() || isABI_N64())) + report_fatal_error("FPXX is not permitted for the N32/N64 ABI's.", false); + if (hasMips32r6()) { StringRef ISA = hasMips64r6() ? "MIPS64r6" : "MIPS32r6"; @@ -153,65 +173,39 @@ MipsSubtarget::MipsSubtarget(const std::string &TT, const std::string &CPU, // Set UseSmallSection. // TODO: Investigate the IsLinux check. I suspect it's really checking for // bare-metal. - UseSmallSection = !IsLinux && (RM == Reloc::Static); + UseSmallSection = !IsLinux && (TM->getRelocationModel() == Reloc::Static); } -bool -MipsSubtarget::enablePostRAScheduler(CodeGenOpt::Level OptLevel, - TargetSubtargetInfo::AntiDepBreakMode &Mode, - RegClassVector &CriticalPathRCs) const { - Mode = TargetSubtargetInfo::ANTIDEP_NONE; +/// This overrides the PostRAScheduler bit in the SchedModel for any CPU. +bool MipsSubtarget::enablePostMachineScheduler() const { return true; } + +void MipsSubtarget::getCriticalPathRCs(RegClassVector &CriticalPathRCs) const { CriticalPathRCs.clear(); - CriticalPathRCs.push_back(isGP64bit() ? &Mips::GPR64RegClass - : &Mips::GPR32RegClass); - return OptLevel >= CodeGenOpt::Aggressive; + CriticalPathRCs.push_back(isGP64bit() ? + &Mips::GPR64RegClass : &Mips::GPR32RegClass); } -//FIXME: This logic for reseting the subtarget along with -// the helper classes can probably be simplified but there are a lot of -// cases so we will defer rewriting this to later. -// -void MipsSubtarget::resetSubtarget(MachineFunction *MF) { - bool ChangeToMips16 = false, ChangeToNoMips16 = false; - DEBUG(dbgs() << "resetSubtargetFeatures" << "\n"); - AttributeSet FnAttrs = MF->getFunction()->getAttributes(); - ChangeToMips16 = FnAttrs.hasAttribute(AttributeSet::FunctionIndex, - "mips16"); - ChangeToNoMips16 = FnAttrs.hasAttribute(AttributeSet::FunctionIndex, - "nomips16"); - assert (!(ChangeToMips16 & ChangeToNoMips16) && - "mips16 and nomips16 specified on the same function"); - if (ChangeToMips16) { - if (PreviousInMips16Mode) - return; - OverrideMode = Mips16Override; - PreviousInMips16Mode = true; - TM->setHelperClassesMips16(); - return; - } else if (ChangeToNoMips16) { - if (!PreviousInMips16Mode) - return; - OverrideMode = NoMips16Override; - PreviousInMips16Mode = false; - TM->setHelperClassesMipsSE(); - return; - } else { - if (OverrideMode == NoOverride) - return; - OverrideMode = NoOverride; - DEBUG(dbgs() << "back to default" << "\n"); - if (inMips16Mode() && !PreviousInMips16Mode) { - TM->setHelperClassesMips16(); - PreviousInMips16Mode = true; - } else if (!inMips16Mode() && PreviousInMips16Mode) { - TM->setHelperClassesMipsSE(); - PreviousInMips16Mode = false; - } - return; - } +CodeGenOpt::Level MipsSubtarget::getOptLevelToEnablePostRAScheduler() const { + return CodeGenOpt::Aggressive; +} + +MipsSubtarget & +MipsSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS, + const TargetMachine *TM) { + std::string CPUName = selectMipsCPU(TargetTriple, CPU); + + // Parse features string. + ParseSubtargetFeatures(CPUName, FS); + // Initialize scheduling itinerary for the specified CPU. 
+ InstrItins = getInstrItineraryForCPU(CPUName); + + if (InMips16Mode && !TM->Options.UseSoftFloat) + InMips16HardFloat = true; + + return *this; } -bool MipsSubtarget::mipsSEUsesSoftFloat() const { +bool MipsSubtarget::abiUsesSoftFloat() const { return TM->Options.UseSoftFloat && !InMips16HardFloat; } @@ -219,3 +213,7 @@ bool MipsSubtarget::useConstantIslands() { DEBUG(dbgs() << "use constant islands " << Mips16ConstantIslands << "\n"); return Mips16ConstantIslands; } + +Reloc::Model MipsSubtarget::getRelocationModel() const { + return TM->getRelocationModel(); +} diff --git a/lib/Target/Mips/MipsSubtarget.h b/lib/Target/Mips/MipsSubtarget.h index bff656ff1257..70aa148d9bbb 100644 --- a/lib/Target/Mips/MipsSubtarget.h +++ b/lib/Target/Mips/MipsSubtarget.h @@ -14,6 +14,12 @@ #ifndef MIPSSUBTARGET_H #define MIPSSUBTARGET_H +#include "MipsFrameLowering.h" +#include "MipsISelLowering.h" +#include "MipsInstrInfo.h" +#include "MipsJITInfo.h" +#include "MipsSelectionDAGInfo.h" +#include "llvm/IR/DataLayout.h" #include "llvm/MC/MCInstrItineraries.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Target/TargetSubtargetInfo.h" @@ -56,9 +62,16 @@ class MipsSubtarget : public MipsGenSubtargetInfo { // floating point registers instead of only using even ones. bool IsSingleFloat; + // IsFPXX - MIPS O32 modeless ABI. + bool IsFPXX; + // IsFP64bit - The target processor has 64-bit floating point registers. bool IsFP64bit; + /// Are odd single-precision registers permitted? + /// This corresponds to -modd-spreg and -mno-odd-spreg + bool UseOddSPReg; + // IsNan2008 - IEEE 754-2008 NaN encoding. bool IsNaN2008bit; @@ -122,9 +135,6 @@ class MipsSubtarget : public MipsGenSubtargetInfo { InstrItineraryData InstrItins; - // Relocation Model - Reloc::Model RM; - // We can override the determination of whether we are in mips16 mode // as from the command line enum {NoOverride, Mips16Override, NoMips16Override} OverrideMode; @@ -132,30 +142,42 @@ class MipsSubtarget : public MipsGenSubtargetInfo { MipsTargetMachine *TM; Triple TargetTriple; + + const DataLayout DL; // Calculates type size & alignment + const MipsSelectionDAGInfo TSInfo; + MipsJITInfo JITInfo; + std::unique_ptr InstrInfo; + std::unique_ptr FrameLowering; + std::unique_ptr TLInfo; + public: - bool enablePostRAScheduler(CodeGenOpt::Level OptLevel, - AntiDepBreakMode& Mode, - RegClassVector& CriticalPathRCs) const override; + /// This overrides the PostRAScheduler bit in the SchedModel for each CPU. + bool enablePostMachineScheduler() const override; + void getCriticalPathRCs(RegClassVector &CriticalPathRCs) const override; + CodeGenOpt::Level getOptLevelToEnablePostRAScheduler() const override; /// Only O32 and EABI supported right now. bool isABI_EABI() const { return MipsABI == EABI; } bool isABI_N64() const { return MipsABI == N64; } bool isABI_N32() const { return MipsABI == N32; } bool isABI_O32() const { return MipsABI == O32; } + bool isABI_FPXX() const { return isABI_O32() && IsFPXX; } unsigned getTargetABI() const { return MipsABI; } /// This constructor initializes the data members to match that /// of the specified triple. MipsSubtarget(const std::string &TT, const std::string &CPU, - const std::string &FS, bool little, Reloc::Model RM, - MipsTargetMachine *TM); + const std::string &FS, bool little, MipsTargetMachine *TM); /// ParseSubtargetFeatures - Parses features string setting specified /// subtarget options. Definition of function is auto generated by tblgen. 
void ParseSubtargetFeatures(StringRef CPU, StringRef FS); + bool hasMips1() const { return MipsArchVersion >= Mips1; } bool hasMips2() const { return MipsArchVersion >= Mips2; } bool hasMips3() const { return MipsArchVersion >= Mips3; } + bool hasMips4() const { return MipsArchVersion >= Mips4; } + bool hasMips5() const { return MipsArchVersion >= Mips5; } bool hasMips4_32() const { return HasMips4_32; } bool hasMips4_32r2() const { return HasMips4_32r2; } bool hasMips32() const { @@ -178,7 +200,9 @@ class MipsSubtarget : public MipsGenSubtargetInfo { bool hasCnMips() const { return HasCnMips; } bool isLittle() const { return IsLittle; } + bool isFPXX() const { return IsFPXX; } bool isFP64bit() const { return IsFP64bit; } + bool useOddSPReg() const { return UseOddSPReg; } bool isNaN2008() const { return IsNaN2008bit; } bool isNotFP64bit() const { return !IsFP64bit; } bool isGP64bit() const { return IsGP64bit; } @@ -186,20 +210,14 @@ class MipsSubtarget : public MipsGenSubtargetInfo { bool isSingleFloat() const { return IsSingleFloat; } bool isNotSingleFloat() const { return !IsSingleFloat; } bool hasVFPU() const { return HasVFPU; } - bool inMips16Mode() const { - switch (OverrideMode) { - case NoOverride: - return InMips16Mode; - case Mips16Override: - return true; - case NoMips16Override: - return false; - } - llvm_unreachable("Unexpected mode"); - } + bool inMips16Mode() const { return InMips16Mode; } bool inMips16ModeDefault() const { return InMips16Mode; } + // Hard float for mips16 means essentially to compile as soft float + // but to use a runtime library for soft float that is written with + // native mips32 floating point instructions (those runtime routines + // run in mips32 hard float mode). bool inMips16HardFloat() const { return inMips16Mode() && InMips16HardFloat; } @@ -212,7 +230,7 @@ class MipsSubtarget : public MipsGenSubtargetInfo { bool hasStandardEncoding() const { return !inMips16Mode(); } - bool mipsSEUsesSoftFloat() const; + bool abiUsesSoftFloat() const; bool enableLongBranchPass() const { return hasStandardEncoding() || allowMixed16_32(); @@ -220,6 +238,7 @@ class MipsSubtarget : public MipsGenSubtargetInfo { /// Features related to the presence of specific instructions. bool hasExtractInsert() const { return !inMips16Mode() && hasMips32r2(); } + bool hasMTHC1() const { return hasMips32r2(); } const InstrItineraryData &getInstrItineraryData() const { return InstrItins; } bool allowMixed16_32() const { return inMips16ModeDefault() | @@ -237,10 +256,10 @@ class MipsSubtarget : public MipsGenSubtargetInfo { unsigned stackAlignment() const { return hasMips64() ? 16 : 8; } // Grab relocation model - Reloc::Model getRelocationModel() const {return RM;} + Reloc::Model getRelocationModel() const; - /// \brief Reset the subtarget for the Mips target. - void resetSubtarget(MachineFunction *MF); + MipsSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS, + const TargetMachine *TM); /// Does the system support unaligned memory access. /// @@ -248,6 +267,22 @@ class MipsSubtarget : public MipsGenSubtargetInfo { /// specify which component of the system provides it. Hardware, software, and /// hybrid implementations are all valid. 
bool systemSupportsUnalignedAccess() const { return hasMips32r6(); } + + // Set helper classes + void setHelperClassesMips16(); + void setHelperClassesMipsSE(); + + MipsJITInfo *getJITInfo() { return &JITInfo; } + const MipsSelectionDAGInfo *getSelectionDAGInfo() const { return &TSInfo; } + const DataLayout *getDataLayout() const { return &DL; } + const MipsInstrInfo *getInstrInfo() const { return InstrInfo.get(); } + const TargetFrameLowering *getFrameLowering() const { + return FrameLowering.get(); + } + const MipsRegisterInfo *getRegisterInfo() const { + return &InstrInfo->getRegisterInfo(); + } + const MipsTargetLowering *getTargetLowering() const { return TLInfo.get(); } }; } // End llvm namespace diff --git a/lib/Target/Mips/MipsTargetMachine.cpp b/lib/Target/Mips/MipsTargetMachine.cpp index 984c58eb6c1e..bb1870ebe605 100644 --- a/lib/Target/Mips/MipsTargetMachine.cpp +++ b/lib/Target/Mips/MipsTargetMachine.cpp @@ -45,93 +45,26 @@ extern "C" void LLVMInitializeMipsTarget() { RegisterTargetMachine B(TheMips64elTarget); } -static std::string computeDataLayout(const MipsSubtarget &ST) { - std::string Ret = ""; - - // There are both little and big endian mips. - if (ST.isLittle()) - Ret += "e"; - else - Ret += "E"; - - Ret += "-m:m"; - - // Pointers are 32 bit on some ABIs. - if (!ST.isABI_N64()) - Ret += "-p:32:32"; - - // 8 and 16 bit integers only need no have natural alignment, but try to - // align them to 32 bits. 64 bit integers have natural alignment. - Ret += "-i8:8:32-i16:16:32-i64:64"; - - // 32 bit registers are always available and the stack is at least 64 bit - // aligned. On N64 64 bit registers are also available and the stack is - // 128 bit aligned. - if (ST.isABI_N64() || ST.isABI_N32()) - Ret += "-n32:64-S128"; - else - Ret += "-n32-S64"; - - return Ret; -} - // On function prologue, the stack is created by decrementing // its pointer. Once decremented, all references are done with positive // offset from the stack/frame pointer, using StackGrowsUp enables // an easier handling. // Using CodeModel::Large enables different CALL behavior. -MipsTargetMachine:: -MipsTargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, const TargetOptions &Options, - Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL, - bool isLittle) - : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), - Subtarget(TT, CPU, FS, isLittle, RM, this), - DL(computeDataLayout(Subtarget)), - InstrInfo(MipsInstrInfo::create(*this)), - FrameLowering(MipsFrameLowering::create(*this, Subtarget)), - TLInfo(MipsTargetLowering::create(*this)), TSInfo(*this), - InstrItins(Subtarget.getInstrItineraryData()), JITInfo() { +MipsTargetMachine::MipsTargetMachine(const Target &T, StringRef TT, + StringRef CPU, StringRef FS, + const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL, bool isLittle) + : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), + Subtarget(nullptr), DefaultSubtarget(TT, CPU, FS, isLittle, this), + NoMips16Subtarget(TT, CPU, FS.empty() ? "-mips16" : FS.str() + ",-mips16", + isLittle, this), + Mips16Subtarget(TT, CPU, FS.empty() ? 
"+mips16" : FS.str() + ",+mips16", + isLittle, this) { + Subtarget = &DefaultSubtarget; initAsmInfo(); } - -void MipsTargetMachine::setHelperClassesMips16() { - InstrInfoSE.swap(InstrInfo); - FrameLoweringSE.swap(FrameLowering); - TLInfoSE.swap(TLInfo); - if (!InstrInfo16) { - InstrInfo.reset(MipsInstrInfo::create(*this)); - FrameLowering.reset(MipsFrameLowering::create(*this, Subtarget)); - TLInfo.reset(MipsTargetLowering::create(*this)); - } else { - InstrInfo16.swap(InstrInfo); - FrameLowering16.swap(FrameLowering); - TLInfo16.swap(TLInfo); - } - assert(TLInfo && "null target lowering 16"); - assert(InstrInfo && "null instr info 16"); - assert(FrameLowering && "null frame lowering 16"); -} - -void MipsTargetMachine::setHelperClassesMipsSE() { - InstrInfo16.swap(InstrInfo); - FrameLowering16.swap(FrameLowering); - TLInfo16.swap(TLInfo); - if (!InstrInfoSE) { - InstrInfo.reset(MipsInstrInfo::create(*this)); - FrameLowering.reset(MipsFrameLowering::create(*this, Subtarget)); - TLInfo.reset(MipsTargetLowering::create(*this)); - } else { - InstrInfoSE.swap(InstrInfo); - FrameLoweringSE.swap(FrameLowering); - TLInfoSE.swap(TLInfo); - } - assert(TLInfo && "null target lowering in SE"); - assert(InstrInfo && "null instr info SE"); - assert(FrameLowering && "null frame lowering SE"); -} void MipsebTargetMachine::anchor() { } MipsebTargetMachine:: @@ -150,6 +83,23 @@ MipselTargetMachine(const Target &T, StringRef TT, CodeGenOpt::Level OL) : MipsTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {} +void MipsTargetMachine::resetSubtarget(MachineFunction *MF) { + DEBUG(dbgs() << "resetSubtarget\n"); + AttributeSet FnAttrs = MF->getFunction()->getAttributes(); + bool Mips16Attr = FnAttrs.hasAttribute(AttributeSet::FunctionIndex, "mips16"); + bool NoMips16Attr = + FnAttrs.hasAttribute(AttributeSet::FunctionIndex, "nomips16"); + assert(!(Mips16Attr && NoMips16Attr) && + "mips16 and nomips16 specified on the same function"); + if (Mips16Attr) + Subtarget = &Mips16Subtarget; + else if (NoMips16Attr) + Subtarget = &NoMips16Subtarget; + else + Subtarget = &DefaultSubtarget; + return; +} + namespace { /// Mips Code Generator Pass Configuration Options. class MipsPassConfig : public TargetPassConfig { @@ -196,13 +146,9 @@ void MipsPassConfig::addIRPasses() { // Install an instruction selector pass using // the ISelDag to gen Mips code. bool MipsPassConfig::addInstSelector() { - if (getMipsSubtarget().allowMixed16_32()) { - addPass(createMipsModuleISelDag(getMipsTargetMachine())); - addPass(createMips16ISelDag(getMipsTargetMachine())); - addPass(createMipsSEISelDag(getMipsTargetMachine())); - } else { - addPass(createMipsISelDag(getMipsTargetMachine())); - } + addPass(createMipsModuleISelDag(getMipsTargetMachine())); + addPass(createMips16ISelDag(getMipsTargetMachine())); + addPass(createMipsSEISelDag(getMipsTargetMachine())); return false; } @@ -221,7 +167,7 @@ bool MipsPassConfig::addPreRegAlloc() { } void MipsTargetMachine::addAnalysisPasses(PassManagerBase &PM) { - if (Subtarget.allowMixed16_32()) { + if (Subtarget->allowMixed16_32()) { DEBUG(errs() << "No "); //FIXME: The Basic Target Transform Info // pass needs to become a function pass instead of @@ -238,15 +184,9 @@ void MipsTargetMachine::addAnalysisPasses(PassManagerBase &PM) { // print out the code after the passes. 
bool MipsPassConfig::addPreEmitPass() { MipsTargetMachine &TM = getMipsTargetMachine(); - const MipsSubtarget &Subtarget = TM.getSubtarget(); addPass(createMipsDelaySlotFillerPass(TM)); - - if (Subtarget.enableLongBranchPass()) - addPass(createMipsLongBranchPass(TM)); - if (Subtarget.inMips16Mode() || - Subtarget.allowMixed16_32()) - addPass(createMipsConstantIslandPass(TM)); - + addPass(createMipsLongBranchPass(TM)); + addPass(createMipsConstantIslandPass(TM)); return true; } diff --git a/lib/Target/Mips/MipsTargetMachine.h b/lib/Target/Mips/MipsTargetMachine.h index a5aa39bc9f4d..bcf411f9cd6f 100644 --- a/lib/Target/Mips/MipsTargetMachine.h +++ b/lib/Target/Mips/MipsTargetMachine.h @@ -14,15 +14,9 @@ #ifndef MIPSTARGETMACHINE_H #define MIPSTARGETMACHINE_H -#include "MipsFrameLowering.h" -#include "MipsISelLowering.h" -#include "MipsInstrInfo.h" -#include "MipsJITInfo.h" -#include "MipsSelectionDAGInfo.h" #include "MipsSubtarget.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/SelectionDAGISel.h" -#include "llvm/IR/DataLayout.h" #include "llvm/Target/TargetFrameLowering.h" #include "llvm/Target/TargetMachine.h" @@ -31,69 +25,57 @@ class formatted_raw_ostream; class MipsRegisterInfo; class MipsTargetMachine : public LLVMTargetMachine { - MipsSubtarget Subtarget; - const DataLayout DL; // Calculates type size & alignment - std::unique_ptr InstrInfo; - std::unique_ptr FrameLowering; - std::unique_ptr TLInfo; - std::unique_ptr InstrInfo16; - std::unique_ptr FrameLowering16; - std::unique_ptr TLInfo16; - std::unique_ptr InstrInfoSE; - std::unique_ptr FrameLoweringSE; - std::unique_ptr TLInfoSE; - MipsSelectionDAGInfo TSInfo; - const InstrItineraryData &InstrItins; - MipsJITInfo JITInfo; + MipsSubtarget *Subtarget; + MipsSubtarget DefaultSubtarget; + MipsSubtarget NoMips16Subtarget; + MipsSubtarget Mips16Subtarget; public: - MipsTargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, const TargetOptions &Options, - Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL, - bool isLittle); + MipsTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, + const TargetOptions &Options, Reloc::Model RM, + CodeModel::Model CM, CodeGenOpt::Level OL, bool isLittle); virtual ~MipsTargetMachine() {} void addAnalysisPasses(PassManagerBase &PM) override; - const MipsInstrInfo *getInstrInfo() const override - { return InstrInfo.get(); } - const TargetFrameLowering *getFrameLowering() const override - { return FrameLowering.get(); } - const MipsSubtarget *getSubtargetImpl() const override - { return &Subtarget; } - const DataLayout *getDataLayout() const override - { return &DL;} - + const MipsInstrInfo *getInstrInfo() const override { + return getSubtargetImpl()->getInstrInfo(); + } + const TargetFrameLowering *getFrameLowering() const override { + return getSubtargetImpl()->getFrameLowering(); + } + const MipsSubtarget *getSubtargetImpl() const override { + if (Subtarget) + return Subtarget; + return &DefaultSubtarget; + } const InstrItineraryData *getInstrItineraryData() const override { - return Subtarget.inMips16Mode() ? nullptr : &InstrItins; + return Subtarget->inMips16Mode() + ? 
nullptr + : &getSubtargetImpl()->getInstrItineraryData(); + } + MipsJITInfo *getJITInfo() override { + return Subtarget->getJITInfo(); } - - MipsJITInfo *getJITInfo() override { return &JITInfo; } - const MipsRegisterInfo *getRegisterInfo() const override { - return &InstrInfo->getRegisterInfo(); + return getSubtargetImpl()->getRegisterInfo(); } - const MipsTargetLowering *getTargetLowering() const override { - return TLInfo.get(); + return getSubtargetImpl()->getTargetLowering(); + } + const DataLayout *getDataLayout() const override { + return getSubtargetImpl()->getDataLayout(); } - const MipsSelectionDAGInfo* getSelectionDAGInfo() const override { - return &TSInfo; + return getSubtargetImpl()->getSelectionDAGInfo(); } + /// \brief Reset the subtarget for the Mips target. + void resetSubtarget(MachineFunction *MF); // Pass Pipeline Configuration TargetPassConfig *createPassConfig(PassManagerBase &PM) override; bool addCodeEmitter(PassManagerBase &PM, JITCodeEmitter &JCE) override; - - // Set helper classes - void setHelperClassesMips16(); - - void setHelperClassesMipsSE(); - - }; /// MipsebTargetMachine - Mips32/64 big endian target machine. diff --git a/lib/Target/Mips/MipsTargetStreamer.h b/lib/Target/Mips/MipsTargetStreamer.h index 4ad37ac5b12c..99f7d4c92cfb 100644 --- a/lib/Target/Mips/MipsTargetStreamer.h +++ b/lib/Target/Mips/MipsTargetStreamer.h @@ -12,46 +12,83 @@ #include "llvm/MC/MCELFStreamer.h" #include "llvm/MC/MCStreamer.h" +#include "MCTargetDesc/MipsABIFlagsSection.h" namespace llvm { -class MipsTargetStreamer : public MCTargetStreamer { - virtual void anchor(); +struct MipsABIFlagsSection; + +class MipsTargetStreamer : public MCTargetStreamer { public: MipsTargetStreamer(MCStreamer &S); - virtual void emitDirectiveSetMicroMips() = 0; - virtual void emitDirectiveSetNoMicroMips() = 0; - virtual void emitDirectiveSetMips16() = 0; - virtual void emitDirectiveSetNoMips16() = 0; - - virtual void emitDirectiveSetReorder() = 0; - virtual void emitDirectiveSetNoReorder() = 0; - virtual void emitDirectiveSetMacro() = 0; - virtual void emitDirectiveSetNoMacro() = 0; - virtual void emitDirectiveSetAt() = 0; - virtual void emitDirectiveSetNoAt() = 0; - virtual void emitDirectiveEnd(StringRef Name) = 0; - - virtual void emitDirectiveEnt(const MCSymbol &Symbol) = 0; - virtual void emitDirectiveAbiCalls() = 0; - virtual void emitDirectiveNaN2008() = 0; - virtual void emitDirectiveNaNLegacy() = 0; - virtual void emitDirectiveOptionPic0() = 0; - virtual void emitDirectiveOptionPic2() = 0; + virtual void emitDirectiveSetMicroMips(); + virtual void emitDirectiveSetNoMicroMips(); + virtual void emitDirectiveSetMips16(); + virtual void emitDirectiveSetNoMips16(); + + virtual void emitDirectiveSetReorder(); + virtual void emitDirectiveSetNoReorder(); + virtual void emitDirectiveSetMacro(); + virtual void emitDirectiveSetNoMacro(); + virtual void emitDirectiveSetAt(); + virtual void emitDirectiveSetNoAt(); + virtual void emitDirectiveEnd(StringRef Name); + + virtual void emitDirectiveEnt(const MCSymbol &Symbol); + virtual void emitDirectiveAbiCalls(); + virtual void emitDirectiveNaN2008(); + virtual void emitDirectiveNaNLegacy(); + virtual void emitDirectiveOptionPic0(); + virtual void emitDirectiveOptionPic2(); virtual void emitFrame(unsigned StackReg, unsigned StackSize, - unsigned ReturnReg) = 0; - virtual void emitMask(unsigned CPUBitmask, int CPUTopSavedRegOff) = 0; - virtual void emitFMask(unsigned FPUBitmask, int FPUTopSavedRegOff) = 0; + unsigned ReturnReg); + virtual void 
emitMask(unsigned CPUBitmask, int CPUTopSavedRegOff); + virtual void emitFMask(unsigned FPUBitmask, int FPUTopSavedRegOff); - virtual void emitDirectiveSetMips32R2() = 0; - virtual void emitDirectiveSetMips64() = 0; - virtual void emitDirectiveSetMips64R2() = 0; - virtual void emitDirectiveSetDsp() = 0; + virtual void emitDirectiveSetMips32R2(); + virtual void emitDirectiveSetMips64(); + virtual void emitDirectiveSetMips64R2(); + virtual void emitDirectiveSetDsp(); // PIC support - virtual void emitDirectiveCpload(unsigned RegNo) = 0; + virtual void emitDirectiveCpload(unsigned RegNo); virtual void emitDirectiveCpsetup(unsigned RegNo, int RegOrOffset, - const MCSymbol &Sym, bool IsReg) = 0; + const MCSymbol &Sym, bool IsReg); + + /// Emit a '.module fp=value' directive using the given values. + /// Updates the .MIPS.abiflags section + virtual void emitDirectiveModuleFP(MipsABIFlagsSection::FpABIKind Value, + bool Is32BitABI) { + ABIFlagsSection.setFpABI(Value, Is32BitABI); + } + + /// Emit a '.module fp=value' directive using the current values of the + /// .MIPS.abiflags section. + void emitDirectiveModuleFP() { + emitDirectiveModuleFP(ABIFlagsSection.getFpABI(), + ABIFlagsSection.Is32BitABI); + } + + virtual void emitDirectiveModuleOddSPReg(bool Enabled, bool IsO32ABI); + virtual void emitDirectiveSetFp(MipsABIFlagsSection::FpABIKind Value){}; + virtual void emitMipsAbiFlags(){}; + void setCanHaveModuleDir(bool Can) { canHaveModuleDirective = Can; } + bool getCanHaveModuleDir() { return canHaveModuleDirective; } + + // This method enables template classes to set internal abi flags + // structure values. + template + void updateABIInfo(const PredicateLibrary &P) { + ABIFlagsSection.setAllFromPredicates(P); + } + + MipsABIFlagsSection &getABIFlagsSection() { return ABIFlagsSection; } + +protected: + MipsABIFlagsSection ABIFlagsSection; + +private: + bool canHaveModuleDirective; }; // This part is for ascii assembly output @@ -93,6 +130,13 @@ class MipsTargetAsmStreamer : public MipsTargetStreamer { virtual void emitDirectiveCpload(unsigned RegNo); void emitDirectiveCpsetup(unsigned RegNo, int RegOrOffset, const MCSymbol &Sym, bool IsReg) override; + + // ABI Flags + void emitDirectiveModuleFP(MipsABIFlagsSection::FpABIKind Value, + bool Is32BitABI) override; + void emitDirectiveModuleOddSPReg(bool Enabled, bool IsO32ABI) override; + void emitDirectiveSetFp(MipsABIFlagsSection::FpABIKind Value) override; + void emitMipsAbiFlags() override; }; // This part is for ELF object output @@ -144,6 +188,10 @@ class MipsTargetELFStreamer : public MipsTargetStreamer { void emitDirectiveCpsetup(unsigned RegNo, int RegOrOffset, const MCSymbol &Sym, bool IsReg) override; + // ABI Flags + void emitDirectiveModuleOddSPReg(bool Enabled, bool IsO32ABI) override; + void emitMipsAbiFlags() override; + protected: bool isO32() const { return STI.getFeatureBits() & Mips::FeatureO32; } bool isN32() const { return STI.getFeatureBits() & Mips::FeatureN32; } diff --git a/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp b/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp index 961889685661..80b2f621fb94 100644 --- a/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp +++ b/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp @@ -57,13 +57,13 @@ void NVPTXInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { OS << "%r"; break; case 4: - OS << "%rl"; + OS << "%rd"; break; case 5: OS << "%f"; break; case 6: - OS << "%fl"; + OS << "%fd"; break; } diff --git a/lib/Target/NVPTX/LLVMBuild.txt 
b/lib/Target/NVPTX/LLVMBuild.txt index e805aba58aab..bc8d82e022f6 100644 --- a/lib/Target/NVPTX/LLVMBuild.txt +++ b/lib/Target/NVPTX/LLVMBuild.txt @@ -28,5 +28,5 @@ has_asmprinter = 1 type = Library name = NVPTXCodeGen parent = NVPTX -required_libraries = Analysis AsmPrinter CodeGen Core MC NVPTXAsmPrinter NVPTXDesc NVPTXInfo SelectionDAG Support Target +required_libraries = Analysis AsmPrinter CodeGen Core MC NVPTXAsmPrinter NVPTXDesc NVPTXInfo Scalar SelectionDAG Support Target add_to_library_groups = NVPTX diff --git a/lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h b/lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h index ddb122f65b5a..16ec19c25f16 100644 --- a/lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h +++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h @@ -84,6 +84,17 @@ __attribute__((unused)) #endif static const char *NamedMDForAnnotations = "nvvm.annotations"; +namespace NVPTXII { +enum { + // These must be kept in sync with TSFlags in NVPTXInstrFormats.td + IsTexFlag = 0x80, + IsSuldMask = 0x300, + IsSuldShift = 8, + IsSustFlag = 0x400, + IsSurfTexQueryFlag = 0x800, + IsTexModeUnifiedFlag = 0x1000 +}; +} } #endif diff --git a/lib/Target/NVPTX/NVPTX.td b/lib/Target/NVPTX/NVPTX.td index d78b4e81a3e5..93fabf615369 100644 --- a/lib/Target/NVPTX/NVPTX.td +++ b/lib/Target/NVPTX/NVPTX.td @@ -34,12 +34,18 @@ def SM30 : SubtargetFeature<"sm_30", "SmVersion", "30", "Target SM 3.0">; def SM35 : SubtargetFeature<"sm_35", "SmVersion", "35", "Target SM 3.5">; +def SM50 : SubtargetFeature<"sm_50", "SmVersion", "50", + "Target SM 5.0">; // PTX Versions def PTX30 : SubtargetFeature<"ptx30", "PTXVersion", "30", "Use PTX version 3.0">; def PTX31 : SubtargetFeature<"ptx31", "PTXVersion", "31", "Use PTX version 3.1">; +def PTX32 : SubtargetFeature<"ptx32", "PTXVersion", "32", + "Use PTX version 3.2">; +def PTX40 : SubtargetFeature<"ptx40", "PTXVersion", "40", + "Use PTX version 4.0">; //===----------------------------------------------------------------------===// // NVPTX supported processors. 
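
[Editor's note] The NVPTXII TSFlags bits added above let the asm printer (see the rewritten lowerImageHandleOperand below) test instruction properties instead of enumerating every texture/surface opcode. A self-contained sketch of the decoding follows; the constant values are copied from the enum, but suldSurfrefOperand is an illustrative helper name, not an LLVM function.

// Sketch of the TSFlags decoding used by lowerImageHandleOperand.
#include <cassert>
#include <cstdint>

enum : uint64_t {
  IsTexFlag = 0x80,
  IsSuldMask = 0x300,
  IsSuldShift = 8,
  IsSustFlag = 0x400,
};

// A surface load stores its vector width in the two IsSuldMask bits
// (field value 1, 2, 3 -> V1, V2, V4 loads). For a load of vector size N
// the Nth operand is the surfref, so the operand index equals the width.
unsigned suldSurfrefOperand(uint64_t TSFlags) {
  assert((TSFlags & IsSuldMask) && "not a surface load");
  return 1u << (((TSFlags & IsSuldMask) >> IsSuldShift) - 1);
}

int main() {
  assert(suldSurfrefOperand(0x100) == 1); // V1 load: operand 1 is the surfref
  assert(suldSurfrefOperand(0x200) == 2); // V2 load: operand 2
  assert(suldSurfrefOperand(0x300) == 4); // V4 load: operand 4
  return 0;
}
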
@@ -52,6 +58,7 @@ def : Proc<"sm_20", [SM20]>; def : Proc<"sm_21", [SM21]>; def : Proc<"sm_30", [SM30]>; def : Proc<"sm_35", [SM35]>; +def : Proc<"sm_50", [SM50]>; def NVPTXInstrInfo : InstrInfo { diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp index 195b3c0fe9ed..187b88c1d54a 100644 --- a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp +++ b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp @@ -330,253 +330,51 @@ void NVPTXAsmPrinter::EmitInstruction(const MachineInstr *MI) { bool NVPTXAsmPrinter::lowerImageHandleOperand(const MachineInstr *MI, unsigned OpNo, MCOperand &MCOp) { const MachineOperand &MO = MI->getOperand(OpNo); + const MCInstrDesc &MCID = MI->getDesc(); - switch (MI->getOpcode()) { - default: return false; - case NVPTX::TEX_1D_F32_I32: - case NVPTX::TEX_1D_F32_F32: - case NVPTX::TEX_1D_F32_F32_LEVEL: - case NVPTX::TEX_1D_F32_F32_GRAD: - case NVPTX::TEX_1D_I32_I32: - case NVPTX::TEX_1D_I32_F32: - case NVPTX::TEX_1D_I32_F32_LEVEL: - case NVPTX::TEX_1D_I32_F32_GRAD: - case NVPTX::TEX_1D_ARRAY_F32_I32: - case NVPTX::TEX_1D_ARRAY_F32_F32: - case NVPTX::TEX_1D_ARRAY_F32_F32_LEVEL: - case NVPTX::TEX_1D_ARRAY_F32_F32_GRAD: - case NVPTX::TEX_1D_ARRAY_I32_I32: - case NVPTX::TEX_1D_ARRAY_I32_F32: - case NVPTX::TEX_1D_ARRAY_I32_F32_LEVEL: - case NVPTX::TEX_1D_ARRAY_I32_F32_GRAD: - case NVPTX::TEX_2D_F32_I32: - case NVPTX::TEX_2D_F32_F32: - case NVPTX::TEX_2D_F32_F32_LEVEL: - case NVPTX::TEX_2D_F32_F32_GRAD: - case NVPTX::TEX_2D_I32_I32: - case NVPTX::TEX_2D_I32_F32: - case NVPTX::TEX_2D_I32_F32_LEVEL: - case NVPTX::TEX_2D_I32_F32_GRAD: - case NVPTX::TEX_2D_ARRAY_F32_I32: - case NVPTX::TEX_2D_ARRAY_F32_F32: - case NVPTX::TEX_2D_ARRAY_F32_F32_LEVEL: - case NVPTX::TEX_2D_ARRAY_F32_F32_GRAD: - case NVPTX::TEX_2D_ARRAY_I32_I32: - case NVPTX::TEX_2D_ARRAY_I32_F32: - case NVPTX::TEX_2D_ARRAY_I32_F32_LEVEL: - case NVPTX::TEX_2D_ARRAY_I32_F32_GRAD: - case NVPTX::TEX_3D_F32_I32: - case NVPTX::TEX_3D_F32_F32: - case NVPTX::TEX_3D_F32_F32_LEVEL: - case NVPTX::TEX_3D_F32_F32_GRAD: - case NVPTX::TEX_3D_I32_I32: - case NVPTX::TEX_3D_I32_F32: - case NVPTX::TEX_3D_I32_F32_LEVEL: - case NVPTX::TEX_3D_I32_F32_GRAD: - { + if (MCID.TSFlags & NVPTXII::IsTexFlag) { // This is a texture fetch, so operand 4 is a texref and operand 5 is // a samplerref - if (OpNo == 4) { + if (OpNo == 4 && MO.isImm()) { lowerImageHandleSymbol(MO.getImm(), MCOp); return true; } - if (OpNo == 5) { + if (OpNo == 5 && MO.isImm() && !(MCID.TSFlags & NVPTXII::IsTexModeUnifiedFlag)) { lowerImageHandleSymbol(MO.getImm(), MCOp); return true; } return false; - } - case NVPTX::SULD_1D_I8_TRAP: - case NVPTX::SULD_1D_I16_TRAP: - case NVPTX::SULD_1D_I32_TRAP: - case NVPTX::SULD_1D_ARRAY_I8_TRAP: - case NVPTX::SULD_1D_ARRAY_I16_TRAP: - case NVPTX::SULD_1D_ARRAY_I32_TRAP: - case NVPTX::SULD_2D_I8_TRAP: - case NVPTX::SULD_2D_I16_TRAP: - case NVPTX::SULD_2D_I32_TRAP: - case NVPTX::SULD_2D_ARRAY_I8_TRAP: - case NVPTX::SULD_2D_ARRAY_I16_TRAP: - case NVPTX::SULD_2D_ARRAY_I32_TRAP: - case NVPTX::SULD_3D_I8_TRAP: - case NVPTX::SULD_3D_I16_TRAP: - case NVPTX::SULD_3D_I32_TRAP: { - // This is a V1 surface load, so operand 1 is a surfref - if (OpNo == 1) { - lowerImageHandleSymbol(MO.getImm(), MCOp); - return true; - } + } else if (MCID.TSFlags & NVPTXII::IsSuldMask) { + unsigned VecSize = + 1 << (((MCID.TSFlags & NVPTXII::IsSuldMask) >> NVPTXII::IsSuldShift) - 1); - return false; - } - case NVPTX::SULD_1D_V2I8_TRAP: - case NVPTX::SULD_1D_V2I16_TRAP: - case NVPTX::SULD_1D_V2I32_TRAP: - case NVPTX::SULD_1D_ARRAY_V2I8_TRAP: - case 
NVPTX::SULD_1D_ARRAY_V2I16_TRAP: - case NVPTX::SULD_1D_ARRAY_V2I32_TRAP: - case NVPTX::SULD_2D_V2I8_TRAP: - case NVPTX::SULD_2D_V2I16_TRAP: - case NVPTX::SULD_2D_V2I32_TRAP: - case NVPTX::SULD_2D_ARRAY_V2I8_TRAP: - case NVPTX::SULD_2D_ARRAY_V2I16_TRAP: - case NVPTX::SULD_2D_ARRAY_V2I32_TRAP: - case NVPTX::SULD_3D_V2I8_TRAP: - case NVPTX::SULD_3D_V2I16_TRAP: - case NVPTX::SULD_3D_V2I32_TRAP: { - // This is a V2 surface load, so operand 2 is a surfref - if (OpNo == 2) { + // For a surface load of vector size N, the Nth operand will be the surfref + if (OpNo == VecSize && MO.isImm()) { lowerImageHandleSymbol(MO.getImm(), MCOp); return true; } return false; - } - case NVPTX::SULD_1D_V4I8_TRAP: - case NVPTX::SULD_1D_V4I16_TRAP: - case NVPTX::SULD_1D_V4I32_TRAP: - case NVPTX::SULD_1D_ARRAY_V4I8_TRAP: - case NVPTX::SULD_1D_ARRAY_V4I16_TRAP: - case NVPTX::SULD_1D_ARRAY_V4I32_TRAP: - case NVPTX::SULD_2D_V4I8_TRAP: - case NVPTX::SULD_2D_V4I16_TRAP: - case NVPTX::SULD_2D_V4I32_TRAP: - case NVPTX::SULD_2D_ARRAY_V4I8_TRAP: - case NVPTX::SULD_2D_ARRAY_V4I16_TRAP: - case NVPTX::SULD_2D_ARRAY_V4I32_TRAP: - case NVPTX::SULD_3D_V4I8_TRAP: - case NVPTX::SULD_3D_V4I16_TRAP: - case NVPTX::SULD_3D_V4I32_TRAP: { - // This is a V4 surface load, so operand 4 is a surfref - if (OpNo == 4) { - lowerImageHandleSymbol(MO.getImm(), MCOp); - return true; - } - - return false; - } - case NVPTX::SUST_B_1D_B8_TRAP: - case NVPTX::SUST_B_1D_B16_TRAP: - case NVPTX::SUST_B_1D_B32_TRAP: - case NVPTX::SUST_B_1D_V2B8_TRAP: - case NVPTX::SUST_B_1D_V2B16_TRAP: - case NVPTX::SUST_B_1D_V2B32_TRAP: - case NVPTX::SUST_B_1D_V4B8_TRAP: - case NVPTX::SUST_B_1D_V4B16_TRAP: - case NVPTX::SUST_B_1D_V4B32_TRAP: - case NVPTX::SUST_B_1D_ARRAY_B8_TRAP: - case NVPTX::SUST_B_1D_ARRAY_B16_TRAP: - case NVPTX::SUST_B_1D_ARRAY_B32_TRAP: - case NVPTX::SUST_B_1D_ARRAY_V2B8_TRAP: - case NVPTX::SUST_B_1D_ARRAY_V2B16_TRAP: - case NVPTX::SUST_B_1D_ARRAY_V2B32_TRAP: - case NVPTX::SUST_B_1D_ARRAY_V4B8_TRAP: - case NVPTX::SUST_B_1D_ARRAY_V4B16_TRAP: - case NVPTX::SUST_B_1D_ARRAY_V4B32_TRAP: - case NVPTX::SUST_B_2D_B8_TRAP: - case NVPTX::SUST_B_2D_B16_TRAP: - case NVPTX::SUST_B_2D_B32_TRAP: - case NVPTX::SUST_B_2D_V2B8_TRAP: - case NVPTX::SUST_B_2D_V2B16_TRAP: - case NVPTX::SUST_B_2D_V2B32_TRAP: - case NVPTX::SUST_B_2D_V4B8_TRAP: - case NVPTX::SUST_B_2D_V4B16_TRAP: - case NVPTX::SUST_B_2D_V4B32_TRAP: - case NVPTX::SUST_B_2D_ARRAY_B8_TRAP: - case NVPTX::SUST_B_2D_ARRAY_B16_TRAP: - case NVPTX::SUST_B_2D_ARRAY_B32_TRAP: - case NVPTX::SUST_B_2D_ARRAY_V2B8_TRAP: - case NVPTX::SUST_B_2D_ARRAY_V2B16_TRAP: - case NVPTX::SUST_B_2D_ARRAY_V2B32_TRAP: - case NVPTX::SUST_B_2D_ARRAY_V4B8_TRAP: - case NVPTX::SUST_B_2D_ARRAY_V4B16_TRAP: - case NVPTX::SUST_B_2D_ARRAY_V4B32_TRAP: - case NVPTX::SUST_B_3D_B8_TRAP: - case NVPTX::SUST_B_3D_B16_TRAP: - case NVPTX::SUST_B_3D_B32_TRAP: - case NVPTX::SUST_B_3D_V2B8_TRAP: - case NVPTX::SUST_B_3D_V2B16_TRAP: - case NVPTX::SUST_B_3D_V2B32_TRAP: - case NVPTX::SUST_B_3D_V4B8_TRAP: - case NVPTX::SUST_B_3D_V4B16_TRAP: - case NVPTX::SUST_B_3D_V4B32_TRAP: - case NVPTX::SUST_P_1D_B8_TRAP: - case NVPTX::SUST_P_1D_B16_TRAP: - case NVPTX::SUST_P_1D_B32_TRAP: - case NVPTX::SUST_P_1D_V2B8_TRAP: - case NVPTX::SUST_P_1D_V2B16_TRAP: - case NVPTX::SUST_P_1D_V2B32_TRAP: - case NVPTX::SUST_P_1D_V4B8_TRAP: - case NVPTX::SUST_P_1D_V4B16_TRAP: - case NVPTX::SUST_P_1D_V4B32_TRAP: - case NVPTX::SUST_P_1D_ARRAY_B8_TRAP: - case NVPTX::SUST_P_1D_ARRAY_B16_TRAP: - case NVPTX::SUST_P_1D_ARRAY_B32_TRAP: - case NVPTX::SUST_P_1D_ARRAY_V2B8_TRAP: - case 
NVPTX::SUST_P_1D_ARRAY_V2B16_TRAP: - case NVPTX::SUST_P_1D_ARRAY_V2B32_TRAP: - case NVPTX::SUST_P_1D_ARRAY_V4B8_TRAP: - case NVPTX::SUST_P_1D_ARRAY_V4B16_TRAP: - case NVPTX::SUST_P_1D_ARRAY_V4B32_TRAP: - case NVPTX::SUST_P_2D_B8_TRAP: - case NVPTX::SUST_P_2D_B16_TRAP: - case NVPTX::SUST_P_2D_B32_TRAP: - case NVPTX::SUST_P_2D_V2B8_TRAP: - case NVPTX::SUST_P_2D_V2B16_TRAP: - case NVPTX::SUST_P_2D_V2B32_TRAP: - case NVPTX::SUST_P_2D_V4B8_TRAP: - case NVPTX::SUST_P_2D_V4B16_TRAP: - case NVPTX::SUST_P_2D_V4B32_TRAP: - case NVPTX::SUST_P_2D_ARRAY_B8_TRAP: - case NVPTX::SUST_P_2D_ARRAY_B16_TRAP: - case NVPTX::SUST_P_2D_ARRAY_B32_TRAP: - case NVPTX::SUST_P_2D_ARRAY_V2B8_TRAP: - case NVPTX::SUST_P_2D_ARRAY_V2B16_TRAP: - case NVPTX::SUST_P_2D_ARRAY_V2B32_TRAP: - case NVPTX::SUST_P_2D_ARRAY_V4B8_TRAP: - case NVPTX::SUST_P_2D_ARRAY_V4B16_TRAP: - case NVPTX::SUST_P_2D_ARRAY_V4B32_TRAP: - case NVPTX::SUST_P_3D_B8_TRAP: - case NVPTX::SUST_P_3D_B16_TRAP: - case NVPTX::SUST_P_3D_B32_TRAP: - case NVPTX::SUST_P_3D_V2B8_TRAP: - case NVPTX::SUST_P_3D_V2B16_TRAP: - case NVPTX::SUST_P_3D_V2B32_TRAP: - case NVPTX::SUST_P_3D_V4B8_TRAP: - case NVPTX::SUST_P_3D_V4B16_TRAP: - case NVPTX::SUST_P_3D_V4B32_TRAP: { + } else if (MCID.TSFlags & NVPTXII::IsSustFlag) { // This is a surface store, so operand 0 is a surfref - if (OpNo == 0) { + if (OpNo == 0 && MO.isImm()) { lowerImageHandleSymbol(MO.getImm(), MCOp); return true; } return false; - } - case NVPTX::TXQ_CHANNEL_ORDER: - case NVPTX::TXQ_CHANNEL_DATA_TYPE: - case NVPTX::TXQ_WIDTH: - case NVPTX::TXQ_HEIGHT: - case NVPTX::TXQ_DEPTH: - case NVPTX::TXQ_ARRAY_SIZE: - case NVPTX::TXQ_NUM_SAMPLES: - case NVPTX::TXQ_NUM_MIPMAP_LEVELS: - case NVPTX::SUQ_CHANNEL_ORDER: - case NVPTX::SUQ_CHANNEL_DATA_TYPE: - case NVPTX::SUQ_WIDTH: - case NVPTX::SUQ_HEIGHT: - case NVPTX::SUQ_DEPTH: - case NVPTX::SUQ_ARRAY_SIZE: { + } else if (MCID.TSFlags & NVPTXII::IsSurfTexQueryFlag) { // This is a query, so operand 1 is a surfref/texref - if (OpNo == 1) { + if (OpNo == 1 && MO.isImm()) { lowerImageHandleSymbol(MO.getImm(), MCOp); return true; } return false; } - } + + return false; } void NVPTXAsmPrinter::lowerImageHandleSymbol(unsigned Index, MCOperand &MCOp) { @@ -734,23 +532,7 @@ void NVPTXAsmPrinter::printReturnValStr(const Function *F, raw_ostream &O) { << " func_retval0"; } else { if ((Ty->getTypeID() == Type::StructTyID) || isa(Ty)) { - SmallVector vtparts; - ComputeValueVTs(*TLI, Ty, vtparts); - unsigned totalsz = 0; - for (unsigned i = 0, e = vtparts.size(); i != e; ++i) { - unsigned elems = 1; - EVT elemtype = vtparts[i]; - if (vtparts[i].isVector()) { - elems = vtparts[i].getVectorNumElements(); - elemtype = vtparts[i].getVectorElementType(); - } - for (unsigned j = 0, je = elems; j != je; ++j) { - unsigned sz = elemtype.getSizeInBits(); - if (elemtype.isInteger() && (sz < 8)) - sz = 8; - totalsz += sz / 8; - } - } + unsigned totalsz = TD->getTypeAllocSize(Ty); unsigned retAlignment = 0; if (!llvm::getAlign(*F, 0, retAlignment)) retAlignment = TD->getABITypeAlignment(Ty); @@ -1321,6 +1103,10 @@ bool NVPTXAsmPrinter::doFinalization(Module &M) { // external global variable with init -> .visible // external without init -> .extern // appending -> not allowed, assert. +// for any linkage other than +// internal, private, linker_private, +// linker_private_weak, linker_private_weak_def_auto, +// we emit -> .weak. 
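
[Editor's note] The comment above fully determines the directive choice. As a hedged standalone sketch of that mapping (the enum and helper are illustrative; the real code queries the GlobalValue::has*Linkage() predicates, as in the function that follows):

// Sketch of the linkage -> PTX directive mapping described above.
#include <stdexcept>
#include <string>

enum class Linkage { External, Internal, Private, Appending, Weak, LinkOnce, Common };

std::string ptxLinkageDirective(Linkage L, bool IsDefinition) {
  switch (L) {
  case Linkage::External:
    return IsDefinition ? ".visible " : ".extern ";
  case Linkage::Internal:
  case Linkage::Private:
    return ""; // module-local: no directive is emitted
  case Linkage::Appending:
    throw std::runtime_error("appending linkage is not allowed in PTX");
  default: // weak, linkonce, common, available_externally, ...
    return ".weak ";
  }
}

int main() {
  // An external definition is .visible; every non-local linkage other
  // than internal/private (and the rejected appending) degrades to .weak.
  return ptxLinkageDirective(Linkage::External, true) == ".visible " &&
                 ptxLinkageDirective(Linkage::LinkOnce, true) == ".weak "
             ? 0
             : 1;
}
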
void NVPTXAsmPrinter::emitLinkageDirective(const GlobalValue *V, raw_ostream &O) {
@@ -1346,6 +1132,9 @@ void NVPTXAsmPrinter::emitLinkageDirective(const GlobalValue *V,
       msg.append(V->getName().str());
       msg.append(" has unsupported appending linkage type");
       llvm_unreachable(msg.c_str());
+    } else if (!V->hasInternalLinkage() &&
+               !V->hasPrivateLinkage()) {
+      O << ".weak ";
     }
   }
 }
@@ -1360,6 +1149,11 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar,
     return;
   }
+
+  // Skip LLVM intrinsic global variables
+  if (GVar->getName().startswith("llvm.") ||
+      GVar->getName().startswith("nvvm."))
+    return;
+
   const DataLayout *TD = TM.getDataLayout();

   // GlobalVariables are always constant pointers themselves.
@@ -1371,6 +1165,10 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar,
       O << ".visible ";
     else
       O << ".extern ";
+  } else if (GVar->hasLinkOnceLinkage() || GVar->hasWeakLinkage() ||
+             GVar->hasAvailableExternallyLinkage() ||
+             GVar->hasCommonLinkage()) {
+    O << ".weak ";
   }

   if (llvm::isTexture(*GVar)) {
@@ -1480,6 +1278,11 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar,
     O << ".";
     emitPTXAddressSpace(PTy->getAddressSpace(), O);
+
+    if (isManaged(*GVar)) {
+      O << " .attribute(.managed)";
+    }
+
     if (GVar->getAlignment() == 0)
       O << " .align " << (int) TD->getPrefTypeAlignment(ETy);
     else
@@ -1497,13 +1300,24 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar,
   // PTX allows variable initialization only for constant and global state
   // spaces.
-  if (((PTy->getAddressSpace() == llvm::ADDRESS_SPACE_GLOBAL) ||
-       (PTy->getAddressSpace() == llvm::ADDRESS_SPACE_CONST)) &&
-      GVar->hasInitializer()) {
-    const Constant *Initializer = GVar->getInitializer();
-    if (!Initializer->isNullValue()) {
-      O << " = ";
-      printScalarConstant(Initializer, O);
+  if (GVar->hasInitializer()) {
+    if ((PTy->getAddressSpace() == llvm::ADDRESS_SPACE_GLOBAL) ||
+        (PTy->getAddressSpace() == llvm::ADDRESS_SPACE_CONST)) {
+      const Constant *Initializer = GVar->getInitializer();
+      // 'undef' is treated as if no value was specified.
+      if (!Initializer->isNullValue() && !isa<UndefValue>(Initializer)) {
+        O << " = ";
+        printScalarConstant(Initializer, O);
+      }
+    } else {
+      // The frontend adds a zero-initializer to variables that don't have an
+      // initial value, so skip the warning for this case.
+ if (!GVar->getInitializer()->isNullValue()) { + std::string warnMsg = "initial value of '" + GVar->getName().str() + + "' is not allowed in addrspace(" + + llvm::utostr_32(PTy->getAddressSpace()) + ")"; + report_fatal_error(warnMsg.c_str()); + } } } } else { @@ -1994,9 +1808,9 @@ void NVPTXAsmPrinter::setAndEmitFunctionVirtualRegisters( // O << "\t.reg .s16 %rc<" << NVPTXNumRegisters << ">;\n"; // O << "\t.reg .s16 %rs<" << NVPTXNumRegisters << ">;\n"; // O << "\t.reg .s32 %r<" << NVPTXNumRegisters << ">;\n"; - // O << "\t.reg .s64 %rl<" << NVPTXNumRegisters << ">;\n"; + // O << "\t.reg .s64 %rd<" << NVPTXNumRegisters << ">;\n"; // O << "\t.reg .f32 %f<" << NVPTXNumRegisters << ">;\n"; - // O << "\t.reg .f64 %fl<" << NVPTXNumRegisters << ">;\n"; + // O << "\t.reg .f64 %fd<" << NVPTXNumRegisters << ">;\n"; // Emit declaration of the virtual registers or 'physical' registers for // each register class diff --git a/lib/Target/NVPTX/NVPTXFrameLowering.cpp b/lib/Target/NVPTX/NVPTXFrameLowering.cpp index 9030584f06fc..8b088412dbba 100644 --- a/lib/Target/NVPTX/NVPTXFrameLowering.cpp +++ b/lib/Target/NVPTX/NVPTXFrameLowering.cpp @@ -26,6 +26,10 @@ using namespace llvm; +NVPTXFrameLowering::NVPTXFrameLowering(NVPTXSubtarget &STI) + : TargetFrameLowering(TargetFrameLowering::StackGrowsUp, 8, 0), + is64bit(STI.is64Bit()) {} + bool NVPTXFrameLowering::hasFP(const MachineFunction &MF) const { return true; } void NVPTXFrameLowering::emitPrologue(MachineFunction &MF) const { @@ -43,17 +47,21 @@ void NVPTXFrameLowering::emitPrologue(MachineFunction &MF) const { // cvta.local %SP, %SPL; if (is64bit) { unsigned LocalReg = MRI.createVirtualRegister(&NVPTX::Int64RegsRegClass); - MachineInstr *MI = BuildMI( - MBB, MBBI, dl, tm.getInstrInfo()->get(NVPTX::cvta_local_yes_64), - NVPTX::VRFrame).addReg(LocalReg); - BuildMI(MBB, MI, dl, tm.getInstrInfo()->get(NVPTX::MOV_DEPOT_ADDR_64), + MachineInstr *MI = + BuildMI(MBB, MBBI, dl, + MF.getTarget().getInstrInfo()->get(NVPTX::cvta_local_yes_64), + NVPTX::VRFrame).addReg(LocalReg); + BuildMI(MBB, MI, dl, + MF.getTarget().getInstrInfo()->get(NVPTX::MOV_DEPOT_ADDR_64), LocalReg).addImm(MF.getFunctionNumber()); } else { unsigned LocalReg = MRI.createVirtualRegister(&NVPTX::Int32RegsRegClass); - MachineInstr *MI = BuildMI( - MBB, MBBI, dl, tm.getInstrInfo()->get(NVPTX::cvta_local_yes), - NVPTX::VRFrame).addReg(LocalReg); - BuildMI(MBB, MI, dl, tm.getInstrInfo()->get(NVPTX::MOV_DEPOT_ADDR), + MachineInstr *MI = + BuildMI(MBB, MBBI, dl, + MF.getTarget().getInstrInfo()->get(NVPTX::cvta_local_yes), + NVPTX::VRFrame).addReg(LocalReg); + BuildMI(MBB, MI, dl, + MF.getTarget().getInstrInfo()->get(NVPTX::MOV_DEPOT_ADDR), LocalReg).addImm(MF.getFunctionNumber()); } } diff --git a/lib/Target/NVPTX/NVPTXFrameLowering.h b/lib/Target/NVPTX/NVPTXFrameLowering.h index 2ae6d72720e2..56fb673de0eb 100644 --- a/lib/Target/NVPTX/NVPTXFrameLowering.h +++ b/lib/Target/NVPTX/NVPTXFrameLowering.h @@ -17,16 +17,12 @@ #include "llvm/Target/TargetFrameLowering.h" namespace llvm { -class NVPTXTargetMachine; - +class NVPTXSubtarget; class NVPTXFrameLowering : public TargetFrameLowering { - NVPTXTargetMachine &tm; bool is64bit; public: - explicit NVPTXFrameLowering(NVPTXTargetMachine &_tm, bool _is64bit) - : TargetFrameLowering(TargetFrameLowering::StackGrowsUp, 8, 0), tm(_tm), - is64bit(_is64bit) {} + explicit NVPTXFrameLowering(NVPTXSubtarget &STI); bool hasFP(const MachineFunction &MF) const override; void emitPrologue(MachineFunction &MF) const override; diff --git 
a/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp b/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp index 023dd5e48c4a..faa9fdb424b6 100644 --- a/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp +++ b/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp @@ -84,7 +84,7 @@ bool GenericToNVVM::runOnModule(Module &M) { GlobalVariable *GV = I++; if (GV->getType()->getAddressSpace() == llvm::ADDRESS_SPACE_GENERIC && !llvm::isTexture(*GV) && !llvm::isSurface(*GV) && - !GV->getName().startswith("llvm.")) { + !llvm::isSampler(*GV) && !GV->getName().startswith("llvm.")) { GlobalVariable *NewGV = new GlobalVariable( M, GV->getType()->getElementType(), GV->isConstant(), GV->getLinkage(), diff --git a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index cd308806c36a..aeda71ff0e90 100644 --- a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -24,12 +24,6 @@ using namespace llvm; #define DEBUG_TYPE "nvptx-isel" -static cl::opt -FMAContractLevel("nvptx-fma-level", cl::ZeroOrMore, cl::Hidden, - cl::desc("NVPTX Specific: FMA contraction (0: don't do it" - " 1: do it 2: do it aggressively"), - cl::init(2)); - static cl::opt UsePrecDivF32( "nvptx-prec-divf32", cl::ZeroOrMore, cl::Hidden, cl::desc("NVPTX Specifies: 0 use div.approx, 1 use div.full, 2 use" @@ -58,16 +52,6 @@ NVPTXDAGToDAGISel::NVPTXDAGToDAGISel(NVPTXTargetMachine &tm, CodeGenOpt::Level OptLevel) : SelectionDAGISel(tm, OptLevel), Subtarget(tm.getSubtarget()) { - - doFMAF32 = (OptLevel > 0) && Subtarget.hasFMAF32() && (FMAContractLevel >= 1); - doFMAF64 = (OptLevel > 0) && Subtarget.hasFMAF64() && (FMAContractLevel >= 1); - doFMAF32AGG = - (OptLevel > 0) && Subtarget.hasFMAF32() && (FMAContractLevel == 2); - doFMAF64AGG = - (OptLevel > 0) && Subtarget.hasFMAF64() && (FMAContractLevel == 2); - - allowFMA = (FMAContractLevel >= 1); - doMulWide = (OptLevel > 0); } @@ -113,6 +97,11 @@ bool NVPTXDAGToDAGISel::useF32FTZ() const { } } +bool NVPTXDAGToDAGISel::allowFMA() const { + const NVPTXTargetLowering *TL = (NVPTXTargetLowering *)getTargetLowering(); + return TL->allowFMA(*MF, OptLevel); +} + /// Select - Select instructions not customized! Used for /// expanded, promoted and normal instructions. 
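
[Editor's note] With the FMAContractLevel flag removed from the DAG selector above, FMA contraction is now decided per function by the target lowering at query time instead of being cached at ISel construction. A simplified sketch of that pattern follows; the names and the FMAAllowedByAttrs field are stand-ins for the real per-function checks inside NVPTXTargetLowering::allowFMA.

// Sketch: query the target lowering per function instead of caching
// doFMAF32/doFMAF64-style booleans in the selector's constructor.
#include <iostream>

enum class OptLevel { None, Default };

struct Function {
  bool FMAAllowedByAttrs; // stands in for the real fast-math/attribute checks
};

struct TargetLowering {
  // Per-function decision, mirroring allowFMA(MF, OptLevel).
  bool allowFMA(const Function &F, OptLevel OL) const {
    return OL != OptLevel::None && F.FMAAllowedByAttrs;
  }
};

struct Selector {
  const TargetLowering &TL;
  OptLevel OL;
  // No cached state: the answer can differ from function to function.
  bool allowFMA(const Function &F) const { return TL.allowFMA(F, OL); }
};

int main() {
  TargetLowering TL;
  Selector S{TL, OptLevel::Default};
  std::cout << S.allowFMA(Function{true}) << S.allowFMA(Function{false})
            << "\n"; // prints "10"
  return 0;
}
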
SDNode *NVPTXDAGToDAGISel::Select(SDNode *N) { @@ -138,7 +127,7 @@ SDNode *NVPTXDAGToDAGISel::Select(SDNode *N) { case NVPTXISD::LDGV4: case NVPTXISD::LDUV2: case NVPTXISD::LDUV4: - ResNode = SelectLDGLDUVector(N); + ResNode = SelectLDGLDU(N); break; case NVPTXISD::StoreV2: case NVPTXISD::StoreV4: @@ -164,95 +153,352 @@ SDNode *NVPTXDAGToDAGISel::Select(SDNode *N) { case ISD::INTRINSIC_WO_CHAIN: ResNode = SelectIntrinsicNoChain(N); break; - case NVPTXISD::Tex1DFloatI32: + case ISD::INTRINSIC_W_CHAIN: + ResNode = SelectIntrinsicChain(N); + break; + case NVPTXISD::Tex1DFloatS32: case NVPTXISD::Tex1DFloatFloat: case NVPTXISD::Tex1DFloatFloatLevel: case NVPTXISD::Tex1DFloatFloatGrad: - case NVPTXISD::Tex1DI32I32: - case NVPTXISD::Tex1DI32Float: - case NVPTXISD::Tex1DI32FloatLevel: - case NVPTXISD::Tex1DI32FloatGrad: - case NVPTXISD::Tex1DArrayFloatI32: + case NVPTXISD::Tex1DS32S32: + case NVPTXISD::Tex1DS32Float: + case NVPTXISD::Tex1DS32FloatLevel: + case NVPTXISD::Tex1DS32FloatGrad: + case NVPTXISD::Tex1DU32S32: + case NVPTXISD::Tex1DU32Float: + case NVPTXISD::Tex1DU32FloatLevel: + case NVPTXISD::Tex1DU32FloatGrad: + case NVPTXISD::Tex1DArrayFloatS32: case NVPTXISD::Tex1DArrayFloatFloat: case NVPTXISD::Tex1DArrayFloatFloatLevel: case NVPTXISD::Tex1DArrayFloatFloatGrad: - case NVPTXISD::Tex1DArrayI32I32: - case NVPTXISD::Tex1DArrayI32Float: - case NVPTXISD::Tex1DArrayI32FloatLevel: - case NVPTXISD::Tex1DArrayI32FloatGrad: - case NVPTXISD::Tex2DFloatI32: + case NVPTXISD::Tex1DArrayS32S32: + case NVPTXISD::Tex1DArrayS32Float: + case NVPTXISD::Tex1DArrayS32FloatLevel: + case NVPTXISD::Tex1DArrayS32FloatGrad: + case NVPTXISD::Tex1DArrayU32S32: + case NVPTXISD::Tex1DArrayU32Float: + case NVPTXISD::Tex1DArrayU32FloatLevel: + case NVPTXISD::Tex1DArrayU32FloatGrad: + case NVPTXISD::Tex2DFloatS32: case NVPTXISD::Tex2DFloatFloat: case NVPTXISD::Tex2DFloatFloatLevel: case NVPTXISD::Tex2DFloatFloatGrad: - case NVPTXISD::Tex2DI32I32: - case NVPTXISD::Tex2DI32Float: - case NVPTXISD::Tex2DI32FloatLevel: - case NVPTXISD::Tex2DI32FloatGrad: - case NVPTXISD::Tex2DArrayFloatI32: + case NVPTXISD::Tex2DS32S32: + case NVPTXISD::Tex2DS32Float: + case NVPTXISD::Tex2DS32FloatLevel: + case NVPTXISD::Tex2DS32FloatGrad: + case NVPTXISD::Tex2DU32S32: + case NVPTXISD::Tex2DU32Float: + case NVPTXISD::Tex2DU32FloatLevel: + case NVPTXISD::Tex2DU32FloatGrad: + case NVPTXISD::Tex2DArrayFloatS32: case NVPTXISD::Tex2DArrayFloatFloat: case NVPTXISD::Tex2DArrayFloatFloatLevel: case NVPTXISD::Tex2DArrayFloatFloatGrad: - case NVPTXISD::Tex2DArrayI32I32: - case NVPTXISD::Tex2DArrayI32Float: - case NVPTXISD::Tex2DArrayI32FloatLevel: - case NVPTXISD::Tex2DArrayI32FloatGrad: - case NVPTXISD::Tex3DFloatI32: + case NVPTXISD::Tex2DArrayS32S32: + case NVPTXISD::Tex2DArrayS32Float: + case NVPTXISD::Tex2DArrayS32FloatLevel: + case NVPTXISD::Tex2DArrayS32FloatGrad: + case NVPTXISD::Tex2DArrayU32S32: + case NVPTXISD::Tex2DArrayU32Float: + case NVPTXISD::Tex2DArrayU32FloatLevel: + case NVPTXISD::Tex2DArrayU32FloatGrad: + case NVPTXISD::Tex3DFloatS32: case NVPTXISD::Tex3DFloatFloat: case NVPTXISD::Tex3DFloatFloatLevel: case NVPTXISD::Tex3DFloatFloatGrad: - case NVPTXISD::Tex3DI32I32: - case NVPTXISD::Tex3DI32Float: - case NVPTXISD::Tex3DI32FloatLevel: - case NVPTXISD::Tex3DI32FloatGrad: + case NVPTXISD::Tex3DS32S32: + case NVPTXISD::Tex3DS32Float: + case NVPTXISD::Tex3DS32FloatLevel: + case NVPTXISD::Tex3DS32FloatGrad: + case NVPTXISD::Tex3DU32S32: + case NVPTXISD::Tex3DU32Float: + case NVPTXISD::Tex3DU32FloatLevel: + case 
NVPTXISD::Tex3DU32FloatGrad: + case NVPTXISD::TexCubeFloatFloat: + case NVPTXISD::TexCubeFloatFloatLevel: + case NVPTXISD::TexCubeS32Float: + case NVPTXISD::TexCubeS32FloatLevel: + case NVPTXISD::TexCubeU32Float: + case NVPTXISD::TexCubeU32FloatLevel: + case NVPTXISD::TexCubeArrayFloatFloat: + case NVPTXISD::TexCubeArrayFloatFloatLevel: + case NVPTXISD::TexCubeArrayS32Float: + case NVPTXISD::TexCubeArrayS32FloatLevel: + case NVPTXISD::TexCubeArrayU32Float: + case NVPTXISD::TexCubeArrayU32FloatLevel: + case NVPTXISD::Tld4R2DFloatFloat: + case NVPTXISD::Tld4G2DFloatFloat: + case NVPTXISD::Tld4B2DFloatFloat: + case NVPTXISD::Tld4A2DFloatFloat: + case NVPTXISD::Tld4R2DS64Float: + case NVPTXISD::Tld4G2DS64Float: + case NVPTXISD::Tld4B2DS64Float: + case NVPTXISD::Tld4A2DS64Float: + case NVPTXISD::Tld4R2DU64Float: + case NVPTXISD::Tld4G2DU64Float: + case NVPTXISD::Tld4B2DU64Float: + case NVPTXISD::Tld4A2DU64Float: + case NVPTXISD::TexUnified1DFloatS32: + case NVPTXISD::TexUnified1DFloatFloat: + case NVPTXISD::TexUnified1DFloatFloatLevel: + case NVPTXISD::TexUnified1DFloatFloatGrad: + case NVPTXISD::TexUnified1DS32S32: + case NVPTXISD::TexUnified1DS32Float: + case NVPTXISD::TexUnified1DS32FloatLevel: + case NVPTXISD::TexUnified1DS32FloatGrad: + case NVPTXISD::TexUnified1DU32S32: + case NVPTXISD::TexUnified1DU32Float: + case NVPTXISD::TexUnified1DU32FloatLevel: + case NVPTXISD::TexUnified1DU32FloatGrad: + case NVPTXISD::TexUnified1DArrayFloatS32: + case NVPTXISD::TexUnified1DArrayFloatFloat: + case NVPTXISD::TexUnified1DArrayFloatFloatLevel: + case NVPTXISD::TexUnified1DArrayFloatFloatGrad: + case NVPTXISD::TexUnified1DArrayS32S32: + case NVPTXISD::TexUnified1DArrayS32Float: + case NVPTXISD::TexUnified1DArrayS32FloatLevel: + case NVPTXISD::TexUnified1DArrayS32FloatGrad: + case NVPTXISD::TexUnified1DArrayU32S32: + case NVPTXISD::TexUnified1DArrayU32Float: + case NVPTXISD::TexUnified1DArrayU32FloatLevel: + case NVPTXISD::TexUnified1DArrayU32FloatGrad: + case NVPTXISD::TexUnified2DFloatS32: + case NVPTXISD::TexUnified2DFloatFloat: + case NVPTXISD::TexUnified2DFloatFloatLevel: + case NVPTXISD::TexUnified2DFloatFloatGrad: + case NVPTXISD::TexUnified2DS32S32: + case NVPTXISD::TexUnified2DS32Float: + case NVPTXISD::TexUnified2DS32FloatLevel: + case NVPTXISD::TexUnified2DS32FloatGrad: + case NVPTXISD::TexUnified2DU32S32: + case NVPTXISD::TexUnified2DU32Float: + case NVPTXISD::TexUnified2DU32FloatLevel: + case NVPTXISD::TexUnified2DU32FloatGrad: + case NVPTXISD::TexUnified2DArrayFloatS32: + case NVPTXISD::TexUnified2DArrayFloatFloat: + case NVPTXISD::TexUnified2DArrayFloatFloatLevel: + case NVPTXISD::TexUnified2DArrayFloatFloatGrad: + case NVPTXISD::TexUnified2DArrayS32S32: + case NVPTXISD::TexUnified2DArrayS32Float: + case NVPTXISD::TexUnified2DArrayS32FloatLevel: + case NVPTXISD::TexUnified2DArrayS32FloatGrad: + case NVPTXISD::TexUnified2DArrayU32S32: + case NVPTXISD::TexUnified2DArrayU32Float: + case NVPTXISD::TexUnified2DArrayU32FloatLevel: + case NVPTXISD::TexUnified2DArrayU32FloatGrad: + case NVPTXISD::TexUnified3DFloatS32: + case NVPTXISD::TexUnified3DFloatFloat: + case NVPTXISD::TexUnified3DFloatFloatLevel: + case NVPTXISD::TexUnified3DFloatFloatGrad: + case NVPTXISD::TexUnified3DS32S32: + case NVPTXISD::TexUnified3DS32Float: + case NVPTXISD::TexUnified3DS32FloatLevel: + case NVPTXISD::TexUnified3DS32FloatGrad: + case NVPTXISD::TexUnified3DU32S32: + case NVPTXISD::TexUnified3DU32Float: + case NVPTXISD::TexUnified3DU32FloatLevel: + case NVPTXISD::TexUnified3DU32FloatGrad: + case 
NVPTXISD::TexUnifiedCubeFloatFloat: + case NVPTXISD::TexUnifiedCubeFloatFloatLevel: + case NVPTXISD::TexUnifiedCubeS32Float: + case NVPTXISD::TexUnifiedCubeS32FloatLevel: + case NVPTXISD::TexUnifiedCubeU32Float: + case NVPTXISD::TexUnifiedCubeU32FloatLevel: + case NVPTXISD::TexUnifiedCubeArrayFloatFloat: + case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel: + case NVPTXISD::TexUnifiedCubeArrayS32Float: + case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel: + case NVPTXISD::TexUnifiedCubeArrayU32Float: + case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel: + case NVPTXISD::Tld4UnifiedR2DFloatFloat: + case NVPTXISD::Tld4UnifiedG2DFloatFloat: + case NVPTXISD::Tld4UnifiedB2DFloatFloat: + case NVPTXISD::Tld4UnifiedA2DFloatFloat: + case NVPTXISD::Tld4UnifiedR2DS64Float: + case NVPTXISD::Tld4UnifiedG2DS64Float: + case NVPTXISD::Tld4UnifiedB2DS64Float: + case NVPTXISD::Tld4UnifiedA2DS64Float: + case NVPTXISD::Tld4UnifiedR2DU64Float: + case NVPTXISD::Tld4UnifiedG2DU64Float: + case NVPTXISD::Tld4UnifiedB2DU64Float: + case NVPTXISD::Tld4UnifiedA2DU64Float: ResNode = SelectTextureIntrinsic(N); break; + case NVPTXISD::Suld1DI8Clamp: + case NVPTXISD::Suld1DI16Clamp: + case NVPTXISD::Suld1DI32Clamp: + case NVPTXISD::Suld1DI64Clamp: + case NVPTXISD::Suld1DV2I8Clamp: + case NVPTXISD::Suld1DV2I16Clamp: + case NVPTXISD::Suld1DV2I32Clamp: + case NVPTXISD::Suld1DV2I64Clamp: + case NVPTXISD::Suld1DV4I8Clamp: + case NVPTXISD::Suld1DV4I16Clamp: + case NVPTXISD::Suld1DV4I32Clamp: + case NVPTXISD::Suld1DArrayI8Clamp: + case NVPTXISD::Suld1DArrayI16Clamp: + case NVPTXISD::Suld1DArrayI32Clamp: + case NVPTXISD::Suld1DArrayI64Clamp: + case NVPTXISD::Suld1DArrayV2I8Clamp: + case NVPTXISD::Suld1DArrayV2I16Clamp: + case NVPTXISD::Suld1DArrayV2I32Clamp: + case NVPTXISD::Suld1DArrayV2I64Clamp: + case NVPTXISD::Suld1DArrayV4I8Clamp: + case NVPTXISD::Suld1DArrayV4I16Clamp: + case NVPTXISD::Suld1DArrayV4I32Clamp: + case NVPTXISD::Suld2DI8Clamp: + case NVPTXISD::Suld2DI16Clamp: + case NVPTXISD::Suld2DI32Clamp: + case NVPTXISD::Suld2DI64Clamp: + case NVPTXISD::Suld2DV2I8Clamp: + case NVPTXISD::Suld2DV2I16Clamp: + case NVPTXISD::Suld2DV2I32Clamp: + case NVPTXISD::Suld2DV2I64Clamp: + case NVPTXISD::Suld2DV4I8Clamp: + case NVPTXISD::Suld2DV4I16Clamp: + case NVPTXISD::Suld2DV4I32Clamp: + case NVPTXISD::Suld2DArrayI8Clamp: + case NVPTXISD::Suld2DArrayI16Clamp: + case NVPTXISD::Suld2DArrayI32Clamp: + case NVPTXISD::Suld2DArrayI64Clamp: + case NVPTXISD::Suld2DArrayV2I8Clamp: + case NVPTXISD::Suld2DArrayV2I16Clamp: + case NVPTXISD::Suld2DArrayV2I32Clamp: + case NVPTXISD::Suld2DArrayV2I64Clamp: + case NVPTXISD::Suld2DArrayV4I8Clamp: + case NVPTXISD::Suld2DArrayV4I16Clamp: + case NVPTXISD::Suld2DArrayV4I32Clamp: + case NVPTXISD::Suld3DI8Clamp: + case NVPTXISD::Suld3DI16Clamp: + case NVPTXISD::Suld3DI32Clamp: + case NVPTXISD::Suld3DI64Clamp: + case NVPTXISD::Suld3DV2I8Clamp: + case NVPTXISD::Suld3DV2I16Clamp: + case NVPTXISD::Suld3DV2I32Clamp: + case NVPTXISD::Suld3DV2I64Clamp: + case NVPTXISD::Suld3DV4I8Clamp: + case NVPTXISD::Suld3DV4I16Clamp: + case NVPTXISD::Suld3DV4I32Clamp: case NVPTXISD::Suld1DI8Trap: case NVPTXISD::Suld1DI16Trap: case NVPTXISD::Suld1DI32Trap: + case NVPTXISD::Suld1DI64Trap: case NVPTXISD::Suld1DV2I8Trap: case NVPTXISD::Suld1DV2I16Trap: case NVPTXISD::Suld1DV2I32Trap: + case NVPTXISD::Suld1DV2I64Trap: case NVPTXISD::Suld1DV4I8Trap: case NVPTXISD::Suld1DV4I16Trap: case NVPTXISD::Suld1DV4I32Trap: case NVPTXISD::Suld1DArrayI8Trap: case NVPTXISD::Suld1DArrayI16Trap: case NVPTXISD::Suld1DArrayI32Trap: + case 
NVPTXISD::Suld1DArrayI64Trap: case NVPTXISD::Suld1DArrayV2I8Trap: case NVPTXISD::Suld1DArrayV2I16Trap: case NVPTXISD::Suld1DArrayV2I32Trap: + case NVPTXISD::Suld1DArrayV2I64Trap: case NVPTXISD::Suld1DArrayV4I8Trap: case NVPTXISD::Suld1DArrayV4I16Trap: case NVPTXISD::Suld1DArrayV4I32Trap: case NVPTXISD::Suld2DI8Trap: case NVPTXISD::Suld2DI16Trap: case NVPTXISD::Suld2DI32Trap: + case NVPTXISD::Suld2DI64Trap: case NVPTXISD::Suld2DV2I8Trap: case NVPTXISD::Suld2DV2I16Trap: case NVPTXISD::Suld2DV2I32Trap: + case NVPTXISD::Suld2DV2I64Trap: case NVPTXISD::Suld2DV4I8Trap: case NVPTXISD::Suld2DV4I16Trap: case NVPTXISD::Suld2DV4I32Trap: case NVPTXISD::Suld2DArrayI8Trap: case NVPTXISD::Suld2DArrayI16Trap: case NVPTXISD::Suld2DArrayI32Trap: + case NVPTXISD::Suld2DArrayI64Trap: case NVPTXISD::Suld2DArrayV2I8Trap: case NVPTXISD::Suld2DArrayV2I16Trap: case NVPTXISD::Suld2DArrayV2I32Trap: + case NVPTXISD::Suld2DArrayV2I64Trap: case NVPTXISD::Suld2DArrayV4I8Trap: case NVPTXISD::Suld2DArrayV4I16Trap: case NVPTXISD::Suld2DArrayV4I32Trap: case NVPTXISD::Suld3DI8Trap: case NVPTXISD::Suld3DI16Trap: case NVPTXISD::Suld3DI32Trap: + case NVPTXISD::Suld3DI64Trap: case NVPTXISD::Suld3DV2I8Trap: case NVPTXISD::Suld3DV2I16Trap: case NVPTXISD::Suld3DV2I32Trap: + case NVPTXISD::Suld3DV2I64Trap: case NVPTXISD::Suld3DV4I8Trap: case NVPTXISD::Suld3DV4I16Trap: case NVPTXISD::Suld3DV4I32Trap: + case NVPTXISD::Suld1DI8Zero: + case NVPTXISD::Suld1DI16Zero: + case NVPTXISD::Suld1DI32Zero: + case NVPTXISD::Suld1DI64Zero: + case NVPTXISD::Suld1DV2I8Zero: + case NVPTXISD::Suld1DV2I16Zero: + case NVPTXISD::Suld1DV2I32Zero: + case NVPTXISD::Suld1DV2I64Zero: + case NVPTXISD::Suld1DV4I8Zero: + case NVPTXISD::Suld1DV4I16Zero: + case NVPTXISD::Suld1DV4I32Zero: + case NVPTXISD::Suld1DArrayI8Zero: + case NVPTXISD::Suld1DArrayI16Zero: + case NVPTXISD::Suld1DArrayI32Zero: + case NVPTXISD::Suld1DArrayI64Zero: + case NVPTXISD::Suld1DArrayV2I8Zero: + case NVPTXISD::Suld1DArrayV2I16Zero: + case NVPTXISD::Suld1DArrayV2I32Zero: + case NVPTXISD::Suld1DArrayV2I64Zero: + case NVPTXISD::Suld1DArrayV4I8Zero: + case NVPTXISD::Suld1DArrayV4I16Zero: + case NVPTXISD::Suld1DArrayV4I32Zero: + case NVPTXISD::Suld2DI8Zero: + case NVPTXISD::Suld2DI16Zero: + case NVPTXISD::Suld2DI32Zero: + case NVPTXISD::Suld2DI64Zero: + case NVPTXISD::Suld2DV2I8Zero: + case NVPTXISD::Suld2DV2I16Zero: + case NVPTXISD::Suld2DV2I32Zero: + case NVPTXISD::Suld2DV2I64Zero: + case NVPTXISD::Suld2DV4I8Zero: + case NVPTXISD::Suld2DV4I16Zero: + case NVPTXISD::Suld2DV4I32Zero: + case NVPTXISD::Suld2DArrayI8Zero: + case NVPTXISD::Suld2DArrayI16Zero: + case NVPTXISD::Suld2DArrayI32Zero: + case NVPTXISD::Suld2DArrayI64Zero: + case NVPTXISD::Suld2DArrayV2I8Zero: + case NVPTXISD::Suld2DArrayV2I16Zero: + case NVPTXISD::Suld2DArrayV2I32Zero: + case NVPTXISD::Suld2DArrayV2I64Zero: + case NVPTXISD::Suld2DArrayV4I8Zero: + case NVPTXISD::Suld2DArrayV4I16Zero: + case NVPTXISD::Suld2DArrayV4I32Zero: + case NVPTXISD::Suld3DI8Zero: + case NVPTXISD::Suld3DI16Zero: + case NVPTXISD::Suld3DI32Zero: + case NVPTXISD::Suld3DI64Zero: + case NVPTXISD::Suld3DV2I8Zero: + case NVPTXISD::Suld3DV2I16Zero: + case NVPTXISD::Suld3DV2I32Zero: + case NVPTXISD::Suld3DV2I64Zero: + case NVPTXISD::Suld3DV4I8Zero: + case NVPTXISD::Suld3DV4I16Zero: + case NVPTXISD::Suld3DV4I32Zero: ResNode = SelectSurfaceIntrinsic(N); break; + case ISD::AND: + case ISD::SRA: + case ISD::SRL: + // Try to select BFE + ResNode = SelectBFE(N); + break; case ISD::ADDRSPACECAST: ResNode = SelectAddrSpaceCast(N); break; @@ -264,6 +510,21 @@ SDNode 
*NVPTXDAGToDAGISel::Select(SDNode *N) { return SelectCode(N); } +SDNode *NVPTXDAGToDAGISel::SelectIntrinsicChain(SDNode *N) { + unsigned IID = cast(N->getOperand(1))->getZExtValue(); + switch (IID) { + default: + return NULL; + case Intrinsic::nvvm_ldg_global_f: + case Intrinsic::nvvm_ldg_global_i: + case Intrinsic::nvvm_ldg_global_p: + case Intrinsic::nvvm_ldu_global_f: + case Intrinsic::nvvm_ldu_global_i: + case Intrinsic::nvvm_ldu_global_p: + return SelectLDGLDU(N); + } +} + static unsigned int getCodeAddrSpace(MemSDNode *N, const NVPTXSubtarget &Subtarget) { const Value *Src = N->getMemOperand()->getValue(); @@ -981,22 +1242,101 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) { return LD; } -SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) { +SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) { SDValue Chain = N->getOperand(0); - SDValue Op1 = N->getOperand(1); + SDValue Op1; + MemSDNode *Mem; + bool IsLDG = true; + + // If this is an LDG intrinsic, the address is the third operand. Its its an + // LDG/LDU SD node (from custom vector handling), then its the second operand + if (N->getOpcode() == ISD::INTRINSIC_W_CHAIN) { + Op1 = N->getOperand(2); + Mem = cast(N); + unsigned IID = cast(N->getOperand(1))->getZExtValue(); + switch (IID) { + default: + return NULL; + case Intrinsic::nvvm_ldg_global_f: + case Intrinsic::nvvm_ldg_global_i: + case Intrinsic::nvvm_ldg_global_p: + IsLDG = true; + break; + case Intrinsic::nvvm_ldu_global_f: + case Intrinsic::nvvm_ldu_global_i: + case Intrinsic::nvvm_ldu_global_p: + IsLDG = false; + break; + } + } else { + Op1 = N->getOperand(1); + Mem = cast(N); + } + unsigned Opcode; SDLoc DL(N); SDNode *LD; - MemSDNode *Mem = cast(N); SDValue Base, Offset, Addr; - EVT EltVT = Mem->getMemoryVT().getVectorElementType(); + EVT EltVT = Mem->getMemoryVT(); + if (EltVT.isVector()) { + EltVT = EltVT.getVectorElementType(); + } if (SelectDirectAddr(Op1, Addr)) { switch (N->getOpcode()) { default: return nullptr; + case ISD::INTRINSIC_W_CHAIN: + if (IsLDG) { + switch (EltVT.getSimpleVT().SimpleTy) { + default: + return nullptr; + case MVT::i8: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i8avar; + break; + case MVT::i16: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i16avar; + break; + case MVT::i32: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i32avar; + break; + case MVT::i64: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i64avar; + break; + case MVT::f32: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f32avar; + break; + case MVT::f64: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f64avar; + break; + } + } else { + switch (EltVT.getSimpleVT().SimpleTy) { + default: + return nullptr; + case MVT::i8: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8avar; + break; + case MVT::i16: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i16avar; + break; + case MVT::i32: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i32avar; + break; + case MVT::i64: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i64avar; + break; + case MVT::f32: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f32avar; + break; + case MVT::f64: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f64avar; + break; + } + } + break; case NVPTXISD::LDGV2: switch (EltVT.getSimpleVT().SimpleTy) { default: @@ -1092,6 +1432,55 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) { switch (N->getOpcode()) { default: return nullptr; + case ISD::INTRINSIC_W_CHAIN: + if (IsLDG) { + switch (EltVT.getSimpleVT().SimpleTy) { + default: + return nullptr; + case MVT::i8: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i8ari64; + break; + case MVT::i16: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i16ari64; + break; + case MVT::i32: + 
Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i32ari64; + break; + case MVT::i64: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i64ari64; + break; + case MVT::f32: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f32ari64; + break; + case MVT::f64: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f64ari64; + break; + } + } else { + switch (EltVT.getSimpleVT().SimpleTy) { + default: + return nullptr; + case MVT::i8: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8ari64; + break; + case MVT::i16: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i16ari64; + break; + case MVT::i32: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i32ari64; + break; + case MVT::i64: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i64ari64; + break; + case MVT::f32: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f32ari64; + break; + case MVT::f64: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f64ari64; + break; + } + } + break; case NVPTXISD::LDGV2: switch (EltVT.getSimpleVT().SimpleTy) { default: @@ -1181,6 +1570,55 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) { switch (N->getOpcode()) { default: return nullptr; + case ISD::INTRINSIC_W_CHAIN: + if (IsLDG) { + switch (EltVT.getSimpleVT().SimpleTy) { + default: + return nullptr; + case MVT::i8: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i8ari; + break; + case MVT::i16: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i16ari; + break; + case MVT::i32: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i32ari; + break; + case MVT::i64: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i64ari; + break; + case MVT::f32: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f32ari; + break; + case MVT::f64: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f64ari; + break; + } + } else { + switch (EltVT.getSimpleVT().SimpleTy) { + default: + return nullptr; + case MVT::i8: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8ari; + break; + case MVT::i16: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i16ari; + break; + case MVT::i32: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i32ari; + break; + case MVT::i64: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i64ari; + break; + case MVT::f32: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f32ari; + break; + case MVT::f64: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f64ari; + break; + } + } + break; case NVPTXISD::LDGV2: switch (EltVT.getSimpleVT().SimpleTy) { default: @@ -1276,6 +1714,55 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) { switch (N->getOpcode()) { default: return nullptr; + case ISD::INTRINSIC_W_CHAIN: + if (IsLDG) { + switch (EltVT.getSimpleVT().SimpleTy) { + default: + return nullptr; + case MVT::i8: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i8areg64; + break; + case MVT::i16: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i16areg64; + break; + case MVT::i32: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i32areg64; + break; + case MVT::i64: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i64areg64; + break; + case MVT::f32: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f32areg64; + break; + case MVT::f64: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f64areg64; + break; + } + } else { + switch (EltVT.getSimpleVT().SimpleTy) { + default: + return nullptr; + case MVT::i8: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8areg64; + break; + case MVT::i16: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i16areg64; + break; + case MVT::i32: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i32areg64; + break; + case MVT::i64: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i64areg64; + break; + case MVT::f32: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f32areg64; + break; + case MVT::f64: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f64areg64; + break; + } + } + break; case NVPTXISD::LDGV2: switch (EltVT.getSimpleVT().SimpleTy) { default: @@ -1365,6 +1852,55 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) { switch 
(N->getOpcode()) { default: return nullptr; + case ISD::INTRINSIC_W_CHAIN: + if (IsLDG) { + switch (EltVT.getSimpleVT().SimpleTy) { + default: + return nullptr; + case MVT::i8: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i8areg; + break; + case MVT::i16: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i16areg; + break; + case MVT::i32: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i32areg; + break; + case MVT::i64: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i64areg; + break; + case MVT::f32: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f32areg; + break; + case MVT::f64: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f64areg; + break; + } + } else { + switch (EltVT.getSimpleVT().SimpleTy) { + default: + return nullptr; + case MVT::i8: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8areg; + break; + case MVT::i16: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i16areg; + break; + case MVT::i32: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i32areg; + break; + case MVT::i64: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i64areg; + break; + case MVT::f32: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f32areg; + break; + case MVT::f64: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f64areg; + break; + } + } + break; case NVPTXISD::LDGV2: switch (EltVT.getSimpleVT().SimpleTy) { default: @@ -1457,7 +1993,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) { } MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1); - MemRefs0[0] = cast(N)->getMemOperand(); + MemRefs0[0] = Mem->getMemOperand(); cast(LD)->setMemRefs(MemRefs0, MemRefs0 + 1); return LD; @@ -2479,16 +3015,14 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreParam(SDNode *N) { SDNode *NVPTXDAGToDAGISel::SelectTextureIntrinsic(SDNode *N) { SDValue Chain = N->getOperand(0); - SDValue TexRef = N->getOperand(1); - SDValue SampRef = N->getOperand(2); SDNode *Ret = nullptr; unsigned Opc = 0; SmallVector Ops; switch (N->getOpcode()) { default: return nullptr; - case NVPTXISD::Tex1DFloatI32: - Opc = NVPTX::TEX_1D_F32_I32; + case NVPTXISD::Tex1DFloatS32: + Opc = NVPTX::TEX_1D_F32_S32; break; case NVPTXISD::Tex1DFloatFloat: Opc = NVPTX::TEX_1D_F32_F32; @@ -2499,20 +3033,32 @@ SDNode *NVPTXDAGToDAGISel::SelectTextureIntrinsic(SDNode *N) { case NVPTXISD::Tex1DFloatFloatGrad: Opc = NVPTX::TEX_1D_F32_F32_GRAD; break; - case NVPTXISD::Tex1DI32I32: - Opc = NVPTX::TEX_1D_I32_I32; + case NVPTXISD::Tex1DS32S32: + Opc = NVPTX::TEX_1D_S32_S32; + break; + case NVPTXISD::Tex1DS32Float: + Opc = NVPTX::TEX_1D_S32_F32; break; - case NVPTXISD::Tex1DI32Float: - Opc = NVPTX::TEX_1D_I32_F32; + case NVPTXISD::Tex1DS32FloatLevel: + Opc = NVPTX::TEX_1D_S32_F32_LEVEL; break; - case NVPTXISD::Tex1DI32FloatLevel: - Opc = NVPTX::TEX_1D_I32_F32_LEVEL; + case NVPTXISD::Tex1DS32FloatGrad: + Opc = NVPTX::TEX_1D_S32_F32_GRAD; break; - case NVPTXISD::Tex1DI32FloatGrad: - Opc = NVPTX::TEX_1D_I32_F32_GRAD; + case NVPTXISD::Tex1DU32S32: + Opc = NVPTX::TEX_1D_U32_S32; break; - case NVPTXISD::Tex1DArrayFloatI32: - Opc = NVPTX::TEX_1D_ARRAY_F32_I32; + case NVPTXISD::Tex1DU32Float: + Opc = NVPTX::TEX_1D_U32_F32; + break; + case NVPTXISD::Tex1DU32FloatLevel: + Opc = NVPTX::TEX_1D_U32_F32_LEVEL; + break; + case NVPTXISD::Tex1DU32FloatGrad: + Opc = NVPTX::TEX_1D_U32_F32_GRAD; + break; + case NVPTXISD::Tex1DArrayFloatS32: + Opc = NVPTX::TEX_1D_ARRAY_F32_S32; break; case NVPTXISD::Tex1DArrayFloatFloat: Opc = NVPTX::TEX_1D_ARRAY_F32_F32; @@ -2523,20 +3069,32 @@ SDNode *NVPTXDAGToDAGISel::SelectTextureIntrinsic(SDNode *N) { case NVPTXISD::Tex1DArrayFloatFloatGrad: Opc = NVPTX::TEX_1D_ARRAY_F32_F32_GRAD; break; - case NVPTXISD::Tex1DArrayI32I32: - Opc = NVPTX::TEX_1D_ARRAY_I32_I32; + 
case NVPTXISD::Tex1DArrayS32S32: + Opc = NVPTX::TEX_1D_ARRAY_S32_S32; + break; + case NVPTXISD::Tex1DArrayS32Float: + Opc = NVPTX::TEX_1D_ARRAY_S32_F32; break; - case NVPTXISD::Tex1DArrayI32Float: - Opc = NVPTX::TEX_1D_ARRAY_I32_F32; + case NVPTXISD::Tex1DArrayS32FloatLevel: + Opc = NVPTX::TEX_1D_ARRAY_S32_F32_LEVEL; break; - case NVPTXISD::Tex1DArrayI32FloatLevel: - Opc = NVPTX::TEX_1D_ARRAY_I32_F32_LEVEL; + case NVPTXISD::Tex1DArrayS32FloatGrad: + Opc = NVPTX::TEX_1D_ARRAY_S32_F32_GRAD; break; - case NVPTXISD::Tex1DArrayI32FloatGrad: - Opc = NVPTX::TEX_1D_ARRAY_I32_F32_GRAD; + case NVPTXISD::Tex1DArrayU32S32: + Opc = NVPTX::TEX_1D_ARRAY_U32_S32; break; - case NVPTXISD::Tex2DFloatI32: - Opc = NVPTX::TEX_2D_F32_I32; + case NVPTXISD::Tex1DArrayU32Float: + Opc = NVPTX::TEX_1D_ARRAY_U32_F32; + break; + case NVPTXISD::Tex1DArrayU32FloatLevel: + Opc = NVPTX::TEX_1D_ARRAY_U32_F32_LEVEL; + break; + case NVPTXISD::Tex1DArrayU32FloatGrad: + Opc = NVPTX::TEX_1D_ARRAY_U32_F32_GRAD; + break; + case NVPTXISD::Tex2DFloatS32: + Opc = NVPTX::TEX_2D_F32_S32; break; case NVPTXISD::Tex2DFloatFloat: Opc = NVPTX::TEX_2D_F32_F32; @@ -2547,20 +3105,32 @@ SDNode *NVPTXDAGToDAGISel::SelectTextureIntrinsic(SDNode *N) { case NVPTXISD::Tex2DFloatFloatGrad: Opc = NVPTX::TEX_2D_F32_F32_GRAD; break; - case NVPTXISD::Tex2DI32I32: - Opc = NVPTX::TEX_2D_I32_I32; + case NVPTXISD::Tex2DS32S32: + Opc = NVPTX::TEX_2D_S32_S32; + break; + case NVPTXISD::Tex2DS32Float: + Opc = NVPTX::TEX_2D_S32_F32; break; - case NVPTXISD::Tex2DI32Float: - Opc = NVPTX::TEX_2D_I32_F32; + case NVPTXISD::Tex2DS32FloatLevel: + Opc = NVPTX::TEX_2D_S32_F32_LEVEL; break; - case NVPTXISD::Tex2DI32FloatLevel: - Opc = NVPTX::TEX_2D_I32_F32_LEVEL; + case NVPTXISD::Tex2DS32FloatGrad: + Opc = NVPTX::TEX_2D_S32_F32_GRAD; break; - case NVPTXISD::Tex2DI32FloatGrad: - Opc = NVPTX::TEX_2D_I32_F32_GRAD; + case NVPTXISD::Tex2DU32S32: + Opc = NVPTX::TEX_2D_U32_S32; break; - case NVPTXISD::Tex2DArrayFloatI32: - Opc = NVPTX::TEX_2D_ARRAY_F32_I32; + case NVPTXISD::Tex2DU32Float: + Opc = NVPTX::TEX_2D_U32_F32; + break; + case NVPTXISD::Tex2DU32FloatLevel: + Opc = NVPTX::TEX_2D_U32_F32_LEVEL; + break; + case NVPTXISD::Tex2DU32FloatGrad: + Opc = NVPTX::TEX_2D_U32_F32_GRAD; + break; + case NVPTXISD::Tex2DArrayFloatS32: + Opc = NVPTX::TEX_2D_ARRAY_F32_S32; break; case NVPTXISD::Tex2DArrayFloatFloat: Opc = NVPTX::TEX_2D_ARRAY_F32_F32; @@ -2571,20 +3141,32 @@ SDNode *NVPTXDAGToDAGISel::SelectTextureIntrinsic(SDNode *N) { case NVPTXISD::Tex2DArrayFloatFloatGrad: Opc = NVPTX::TEX_2D_ARRAY_F32_F32_GRAD; break; - case NVPTXISD::Tex2DArrayI32I32: - Opc = NVPTX::TEX_2D_ARRAY_I32_I32; + case NVPTXISD::Tex2DArrayS32S32: + Opc = NVPTX::TEX_2D_ARRAY_S32_S32; + break; + case NVPTXISD::Tex2DArrayS32Float: + Opc = NVPTX::TEX_2D_ARRAY_S32_F32; break; - case NVPTXISD::Tex2DArrayI32Float: - Opc = NVPTX::TEX_2D_ARRAY_I32_F32; + case NVPTXISD::Tex2DArrayS32FloatLevel: + Opc = NVPTX::TEX_2D_ARRAY_S32_F32_LEVEL; break; - case NVPTXISD::Tex2DArrayI32FloatLevel: - Opc = NVPTX::TEX_2D_ARRAY_I32_F32_LEVEL; + case NVPTXISD::Tex2DArrayS32FloatGrad: + Opc = NVPTX::TEX_2D_ARRAY_S32_F32_GRAD; break; - case NVPTXISD::Tex2DArrayI32FloatGrad: - Opc = NVPTX::TEX_2D_ARRAY_I32_F32_GRAD; + case NVPTXISD::Tex2DArrayU32S32: + Opc = NVPTX::TEX_2D_ARRAY_U32_S32; break; - case NVPTXISD::Tex3DFloatI32: - Opc = NVPTX::TEX_3D_F32_I32; + case NVPTXISD::Tex2DArrayU32Float: + Opc = NVPTX::TEX_2D_ARRAY_U32_F32; + break; + case NVPTXISD::Tex2DArrayU32FloatLevel: + Opc = NVPTX::TEX_2D_ARRAY_U32_F32_LEVEL; + break; + case 
NVPTXISD::Tex2DArrayU32FloatGrad: + Opc = NVPTX::TEX_2D_ARRAY_U32_F32_GRAD; + break; + case NVPTXISD::Tex3DFloatS32: + Opc = NVPTX::TEX_3D_F32_S32; break; case NVPTXISD::Tex3DFloatFloat: Opc = NVPTX::TEX_3D_F32_F32; @@ -2595,25 +3177,358 @@ SDNode *NVPTXDAGToDAGISel::SelectTextureIntrinsic(SDNode *N) { case NVPTXISD::Tex3DFloatFloatGrad: Opc = NVPTX::TEX_3D_F32_F32_GRAD; break; - case NVPTXISD::Tex3DI32I32: - Opc = NVPTX::TEX_3D_I32_I32; + case NVPTXISD::Tex3DS32S32: + Opc = NVPTX::TEX_3D_S32_S32; + break; + case NVPTXISD::Tex3DS32Float: + Opc = NVPTX::TEX_3D_S32_F32; + break; + case NVPTXISD::Tex3DS32FloatLevel: + Opc = NVPTX::TEX_3D_S32_F32_LEVEL; + break; + case NVPTXISD::Tex3DS32FloatGrad: + Opc = NVPTX::TEX_3D_S32_F32_GRAD; + break; + case NVPTXISD::Tex3DU32S32: + Opc = NVPTX::TEX_3D_U32_S32; + break; + case NVPTXISD::Tex3DU32Float: + Opc = NVPTX::TEX_3D_U32_F32; + break; + case NVPTXISD::Tex3DU32FloatLevel: + Opc = NVPTX::TEX_3D_U32_F32_LEVEL; + break; + case NVPTXISD::Tex3DU32FloatGrad: + Opc = NVPTX::TEX_3D_U32_F32_GRAD; + break; + case NVPTXISD::TexCubeFloatFloat: + Opc = NVPTX::TEX_CUBE_F32_F32; + break; + case NVPTXISD::TexCubeFloatFloatLevel: + Opc = NVPTX::TEX_CUBE_F32_F32_LEVEL; + break; + case NVPTXISD::TexCubeS32Float: + Opc = NVPTX::TEX_CUBE_S32_F32; + break; + case NVPTXISD::TexCubeS32FloatLevel: + Opc = NVPTX::TEX_CUBE_S32_F32_LEVEL; + break; + case NVPTXISD::TexCubeU32Float: + Opc = NVPTX::TEX_CUBE_U32_F32; + break; + case NVPTXISD::TexCubeU32FloatLevel: + Opc = NVPTX::TEX_CUBE_U32_F32_LEVEL; + break; + case NVPTXISD::TexCubeArrayFloatFloat: + Opc = NVPTX::TEX_CUBE_ARRAY_F32_F32; + break; + case NVPTXISD::TexCubeArrayFloatFloatLevel: + Opc = NVPTX::TEX_CUBE_ARRAY_F32_F32_LEVEL; + break; + case NVPTXISD::TexCubeArrayS32Float: + Opc = NVPTX::TEX_CUBE_ARRAY_S32_F32; + break; + case NVPTXISD::TexCubeArrayS32FloatLevel: + Opc = NVPTX::TEX_CUBE_ARRAY_S32_F32_LEVEL; + break; + case NVPTXISD::TexCubeArrayU32Float: + Opc = NVPTX::TEX_CUBE_ARRAY_U32_F32; + break; + case NVPTXISD::TexCubeArrayU32FloatLevel: + Opc = NVPTX::TEX_CUBE_ARRAY_U32_F32_LEVEL; + break; + case NVPTXISD::Tld4R2DFloatFloat: + Opc = NVPTX::TLD4_R_2D_F32_F32; + break; + case NVPTXISD::Tld4G2DFloatFloat: + Opc = NVPTX::TLD4_G_2D_F32_F32; + break; + case NVPTXISD::Tld4B2DFloatFloat: + Opc = NVPTX::TLD4_B_2D_F32_F32; + break; + case NVPTXISD::Tld4A2DFloatFloat: + Opc = NVPTX::TLD4_A_2D_F32_F32; + break; + case NVPTXISD::Tld4R2DS64Float: + Opc = NVPTX::TLD4_R_2D_S32_F32; + break; + case NVPTXISD::Tld4G2DS64Float: + Opc = NVPTX::TLD4_G_2D_S32_F32; + break; + case NVPTXISD::Tld4B2DS64Float: + Opc = NVPTX::TLD4_B_2D_S32_F32; + break; + case NVPTXISD::Tld4A2DS64Float: + Opc = NVPTX::TLD4_A_2D_S32_F32; + break; + case NVPTXISD::Tld4R2DU64Float: + Opc = NVPTX::TLD4_R_2D_U32_F32; + break; + case NVPTXISD::Tld4G2DU64Float: + Opc = NVPTX::TLD4_G_2D_U32_F32; + break; + case NVPTXISD::Tld4B2DU64Float: + Opc = NVPTX::TLD4_B_2D_U32_F32; + break; + case NVPTXISD::Tld4A2DU64Float: + Opc = NVPTX::TLD4_A_2D_U32_F32; + break; + case NVPTXISD::TexUnified1DFloatS32: + Opc = NVPTX::TEX_UNIFIED_1D_F32_S32; + break; + case NVPTXISD::TexUnified1DFloatFloat: + Opc = NVPTX::TEX_UNIFIED_1D_F32_F32; + break; + case NVPTXISD::TexUnified1DFloatFloatLevel: + Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_LEVEL; + break; + case NVPTXISD::TexUnified1DFloatFloatGrad: + Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_GRAD; + break; + case NVPTXISD::TexUnified1DS32S32: + Opc = NVPTX::TEX_UNIFIED_1D_S32_S32; + break; + case NVPTXISD::TexUnified1DS32Float: + Opc = 
NVPTX::TEX_UNIFIED_1D_S32_F32; + break; + case NVPTXISD::TexUnified1DS32FloatLevel: + Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_LEVEL; + break; + case NVPTXISD::TexUnified1DS32FloatGrad: + Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_GRAD; + break; + case NVPTXISD::TexUnified1DU32S32: + Opc = NVPTX::TEX_UNIFIED_1D_U32_S32; + break; + case NVPTXISD::TexUnified1DU32Float: + Opc = NVPTX::TEX_UNIFIED_1D_U32_F32; + break; + case NVPTXISD::TexUnified1DU32FloatLevel: + Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_LEVEL; + break; + case NVPTXISD::TexUnified1DU32FloatGrad: + Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_GRAD; + break; + case NVPTXISD::TexUnified1DArrayFloatS32: + Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_S32; + break; + case NVPTXISD::TexUnified1DArrayFloatFloat: + Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32; break; - case NVPTXISD::Tex3DI32Float: - Opc = NVPTX::TEX_3D_I32_F32; + case NVPTXISD::TexUnified1DArrayFloatFloatLevel: + Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_LEVEL; break; - case NVPTXISD::Tex3DI32FloatLevel: - Opc = NVPTX::TEX_3D_I32_F32_LEVEL; + case NVPTXISD::TexUnified1DArrayFloatFloatGrad: + Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_GRAD; break; - case NVPTXISD::Tex3DI32FloatGrad: - Opc = NVPTX::TEX_3D_I32_F32_GRAD; + case NVPTXISD::TexUnified1DArrayS32S32: + Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_S32; + break; + case NVPTXISD::TexUnified1DArrayS32Float: + Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32; + break; + case NVPTXISD::TexUnified1DArrayS32FloatLevel: + Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_LEVEL; + break; + case NVPTXISD::TexUnified1DArrayS32FloatGrad: + Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_GRAD; + break; + case NVPTXISD::TexUnified1DArrayU32S32: + Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_S32; + break; + case NVPTXISD::TexUnified1DArrayU32Float: + Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32; + break; + case NVPTXISD::TexUnified1DArrayU32FloatLevel: + Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_LEVEL; + break; + case NVPTXISD::TexUnified1DArrayU32FloatGrad: + Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_GRAD; + break; + case NVPTXISD::TexUnified2DFloatS32: + Opc = NVPTX::TEX_UNIFIED_2D_F32_S32; + break; + case NVPTXISD::TexUnified2DFloatFloat: + Opc = NVPTX::TEX_UNIFIED_2D_F32_F32; + break; + case NVPTXISD::TexUnified2DFloatFloatLevel: + Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_LEVEL; + break; + case NVPTXISD::TexUnified2DFloatFloatGrad: + Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_GRAD; + break; + case NVPTXISD::TexUnified2DS32S32: + Opc = NVPTX::TEX_UNIFIED_2D_S32_S32; + break; + case NVPTXISD::TexUnified2DS32Float: + Opc = NVPTX::TEX_UNIFIED_2D_S32_F32; + break; + case NVPTXISD::TexUnified2DS32FloatLevel: + Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_LEVEL; + break; + case NVPTXISD::TexUnified2DS32FloatGrad: + Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_GRAD; + break; + case NVPTXISD::TexUnified2DU32S32: + Opc = NVPTX::TEX_UNIFIED_2D_U32_S32; + break; + case NVPTXISD::TexUnified2DU32Float: + Opc = NVPTX::TEX_UNIFIED_2D_U32_F32; + break; + case NVPTXISD::TexUnified2DU32FloatLevel: + Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_LEVEL; + break; + case NVPTXISD::TexUnified2DU32FloatGrad: + Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_GRAD; + break; + case NVPTXISD::TexUnified2DArrayFloatS32: + Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_S32; + break; + case NVPTXISD::TexUnified2DArrayFloatFloat: + Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32; + break; + case NVPTXISD::TexUnified2DArrayFloatFloatLevel: + Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_LEVEL; + break; + case NVPTXISD::TexUnified2DArrayFloatFloatGrad: + Opc = 
NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_GRAD; + break; + case NVPTXISD::TexUnified2DArrayS32S32: + Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_S32; + break; + case NVPTXISD::TexUnified2DArrayS32Float: + Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32; + break; + case NVPTXISD::TexUnified2DArrayS32FloatLevel: + Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_LEVEL; + break; + case NVPTXISD::TexUnified2DArrayS32FloatGrad: + Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_GRAD; + break; + case NVPTXISD::TexUnified2DArrayU32S32: + Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_S32; + break; + case NVPTXISD::TexUnified2DArrayU32Float: + Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32; + break; + case NVPTXISD::TexUnified2DArrayU32FloatLevel: + Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_LEVEL; + break; + case NVPTXISD::TexUnified2DArrayU32FloatGrad: + Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_GRAD; + break; + case NVPTXISD::TexUnified3DFloatS32: + Opc = NVPTX::TEX_UNIFIED_3D_F32_S32; + break; + case NVPTXISD::TexUnified3DFloatFloat: + Opc = NVPTX::TEX_UNIFIED_3D_F32_F32; + break; + case NVPTXISD::TexUnified3DFloatFloatLevel: + Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_LEVEL; + break; + case NVPTXISD::TexUnified3DFloatFloatGrad: + Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_GRAD; + break; + case NVPTXISD::TexUnified3DS32S32: + Opc = NVPTX::TEX_UNIFIED_3D_S32_S32; + break; + case NVPTXISD::TexUnified3DS32Float: + Opc = NVPTX::TEX_UNIFIED_3D_S32_F32; + break; + case NVPTXISD::TexUnified3DS32FloatLevel: + Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_LEVEL; + break; + case NVPTXISD::TexUnified3DS32FloatGrad: + Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_GRAD; + break; + case NVPTXISD::TexUnified3DU32S32: + Opc = NVPTX::TEX_UNIFIED_3D_U32_S32; + break; + case NVPTXISD::TexUnified3DU32Float: + Opc = NVPTX::TEX_UNIFIED_3D_U32_F32; + break; + case NVPTXISD::TexUnified3DU32FloatLevel: + Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_LEVEL; + break; + case NVPTXISD::TexUnified3DU32FloatGrad: + Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_GRAD; + break; + case NVPTXISD::TexUnifiedCubeFloatFloat: + Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32; + break; + case NVPTXISD::TexUnifiedCubeFloatFloatLevel: + Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32_LEVEL; + break; + case NVPTXISD::TexUnifiedCubeS32Float: + Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32; + break; + case NVPTXISD::TexUnifiedCubeS32FloatLevel: + Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32_LEVEL; + break; + case NVPTXISD::TexUnifiedCubeU32Float: + Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32; + break; + case NVPTXISD::TexUnifiedCubeU32FloatLevel: + Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32_LEVEL; + break; + case NVPTXISD::TexUnifiedCubeArrayFloatFloat: + Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32; + break; + case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel: + Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_LEVEL; + break; + case NVPTXISD::TexUnifiedCubeArrayS32Float: + Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32; + break; + case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel: + Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_LEVEL; + break; + case NVPTXISD::TexUnifiedCubeArrayU32Float: + Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32; + break; + case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel: + Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_LEVEL; + break; + case NVPTXISD::Tld4UnifiedR2DFloatFloat: + Opc = NVPTX::TLD4_UNIFIED_R_2D_F32_F32; + break; + case NVPTXISD::Tld4UnifiedG2DFloatFloat: + Opc = NVPTX::TLD4_UNIFIED_G_2D_F32_F32; + break; + case NVPTXISD::Tld4UnifiedB2DFloatFloat: + Opc = NVPTX::TLD4_UNIFIED_B_2D_F32_F32; + break; + case NVPTXISD::Tld4UnifiedA2DFloatFloat: + Opc = 
NVPTX::TLD4_UNIFIED_A_2D_F32_F32; + break; + case NVPTXISD::Tld4UnifiedR2DS64Float: + Opc = NVPTX::TLD4_UNIFIED_R_2D_S32_F32; + break; + case NVPTXISD::Tld4UnifiedG2DS64Float: + Opc = NVPTX::TLD4_UNIFIED_G_2D_S32_F32; + break; + case NVPTXISD::Tld4UnifiedB2DS64Float: + Opc = NVPTX::TLD4_UNIFIED_B_2D_S32_F32; + break; + case NVPTXISD::Tld4UnifiedA2DS64Float: + Opc = NVPTX::TLD4_UNIFIED_A_2D_S32_F32; + break; + case NVPTXISD::Tld4UnifiedR2DU64Float: + Opc = NVPTX::TLD4_UNIFIED_R_2D_U32_F32; + break; + case NVPTXISD::Tld4UnifiedG2DU64Float: + Opc = NVPTX::TLD4_UNIFIED_G_2D_U32_F32; + break; + case NVPTXISD::Tld4UnifiedB2DU64Float: + Opc = NVPTX::TLD4_UNIFIED_B_2D_U32_F32; + break; + case NVPTXISD::Tld4UnifiedA2DU64Float: + Opc = NVPTX::TLD4_UNIFIED_A_2D_U32_F32; break; } - Ops.push_back(TexRef); - Ops.push_back(SampRef); - - // Copy over indices - for (unsigned i = 3; i < N->getNumOperands(); ++i) { + // Copy over operands + for (unsigned i = 1; i < N->getNumOperands(); ++i) { Ops.push_back(N->getOperand(i)); } @@ -2630,195 +3545,631 @@ SDNode *NVPTXDAGToDAGISel::SelectSurfaceIntrinsic(SDNode *N) { SmallVector Ops; switch (N->getOpcode()) { default: return nullptr; - case NVPTXISD::Suld1DI8Trap: - Opc = NVPTX::SULD_1D_I8_TRAP; + case NVPTXISD::Suld1DI8Clamp: + Opc = NVPTX::SULD_1D_I8_CLAMP; Ops.push_back(TexHandle); Ops.push_back(N->getOperand(2)); Ops.push_back(Chain); break; - case NVPTXISD::Suld1DI16Trap: - Opc = NVPTX::SULD_1D_I16_TRAP; + case NVPTXISD::Suld1DI16Clamp: + Opc = NVPTX::SULD_1D_I16_CLAMP; Ops.push_back(TexHandle); Ops.push_back(N->getOperand(2)); Ops.push_back(Chain); break; - case NVPTXISD::Suld1DI32Trap: - Opc = NVPTX::SULD_1D_I32_TRAP; + case NVPTXISD::Suld1DI32Clamp: + Opc = NVPTX::SULD_1D_I32_CLAMP; Ops.push_back(TexHandle); Ops.push_back(N->getOperand(2)); Ops.push_back(Chain); break; - case NVPTXISD::Suld1DV2I8Trap: - Opc = NVPTX::SULD_1D_V2I8_TRAP; + case NVPTXISD::Suld1DI64Clamp: + Opc = NVPTX::SULD_1D_I64_CLAMP; Ops.push_back(TexHandle); Ops.push_back(N->getOperand(2)); Ops.push_back(Chain); break; - case NVPTXISD::Suld1DV2I16Trap: - Opc = NVPTX::SULD_1D_V2I16_TRAP; + case NVPTXISD::Suld1DV2I8Clamp: + Opc = NVPTX::SULD_1D_V2I8_CLAMP; Ops.push_back(TexHandle); Ops.push_back(N->getOperand(2)); Ops.push_back(Chain); break; - case NVPTXISD::Suld1DV2I32Trap: - Opc = NVPTX::SULD_1D_V2I32_TRAP; + case NVPTXISD::Suld1DV2I16Clamp: + Opc = NVPTX::SULD_1D_V2I16_CLAMP; Ops.push_back(TexHandle); Ops.push_back(N->getOperand(2)); Ops.push_back(Chain); break; - case NVPTXISD::Suld1DV4I8Trap: - Opc = NVPTX::SULD_1D_V4I8_TRAP; + case NVPTXISD::Suld1DV2I32Clamp: + Opc = NVPTX::SULD_1D_V2I32_CLAMP; Ops.push_back(TexHandle); Ops.push_back(N->getOperand(2)); Ops.push_back(Chain); break; - case NVPTXISD::Suld1DV4I16Trap: - Opc = NVPTX::SULD_1D_V4I16_TRAP; + case NVPTXISD::Suld1DV2I64Clamp: + Opc = NVPTX::SULD_1D_V2I64_CLAMP; Ops.push_back(TexHandle); Ops.push_back(N->getOperand(2)); Ops.push_back(Chain); break; - case NVPTXISD::Suld1DV4I32Trap: - Opc = NVPTX::SULD_1D_V4I32_TRAP; + case NVPTXISD::Suld1DV4I8Clamp: + Opc = NVPTX::SULD_1D_V4I8_CLAMP; Ops.push_back(TexHandle); Ops.push_back(N->getOperand(2)); Ops.push_back(Chain); break; - case NVPTXISD::Suld1DArrayI8Trap: - Opc = NVPTX::SULD_1D_ARRAY_I8_TRAP; + case NVPTXISD::Suld1DV4I16Clamp: + Opc = NVPTX::SULD_1D_V4I16_CLAMP; Ops.push_back(TexHandle); Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); Ops.push_back(Chain); break; - case NVPTXISD::Suld1DArrayI16Trap: - Opc = NVPTX::SULD_1D_ARRAY_I16_TRAP; + case 
NVPTXISD::Suld1DV4I32Clamp: + Opc = NVPTX::SULD_1D_V4I32_CLAMP; Ops.push_back(TexHandle); Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); Ops.push_back(Chain); break; - case NVPTXISD::Suld1DArrayI32Trap: - Opc = NVPTX::SULD_1D_ARRAY_I32_TRAP; + case NVPTXISD::Suld1DArrayI8Clamp: + Opc = NVPTX::SULD_1D_ARRAY_I8_CLAMP; Ops.push_back(TexHandle); Ops.push_back(N->getOperand(2)); Ops.push_back(N->getOperand(3)); Ops.push_back(Chain); break; - case NVPTXISD::Suld1DArrayV2I8Trap: - Opc = NVPTX::SULD_1D_ARRAY_V2I8_TRAP; + case NVPTXISD::Suld1DArrayI16Clamp: + Opc = NVPTX::SULD_1D_ARRAY_I16_CLAMP; Ops.push_back(TexHandle); Ops.push_back(N->getOperand(2)); Ops.push_back(N->getOperand(3)); Ops.push_back(Chain); break; - case NVPTXISD::Suld1DArrayV2I16Trap: - Opc = NVPTX::SULD_1D_ARRAY_V2I16_TRAP; + case NVPTXISD::Suld1DArrayI32Clamp: + Opc = NVPTX::SULD_1D_ARRAY_I32_CLAMP; Ops.push_back(TexHandle); Ops.push_back(N->getOperand(2)); Ops.push_back(N->getOperand(3)); Ops.push_back(Chain); break; - case NVPTXISD::Suld1DArrayV2I32Trap: - Opc = NVPTX::SULD_1D_ARRAY_V2I32_TRAP; + case NVPTXISD::Suld1DArrayI64Clamp: + Opc = NVPTX::SULD_1D_ARRAY_I64_CLAMP; Ops.push_back(TexHandle); Ops.push_back(N->getOperand(2)); Ops.push_back(N->getOperand(3)); Ops.push_back(Chain); break; - case NVPTXISD::Suld1DArrayV4I8Trap: - Opc = NVPTX::SULD_1D_ARRAY_V4I8_TRAP; + case NVPTXISD::Suld1DArrayV2I8Clamp: + Opc = NVPTX::SULD_1D_ARRAY_V2I8_CLAMP; Ops.push_back(TexHandle); Ops.push_back(N->getOperand(2)); Ops.push_back(N->getOperand(3)); Ops.push_back(Chain); break; - case NVPTXISD::Suld1DArrayV4I16Trap: - Opc = NVPTX::SULD_1D_ARRAY_V4I16_TRAP; + case NVPTXISD::Suld1DArrayV2I16Clamp: + Opc = NVPTX::SULD_1D_ARRAY_V2I16_CLAMP; Ops.push_back(TexHandle); Ops.push_back(N->getOperand(2)); Ops.push_back(N->getOperand(3)); Ops.push_back(Chain); break; - case NVPTXISD::Suld1DArrayV4I32Trap: - Opc = NVPTX::SULD_1D_ARRAY_V4I32_TRAP; + case NVPTXISD::Suld1DArrayV2I32Clamp: + Opc = NVPTX::SULD_1D_ARRAY_V2I32_CLAMP; Ops.push_back(TexHandle); Ops.push_back(N->getOperand(2)); Ops.push_back(N->getOperand(3)); Ops.push_back(Chain); break; - case NVPTXISD::Suld2DI8Trap: - Opc = NVPTX::SULD_2D_I8_TRAP; + case NVPTXISD::Suld1DArrayV2I64Clamp: + Opc = NVPTX::SULD_1D_ARRAY_V2I64_CLAMP; Ops.push_back(TexHandle); Ops.push_back(N->getOperand(2)); Ops.push_back(N->getOperand(3)); Ops.push_back(Chain); break; - case NVPTXISD::Suld2DI16Trap: - Opc = NVPTX::SULD_2D_I16_TRAP; + case NVPTXISD::Suld1DArrayV4I8Clamp: + Opc = NVPTX::SULD_1D_ARRAY_V4I8_CLAMP; Ops.push_back(TexHandle); Ops.push_back(N->getOperand(2)); Ops.push_back(N->getOperand(3)); Ops.push_back(Chain); break; - case NVPTXISD::Suld2DI32Trap: - Opc = NVPTX::SULD_2D_I32_TRAP; + case NVPTXISD::Suld1DArrayV4I16Clamp: + Opc = NVPTX::SULD_1D_ARRAY_V4I16_CLAMP; Ops.push_back(TexHandle); Ops.push_back(N->getOperand(2)); Ops.push_back(N->getOperand(3)); Ops.push_back(Chain); break; - case NVPTXISD::Suld2DV2I8Trap: - Opc = NVPTX::SULD_2D_V2I8_TRAP; + case NVPTXISD::Suld1DArrayV4I32Clamp: + Opc = NVPTX::SULD_1D_ARRAY_V4I32_CLAMP; Ops.push_back(TexHandle); Ops.push_back(N->getOperand(2)); Ops.push_back(N->getOperand(3)); Ops.push_back(Chain); break; - case NVPTXISD::Suld2DV2I16Trap: - Opc = NVPTX::SULD_2D_V2I16_TRAP; + case NVPTXISD::Suld2DI8Clamp: + Opc = NVPTX::SULD_2D_I8_CLAMP; Ops.push_back(TexHandle); Ops.push_back(N->getOperand(2)); Ops.push_back(N->getOperand(3)); Ops.push_back(Chain); break; - case NVPTXISD::Suld2DV2I32Trap: - Opc = NVPTX::SULD_2D_V2I32_TRAP; + case 
NVPTXISD::Suld2DI16Clamp: + Opc = NVPTX::SULD_2D_I16_CLAMP; Ops.push_back(TexHandle); Ops.push_back(N->getOperand(2)); Ops.push_back(N->getOperand(3)); Ops.push_back(Chain); break; - case NVPTXISD::Suld2DV4I8Trap: - Opc = NVPTX::SULD_2D_V4I8_TRAP; + case NVPTXISD::Suld2DI32Clamp: + Opc = NVPTX::SULD_2D_I32_CLAMP; Ops.push_back(TexHandle); Ops.push_back(N->getOperand(2)); Ops.push_back(N->getOperand(3)); Ops.push_back(Chain); break; - case NVPTXISD::Suld2DV4I16Trap: - Opc = NVPTX::SULD_2D_V4I16_TRAP; + case NVPTXISD::Suld2DI64Clamp: + Opc = NVPTX::SULD_2D_I64_CLAMP; Ops.push_back(TexHandle); Ops.push_back(N->getOperand(2)); Ops.push_back(N->getOperand(3)); Ops.push_back(Chain); break; - case NVPTXISD::Suld2DV4I32Trap: - Opc = NVPTX::SULD_2D_V4I32_TRAP; + case NVPTXISD::Suld2DV2I8Clamp: + Opc = NVPTX::SULD_2D_V2I8_CLAMP; Ops.push_back(TexHandle); Ops.push_back(N->getOperand(2)); Ops.push_back(N->getOperand(3)); Ops.push_back(Chain); break; - case NVPTXISD::Suld2DArrayI8Trap: - Opc = NVPTX::SULD_2D_ARRAY_I8_TRAP; + case NVPTXISD::Suld2DV2I16Clamp: + Opc = NVPTX::SULD_2D_V2I16_CLAMP; Ops.push_back(TexHandle); Ops.push_back(N->getOperand(2)); Ops.push_back(N->getOperand(3)); - Ops.push_back(N->getOperand(4)); Ops.push_back(Chain); break; - case NVPTXISD::Suld2DArrayI16Trap: + case NVPTXISD::Suld2DV2I32Clamp: + Opc = NVPTX::SULD_2D_V2I32_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DV2I64Clamp: + Opc = NVPTX::SULD_2D_V2I64_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DV4I8Clamp: + Opc = NVPTX::SULD_2D_V4I8_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DV4I16Clamp: + Opc = NVPTX::SULD_2D_V4I16_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DV4I32Clamp: + Opc = NVPTX::SULD_2D_V4I32_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DArrayI8Clamp: + Opc = NVPTX::SULD_2D_ARRAY_I8_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DArrayI16Clamp: + Opc = NVPTX::SULD_2D_ARRAY_I16_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DArrayI32Clamp: + Opc = NVPTX::SULD_2D_ARRAY_I32_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DArrayI64Clamp: + Opc = NVPTX::SULD_2D_ARRAY_I64_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DArrayV2I8Clamp: + Opc = NVPTX::SULD_2D_ARRAY_V2I8_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case 
NVPTXISD::Suld2DArrayV2I16Clamp: + Opc = NVPTX::SULD_2D_ARRAY_V2I16_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DArrayV2I32Clamp: + Opc = NVPTX::SULD_2D_ARRAY_V2I32_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DArrayV2I64Clamp: + Opc = NVPTX::SULD_2D_ARRAY_V2I64_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DArrayV4I8Clamp: + Opc = NVPTX::SULD_2D_ARRAY_V4I8_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DArrayV4I16Clamp: + Opc = NVPTX::SULD_2D_ARRAY_V4I16_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DArrayV4I32Clamp: + Opc = NVPTX::SULD_2D_ARRAY_V4I32_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld3DI8Clamp: + Opc = NVPTX::SULD_3D_I8_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld3DI16Clamp: + Opc = NVPTX::SULD_3D_I16_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld3DI32Clamp: + Opc = NVPTX::SULD_3D_I32_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld3DI64Clamp: + Opc = NVPTX::SULD_3D_I64_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld3DV2I8Clamp: + Opc = NVPTX::SULD_3D_V2I8_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld3DV2I16Clamp: + Opc = NVPTX::SULD_3D_V2I16_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld3DV2I32Clamp: + Opc = NVPTX::SULD_3D_V2I32_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld3DV2I64Clamp: + Opc = NVPTX::SULD_3D_V2I64_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld3DV4I8Clamp: + Opc = NVPTX::SULD_3D_V4I8_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + 
Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld3DV4I16Clamp: + Opc = NVPTX::SULD_3D_V4I16_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld3DV4I32Clamp: + Opc = NVPTX::SULD_3D_V4I32_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DI8Trap: + Opc = NVPTX::SULD_1D_I8_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DI16Trap: + Opc = NVPTX::SULD_1D_I16_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DI32Trap: + Opc = NVPTX::SULD_1D_I32_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DI64Trap: + Opc = NVPTX::SULD_1D_I64_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DV2I8Trap: + Opc = NVPTX::SULD_1D_V2I8_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DV2I16Trap: + Opc = NVPTX::SULD_1D_V2I16_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DV2I32Trap: + Opc = NVPTX::SULD_1D_V2I32_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DV2I64Trap: + Opc = NVPTX::SULD_1D_V2I64_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DV4I8Trap: + Opc = NVPTX::SULD_1D_V4I8_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DV4I16Trap: + Opc = NVPTX::SULD_1D_V4I16_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DV4I32Trap: + Opc = NVPTX::SULD_1D_V4I32_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DArrayI8Trap: + Opc = NVPTX::SULD_1D_ARRAY_I8_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DArrayI16Trap: + Opc = NVPTX::SULD_1D_ARRAY_I16_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DArrayI32Trap: + Opc = NVPTX::SULD_1D_ARRAY_I32_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DArrayI64Trap: + Opc = NVPTX::SULD_1D_ARRAY_I64_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DArrayV2I8Trap: + Opc = NVPTX::SULD_1D_ARRAY_V2I8_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DArrayV2I16Trap: + Opc = NVPTX::SULD_1D_ARRAY_V2I16_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + 
Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DArrayV2I32Trap: + Opc = NVPTX::SULD_1D_ARRAY_V2I32_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DArrayV2I64Trap: + Opc = NVPTX::SULD_1D_ARRAY_V2I64_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DArrayV4I8Trap: + Opc = NVPTX::SULD_1D_ARRAY_V4I8_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DArrayV4I16Trap: + Opc = NVPTX::SULD_1D_ARRAY_V4I16_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DArrayV4I32Trap: + Opc = NVPTX::SULD_1D_ARRAY_V4I32_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DI8Trap: + Opc = NVPTX::SULD_2D_I8_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DI16Trap: + Opc = NVPTX::SULD_2D_I16_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DI32Trap: + Opc = NVPTX::SULD_2D_I32_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DI64Trap: + Opc = NVPTX::SULD_2D_I64_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DV2I8Trap: + Opc = NVPTX::SULD_2D_V2I8_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DV2I16Trap: + Opc = NVPTX::SULD_2D_V2I16_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DV2I32Trap: + Opc = NVPTX::SULD_2D_V2I32_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DV2I64Trap: + Opc = NVPTX::SULD_2D_V2I64_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DV4I8Trap: + Opc = NVPTX::SULD_2D_V4I8_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DV4I16Trap: + Opc = NVPTX::SULD_2D_V4I16_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DV4I32Trap: + Opc = NVPTX::SULD_2D_V4I32_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DArrayI8Trap: + Opc = NVPTX::SULD_2D_ARRAY_I8_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DArrayI16Trap: Opc = 
NVPTX::SULD_2D_ARRAY_I16_TRAP; Ops.push_back(TexHandle); Ops.push_back(N->getOperand(2)); @@ -2826,128 +4177,556 @@ SDNode *NVPTXDAGToDAGISel::SelectSurfaceIntrinsic(SDNode *N) { Ops.push_back(N->getOperand(4)); Ops.push_back(Chain); break; - case NVPTXISD::Suld2DArrayI32Trap: - Opc = NVPTX::SULD_2D_ARRAY_I32_TRAP; + case NVPTXISD::Suld2DArrayI32Trap: + Opc = NVPTX::SULD_2D_ARRAY_I32_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DArrayI64Trap: + Opc = NVPTX::SULD_2D_ARRAY_I64_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DArrayV2I8Trap: + Opc = NVPTX::SULD_2D_ARRAY_V2I8_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DArrayV2I16Trap: + Opc = NVPTX::SULD_2D_ARRAY_V2I16_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DArrayV2I32Trap: + Opc = NVPTX::SULD_2D_ARRAY_V2I32_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DArrayV2I64Trap: + Opc = NVPTX::SULD_2D_ARRAY_V2I64_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DArrayV4I8Trap: + Opc = NVPTX::SULD_2D_ARRAY_V4I8_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DArrayV4I16Trap: + Opc = NVPTX::SULD_2D_ARRAY_V4I16_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DArrayV4I32Trap: + Opc = NVPTX::SULD_2D_ARRAY_V4I32_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld3DI8Trap: + Opc = NVPTX::SULD_3D_I8_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld3DI16Trap: + Opc = NVPTX::SULD_3D_I16_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld3DI32Trap: + Opc = NVPTX::SULD_3D_I32_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld3DI64Trap: + Opc = NVPTX::SULD_3D_I64_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld3DV2I8Trap: + Opc = NVPTX::SULD_3D_V2I8_TRAP; + Ops.push_back(TexHandle); + 
Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld3DV2I16Trap: + Opc = NVPTX::SULD_3D_V2I16_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld3DV2I32Trap: + Opc = NVPTX::SULD_3D_V2I32_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld3DV2I64Trap: + Opc = NVPTX::SULD_3D_V2I64_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld3DV4I8Trap: + Opc = NVPTX::SULD_3D_V4I8_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld3DV4I16Trap: + Opc = NVPTX::SULD_3D_V4I16_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld3DV4I32Trap: + Opc = NVPTX::SULD_3D_V4I32_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DI8Zero: + Opc = NVPTX::SULD_1D_I8_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DI16Zero: + Opc = NVPTX::SULD_1D_I16_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DI32Zero: + Opc = NVPTX::SULD_1D_I32_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DI64Zero: + Opc = NVPTX::SULD_1D_I64_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DV2I8Zero: + Opc = NVPTX::SULD_1D_V2I8_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DV2I16Zero: + Opc = NVPTX::SULD_1D_V2I16_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DV2I32Zero: + Opc = NVPTX::SULD_1D_V2I32_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DV2I64Zero: + Opc = NVPTX::SULD_1D_V2I64_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DV4I8Zero: + Opc = NVPTX::SULD_1D_V4I8_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DV4I16Zero: + Opc = NVPTX::SULD_1D_V4I16_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DV4I32Zero: + Opc = NVPTX::SULD_1D_V4I32_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DArrayI8Zero: + Opc = NVPTX::SULD_1D_ARRAY_I8_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case 
NVPTXISD::Suld1DArrayI16Zero: + Opc = NVPTX::SULD_1D_ARRAY_I16_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DArrayI32Zero: + Opc = NVPTX::SULD_1D_ARRAY_I32_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DArrayI64Zero: + Opc = NVPTX::SULD_1D_ARRAY_I64_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DArrayV2I8Zero: + Opc = NVPTX::SULD_1D_ARRAY_V2I8_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DArrayV2I16Zero: + Opc = NVPTX::SULD_1D_ARRAY_V2I16_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DArrayV2I32Zero: + Opc = NVPTX::SULD_1D_ARRAY_V2I32_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DArrayV2I64Zero: + Opc = NVPTX::SULD_1D_ARRAY_V2I64_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DArrayV4I8Zero: + Opc = NVPTX::SULD_1D_ARRAY_V4I8_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DArrayV4I16Zero: + Opc = NVPTX::SULD_1D_ARRAY_V4I16_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DArrayV4I32Zero: + Opc = NVPTX::SULD_1D_ARRAY_V4I32_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DI8Zero: + Opc = NVPTX::SULD_2D_I8_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DI16Zero: + Opc = NVPTX::SULD_2D_I16_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DI32Zero: + Opc = NVPTX::SULD_2D_I32_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DI64Zero: + Opc = NVPTX::SULD_2D_I64_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DV2I8Zero: + Opc = NVPTX::SULD_2D_V2I8_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DV2I16Zero: + Opc = NVPTX::SULD_2D_V2I16_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DV2I32Zero: + Opc = NVPTX::SULD_2D_V2I32_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DV2I64Zero: + Opc = NVPTX::SULD_2D_V2I64_ZERO; + 
Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DV4I8Zero: + Opc = NVPTX::SULD_2D_V4I8_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DV4I16Zero: + Opc = NVPTX::SULD_2D_V4I16_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DV4I32Zero: + Opc = NVPTX::SULD_2D_V4I32_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DArrayI8Zero: + Opc = NVPTX::SULD_2D_ARRAY_I8_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DArrayI16Zero: + Opc = NVPTX::SULD_2D_ARRAY_I16_ZERO; Ops.push_back(TexHandle); Ops.push_back(N->getOperand(2)); Ops.push_back(N->getOperand(3)); Ops.push_back(N->getOperand(4)); Ops.push_back(Chain); break; - case NVPTXISD::Suld2DArrayV2I8Trap: - Opc = NVPTX::SULD_2D_ARRAY_V2I8_TRAP; + case NVPTXISD::Suld2DArrayI32Zero: + Opc = NVPTX::SULD_2D_ARRAY_I32_ZERO; Ops.push_back(TexHandle); Ops.push_back(N->getOperand(2)); Ops.push_back(N->getOperand(3)); Ops.push_back(N->getOperand(4)); Ops.push_back(Chain); break; - case NVPTXISD::Suld2DArrayV2I16Trap: - Opc = NVPTX::SULD_2D_ARRAY_V2I16_TRAP; + case NVPTXISD::Suld2DArrayI64Zero: + Opc = NVPTX::SULD_2D_ARRAY_I64_ZERO; Ops.push_back(TexHandle); Ops.push_back(N->getOperand(2)); Ops.push_back(N->getOperand(3)); Ops.push_back(N->getOperand(4)); Ops.push_back(Chain); break; - case NVPTXISD::Suld2DArrayV2I32Trap: - Opc = NVPTX::SULD_2D_ARRAY_V2I32_TRAP; + case NVPTXISD::Suld2DArrayV2I8Zero: + Opc = NVPTX::SULD_2D_ARRAY_V2I8_ZERO; Ops.push_back(TexHandle); Ops.push_back(N->getOperand(2)); Ops.push_back(N->getOperand(3)); Ops.push_back(N->getOperand(4)); Ops.push_back(Chain); break; - case NVPTXISD::Suld2DArrayV4I8Trap: - Opc = NVPTX::SULD_2D_ARRAY_V4I8_TRAP; + case NVPTXISD::Suld2DArrayV2I16Zero: + Opc = NVPTX::SULD_2D_ARRAY_V2I16_ZERO; Ops.push_back(TexHandle); Ops.push_back(N->getOperand(2)); Ops.push_back(N->getOperand(3)); Ops.push_back(N->getOperand(4)); Ops.push_back(Chain); break; - case NVPTXISD::Suld2DArrayV4I16Trap: - Opc = NVPTX::SULD_2D_ARRAY_V4I16_TRAP; + case NVPTXISD::Suld2DArrayV2I32Zero: + Opc = NVPTX::SULD_2D_ARRAY_V2I32_ZERO; Ops.push_back(TexHandle); Ops.push_back(N->getOperand(2)); Ops.push_back(N->getOperand(3)); Ops.push_back(N->getOperand(4)); Ops.push_back(Chain); break; - case NVPTXISD::Suld2DArrayV4I32Trap: - Opc = NVPTX::SULD_2D_ARRAY_V4I32_TRAP; + case NVPTXISD::Suld2DArrayV2I64Zero: + Opc = NVPTX::SULD_2D_ARRAY_V2I64_ZERO; Ops.push_back(TexHandle); Ops.push_back(N->getOperand(2)); Ops.push_back(N->getOperand(3)); Ops.push_back(N->getOperand(4)); Ops.push_back(Chain); break; - case NVPTXISD::Suld3DI8Trap: - Opc = NVPTX::SULD_3D_I8_TRAP; + case NVPTXISD::Suld2DArrayV4I8Zero: + Opc = NVPTX::SULD_2D_ARRAY_V4I8_ZERO; Ops.push_back(TexHandle); Ops.push_back(N->getOperand(2)); Ops.push_back(N->getOperand(3)); Ops.push_back(N->getOperand(4)); Ops.push_back(Chain); break; - case NVPTXISD::Suld3DI16Trap: - Opc = NVPTX::SULD_3D_I16_TRAP; + case NVPTXISD::Suld2DArrayV4I16Zero: + Opc = NVPTX::SULD_2D_ARRAY_V4I16_ZERO; Ops.push_back(TexHandle); 
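 // (Note, not part of the diff: all of the 2-D array and 3-D surface-load
 // cases here marshal operands the same way: the surface handle first, then
 // intrinsic operands 2 through 4 as the index coordinates, then the chain.)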
Ops.push_back(N->getOperand(2)); Ops.push_back(N->getOperand(3)); Ops.push_back(N->getOperand(4)); Ops.push_back(Chain); break; - case NVPTXISD::Suld3DI32Trap: - Opc = NVPTX::SULD_3D_I32_TRAP; + case NVPTXISD::Suld2DArrayV4I32Zero: + Opc = NVPTX::SULD_2D_ARRAY_V4I32_ZERO; Ops.push_back(TexHandle); Ops.push_back(N->getOperand(2)); Ops.push_back(N->getOperand(3)); Ops.push_back(N->getOperand(4)); Ops.push_back(Chain); break; - case NVPTXISD::Suld3DV2I8Trap: - Opc = NVPTX::SULD_3D_V2I8_TRAP; + case NVPTXISD::Suld3DI8Zero: + Opc = NVPTX::SULD_3D_I8_ZERO; Ops.push_back(TexHandle); Ops.push_back(N->getOperand(2)); Ops.push_back(N->getOperand(3)); Ops.push_back(N->getOperand(4)); Ops.push_back(Chain); break; - case NVPTXISD::Suld3DV2I16Trap: - Opc = NVPTX::SULD_3D_V2I16_TRAP; + case NVPTXISD::Suld3DI16Zero: + Opc = NVPTX::SULD_3D_I16_ZERO; Ops.push_back(TexHandle); Ops.push_back(N->getOperand(2)); Ops.push_back(N->getOperand(3)); Ops.push_back(N->getOperand(4)); Ops.push_back(Chain); break; - case NVPTXISD::Suld3DV2I32Trap: - Opc = NVPTX::SULD_3D_V2I32_TRAP; + case NVPTXISD::Suld3DI32Zero: + Opc = NVPTX::SULD_3D_I32_ZERO; Ops.push_back(TexHandle); Ops.push_back(N->getOperand(2)); Ops.push_back(N->getOperand(3)); Ops.push_back(N->getOperand(4)); Ops.push_back(Chain); break; - case NVPTXISD::Suld3DV4I8Trap: - Opc = NVPTX::SULD_3D_V4I8_TRAP; + case NVPTXISD::Suld3DI64Zero: + Opc = NVPTX::SULD_3D_I64_ZERO; Ops.push_back(TexHandle); Ops.push_back(N->getOperand(2)); Ops.push_back(N->getOperand(3)); Ops.push_back(N->getOperand(4)); Ops.push_back(Chain); break; - case NVPTXISD::Suld3DV4I16Trap: - Opc = NVPTX::SULD_3D_V4I16_TRAP; + case NVPTXISD::Suld3DV2I8Zero: + Opc = NVPTX::SULD_3D_V2I8_ZERO; Ops.push_back(TexHandle); Ops.push_back(N->getOperand(2)); Ops.push_back(N->getOperand(3)); Ops.push_back(N->getOperand(4)); Ops.push_back(Chain); break; - case NVPTXISD::Suld3DV4I32Trap: - Opc = NVPTX::SULD_3D_V4I32_TRAP; + case NVPTXISD::Suld3DV2I16Zero: + Opc = NVPTX::SULD_3D_V2I16_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld3DV2I32Zero: + Opc = NVPTX::SULD_3D_V2I32_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld3DV2I64Zero: + Opc = NVPTX::SULD_3D_V2I64_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld3DV4I8Zero: + Opc = NVPTX::SULD_3D_V4I8_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld3DV4I16Zero: + Opc = NVPTX::SULD_3D_V4I16_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld3DV4I32Zero: + Opc = NVPTX::SULD_3D_V4I32_ZERO; Ops.push_back(TexHandle); Ops.push_back(N->getOperand(2)); Ops.push_back(N->getOperand(3)); @@ -2959,6 +4738,215 @@ SDNode *NVPTXDAGToDAGISel::SelectSurfaceIntrinsic(SDNode *N) { return Ret; } + +/// SelectBFE - Look for instruction sequences that can be made more efficient +/// by using the 'bfe' (bit-field extract) PTX instruction +SDNode *NVPTXDAGToDAGISel::SelectBFE(SDNode 
*N) {
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+  SDValue Len;
+  SDValue Start;
+  SDValue Val;
+  bool IsSigned = false;
+
+  if (N->getOpcode() == ISD::AND) {
+    // Canonicalize the operands
+    // We want 'and %val, %mask'
+    if (isa<ConstantSDNode>(LHS) && !isa<ConstantSDNode>(RHS)) {
+      std::swap(LHS, RHS);
+    }
+
+    ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS);
+    if (!Mask) {
+      // We need a constant mask on the RHS of the AND
+      return NULL;
+    }
+
+    // Extract the mask bits
+    uint64_t MaskVal = Mask->getZExtValue();
+    if (!isMask_64(MaskVal)) {
+      // We *could* handle shifted masks here, but doing so would require an
+      // 'and' operation to fix up the low-order bits, so we would trade
+      // shr+and for bfe+and, which has the same throughput
+      return NULL;
+    }
+
+    // How many bits are in our mask?
+    uint64_t NumBits = CountTrailingOnes_64(MaskVal);
+    Len = CurDAG->getTargetConstant(NumBits, MVT::i32);
+
+    if (LHS.getOpcode() == ISD::SRL || LHS.getOpcode() == ISD::SRA) {
+      // We have a 'srl/and' pair; extract the effective start bit and length
+      Val = LHS.getNode()->getOperand(0);
+      Start = LHS.getNode()->getOperand(1);
+      ConstantSDNode *StartConst = dyn_cast<ConstantSDNode>(Start);
+      if (StartConst) {
+        uint64_t StartVal = StartConst->getZExtValue();
+        // How many "good" bits do we have left? "good" is defined here as
+        // bits that exist in the original value, not shifted in.
+        uint64_t GoodBits = Start.getValueType().getSizeInBits() - StartVal;
+        if (NumBits > GoodBits) {
+          // Do not handle the case where bits have been shifted in. In
+          // theory we could handle this, but the cost is likely higher than
+          // just emitting the srl/and pair.
+          return NULL;
+        }
+        Start = CurDAG->getTargetConstant(StartVal, MVT::i32);
+      } else {
+        // Do not handle the case where the shift amount (which can be zero
+        // if no srl was found) is not constant. We could handle this case,
+        // but it would require run-time logic that would be more expensive
+        // than just emitting the srl/and pair.
+        return NULL;
+      }
+    } else {
+      // Do not handle the case where the LHS of the and is not a shift.
+      // While it would be trivial to handle this case, it would just
+      // transform 'and' -> 'bfe', but 'and' has higher throughput.
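+      // (Illustrative note, not part of the patch: the pattern that is
+      // handled above, e.g. (and (srl %x, 8), 0xff) on i32, yields
+      // Start = 8 and NumBits = 8, which selects to bfe.u32 %r, %x, 8, 8.)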
+      return NULL;
+    }
+  } else if (N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) {
+    if (LHS->getOpcode() == ISD::AND) {
+      ConstantSDNode *ShiftCnst = dyn_cast<ConstantSDNode>(RHS);
+      if (!ShiftCnst) {
+        // Shift amount must be constant
+        return NULL;
+      }
+
+      uint64_t ShiftAmt = ShiftCnst->getZExtValue();
+
+      SDValue AndLHS = LHS->getOperand(0);
+      SDValue AndRHS = LHS->getOperand(1);
+
+      // Canonicalize the AND to have the mask on the RHS
+      if (isa<ConstantSDNode>(AndLHS)) {
+        std::swap(AndLHS, AndRHS);
+      }
+
+      ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(AndRHS);
+      if (!MaskCnst) {
+        // Mask must be constant
+        return NULL;
+      }
+
+      uint64_t MaskVal = MaskCnst->getZExtValue();
+      uint64_t NumZeros;
+      uint64_t NumBits;
+      if (isMask_64(MaskVal)) {
+        NumZeros = 0;
+        // The number of bits in the result bitfield will be the number of
+        // trailing ones (the AND) minus the number of bits we shift off
+        NumBits = CountTrailingOnes_64(MaskVal) - ShiftAmt;
+      } else if (isShiftedMask_64(MaskVal)) {
+        NumZeros = countTrailingZeros(MaskVal);
+        unsigned NumOnes = CountTrailingOnes_64(MaskVal >> NumZeros);
+        // The number of bits in the result bitfield will be the number of
+        // trailing zeros plus the number of set bits in the mask minus the
+        // number of bits we shift off
+        NumBits = NumZeros + NumOnes - ShiftAmt;
+      } else {
+        // This is not a mask we can handle
+        return NULL;
+      }
+
+      if (ShiftAmt < NumZeros) {
+        // Handling this case would require extra logic that would make the
+        // transformation unprofitable
+        return NULL;
+      }
+
+      Val = AndLHS;
+      Start = CurDAG->getTargetConstant(ShiftAmt, MVT::i32);
+      Len = CurDAG->getTargetConstant(NumBits, MVT::i32);
+    } else if (LHS->getOpcode() == ISD::SHL) {
+      // Here, we have a pattern like:
+      //
+      // (sra (shl val, NN), MM)
+      // or
+      // (srl (shl val, NN), MM)
+      //
+      // If MM >= NN, we can efficiently optimize this with bfe
+      Val = LHS->getOperand(0);
+
+      SDValue ShlRHS = LHS->getOperand(1);
+      ConstantSDNode *ShlCnst = dyn_cast<ConstantSDNode>(ShlRHS);
+      if (!ShlCnst) {
+        // Shift amount must be constant
+        return NULL;
+      }
+      uint64_t InnerShiftAmt = ShlCnst->getZExtValue();
+
+      SDValue ShrRHS = RHS;
+      ConstantSDNode *ShrCnst = dyn_cast<ConstantSDNode>(ShrRHS);
+      if (!ShrCnst) {
+        // Shift amount must be constant
+        return NULL;
+      }
+      uint64_t OuterShiftAmt = ShrCnst->getZExtValue();
+
+      // To avoid extra codegen and be profitable, we need Outer >= Inner
+      if (OuterShiftAmt < InnerShiftAmt) {
+        return NULL;
+      }
+
+      // If the outer shift is no less than the type size, we have no
+      // bitfield to extract (and since we also checked that the inner shift
+      // is <= the outer shift, the inner shift must then be < the type size)
+      if (OuterShiftAmt >= Val.getValueType().getSizeInBits()) {
+        return NULL;
+      }
+
+      Start =
+          CurDAG->getTargetConstant(OuterShiftAmt - InnerShiftAmt, MVT::i32);
+      Len =
+          CurDAG->getTargetConstant(Val.getValueType().getSizeInBits() -
+                                    OuterShiftAmt, MVT::i32);
+
+      if (N->getOpcode() == ISD::SRA) {
+        // If we have an arithmetic right shift, we need to use the signed
+        // bfe variant
+        IsSigned = true;
+      }
+    } else {
+      // No can do...
+      return NULL;
+    }
+  } else {
+    // No can do...
+    return NULL;
+  }
+
+  unsigned Opc;
+  // For the BFE operations we form here from "and" and "srl", always use the
+  // unsigned variants; the signed variants are only needed for the
+  // sra-of-shl case above.
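+  // (Worked example, added for illustration: (srl (and %x, 0xff0), 4) on i32
+  // hits the shifted-mask case with NumZeros = 4 and NumOnes = 8, giving
+  // NumBits = 4 + 8 - 4 = 8 and Start = 4, i.e. bfe.u32 %r, %x, 4, 8, which
+  // extracts bits 4..11 exactly like the original shift-and-mask pair.)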
+  if (Val.getValueType() == MVT::i32) {
+    if (IsSigned) {
+      Opc = NVPTX::BFE_S32rii;
+    } else {
+      Opc = NVPTX::BFE_U32rii;
+    }
+  } else if (Val.getValueType() == MVT::i64) {
+    if (IsSigned) {
+      Opc = NVPTX::BFE_S64rii;
+    } else {
+      Opc = NVPTX::BFE_U64rii;
+    }
+  } else {
+    // We cannot handle this type
+    return NULL;
+  }
+
+  SDValue Ops[] = {
+    Val, Start, Len
+  };
+
+  SDNode *Ret =
+    CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops);
+
+  return Ret;
+}
+
 // SelectDirectAddr - Match a direct address for DAG.
 // A direct address could be a globaladdress or externalsymbol.
 bool NVPTXDAGToDAGISel::SelectDirectAddr(SDValue N, SDValue &Address) {
diff --git a/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
index 11f92e79d99c..c62fc253c33d 100644
--- a/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -24,20 +24,13 @@ namespace {
 class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
-  // If true, generate corresponding FPCONTRACT. This is
-  // language dependent (i.e. CUDA and OpenCL works differently).
-  bool doFMAF64;
-  bool doFMAF32;
-  bool doFMAF64AGG;
-  bool doFMAF32AGG;
-  bool allowFMA;
-
   // If true, generate mul.wide from sext and mul
   bool doMulWide;
 
   int getDivF32Level() const;
   bool usePrecSqrtF32() const;
   bool useF32FTZ() const;
+  bool allowFMA() const;
 
 public:
   explicit NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
@@ -59,10 +52,11 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
   SDNode *Select(SDNode *N) override;
   SDNode *SelectIntrinsicNoChain(SDNode *N);
+  SDNode *SelectIntrinsicChain(SDNode *N);
   SDNode *SelectTexSurfHandle(SDNode *N);
   SDNode *SelectLoad(SDNode *N);
   SDNode *SelectLoadVector(SDNode *N);
-  SDNode *SelectLDGLDUVector(SDNode *N);
+  SDNode *SelectLDGLDU(SDNode *N);
   SDNode *SelectStore(SDNode *N);
   SDNode *SelectStoreVector(SDNode *N);
   SDNode *SelectLoadParam(SDNode *N);
@@ -71,6 +65,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
   SDNode *SelectAddrSpaceCast(SDNode *N);
   SDNode *SelectTextureIntrinsic(SDNode *N);
   SDNode *SelectSurfaceIntrinsic(SDNode *N);
+  SDNode *SelectBFE(SDNode *N);
 
   inline SDValue getI32Imm(unsigned Imm) {
     return CurDAG->getTargetConstant(Imm, MVT::i32);
diff --git a/lib/Target/NVPTX/NVPTXISelLowering.cpp b/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 0e3e0d50adee..05bad16ddd89 100644
--- a/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -33,6 +33,7 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
 #include
@@ -47,6 +48,12 @@
 static cl::opt<bool> sched4reg(
     "nvptx-sched4reg",
     cl::desc("NVPTX Specific: schedule for register pressure"), cl::init(false));
+static cl::opt<int>
+FMAContractLevelOpt("nvptx-fma-level", cl::ZeroOrMore, cl::Hidden,
+                    cl::desc("NVPTX Specific: FMA contraction (0: don't do it,"
+                             " 1: do it, 2: do it aggressively)"),
+                    cl::init(2));
+
 static bool IsPTXVectorType(MVT VT) {
   switch (VT.SimpleTy) {
   default:
@@ -111,6 +118,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(NVPTXTargetMachine &TM)
   MaxStoresPerMemmove = (unsigned) 0xFFFFFFFF;
 
   setBooleanContents(ZeroOrNegativeOneBooleanContent);
+  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
 
   // Jump is Expensive. Don't create extra control flow for 'and', 'or'
   // condition branches.
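For reference, the nvptx-fma-level option introduced above folds the old per-type doFMA* flags into a single tri-state contraction level, consulted by instruction selection through the allowFMA() hook declared in NVPTXISelDAGToDAG.h. Below is a minimal sketch of the gating such a level implies; the helper name and the fast-math parameter are hypothetical, for illustration only, and the patch's actual logic lives in allowFMA():

  // Tri-state FMA contraction: may (fadd (fmul a, b), c) become fma(a, b, c)?
  // Level 0 = never, level 2 = always; level 1 defers to the prevailing
  // fast-math / FP-contract setting (assumed interface).
  static bool allowFMAContraction(int Level, bool UnsafeFPMath) {
    if (Level == 0)
      return false;        // contraction disabled
    if (Level == 2)
      return true;         // aggressive: fuse unconditionally
    return UnsafeFPMath;   // level 1: fuse only when FP rules permit
  }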
@@ -152,6 +160,13 @@ NVPTXTargetLowering::NVPTXTargetLowering(NVPTXTargetMachine &TM) setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); + setOperationAction(ISD::SHL_PARTS, MVT::i32 , Custom); + setOperationAction(ISD::SRA_PARTS, MVT::i32 , Custom); + setOperationAction(ISD::SRL_PARTS, MVT::i32 , Custom); + setOperationAction(ISD::SHL_PARTS, MVT::i64 , Custom); + setOperationAction(ISD::SRA_PARTS, MVT::i64 , Custom); + setOperationAction(ISD::SRL_PARTS, MVT::i64 , Custom); + if (nvptxSubtarget.hasROT64()) { setOperationAction(ISD::ROTL, MVT::i64, Legal); setOperationAction(ISD::ROTR, MVT::i64, Legal); @@ -188,8 +203,11 @@ NVPTXTargetLowering::NVPTXTargetLowering(NVPTXTargetMachine &TM) setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); // Turn FP extload into load/fextend + setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand); // Turn FP truncstore into trunc + store. + setTruncStoreAction(MVT::f32, MVT::f16, Expand); + setTruncStoreAction(MVT::f64, MVT::f16, Expand); setTruncStoreAction(MVT::f64, MVT::f32, Expand); // PTX does not support load / store predicate registers @@ -243,6 +261,13 @@ NVPTXTargetLowering::NVPTXTargetLowering(NVPTXTargetMachine &TM) setOperationAction(ISD::CTPOP, MVT::i32, Legal); setOperationAction(ISD::CTPOP, MVT::i64, Legal); + // We have some custom DAG combine patterns for these nodes + setTargetDAGCombine(ISD::ADD); + setTargetDAGCombine(ISD::AND); + setTargetDAGCombine(ISD::FADD); + setTargetDAGCombine(ISD::MUL); + setTargetDAGCombine(ISD::SHL); + // Now deduce the information based on the above mentioned // actions computeRegisterProperties(); @@ -334,73 +359,389 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const { return "NVPTXISD::StoreV2"; case NVPTXISD::StoreV4: return "NVPTXISD::StoreV4"; - case NVPTXISD::Tex1DFloatI32: return "NVPTXISD::Tex1DFloatI32"; + case NVPTXISD::FUN_SHFL_CLAMP: + return "NVPTXISD::FUN_SHFL_CLAMP"; + case NVPTXISD::FUN_SHFR_CLAMP: + return "NVPTXISD::FUN_SHFR_CLAMP"; + case NVPTXISD::IMAD: + return "NVPTXISD::IMAD"; + case NVPTXISD::MUL_WIDE_SIGNED: + return "NVPTXISD::MUL_WIDE_SIGNED"; + case NVPTXISD::MUL_WIDE_UNSIGNED: + return "NVPTXISD::MUL_WIDE_UNSIGNED"; + case NVPTXISD::Tex1DFloatS32: return "NVPTXISD::Tex1DFloatS32"; case NVPTXISD::Tex1DFloatFloat: return "NVPTXISD::Tex1DFloatFloat"; case NVPTXISD::Tex1DFloatFloatLevel: return "NVPTXISD::Tex1DFloatFloatLevel"; case NVPTXISD::Tex1DFloatFloatGrad: return "NVPTXISD::Tex1DFloatFloatGrad"; - case NVPTXISD::Tex1DI32I32: return "NVPTXISD::Tex1DI32I32"; - case NVPTXISD::Tex1DI32Float: return "NVPTXISD::Tex1DI32Float"; - case NVPTXISD::Tex1DI32FloatLevel: - return "NVPTXISD::Tex1DI32FloatLevel"; - case NVPTXISD::Tex1DI32FloatGrad: - return "NVPTXISD::Tex1DI32FloatGrad"; - case NVPTXISD::Tex1DArrayFloatI32: return "NVPTXISD::Tex2DArrayFloatI32"; - case NVPTXISD::Tex1DArrayFloatFloat: return "NVPTXISD::Tex2DArrayFloatFloat"; + case NVPTXISD::Tex1DS32S32: return "NVPTXISD::Tex1DS32S32"; + case NVPTXISD::Tex1DS32Float: return "NVPTXISD::Tex1DS32Float"; + case NVPTXISD::Tex1DS32FloatLevel: + return "NVPTXISD::Tex1DS32FloatLevel"; + case NVPTXISD::Tex1DS32FloatGrad: + return "NVPTXISD::Tex1DS32FloatGrad"; + case NVPTXISD::Tex1DU32S32: return "NVPTXISD::Tex1DU32S32"; + case NVPTXISD::Tex1DU32Float: return "NVPTXISD::Tex1DU32Float"; + case NVPTXISD::Tex1DU32FloatLevel: + return "NVPTXISD::Tex1DU32FloatLevel"; + case 
NVPTXISD::Tex1DU32FloatGrad: + return "NVPTXISD::Tex1DU32FloatGrad"; + case NVPTXISD::Tex1DArrayFloatS32: return "NVPTXISD::Tex1DArrayFloatS32"; + case NVPTXISD::Tex1DArrayFloatFloat: return "NVPTXISD::Tex1DArrayFloatFloat"; case NVPTXISD::Tex1DArrayFloatFloatLevel: - return "NVPTXISD::Tex2DArrayFloatFloatLevel"; + return "NVPTXISD::Tex1DArrayFloatFloatLevel"; case NVPTXISD::Tex1DArrayFloatFloatGrad: - return "NVPTXISD::Tex2DArrayFloatFloatGrad"; - case NVPTXISD::Tex1DArrayI32I32: return "NVPTXISD::Tex2DArrayI32I32"; - case NVPTXISD::Tex1DArrayI32Float: return "NVPTXISD::Tex2DArrayI32Float"; - case NVPTXISD::Tex1DArrayI32FloatLevel: - return "NVPTXISD::Tex2DArrayI32FloatLevel"; - case NVPTXISD::Tex1DArrayI32FloatGrad: - return "NVPTXISD::Tex2DArrayI32FloatGrad"; - case NVPTXISD::Tex2DFloatI32: return "NVPTXISD::Tex2DFloatI32"; + return "NVPTXISD::Tex1DArrayFloatFloatGrad"; + case NVPTXISD::Tex1DArrayS32S32: return "NVPTXISD::Tex1DArrayS32S32"; + case NVPTXISD::Tex1DArrayS32Float: return "NVPTXISD::Tex1DArrayS32Float"; + case NVPTXISD::Tex1DArrayS32FloatLevel: + return "NVPTXISD::Tex1DArrayS32FloatLevel"; + case NVPTXISD::Tex1DArrayS32FloatGrad: + return "NVPTXISD::Tex1DArrayS32FloatGrad"; + case NVPTXISD::Tex1DArrayU32S32: return "NVPTXISD::Tex1DArrayU32S32"; + case NVPTXISD::Tex1DArrayU32Float: return "NVPTXISD::Tex1DArrayU32Float"; + case NVPTXISD::Tex1DArrayU32FloatLevel: + return "NVPTXISD::Tex1DArrayU32FloatLevel"; + case NVPTXISD::Tex1DArrayU32FloatGrad: + return "NVPTXISD::Tex1DArrayU32FloatGrad"; + case NVPTXISD::Tex2DFloatS32: return "NVPTXISD::Tex2DFloatS32"; case NVPTXISD::Tex2DFloatFloat: return "NVPTXISD::Tex2DFloatFloat"; case NVPTXISD::Tex2DFloatFloatLevel: return "NVPTXISD::Tex2DFloatFloatLevel"; case NVPTXISD::Tex2DFloatFloatGrad: return "NVPTXISD::Tex2DFloatFloatGrad"; - case NVPTXISD::Tex2DI32I32: return "NVPTXISD::Tex2DI32I32"; - case NVPTXISD::Tex2DI32Float: return "NVPTXISD::Tex2DI32Float"; - case NVPTXISD::Tex2DI32FloatLevel: - return "NVPTXISD::Tex2DI32FloatLevel"; - case NVPTXISD::Tex2DI32FloatGrad: - return "NVPTXISD::Tex2DI32FloatGrad"; - case NVPTXISD::Tex2DArrayFloatI32: return "NVPTXISD::Tex2DArrayFloatI32"; + case NVPTXISD::Tex2DS32S32: return "NVPTXISD::Tex2DS32S32"; + case NVPTXISD::Tex2DS32Float: return "NVPTXISD::Tex2DS32Float"; + case NVPTXISD::Tex2DS32FloatLevel: + return "NVPTXISD::Tex2DS32FloatLevel"; + case NVPTXISD::Tex2DS32FloatGrad: + return "NVPTXISD::Tex2DS32FloatGrad"; + case NVPTXISD::Tex2DU32S32: return "NVPTXISD::Tex2DU32S32"; + case NVPTXISD::Tex2DU32Float: return "NVPTXISD::Tex2DU32Float"; + case NVPTXISD::Tex2DU32FloatLevel: + return "NVPTXISD::Tex2DU32FloatLevel"; + case NVPTXISD::Tex2DU32FloatGrad: + return "NVPTXISD::Tex2DU32FloatGrad"; + case NVPTXISD::Tex2DArrayFloatS32: return "NVPTXISD::Tex2DArrayFloatS32"; case NVPTXISD::Tex2DArrayFloatFloat: return "NVPTXISD::Tex2DArrayFloatFloat"; case NVPTXISD::Tex2DArrayFloatFloatLevel: return "NVPTXISD::Tex2DArrayFloatFloatLevel"; case NVPTXISD::Tex2DArrayFloatFloatGrad: return "NVPTXISD::Tex2DArrayFloatFloatGrad"; - case NVPTXISD::Tex2DArrayI32I32: return "NVPTXISD::Tex2DArrayI32I32"; - case NVPTXISD::Tex2DArrayI32Float: return "NVPTXISD::Tex2DArrayI32Float"; - case NVPTXISD::Tex2DArrayI32FloatLevel: - return "NVPTXISD::Tex2DArrayI32FloatLevel"; - case NVPTXISD::Tex2DArrayI32FloatGrad: - return "NVPTXISD::Tex2DArrayI32FloatGrad"; - case NVPTXISD::Tex3DFloatI32: return "NVPTXISD::Tex3DFloatI32"; + case NVPTXISD::Tex2DArrayS32S32: return "NVPTXISD::Tex2DArrayS32S32"; + case 
NVPTXISD::Tex2DArrayS32Float: return "NVPTXISD::Tex2DArrayS32Float"; + case NVPTXISD::Tex2DArrayS32FloatLevel: + return "NVPTXISD::Tex2DArrayS32FloatLevel"; + case NVPTXISD::Tex2DArrayS32FloatGrad: + return "NVPTXISD::Tex2DArrayS32FloatGrad"; + case NVPTXISD::Tex2DArrayU32S32: return "NVPTXISD::Tex2DArrayU32S32"; + case NVPTXISD::Tex2DArrayU32Float: return "NVPTXISD::Tex2DArrayU32Float"; + case NVPTXISD::Tex2DArrayU32FloatLevel: + return "NVPTXISD::Tex2DArrayU32FloatLevel"; + case NVPTXISD::Tex2DArrayU32FloatGrad: + return "NVPTXISD::Tex2DArrayU32FloatGrad"; + case NVPTXISD::Tex3DFloatS32: return "NVPTXISD::Tex3DFloatS32"; case NVPTXISD::Tex3DFloatFloat: return "NVPTXISD::Tex3DFloatFloat"; case NVPTXISD::Tex3DFloatFloatLevel: return "NVPTXISD::Tex3DFloatFloatLevel"; case NVPTXISD::Tex3DFloatFloatGrad: return "NVPTXISD::Tex3DFloatFloatGrad"; - case NVPTXISD::Tex3DI32I32: return "NVPTXISD::Tex3DI32I32"; - case NVPTXISD::Tex3DI32Float: return "NVPTXISD::Tex3DI32Float"; - case NVPTXISD::Tex3DI32FloatLevel: - return "NVPTXISD::Tex3DI32FloatLevel"; - case NVPTXISD::Tex3DI32FloatGrad: - return "NVPTXISD::Tex3DI32FloatGrad"; + case NVPTXISD::Tex3DS32S32: return "NVPTXISD::Tex3DS32S32"; + case NVPTXISD::Tex3DS32Float: return "NVPTXISD::Tex3DS32Float"; + case NVPTXISD::Tex3DS32FloatLevel: + return "NVPTXISD::Tex3DS32FloatLevel"; + case NVPTXISD::Tex3DS32FloatGrad: + return "NVPTXISD::Tex3DS32FloatGrad"; + case NVPTXISD::Tex3DU32S32: return "NVPTXISD::Tex3DU32S32"; + case NVPTXISD::Tex3DU32Float: return "NVPTXISD::Tex3DU32Float"; + case NVPTXISD::Tex3DU32FloatLevel: + return "NVPTXISD::Tex3DU32FloatLevel"; + case NVPTXISD::Tex3DU32FloatGrad: + return "NVPTXISD::Tex3DU32FloatGrad"; + case NVPTXISD::TexCubeFloatFloat: return "NVPTXISD::TexCubeFloatFloat"; + case NVPTXISD::TexCubeFloatFloatLevel: + return "NVPTXISD::TexCubeFloatFloatLevel"; + case NVPTXISD::TexCubeS32Float: return "NVPTXISD::TexCubeS32Float"; + case NVPTXISD::TexCubeS32FloatLevel: + return "NVPTXISD::TexCubeS32FloatLevel"; + case NVPTXISD::TexCubeU32Float: return "NVPTXISD::TexCubeU32Float"; + case NVPTXISD::TexCubeU32FloatLevel: + return "NVPTXISD::TexCubeU32FloatLevel"; + case NVPTXISD::TexCubeArrayFloatFloat: + return "NVPTXISD::TexCubeArrayFloatFloat"; + case NVPTXISD::TexCubeArrayFloatFloatLevel: + return "NVPTXISD::TexCubeArrayFloatFloatLevel"; + case NVPTXISD::TexCubeArrayS32Float: + return "NVPTXISD::TexCubeArrayS32Float"; + case NVPTXISD::TexCubeArrayS32FloatLevel: + return "NVPTXISD::TexCubeArrayS32FloatLevel"; + case NVPTXISD::TexCubeArrayU32Float: + return "NVPTXISD::TexCubeArrayU32Float"; + case NVPTXISD::TexCubeArrayU32FloatLevel: + return "NVPTXISD::TexCubeArrayU32FloatLevel"; + case NVPTXISD::Tld4R2DFloatFloat: + return "NVPTXISD::Tld4R2DFloatFloat"; + case NVPTXISD::Tld4G2DFloatFloat: + return "NVPTXISD::Tld4G2DFloatFloat"; + case NVPTXISD::Tld4B2DFloatFloat: + return "NVPTXISD::Tld4B2DFloatFloat"; + case NVPTXISD::Tld4A2DFloatFloat: + return "NVPTXISD::Tld4A2DFloatFloat"; + case NVPTXISD::Tld4R2DS64Float: + return "NVPTXISD::Tld4R2DS64Float"; + case NVPTXISD::Tld4G2DS64Float: + return "NVPTXISD::Tld4G2DS64Float"; + case NVPTXISD::Tld4B2DS64Float: + return "NVPTXISD::Tld4B2DS64Float"; + case NVPTXISD::Tld4A2DS64Float: + return "NVPTXISD::Tld4A2DS64Float"; + case NVPTXISD::Tld4R2DU64Float: + return "NVPTXISD::Tld4R2DU64Float"; + case NVPTXISD::Tld4G2DU64Float: + return "NVPTXISD::Tld4G2DU64Float"; + case NVPTXISD::Tld4B2DU64Float: + return "NVPTXISD::Tld4B2DU64Float"; + case NVPTXISD::Tld4A2DU64Float: + return 
"NVPTXISD::Tld4A2DU64Float"; + + case NVPTXISD::TexUnified1DFloatS32: + return "NVPTXISD::TexUnified1DFloatS32"; + case NVPTXISD::TexUnified1DFloatFloat: + return "NVPTXISD::TexUnified1DFloatFloat"; + case NVPTXISD::TexUnified1DFloatFloatLevel: + return "NVPTXISD::TexUnified1DFloatFloatLevel"; + case NVPTXISD::TexUnified1DFloatFloatGrad: + return "NVPTXISD::TexUnified1DFloatFloatGrad"; + case NVPTXISD::TexUnified1DS32S32: + return "NVPTXISD::TexUnified1DS32S32"; + case NVPTXISD::TexUnified1DS32Float: + return "NVPTXISD::TexUnified1DS32Float"; + case NVPTXISD::TexUnified1DS32FloatLevel: + return "NVPTXISD::TexUnified1DS32FloatLevel"; + case NVPTXISD::TexUnified1DS32FloatGrad: + return "NVPTXISD::TexUnified1DS32FloatGrad"; + case NVPTXISD::TexUnified1DU32S32: + return "NVPTXISD::TexUnified1DU32S32"; + case NVPTXISD::TexUnified1DU32Float: + return "NVPTXISD::TexUnified1DU32Float"; + case NVPTXISD::TexUnified1DU32FloatLevel: + return "NVPTXISD::TexUnified1DU32FloatLevel"; + case NVPTXISD::TexUnified1DU32FloatGrad: + return "NVPTXISD::TexUnified1DU32FloatGrad"; + case NVPTXISD::TexUnified1DArrayFloatS32: + return "NVPTXISD::TexUnified1DArrayFloatS32"; + case NVPTXISD::TexUnified1DArrayFloatFloat: + return "NVPTXISD::TexUnified1DArrayFloatFloat"; + case NVPTXISD::TexUnified1DArrayFloatFloatLevel: + return "NVPTXISD::TexUnified1DArrayFloatFloatLevel"; + case NVPTXISD::TexUnified1DArrayFloatFloatGrad: + return "NVPTXISD::TexUnified1DArrayFloatFloatGrad"; + case NVPTXISD::TexUnified1DArrayS32S32: + return "NVPTXISD::TexUnified1DArrayS32S32"; + case NVPTXISD::TexUnified1DArrayS32Float: + return "NVPTXISD::TexUnified1DArrayS32Float"; + case NVPTXISD::TexUnified1DArrayS32FloatLevel: + return "NVPTXISD::TexUnified1DArrayS32FloatLevel"; + case NVPTXISD::TexUnified1DArrayS32FloatGrad: + return "NVPTXISD::TexUnified1DArrayS32FloatGrad"; + case NVPTXISD::TexUnified1DArrayU32S32: + return "NVPTXISD::TexUnified1DArrayU32S32"; + case NVPTXISD::TexUnified1DArrayU32Float: + return "NVPTXISD::TexUnified1DArrayU32Float"; + case NVPTXISD::TexUnified1DArrayU32FloatLevel: + return "NVPTXISD::TexUnified1DArrayU32FloatLevel"; + case NVPTXISD::TexUnified1DArrayU32FloatGrad: + return "NVPTXISD::TexUnified1DArrayU32FloatGrad"; + case NVPTXISD::TexUnified2DFloatS32: + return "NVPTXISD::TexUnified2DFloatS32"; + case NVPTXISD::TexUnified2DFloatFloat: + return "NVPTXISD::TexUnified2DFloatFloat"; + case NVPTXISD::TexUnified2DFloatFloatLevel: + return "NVPTXISD::TexUnified2DFloatFloatLevel"; + case NVPTXISD::TexUnified2DFloatFloatGrad: + return "NVPTXISD::TexUnified2DFloatFloatGrad"; + case NVPTXISD::TexUnified2DS32S32: + return "NVPTXISD::TexUnified2DS32S32"; + case NVPTXISD::TexUnified2DS32Float: + return "NVPTXISD::TexUnified2DS32Float"; + case NVPTXISD::TexUnified2DS32FloatLevel: + return "NVPTXISD::TexUnified2DS32FloatLevel"; + case NVPTXISD::TexUnified2DS32FloatGrad: + return "NVPTXISD::TexUnified2DS32FloatGrad"; + case NVPTXISD::TexUnified2DU32S32: + return "NVPTXISD::TexUnified2DU32S32"; + case NVPTXISD::TexUnified2DU32Float: + return "NVPTXISD::TexUnified2DU32Float"; + case NVPTXISD::TexUnified2DU32FloatLevel: + return "NVPTXISD::TexUnified2DU32FloatLevel"; + case NVPTXISD::TexUnified2DU32FloatGrad: + return "NVPTXISD::TexUnified2DU32FloatGrad"; + case NVPTXISD::TexUnified2DArrayFloatS32: + return "NVPTXISD::TexUnified2DArrayFloatS32"; + case NVPTXISD::TexUnified2DArrayFloatFloat: + return "NVPTXISD::TexUnified2DArrayFloatFloat"; + case NVPTXISD::TexUnified2DArrayFloatFloatLevel: + return 
"NVPTXISD::TexUnified2DArrayFloatFloatLevel"; + case NVPTXISD::TexUnified2DArrayFloatFloatGrad: + return "NVPTXISD::TexUnified2DArrayFloatFloatGrad"; + case NVPTXISD::TexUnified2DArrayS32S32: + return "NVPTXISD::TexUnified2DArrayS32S32"; + case NVPTXISD::TexUnified2DArrayS32Float: + return "NVPTXISD::TexUnified2DArrayS32Float"; + case NVPTXISD::TexUnified2DArrayS32FloatLevel: + return "NVPTXISD::TexUnified2DArrayS32FloatLevel"; + case NVPTXISD::TexUnified2DArrayS32FloatGrad: + return "NVPTXISD::TexUnified2DArrayS32FloatGrad"; + case NVPTXISD::TexUnified2DArrayU32S32: + return "NVPTXISD::TexUnified2DArrayU32S32"; + case NVPTXISD::TexUnified2DArrayU32Float: + return "NVPTXISD::TexUnified2DArrayU32Float"; + case NVPTXISD::TexUnified2DArrayU32FloatLevel: + return "NVPTXISD::TexUnified2DArrayU32FloatLevel"; + case NVPTXISD::TexUnified2DArrayU32FloatGrad: + return "NVPTXISD::TexUnified2DArrayU32FloatGrad"; + case NVPTXISD::TexUnified3DFloatS32: + return "NVPTXISD::TexUnified3DFloatS32"; + case NVPTXISD::TexUnified3DFloatFloat: + return "NVPTXISD::TexUnified3DFloatFloat"; + case NVPTXISD::TexUnified3DFloatFloatLevel: + return "NVPTXISD::TexUnified3DFloatFloatLevel"; + case NVPTXISD::TexUnified3DFloatFloatGrad: + return "NVPTXISD::TexUnified3DFloatFloatGrad"; + case NVPTXISD::TexUnified3DS32S32: + return "NVPTXISD::TexUnified3DS32S32"; + case NVPTXISD::TexUnified3DS32Float: + return "NVPTXISD::TexUnified3DS32Float"; + case NVPTXISD::TexUnified3DS32FloatLevel: + return "NVPTXISD::TexUnified3DS32FloatLevel"; + case NVPTXISD::TexUnified3DS32FloatGrad: + return "NVPTXISD::TexUnified3DS32FloatGrad"; + case NVPTXISD::TexUnified3DU32S32: + return "NVPTXISD::TexUnified3DU32S32"; + case NVPTXISD::TexUnified3DU32Float: + return "NVPTXISD::TexUnified3DU32Float"; + case NVPTXISD::TexUnified3DU32FloatLevel: + return "NVPTXISD::TexUnified3DU32FloatLevel"; + case NVPTXISD::TexUnified3DU32FloatGrad: + return "NVPTXISD::TexUnified3DU32FloatGrad"; + case NVPTXISD::TexUnifiedCubeFloatFloat: + return "NVPTXISD::TexUnifiedCubeFloatFloat"; + case NVPTXISD::TexUnifiedCubeFloatFloatLevel: + return "NVPTXISD::TexUnifiedCubeFloatFloatLevel"; + case NVPTXISD::TexUnifiedCubeS32Float: + return "NVPTXISD::TexUnifiedCubeS32Float"; + case NVPTXISD::TexUnifiedCubeS32FloatLevel: + return "NVPTXISD::TexUnifiedCubeS32FloatLevel"; + case NVPTXISD::TexUnifiedCubeU32Float: + return "NVPTXISD::TexUnifiedCubeU32Float"; + case NVPTXISD::TexUnifiedCubeU32FloatLevel: + return "NVPTXISD::TexUnifiedCubeU32FloatLevel"; + case NVPTXISD::TexUnifiedCubeArrayFloatFloat: + return "NVPTXISD::TexUnifiedCubeArrayFloatFloat"; + case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel: + return "NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel"; + case NVPTXISD::TexUnifiedCubeArrayS32Float: + return "NVPTXISD::TexUnifiedCubeArrayS32Float"; + case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel: + return "NVPTXISD::TexUnifiedCubeArrayS32FloatLevel"; + case NVPTXISD::TexUnifiedCubeArrayU32Float: + return "NVPTXISD::TexUnifiedCubeArrayU32Float"; + case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel: + return "NVPTXISD::TexUnifiedCubeArrayU32FloatLevel"; + case NVPTXISD::Tld4UnifiedR2DFloatFloat: + return "NVPTXISD::Tld4UnifiedR2DFloatFloat"; + case NVPTXISD::Tld4UnifiedG2DFloatFloat: + return "NVPTXISD::Tld4UnifiedG2DFloatFloat"; + case NVPTXISD::Tld4UnifiedB2DFloatFloat: + return "NVPTXISD::Tld4UnifiedB2DFloatFloat"; + case NVPTXISD::Tld4UnifiedA2DFloatFloat: + return "NVPTXISD::Tld4UnifiedA2DFloatFloat"; + case NVPTXISD::Tld4UnifiedR2DS64Float: + return 
"NVPTXISD::Tld4UnifiedR2DS64Float"; + case NVPTXISD::Tld4UnifiedG2DS64Float: + return "NVPTXISD::Tld4UnifiedG2DS64Float"; + case NVPTXISD::Tld4UnifiedB2DS64Float: + return "NVPTXISD::Tld4UnifiedB2DS64Float"; + case NVPTXISD::Tld4UnifiedA2DS64Float: + return "NVPTXISD::Tld4UnifiedA2DS64Float"; + case NVPTXISD::Tld4UnifiedR2DU64Float: + return "NVPTXISD::Tld4UnifiedR2DU64Float"; + case NVPTXISD::Tld4UnifiedG2DU64Float: + return "NVPTXISD::Tld4UnifiedG2DU64Float"; + case NVPTXISD::Tld4UnifiedB2DU64Float: + return "NVPTXISD::Tld4UnifiedB2DU64Float"; + case NVPTXISD::Tld4UnifiedA2DU64Float: + return "NVPTXISD::Tld4UnifiedA2DU64Float"; + + case NVPTXISD::Suld1DI8Clamp: return "NVPTXISD::Suld1DI8Clamp"; + case NVPTXISD::Suld1DI16Clamp: return "NVPTXISD::Suld1DI16Clamp"; + case NVPTXISD::Suld1DI32Clamp: return "NVPTXISD::Suld1DI32Clamp"; + case NVPTXISD::Suld1DI64Clamp: return "NVPTXISD::Suld1DI64Clamp"; + case NVPTXISD::Suld1DV2I8Clamp: return "NVPTXISD::Suld1DV2I8Clamp"; + case NVPTXISD::Suld1DV2I16Clamp: return "NVPTXISD::Suld1DV2I16Clamp"; + case NVPTXISD::Suld1DV2I32Clamp: return "NVPTXISD::Suld1DV2I32Clamp"; + case NVPTXISD::Suld1DV2I64Clamp: return "NVPTXISD::Suld1DV2I64Clamp"; + case NVPTXISD::Suld1DV4I8Clamp: return "NVPTXISD::Suld1DV4I8Clamp"; + case NVPTXISD::Suld1DV4I16Clamp: return "NVPTXISD::Suld1DV4I16Clamp"; + case NVPTXISD::Suld1DV4I32Clamp: return "NVPTXISD::Suld1DV4I32Clamp"; + + case NVPTXISD::Suld1DArrayI8Clamp: return "NVPTXISD::Suld1DArrayI8Clamp"; + case NVPTXISD::Suld1DArrayI16Clamp: return "NVPTXISD::Suld1DArrayI16Clamp"; + case NVPTXISD::Suld1DArrayI32Clamp: return "NVPTXISD::Suld1DArrayI32Clamp"; + case NVPTXISD::Suld1DArrayI64Clamp: return "NVPTXISD::Suld1DArrayI64Clamp"; + case NVPTXISD::Suld1DArrayV2I8Clamp: return "NVPTXISD::Suld1DArrayV2I8Clamp"; + case NVPTXISD::Suld1DArrayV2I16Clamp:return "NVPTXISD::Suld1DArrayV2I16Clamp"; + case NVPTXISD::Suld1DArrayV2I32Clamp:return "NVPTXISD::Suld1DArrayV2I32Clamp"; + case NVPTXISD::Suld1DArrayV2I64Clamp:return "NVPTXISD::Suld1DArrayV2I64Clamp"; + case NVPTXISD::Suld1DArrayV4I8Clamp: return "NVPTXISD::Suld1DArrayV4I8Clamp"; + case NVPTXISD::Suld1DArrayV4I16Clamp:return "NVPTXISD::Suld1DArrayV4I16Clamp"; + case NVPTXISD::Suld1DArrayV4I32Clamp:return "NVPTXISD::Suld1DArrayV4I32Clamp"; + + case NVPTXISD::Suld2DI8Clamp: return "NVPTXISD::Suld2DI8Clamp"; + case NVPTXISD::Suld2DI16Clamp: return "NVPTXISD::Suld2DI16Clamp"; + case NVPTXISD::Suld2DI32Clamp: return "NVPTXISD::Suld2DI32Clamp"; + case NVPTXISD::Suld2DI64Clamp: return "NVPTXISD::Suld2DI64Clamp"; + case NVPTXISD::Suld2DV2I8Clamp: return "NVPTXISD::Suld2DV2I8Clamp"; + case NVPTXISD::Suld2DV2I16Clamp: return "NVPTXISD::Suld2DV2I16Clamp"; + case NVPTXISD::Suld2DV2I32Clamp: return "NVPTXISD::Suld2DV2I32Clamp"; + case NVPTXISD::Suld2DV2I64Clamp: return "NVPTXISD::Suld2DV2I64Clamp"; + case NVPTXISD::Suld2DV4I8Clamp: return "NVPTXISD::Suld2DV4I8Clamp"; + case NVPTXISD::Suld2DV4I16Clamp: return "NVPTXISD::Suld2DV4I16Clamp"; + case NVPTXISD::Suld2DV4I32Clamp: return "NVPTXISD::Suld2DV4I32Clamp"; + + case NVPTXISD::Suld2DArrayI8Clamp: return "NVPTXISD::Suld2DArrayI8Clamp"; + case NVPTXISD::Suld2DArrayI16Clamp: return "NVPTXISD::Suld2DArrayI16Clamp"; + case NVPTXISD::Suld2DArrayI32Clamp: return "NVPTXISD::Suld2DArrayI32Clamp"; + case NVPTXISD::Suld2DArrayI64Clamp: return "NVPTXISD::Suld2DArrayI64Clamp"; + case NVPTXISD::Suld2DArrayV2I8Clamp: return "NVPTXISD::Suld2DArrayV2I8Clamp"; + case NVPTXISD::Suld2DArrayV2I16Clamp:return "NVPTXISD::Suld2DArrayV2I16Clamp"; + case 
NVPTXISD::Suld2DArrayV2I32Clamp:return "NVPTXISD::Suld2DArrayV2I32Clamp"; + case NVPTXISD::Suld2DArrayV2I64Clamp:return "NVPTXISD::Suld2DArrayV2I64Clamp"; + case NVPTXISD::Suld2DArrayV4I8Clamp: return "NVPTXISD::Suld2DArrayV4I8Clamp"; + case NVPTXISD::Suld2DArrayV4I16Clamp:return "NVPTXISD::Suld2DArrayV4I16Clamp"; + case NVPTXISD::Suld2DArrayV4I32Clamp:return "NVPTXISD::Suld2DArrayV4I32Clamp"; + + case NVPTXISD::Suld3DI8Clamp: return "NVPTXISD::Suld3DI8Clamp"; + case NVPTXISD::Suld3DI16Clamp: return "NVPTXISD::Suld3DI16Clamp"; + case NVPTXISD::Suld3DI32Clamp: return "NVPTXISD::Suld3DI32Clamp"; + case NVPTXISD::Suld3DI64Clamp: return "NVPTXISD::Suld3DI64Clamp"; + case NVPTXISD::Suld3DV2I8Clamp: return "NVPTXISD::Suld3DV2I8Clamp"; + case NVPTXISD::Suld3DV2I16Clamp: return "NVPTXISD::Suld3DV2I16Clamp"; + case NVPTXISD::Suld3DV2I32Clamp: return "NVPTXISD::Suld3DV2I32Clamp"; + case NVPTXISD::Suld3DV2I64Clamp: return "NVPTXISD::Suld3DV2I64Clamp"; + case NVPTXISD::Suld3DV4I8Clamp: return "NVPTXISD::Suld3DV4I8Clamp"; + case NVPTXISD::Suld3DV4I16Clamp: return "NVPTXISD::Suld3DV4I16Clamp"; + case NVPTXISD::Suld3DV4I32Clamp: return "NVPTXISD::Suld3DV4I32Clamp"; case NVPTXISD::Suld1DI8Trap: return "NVPTXISD::Suld1DI8Trap"; case NVPTXISD::Suld1DI16Trap: return "NVPTXISD::Suld1DI16Trap"; case NVPTXISD::Suld1DI32Trap: return "NVPTXISD::Suld1DI32Trap"; + case NVPTXISD::Suld1DI64Trap: return "NVPTXISD::Suld1DI64Trap"; case NVPTXISD::Suld1DV2I8Trap: return "NVPTXISD::Suld1DV2I8Trap"; case NVPTXISD::Suld1DV2I16Trap: return "NVPTXISD::Suld1DV2I16Trap"; case NVPTXISD::Suld1DV2I32Trap: return "NVPTXISD::Suld1DV2I32Trap"; + case NVPTXISD::Suld1DV2I64Trap: return "NVPTXISD::Suld1DV2I64Trap"; case NVPTXISD::Suld1DV4I8Trap: return "NVPTXISD::Suld1DV4I8Trap"; case NVPTXISD::Suld1DV4I16Trap: return "NVPTXISD::Suld1DV4I16Trap"; case NVPTXISD::Suld1DV4I32Trap: return "NVPTXISD::Suld1DV4I32Trap"; @@ -408,9 +749,11 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const { case NVPTXISD::Suld1DArrayI8Trap: return "NVPTXISD::Suld1DArrayI8Trap"; case NVPTXISD::Suld1DArrayI16Trap: return "NVPTXISD::Suld1DArrayI16Trap"; case NVPTXISD::Suld1DArrayI32Trap: return "NVPTXISD::Suld1DArrayI32Trap"; + case NVPTXISD::Suld1DArrayI64Trap: return "NVPTXISD::Suld1DArrayI64Trap"; case NVPTXISD::Suld1DArrayV2I8Trap: return "NVPTXISD::Suld1DArrayV2I8Trap"; case NVPTXISD::Suld1DArrayV2I16Trap: return "NVPTXISD::Suld1DArrayV2I16Trap"; case NVPTXISD::Suld1DArrayV2I32Trap: return "NVPTXISD::Suld1DArrayV2I32Trap"; + case NVPTXISD::Suld1DArrayV2I64Trap: return "NVPTXISD::Suld1DArrayV2I64Trap"; case NVPTXISD::Suld1DArrayV4I8Trap: return "NVPTXISD::Suld1DArrayV4I8Trap"; case NVPTXISD::Suld1DArrayV4I16Trap: return "NVPTXISD::Suld1DArrayV4I16Trap"; case NVPTXISD::Suld1DArrayV4I32Trap: return "NVPTXISD::Suld1DArrayV4I32Trap"; @@ -418,9 +761,11 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const { case NVPTXISD::Suld2DI8Trap: return "NVPTXISD::Suld2DI8Trap"; case NVPTXISD::Suld2DI16Trap: return "NVPTXISD::Suld2DI16Trap"; case NVPTXISD::Suld2DI32Trap: return "NVPTXISD::Suld2DI32Trap"; + case NVPTXISD::Suld2DI64Trap: return "NVPTXISD::Suld2DI64Trap"; case NVPTXISD::Suld2DV2I8Trap: return "NVPTXISD::Suld2DV2I8Trap"; case NVPTXISD::Suld2DV2I16Trap: return "NVPTXISD::Suld2DV2I16Trap"; case NVPTXISD::Suld2DV2I32Trap: return "NVPTXISD::Suld2DV2I32Trap"; + case NVPTXISD::Suld2DV2I64Trap: return "NVPTXISD::Suld2DV2I64Trap"; case NVPTXISD::Suld2DV4I8Trap: return "NVPTXISD::Suld2DV4I8Trap"; case 
NVPTXISD::Suld2DV4I16Trap: return "NVPTXISD::Suld2DV4I16Trap";
   case NVPTXISD::Suld2DV4I32Trap: return "NVPTXISD::Suld2DV4I32Trap";
@@ -428,9 +773,11 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case NVPTXISD::Suld2DArrayI8Trap: return "NVPTXISD::Suld2DArrayI8Trap";
   case NVPTXISD::Suld2DArrayI16Trap: return "NVPTXISD::Suld2DArrayI16Trap";
   case NVPTXISD::Suld2DArrayI32Trap: return "NVPTXISD::Suld2DArrayI32Trap";
+  case NVPTXISD::Suld2DArrayI64Trap: return "NVPTXISD::Suld2DArrayI64Trap";
   case NVPTXISD::Suld2DArrayV2I8Trap: return "NVPTXISD::Suld2DArrayV2I8Trap";
   case NVPTXISD::Suld2DArrayV2I16Trap: return "NVPTXISD::Suld2DArrayV2I16Trap";
   case NVPTXISD::Suld2DArrayV2I32Trap: return "NVPTXISD::Suld2DArrayV2I32Trap";
+  case NVPTXISD::Suld2DArrayV2I64Trap: return "NVPTXISD::Suld2DArrayV2I64Trap";
   case NVPTXISD::Suld2DArrayV4I8Trap: return "NVPTXISD::Suld2DArrayV4I8Trap";
   case NVPTXISD::Suld2DArrayV4I16Trap: return "NVPTXISD::Suld2DArrayV4I16Trap";
   case NVPTXISD::Suld2DArrayV4I32Trap: return "NVPTXISD::Suld2DArrayV4I32Trap";
@@ -438,17 +785,83 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case NVPTXISD::Suld3DI8Trap: return "NVPTXISD::Suld3DI8Trap";
   case NVPTXISD::Suld3DI16Trap: return "NVPTXISD::Suld3DI16Trap";
   case NVPTXISD::Suld3DI32Trap: return "NVPTXISD::Suld3DI32Trap";
+  case NVPTXISD::Suld3DI64Trap: return "NVPTXISD::Suld3DI64Trap";
   case NVPTXISD::Suld3DV2I8Trap: return "NVPTXISD::Suld3DV2I8Trap";
   case NVPTXISD::Suld3DV2I16Trap: return "NVPTXISD::Suld3DV2I16Trap";
   case NVPTXISD::Suld3DV2I32Trap: return "NVPTXISD::Suld3DV2I32Trap";
+  case NVPTXISD::Suld3DV2I64Trap: return "NVPTXISD::Suld3DV2I64Trap";
   case NVPTXISD::Suld3DV4I8Trap: return "NVPTXISD::Suld3DV4I8Trap";
   case NVPTXISD::Suld3DV4I16Trap: return "NVPTXISD::Suld3DV4I16Trap";
   case NVPTXISD::Suld3DV4I32Trap: return "NVPTXISD::Suld3DV4I32Trap";
+
+  case NVPTXISD::Suld1DI8Zero: return "NVPTXISD::Suld1DI8Zero";
+  case NVPTXISD::Suld1DI16Zero: return "NVPTXISD::Suld1DI16Zero";
+  case NVPTXISD::Suld1DI32Zero: return "NVPTXISD::Suld1DI32Zero";
+  case NVPTXISD::Suld1DI64Zero: return "NVPTXISD::Suld1DI64Zero";
+  case NVPTXISD::Suld1DV2I8Zero: return "NVPTXISD::Suld1DV2I8Zero";
+  case NVPTXISD::Suld1DV2I16Zero: return "NVPTXISD::Suld1DV2I16Zero";
+  case NVPTXISD::Suld1DV2I32Zero: return "NVPTXISD::Suld1DV2I32Zero";
+  case NVPTXISD::Suld1DV2I64Zero: return "NVPTXISD::Suld1DV2I64Zero";
+  case NVPTXISD::Suld1DV4I8Zero: return "NVPTXISD::Suld1DV4I8Zero";
+  case NVPTXISD::Suld1DV4I16Zero: return "NVPTXISD::Suld1DV4I16Zero";
+  case NVPTXISD::Suld1DV4I32Zero: return "NVPTXISD::Suld1DV4I32Zero";
+
+  case NVPTXISD::Suld1DArrayI8Zero: return "NVPTXISD::Suld1DArrayI8Zero";
+  case NVPTXISD::Suld1DArrayI16Zero: return "NVPTXISD::Suld1DArrayI16Zero";
+  case NVPTXISD::Suld1DArrayI32Zero: return "NVPTXISD::Suld1DArrayI32Zero";
+  case NVPTXISD::Suld1DArrayI64Zero: return "NVPTXISD::Suld1DArrayI64Zero";
+  case NVPTXISD::Suld1DArrayV2I8Zero: return "NVPTXISD::Suld1DArrayV2I8Zero";
+  case NVPTXISD::Suld1DArrayV2I16Zero: return "NVPTXISD::Suld1DArrayV2I16Zero";
+  case NVPTXISD::Suld1DArrayV2I32Zero: return "NVPTXISD::Suld1DArrayV2I32Zero";
+  case NVPTXISD::Suld1DArrayV2I64Zero: return "NVPTXISD::Suld1DArrayV2I64Zero";
+  case NVPTXISD::Suld1DArrayV4I8Zero: return "NVPTXISD::Suld1DArrayV4I8Zero";
+  case NVPTXISD::Suld1DArrayV4I16Zero: return "NVPTXISD::Suld1DArrayV4I16Zero";
+  case NVPTXISD::Suld1DArrayV4I32Zero: return "NVPTXISD::Suld1DArrayV4I32Zero";
+
+  case NVPTXISD::Suld2DI8Zero: return "NVPTXISD::Suld2DI8Zero";
+  case NVPTXISD::Suld2DI16Zero: return "NVPTXISD::Suld2DI16Zero";
+  case NVPTXISD::Suld2DI32Zero: return "NVPTXISD::Suld2DI32Zero";
+  case NVPTXISD::Suld2DI64Zero: return "NVPTXISD::Suld2DI64Zero";
+  case NVPTXISD::Suld2DV2I8Zero: return "NVPTXISD::Suld2DV2I8Zero";
+  case NVPTXISD::Suld2DV2I16Zero: return "NVPTXISD::Suld2DV2I16Zero";
+  case NVPTXISD::Suld2DV2I32Zero: return "NVPTXISD::Suld2DV2I32Zero";
+  case NVPTXISD::Suld2DV2I64Zero: return "NVPTXISD::Suld2DV2I64Zero";
+  case NVPTXISD::Suld2DV4I8Zero: return "NVPTXISD::Suld2DV4I8Zero";
+  case NVPTXISD::Suld2DV4I16Zero: return "NVPTXISD::Suld2DV4I16Zero";
+  case NVPTXISD::Suld2DV4I32Zero: return "NVPTXISD::Suld2DV4I32Zero";
+
+  case NVPTXISD::Suld2DArrayI8Zero: return "NVPTXISD::Suld2DArrayI8Zero";
+  case NVPTXISD::Suld2DArrayI16Zero: return "NVPTXISD::Suld2DArrayI16Zero";
+  case NVPTXISD::Suld2DArrayI32Zero: return "NVPTXISD::Suld2DArrayI32Zero";
+  case NVPTXISD::Suld2DArrayI64Zero: return "NVPTXISD::Suld2DArrayI64Zero";
+  case NVPTXISD::Suld2DArrayV2I8Zero: return "NVPTXISD::Suld2DArrayV2I8Zero";
+  case NVPTXISD::Suld2DArrayV2I16Zero: return "NVPTXISD::Suld2DArrayV2I16Zero";
+  case NVPTXISD::Suld2DArrayV2I32Zero: return "NVPTXISD::Suld2DArrayV2I32Zero";
+  case NVPTXISD::Suld2DArrayV2I64Zero: return "NVPTXISD::Suld2DArrayV2I64Zero";
+  case NVPTXISD::Suld2DArrayV4I8Zero: return "NVPTXISD::Suld2DArrayV4I8Zero";
+  case NVPTXISD::Suld2DArrayV4I16Zero: return "NVPTXISD::Suld2DArrayV4I16Zero";
+  case NVPTXISD::Suld2DArrayV4I32Zero: return "NVPTXISD::Suld2DArrayV4I32Zero";
+
+  case NVPTXISD::Suld3DI8Zero: return "NVPTXISD::Suld3DI8Zero";
+  case NVPTXISD::Suld3DI16Zero: return "NVPTXISD::Suld3DI16Zero";
+  case NVPTXISD::Suld3DI32Zero: return "NVPTXISD::Suld3DI32Zero";
+  case NVPTXISD::Suld3DI64Zero: return "NVPTXISD::Suld3DI64Zero";
+  case NVPTXISD::Suld3DV2I8Zero: return "NVPTXISD::Suld3DV2I8Zero";
+  case NVPTXISD::Suld3DV2I16Zero: return "NVPTXISD::Suld3DV2I16Zero";
+  case NVPTXISD::Suld3DV2I32Zero: return "NVPTXISD::Suld3DV2I32Zero";
+  case NVPTXISD::Suld3DV2I64Zero: return "NVPTXISD::Suld3DV2I64Zero";
+  case NVPTXISD::Suld3DV4I8Zero: return "NVPTXISD::Suld3DV4I8Zero";
+  case NVPTXISD::Suld3DV4I16Zero: return "NVPTXISD::Suld3DV4I16Zero";
+  case NVPTXISD::Suld3DV4I32Zero: return "NVPTXISD::Suld3DV4I32Zero";
   }
 }
-bool NVPTXTargetLowering::shouldSplitVectorType(EVT VT) const {
-  return VT.getScalarType() == MVT::i1;
+TargetLoweringBase::LegalizeTypeAction
+NVPTXTargetLowering::getPreferredVectorAction(EVT VT) const {
+  if (VT.getVectorNumElements() != 1 && VT.getScalarType() == MVT::i1)
+    return TypeSplitVector;
+
+  return TargetLoweringBase::getPreferredVectorAction(VT);
 }
 SDValue
@@ -493,26 +906,12 @@ NVPTXTargetLowering::getPrototype(Type *retTy, const ArgListTy &Args,
   } else if (isa<PointerType>(retTy)) {
     O << ".param .b" << getPointerTy().getSizeInBits() << " _";
   } else {
-    if ((retTy->getTypeID() == Type::StructTyID) || isa<VectorType>(retTy)) {
-      SmallVector<EVT, 16> vtparts;
-      ComputeValueVTs(*this, retTy, vtparts);
-      unsigned totalsz = 0;
-      for (unsigned i = 0, e = vtparts.size(); i != e; ++i) {
-        unsigned elems = 1;
-        EVT elemtype = vtparts[i];
-        if (vtparts[i].isVector()) {
-          elems = vtparts[i].getVectorNumElements();
-          elemtype = vtparts[i].getVectorElementType();
-        }
-        // TODO: no need to loop
-        for (unsigned j = 0, je = elems; j != je; ++j) {
-          unsigned sz = elemtype.getSizeInBits();
-          if (elemtype.isInteger() && (sz < 8))
-            sz = 8;
-          totalsz += sz / 8;
-        }
-      }
-      O << ".param .align " << retAlignment << " .b8 _[" << totalsz << "]";
+    if((retTy->getTypeID() == Type::StructTyID) ||
+       isa<VectorType>(retTy)) {
+      O << ".param .align "
+        << retAlignment
+        << " .b8 _["
+        << getDataLayout()->getTypeAllocSize(retTy) << "]";
     } else {
       assert(false && "Unknown return type");
     }
@@ -681,7 +1080,8 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
     if (Ty->isAggregateType()) {
       // aggregate
       SmallVector<EVT, 16> vtparts;
-      ComputeValueVTs(*this, Ty, vtparts);
+      SmallVector<uint64_t, 16> Offsets;
+      ComputePTXValueVTs(*this, Ty, vtparts, &Offsets, 0);
       unsigned align = getArgumentAlignment(Callee, CS, Ty, paramCount + 1);
       // declare .param .align <align> .b8 .param<n>[<size>];
@@ -693,34 +1093,26 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
       Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
                           DeclareParamOps);
       InFlag = Chain.getValue(1);
-      unsigned curOffset = 0;
       for (unsigned j = 0, je = vtparts.size(); j != je; ++j) {
-        unsigned elems = 1;
         EVT elemtype = vtparts[j];
-        if (vtparts[j].isVector()) {
-          elems = vtparts[j].getVectorNumElements();
-          elemtype = vtparts[j].getVectorElementType();
-        }
-        for (unsigned k = 0, ke = elems; k != ke; ++k) {
-          unsigned sz = elemtype.getSizeInBits();
-          if (elemtype.isInteger() && (sz < 8))
-            sz = 8;
-          SDValue StVal = OutVals[OIdx];
-          if (elemtype.getSizeInBits() < 16) {
-            StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal);
-          }
-          SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
-          SDValue CopyParamOps[] = { Chain,
-                                     DAG.getConstant(paramCount, MVT::i32),
-                                     DAG.getConstant(curOffset, MVT::i32),
-                                     StVal, InFlag };
-          Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl,
-                                          CopyParamVTs, CopyParamOps,
-                                          elemtype, MachinePointerInfo());
-          InFlag = Chain.getValue(1);
-          curOffset += sz / 8;
-          ++OIdx;
+        unsigned ArgAlign = GreatestCommonDivisor64(align, Offsets[j]);
+        if (elemtype.isInteger() && (sz < 8))
+          sz = 8;
+        SDValue StVal = OutVals[OIdx];
+        if (elemtype.getSizeInBits() < 16) {
+          StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal);
         }
+        SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+        SDValue CopyParamOps[] = { Chain,
+                                   DAG.getConstant(paramCount, MVT::i32),
+                                   DAG.getConstant(Offsets[j], MVT::i32),
+                                   StVal, InFlag };
+        Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl,
+                                        CopyParamVTs, CopyParamOps,
+                                        elemtype, MachinePointerInfo(),
+                                        ArgAlign);
+        InFlag = Chain.getValue(1);
+        ++OIdx;
       }
       if (vtparts.size() > 0)
         --OIdx;
@@ -905,13 +1297,15 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
     }
     // struct or vector
     SmallVector<EVT, 16> vtparts;
+    SmallVector<uint64_t, 16> Offsets;
     const PointerType *PTy = dyn_cast<PointerType>(Args[i].Ty);
     assert(PTy && "Type of a byval parameter should be pointer");
-    ComputeValueVTs(*this, PTy->getElementType(), vtparts);
+    ComputePTXValueVTs(*this, PTy->getElementType(), vtparts, &Offsets, 0);
     // declare .param .align <align> .b8 .param<n>[<size>];
     unsigned sz = Outs[OIdx].Flags.getByValSize();
     SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+    unsigned ArgAlign = Outs[OIdx].Flags.getByValAlign();
     // The ByValAlign in the Outs[OIdx].Flags is always set at this point,
     // so we don't need to worry about natural alignment or not.
     // See TargetLowering::LowerCallTo().
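The StoreParam loop above no longer tracks a running byte offset; it takes each element's offset from ComputePTXValueVTs and derives a conservative per-element alignment as gcd(parameter alignment, element offset). A small standalone illustration of that rule — partAlign is invented for this sketch, with std::gcd standing in for LLVM's GreatestCommonDivisor64:

#include <cstdint>
#include <cstdio>
#include <numeric>

// If the whole .param is aligned to paramAlign and an element lives at byte
// `offset` within it, gcd(paramAlign, offset) is a safe (if conservative)
// alignment for that element; gcd(a, 0) == a, so the element at offset 0
// keeps the full parameter alignment.
static uint64_t partAlign(uint64_t paramAlign, uint64_t offset) {
  return std::gcd(paramAlign, offset);
}

int main() {
  // For a .param aligned to 8 bytes: offset 0 -> 8, offset 4 -> 4, offset 6 -> 2.
  printf("%llu %llu %llu\n",
         (unsigned long long)partAlign(8, 0),
         (unsigned long long)partAlign(8, 4),
         (unsigned long long)partAlign(8, 6));
  return 0;
}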
@@ -923,38 +1317,28 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs, DeclareParamOps); InFlag = Chain.getValue(1); - unsigned curOffset = 0; for (unsigned j = 0, je = vtparts.size(); j != je; ++j) { - unsigned elems = 1; EVT elemtype = vtparts[j]; - if (vtparts[j].isVector()) { - elems = vtparts[j].getVectorNumElements(); - elemtype = vtparts[j].getVectorElementType(); + int curOffset = Offsets[j]; + unsigned PartAlign = GreatestCommonDivisor64(ArgAlign, curOffset); + SDValue srcAddr = + DAG.getNode(ISD::ADD, dl, getPointerTy(), OutVals[OIdx], + DAG.getConstant(curOffset, getPointerTy())); + SDValue theVal = DAG.getLoad(elemtype, dl, tempChain, srcAddr, + MachinePointerInfo(), false, false, false, + PartAlign); + if (elemtype.getSizeInBits() < 16) { + theVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, theVal); } - for (unsigned k = 0, ke = elems; k != ke; ++k) { - unsigned sz = elemtype.getSizeInBits(); - if (elemtype.isInteger() && (sz < 8)) - sz = 8; - SDValue srcAddr = - DAG.getNode(ISD::ADD, dl, getPointerTy(), OutVals[OIdx], - DAG.getConstant(curOffset, getPointerTy())); - SDValue theVal = DAG.getLoad(elemtype, dl, tempChain, srcAddr, - MachinePointerInfo(), false, false, false, - 0); - if (elemtype.getSizeInBits() < 16) { - theVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, theVal); - } - SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); - SDValue CopyParamOps[] = { Chain, DAG.getConstant(paramCount, MVT::i32), - DAG.getConstant(curOffset, MVT::i32), theVal, - InFlag }; - Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl, CopyParamVTs, - CopyParamOps, elemtype, - MachinePointerInfo()); + SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue CopyParamOps[] = { Chain, DAG.getConstant(paramCount, MVT::i32), + DAG.getConstant(curOffset, MVT::i32), theVal, + InFlag }; + Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl, CopyParamVTs, + CopyParamOps, elemtype, + MachinePointerInfo()); - InFlag = Chain.getValue(1); - curOffset += sz / 8; - } + InFlag = Chain.getValue(1); } ++paramCount; } @@ -1063,7 +1447,6 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Generate loads from param memory/moves from registers for result if (Ins.size() > 0) { - unsigned resoffset = 0; if (retTy && retTy->isVectorTy()) { EVT ObjectVT = getValueType(retTy); unsigned NumElts = ObjectVT.getVectorNumElements(); @@ -1072,14 +1455,15 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, ObjectVT) == NumElts && "Vector was not scalarized"); unsigned sz = EltVT.getSizeInBits(); - bool needTruncate = sz < 16 ? true : false; + bool needTruncate = sz < 8 ? 
true : false;
       if (NumElts == 1) {
         // Just a simple load
         SmallVector<EVT, 4> LoadRetVTs;
-        if (needTruncate) {
-          // If loading i1 result, generate
-          //   load i16
+        if (EltVT == MVT::i1 || EltVT == MVT::i8) {
+          // If loading i1/i8 result, generate
+          //   load.b8 i16
+          //   if i1
           //   trunc i16 to i1
           LoadRetVTs.push_back(MVT::i16);
         } else
@@ -1103,9 +1487,10 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
       } else if (NumElts == 2) {
         // LoadV2
         SmallVector<EVT, 4> LoadRetVTs;
-        if (needTruncate) {
-          // If loading i1 result, generate
-          //   load i16
+        if (EltVT == MVT::i1 || EltVT == MVT::i8) {
+          // If loading i1/i8 result, generate
+          //   load.b8 i16
+          //   if i1
           //   trunc i16 to i1
           LoadRetVTs.push_back(MVT::i16);
           LoadRetVTs.push_back(MVT::i16);
@@ -1148,9 +1533,10 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
         EVT VecVT = EVT::getVectorVT(F->getContext(), EltVT, VecSize);
         for (unsigned i = 0; i < NumElts; i += VecSize) {
           SmallVector<EVT, 8> LoadRetVTs;
-          if (needTruncate) {
-            // If loading i1 result, generate
-            //   load i16
+          if (EltVT == MVT::i1 || EltVT == MVT::i8) {
+            // If loading i1/i8 result, generate
+            //   load.b8 i16
+            //   if i1
             //   trunc i16 to i1
             for (unsigned j = 0; j < VecSize; ++j)
               LoadRetVTs.push_back(MVT::i16);
@@ -1189,10 +1575,13 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
       }
     } else {
       SmallVector<EVT, 16> VTs;
-      ComputePTXValueVTs(*this, retTy, VTs);
+      SmallVector<uint64_t, 16> Offsets;
+      ComputePTXValueVTs(*this, retTy, VTs, &Offsets, 0);
       assert(VTs.size() == Ins.size() && "Bad value decomposition");
+      unsigned RetAlign = getArgumentAlignment(Callee, CS, retTy, 0);
       for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
         unsigned sz = VTs[i].getSizeInBits();
+        unsigned AlignI = GreatestCommonDivisor64(RetAlign, Offsets[i]);
         bool needTruncate = sz < 8 ? true : false;
         if (VTs[i].isInteger() && (sz < 8))
           sz = 8;
@@ -1218,19 +1607,18 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
         SmallVector<SDValue, 16> LoadRetOps;
         LoadRetOps.push_back(Chain);
         LoadRetOps.push_back(DAG.getConstant(1, MVT::i32));
-        LoadRetOps.push_back(DAG.getConstant(resoffset, MVT::i32));
+        LoadRetOps.push_back(DAG.getConstant(Offsets[i], MVT::i32));
         LoadRetOps.push_back(InFlag);
         SDValue retval = DAG.getMemIntrinsicNode(
             NVPTXISD::LoadParam, dl,
             DAG.getVTList(LoadRetVTs), LoadRetOps,
-            TheLoadType, MachinePointerInfo());
+            TheLoadType, MachinePointerInfo(), AlignI);
         Chain = retval.getValue(1);
         InFlag = retval.getValue(2);
         SDValue Ret0 = retval.getValue(0);
         if (needTruncate)
           Ret0 = DAG.getNode(ISD::TRUNCATE, dl, Ins[i].VT, Ret0);
         InVals.push_back(Ret0);
-        resoffset += sz / 8;
       }
     }
   }
@@ -1268,6 +1656,127 @@ NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
   return DAG.getNode(ISD::BUILD_VECTOR, dl, Node->getValueType(0), Ops);
 }
+/// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which
+/// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift
+/// amount, or
+/// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift
+/// amount.
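Before the implementation that follows, a standalone C++ model of the generic double-word shift-right expansion it emits; srlParts and its guards are invented for this sketch. The DAG version computes both arms and selects between them, leaning on PTX's clamping shift semantics, whereas plain C++ must branch because shifting a 32-bit value by 32 or more is undefined behavior:

#include <cassert>
#include <cstdint>

struct Parts { uint32_t lo, hi; };

// Logical shift right of {hi, lo} by amt, using only 32-bit shifts.
static Parts srlParts(uint32_t lo, uint32_t hi, unsigned amt) {
  assert(amt < 64 && "shift amount out of range");
  if (amt == 0)
    return { lo, hi };
  if (amt >= 32)                      // dLo = aHi >> (Amt-size), dHi all zeros
    return { hi >> (amt - 32), 0 };
  return { (lo >> amt) | (hi << (32 - amt)),   // dLo
           hi >> amt };                        // dHi
}

int main() {
  const uint64_t x = 0x123456789ABCDEF0ULL;
  for (unsigned amt = 0; amt < 64; ++amt) {
    Parts p = srlParts((uint32_t)x, (uint32_t)(x >> 32), amt);
    assert((((uint64_t)p.hi << 32) | p.lo) == (x >> amt));
  }
  return 0;
}

An SRA_PARTS variant would use an arithmetic shift for the high half, so for amt >= 32 the high result becomes all sign bits rather than zeros, as the comments in the function below note.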
+SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op, + SelectionDAG &DAG) const { + assert(Op.getNumOperands() == 3 && "Not a double-shift!"); + assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS); + + EVT VT = Op.getValueType(); + unsigned VTBits = VT.getSizeInBits(); + SDLoc dl(Op); + SDValue ShOpLo = Op.getOperand(0); + SDValue ShOpHi = Op.getOperand(1); + SDValue ShAmt = Op.getOperand(2); + unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL; + + if (VTBits == 32 && nvptxSubtarget.getSmVersion() >= 35) { + + // For 32bit and sm35, we can use the funnel shift 'shf' instruction. + // {dHi, dLo} = {aHi, aLo} >> Amt + // dHi = aHi >> Amt + // dLo = shf.r.clamp aLo, aHi, Amt + + SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); + SDValue Lo = DAG.getNode(NVPTXISD::FUN_SHFR_CLAMP, dl, VT, ShOpLo, ShOpHi, + ShAmt); + + SDValue Ops[2] = { Lo, Hi }; + return DAG.getMergeValues(Ops, dl); + } + else { + + // {dHi, dLo} = {aHi, aLo} >> Amt + // - if (Amt>=size) then + // dLo = aHi >> (Amt-size) + // dHi = aHi >> Amt (this is either all 0 or all 1) + // else + // dLo = (aLo >>logic Amt) | (aHi << (size-Amt)) + // dHi = aHi >> Amt + + SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, + DAG.getConstant(VTBits, MVT::i32), ShAmt); + SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt); + SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt, + DAG.getConstant(VTBits, MVT::i32)); + SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt); + SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); + SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt); + + SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt, + DAG.getConstant(VTBits, MVT::i32), ISD::SETGE); + SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); + SDValue Lo = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal); + + SDValue Ops[2] = { Lo, Hi }; + return DAG.getMergeValues(Ops, dl); + } +} + +/// LowerShiftLeftParts - Lower SHL_PARTS, which +/// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift +/// amount, or +/// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift +/// amount. +SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op, + SelectionDAG &DAG) const { + assert(Op.getNumOperands() == 3 && "Not a double-shift!"); + assert(Op.getOpcode() == ISD::SHL_PARTS); + + EVT VT = Op.getValueType(); + unsigned VTBits = VT.getSizeInBits(); + SDLoc dl(Op); + SDValue ShOpLo = Op.getOperand(0); + SDValue ShOpHi = Op.getOperand(1); + SDValue ShAmt = Op.getOperand(2); + + if (VTBits == 32 && nvptxSubtarget.getSmVersion() >= 35) { + + // For 32bit and sm35, we can use the funnel shift 'shf' instruction. 
+    // {dHi, dLo} = {aHi, aLo} << Amt
+    // dHi = shf.l.clamp aLo, aHi, Amt
+    // dLo = aLo << Amt
+
+    SDValue Hi = DAG.getNode(NVPTXISD::FUN_SHFL_CLAMP, dl, VT, ShOpLo, ShOpHi,
+                             ShAmt);
+    SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
+
+    SDValue Ops[2] = { Lo, Hi };
+    return DAG.getMergeValues(Ops, dl);
+  }
+  else {
+
+    // {dHi, dLo} = {aHi, aLo} << Amt
+    // - if (Amt>=size) then
+    //      dLo = aLo << Amt (all 0)
+    //      dHi = aLo << (Amt-size)
+    //   else
+    //      dLo = aLo << Amt
+    //      dHi = (aHi << Amt) | (aLo >> (size-Amt))
+
+    SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
+                                   DAG.getConstant(VTBits, MVT::i32), ShAmt);
+    SDValue Tmp1 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
+    SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
+                                     DAG.getConstant(VTBits, MVT::i32));
+    SDValue Tmp2 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
+    SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
+    SDValue TrueVal = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
+
+    SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
+                               DAG.getConstant(VTBits, MVT::i32), ISD::SETGE);
+    SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
+    SDValue Hi = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
+
+    SDValue Ops[2] = { Lo, Hi };
+    return DAG.getMergeValues(Ops, dl);
+  }
+}
+
 SDValue
 NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   switch (Op.getOpcode()) {
@@ -1288,6 +1797,11 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
     return LowerSTORE(Op, DAG);
   case ISD::LOAD:
     return LowerLOAD(Op, DAG);
+  case ISD::SHL_PARTS:
+    return LowerShiftLeftParts(Op, DAG);
+  case ISD::SRA_PARTS:
+  case ISD::SRL_PARTS:
+    return LowerShiftRightParts(Op, DAG);
   default:
     llvm_unreachable("Custom lowering not defined for operation");
   }
@@ -1363,6 +1877,21 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
     break;
   }
+  MemSDNode *MemSD = cast<MemSDNode>(N);
+  const DataLayout *TD = getDataLayout();
+
+  unsigned Align = MemSD->getAlignment();
+  unsigned PrefAlign =
+      TD->getPrefTypeAlignment(ValVT.getTypeForEVT(*DAG.getContext()));
+  if (Align < PrefAlign) {
+    // This store is not sufficiently aligned, so bail out and let this vector
+    // store be scalarized.  Note that we may still be able to emit smaller
+    // vector stores.  For example, if we are storing a <4 x float> with an
+    // alignment of 8, this check will fail but the legalizer will try again
+    // with 2 x <2 x float>, which will succeed with an alignment of 8.
+    return SDValue();
+  }
+
   unsigned Opcode = 0;
   EVT EltVT = ValVT.getVectorElementType();
   unsigned NumElts = ValVT.getVectorNumElements();
@@ -1405,8 +1934,6 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
       Ops.push_back(N->getOperand(i));
     }
-    MemSDNode *MemSD = cast<MemSDNode>(N);
-
     SDValue NewSt = DAG.getMemIntrinsicNode(
         Opcode, DL, DAG.getVTList(MVT::Other), Ops,
         MemSD->getMemoryVT(), MemSD->getMemOperand());
@@ -1501,7 +2028,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
   const Function *F = MF.getFunction();
   const AttributeSet &PAL = F->getAttributes();
-  const TargetLowering *TLI = nvTM->getTargetLowering();
+  const TargetLowering *TLI = DAG.getTarget().getTargetLowering();
   SDValue Root = DAG.getRoot();
   std::vector<SDValue> OutChains;
@@ -1555,8 +2082,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
         assert(vtparts.size() > 0 && "empty aggregate type not expected");
         for (unsigned parti = 0, parte = vtparts.size(); parti != parte;
              ++parti) {
-          EVT partVT = vtparts[parti];
-          InVals.push_back(DAG.getNode(ISD::UNDEF, dl, partVT));
+          InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
           ++InsIdx;
         }
         if (vtparts.size() > 0)
@@ -1872,7 +2398,7 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
       unsigned Offset = 0;
       EVT VecVT =
-          EVT::getVectorVT(F->getContext(), OutVals[0].getValueType(), VecSize);
+          EVT::getVectorVT(F->getContext(), EltVT, VecSize);
       unsigned PerStoreOffset =
           TD->getTypeAllocSize(VecVT.getTypeForEVT(F->getContext()));
@@ -1931,12 +2457,10 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
     }
   } else {
     SmallVector<EVT, 16> ValVTs;
-    // const_cast is necessary since we are still using an LLVM version from
-    // before the type system re-write.
-    ComputePTXValueVTs(*this, RetTy, ValVTs);
+    SmallVector<uint64_t, 16> Offsets;
+    ComputePTXValueVTs(*this, RetTy, ValVTs, &Offsets, 0);
     assert(ValVTs.size() == OutVals.size() && "Bad return value decomposition");
-    unsigned SizeSoFar = 0;
     for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
       SDValue theVal = OutVals[i];
       EVT TheValType = theVal.getValueType();
@@ -1960,16 +2484,14 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
         else if (TmpVal.getValueType().getSizeInBits() < 16)
           TmpVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, TmpVal);
-        SDValue Ops[] = { Chain, DAG.getConstant(SizeSoFar, MVT::i32), TmpVal };
+        SDValue Ops[] = {
+          Chain,
+          DAG.getConstant(Offsets[i], MVT::i32),
+          TmpVal };
         Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetval, dl,
                                         DAG.getVTList(MVT::Other), Ops,
                                         TheStoreType, MachinePointerInfo());
-        if(TheValType.isVector())
-          SizeSoFar +=
-              TheStoreType.getVectorElementType().getStoreSizeInBits() / 8;
-        else
-          SizeSoFar += TheStoreType.getStoreSizeInBits()/8;
       }
     }
   }
@@ -2006,90 +2528,357 @@ static unsigned getOpcForTextureInstr(unsigned Intrinsic) {
   switch (Intrinsic) {
   default:
     return 0;
-  case Intrinsic::nvvm_tex_1d_v4f32_i32:
-    return NVPTXISD::Tex1DFloatI32;
+  case Intrinsic::nvvm_tex_1d_v4f32_s32:
+    return NVPTXISD::Tex1DFloatS32;
   case Intrinsic::nvvm_tex_1d_v4f32_f32:
     return NVPTXISD::Tex1DFloatFloat;
   case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
     return NVPTXISD::Tex1DFloatFloatLevel;
   case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
     return NVPTXISD::Tex1DFloatFloatGrad;
-  case Intrinsic::nvvm_tex_1d_v4i32_i32:
-    return NVPTXISD::Tex1DI32I32;
-  case Intrinsic::nvvm_tex_1d_v4i32_f32:
-    return NVPTXISD::Tex1DI32Float;
-  case Intrinsic::nvvm_tex_1d_level_v4i32_f32:
-    return NVPTXISD::Tex1DI32FloatLevel;
-  case Intrinsic::nvvm_tex_1d_grad_v4i32_f32:
-
return NVPTXISD::Tex1DI32FloatGrad; - - case Intrinsic::nvvm_tex_1d_array_v4f32_i32: - return NVPTXISD::Tex1DArrayFloatI32; + case Intrinsic::nvvm_tex_1d_v4s32_s32: + return NVPTXISD::Tex1DS32S32; + case Intrinsic::nvvm_tex_1d_v4s32_f32: + return NVPTXISD::Tex1DS32Float; + case Intrinsic::nvvm_tex_1d_level_v4s32_f32: + return NVPTXISD::Tex1DS32FloatLevel; + case Intrinsic::nvvm_tex_1d_grad_v4s32_f32: + return NVPTXISD::Tex1DS32FloatGrad; + case Intrinsic::nvvm_tex_1d_v4u32_s32: + return NVPTXISD::Tex1DU32S32; + case Intrinsic::nvvm_tex_1d_v4u32_f32: + return NVPTXISD::Tex1DU32Float; + case Intrinsic::nvvm_tex_1d_level_v4u32_f32: + return NVPTXISD::Tex1DU32FloatLevel; + case Intrinsic::nvvm_tex_1d_grad_v4u32_f32: + return NVPTXISD::Tex1DU32FloatGrad; + + case Intrinsic::nvvm_tex_1d_array_v4f32_s32: + return NVPTXISD::Tex1DArrayFloatS32; case Intrinsic::nvvm_tex_1d_array_v4f32_f32: return NVPTXISD::Tex1DArrayFloatFloat; case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32: return NVPTXISD::Tex1DArrayFloatFloatLevel; case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32: return NVPTXISD::Tex1DArrayFloatFloatGrad; - case Intrinsic::nvvm_tex_1d_array_v4i32_i32: - return NVPTXISD::Tex1DArrayI32I32; - case Intrinsic::nvvm_tex_1d_array_v4i32_f32: - return NVPTXISD::Tex1DArrayI32Float; - case Intrinsic::nvvm_tex_1d_array_level_v4i32_f32: - return NVPTXISD::Tex1DArrayI32FloatLevel; - case Intrinsic::nvvm_tex_1d_array_grad_v4i32_f32: - return NVPTXISD::Tex1DArrayI32FloatGrad; - - case Intrinsic::nvvm_tex_2d_v4f32_i32: - return NVPTXISD::Tex2DFloatI32; + case Intrinsic::nvvm_tex_1d_array_v4s32_s32: + return NVPTXISD::Tex1DArrayS32S32; + case Intrinsic::nvvm_tex_1d_array_v4s32_f32: + return NVPTXISD::Tex1DArrayS32Float; + case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32: + return NVPTXISD::Tex1DArrayS32FloatLevel; + case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32: + return NVPTXISD::Tex1DArrayS32FloatGrad; + case Intrinsic::nvvm_tex_1d_array_v4u32_s32: + return NVPTXISD::Tex1DArrayU32S32; + case Intrinsic::nvvm_tex_1d_array_v4u32_f32: + return NVPTXISD::Tex1DArrayU32Float; + case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32: + return NVPTXISD::Tex1DArrayU32FloatLevel; + case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32: + return NVPTXISD::Tex1DArrayU32FloatGrad; + + case Intrinsic::nvvm_tex_2d_v4f32_s32: + return NVPTXISD::Tex2DFloatS32; case Intrinsic::nvvm_tex_2d_v4f32_f32: return NVPTXISD::Tex2DFloatFloat; case Intrinsic::nvvm_tex_2d_level_v4f32_f32: return NVPTXISD::Tex2DFloatFloatLevel; case Intrinsic::nvvm_tex_2d_grad_v4f32_f32: return NVPTXISD::Tex2DFloatFloatGrad; - case Intrinsic::nvvm_tex_2d_v4i32_i32: - return NVPTXISD::Tex2DI32I32; - case Intrinsic::nvvm_tex_2d_v4i32_f32: - return NVPTXISD::Tex2DI32Float; - case Intrinsic::nvvm_tex_2d_level_v4i32_f32: - return NVPTXISD::Tex2DI32FloatLevel; - case Intrinsic::nvvm_tex_2d_grad_v4i32_f32: - return NVPTXISD::Tex2DI32FloatGrad; - - case Intrinsic::nvvm_tex_2d_array_v4f32_i32: - return NVPTXISD::Tex2DArrayFloatI32; + case Intrinsic::nvvm_tex_2d_v4s32_s32: + return NVPTXISD::Tex2DS32S32; + case Intrinsic::nvvm_tex_2d_v4s32_f32: + return NVPTXISD::Tex2DS32Float; + case Intrinsic::nvvm_tex_2d_level_v4s32_f32: + return NVPTXISD::Tex2DS32FloatLevel; + case Intrinsic::nvvm_tex_2d_grad_v4s32_f32: + return NVPTXISD::Tex2DS32FloatGrad; + case Intrinsic::nvvm_tex_2d_v4u32_s32: + return NVPTXISD::Tex2DU32S32; + case Intrinsic::nvvm_tex_2d_v4u32_f32: + return NVPTXISD::Tex2DU32Float; + case Intrinsic::nvvm_tex_2d_level_v4u32_f32: + return 
NVPTXISD::Tex2DU32FloatLevel; + case Intrinsic::nvvm_tex_2d_grad_v4u32_f32: + return NVPTXISD::Tex2DU32FloatGrad; + + case Intrinsic::nvvm_tex_2d_array_v4f32_s32: + return NVPTXISD::Tex2DArrayFloatS32; case Intrinsic::nvvm_tex_2d_array_v4f32_f32: return NVPTXISD::Tex2DArrayFloatFloat; case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32: return NVPTXISD::Tex2DArrayFloatFloatLevel; case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32: return NVPTXISD::Tex2DArrayFloatFloatGrad; - case Intrinsic::nvvm_tex_2d_array_v4i32_i32: - return NVPTXISD::Tex2DArrayI32I32; - case Intrinsic::nvvm_tex_2d_array_v4i32_f32: - return NVPTXISD::Tex2DArrayI32Float; - case Intrinsic::nvvm_tex_2d_array_level_v4i32_f32: - return NVPTXISD::Tex2DArrayI32FloatLevel; - case Intrinsic::nvvm_tex_2d_array_grad_v4i32_f32: - return NVPTXISD::Tex2DArrayI32FloatGrad; - - case Intrinsic::nvvm_tex_3d_v4f32_i32: - return NVPTXISD::Tex3DFloatI32; + case Intrinsic::nvvm_tex_2d_array_v4s32_s32: + return NVPTXISD::Tex2DArrayS32S32; + case Intrinsic::nvvm_tex_2d_array_v4s32_f32: + return NVPTXISD::Tex2DArrayS32Float; + case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32: + return NVPTXISD::Tex2DArrayS32FloatLevel; + case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32: + return NVPTXISD::Tex2DArrayS32FloatGrad; + case Intrinsic::nvvm_tex_2d_array_v4u32_s32: + return NVPTXISD::Tex2DArrayU32S32; + case Intrinsic::nvvm_tex_2d_array_v4u32_f32: + return NVPTXISD::Tex2DArrayU32Float; + case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32: + return NVPTXISD::Tex2DArrayU32FloatLevel; + case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32: + return NVPTXISD::Tex2DArrayU32FloatGrad; + + case Intrinsic::nvvm_tex_3d_v4f32_s32: + return NVPTXISD::Tex3DFloatS32; case Intrinsic::nvvm_tex_3d_v4f32_f32: return NVPTXISD::Tex3DFloatFloat; case Intrinsic::nvvm_tex_3d_level_v4f32_f32: return NVPTXISD::Tex3DFloatFloatLevel; case Intrinsic::nvvm_tex_3d_grad_v4f32_f32: return NVPTXISD::Tex3DFloatFloatGrad; - case Intrinsic::nvvm_tex_3d_v4i32_i32: - return NVPTXISD::Tex3DI32I32; - case Intrinsic::nvvm_tex_3d_v4i32_f32: - return NVPTXISD::Tex3DI32Float; - case Intrinsic::nvvm_tex_3d_level_v4i32_f32: - return NVPTXISD::Tex3DI32FloatLevel; - case Intrinsic::nvvm_tex_3d_grad_v4i32_f32: - return NVPTXISD::Tex3DI32FloatGrad; + case Intrinsic::nvvm_tex_3d_v4s32_s32: + return NVPTXISD::Tex3DS32S32; + case Intrinsic::nvvm_tex_3d_v4s32_f32: + return NVPTXISD::Tex3DS32Float; + case Intrinsic::nvvm_tex_3d_level_v4s32_f32: + return NVPTXISD::Tex3DS32FloatLevel; + case Intrinsic::nvvm_tex_3d_grad_v4s32_f32: + return NVPTXISD::Tex3DS32FloatGrad; + case Intrinsic::nvvm_tex_3d_v4u32_s32: + return NVPTXISD::Tex3DU32S32; + case Intrinsic::nvvm_tex_3d_v4u32_f32: + return NVPTXISD::Tex3DU32Float; + case Intrinsic::nvvm_tex_3d_level_v4u32_f32: + return NVPTXISD::Tex3DU32FloatLevel; + case Intrinsic::nvvm_tex_3d_grad_v4u32_f32: + return NVPTXISD::Tex3DU32FloatGrad; + + case Intrinsic::nvvm_tex_cube_v4f32_f32: + return NVPTXISD::TexCubeFloatFloat; + case Intrinsic::nvvm_tex_cube_level_v4f32_f32: + return NVPTXISD::TexCubeFloatFloatLevel; + case Intrinsic::nvvm_tex_cube_v4s32_f32: + return NVPTXISD::TexCubeS32Float; + case Intrinsic::nvvm_tex_cube_level_v4s32_f32: + return NVPTXISD::TexCubeS32FloatLevel; + case Intrinsic::nvvm_tex_cube_v4u32_f32: + return NVPTXISD::TexCubeU32Float; + case Intrinsic::nvvm_tex_cube_level_v4u32_f32: + return NVPTXISD::TexCubeU32FloatLevel; + + case Intrinsic::nvvm_tex_cube_array_v4f32_f32: + return NVPTXISD::TexCubeArrayFloatFloat; + case 
Intrinsic::nvvm_tex_cube_array_level_v4f32_f32: + return NVPTXISD::TexCubeArrayFloatFloatLevel; + case Intrinsic::nvvm_tex_cube_array_v4s32_f32: + return NVPTXISD::TexCubeArrayS32Float; + case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32: + return NVPTXISD::TexCubeArrayS32FloatLevel; + case Intrinsic::nvvm_tex_cube_array_v4u32_f32: + return NVPTXISD::TexCubeArrayU32Float; + case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32: + return NVPTXISD::TexCubeArrayU32FloatLevel; + + case Intrinsic::nvvm_tld4_r_2d_v4f32_f32: + return NVPTXISD::Tld4R2DFloatFloat; + case Intrinsic::nvvm_tld4_g_2d_v4f32_f32: + return NVPTXISD::Tld4G2DFloatFloat; + case Intrinsic::nvvm_tld4_b_2d_v4f32_f32: + return NVPTXISD::Tld4B2DFloatFloat; + case Intrinsic::nvvm_tld4_a_2d_v4f32_f32: + return NVPTXISD::Tld4A2DFloatFloat; + case Intrinsic::nvvm_tld4_r_2d_v4s32_f32: + return NVPTXISD::Tld4R2DS64Float; + case Intrinsic::nvvm_tld4_g_2d_v4s32_f32: + return NVPTXISD::Tld4G2DS64Float; + case Intrinsic::nvvm_tld4_b_2d_v4s32_f32: + return NVPTXISD::Tld4B2DS64Float; + case Intrinsic::nvvm_tld4_a_2d_v4s32_f32: + return NVPTXISD::Tld4A2DS64Float; + case Intrinsic::nvvm_tld4_r_2d_v4u32_f32: + return NVPTXISD::Tld4R2DU64Float; + case Intrinsic::nvvm_tld4_g_2d_v4u32_f32: + return NVPTXISD::Tld4G2DU64Float; + case Intrinsic::nvvm_tld4_b_2d_v4u32_f32: + return NVPTXISD::Tld4B2DU64Float; + case Intrinsic::nvvm_tld4_a_2d_v4u32_f32: + return NVPTXISD::Tld4A2DU64Float; + + case Intrinsic::nvvm_tex_unified_1d_v4f32_s32: + return NVPTXISD::TexUnified1DFloatS32; + case Intrinsic::nvvm_tex_unified_1d_v4f32_f32: + return NVPTXISD::TexUnified1DFloatFloat; + case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32: + return NVPTXISD::TexUnified1DFloatFloatLevel; + case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32: + return NVPTXISD::TexUnified1DFloatFloatGrad; + case Intrinsic::nvvm_tex_unified_1d_v4s32_s32: + return NVPTXISD::TexUnified1DS32S32; + case Intrinsic::nvvm_tex_unified_1d_v4s32_f32: + return NVPTXISD::TexUnified1DS32Float; + case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32: + return NVPTXISD::TexUnified1DS32FloatLevel; + case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32: + return NVPTXISD::TexUnified1DS32FloatGrad; + case Intrinsic::nvvm_tex_unified_1d_v4u32_s32: + return NVPTXISD::TexUnified1DU32S32; + case Intrinsic::nvvm_tex_unified_1d_v4u32_f32: + return NVPTXISD::TexUnified1DU32Float; + case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32: + return NVPTXISD::TexUnified1DU32FloatLevel; + case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32: + return NVPTXISD::TexUnified1DU32FloatGrad; + + case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32: + return NVPTXISD::TexUnified1DArrayFloatS32; + case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32: + return NVPTXISD::TexUnified1DArrayFloatFloat; + case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32: + return NVPTXISD::TexUnified1DArrayFloatFloatLevel; + case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32: + return NVPTXISD::TexUnified1DArrayFloatFloatGrad; + case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32: + return NVPTXISD::TexUnified1DArrayS32S32; + case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32: + return NVPTXISD::TexUnified1DArrayS32Float; + case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32: + return NVPTXISD::TexUnified1DArrayS32FloatLevel; + case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32: + return NVPTXISD::TexUnified1DArrayS32FloatGrad; + case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32: + return NVPTXISD::TexUnified1DArrayU32S32; + case 
Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32: + return NVPTXISD::TexUnified1DArrayU32Float; + case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32: + return NVPTXISD::TexUnified1DArrayU32FloatLevel; + case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32: + return NVPTXISD::TexUnified1DArrayU32FloatGrad; + + case Intrinsic::nvvm_tex_unified_2d_v4f32_s32: + return NVPTXISD::TexUnified2DFloatS32; + case Intrinsic::nvvm_tex_unified_2d_v4f32_f32: + return NVPTXISD::TexUnified2DFloatFloat; + case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32: + return NVPTXISD::TexUnified2DFloatFloatLevel; + case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32: + return NVPTXISD::TexUnified2DFloatFloatGrad; + case Intrinsic::nvvm_tex_unified_2d_v4s32_s32: + return NVPTXISD::TexUnified2DS32S32; + case Intrinsic::nvvm_tex_unified_2d_v4s32_f32: + return NVPTXISD::TexUnified2DS32Float; + case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32: + return NVPTXISD::TexUnified2DS32FloatLevel; + case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32: + return NVPTXISD::TexUnified2DS32FloatGrad; + case Intrinsic::nvvm_tex_unified_2d_v4u32_s32: + return NVPTXISD::TexUnified2DU32S32; + case Intrinsic::nvvm_tex_unified_2d_v4u32_f32: + return NVPTXISD::TexUnified2DU32Float; + case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32: + return NVPTXISD::TexUnified2DU32FloatLevel; + case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32: + return NVPTXISD::TexUnified2DU32FloatGrad; + + case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32: + return NVPTXISD::TexUnified2DArrayFloatS32; + case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32: + return NVPTXISD::TexUnified2DArrayFloatFloat; + case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32: + return NVPTXISD::TexUnified2DArrayFloatFloatLevel; + case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32: + return NVPTXISD::TexUnified2DArrayFloatFloatGrad; + case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32: + return NVPTXISD::TexUnified2DArrayS32S32; + case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32: + return NVPTXISD::TexUnified2DArrayS32Float; + case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32: + return NVPTXISD::TexUnified2DArrayS32FloatLevel; + case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32: + return NVPTXISD::TexUnified2DArrayS32FloatGrad; + case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32: + return NVPTXISD::TexUnified2DArrayU32S32; + case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32: + return NVPTXISD::TexUnified2DArrayU32Float; + case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32: + return NVPTXISD::TexUnified2DArrayU32FloatLevel; + case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32: + return NVPTXISD::TexUnified2DArrayU32FloatGrad; + + case Intrinsic::nvvm_tex_unified_3d_v4f32_s32: + return NVPTXISD::TexUnified3DFloatS32; + case Intrinsic::nvvm_tex_unified_3d_v4f32_f32: + return NVPTXISD::TexUnified3DFloatFloat; + case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32: + return NVPTXISD::TexUnified3DFloatFloatLevel; + case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32: + return NVPTXISD::TexUnified3DFloatFloatGrad; + case Intrinsic::nvvm_tex_unified_3d_v4s32_s32: + return NVPTXISD::TexUnified3DS32S32; + case Intrinsic::nvvm_tex_unified_3d_v4s32_f32: + return NVPTXISD::TexUnified3DS32Float; + case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32: + return NVPTXISD::TexUnified3DS32FloatLevel; + case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32: + return NVPTXISD::TexUnified3DS32FloatGrad; + case Intrinsic::nvvm_tex_unified_3d_v4u32_s32: + 
return NVPTXISD::TexUnified3DU32S32; + case Intrinsic::nvvm_tex_unified_3d_v4u32_f32: + return NVPTXISD::TexUnified3DU32Float; + case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32: + return NVPTXISD::TexUnified3DU32FloatLevel; + case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32: + return NVPTXISD::TexUnified3DU32FloatGrad; + + case Intrinsic::nvvm_tex_unified_cube_v4f32_f32: + return NVPTXISD::TexUnifiedCubeFloatFloat; + case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32: + return NVPTXISD::TexUnifiedCubeFloatFloatLevel; + case Intrinsic::nvvm_tex_unified_cube_v4s32_f32: + return NVPTXISD::TexUnifiedCubeS32Float; + case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32: + return NVPTXISD::TexUnifiedCubeS32FloatLevel; + case Intrinsic::nvvm_tex_unified_cube_v4u32_f32: + return NVPTXISD::TexUnifiedCubeU32Float; + case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32: + return NVPTXISD::TexUnifiedCubeU32FloatLevel; + + case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32: + return NVPTXISD::TexUnifiedCubeArrayFloatFloat; + case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32: + return NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel; + case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32: + return NVPTXISD::TexUnifiedCubeArrayS32Float; + case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32: + return NVPTXISD::TexUnifiedCubeArrayS32FloatLevel; + case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32: + return NVPTXISD::TexUnifiedCubeArrayU32Float; + case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32: + return NVPTXISD::TexUnifiedCubeArrayU32FloatLevel; + + case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32: + return NVPTXISD::Tld4UnifiedR2DFloatFloat; + case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32: + return NVPTXISD::Tld4UnifiedG2DFloatFloat; + case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32: + return NVPTXISD::Tld4UnifiedB2DFloatFloat; + case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32: + return NVPTXISD::Tld4UnifiedA2DFloatFloat; + case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32: + return NVPTXISD::Tld4UnifiedR2DS64Float; + case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32: + return NVPTXISD::Tld4UnifiedG2DS64Float; + case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32: + return NVPTXISD::Tld4UnifiedB2DS64Float; + case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32: + return NVPTXISD::Tld4UnifiedA2DS64Float; + case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32: + return NVPTXISD::Tld4UnifiedR2DU64Float; + case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32: + return NVPTXISD::Tld4UnifiedG2DU64Float; + case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32: + return NVPTXISD::Tld4UnifiedB2DU64Float; + case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32: + return NVPTXISD::Tld4UnifiedA2DU64Float; } } @@ -2097,18 +2886,132 @@ static unsigned getOpcForSurfaceInstr(unsigned Intrinsic) { switch (Intrinsic) { default: return 0; + case Intrinsic::nvvm_suld_1d_i8_clamp: + return NVPTXISD::Suld1DI8Clamp; + case Intrinsic::nvvm_suld_1d_i16_clamp: + return NVPTXISD::Suld1DI16Clamp; + case Intrinsic::nvvm_suld_1d_i32_clamp: + return NVPTXISD::Suld1DI32Clamp; + case Intrinsic::nvvm_suld_1d_i64_clamp: + return NVPTXISD::Suld1DI64Clamp; + case Intrinsic::nvvm_suld_1d_v2i8_clamp: + return NVPTXISD::Suld1DV2I8Clamp; + case Intrinsic::nvvm_suld_1d_v2i16_clamp: + return NVPTXISD::Suld1DV2I16Clamp; + case Intrinsic::nvvm_suld_1d_v2i32_clamp: + return NVPTXISD::Suld1DV2I32Clamp; + case Intrinsic::nvvm_suld_1d_v2i64_clamp: + return NVPTXISD::Suld1DV2I64Clamp; + case Intrinsic::nvvm_suld_1d_v4i8_clamp: + 
return NVPTXISD::Suld1DV4I8Clamp; + case Intrinsic::nvvm_suld_1d_v4i16_clamp: + return NVPTXISD::Suld1DV4I16Clamp; + case Intrinsic::nvvm_suld_1d_v4i32_clamp: + return NVPTXISD::Suld1DV4I32Clamp; + case Intrinsic::nvvm_suld_1d_array_i8_clamp: + return NVPTXISD::Suld1DArrayI8Clamp; + case Intrinsic::nvvm_suld_1d_array_i16_clamp: + return NVPTXISD::Suld1DArrayI16Clamp; + case Intrinsic::nvvm_suld_1d_array_i32_clamp: + return NVPTXISD::Suld1DArrayI32Clamp; + case Intrinsic::nvvm_suld_1d_array_i64_clamp: + return NVPTXISD::Suld1DArrayI64Clamp; + case Intrinsic::nvvm_suld_1d_array_v2i8_clamp: + return NVPTXISD::Suld1DArrayV2I8Clamp; + case Intrinsic::nvvm_suld_1d_array_v2i16_clamp: + return NVPTXISD::Suld1DArrayV2I16Clamp; + case Intrinsic::nvvm_suld_1d_array_v2i32_clamp: + return NVPTXISD::Suld1DArrayV2I32Clamp; + case Intrinsic::nvvm_suld_1d_array_v2i64_clamp: + return NVPTXISD::Suld1DArrayV2I64Clamp; + case Intrinsic::nvvm_suld_1d_array_v4i8_clamp: + return NVPTXISD::Suld1DArrayV4I8Clamp; + case Intrinsic::nvvm_suld_1d_array_v4i16_clamp: + return NVPTXISD::Suld1DArrayV4I16Clamp; + case Intrinsic::nvvm_suld_1d_array_v4i32_clamp: + return NVPTXISD::Suld1DArrayV4I32Clamp; + case Intrinsic::nvvm_suld_2d_i8_clamp: + return NVPTXISD::Suld2DI8Clamp; + case Intrinsic::nvvm_suld_2d_i16_clamp: + return NVPTXISD::Suld2DI16Clamp; + case Intrinsic::nvvm_suld_2d_i32_clamp: + return NVPTXISD::Suld2DI32Clamp; + case Intrinsic::nvvm_suld_2d_i64_clamp: + return NVPTXISD::Suld2DI64Clamp; + case Intrinsic::nvvm_suld_2d_v2i8_clamp: + return NVPTXISD::Suld2DV2I8Clamp; + case Intrinsic::nvvm_suld_2d_v2i16_clamp: + return NVPTXISD::Suld2DV2I16Clamp; + case Intrinsic::nvvm_suld_2d_v2i32_clamp: + return NVPTXISD::Suld2DV2I32Clamp; + case Intrinsic::nvvm_suld_2d_v2i64_clamp: + return NVPTXISD::Suld2DV2I64Clamp; + case Intrinsic::nvvm_suld_2d_v4i8_clamp: + return NVPTXISD::Suld2DV4I8Clamp; + case Intrinsic::nvvm_suld_2d_v4i16_clamp: + return NVPTXISD::Suld2DV4I16Clamp; + case Intrinsic::nvvm_suld_2d_v4i32_clamp: + return NVPTXISD::Suld2DV4I32Clamp; + case Intrinsic::nvvm_suld_2d_array_i8_clamp: + return NVPTXISD::Suld2DArrayI8Clamp; + case Intrinsic::nvvm_suld_2d_array_i16_clamp: + return NVPTXISD::Suld2DArrayI16Clamp; + case Intrinsic::nvvm_suld_2d_array_i32_clamp: + return NVPTXISD::Suld2DArrayI32Clamp; + case Intrinsic::nvvm_suld_2d_array_i64_clamp: + return NVPTXISD::Suld2DArrayI64Clamp; + case Intrinsic::nvvm_suld_2d_array_v2i8_clamp: + return NVPTXISD::Suld2DArrayV2I8Clamp; + case Intrinsic::nvvm_suld_2d_array_v2i16_clamp: + return NVPTXISD::Suld2DArrayV2I16Clamp; + case Intrinsic::nvvm_suld_2d_array_v2i32_clamp: + return NVPTXISD::Suld2DArrayV2I32Clamp; + case Intrinsic::nvvm_suld_2d_array_v2i64_clamp: + return NVPTXISD::Suld2DArrayV2I64Clamp; + case Intrinsic::nvvm_suld_2d_array_v4i8_clamp: + return NVPTXISD::Suld2DArrayV4I8Clamp; + case Intrinsic::nvvm_suld_2d_array_v4i16_clamp: + return NVPTXISD::Suld2DArrayV4I16Clamp; + case Intrinsic::nvvm_suld_2d_array_v4i32_clamp: + return NVPTXISD::Suld2DArrayV4I32Clamp; + case Intrinsic::nvvm_suld_3d_i8_clamp: + return NVPTXISD::Suld3DI8Clamp; + case Intrinsic::nvvm_suld_3d_i16_clamp: + return NVPTXISD::Suld3DI16Clamp; + case Intrinsic::nvvm_suld_3d_i32_clamp: + return NVPTXISD::Suld3DI32Clamp; + case Intrinsic::nvvm_suld_3d_i64_clamp: + return NVPTXISD::Suld3DI64Clamp; + case Intrinsic::nvvm_suld_3d_v2i8_clamp: + return NVPTXISD::Suld3DV2I8Clamp; + case Intrinsic::nvvm_suld_3d_v2i16_clamp: + return NVPTXISD::Suld3DV2I16Clamp; + case Intrinsic::nvvm_suld_3d_v2i32_clamp: 
+ return NVPTXISD::Suld3DV2I32Clamp; + case Intrinsic::nvvm_suld_3d_v2i64_clamp: + return NVPTXISD::Suld3DV2I64Clamp; + case Intrinsic::nvvm_suld_3d_v4i8_clamp: + return NVPTXISD::Suld3DV4I8Clamp; + case Intrinsic::nvvm_suld_3d_v4i16_clamp: + return NVPTXISD::Suld3DV4I16Clamp; + case Intrinsic::nvvm_suld_3d_v4i32_clamp: + return NVPTXISD::Suld3DV4I32Clamp; case Intrinsic::nvvm_suld_1d_i8_trap: return NVPTXISD::Suld1DI8Trap; case Intrinsic::nvvm_suld_1d_i16_trap: return NVPTXISD::Suld1DI16Trap; case Intrinsic::nvvm_suld_1d_i32_trap: return NVPTXISD::Suld1DI32Trap; + case Intrinsic::nvvm_suld_1d_i64_trap: + return NVPTXISD::Suld1DI64Trap; case Intrinsic::nvvm_suld_1d_v2i8_trap: return NVPTXISD::Suld1DV2I8Trap; case Intrinsic::nvvm_suld_1d_v2i16_trap: return NVPTXISD::Suld1DV2I16Trap; case Intrinsic::nvvm_suld_1d_v2i32_trap: return NVPTXISD::Suld1DV2I32Trap; + case Intrinsic::nvvm_suld_1d_v2i64_trap: + return NVPTXISD::Suld1DV2I64Trap; case Intrinsic::nvvm_suld_1d_v4i8_trap: return NVPTXISD::Suld1DV4I8Trap; case Intrinsic::nvvm_suld_1d_v4i16_trap: @@ -2121,12 +3024,16 @@ static unsigned getOpcForSurfaceInstr(unsigned Intrinsic) { return NVPTXISD::Suld1DArrayI16Trap; case Intrinsic::nvvm_suld_1d_array_i32_trap: return NVPTXISD::Suld1DArrayI32Trap; + case Intrinsic::nvvm_suld_1d_array_i64_trap: + return NVPTXISD::Suld1DArrayI64Trap; case Intrinsic::nvvm_suld_1d_array_v2i8_trap: return NVPTXISD::Suld1DArrayV2I8Trap; case Intrinsic::nvvm_suld_1d_array_v2i16_trap: return NVPTXISD::Suld1DArrayV2I16Trap; case Intrinsic::nvvm_suld_1d_array_v2i32_trap: return NVPTXISD::Suld1DArrayV2I32Trap; + case Intrinsic::nvvm_suld_1d_array_v2i64_trap: + return NVPTXISD::Suld1DArrayV2I64Trap; case Intrinsic::nvvm_suld_1d_array_v4i8_trap: return NVPTXISD::Suld1DArrayV4I8Trap; case Intrinsic::nvvm_suld_1d_array_v4i16_trap: @@ -2139,12 +3046,16 @@ static unsigned getOpcForSurfaceInstr(unsigned Intrinsic) { return NVPTXISD::Suld2DI16Trap; case Intrinsic::nvvm_suld_2d_i32_trap: return NVPTXISD::Suld2DI32Trap; + case Intrinsic::nvvm_suld_2d_i64_trap: + return NVPTXISD::Suld2DI64Trap; case Intrinsic::nvvm_suld_2d_v2i8_trap: return NVPTXISD::Suld2DV2I8Trap; case Intrinsic::nvvm_suld_2d_v2i16_trap: return NVPTXISD::Suld2DV2I16Trap; case Intrinsic::nvvm_suld_2d_v2i32_trap: return NVPTXISD::Suld2DV2I32Trap; + case Intrinsic::nvvm_suld_2d_v2i64_trap: + return NVPTXISD::Suld2DV2I64Trap; case Intrinsic::nvvm_suld_2d_v4i8_trap: return NVPTXISD::Suld2DV4I8Trap; case Intrinsic::nvvm_suld_2d_v4i16_trap: @@ -2157,12 +3068,16 @@ static unsigned getOpcForSurfaceInstr(unsigned Intrinsic) { return NVPTXISD::Suld2DArrayI16Trap; case Intrinsic::nvvm_suld_2d_array_i32_trap: return NVPTXISD::Suld2DArrayI32Trap; + case Intrinsic::nvvm_suld_2d_array_i64_trap: + return NVPTXISD::Suld2DArrayI64Trap; case Intrinsic::nvvm_suld_2d_array_v2i8_trap: return NVPTXISD::Suld2DArrayV2I8Trap; case Intrinsic::nvvm_suld_2d_array_v2i16_trap: return NVPTXISD::Suld2DArrayV2I16Trap; case Intrinsic::nvvm_suld_2d_array_v2i32_trap: return NVPTXISD::Suld2DArrayV2I32Trap; + case Intrinsic::nvvm_suld_2d_array_v2i64_trap: + return NVPTXISD::Suld2DArrayV2I64Trap; case Intrinsic::nvvm_suld_2d_array_v4i8_trap: return NVPTXISD::Suld2DArrayV4I8Trap; case Intrinsic::nvvm_suld_2d_array_v4i16_trap: @@ -2175,18 +3090,132 @@ static unsigned getOpcForSurfaceInstr(unsigned Intrinsic) { return NVPTXISD::Suld3DI16Trap; case Intrinsic::nvvm_suld_3d_i32_trap: return NVPTXISD::Suld3DI32Trap; + case Intrinsic::nvvm_suld_3d_i64_trap: + return NVPTXISD::Suld3DI64Trap; case 
Intrinsic::nvvm_suld_3d_v2i8_trap: return NVPTXISD::Suld3DV2I8Trap; case Intrinsic::nvvm_suld_3d_v2i16_trap: return NVPTXISD::Suld3DV2I16Trap; case Intrinsic::nvvm_suld_3d_v2i32_trap: return NVPTXISD::Suld3DV2I32Trap; + case Intrinsic::nvvm_suld_3d_v2i64_trap: + return NVPTXISD::Suld3DV2I64Trap; case Intrinsic::nvvm_suld_3d_v4i8_trap: return NVPTXISD::Suld3DV4I8Trap; case Intrinsic::nvvm_suld_3d_v4i16_trap: return NVPTXISD::Suld3DV4I16Trap; case Intrinsic::nvvm_suld_3d_v4i32_trap: return NVPTXISD::Suld3DV4I32Trap; + case Intrinsic::nvvm_suld_1d_i8_zero: + return NVPTXISD::Suld1DI8Zero; + case Intrinsic::nvvm_suld_1d_i16_zero: + return NVPTXISD::Suld1DI16Zero; + case Intrinsic::nvvm_suld_1d_i32_zero: + return NVPTXISD::Suld1DI32Zero; + case Intrinsic::nvvm_suld_1d_i64_zero: + return NVPTXISD::Suld1DI64Zero; + case Intrinsic::nvvm_suld_1d_v2i8_zero: + return NVPTXISD::Suld1DV2I8Zero; + case Intrinsic::nvvm_suld_1d_v2i16_zero: + return NVPTXISD::Suld1DV2I16Zero; + case Intrinsic::nvvm_suld_1d_v2i32_zero: + return NVPTXISD::Suld1DV2I32Zero; + case Intrinsic::nvvm_suld_1d_v2i64_zero: + return NVPTXISD::Suld1DV2I64Zero; + case Intrinsic::nvvm_suld_1d_v4i8_zero: + return NVPTXISD::Suld1DV4I8Zero; + case Intrinsic::nvvm_suld_1d_v4i16_zero: + return NVPTXISD::Suld1DV4I16Zero; + case Intrinsic::nvvm_suld_1d_v4i32_zero: + return NVPTXISD::Suld1DV4I32Zero; + case Intrinsic::nvvm_suld_1d_array_i8_zero: + return NVPTXISD::Suld1DArrayI8Zero; + case Intrinsic::nvvm_suld_1d_array_i16_zero: + return NVPTXISD::Suld1DArrayI16Zero; + case Intrinsic::nvvm_suld_1d_array_i32_zero: + return NVPTXISD::Suld1DArrayI32Zero; + case Intrinsic::nvvm_suld_1d_array_i64_zero: + return NVPTXISD::Suld1DArrayI64Zero; + case Intrinsic::nvvm_suld_1d_array_v2i8_zero: + return NVPTXISD::Suld1DArrayV2I8Zero; + case Intrinsic::nvvm_suld_1d_array_v2i16_zero: + return NVPTXISD::Suld1DArrayV2I16Zero; + case Intrinsic::nvvm_suld_1d_array_v2i32_zero: + return NVPTXISD::Suld1DArrayV2I32Zero; + case Intrinsic::nvvm_suld_1d_array_v2i64_zero: + return NVPTXISD::Suld1DArrayV2I64Zero; + case Intrinsic::nvvm_suld_1d_array_v4i8_zero: + return NVPTXISD::Suld1DArrayV4I8Zero; + case Intrinsic::nvvm_suld_1d_array_v4i16_zero: + return NVPTXISD::Suld1DArrayV4I16Zero; + case Intrinsic::nvvm_suld_1d_array_v4i32_zero: + return NVPTXISD::Suld1DArrayV4I32Zero; + case Intrinsic::nvvm_suld_2d_i8_zero: + return NVPTXISD::Suld2DI8Zero; + case Intrinsic::nvvm_suld_2d_i16_zero: + return NVPTXISD::Suld2DI16Zero; + case Intrinsic::nvvm_suld_2d_i32_zero: + return NVPTXISD::Suld2DI32Zero; + case Intrinsic::nvvm_suld_2d_i64_zero: + return NVPTXISD::Suld2DI64Zero; + case Intrinsic::nvvm_suld_2d_v2i8_zero: + return NVPTXISD::Suld2DV2I8Zero; + case Intrinsic::nvvm_suld_2d_v2i16_zero: + return NVPTXISD::Suld2DV2I16Zero; + case Intrinsic::nvvm_suld_2d_v2i32_zero: + return NVPTXISD::Suld2DV2I32Zero; + case Intrinsic::nvvm_suld_2d_v2i64_zero: + return NVPTXISD::Suld2DV2I64Zero; + case Intrinsic::nvvm_suld_2d_v4i8_zero: + return NVPTXISD::Suld2DV4I8Zero; + case Intrinsic::nvvm_suld_2d_v4i16_zero: + return NVPTXISD::Suld2DV4I16Zero; + case Intrinsic::nvvm_suld_2d_v4i32_zero: + return NVPTXISD::Suld2DV4I32Zero; + case Intrinsic::nvvm_suld_2d_array_i8_zero: + return NVPTXISD::Suld2DArrayI8Zero; + case Intrinsic::nvvm_suld_2d_array_i16_zero: + return NVPTXISD::Suld2DArrayI16Zero; + case Intrinsic::nvvm_suld_2d_array_i32_zero: + return NVPTXISD::Suld2DArrayI32Zero; + case Intrinsic::nvvm_suld_2d_array_i64_zero: + return NVPTXISD::Suld2DArrayI64Zero; + case 
Intrinsic::nvvm_suld_2d_array_v2i8_zero: + return NVPTXISD::Suld2DArrayV2I8Zero; + case Intrinsic::nvvm_suld_2d_array_v2i16_zero: + return NVPTXISD::Suld2DArrayV2I16Zero; + case Intrinsic::nvvm_suld_2d_array_v2i32_zero: + return NVPTXISD::Suld2DArrayV2I32Zero; + case Intrinsic::nvvm_suld_2d_array_v2i64_zero: + return NVPTXISD::Suld2DArrayV2I64Zero; + case Intrinsic::nvvm_suld_2d_array_v4i8_zero: + return NVPTXISD::Suld2DArrayV4I8Zero; + case Intrinsic::nvvm_suld_2d_array_v4i16_zero: + return NVPTXISD::Suld2DArrayV4I16Zero; + case Intrinsic::nvvm_suld_2d_array_v4i32_zero: + return NVPTXISD::Suld2DArrayV4I32Zero; + case Intrinsic::nvvm_suld_3d_i8_zero: + return NVPTXISD::Suld3DI8Zero; + case Intrinsic::nvvm_suld_3d_i16_zero: + return NVPTXISD::Suld3DI16Zero; + case Intrinsic::nvvm_suld_3d_i32_zero: + return NVPTXISD::Suld3DI32Zero; + case Intrinsic::nvvm_suld_3d_i64_zero: + return NVPTXISD::Suld3DI64Zero; + case Intrinsic::nvvm_suld_3d_v2i8_zero: + return NVPTXISD::Suld3DV2I8Zero; + case Intrinsic::nvvm_suld_3d_v2i16_zero: + return NVPTXISD::Suld3DV2I16Zero; + case Intrinsic::nvvm_suld_3d_v2i32_zero: + return NVPTXISD::Suld3DV2I32Zero; + case Intrinsic::nvvm_suld_3d_v2i64_zero: + return NVPTXISD::Suld3DV2I64Zero; + case Intrinsic::nvvm_suld_3d_v4i8_zero: + return NVPTXISD::Suld3DV4I8Zero; + case Intrinsic::nvvm_suld_3d_v4i16_zero: + return NVPTXISD::Suld3DV4I16Zero; + case Intrinsic::nvvm_suld_3d_v4i32_zero: + return NVPTXISD::Suld3DV4I32Zero; } } @@ -2226,45 +3255,121 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( case Intrinsic::nvvm_ldu_global_i: case Intrinsic::nvvm_ldu_global_f: - case Intrinsic::nvvm_ldu_global_p: + case Intrinsic::nvvm_ldu_global_p: { Info.opc = ISD::INTRINSIC_W_CHAIN; if (Intrinsic == Intrinsic::nvvm_ldu_global_i) Info.memVT = getValueType(I.getType()); - else if (Intrinsic == Intrinsic::nvvm_ldu_global_p) + else if(Intrinsic == Intrinsic::nvvm_ldu_global_p) + Info.memVT = getPointerTy(); + else + Info.memVT = getValueType(I.getType()); + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Info.vol = 0; + Info.readMem = true; + Info.writeMem = false; + + // alignment is available as metadata. + // Grab it and set the alignment. + assert(I.hasMetadataOtherThanDebugLoc() && "Must have alignment metadata"); + MDNode *AlignMD = I.getMetadata("align"); + assert(AlignMD && "Must have a non-null MDNode"); + assert(AlignMD->getNumOperands() == 1 && "Must have a single operand"); + Value *Align = AlignMD->getOperand(0); + int64_t Alignment = cast<ConstantInt>(Align)->getZExtValue(); + Info.align = Alignment; + + return true; + } + case Intrinsic::nvvm_ldg_global_i: + case Intrinsic::nvvm_ldg_global_f: + case Intrinsic::nvvm_ldg_global_p: { + + Info.opc = ISD::INTRINSIC_W_CHAIN; + if (Intrinsic == Intrinsic::nvvm_ldg_global_i) Info.memVT = getValueType(I.getType()); + else if(Intrinsic == Intrinsic::nvvm_ldg_global_p) + Info.memVT = getPointerTy(); else - Info.memVT = MVT::f32; + Info.memVT = getValueType(I.getType()); Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.vol = 0; Info.readMem = true; Info.writeMem = false; - Info.align = 0; + + // alignment is available as metadata. + // Grab it and set the alignment.
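+    // Purely as a hypothetical illustration of the shape assumed here, an
+    // annotated load would look roughly like:
+    //   %v = call i32 @llvm.nvvm.ldg.global.i.i32.p1i32(i32 addrspace(1)* %p), !align !0
+    //   !0 = metadata !{i32 4}
+    // i.e. the "align" MDNode carries a single ConstantInt operand holding
+    // the alignment in bytes, which is what the asserts below rely on.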
+ assert(I.hasMetadataOtherThanDebugLoc() && "Must have alignment metadata"); + MDNode *AlignMD = I.getMetadata("align"); + assert(AlignMD && "Must have a non-null MDNode"); + assert(AlignMD->getNumOperands() == 1 && "Must have a single operand"); + Value *Align = AlignMD->getOperand(0); + int64_t Alignment = cast<ConstantInt>(Align)->getZExtValue(); + Info.align = Alignment; + return true; + } - case Intrinsic::nvvm_tex_1d_v4f32_i32: + case Intrinsic::nvvm_tex_1d_v4f32_s32: case Intrinsic::nvvm_tex_1d_v4f32_f32: case Intrinsic::nvvm_tex_1d_level_v4f32_f32: case Intrinsic::nvvm_tex_1d_grad_v4f32_f32: - case Intrinsic::nvvm_tex_1d_array_v4f32_i32: + case Intrinsic::nvvm_tex_1d_array_v4f32_s32: case Intrinsic::nvvm_tex_1d_array_v4f32_f32: case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32: case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32: - case Intrinsic::nvvm_tex_2d_v4f32_i32: + case Intrinsic::nvvm_tex_2d_v4f32_s32: case Intrinsic::nvvm_tex_2d_v4f32_f32: case Intrinsic::nvvm_tex_2d_level_v4f32_f32: case Intrinsic::nvvm_tex_2d_grad_v4f32_f32: - case Intrinsic::nvvm_tex_2d_array_v4f32_i32: + case Intrinsic::nvvm_tex_2d_array_v4f32_s32: case Intrinsic::nvvm_tex_2d_array_v4f32_f32: case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32: case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32: - case Intrinsic::nvvm_tex_3d_v4f32_i32: + case Intrinsic::nvvm_tex_3d_v4f32_s32: case Intrinsic::nvvm_tex_3d_v4f32_f32: case Intrinsic::nvvm_tex_3d_level_v4f32_f32: - case Intrinsic::nvvm_tex_3d_grad_v4f32_f32: { + case Intrinsic::nvvm_tex_3d_grad_v4f32_f32: + case Intrinsic::nvvm_tex_cube_v4f32_f32: + case Intrinsic::nvvm_tex_cube_level_v4f32_f32: + case Intrinsic::nvvm_tex_cube_array_v4f32_f32: + case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32: + case Intrinsic::nvvm_tld4_r_2d_v4f32_f32: + case Intrinsic::nvvm_tld4_g_2d_v4f32_f32: + case Intrinsic::nvvm_tld4_b_2d_v4f32_f32: + case Intrinsic::nvvm_tld4_a_2d_v4f32_f32: + case Intrinsic::nvvm_tex_unified_1d_v4f32_s32: + case Intrinsic::nvvm_tex_unified_1d_v4f32_f32: + case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32: + case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32: + case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32: + case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32: + case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32: + case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32: + case Intrinsic::nvvm_tex_unified_2d_v4f32_s32: + case Intrinsic::nvvm_tex_unified_2d_v4f32_f32: + case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32: + case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32: + case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32: + case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32: + case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32: + case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32: + case Intrinsic::nvvm_tex_unified_3d_v4f32_s32: + case Intrinsic::nvvm_tex_unified_3d_v4f32_f32: + case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32: + case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32: + case Intrinsic::nvvm_tex_unified_cube_v4f32_f32: + case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32: + case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32: + case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32: + case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32: + case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32: + case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32: + case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32: { Info.opc = getOpcForTextureInstr(Intrinsic); - Info.memVT = MVT::f32; + Info.memVT = MVT::v4f32; Info.ptrVal =
nullptr; Info.offset = 0; Info.vol = 0; @@ -2273,28 +3378,120 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.align = 16; return true; } - case Intrinsic::nvvm_tex_1d_v4i32_i32: - case Intrinsic::nvvm_tex_1d_v4i32_f32: - case Intrinsic::nvvm_tex_1d_level_v4i32_f32: - case Intrinsic::nvvm_tex_1d_grad_v4i32_f32: - case Intrinsic::nvvm_tex_1d_array_v4i32_i32: - case Intrinsic::nvvm_tex_1d_array_v4i32_f32: - case Intrinsic::nvvm_tex_1d_array_level_v4i32_f32: - case Intrinsic::nvvm_tex_1d_array_grad_v4i32_f32: - case Intrinsic::nvvm_tex_2d_v4i32_i32: - case Intrinsic::nvvm_tex_2d_v4i32_f32: - case Intrinsic::nvvm_tex_2d_level_v4i32_f32: - case Intrinsic::nvvm_tex_2d_grad_v4i32_f32: - case Intrinsic::nvvm_tex_2d_array_v4i32_i32: - case Intrinsic::nvvm_tex_2d_array_v4i32_f32: - case Intrinsic::nvvm_tex_2d_array_level_v4i32_f32: - case Intrinsic::nvvm_tex_2d_array_grad_v4i32_f32: - case Intrinsic::nvvm_tex_3d_v4i32_i32: - case Intrinsic::nvvm_tex_3d_v4i32_f32: - case Intrinsic::nvvm_tex_3d_level_v4i32_f32: - case Intrinsic::nvvm_tex_3d_grad_v4i32_f32: { + case Intrinsic::nvvm_tex_1d_v4s32_s32: + case Intrinsic::nvvm_tex_1d_v4s32_f32: + case Intrinsic::nvvm_tex_1d_level_v4s32_f32: + case Intrinsic::nvvm_tex_1d_grad_v4s32_f32: + case Intrinsic::nvvm_tex_1d_array_v4s32_s32: + case Intrinsic::nvvm_tex_1d_array_v4s32_f32: + case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32: + case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32: + case Intrinsic::nvvm_tex_2d_v4s32_s32: + case Intrinsic::nvvm_tex_2d_v4s32_f32: + case Intrinsic::nvvm_tex_2d_level_v4s32_f32: + case Intrinsic::nvvm_tex_2d_grad_v4s32_f32: + case Intrinsic::nvvm_tex_2d_array_v4s32_s32: + case Intrinsic::nvvm_tex_2d_array_v4s32_f32: + case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32: + case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32: + case Intrinsic::nvvm_tex_3d_v4s32_s32: + case Intrinsic::nvvm_tex_3d_v4s32_f32: + case Intrinsic::nvvm_tex_3d_level_v4s32_f32: + case Intrinsic::nvvm_tex_3d_grad_v4s32_f32: + case Intrinsic::nvvm_tex_cube_v4s32_f32: + case Intrinsic::nvvm_tex_cube_level_v4s32_f32: + case Intrinsic::nvvm_tex_cube_array_v4s32_f32: + case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32: + case Intrinsic::nvvm_tex_cube_v4u32_f32: + case Intrinsic::nvvm_tex_cube_level_v4u32_f32: + case Intrinsic::nvvm_tex_cube_array_v4u32_f32: + case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32: + case Intrinsic::nvvm_tex_1d_v4u32_s32: + case Intrinsic::nvvm_tex_1d_v4u32_f32: + case Intrinsic::nvvm_tex_1d_level_v4u32_f32: + case Intrinsic::nvvm_tex_1d_grad_v4u32_f32: + case Intrinsic::nvvm_tex_1d_array_v4u32_s32: + case Intrinsic::nvvm_tex_1d_array_v4u32_f32: + case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32: + case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32: + case Intrinsic::nvvm_tex_2d_v4u32_s32: + case Intrinsic::nvvm_tex_2d_v4u32_f32: + case Intrinsic::nvvm_tex_2d_level_v4u32_f32: + case Intrinsic::nvvm_tex_2d_grad_v4u32_f32: + case Intrinsic::nvvm_tex_2d_array_v4u32_s32: + case Intrinsic::nvvm_tex_2d_array_v4u32_f32: + case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32: + case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32: + case Intrinsic::nvvm_tex_3d_v4u32_s32: + case Intrinsic::nvvm_tex_3d_v4u32_f32: + case Intrinsic::nvvm_tex_3d_level_v4u32_f32: + case Intrinsic::nvvm_tex_3d_grad_v4u32_f32: + case Intrinsic::nvvm_tld4_r_2d_v4s32_f32: + case Intrinsic::nvvm_tld4_g_2d_v4s32_f32: + case Intrinsic::nvvm_tld4_b_2d_v4s32_f32: + case Intrinsic::nvvm_tld4_a_2d_v4s32_f32: + case Intrinsic::nvvm_tld4_r_2d_v4u32_f32: + case 
Intrinsic::nvvm_tld4_g_2d_v4u32_f32: + case Intrinsic::nvvm_tld4_b_2d_v4u32_f32: + case Intrinsic::nvvm_tld4_a_2d_v4u32_f32: + case Intrinsic::nvvm_tex_unified_1d_v4s32_s32: + case Intrinsic::nvvm_tex_unified_1d_v4s32_f32: + case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32: + case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32: + case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32: + case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32: + case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32: + case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32: + case Intrinsic::nvvm_tex_unified_2d_v4s32_s32: + case Intrinsic::nvvm_tex_unified_2d_v4s32_f32: + case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32: + case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32: + case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32: + case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32: + case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32: + case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32: + case Intrinsic::nvvm_tex_unified_3d_v4s32_s32: + case Intrinsic::nvvm_tex_unified_3d_v4s32_f32: + case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32: + case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32: + case Intrinsic::nvvm_tex_unified_1d_v4u32_s32: + case Intrinsic::nvvm_tex_unified_1d_v4u32_f32: + case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32: + case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32: + case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32: + case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32: + case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32: + case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32: + case Intrinsic::nvvm_tex_unified_2d_v4u32_s32: + case Intrinsic::nvvm_tex_unified_2d_v4u32_f32: + case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32: + case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32: + case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32: + case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32: + case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32: + case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32: + case Intrinsic::nvvm_tex_unified_3d_v4u32_s32: + case Intrinsic::nvvm_tex_unified_3d_v4u32_f32: + case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32: + case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32: + case Intrinsic::nvvm_tex_unified_cube_v4s32_f32: + case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32: + case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32: + case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32: + case Intrinsic::nvvm_tex_unified_cube_v4u32_f32: + case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32: + case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32: + case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32: + case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32: + case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32: + case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32: + case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32: + case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32: + case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32: + case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32: + case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32: { Info.opc = getOpcForTextureInstr(Intrinsic); - Info.memVT = MVT::i32; + Info.memVT = MVT::v4i32; Info.ptrVal = nullptr; Info.offset = 0; Info.vol = 0; @@ -2303,6 +3500,21 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.align = 16; return true; } + case Intrinsic::nvvm_suld_1d_i8_clamp: + case Intrinsic::nvvm_suld_1d_v2i8_clamp: + case Intrinsic::nvvm_suld_1d_v4i8_clamp: + case 
Intrinsic::nvvm_suld_1d_array_i8_clamp: + case Intrinsic::nvvm_suld_1d_array_v2i8_clamp: + case Intrinsic::nvvm_suld_1d_array_v4i8_clamp: + case Intrinsic::nvvm_suld_2d_i8_clamp: + case Intrinsic::nvvm_suld_2d_v2i8_clamp: + case Intrinsic::nvvm_suld_2d_v4i8_clamp: + case Intrinsic::nvvm_suld_2d_array_i8_clamp: + case Intrinsic::nvvm_suld_2d_array_v2i8_clamp: + case Intrinsic::nvvm_suld_2d_array_v4i8_clamp: + case Intrinsic::nvvm_suld_3d_i8_clamp: + case Intrinsic::nvvm_suld_3d_v2i8_clamp: + case Intrinsic::nvvm_suld_3d_v4i8_clamp: case Intrinsic::nvvm_suld_1d_i8_trap: case Intrinsic::nvvm_suld_1d_v2i8_trap: case Intrinsic::nvvm_suld_1d_v4i8_trap: @@ -2317,7 +3529,22 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( case Intrinsic::nvvm_suld_2d_array_v4i8_trap: case Intrinsic::nvvm_suld_3d_i8_trap: case Intrinsic::nvvm_suld_3d_v2i8_trap: - case Intrinsic::nvvm_suld_3d_v4i8_trap: { + case Intrinsic::nvvm_suld_3d_v4i8_trap: + case Intrinsic::nvvm_suld_1d_i8_zero: + case Intrinsic::nvvm_suld_1d_v2i8_zero: + case Intrinsic::nvvm_suld_1d_v4i8_zero: + case Intrinsic::nvvm_suld_1d_array_i8_zero: + case Intrinsic::nvvm_suld_1d_array_v2i8_zero: + case Intrinsic::nvvm_suld_1d_array_v4i8_zero: + case Intrinsic::nvvm_suld_2d_i8_zero: + case Intrinsic::nvvm_suld_2d_v2i8_zero: + case Intrinsic::nvvm_suld_2d_v4i8_zero: + case Intrinsic::nvvm_suld_2d_array_i8_zero: + case Intrinsic::nvvm_suld_2d_array_v2i8_zero: + case Intrinsic::nvvm_suld_2d_array_v4i8_zero: + case Intrinsic::nvvm_suld_3d_i8_zero: + case Intrinsic::nvvm_suld_3d_v2i8_zero: + case Intrinsic::nvvm_suld_3d_v4i8_zero: { Info.opc = getOpcForSurfaceInstr(Intrinsic); Info.memVT = MVT::i8; Info.ptrVal = nullptr; @@ -2328,6 +3555,21 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.align = 16; return true; } + case Intrinsic::nvvm_suld_1d_i16_clamp: + case Intrinsic::nvvm_suld_1d_v2i16_clamp: + case Intrinsic::nvvm_suld_1d_v4i16_clamp: + case Intrinsic::nvvm_suld_1d_array_i16_clamp: + case Intrinsic::nvvm_suld_1d_array_v2i16_clamp: + case Intrinsic::nvvm_suld_1d_array_v4i16_clamp: + case Intrinsic::nvvm_suld_2d_i16_clamp: + case Intrinsic::nvvm_suld_2d_v2i16_clamp: + case Intrinsic::nvvm_suld_2d_v4i16_clamp: + case Intrinsic::nvvm_suld_2d_array_i16_clamp: + case Intrinsic::nvvm_suld_2d_array_v2i16_clamp: + case Intrinsic::nvvm_suld_2d_array_v4i16_clamp: + case Intrinsic::nvvm_suld_3d_i16_clamp: + case Intrinsic::nvvm_suld_3d_v2i16_clamp: + case Intrinsic::nvvm_suld_3d_v4i16_clamp: case Intrinsic::nvvm_suld_1d_i16_trap: case Intrinsic::nvvm_suld_1d_v2i16_trap: case Intrinsic::nvvm_suld_1d_v4i16_trap: @@ -2342,7 +3584,22 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( case Intrinsic::nvvm_suld_2d_array_v4i16_trap: case Intrinsic::nvvm_suld_3d_i16_trap: case Intrinsic::nvvm_suld_3d_v2i16_trap: - case Intrinsic::nvvm_suld_3d_v4i16_trap: { + case Intrinsic::nvvm_suld_3d_v4i16_trap: + case Intrinsic::nvvm_suld_1d_i16_zero: + case Intrinsic::nvvm_suld_1d_v2i16_zero: + case Intrinsic::nvvm_suld_1d_v4i16_zero: + case Intrinsic::nvvm_suld_1d_array_i16_zero: + case Intrinsic::nvvm_suld_1d_array_v2i16_zero: + case Intrinsic::nvvm_suld_1d_array_v4i16_zero: + case Intrinsic::nvvm_suld_2d_i16_zero: + case Intrinsic::nvvm_suld_2d_v2i16_zero: + case Intrinsic::nvvm_suld_2d_v4i16_zero: + case Intrinsic::nvvm_suld_2d_array_i16_zero: + case Intrinsic::nvvm_suld_2d_array_v2i16_zero: + case Intrinsic::nvvm_suld_2d_array_v4i16_zero: + case Intrinsic::nvvm_suld_3d_i16_zero: + case Intrinsic::nvvm_suld_3d_v2i16_zero: + case Intrinsic::nvvm_suld_3d_v4i16_zero: { 
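+    // The _clamp/_trap/_zero suffix only selects the suld out-of-range
+    // policy (clamp the coordinate, trap, or return zero); it does not
+    // affect the element type, so every i16 flavor above shares the same
+    // MVT::i16 memVT set below.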
Info.opc = getOpcForSurfaceInstr(Intrinsic); Info.memVT = MVT::i16; Info.ptrVal = nullptr; @@ -2353,6 +3610,21 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.align = 16; return true; } + case Intrinsic::nvvm_suld_1d_i32_clamp: + case Intrinsic::nvvm_suld_1d_v2i32_clamp: + case Intrinsic::nvvm_suld_1d_v4i32_clamp: + case Intrinsic::nvvm_suld_1d_array_i32_clamp: + case Intrinsic::nvvm_suld_1d_array_v2i32_clamp: + case Intrinsic::nvvm_suld_1d_array_v4i32_clamp: + case Intrinsic::nvvm_suld_2d_i32_clamp: + case Intrinsic::nvvm_suld_2d_v2i32_clamp: + case Intrinsic::nvvm_suld_2d_v4i32_clamp: + case Intrinsic::nvvm_suld_2d_array_i32_clamp: + case Intrinsic::nvvm_suld_2d_array_v2i32_clamp: + case Intrinsic::nvvm_suld_2d_array_v4i32_clamp: + case Intrinsic::nvvm_suld_3d_i32_clamp: + case Intrinsic::nvvm_suld_3d_v2i32_clamp: + case Intrinsic::nvvm_suld_3d_v4i32_clamp: case Intrinsic::nvvm_suld_1d_i32_trap: case Intrinsic::nvvm_suld_1d_v2i32_trap: case Intrinsic::nvvm_suld_1d_v4i32_trap: @@ -2367,7 +3639,22 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( case Intrinsic::nvvm_suld_2d_array_v4i32_trap: case Intrinsic::nvvm_suld_3d_i32_trap: case Intrinsic::nvvm_suld_3d_v2i32_trap: - case Intrinsic::nvvm_suld_3d_v4i32_trap: { + case Intrinsic::nvvm_suld_3d_v4i32_trap: + case Intrinsic::nvvm_suld_1d_i32_zero: + case Intrinsic::nvvm_suld_1d_v2i32_zero: + case Intrinsic::nvvm_suld_1d_v4i32_zero: + case Intrinsic::nvvm_suld_1d_array_i32_zero: + case Intrinsic::nvvm_suld_1d_array_v2i32_zero: + case Intrinsic::nvvm_suld_1d_array_v4i32_zero: + case Intrinsic::nvvm_suld_2d_i32_zero: + case Intrinsic::nvvm_suld_2d_v2i32_zero: + case Intrinsic::nvvm_suld_2d_v4i32_zero: + case Intrinsic::nvvm_suld_2d_array_i32_zero: + case Intrinsic::nvvm_suld_2d_array_v2i32_zero: + case Intrinsic::nvvm_suld_2d_array_v4i32_zero: + case Intrinsic::nvvm_suld_3d_i32_zero: + case Intrinsic::nvvm_suld_3d_v2i32_zero: + case Intrinsic::nvvm_suld_3d_v4i32_zero: { Info.opc = getOpcForSurfaceInstr(Intrinsic); Info.memVT = MVT::i32; Info.ptrVal = nullptr; @@ -2378,7 +3665,46 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.align = 16; return true; } - + case Intrinsic::nvvm_suld_1d_i64_clamp: + case Intrinsic::nvvm_suld_1d_v2i64_clamp: + case Intrinsic::nvvm_suld_1d_array_i64_clamp: + case Intrinsic::nvvm_suld_1d_array_v2i64_clamp: + case Intrinsic::nvvm_suld_2d_i64_clamp: + case Intrinsic::nvvm_suld_2d_v2i64_clamp: + case Intrinsic::nvvm_suld_2d_array_i64_clamp: + case Intrinsic::nvvm_suld_2d_array_v2i64_clamp: + case Intrinsic::nvvm_suld_3d_i64_clamp: + case Intrinsic::nvvm_suld_3d_v2i64_clamp: + case Intrinsic::nvvm_suld_1d_i64_trap: + case Intrinsic::nvvm_suld_1d_v2i64_trap: + case Intrinsic::nvvm_suld_1d_array_i64_trap: + case Intrinsic::nvvm_suld_1d_array_v2i64_trap: + case Intrinsic::nvvm_suld_2d_i64_trap: + case Intrinsic::nvvm_suld_2d_v2i64_trap: + case Intrinsic::nvvm_suld_2d_array_i64_trap: + case Intrinsic::nvvm_suld_2d_array_v2i64_trap: + case Intrinsic::nvvm_suld_3d_i64_trap: + case Intrinsic::nvvm_suld_3d_v2i64_trap: + case Intrinsic::nvvm_suld_1d_i64_zero: + case Intrinsic::nvvm_suld_1d_v2i64_zero: + case Intrinsic::nvvm_suld_1d_array_i64_zero: + case Intrinsic::nvvm_suld_1d_array_v2i64_zero: + case Intrinsic::nvvm_suld_2d_i64_zero: + case Intrinsic::nvvm_suld_2d_v2i64_zero: + case Intrinsic::nvvm_suld_2d_array_i64_zero: + case Intrinsic::nvvm_suld_2d_array_v2i64_zero: + case Intrinsic::nvvm_suld_3d_i64_zero: + case Intrinsic::nvvm_suld_3d_v2i64_zero: { + Info.opc = getOpcForSurfaceInstr(Intrinsic); + Info.memVT 
= MVT::i64; + Info.ptrVal = nullptr; + Info.offset = 0; + Info.vol = 0; + Info.readMem = true; + Info.writeMem = false; + Info.align = 16; + return true; + } } return false; } @@ -2433,6 +3759,7 @@ NVPTXTargetLowering::getConstraintType(const std::string &Constraint) const { switch (Constraint[0]) { default: break; + case 'b': case 'r': case 'h': case 'c': @@ -2452,6 +3779,8 @@ NVPTXTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, MVT VT) const { if (Constraint.size() == 1) { switch (Constraint[0]) { + case 'b': + return std::make_pair(0U, &NVPTX::Int1RegsRegClass); case 'c': return std::make_pair(0U, &NVPTX::Int16RegsRegClass); case 'h': @@ -2475,8 +3804,435 @@ unsigned NVPTXTargetLowering::getFunctionAlignment(const Function *) const { return 4; } +//===----------------------------------------------------------------------===// +// NVPTX DAG Combining +//===----------------------------------------------------------------------===// + +bool NVPTXTargetLowering::allowFMA(MachineFunction &MF, + CodeGenOpt::Level OptLevel) const { + const Function *F = MF.getFunction(); + const TargetOptions &TO = MF.getTarget().Options; + + // Always honor command-line argument + if (FMAContractLevelOpt.getNumOccurrences() > 0) { + return FMAContractLevelOpt > 0; + } else if (OptLevel == 0) { + // Do not contract if we're not optimizing the code + return false; + } else if (TO.AllowFPOpFusion == FPOpFusion::Fast || TO.UnsafeFPMath) { + // Honor TargetOptions flags that explicitly say fusion is okay + return true; + } else if (F->hasFnAttribute("unsafe-fp-math")) { + // Check for unsafe-fp-math=true coming from Clang + Attribute Attr = F->getFnAttribute("unsafe-fp-math"); + StringRef Val = Attr.getValueAsString(); + if (Val == "true") + return true; + } + + // We did not have a clear indication that fusion is allowed, so assume not + return false; +} + +/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with +/// operands N0 and N1. This is a helper for PerformADDCombine that is +/// called with the default operands, and if that fails, with commuted +/// operands. +static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, + TargetLowering::DAGCombinerInfo &DCI, + const NVPTXSubtarget &Subtarget, + CodeGenOpt::Level OptLevel) { + SelectionDAG &DAG = DCI.DAG; + // Skip non-integer, non-scalar case + EVT VT=N0.getValueType(); + if (VT.isVector()) + return SDValue(); + + // fold (add (mul a, b), c) -> (mad a, b, c) + // + if (N0.getOpcode() == ISD::MUL) { + assert (VT.isInteger()); + // For integer: + // Since integer multiply-add costs the same as integer multiply + // but is more costly than integer add, do the fusion only when + // the mul is only used in the add. + if (OptLevel==CodeGenOpt::None || VT != MVT::i32 || + !N0.getNode()->hasOneUse()) + return SDValue(); + + // Do the folding + return DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT, + N0.getOperand(0), N0.getOperand(1), N1); + } + else if (N0.getOpcode() == ISD::FMUL) { + if (VT == MVT::f32 || VT == MVT::f64) { + NVPTXTargetLowering *TLI = + (NVPTXTargetLowering *)&DAG.getTargetLoweringInfo(); + if (!TLI->allowFMA(DAG.getMachineFunction(), OptLevel)) + return SDValue(); + + // For floating point: + // Do the fusion only when the mul has less than 5 uses and all + // are add. + // The heuristic is that if a use is not an add, then that use + // cannot be fused into fma, therefore mul is still needed anyway. 
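+        // Hypothetical illustration: if an fmul feeds two fadds and one
+        // fsub, the fsub keeps the fmul live, so fusing the fadds into
+        // FMAs would duplicate the multiply instead of removing it.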
+ // If there are more than 4 uses, even if they are all add, fusing + // them will increase register pressure. + // + int numUses = 0; + int nonAddCount = 0; + for (SDNode::use_iterator UI = N0.getNode()->use_begin(), + UE = N0.getNode()->use_end(); + UI != UE; ++UI) { + numUses++; + SDNode *User = *UI; + if (User->getOpcode() != ISD::FADD) + ++nonAddCount; + } + if (numUses >= 5) + return SDValue(); + if (nonAddCount) { + int orderNo = N->getIROrder(); + int orderNo2 = N0.getNode()->getIROrder(); + // Simple heuristic here for considering potential register + // pressure: the difference is used to measure the distance between + // def and use; the longer the distance, the more likely it is to + // cause register pressure. + if (orderNo - orderNo2 < 500) + return SDValue(); + + // Now, check if at least one of the FMUL's operands is live beyond the node N, + // which guarantees that the FMA will not increase register pressure at node N. + bool opIsLive = false; + const SDNode *left = N0.getOperand(0).getNode(); + const SDNode *right = N0.getOperand(1).getNode(); + + if (dyn_cast<ConstantSDNode>(left) || dyn_cast<ConstantSDNode>(right)) + opIsLive = true; + + if (!opIsLive) + for (SDNode::use_iterator UI = left->use_begin(), UE = left->use_end(); UI != UE; ++UI) { + SDNode *User = *UI; + int orderNo3 = User->getIROrder(); + if (orderNo3 > orderNo) { + opIsLive = true; + break; + } + } + + if (!opIsLive) + for (SDNode::use_iterator UI = right->use_begin(), UE = right->use_end(); UI != UE; ++UI) { + SDNode *User = *UI; + int orderNo3 = User->getIROrder(); + if (orderNo3 > orderNo) { + opIsLive = true; + break; + } + } + + if (!opIsLive) + return SDValue(); + } + + return DAG.getNode(ISD::FMA, SDLoc(N), VT, + N0.getOperand(0), N0.getOperand(1), N1); + } + } + + return SDValue(); +} + +/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD. +/// +static SDValue PerformADDCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + const NVPTXSubtarget &Subtarget, + CodeGenOpt::Level OptLevel) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + + // First try with the default operand order. + SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget, + OptLevel); + if (Result.getNode()) + return Result; + + // If that didn't work, try again with the operands commuted. + return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget, OptLevel); +} + +static SDValue PerformANDCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + // The type legalizer turns a vector load of i8 values into a zextload to i16 + // registers, optionally ANY_EXTENDs it (if target type is integer), + // and ANDs off the high 8 bits. Since we turn this load into a + // target-specific DAG node, the DAG combiner fails to eliminate these AND + // nodes. Do that here.
+ SDValue Val = N->getOperand(0); + SDValue Mask = N->getOperand(1); + + if (isa<ConstantSDNode>(Val)) { + std::swap(Val, Mask); + } + + SDValue AExt; + // Generally, we will see zextload -> IMOV16rr -> ANY_EXTEND -> and + if (Val.getOpcode() == ISD::ANY_EXTEND) { + AExt = Val; + Val = Val->getOperand(0); + } + + if (Val->isMachineOpcode() && Val->getMachineOpcode() == NVPTX::IMOV16rr) { + Val = Val->getOperand(0); + } + + if (Val->getOpcode() == NVPTXISD::LoadV2 || + Val->getOpcode() == NVPTXISD::LoadV4) { + ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask); + if (!MaskCnst) { + // Not an AND with a constant + return SDValue(); + } + + uint64_t MaskVal = MaskCnst->getZExtValue(); + if (MaskVal != 0xff) { + // Not an AND that chops off top 8 bits + return SDValue(); + } + + MemSDNode *Mem = dyn_cast<MemSDNode>(Val); + if (!Mem) { + // Not a MemSDNode?!? + return SDValue(); + } + + EVT MemVT = Mem->getMemoryVT(); + if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8) { + // We only handle the i8 case + return SDValue(); + } + + unsigned ExtType = + cast<ConstantSDNode>(Val->getOperand(Val->getNumOperands()-1))-> + getZExtValue(); + if (ExtType == ISD::SEXTLOAD) { + // If for some reason the load is a sextload, the and is needed to zero + // out the high 8 bits + return SDValue(); + } + + bool AddTo = false; + if (AExt.getNode() != 0) { + // Re-insert the ext as a zext. + Val = DCI.DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), + AExt.getValueType(), Val); + AddTo = true; + } + + // If we get here, the AND is unnecessary. Just replace it with the load + DCI.CombineTo(N, Val, AddTo); + } + + return SDValue(); +} + +enum OperandSignedness { + Signed = 0, + Unsigned, + Unknown +}; + +/// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand +/// that can be demoted to \p OptSize bits without loss of information. The +/// signedness of the operand, if determinable, is placed in \p S. +static bool IsMulWideOperandDemotable(SDValue Op, + unsigned OptSize, + OperandSignedness &S) { + S = Unknown; + + if (Op.getOpcode() == ISD::SIGN_EXTEND || + Op.getOpcode() == ISD::SIGN_EXTEND_INREG) { + EVT OrigVT = Op.getOperand(0).getValueType(); + if (OrigVT.getSizeInBits() == OptSize) { + S = Signed; + return true; + } + } else if (Op.getOpcode() == ISD::ZERO_EXTEND) { + EVT OrigVT = Op.getOperand(0).getValueType(); + if (OrigVT.getSizeInBits() == OptSize) { + S = Unsigned; + return true; + } + } + + return false; +} + +/// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can +/// be demoted to \p OptSize bits without loss of information. If the operands +/// contain a constant, it should appear as the RHS operand. The signedness of +/// the operands is placed in \p IsSigned.
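+/// For instance (a hypothetical example with \p OptSize = 16): LHS =
+/// (sext i16 %a to i32) with RHS = constant 100 is demotable with
+/// \p IsSigned = true, whereas a zero-extended LHS paired with a
+/// sign-extended RHS is rejected because the signednesses disagree.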
+static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS, + unsigned OptSize, + bool &IsSigned) { + + OperandSignedness LHSSign; + + // The LHS operand must be a demotable op + if (!IsMulWideOperandDemotable(LHS, OptSize, LHSSign)) + return false; + + // We should have been able to determine the signedness from the LHS + if (LHSSign == Unknown) + return false; + + IsSigned = (LHSSign == Signed); + + // The RHS can be a demotable op or a constant + if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(RHS)) { + APInt Val = CI->getAPIntValue(); + if (LHSSign == Unsigned) { + if (Val.isIntN(OptSize)) { + return true; + } + return false; + } else { + if (Val.isSignedIntN(OptSize)) { + return true; + } + return false; + } + } else { + OperandSignedness RHSSign; + if (!IsMulWideOperandDemotable(RHS, OptSize, RHSSign)) + return false; + + if (LHSSign != RHSSign) + return false; + + return true; + } +} + +/// TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply +/// of M/2 bits that produces an M-bit result (i.e. mul.wide). This transform +/// works on both multiply DAG nodes and SHL DAG nodes with a constant shift +/// amount. +static SDValue TryMULWIDECombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + EVT MulType = N->getValueType(0); + if (MulType != MVT::i32 && MulType != MVT::i64) { + return SDValue(); + } + + unsigned OptSize = MulType.getSizeInBits() >> 1; + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + + // Canonicalize the multiply so the constant (if any) is on the right + if (N->getOpcode() == ISD::MUL) { + if (isa<ConstantSDNode>(LHS)) { + std::swap(LHS, RHS); + } + } + + // If we have a SHL, determine the actual multiply amount + if (N->getOpcode() == ISD::SHL) { + ConstantSDNode *ShlRHS = dyn_cast<ConstantSDNode>(RHS); + if (!ShlRHS) { + return SDValue(); + } + + APInt ShiftAmt = ShlRHS->getAPIntValue(); + unsigned BitWidth = MulType.getSizeInBits(); + if (ShiftAmt.sge(0) && ShiftAmt.slt(BitWidth)) { + APInt MulVal = APInt(BitWidth, 1) << ShiftAmt; + RHS = DCI.DAG.getConstant(MulVal, MulType); + } else { + return SDValue(); + } + } + + bool Signed; + // Verify that our operands are demotable + if (!AreMulWideOperandsDemotable(LHS, RHS, OptSize, Signed)) { + return SDValue(); + } + + EVT DemotedVT; + if (MulType == MVT::i32) { + DemotedVT = MVT::i16; + } else { + DemotedVT = MVT::i32; + } + + // Truncate the operands to the correct size. Note that these are just for + // type consistency and will (likely) be eliminated in later phases. + SDValue TruncLHS = + DCI.DAG.getNode(ISD::TRUNCATE, SDLoc(N), DemotedVT, LHS); + SDValue TruncRHS = + DCI.DAG.getNode(ISD::TRUNCATE, SDLoc(N), DemotedVT, RHS); + + unsigned Opc; + if (Signed) { + Opc = NVPTXISD::MUL_WIDE_SIGNED; + } else { + Opc = NVPTXISD::MUL_WIDE_UNSIGNED; + } + + return DCI.DAG.getNode(Opc, SDLoc(N), MulType, TruncLHS, TruncRHS); +} + +/// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes. +static SDValue PerformMULCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + CodeGenOpt::Level OptLevel) { + if (OptLevel > 0) { + // Try mul.wide combining at OptLevel > 0 + SDValue Ret = TryMULWIDECombine(N, DCI); + if (Ret.getNode()) + return Ret; + } + + return SDValue(); +} + +/// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
+static SDValue PerformSHLCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + CodeGenOpt::Level OptLevel) { + if (OptLevel > 0) { + // Try mul.wide combining at OptLevel > 0 + SDValue Ret = TryMULWIDECombine(N, DCI); + if (Ret.getNode()) + return Ret; + } + + return SDValue(); +} + +SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + // FIXME: Get this from the DAG somehow + CodeGenOpt::Level OptLevel = CodeGenOpt::Aggressive; + switch (N->getOpcode()) { + default: break; + case ISD::ADD: + case ISD::FADD: + return PerformADDCombine(N, DCI, nvptxSubtarget, OptLevel); + case ISD::MUL: + return PerformMULCombine(N, DCI, OptLevel); + case ISD::SHL: + return PerformSHLCombine(N, DCI, OptLevel); + case ISD::AND: + return PerformANDCombine(N, DCI); + } + return SDValue(); + } + /// ReplaceLoadVector - Convert vector loads into multi-output scalar loads. static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG, + const DataLayout *TD, SmallVectorImpl<SDValue> &Results) { EVT ResVT = N->getValueType(0); SDLoc DL(N); @@ -2504,6 +4260,20 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG, break; } + LoadSDNode *LD = cast<LoadSDNode>(N); + + unsigned Align = LD->getAlignment(); + unsigned PrefAlign = + TD->getPrefTypeAlignment(ResVT.getTypeForEVT(*DAG.getContext())); + if (Align < PrefAlign) { + // This load is not sufficiently aligned, so bail out and let this vector + // load be scalarized. Note that we may still be able to emit smaller + // vector loads. For example, if we are loading a <4 x float> with an + // alignment of 8, this check will fail but the legalizer will try again + // with 2 x <2 x float>, which will succeed with an alignment of 8. + return; + } + EVT EltVT = ResVT.getVectorElementType(); unsigned NumElts = ResVT.getVectorNumElements(); @@ -2540,8 +4310,6 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG, for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) OtherOps.push_back(N->getOperand(i)); - LoadSDNode *LD = cast<LoadSDNode>(N); - // The select routine does not have access to the LoadSDNode instance, so // pass along the extension information OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType())); @@ -2714,7 +4482,7 @@ void NVPTXTargetLowering::ReplaceNodeResults( default: report_fatal_error("Unhandled custom legalization"); case ISD::LOAD: - ReplaceLoadVector(N, DAG, Results); + ReplaceLoadVector(N, DAG, getDataLayout(), Results); return; case ISD::INTRINSIC_W_CHAIN: ReplaceINTRINSIC_W_CHAIN(N, DAG, Results); diff --git a/lib/Target/NVPTX/NVPTXISelLowering.h b/lib/Target/NVPTX/NVPTXISelLowering.h index 7bad8a28f323..bef6ed9faad6 100644 --- a/lib/Target/NVPTX/NVPTXISelLowering.h +++ b/lib/Target/NVPTX/NVPTXISelLowering.h @@ -16,7 +16,6 @@ #define NVPTXISELLOWERING_H #include "NVPTX.h" -#include "NVPTXSubtarget.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/Target/TargetLowering.h" @@ -50,6 +49,11 @@ enum NodeType { CallSeqBegin, CallSeqEnd, CallPrototype, + FUN_SHFL_CLAMP, + FUN_SHFR_CLAMP, + MUL_WIDE_SIGNED, + MUL_WIDE_UNSIGNED, + IMAD, Dummy, LoadV2 = ISD::FIRST_TARGET_MEMORY_OPCODE, @@ -73,54 +77,244 @@ enum NodeType { StoreRetvalV4, // Texture intrinsics - Tex1DFloatI32, + Tex1DFloatS32, Tex1DFloatFloat, Tex1DFloatFloatLevel, Tex1DFloatFloatGrad, - Tex1DI32I32, - Tex1DI32Float, - Tex1DI32FloatLevel, - Tex1DI32FloatGrad, - Tex1DArrayFloatI32, + Tex1DS32S32, + Tex1DS32Float, + Tex1DS32FloatLevel, + Tex1DS32FloatGrad, + Tex1DU32S32, + Tex1DU32Float, + Tex1DU32FloatLevel, + Tex1DU32FloatGrad, +
Tex1DArrayFloatS32, Tex1DArrayFloatFloat, Tex1DArrayFloatFloatLevel, Tex1DArrayFloatFloatGrad, - Tex1DArrayI32I32, - Tex1DArrayI32Float, - Tex1DArrayI32FloatLevel, - Tex1DArrayI32FloatGrad, - Tex2DFloatI32, + Tex1DArrayS32S32, + Tex1DArrayS32Float, + Tex1DArrayS32FloatLevel, + Tex1DArrayS32FloatGrad, + Tex1DArrayU32S32, + Tex1DArrayU32Float, + Tex1DArrayU32FloatLevel, + Tex1DArrayU32FloatGrad, + Tex2DFloatS32, Tex2DFloatFloat, Tex2DFloatFloatLevel, Tex2DFloatFloatGrad, - Tex2DI32I32, - Tex2DI32Float, - Tex2DI32FloatLevel, - Tex2DI32FloatGrad, - Tex2DArrayFloatI32, + Tex2DS32S32, + Tex2DS32Float, + Tex2DS32FloatLevel, + Tex2DS32FloatGrad, + Tex2DU32S32, + Tex2DU32Float, + Tex2DU32FloatLevel, + Tex2DU32FloatGrad, + Tex2DArrayFloatS32, Tex2DArrayFloatFloat, Tex2DArrayFloatFloatLevel, Tex2DArrayFloatFloatGrad, - Tex2DArrayI32I32, - Tex2DArrayI32Float, - Tex2DArrayI32FloatLevel, - Tex2DArrayI32FloatGrad, - Tex3DFloatI32, + Tex2DArrayS32S32, + Tex2DArrayS32Float, + Tex2DArrayS32FloatLevel, + Tex2DArrayS32FloatGrad, + Tex2DArrayU32S32, + Tex2DArrayU32Float, + Tex2DArrayU32FloatLevel, + Tex2DArrayU32FloatGrad, + Tex3DFloatS32, Tex3DFloatFloat, Tex3DFloatFloatLevel, Tex3DFloatFloatGrad, - Tex3DI32I32, - Tex3DI32Float, - Tex3DI32FloatLevel, - Tex3DI32FloatGrad, + Tex3DS32S32, + Tex3DS32Float, + Tex3DS32FloatLevel, + Tex3DS32FloatGrad, + Tex3DU32S32, + Tex3DU32Float, + Tex3DU32FloatLevel, + Tex3DU32FloatGrad, + TexCubeFloatFloat, + TexCubeFloatFloatLevel, + TexCubeS32Float, + TexCubeS32FloatLevel, + TexCubeU32Float, + TexCubeU32FloatLevel, + TexCubeArrayFloatFloat, + TexCubeArrayFloatFloatLevel, + TexCubeArrayS32Float, + TexCubeArrayS32FloatLevel, + TexCubeArrayU32Float, + TexCubeArrayU32FloatLevel, + Tld4R2DFloatFloat, + Tld4G2DFloatFloat, + Tld4B2DFloatFloat, + Tld4A2DFloatFloat, + Tld4R2DS64Float, + Tld4G2DS64Float, + Tld4B2DS64Float, + Tld4A2DS64Float, + Tld4R2DU64Float, + Tld4G2DU64Float, + Tld4B2DU64Float, + Tld4A2DU64Float, + TexUnified1DFloatS32, + TexUnified1DFloatFloat, + TexUnified1DFloatFloatLevel, + TexUnified1DFloatFloatGrad, + TexUnified1DS32S32, + TexUnified1DS32Float, + TexUnified1DS32FloatLevel, + TexUnified1DS32FloatGrad, + TexUnified1DU32S32, + TexUnified1DU32Float, + TexUnified1DU32FloatLevel, + TexUnified1DU32FloatGrad, + TexUnified1DArrayFloatS32, + TexUnified1DArrayFloatFloat, + TexUnified1DArrayFloatFloatLevel, + TexUnified1DArrayFloatFloatGrad, + TexUnified1DArrayS32S32, + TexUnified1DArrayS32Float, + TexUnified1DArrayS32FloatLevel, + TexUnified1DArrayS32FloatGrad, + TexUnified1DArrayU32S32, + TexUnified1DArrayU32Float, + TexUnified1DArrayU32FloatLevel, + TexUnified1DArrayU32FloatGrad, + TexUnified2DFloatS32, + TexUnified2DFloatFloat, + TexUnified2DFloatFloatLevel, + TexUnified2DFloatFloatGrad, + TexUnified2DS32S32, + TexUnified2DS32Float, + TexUnified2DS32FloatLevel, + TexUnified2DS32FloatGrad, + TexUnified2DU32S32, + TexUnified2DU32Float, + TexUnified2DU32FloatLevel, + TexUnified2DU32FloatGrad, + TexUnified2DArrayFloatS32, + TexUnified2DArrayFloatFloat, + TexUnified2DArrayFloatFloatLevel, + TexUnified2DArrayFloatFloatGrad, + TexUnified2DArrayS32S32, + TexUnified2DArrayS32Float, + TexUnified2DArrayS32FloatLevel, + TexUnified2DArrayS32FloatGrad, + TexUnified2DArrayU32S32, + TexUnified2DArrayU32Float, + TexUnified2DArrayU32FloatLevel, + TexUnified2DArrayU32FloatGrad, + TexUnified3DFloatS32, + TexUnified3DFloatFloat, + TexUnified3DFloatFloatLevel, + TexUnified3DFloatFloatGrad, + TexUnified3DS32S32, + TexUnified3DS32Float, + TexUnified3DS32FloatLevel, + 
TexUnified3DS32FloatGrad, + TexUnified3DU32S32, + TexUnified3DU32Float, + TexUnified3DU32FloatLevel, + TexUnified3DU32FloatGrad, + TexUnifiedCubeFloatFloat, + TexUnifiedCubeFloatFloatLevel, + TexUnifiedCubeS32Float, + TexUnifiedCubeS32FloatLevel, + TexUnifiedCubeU32Float, + TexUnifiedCubeU32FloatLevel, + TexUnifiedCubeArrayFloatFloat, + TexUnifiedCubeArrayFloatFloatLevel, + TexUnifiedCubeArrayS32Float, + TexUnifiedCubeArrayS32FloatLevel, + TexUnifiedCubeArrayU32Float, + TexUnifiedCubeArrayU32FloatLevel, + Tld4UnifiedR2DFloatFloat, + Tld4UnifiedG2DFloatFloat, + Tld4UnifiedB2DFloatFloat, + Tld4UnifiedA2DFloatFloat, + Tld4UnifiedR2DS64Float, + Tld4UnifiedG2DS64Float, + Tld4UnifiedB2DS64Float, + Tld4UnifiedA2DS64Float, + Tld4UnifiedR2DU64Float, + Tld4UnifiedG2DU64Float, + Tld4UnifiedB2DU64Float, + Tld4UnifiedA2DU64Float, // Surface intrinsics + Suld1DI8Clamp, + Suld1DI16Clamp, + Suld1DI32Clamp, + Suld1DI64Clamp, + Suld1DV2I8Clamp, + Suld1DV2I16Clamp, + Suld1DV2I32Clamp, + Suld1DV2I64Clamp, + Suld1DV4I8Clamp, + Suld1DV4I16Clamp, + Suld1DV4I32Clamp, + + Suld1DArrayI8Clamp, + Suld1DArrayI16Clamp, + Suld1DArrayI32Clamp, + Suld1DArrayI64Clamp, + Suld1DArrayV2I8Clamp, + Suld1DArrayV2I16Clamp, + Suld1DArrayV2I32Clamp, + Suld1DArrayV2I64Clamp, + Suld1DArrayV4I8Clamp, + Suld1DArrayV4I16Clamp, + Suld1DArrayV4I32Clamp, + + Suld2DI8Clamp, + Suld2DI16Clamp, + Suld2DI32Clamp, + Suld2DI64Clamp, + Suld2DV2I8Clamp, + Suld2DV2I16Clamp, + Suld2DV2I32Clamp, + Suld2DV2I64Clamp, + Suld2DV4I8Clamp, + Suld2DV4I16Clamp, + Suld2DV4I32Clamp, + + Suld2DArrayI8Clamp, + Suld2DArrayI16Clamp, + Suld2DArrayI32Clamp, + Suld2DArrayI64Clamp, + Suld2DArrayV2I8Clamp, + Suld2DArrayV2I16Clamp, + Suld2DArrayV2I32Clamp, + Suld2DArrayV2I64Clamp, + Suld2DArrayV4I8Clamp, + Suld2DArrayV4I16Clamp, + Suld2DArrayV4I32Clamp, + + Suld3DI8Clamp, + Suld3DI16Clamp, + Suld3DI32Clamp, + Suld3DI64Clamp, + Suld3DV2I8Clamp, + Suld3DV2I16Clamp, + Suld3DV2I32Clamp, + Suld3DV2I64Clamp, + Suld3DV4I8Clamp, + Suld3DV4I16Clamp, + Suld3DV4I32Clamp, + Suld1DI8Trap, Suld1DI16Trap, Suld1DI32Trap, + Suld1DI64Trap, Suld1DV2I8Trap, Suld1DV2I16Trap, Suld1DV2I32Trap, + Suld1DV2I64Trap, Suld1DV4I8Trap, Suld1DV4I16Trap, Suld1DV4I32Trap, @@ -128,9 +322,11 @@ enum NodeType { Suld1DArrayI8Trap, Suld1DArrayI16Trap, Suld1DArrayI32Trap, + Suld1DArrayI64Trap, Suld1DArrayV2I8Trap, Suld1DArrayV2I16Trap, Suld1DArrayV2I32Trap, + Suld1DArrayV2I64Trap, Suld1DArrayV4I8Trap, Suld1DArrayV4I16Trap, Suld1DArrayV4I32Trap, @@ -138,9 +334,11 @@ enum NodeType { Suld2DI8Trap, Suld2DI16Trap, Suld2DI32Trap, + Suld2DI64Trap, Suld2DV2I8Trap, Suld2DV2I16Trap, Suld2DV2I32Trap, + Suld2DV2I64Trap, Suld2DV4I8Trap, Suld2DV4I16Trap, Suld2DV4I32Trap, @@ -148,9 +346,11 @@ enum NodeType { Suld2DArrayI8Trap, Suld2DArrayI16Trap, Suld2DArrayI32Trap, + Suld2DArrayI64Trap, Suld2DArrayV2I8Trap, Suld2DArrayV2I16Trap, Suld2DArrayV2I32Trap, + Suld2DArrayV2I64Trap, Suld2DArrayV4I8Trap, Suld2DArrayV4I16Trap, Suld2DArrayV4I32Trap, @@ -158,15 +358,79 @@ enum NodeType { Suld3DI8Trap, Suld3DI16Trap, Suld3DI32Trap, + Suld3DI64Trap, Suld3DV2I8Trap, Suld3DV2I16Trap, Suld3DV2I32Trap, + Suld3DV2I64Trap, Suld3DV4I8Trap, Suld3DV4I16Trap, - Suld3DV4I32Trap + Suld3DV4I32Trap, + + Suld1DI8Zero, + Suld1DI16Zero, + Suld1DI32Zero, + Suld1DI64Zero, + Suld1DV2I8Zero, + Suld1DV2I16Zero, + Suld1DV2I32Zero, + Suld1DV2I64Zero, + Suld1DV4I8Zero, + Suld1DV4I16Zero, + Suld1DV4I32Zero, + + Suld1DArrayI8Zero, + Suld1DArrayI16Zero, + Suld1DArrayI32Zero, + Suld1DArrayI64Zero, + Suld1DArrayV2I8Zero, + Suld1DArrayV2I16Zero, + Suld1DArrayV2I32Zero, 
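+  // The Clamp/Trap/Zero suffixes on these Suld opcodes correspond to the PTX
+  // suld out-of-range qualifiers: .clamp clamps coordinates into the surface,
+  // .trap raises a trap on an out-of-range access, and .zero makes such
+  // loads return zero.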
+  Suld1DArrayV2I64Zero,
+  Suld1DArrayV4I8Zero,
+  Suld1DArrayV4I16Zero,
+  Suld1DArrayV4I32Zero,
+
+  Suld2DI8Zero,
+  Suld2DI16Zero,
+  Suld2DI32Zero,
+  Suld2DI64Zero,
+  Suld2DV2I8Zero,
+  Suld2DV2I16Zero,
+  Suld2DV2I32Zero,
+  Suld2DV2I64Zero,
+  Suld2DV4I8Zero,
+  Suld2DV4I16Zero,
+  Suld2DV4I32Zero,
+
+  Suld2DArrayI8Zero,
+  Suld2DArrayI16Zero,
+  Suld2DArrayI32Zero,
+  Suld2DArrayI64Zero,
+  Suld2DArrayV2I8Zero,
+  Suld2DArrayV2I16Zero,
+  Suld2DArrayV2I32Zero,
+  Suld2DArrayV2I64Zero,
+  Suld2DArrayV4I8Zero,
+  Suld2DArrayV4I16Zero,
+  Suld2DArrayV4I32Zero,
+
+  Suld3DI8Zero,
+  Suld3DI16Zero,
+  Suld3DI32Zero,
+  Suld3DI64Zero,
+  Suld3DV2I8Zero,
+  Suld3DV2I16Zero,
+  Suld3DV2I32Zero,
+  Suld3DV2I64Zero,
+  Suld3DV4I8Zero,
+  Suld3DV4I16Zero,
+  Suld3DV4I32Zero
 };
 }
 
+class NVPTXSubtarget;
+
 //===--------------------------------------------------------------------===//
 // TargetLowering Implementation
 //===--------------------------------------------------------------------===//
@@ -196,9 +460,9 @@ class NVPTXTargetLowering : public TargetLowering {
   /// getFunctionAlignment - Return the Log2 alignment of this function.
   unsigned getFunctionAlignment(const Function *F) const;
 
-  EVT getSetCCResultType(LLVMContext &, EVT VT) const override {
+  EVT getSetCCResultType(LLVMContext &Ctx, EVT VT) const override {
     if (VT.isVector())
-      return MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
+      return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
     return MVT::i1;
   }
 
@@ -236,7 +500,14 @@ class NVPTXTargetLowering : public TargetLowering {
   // PTX always uses 32-bit shift amounts
   MVT getScalarShiftAmountTy(EVT LHSTy) const override { return MVT::i32; }
 
-  bool shouldSplitVectorType(EVT VT) const override;
+  TargetLoweringBase::LegalizeTypeAction
+  getPreferredVectorAction(EVT VT) const override;
+
+  bool allowFMA(MachineFunction &MF, CodeGenOpt::Level OptLevel) const;
+
+  virtual bool isFMAFasterThanFMulAndFAdd(EVT) const {
+    return true;
+  }
 
 private:
   const NVPTXSubtarget &nvptxSubtarget;  // cache the subtarget here
@@ -255,8 +526,12 @@ class NVPTXTargetLowering : public TargetLowering {
   SDValue LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const;
 
+  SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const;
+
   void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
                           SelectionDAG &DAG) const override;
+  SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
 
   unsigned getArgumentAlignment(SDValue Callee, const ImmutableCallSite *CS,
                                 Type *Ty, unsigned Idx) const;
diff --git a/lib/Target/NVPTX/NVPTXInstrFormats.td b/lib/Target/NVPTX/NVPTXInstrFormats.td
index f11f1b8f96fc..ffcb5d5273a2 100644
--- a/lib/Target/NVPTX/NVPTXInstrFormats.td
+++ b/lib/Target/NVPTX/NVPTXInstrFormats.td
@@ -36,8 +36,24 @@ class NVPTXInst<dag outs, dag ins, string asmstr, list<dag> pattern>
   bit IsLoad = 0;
   bit IsStore = 0;
 
-  let TSFlags{3-0} = VecInstType;
-  let TSFlags{4-4} = IsSimpleMove;
-  let TSFlags{5-5} = IsLoad;
-  let TSFlags{6-6} = IsStore;
+  bit IsTex = 0;
+  bit IsSust = 0;
+  bit IsSurfTexQuery = 0;
+  bit IsTexModeUnified = 0;
+
+  // The following field is encoded as one plus the log2 of the vector size
+  // (i.e. the vector size is 2**(IsSuld-1)), with 0 meaning the operation
+  // is not a surface instruction.  For example, if IsSuld == 2, then the
+  // instruction is a suld instruction with vector size 2**(2-1) = 2.
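+  // A decode sketch of this encoding (hypothetical helpers, not part of
+  // this patch), given that IsSuld is placed in TSFlags{9-8} below:
+  //   bool isSuldInstr(unsigned TSFlags)     { return (TSFlags >> 8) & 0x3; }
+  //   unsigned suldVecSize(unsigned TSFlags) {
+  //     return 1 << (((TSFlags >> 8) & 0x3) - 1);  // IsSuld 1/2/3 -> 1/2/4
+  //   }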
+ bits<2> IsSuld = 0; + + let TSFlags{3-0} = VecInstType; + let TSFlags{4-4} = IsSimpleMove; + let TSFlags{5-5} = IsLoad; + let TSFlags{6-6} = IsStore; + let TSFlags{7} = IsTex; + let TSFlags{9-8} = IsSuld; + let TSFlags{10} = IsSust; + let TSFlags{11} = IsSurfTexQuery; + let TSFlags{12} = IsTexModeUnified; } diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.cpp b/lib/Target/NVPTX/NVPTXInstrInfo.cpp index cdc80887dc28..b5b4fbed0799 100644 --- a/lib/Target/NVPTX/NVPTXInstrInfo.cpp +++ b/lib/Target/NVPTX/NVPTXInstrInfo.cpp @@ -29,8 +29,8 @@ using namespace llvm; void NVPTXInstrInfo::anchor() {} // FIXME: Add the subtarget support on this constructor. -NVPTXInstrInfo::NVPTXInstrInfo(NVPTXTargetMachine &tm) - : NVPTXGenInstrInfo(), TM(tm), RegInfo(*TM.getSubtargetImpl()) {} +NVPTXInstrInfo::NVPTXInstrInfo(NVPTXSubtarget &STI) + : NVPTXGenInstrInfo(), RegInfo(STI) {} void NVPTXInstrInfo::copyPhysReg( MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL, diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.h b/lib/Target/NVPTX/NVPTXInstrInfo.h index 88a9e45f25f5..2ac29748676a 100644 --- a/lib/Target/NVPTX/NVPTXInstrInfo.h +++ b/lib/Target/NVPTX/NVPTXInstrInfo.h @@ -24,11 +24,10 @@ namespace llvm { class NVPTXInstrInfo : public NVPTXGenInstrInfo { - NVPTXTargetMachine &TM; const NVPTXRegisterInfo RegInfo; virtual void anchor(); public: - explicit NVPTXInstrInfo(NVPTXTargetMachine &TM); + explicit NVPTXInstrInfo(NVPTXSubtarget &STI); const NVPTXRegisterInfo &getRegisterInfo() const { return RegInfo; } diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.td b/lib/Target/NVPTX/NVPTXInstrInfo.td index fbcd0e4a358f..9900b8c8433f 100644 --- a/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -139,17 +139,10 @@ def hasGenericLdSt : Predicate<"Subtarget.hasGenericLdSt()">; def doF32FTZ : Predicate<"useF32FTZ()">; def doNoF32FTZ : Predicate<"!useF32FTZ()">; -def doFMAF32 : Predicate<"doFMAF32">; -def doFMAF32_ftz : Predicate<"(doFMAF32 && useF32FTZ())">; -def doFMAF32AGG : Predicate<"doFMAF32AGG">; -def doFMAF32AGG_ftz : Predicate<"(doFMAF32AGG && useF32FTZ())">; -def doFMAF64 : Predicate<"doFMAF64">; -def doFMAF64AGG : Predicate<"doFMAF64AGG">; - def doMulWide : Predicate<"doMulWide">; -def allowFMA : Predicate<"allowFMA">; -def allowFMA_ftz : Predicate<"(allowFMA && useF32FTZ())">; +def allowFMA : Predicate<"allowFMA()">; +def noFMA : Predicate<"!allowFMA()">; def do_DIVF32_APPROX : Predicate<"getDivF32Level()==0">; def do_DIVF32_FULL : Predicate<"getDivF32Level()==1">; @@ -158,9 +151,12 @@ def do_SQRTF32_APPROX : Predicate<"!usePrecSqrtF32()">; def do_SQRTF32_RN : Predicate<"usePrecSqrtF32()">; def hasHWROT32 : Predicate<"Subtarget.hasHWROT32()">; +def noHWROT32 : Predicate<"!Subtarget.hasHWROT32()">; def true : Predicate<"1">; +def hasPTX31 : Predicate<"Subtarget.getPTXVersion() >= 31">; + //===----------------------------------------------------------------------===// // Some Common Instruction Class Templates @@ -219,13 +215,13 @@ multiclass F3 { !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"), [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>, - Requires<[allowFMA_ftz]>; + Requires<[allowFMA, doF32FTZ]>; def f32ri_ftz : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b), !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"), [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>, - Requires<[allowFMA_ftz]>; + Requires<[allowFMA, doF32FTZ]>; def f32rr : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b), !strconcat(OpcStr, 
".f32 \t$dst, $a, $b;"),
@@ -245,34 +241,38 @@ multiclass F3_rn<string OpcStr, SDNode OpNode> {
                      (ins Float64Regs:$a, Float64Regs:$b),
                      !strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"),
                      [(set Float64Regs:$dst,
-                       (OpNode Float64Regs:$a, Float64Regs:$b))]>;
+                       (OpNode Float64Regs:$a, Float64Regs:$b))]>,
+                     Requires<[noFMA]>;
   def f64ri : NVPTXInst<(outs Float64Regs:$dst),
                      (ins Float64Regs:$a, f64imm:$b),
                      !strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"),
                      [(set Float64Regs:$dst,
-                       (OpNode Float64Regs:$a, fpimm:$b))]>;
+                       (OpNode Float64Regs:$a, fpimm:$b))]>,
+                     Requires<[noFMA]>;
   def f32rr_ftz : NVPTXInst<(outs Float32Regs:$dst),
                      (ins Float32Regs:$a, Float32Regs:$b),
                      !strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"),
                      [(set Float32Regs:$dst,
                        (OpNode Float32Regs:$a, Float32Regs:$b))]>,
-                     Requires<[doF32FTZ]>;
+                     Requires<[noFMA, doF32FTZ]>;
   def f32ri_ftz : NVPTXInst<(outs Float32Regs:$dst),
                      (ins Float32Regs:$a, f32imm:$b),
                      !strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"),
                      [(set Float32Regs:$dst,
                        (OpNode Float32Regs:$a, fpimm:$b))]>,
-                     Requires<[doF32FTZ]>;
+                     Requires<[noFMA, doF32FTZ]>;
   def f32rr : NVPTXInst<(outs Float32Regs:$dst),
                      (ins Float32Regs:$a, Float32Regs:$b),
                      !strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"),
                      [(set Float32Regs:$dst,
-                       (OpNode Float32Regs:$a, Float32Regs:$b))]>;
+                       (OpNode Float32Regs:$a, Float32Regs:$b))]>,
+                     Requires<[noFMA]>;
   def f32ri : NVPTXInst<(outs Float32Regs:$dst),
                      (ins Float32Regs:$a, f32imm:$b),
                      !strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"),
                      [(set Float32Regs:$dst,
-                       (OpNode Float32Regs:$a, fpimm:$b))]>;
+                       (OpNode Float32Regs:$a, fpimm:$b))]>,
+                     Requires<[noFMA]>;
 }
 
 multiclass F2<string OpcStr, SDNode OpNode> {
@@ -461,33 +461,45 @@ def SHL2MUL16 : SDNodeXForm<imm, [{
   return CurDAG->getTargetConstant(temp.shl(v), MVT::i16);
 }]>;
 
-def MULWIDES64 : NVPTXInst<(outs Int64Regs:$dst),
-                           (ins Int32Regs:$a, Int32Regs:$b),
+def MULWIDES64
+  : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
+              "mul.wide.s32 \t$dst, $a, $b;", []>;
+def MULWIDES64Imm
+  : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
               "mul.wide.s32 \t$dst, $a, $b;", []>;
-def MULWIDES64Imm : NVPTXInst<(outs Int64Regs:$dst),
-                              (ins Int32Regs:$a, i64imm:$b),
+def MULWIDES64Imm64
+  : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b),
               "mul.wide.s32 \t$dst, $a, $b;", []>;
-def MULWIDEU64 : NVPTXInst<(outs Int64Regs:$dst),
-                           (ins Int32Regs:$a, Int32Regs:$b),
+def MULWIDEU64
+  : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
+              "mul.wide.u32 \t$dst, $a, $b;", []>;
+def MULWIDEU64Imm
+  : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
               "mul.wide.u32 \t$dst, $a, $b;", []>;
-def MULWIDEU64Imm : NVPTXInst<(outs Int64Regs:$dst),
-                              (ins Int32Regs:$a, i64imm:$b),
+def MULWIDEU64Imm64
+  : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b),
               "mul.wide.u32 \t$dst, $a, $b;", []>;
-def MULWIDES32 : NVPTXInst<(outs Int32Regs:$dst),
-                           (ins Int16Regs:$a, Int16Regs:$b),
+def MULWIDES32
+  : NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
               "mul.wide.s16 \t$dst, $a, $b;", []>;
-def MULWIDES32Imm : NVPTXInst<(outs Int32Regs:$dst),
-                              (ins Int16Regs:$a, i32imm:$b),
+def MULWIDES32Imm
+  : NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
+              "mul.wide.s16 \t$dst, $a, $b;", []>;
+def MULWIDES32Imm32
+  : NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
               "mul.wide.s16 \t$dst, $a, $b;", []>;
-def MULWIDEU32 : NVPTXInst<(outs Int32Regs:$dst),
-                           (ins Int16Regs:$a, Int16Regs:$b),
-                           "mul.wide.u16 \t$dst, $a, $b;", []>;
-def MULWIDEU32Imm : NVPTXInst<(outs Int32Regs:$dst),
-                              (ins Int16Regs:$a, i32imm:$b),
+def MULWIDEU32 + : NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b), + "mul.wide.u16 \t$dst, $a, $b;", []>; +def MULWIDEU32Imm + : NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b), "mul.wide.u16 \t$dst, $a, $b;", []>; +def MULWIDEU32Imm32 + : NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b), + "mul.wide.u16 \t$dst, $a, $b;", []>; def : Pat<(shl (sext Int32Regs:$a), (i32 Int5Const:$b)), (MULWIDES64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>, @@ -507,25 +519,63 @@ def : Pat<(mul (sext Int32Regs:$a), (sext Int32Regs:$b)), (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>, Requires<[doMulWide]>; def : Pat<(mul (sext Int32Regs:$a), (i64 SInt32Const:$b)), - (MULWIDES64Imm Int32Regs:$a, (i64 SInt32Const:$b))>, + (MULWIDES64Imm64 Int32Regs:$a, (i64 SInt32Const:$b))>, Requires<[doMulWide]>; def : Pat<(mul (zext Int32Regs:$a), (zext Int32Regs:$b)), - (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>, Requires<[doMulWide]>; + (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>, + Requires<[doMulWide]>; def : Pat<(mul (zext Int32Regs:$a), (i64 UInt32Const:$b)), - (MULWIDEU64Imm Int32Regs:$a, (i64 UInt32Const:$b))>, + (MULWIDEU64Imm64 Int32Regs:$a, (i64 UInt32Const:$b))>, Requires<[doMulWide]>; def : Pat<(mul (sext Int16Regs:$a), (sext Int16Regs:$b)), - (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>, Requires<[doMulWide]>; + (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>, + Requires<[doMulWide]>; def : Pat<(mul (sext Int16Regs:$a), (i32 SInt16Const:$b)), - (MULWIDES32Imm Int16Regs:$a, (i32 SInt16Const:$b))>, + (MULWIDES32Imm32 Int16Regs:$a, (i32 SInt16Const:$b))>, Requires<[doMulWide]>; def : Pat<(mul (zext Int16Regs:$a), (zext Int16Regs:$b)), - (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>, Requires<[doMulWide]>; + (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>, + Requires<[doMulWide]>; def : Pat<(mul (zext Int16Regs:$a), (i32 UInt16Const:$b)), - (MULWIDEU32Imm Int16Regs:$a, (i32 UInt16Const:$b))>, + (MULWIDEU32Imm32 Int16Regs:$a, (i32 UInt16Const:$b))>, + Requires<[doMulWide]>; + + +def SDTMulWide + : SDTypeProfile<1, 2, [SDTCisSameAs<1, 2>]>; +def mul_wide_signed + : SDNode<"NVPTXISD::MUL_WIDE_SIGNED", SDTMulWide>; +def mul_wide_unsigned + : SDNode<"NVPTXISD::MUL_WIDE_UNSIGNED", SDTMulWide>; + +def : Pat<(i32 (mul_wide_signed Int16Regs:$a, Int16Regs:$b)), + (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>, + Requires<[doMulWide]>; +def : Pat<(i32 (mul_wide_signed Int16Regs:$a, imm:$b)), + (MULWIDES32Imm Int16Regs:$a, imm:$b)>, + Requires<[doMulWide]>; +def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, Int16Regs:$b)), + (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>, + Requires<[doMulWide]>; +def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, imm:$b)), + (MULWIDEU32Imm Int16Regs:$a, imm:$b)>, + Requires<[doMulWide]>; + + +def : Pat<(i64 (mul_wide_signed Int32Regs:$a, Int32Regs:$b)), + (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>, + Requires<[doMulWide]>; +def : Pat<(i64 (mul_wide_signed Int32Regs:$a, imm:$b)), + (MULWIDES64Imm Int32Regs:$a, imm:$b)>, + Requires<[doMulWide]>; +def : Pat<(i64 (mul_wide_unsigned Int32Regs:$a, Int32Regs:$b)), + (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>, + Requires<[doMulWide]>; +def : Pat<(i64 (mul_wide_unsigned Int32Regs:$a, imm:$b)), + (MULWIDEU64Imm Int32Regs:$a, imm:$b)>, Requires<[doMulWide]>; defm MULT : I3<"mul.lo.s", mul>; @@ -541,69 +591,75 @@ defm SREM : I3<"rem.s", srem>; defm UREM : I3<"rem.u", urem>; // The ri version will not be selected as DAGCombiner::visitUREM will lower it. 
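The combines feeding these patterns (TryMULWIDECombine, reached from both ISD::MUL and ISD::SHL in PerformDAGCombine above, and the imad node defined next) rest on two arithmetic identities: a full-width multiply whose operands are sign- or zero-extended from half width equals a widening multiply, and a left shift by a constant is a multiply by a power of two. A minimal standalone C++ sketch of those identities, with illustrative helper names that are not part of this patch:

#include <cassert>
#include <cstdint>

// mul.wide.s32 / mul.wide.u32 semantics: a 32x32 -> 64-bit widening multiply.
static int64_t mul_wide_s32(int32_t a, int32_t b) {
  return static_cast<int64_t>(a) * static_cast<int64_t>(b);
}
static uint64_t mul_wide_u32(uint32_t a, uint32_t b) {
  return static_cast<uint64_t>(a) * static_cast<uint64_t>(b);
}
// mad.lo semantics (the IMAD node below): low half of a*b, plus c.
static int32_t imad_lo_s32(int32_t a, int32_t b, int32_t c) {
  return static_cast<int32_t>(static_cast<int64_t>(a) * b + c);
}

int main() {
  int32_t x = -123456;
  // mul (sext a), (sext b)  ==>  mul.wide.s32 a, b
  assert(static_cast<int64_t>(x) * 8 == mul_wide_s32(x, 8));
  // shl (zext a), 1  ==  mul (zext a), 2  ==>  mul.wide.u32 a, 2, which is
  // why PerformSHLCombine funnels into the same mul.wide combine.
  assert((static_cast<uint64_t>(0xFFFFFFFFu) << 1) ==
         mul_wide_u32(0xFFFFFFFFu, 2));
  // add (mul a, b), c  ==>  imad a, b, c  (mad.lo.s32)
  assert(7 * 6 + 5 == imad_lo_s32(7, 6, 5));
  return 0;
}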
+def SDTIMAD + : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisInt<0>, + SDTCisInt<2>, SDTCisSameAs<0, 2>, + SDTCisSameAs<0, 3>]>; +def imad + : SDNode<"NVPTXISD::IMAD", SDTIMAD>; + def MAD16rrr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b, Int16Regs:$c), "mad.lo.s16 \t$dst, $a, $b, $c;", - [(set Int16Regs:$dst, (add - (mul Int16Regs:$a, Int16Regs:$b), Int16Regs:$c))]>; + [(set Int16Regs:$dst, + (imad Int16Regs:$a, Int16Regs:$b, Int16Regs:$c))]>; def MAD16rri : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b, i16imm:$c), "mad.lo.s16 \t$dst, $a, $b, $c;", - [(set Int16Regs:$dst, (add - (mul Int16Regs:$a, Int16Regs:$b), imm:$c))]>; + [(set Int16Regs:$dst, + (imad Int16Regs:$a, Int16Regs:$b, imm:$c))]>; def MAD16rir : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b, Int16Regs:$c), "mad.lo.s16 \t$dst, $a, $b, $c;", - [(set Int16Regs:$dst, (add - (mul Int16Regs:$a, imm:$b), Int16Regs:$c))]>; + [(set Int16Regs:$dst, + (imad Int16Regs:$a, imm:$b, Int16Regs:$c))]>; def MAD16rii : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b, i16imm:$c), "mad.lo.s16 \t$dst, $a, $b, $c;", - [(set Int16Regs:$dst, (add (mul Int16Regs:$a, imm:$b), - imm:$c))]>; + [(set Int16Regs:$dst, + (imad Int16Regs:$a, imm:$b, imm:$c))]>; def MAD32rrr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b, Int32Regs:$c), "mad.lo.s32 \t$dst, $a, $b, $c;", - [(set Int32Regs:$dst, (add - (mul Int32Regs:$a, Int32Regs:$b), Int32Regs:$c))]>; + [(set Int32Regs:$dst, + (imad Int32Regs:$a, Int32Regs:$b, Int32Regs:$c))]>; def MAD32rri : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b, i32imm:$c), "mad.lo.s32 \t$dst, $a, $b, $c;", - [(set Int32Regs:$dst, (add - (mul Int32Regs:$a, Int32Regs:$b), imm:$c))]>; + [(set Int32Regs:$dst, + (imad Int32Regs:$a, Int32Regs:$b, imm:$c))]>; def MAD32rir : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b, Int32Regs:$c), "mad.lo.s32 \t$dst, $a, $b, $c;", - [(set Int32Regs:$dst, (add - (mul Int32Regs:$a, imm:$b), Int32Regs:$c))]>; + [(set Int32Regs:$dst, + (imad Int32Regs:$a, imm:$b, Int32Regs:$c))]>; def MAD32rii : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b, i32imm:$c), "mad.lo.s32 \t$dst, $a, $b, $c;", - [(set Int32Regs:$dst, (add - (mul Int32Regs:$a, imm:$b), imm:$c))]>; + [(set Int32Regs:$dst, + (imad Int32Regs:$a, imm:$b, imm:$c))]>; def MAD64rrr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b, Int64Regs:$c), "mad.lo.s64 \t$dst, $a, $b, $c;", - [(set Int64Regs:$dst, (add - (mul Int64Regs:$a, Int64Regs:$b), Int64Regs:$c))]>; + [(set Int64Regs:$dst, + (imad Int64Regs:$a, Int64Regs:$b, Int64Regs:$c))]>; def MAD64rri : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b, i64imm:$c), "mad.lo.s64 \t$dst, $a, $b, $c;", - [(set Int64Regs:$dst, (add - (mul Int64Regs:$a, Int64Regs:$b), imm:$c))]>; + [(set Int64Regs:$dst, + (imad Int64Regs:$a, Int64Regs:$b, imm:$c))]>; def MAD64rir : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b, Int64Regs:$c), "mad.lo.s64 \t$dst, $a, $b, $c;", - [(set Int64Regs:$dst, (add - (mul Int64Regs:$a, imm:$b), Int64Regs:$c))]>; + [(set Int64Regs:$dst, + (imad Int64Regs:$a, imm:$b, Int64Regs:$c))]>; def MAD64rii : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b, i64imm:$c), "mad.lo.s64 \t$dst, $a, $b, $c;", - [(set Int64Regs:$dst, (add - (mul Int64Regs:$a, imm:$b), imm:$c))]>; - + [(set Int64Regs:$dst, + (imad Int64Regs:$a, imm:$b, imm:$c))]>; def INEG16 : NVPTXInst<(outs Int16Regs:$dst), (ins 
Int16Regs:$src), "neg.s16 \t$dst, $src;", @@ -809,36 +865,26 @@ multiclass FPCONTRACT32 { def rrr : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b, Float32Regs:$c), !strconcat(OpcStr, " \t$dst, $a, $b, $c;"), - [(set Float32Regs:$dst, (fadd - (fmul Float32Regs:$a, Float32Regs:$b), - Float32Regs:$c))]>, Requires<[Pred]>; - // This is to WAR a weird bug in Tablegen that does not automatically - // generate the following permutated rule rrr2 from the above rrr. - // So we explicitly add it here. This happens to FMA32 only. - // See the comments at FMAD32 and FMA32 for more information. - def rrr2 : NVPTXInst<(outs Float32Regs:$dst), - (ins Float32Regs:$a, Float32Regs:$b, Float32Regs:$c), - !strconcat(OpcStr, " \t$dst, $a, $b, $c;"), - [(set Float32Regs:$dst, (fadd Float32Regs:$c, - (fmul Float32Regs:$a, Float32Regs:$b)))]>, + [(set Float32Regs:$dst, + (fma Float32Regs:$a, Float32Regs:$b, Float32Regs:$c))]>, Requires<[Pred]>; def rri : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b, f32imm:$c), !strconcat(OpcStr, " \t$dst, $a, $b, $c;"), - [(set Float32Regs:$dst, (fadd - (fmul Float32Regs:$a, Float32Regs:$b), fpimm:$c))]>, + [(set Float32Regs:$dst, + (fma Float32Regs:$a, Float32Regs:$b, fpimm:$c))]>, Requires<[Pred]>; def rir : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b, Float32Regs:$c), !strconcat(OpcStr, " \t$dst, $a, $b, $c;"), - [(set Float32Regs:$dst, (fadd - (fmul Float32Regs:$a, fpimm:$b), Float32Regs:$c))]>, + [(set Float32Regs:$dst, + (fma Float32Regs:$a, fpimm:$b, Float32Regs:$c))]>, Requires<[Pred]>; def rii : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b, f32imm:$c), !strconcat(OpcStr, " \t$dst, $a, $b, $c;"), - [(set Float32Regs:$dst, (fadd - (fmul Float32Regs:$a, fpimm:$b), fpimm:$c))]>, + [(set Float32Regs:$dst, + (fma Float32Regs:$a, fpimm:$b, fpimm:$c))]>, Requires<[Pred]>; } @@ -846,73 +892,32 @@ multiclass FPCONTRACT64 { def rrr : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a, Float64Regs:$b, Float64Regs:$c), !strconcat(OpcStr, " \t$dst, $a, $b, $c;"), - [(set Float64Regs:$dst, (fadd - (fmul Float64Regs:$a, Float64Regs:$b), - Float64Regs:$c))]>, Requires<[Pred]>; + [(set Float64Regs:$dst, + (fma Float64Regs:$a, Float64Regs:$b, Float64Regs:$c))]>, + Requires<[Pred]>; def rri : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a, Float64Regs:$b, f64imm:$c), !strconcat(OpcStr, " \t$dst, $a, $b, $c;"), - [(set Float64Regs:$dst, (fadd (fmul Float64Regs:$a, - Float64Regs:$b), fpimm:$c))]>, Requires<[Pred]>; + [(set Float64Regs:$dst, + (fma Float64Regs:$a, Float64Regs:$b, fpimm:$c))]>, + Requires<[Pred]>; def rir : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a, f64imm:$b, Float64Regs:$c), !strconcat(OpcStr, " \t$dst, $a, $b, $c;"), - [(set Float64Regs:$dst, (fadd - (fmul Float64Regs:$a, fpimm:$b), Float64Regs:$c))]>, + [(set Float64Regs:$dst, + (fma Float64Regs:$a, fpimm:$b, Float64Regs:$c))]>, Requires<[Pred]>; def rii : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a, f64imm:$b, f64imm:$c), !strconcat(OpcStr, " \t$dst, $a, $b, $c;"), - [(set Float64Regs:$dst, (fadd - (fmul Float64Regs:$a, fpimm:$b), fpimm:$c))]>, + [(set Float64Regs:$dst, + (fma Float64Regs:$a, fpimm:$b, fpimm:$c))]>, Requires<[Pred]>; } -// Due to a unknown reason (most likely a bug in tablegen), tablegen does not -// automatically generate the rrr2 rule from -// the rrr rule (see FPCONTRACT32) for FMA32, though it does for FMAD32. 
-// If we reverse the order of the following two lines, then rrr2 rule will be -// generated for FMA32, but not for rrr. -// Therefore, we manually write the rrr2 rule in FPCONTRACT32. -defm FMA32_ftz : FPCONTRACT32<"fma.rn.ftz.f32", doFMAF32_ftz>; -defm FMA32 : FPCONTRACT32<"fma.rn.f32", doFMAF32>; -defm FMA64 : FPCONTRACT64<"fma.rn.f64", doFMAF64>; - -// b*c-a => fmad(b, c, -a) -multiclass FPCONTRACT32_SUB_PAT_MAD { - def : Pat<(fsub (fmul Float32Regs:$b, Float32Regs:$c), Float32Regs:$a), - (Inst Float32Regs:$b, Float32Regs:$c, (FNEGf32 Float32Regs:$a))>, - Requires<[Pred]>; -} - -// a-b*c => fmad(-b,c, a) -// - legal because a-b*c <=> a+(-b*c) <=> a+(-b)*c -// b*c-a => fmad(b, c, -a) -// - legal because b*c-a <=> b*c+(-a) -multiclass FPCONTRACT32_SUB_PAT { - def : Pat<(fsub Float32Regs:$a, (fmul Float32Regs:$b, Float32Regs:$c)), - (Inst (FNEGf32 Float32Regs:$b), Float32Regs:$c, Float32Regs:$a)>, - Requires<[Pred]>; - def : Pat<(fsub (fmul Float32Regs:$b, Float32Regs:$c), Float32Regs:$a), - (Inst Float32Regs:$b, Float32Regs:$c, (FNEGf32 Float32Regs:$a))>, - Requires<[Pred]>; -} - -// a-b*c => fmad(-b,c, a) -// b*c-a => fmad(b, c, -a) -multiclass FPCONTRACT64_SUB_PAT { - def : Pat<(fsub Float64Regs:$a, (fmul Float64Regs:$b, Float64Regs:$c)), - (Inst (FNEGf64 Float64Regs:$b), Float64Regs:$c, Float64Regs:$a)>, - Requires<[Pred]>; - - def : Pat<(fsub (fmul Float64Regs:$b, Float64Regs:$c), Float64Regs:$a), - (Inst Float64Regs:$b, Float64Regs:$c, (FNEGf64 Float64Regs:$a))>, - Requires<[Pred]>; -} - -defm FMAF32ext_ftz : FPCONTRACT32_SUB_PAT; -defm FMAF32ext : FPCONTRACT32_SUB_PAT; -defm FMAF64ext : FPCONTRACT64_SUB_PAT; +defm FMA32_ftz : FPCONTRACT32<"fma.rn.ftz.f32", doF32FTZ>; +defm FMA32 : FPCONTRACT32<"fma.rn.f32", true>; +defm FMA64 : FPCONTRACT64<"fma.rn.f64", true>; def SINF: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src), "sin.approx.f32 \t$dst, $src;", @@ -1083,6 +1088,43 @@ multiclass RSHIFT_FORMAT { defm SRA : RSHIFT_FORMAT<"shr.s", sra>; defm SRL : RSHIFT_FORMAT<"shr.u", srl>; +// +// Rotate: use ptx shf instruction if available. 
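+// (PTX shf.l.wrap.b32 d, a, b, c computes the funnel shift
+//  d = (b << n) | (a >> (32 - n)) with n = c & 31, n == 0 yielding b, so
+//  passing the same register for both sources turns it into a rotate,
+//  which is what the patterns below rely on.)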
+// + +// 32 bit r2 = rotl r1, n +// => +// r2 = shf.l r1, r1, n +def ROTL32imm_hw : NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$src, i32imm:$amt), + "shf.l.wrap.b32 \t$dst, $src, $src, $amt;", + [(set Int32Regs:$dst, (rotl Int32Regs:$src, (i32 imm:$amt)))]>, + Requires<[hasHWROT32]> ; + +def ROTL32reg_hw : NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$src, Int32Regs:$amt), + "shf.l.wrap.b32 \t$dst, $src, $src, $amt;", + [(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>, + Requires<[hasHWROT32]>; + +// 32 bit r2 = rotr r1, n +// => +// r2 = shf.r r1, r1, n +def ROTR32imm_hw : NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$src, i32imm:$amt), + "shf.r.wrap.b32 \t$dst, $src, $src, $amt;", + [(set Int32Regs:$dst, (rotr Int32Regs:$src, (i32 imm:$amt)))]>, + Requires<[hasHWROT32]>; + +def ROTR32reg_hw : NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$src, Int32Regs:$amt), + "shf.r.wrap.b32 \t$dst, $src, $src, $amt;", + [(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>, + Requires<[hasHWROT32]>; + +// +// Rotate: if ptx shf instruction is not available, then use shift+add +// // 32bit def ROT32imm_sw : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, i32imm:$amt1, i32imm:$amt2), @@ -1100,9 +1142,11 @@ def SUB_FRM_32 : SDNodeXForm; def : Pat<(rotl Int32Regs:$src, (i32 imm:$amt)), - (ROT32imm_sw Int32Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>; + (ROT32imm_sw Int32Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>, + Requires<[noHWROT32]>; def : Pat<(rotr Int32Regs:$src, (i32 imm:$amt)), - (ROT32imm_sw Int32Regs:$src, (SUB_FRM_32 node:$amt), imm:$amt)>; + (ROT32imm_sw Int32Regs:$src, (SUB_FRM_32 node:$amt), imm:$amt)>, + Requires<[noHWROT32]>; def ROTL32reg_sw : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt), @@ -1115,7 +1159,8 @@ def ROTL32reg_sw : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, !strconcat("shr.b32 \t%rhs, $src, %amt2;\n\t", !strconcat("add.u32 \t$dst, %lhs, %rhs;\n\t", !strconcat("}}", ""))))))))), - [(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>; + [(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>, + Requires<[noHWROT32]>; def ROTR32reg_sw : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt), @@ -1128,7 +1173,8 @@ def ROTR32reg_sw : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, !strconcat("shl.b32 \t%rhs, $src, %amt2;\n\t", !strconcat("add.u32 \t$dst, %lhs, %rhs;\n\t", !strconcat("}}", ""))))))))), - [(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>; + [(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>, + Requires<[noHWROT32]>; // 64bit def ROT64imm_sw : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, @@ -1177,6 +1223,29 @@ def ROTR64reg_sw : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, !strconcat("}}", ""))))))))), [(set Int64Regs:$dst, (rotr Int64Regs:$src, Int32Regs:$amt))]>; +// BFE - bit-field extract + +multiclass BFE { + // BFE supports both 32-bit and 64-bit values, but the start and length + // operands are always 32-bit + def rrr + : NVPTXInst<(outs RC:$d), + (ins RC:$a, Int32Regs:$b, Int32Regs:$c), + !strconcat("bfe.", TyStr, " \t$d, $a, $b, $c;"), []>; + def rri + : NVPTXInst<(outs RC:$d), + (ins RC:$a, Int32Regs:$b, i32imm:$c), + !strconcat("bfe.", TyStr, " \t$d, $a, $b, $c;"), []>; + def rii + : NVPTXInst<(outs RC:$d), + (ins RC:$a, i32imm:$b, i32imm:$c), + !strconcat("bfe.", TyStr, " \t$d, $a, $b, $c;"), []>; +} + +defm BFE_S32 : BFE<"s32", Int32Regs>; +defm BFE_U32 : BFE<"u32", Int32Regs>; 
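+// A semantics sketch for the unsigned forms of this group, assuming
+// 0 <= $b and $b + $c within the operand width (PTX additionally clamps
+// the start and length operands to 8 bits):
+//   bfe.u32 $d, $a, $b, $c  ==>  $d = ($a >> $b) & ((1 << $c) - 1)
+// The signed forms sign-extend the extracted field instead of zero-filling.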
+defm BFE_S64 : BFE<"s64", Int64Regs>;
+defm BFE_U64 : BFE<"u64", Int64Regs>;
 
 //-----------------------------------
 // General Comparison
@@ -1292,6 +1361,32 @@ def : Pat<(i1 (select Int1Regs:$p, Int1Regs:$a, Int1Regs:$b)),
           (ORb1rr (ANDb1rr Int1Regs:$p, Int1Regs:$a),
           (ANDb1rr (NOT1 Int1Regs:$p), Int1Regs:$b))>;
 
+//
+// Funnel shift in clamp mode
+//
+// - SDNodes are created so they can be used in the DAG code,
+//   e.g. NVPTXISelLowering (LowerShiftLeftParts and LowerShiftRightParts)
+//
+def SDTIntShiftDOp: SDTypeProfile<1, 3,
+                                  [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
+                                   SDTCisInt<0>, SDTCisInt<3>]>;
+def FUN_SHFL_CLAMP : SDNode<"NVPTXISD::FUN_SHFL_CLAMP", SDTIntShiftDOp, []>;
+def FUN_SHFR_CLAMP : SDNode<"NVPTXISD::FUN_SHFR_CLAMP", SDTIntShiftDOp, []>;
+
+def FUNSHFLCLAMP : NVPTXInst<(outs Int32Regs:$dst),
+                             (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
+                             "shf.l.clamp.b32 \t$dst, $lo, $hi, $amt;",
+                             [(set Int32Regs:$dst,
+                               (FUN_SHFL_CLAMP Int32Regs:$lo,
+                                               Int32Regs:$hi, Int32Regs:$amt))]>;
+
+def FUNSHFRCLAMP : NVPTXInst<(outs Int32Regs:$dst),
+                             (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
+                             "shf.r.clamp.b32 \t$dst, $lo, $hi, $amt;",
+                             [(set Int32Regs:$dst,
+                               (FUN_SHFR_CLAMP Int32Regs:$lo,
+                                               Int32Regs:$hi, Int32Regs:$amt))]>;
+
 //-----------------------------------
 // Data Movement (Load / Store, Move)
 //-----------------------------------
@@ -1819,7 +1914,7 @@ def StoreParamV2I8 : StoreParamV2Inst;
 def StoreParamV4I32 : NVPTXInst<(outs),
                                 (ins Int32Regs:$val, Int32Regs:$val2,
                                      Int32Regs:$val3, Int32Regs:$val4,
                                      i32imm:$a, i32imm:$b),
-                                "st.param.b32\t[param$a+$b], {{$val, $val2, $val3, $val4}};",
+                                "st.param.v4.b32\t[param$a+$b], {{$val, $val2, $val3, $val4}};",
                                 []>;
 def StoreParamV4I16 : NVPTXInst<(outs),
                                 (ins Int16Regs:$val, Int16Regs:$val2,
diff --git a/lib/Target/NVPTX/NVPTXIntrinsics.td b/lib/Target/NVPTX/NVPTXIntrinsics.td
index 5e228fc396ca..14e51aa309ea 100644
--- a/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -792,13 +792,18 @@ def INT_NVVM_H2F : F_MATH_1;
-def : Pat<(f32 (f16_to_f32 Int16Regs:$a)),
+def : Pat<(f32 (f16_to_fp Int16Regs:$a)),
           (CVT_f32_f16 Int16Regs:$a, CvtNONE)>;
-def : Pat<(i16 (f32_to_f16 Float32Regs:$a)),
+def : Pat<(i16 (fp_to_f16 Float32Regs:$a)),
           (CVT_f16_f32 Float32Regs:$a, CvtRN_FTZ)>, Requires<[doF32FTZ]>;
-def : Pat<(i16 (f32_to_f16 Float32Regs:$a)),
+def : Pat<(i16 (fp_to_f16 Float32Regs:$a)),
           (CVT_f16_f32 Float32Regs:$a, CvtRN)>;
 
+def : Pat<(f64 (f16_to_fp Int16Regs:$a)),
+          (CVT_f64_f16 Int16Regs:$a, CvtNONE)>;
+def : Pat<(i16 (fp_to_f16 Float64Regs:$a)),
+          (CVT_f16_f64 Float64Regs:$a, CvtRN)>;
+
 //
 // Bitcast
 //
@@ -1057,12 +1062,24 @@ def atomic_load_max_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
   (atomic_load_max_32 node:$a, node:$b)>;
 def atomic_load_max_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
   (atomic_load_max_32 node:$a, node:$b)>;
+def atomic_load_max_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b)
+  , (atomic_load_max_64 node:$a, node:$b)>;
+def atomic_load_max_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (atomic_load_max_64 node:$a, node:$b)>;
+def atomic_load_max_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (atomic_load_max_64 node:$a, node:$b)>;
 def atomic_load_umax_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
   (atomic_load_umax_32 node:$a, node:$b)>;
 def atomic_load_umax_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
   (atomic_load_umax_32 node:$a, node:$b)>;
 def atomic_load_umax_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
   (atomic_load_umax_32 node:$a,
node:$b)>; +def atomic_load_umax_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), + (atomic_load_umax_64 node:$a, node:$b)>; +def atomic_load_umax_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (atomic_load_umax_64 node:$a, node:$b)>; +def atomic_load_umax_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (atomic_load_umax_64 node:$a, node:$b)>; defm INT_PTX_ATOM_LOAD_MAX_G_32 : F_ATOMIC_2; @@ -1072,6 +1089,14 @@ defm INT_PTX_ATOM_LOAD_MAX_GEN_32 : F_ATOMIC_2; defm INT_PTX_ATOM_LOAD_MAX_GEN_32_USE_G : F_ATOMIC_2; +defm INT_PTX_ATOM_LOAD_MAX_G_64 : F_ATOMIC_2; +defm INT_PTX_ATOM_LOAD_MAX_S_64 : F_ATOMIC_2; +defm INT_PTX_ATOM_LOAD_MAX_GEN_64 : F_ATOMIC_2; +defm INT_PTX_ATOM_LOAD_MAX_GEN_64_USE_G : F_ATOMIC_2; defm INT_PTX_ATOM_LOAD_UMAX_G_32 : F_ATOMIC_2; defm INT_PTX_ATOM_LOAD_UMAX_S_32 : F_ATOMIC_2; defm INT_PTX_ATOM_LOAD_UMAX_GEN_32_USE_G : F_ATOMIC_2; +defm INT_PTX_ATOM_LOAD_UMAX_G_64 : F_ATOMIC_2; +defm INT_PTX_ATOM_LOAD_UMAX_S_64 : F_ATOMIC_2; +defm INT_PTX_ATOM_LOAD_UMAX_GEN_64 : F_ATOMIC_2; +defm INT_PTX_ATOM_LOAD_UMAX_GEN_64_USE_G : F_ATOMIC_2; // atom_min @@ -1089,12 +1122,24 @@ def atomic_load_min_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), (atomic_load_min_32 node:$a, node:$b)>; def atomic_load_min_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), (atomic_load_min_32 node:$a, node:$b)>; +def atomic_load_min_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), + (atomic_load_min_64 node:$a, node:$b)>; +def atomic_load_min_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (atomic_load_min_64 node:$a, node:$b)>; +def atomic_load_min_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (atomic_load_min_64 node:$a, node:$b)>; def atomic_load_umin_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), (atomic_load_umin_32 node:$a, node:$b)>; def atomic_load_umin_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), (atomic_load_umin_32 node:$a, node:$b)>; def atomic_load_umin_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), (atomic_load_umin_32 node:$a, node:$b)>; +def atomic_load_umin_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), + (atomic_load_umin_64 node:$a, node:$b)>; +def atomic_load_umin_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (atomic_load_umin_64 node:$a, node:$b)>; +def atomic_load_umin_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (atomic_load_umin_64 node:$a, node:$b)>; defm INT_PTX_ATOM_LOAD_MIN_G_32 : F_ATOMIC_2; @@ -1104,6 +1149,14 @@ defm INT_PTX_ATOM_LOAD_MIN_GEN_32 : F_ATOMIC_2; defm INT_PTX_ATOM_LOAD_MIN_GEN_32_USE_G : F_ATOMIC_2; +defm INT_PTX_ATOM_LOAD_MIN_G_64 : F_ATOMIC_2; +defm INT_PTX_ATOM_LOAD_MIN_S_64 : F_ATOMIC_2; +defm INT_PTX_ATOM_LOAD_MIN_GEN_64 : F_ATOMIC_2; +defm INT_PTX_ATOM_LOAD_MIN_GEN_64_USE_G : F_ATOMIC_2; defm INT_PTX_ATOM_LOAD_UMIN_G_32 : F_ATOMIC_2; defm INT_PTX_ATOM_LOAD_UMIN_S_32 : F_ATOMIC_2; defm INT_PTX_ATOM_LOAD_UMIN_GEN_32_USE_G : F_ATOMIC_2; +defm INT_PTX_ATOM_LOAD_UMIN_G_64 : F_ATOMIC_2; +defm INT_PTX_ATOM_LOAD_UMIN_S_64 : F_ATOMIC_2; +defm INT_PTX_ATOM_LOAD_UMIN_GEN_64 : F_ATOMIC_2; +defm INT_PTX_ATOM_LOAD_UMIN_GEN_64_USE_G : F_ATOMIC_2; // atom_inc atom_dec @@ -1153,6 +1214,12 @@ def atomic_load_and_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), (atomic_load_and_32 node:$a, node:$b)>; def atomic_load_and_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), (atomic_load_and_32 node:$a, node:$b)>; +def atomic_load_and_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), + (atomic_load_and_64 node:$a, node:$b)>; +def atomic_load_and_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (atomic_load_and_64 node:$a, node:$b)>; +def 
atomic_load_and_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (atomic_load_and_64 node:$a, node:$b)>; defm INT_PTX_ATOM_AND_G_32 : F_ATOMIC_2; @@ -1162,6 +1229,14 @@ defm INT_PTX_ATOM_AND_GEN_32 : F_ATOMIC_2; defm INT_PTX_ATOM_AND_GEN_32_USE_G : F_ATOMIC_2; +defm INT_PTX_ATOM_AND_G_64 : F_ATOMIC_2; +defm INT_PTX_ATOM_AND_S_64 : F_ATOMIC_2; +defm INT_PTX_ATOM_AND_GEN_64 : F_ATOMIC_2; +defm INT_PTX_ATOM_AND_GEN_64_USE_G : F_ATOMIC_2; // atom_or @@ -1171,6 +1246,12 @@ def atomic_load_or_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), (atomic_load_or_32 node:$a, node:$b)>; def atomic_load_or_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), (atomic_load_or_32 node:$a, node:$b)>; +def atomic_load_or_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), + (atomic_load_or_64 node:$a, node:$b)>; +def atomic_load_or_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (atomic_load_or_64 node:$a, node:$b)>; +def atomic_load_or_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (atomic_load_or_64 node:$a, node:$b)>; defm INT_PTX_ATOM_OR_G_32 : F_ATOMIC_2; @@ -1180,6 +1261,14 @@ defm INT_PTX_ATOM_OR_GEN_32_USE_G : F_ATOMIC_2; defm INT_PTX_ATOM_OR_S_32 : F_ATOMIC_2; +defm INT_PTX_ATOM_OR_G_64 : F_ATOMIC_2; +defm INT_PTX_ATOM_OR_GEN_64 : F_ATOMIC_2; +defm INT_PTX_ATOM_OR_GEN_64_USE_G : F_ATOMIC_2; +defm INT_PTX_ATOM_OR_S_64 : F_ATOMIC_2; // atom_xor @@ -1189,6 +1278,12 @@ def atomic_load_xor_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), (atomic_load_xor_32 node:$a, node:$b)>; def atomic_load_xor_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), (atomic_load_xor_32 node:$a, node:$b)>; +def atomic_load_xor_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), + (atomic_load_xor_64 node:$a, node:$b)>; +def atomic_load_xor_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (atomic_load_xor_64 node:$a, node:$b)>; +def atomic_load_xor_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (atomic_load_xor_64 node:$a, node:$b)>; defm INT_PTX_ATOM_XOR_G_32 : F_ATOMIC_2; @@ -1198,6 +1293,14 @@ defm INT_PTX_ATOM_XOR_GEN_32 : F_ATOMIC_2; defm INT_PTX_ATOM_XOR_GEN_32_USE_G : F_ATOMIC_2; +defm INT_PTX_ATOM_XOR_G_64 : F_ATOMIC_2; +defm INT_PTX_ATOM_XOR_S_64 : F_ATOMIC_2; +defm INT_PTX_ATOM_XOR_GEN_64 : F_ATOMIC_2; +defm INT_PTX_ATOM_XOR_GEN_64_USE_G : F_ATOMIC_2; // atom_cas @@ -1276,67 +1379,33 @@ def INT_PTX_SREG_WARPSIZE : F_SREG<"mov.u32 \t$dst, WARP_SZ;", Int32Regs, // Support for ldu on sm_20 or later //----------------------------------- -def ldu_i8 : PatFrag<(ops node:$ptr), (int_nvvm_ldu_global_i node:$ptr), [{ - MemIntrinsicSDNode *M = cast(N); - return M->getMemoryVT() == MVT::i8; -}]>; - // Scalar -// @TODO: Revisit this, Changed imemAny to imem -multiclass LDU_G { - def areg: NVPTXInst<(outs regclass:$result), (ins Int32Regs:$src), - !strconcat("ldu.global.", TyStr), - [(set regclass:$result, (IntOp Int32Regs:$src))]>, Requires<[hasLDU]>; - def areg64: NVPTXInst<(outs regclass:$result), (ins Int64Regs:$src), - !strconcat("ldu.global.", TyStr), - [(set regclass:$result, (IntOp Int64Regs:$src))]>, Requires<[hasLDU]>; - def avar: NVPTXInst<(outs regclass:$result), (ins imem:$src), - !strconcat("ldu.global.", TyStr), - [(set regclass:$result, (IntOp (Wrapper tglobaladdr:$src)))]>, - Requires<[hasLDU]>; - def ari : NVPTXInst<(outs regclass:$result), (ins MEMri:$src), - !strconcat("ldu.global.", TyStr), - [(set regclass:$result, (IntOp ADDRri:$src))]>, Requires<[hasLDU]>; - def ari64 : NVPTXInst<(outs regclass:$result), (ins MEMri64:$src), - !strconcat("ldu.global.", TyStr), - [(set regclass:$result, (IntOp 
ADDRri64:$src))]>, Requires<[hasLDU]>; -} - -multiclass LDU_G_NOINTRIN { +multiclass LDU_G { def areg: NVPTXInst<(outs regclass:$result), (ins Int32Regs:$src), !strconcat("ldu.global.", TyStr), - [(set regclass:$result, (IntOp Int32Regs:$src))]>, Requires<[hasLDU]>; + []>, Requires<[hasLDU]>; def areg64: NVPTXInst<(outs regclass:$result), (ins Int64Regs:$src), !strconcat("ldu.global.", TyStr), - [(set regclass:$result, (IntOp Int64Regs:$src))]>, Requires<[hasLDU]>; - def avar: NVPTXInst<(outs regclass:$result), (ins imem:$src), + []>, Requires<[hasLDU]>; + def avar: NVPTXInst<(outs regclass:$result), (ins imemAny:$src), !strconcat("ldu.global.", TyStr), - [(set regclass:$result, (IntOp (Wrapper tglobaladdr:$src)))]>, - Requires<[hasLDU]>; + []>, Requires<[hasLDU]>; def ari : NVPTXInst<(outs regclass:$result), (ins MEMri:$src), !strconcat("ldu.global.", TyStr), - [(set regclass:$result, (IntOp ADDRri:$src))]>, Requires<[hasLDU]>; + []>, Requires<[hasLDU]>; def ari64 : NVPTXInst<(outs regclass:$result), (ins MEMri64:$src), !strconcat("ldu.global.", TyStr), - [(set regclass:$result, (IntOp ADDRri64:$src))]>, Requires<[hasLDU]>; + []>, Requires<[hasLDU]>; } -defm INT_PTX_LDU_GLOBAL_i8 : LDU_G_NOINTRIN<"u8 \t$result, [$src];", Int16Regs, - ldu_i8>; -defm INT_PTX_LDU_GLOBAL_i16 : LDU_G<"u16 \t$result, [$src];", Int16Regs, -int_nvvm_ldu_global_i>; -defm INT_PTX_LDU_GLOBAL_i32 : LDU_G<"u32 \t$result, [$src];", Int32Regs, -int_nvvm_ldu_global_i>; -defm INT_PTX_LDU_GLOBAL_i64 : LDU_G<"u64 \t$result, [$src];", Int64Regs, -int_nvvm_ldu_global_i>; -defm INT_PTX_LDU_GLOBAL_f32 : LDU_G<"f32 \t$result, [$src];", Float32Regs, -int_nvvm_ldu_global_f>; -defm INT_PTX_LDU_GLOBAL_f64 : LDU_G<"f64 \t$result, [$src];", Float64Regs, -int_nvvm_ldu_global_f>; -defm INT_PTX_LDU_GLOBAL_p32 : LDU_G<"u32 \t$result, [$src];", Int32Regs, -int_nvvm_ldu_global_p>; -defm INT_PTX_LDU_GLOBAL_p64 : LDU_G<"u64 \t$result, [$src];", Int64Regs, -int_nvvm_ldu_global_p>; +defm INT_PTX_LDU_GLOBAL_i8 : LDU_G<"u8 \t$result, [$src];", Int16Regs>; +defm INT_PTX_LDU_GLOBAL_i16 : LDU_G<"u16 \t$result, [$src];", Int16Regs>; +defm INT_PTX_LDU_GLOBAL_i32 : LDU_G<"u32 \t$result, [$src];", Int32Regs>; +defm INT_PTX_LDU_GLOBAL_i64 : LDU_G<"u64 \t$result, [$src];", Int64Regs>; +defm INT_PTX_LDU_GLOBAL_f32 : LDU_G<"f32 \t$result, [$src];", Float32Regs>; +defm INT_PTX_LDU_GLOBAL_f64 : LDU_G<"f64 \t$result, [$src];", Float64Regs>; +defm INT_PTX_LDU_GLOBAL_p32 : LDU_G<"u32 \t$result, [$src];", Int32Regs>; +defm INT_PTX_LDU_GLOBAL_p64 : LDU_G<"u64 \t$result, [$src];", Int64Regs>; // vector @@ -1406,65 +1475,40 @@ defm INT_PTX_LDU_G_v4f32_ELE // Support for ldg on sm_35 or later //----------------------------------- -def ldg_i8 : PatFrag<(ops node:$ptr), (int_nvvm_ldg_global_i node:$ptr), [{ - MemIntrinsicSDNode *M = cast(N); - return M->getMemoryVT() == MVT::i8; -}]>; - -multiclass LDG_G { +multiclass LDG_G { def areg: NVPTXInst<(outs regclass:$result), (ins Int32Regs:$src), !strconcat("ld.global.nc.", TyStr), - [(set regclass:$result, (IntOp Int32Regs:$src))]>, Requires<[hasLDG]>; + []>, Requires<[hasLDG]>; def areg64: NVPTXInst<(outs regclass:$result), (ins Int64Regs:$src), !strconcat("ld.global.nc.", TyStr), - [(set regclass:$result, (IntOp Int64Regs:$src))]>, Requires<[hasLDG]>; - def avar: NVPTXInst<(outs regclass:$result), (ins imem:$src), + []>, Requires<[hasLDG]>; + def avar: NVPTXInst<(outs regclass:$result), (ins imemAny:$src), !strconcat("ld.global.nc.", TyStr), - [(set regclass:$result, (IntOp (Wrapper tglobaladdr:$src)))]>, - 
Requires<[hasLDG]>; + []>, Requires<[hasLDG]>; def ari : NVPTXInst<(outs regclass:$result), (ins MEMri:$src), !strconcat("ld.global.nc.", TyStr), - [(set regclass:$result, (IntOp ADDRri:$src))]>, Requires<[hasLDG]>; + []>, Requires<[hasLDG]>; def ari64 : NVPTXInst<(outs regclass:$result), (ins MEMri64:$src), !strconcat("ld.global.nc.", TyStr), - [(set regclass:$result, (IntOp ADDRri64:$src))]>, Requires<[hasLDG]>; -} - -multiclass LDG_G_NOINTRIN { - def areg: NVPTXInst<(outs regclass:$result), (ins Int32Regs:$src), - !strconcat("ld.global.nc.", TyStr), - [(set regclass:$result, (IntOp Int32Regs:$src))]>, Requires<[hasLDG]>; - def areg64: NVPTXInst<(outs regclass:$result), (ins Int64Regs:$src), - !strconcat("ld.global.nc.", TyStr), - [(set regclass:$result, (IntOp Int64Regs:$src))]>, Requires<[hasLDG]>; - def avar: NVPTXInst<(outs regclass:$result), (ins imem:$src), - !strconcat("ld.global.nc.", TyStr), - [(set regclass:$result, (IntOp (Wrapper tglobaladdr:$src)))]>, - Requires<[hasLDG]>; - def ari : NVPTXInst<(outs regclass:$result), (ins MEMri:$src), - !strconcat("ld.global.nc.", TyStr), - [(set regclass:$result, (IntOp ADDRri:$src))]>, Requires<[hasLDG]>; - def ari64 : NVPTXInst<(outs regclass:$result), (ins MEMri64:$src), - !strconcat("ld.global.nc.", TyStr), - [(set regclass:$result, (IntOp ADDRri64:$src))]>, Requires<[hasLDG]>; + []>, Requires<[hasLDG]>; } defm INT_PTX_LDG_GLOBAL_i8 - : LDG_G_NOINTRIN<"u8 \t$result, [$src];", Int16Regs, ldg_i8>; + : LDG_G<"u8 \t$result, [$src];", Int16Regs>; defm INT_PTX_LDG_GLOBAL_i16 - : LDG_G<"u16 \t$result, [$src];", Int16Regs, int_nvvm_ldg_global_i>; + : LDG_G<"u16 \t$result, [$src];", Int16Regs>; defm INT_PTX_LDG_GLOBAL_i32 - : LDG_G<"u32 \t$result, [$src];", Int32Regs, int_nvvm_ldg_global_i>; + : LDG_G<"u32 \t$result, [$src];", Int32Regs>; defm INT_PTX_LDG_GLOBAL_i64 - : LDG_G<"u64 \t$result, [$src];", Int64Regs, int_nvvm_ldg_global_i>; + : LDG_G<"u64 \t$result, [$src];", Int64Regs>; defm INT_PTX_LDG_GLOBAL_f32 - : LDG_G<"f32 \t$result, [$src];", Float32Regs, int_nvvm_ldg_global_f>; + : LDG_G<"f32 \t$result, [$src];", Float32Regs>; defm INT_PTX_LDG_GLOBAL_f64 - : LDG_G<"f64 \t$result, [$src];", Float64Regs, int_nvvm_ldg_global_f>; + : LDG_G<"f64 \t$result, [$src];", Float64Regs>; defm INT_PTX_LDG_GLOBAL_p32 - : LDG_G<"u32 \t$result, [$src];", Int32Regs, int_nvvm_ldg_global_p>; + : LDG_G<"u32 \t$result, [$src];", Int32Regs>; defm INT_PTX_LDG_GLOBAL_p64 - : LDG_G<"u64 \t$result, [$src];", Int64Regs, int_nvvm_ldg_global_p>; + : LDG_G<"u64 \t$result, [$src];", Int64Regs>; // vector @@ -1689,6 +1733,207 @@ def INT_NVVM_COMPILER_ERROR_64 : NVPTXInst<(outs), (ins Int64Regs:$a), [(int_nvvm_compiler_error Int64Regs:$a)]>; +// isspacep + +def ISSPACEP_CONST_32 + : NVPTXInst<(outs Int1Regs:$d), (ins Int32Regs:$a), + "isspacep.const \t$d, $a;", + [(set Int1Regs:$d, (int_nvvm_isspacep_const Int32Regs:$a))]>, + Requires<[hasPTX31]>; +def ISSPACEP_CONST_64 + : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a), + "isspacep.const \t$d, $a;", + [(set Int1Regs:$d, (int_nvvm_isspacep_const Int64Regs:$a))]>, + Requires<[hasPTX31]>; +def ISSPACEP_GLOBAL_32 + : NVPTXInst<(outs Int1Regs:$d), (ins Int32Regs:$a), + "isspacep.global \t$d, $a;", + [(set Int1Regs:$d, (int_nvvm_isspacep_global Int32Regs:$a))]>; +def ISSPACEP_GLOBAL_64 + : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a), + "isspacep.global \t$d, $a;", + [(set Int1Regs:$d, (int_nvvm_isspacep_global Int64Regs:$a))]>; +def ISSPACEP_LOCAL_32 + : NVPTXInst<(outs Int1Regs:$d), (ins Int32Regs:$a), + "isspacep.local 
\t$d, $a;", + [(set Int1Regs:$d, (int_nvvm_isspacep_local Int32Regs:$a))]>; +def ISSPACEP_LOCAL_64 + : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a), + "isspacep.local \t$d, $a;", + [(set Int1Regs:$d, (int_nvvm_isspacep_local Int64Regs:$a))]>; +def ISSPACEP_SHARED_32 + : NVPTXInst<(outs Int1Regs:$d), (ins Int32Regs:$a), + "isspacep.shared \t$d, $a;", + [(set Int1Regs:$d, (int_nvvm_isspacep_shared Int32Regs:$a))]>; +def ISSPACEP_SHARED_64 + : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a), + "isspacep.shared \t$d, $a;", + [(set Int1Regs:$d, (int_nvvm_isspacep_shared Int64Regs:$a))]>; + + +// Special register reads +def MOV_SPECIAL : NVPTXInst<(outs Int32Regs:$d), + (ins SpecialRegs:$r), + "mov.b32\t$d, $r;", []>; + +def : Pat<(int_nvvm_read_ptx_sreg_envreg0), (MOV_SPECIAL ENVREG0)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg1), (MOV_SPECIAL ENVREG1)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg2), (MOV_SPECIAL ENVREG2)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg3), (MOV_SPECIAL ENVREG3)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg4), (MOV_SPECIAL ENVREG4)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg5), (MOV_SPECIAL ENVREG5)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg6), (MOV_SPECIAL ENVREG6)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg7), (MOV_SPECIAL ENVREG7)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg8), (MOV_SPECIAL ENVREG8)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg9), (MOV_SPECIAL ENVREG9)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg10), (MOV_SPECIAL ENVREG10)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg11), (MOV_SPECIAL ENVREG11)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg12), (MOV_SPECIAL ENVREG12)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg13), (MOV_SPECIAL ENVREG13)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg14), (MOV_SPECIAL ENVREG14)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg15), (MOV_SPECIAL ENVREG15)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg16), (MOV_SPECIAL ENVREG16)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg17), (MOV_SPECIAL ENVREG17)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg18), (MOV_SPECIAL ENVREG18)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg19), (MOV_SPECIAL ENVREG19)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg20), (MOV_SPECIAL ENVREG20)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg21), (MOV_SPECIAL ENVREG21)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg22), (MOV_SPECIAL ENVREG22)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg23), (MOV_SPECIAL ENVREG23)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg24), (MOV_SPECIAL ENVREG24)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg25), (MOV_SPECIAL ENVREG25)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg26), (MOV_SPECIAL ENVREG26)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg27), (MOV_SPECIAL ENVREG27)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg28), (MOV_SPECIAL ENVREG28)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg29), (MOV_SPECIAL ENVREG29)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg30), (MOV_SPECIAL ENVREG30)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg31), (MOV_SPECIAL ENVREG31)>; + + +// rotate builtin support + +def ROTATE_B32_HW_IMM + : NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$src, i32imm:$amt), + "shf.l.wrap.b32 \t$dst, $src, $src, $amt;", + [(set Int32Regs:$dst, + (int_nvvm_rotate_b32 Int32Regs:$src, (i32 imm:$amt)))]>, + Requires<[hasHWROT32]> ; + +def ROTATE_B32_HW_REG + : NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$src, Int32Regs:$amt), + "shf.l.wrap.b32 \t$dst, $src, $src, $amt;", + [(set Int32Regs:$dst, + (int_nvvm_rotate_b32 Int32Regs:$src, Int32Regs:$amt))]>, + Requires<[hasHWROT32]> ; + +def : 
Pat<(int_nvvm_rotate_b32 Int32Regs:$src, (i32 imm:$amt)), + (ROT32imm_sw Int32Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>, + Requires<[noHWROT32]> ; + +def : Pat<(int_nvvm_rotate_b32 Int32Regs:$src, Int32Regs:$amt), + (ROTL32reg_sw Int32Regs:$src, Int32Regs:$amt)>, + Requires<[noHWROT32]> ; + +def GET_LO_INT64 + : NVPTXInst<(outs Int32Regs:$dst), (ins Int64Regs:$src), + !strconcat("{{\n\t", + !strconcat(".reg .b32 %dummy;\n\t", + !strconcat("mov.b64 \t{$dst,%dummy}, $src;\n\t", + !strconcat("}}", "")))), + []> ; + +def GET_HI_INT64 + : NVPTXInst<(outs Int32Regs:$dst), (ins Int64Regs:$src), + !strconcat("{{\n\t", + !strconcat(".reg .b32 %dummy;\n\t", + !strconcat("mov.b64 \t{%dummy,$dst}, $src;\n\t", + !strconcat("}}", "")))), + []> ; + +def PACK_TWO_INT32 + : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$lo, Int32Regs:$hi), + "mov.b64 \t$dst, {{$lo, $hi}};", []> ; + +def : Pat<(int_nvvm_swap_lo_hi_b64 Int64Regs:$src), + (PACK_TWO_INT32 (GET_HI_INT64 Int64Regs:$src), + (GET_LO_INT64 Int64Regs:$src))> ; + +// funnel shift, requires >= sm_32 +def SHF_L_WRAP_B32_IMM + : NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$lo, Int32Regs:$hi, i32imm:$amt), + "shf.l.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>, + Requires<[hasHWROT32]>; + +def SHF_L_WRAP_B32_REG + : NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt), + "shf.l.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>, + Requires<[hasHWROT32]>; + +def SHF_R_WRAP_B32_IMM + : NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$lo, Int32Regs:$hi, i32imm:$amt), + "shf.r.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>, + Requires<[hasHWROT32]>; + +def SHF_R_WRAP_B32_REG + : NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt), + "shf.r.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>, + Requires<[hasHWROT32]>; + +// HW version of rotate 64 +def : Pat<(int_nvvm_rotate_b64 Int64Regs:$src, (i32 imm:$amt)), + (PACK_TWO_INT32 + (SHF_L_WRAP_B32_IMM (GET_HI_INT64 Int64Regs:$src), + (GET_LO_INT64 Int64Regs:$src), imm:$amt), + (SHF_L_WRAP_B32_IMM (GET_LO_INT64 Int64Regs:$src), + (GET_HI_INT64 Int64Regs:$src), imm:$amt))>, + Requires<[hasHWROT32]>; + +def : Pat<(int_nvvm_rotate_b64 Int64Regs:$src, Int32Regs:$amt), + (PACK_TWO_INT32 + (SHF_L_WRAP_B32_REG (GET_HI_INT64 Int64Regs:$src), + (GET_LO_INT64 Int64Regs:$src), Int32Regs:$amt), + (SHF_L_WRAP_B32_REG (GET_LO_INT64 Int64Regs:$src), + (GET_HI_INT64 Int64Regs:$src), Int32Regs:$amt))>, + Requires<[hasHWROT32]>; + + +def : Pat<(int_nvvm_rotate_right_b64 Int64Regs:$src, (i32 imm:$amt)), + (PACK_TWO_INT32 + (SHF_R_WRAP_B32_IMM (GET_LO_INT64 Int64Regs:$src), + (GET_HI_INT64 Int64Regs:$src), imm:$amt), + (SHF_R_WRAP_B32_IMM (GET_HI_INT64 Int64Regs:$src), + (GET_LO_INT64 Int64Regs:$src), imm:$amt))>, + Requires<[hasHWROT32]>; + +def : Pat<(int_nvvm_rotate_right_b64 Int64Regs:$src, Int32Regs:$amt), + (PACK_TWO_INT32 + (SHF_R_WRAP_B32_REG (GET_LO_INT64 Int64Regs:$src), + (GET_HI_INT64 Int64Regs:$src), Int32Regs:$amt), + (SHF_R_WRAP_B32_REG (GET_HI_INT64 Int64Regs:$src), + (GET_LO_INT64 Int64Regs:$src), Int32Regs:$amt))>, + Requires<[hasHWROT32]>; + +// SW version of rotate 64 +def : Pat<(int_nvvm_rotate_b64 Int64Regs:$src, (i32 imm:$amt)), + (ROT64imm_sw Int64Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>, + Requires<[noHWROT32]>; +def : Pat<(int_nvvm_rotate_b64 Int64Regs:$src, Int32Regs:$amt), + (ROTL64reg_sw Int64Regs:$src, Int32Regs:$amt)>, + Requires<[noHWROT32]>; +def : Pat<(int_nvvm_rotate_right_b64 Int64Regs:$src, (i32 imm:$amt)), + (ROT64imm_sw Int64Regs:$src, (SUB_FRM_64 
node:$amt), imm:$amt)>, + Requires<[noHWROT32]>; +def : Pat<(int_nvvm_rotate_right_b64 Int64Regs:$src, Int32Regs:$amt), + (ROTR64reg_sw Int64Regs:$src, Int32Regs:$amt)>, + Requires<[noHWROT32]>; + + //----------------------------------- // Texture Intrinsics //----------------------------------- @@ -1696,9 +1941,10 @@ def INT_NVVM_COMPILER_ERROR_64 : NVPTXInst<(outs), (ins Int64Regs:$a), // NOTE: For Fermi support, any new texture/surface/sampler intrinsics must be // also defined in NVPTXReplaceImageHandles.cpp - +// texmode_independent +let IsTex = 1, IsTexModeUnified = 0 in { // Texture fetch instructions using handles -def TEX_1D_F32_I32 +def TEX_1D_F32_S32 : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, Float32Regs:$b, Float32Regs:$a), (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x), @@ -1725,19 +1971,19 @@ def TEX_1D_F32_F32_GRAD "tex.grad.1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, " "[$t, $s, \\{$x\\}], \\{$gradx\\}, \\{$grady\\};", []>; -def TEX_1D_I32_I32 +def TEX_1D_S32_S32 : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x), "tex.1d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}];", []>; -def TEX_1D_I32_F32 +def TEX_1D_S32_F32 : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x), "tex.1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}];", []>; -def TEX_1D_I32_F32_LEVEL +def TEX_1D_S32_F32_LEVEL : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, @@ -1745,7 +1991,7 @@ def TEX_1D_I32_F32_LEVEL "tex.level.1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " "[$t, $s, \\{$x\\}], $lod;", []>; -def TEX_1D_I32_F32_GRAD +def TEX_1D_S32_F32_GRAD : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, @@ -1753,8 +1999,36 @@ def TEX_1D_I32_F32_GRAD "tex.grad.1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " "[$t, $s, \\{$x\\}], \\{$gradx\\}, \\{$grady\\};", []>; +def TEX_1D_U32_S32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x), + "tex.1d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}];", + []>; +def TEX_1D_U32_F32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x), + "tex.1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}];", + []>; +def TEX_1D_U32_F32_LEVEL + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, + Float32Regs:$lod), + "tex.level.1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$x\\}], $lod;", + []>; +def TEX_1D_U32_F32_GRAD + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, + Float32Regs:$gradx, Float32Regs:$grady), + "tex.grad.1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$x\\}], \\{$gradx\\}, \\{$grady\\};", + []>; -def TEX_1D_ARRAY_F32_I32 +def TEX_1D_ARRAY_F32_S32 : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, Float32Regs:$b, Float32Regs:$a), (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), @@ -1784,21 +2058,21 @@ def TEX_1D_ARRAY_F32_F32_GRAD "tex.grad.a1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, " "[$t, $s, \\{$l, $x\\}], \\{$gradx\\}, \\{$grady\\};", []>; -def TEX_1D_ARRAY_I32_I32 +def TEX_1D_ARRAY_S32_S32 : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, 
Int32Regs:$a), (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), "tex.a1d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, " "[$t, $s, \\{$l, $x\\}];", []>; -def TEX_1D_ARRAY_I32_F32 +def TEX_1D_ARRAY_S32_F32 : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x), "tex.a1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " "[$t, $s, \\{$l, $x\\}];", []>; -def TEX_1D_ARRAY_I32_F32_LEVEL +def TEX_1D_ARRAY_S32_F32_LEVEL : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x, @@ -1806,7 +2080,7 @@ def TEX_1D_ARRAY_I32_F32_LEVEL "tex.level.a1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " "[$t, $s, \\{$l, $x\\}], $lod;", []>; -def TEX_1D_ARRAY_I32_F32_GRAD +def TEX_1D_ARRAY_S32_F32_GRAD : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x, @@ -1814,8 +2088,38 @@ def TEX_1D_ARRAY_I32_F32_GRAD "tex.grad.a1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " "[$t, $s, \\{$l, $x\\}], \\{$gradx\\}, \\{$grady\\};", []>; +def TEX_1D_ARRAY_U32_S32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "tex.a1d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$l, $x\\}];", + []>; +def TEX_1D_ARRAY_U32_F32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x), + "tex.a1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$l, $x\\}];", + []>; +def TEX_1D_ARRAY_U32_F32_LEVEL + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x, + Float32Regs:$lod), + "tex.level.a1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$l, $x\\}], $lod;", + []>; +def TEX_1D_ARRAY_U32_F32_GRAD + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x, + Float32Regs:$gradx, Float32Regs:$grady), + "tex.grad.a1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$l, $x\\}], \\{$gradx\\}, \\{$grady\\};", + []>; -def TEX_2D_F32_I32 +def TEX_2D_F32_S32 : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, Float32Regs:$b, Float32Regs:$a), (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), @@ -1847,21 +2151,21 @@ def TEX_2D_F32_F32_GRAD "[$t, $s, \\{$x, $y\\}], \\{$gradx0, $gradx1\\}, " "\\{$grady0, $grady1\\};", []>; -def TEX_2D_I32_I32 +def TEX_2D_S32_S32 : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), "tex.2d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, " "[$t, $s, \\{$x, $y\\}];", []>; -def TEX_2D_I32_F32 +def TEX_2D_S32_F32 : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y), "tex.2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " "[$t, $s, \\{$x, $y\\}];", []>; -def TEX_2D_I32_F32_LEVEL +def TEX_2D_S32_F32_LEVEL : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y, @@ -1869,7 +2173,7 @@ def TEX_2D_I32_F32_LEVEL "tex.level.2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " "[$t, $s, \\{$x, $y\\}], $lod;", []>; -def TEX_2D_I32_F32_GRAD +def TEX_2D_S32_F32_GRAD : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), (ins 
Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y, @@ -1879,8 +2183,40 @@ def TEX_2D_I32_F32_GRAD "[$t, $s, \\{$x, $y\\}], \\{$gradx0, $gradx1\\}, " "\\{$grady0, $grady1\\};", []>; +def TEX_2D_U32_S32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "tex.2d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$x, $y\\}];", + []>; +def TEX_2D_U32_F32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y), + "tex.2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$x, $y\\}];", + []>; +def TEX_2D_U32_F32_LEVEL + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y, + Float32Regs:$lod), + "tex.level.2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$x, $y\\}], $lod;", + []>; +def TEX_2D_U32_F32_GRAD + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y, + Float32Regs:$gradx0, Float32Regs:$gradx1, + Float32Regs:$grady0, Float32Regs:$grady1), + "tex.grad.2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$x, $y\\}], \\{$gradx0, $gradx1\\}, " + "\\{$grady0, $grady1\\};", + []>; -def TEX_2D_ARRAY_F32_I32 +def TEX_2D_ARRAY_F32_S32 : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, Float32Regs:$b, Float32Regs:$a), (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, @@ -1914,7 +2250,7 @@ def TEX_2D_ARRAY_F32_F32_GRAD "[$t, $s, \\{$l, $x, $y, $y\\}], \\{$gradx0, $gradx1\\}, " "\\{$grady0, $grady1\\};", []>; -def TEX_2D_ARRAY_I32_I32 +def TEX_2D_ARRAY_S32_S32 : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, @@ -1922,7 +2258,7 @@ def TEX_2D_ARRAY_I32_I32 "tex.a2d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, " "[$t, $s, \\{$l, $x, $y, $y\\}];", []>; -def TEX_2D_ARRAY_I32_F32 +def TEX_2D_ARRAY_S32_F32 : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x, @@ -1930,7 +2266,7 @@ def TEX_2D_ARRAY_I32_F32 "tex.a2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " "[$t, $s, \\{$l, $x, $y, $y\\}];", []>; -def TEX_2D_ARRAY_I32_F32_LEVEL +def TEX_2D_ARRAY_S32_F32_LEVEL : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x, @@ -1938,7 +2274,7 @@ def TEX_2D_ARRAY_I32_F32_LEVEL "tex.level.a2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " "[$t, $s, \\{$l, $x, $y, $y\\}], $lod;", []>; -def TEX_2D_ARRAY_I32_F32_GRAD +def TEX_2D_ARRAY_S32_F32_GRAD : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x, @@ -1949,8 +2285,43 @@ def TEX_2D_ARRAY_I32_F32_GRAD "[$t, $s, \\{$l, $x, $y, $y\\}], \\{$gradx0, $gradx1\\}, " "\\{$grady0, $grady1\\};", []>; +def TEX_2D_ARRAY_U32_S32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int32Regs:$y), + "tex.a2d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$l, $x, $y, $y\\}];", + []>; +def TEX_2D_ARRAY_U32_F32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x, + Float32Regs:$y), + "tex.a2d.v4.u32.f32\t\\{$r, $g, $b, 
$a\\}, " + "[$t, $s, \\{$l, $x, $y, $y\\}];", + []>; +def TEX_2D_ARRAY_U32_F32_LEVEL + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x, + Float32Regs:$y, Float32Regs:$lod), + "tex.level.a2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$l, $x, $y, $y\\}], $lod;", + []>; +def TEX_2D_ARRAY_U32_F32_GRAD + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x, + Float32Regs:$y, + Float32Regs:$gradx0, Float32Regs:$gradx1, + Float32Regs:$grady0, Float32Regs:$grady1), + "tex.grad.a2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$l, $x, $y, $y\\}], \\{$gradx0, $gradx1\\}, " + "\\{$grady0, $grady1\\};", + []>; -def TEX_3D_F32_I32 +def TEX_3D_F32_S32 : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, Float32Regs:$b, Float32Regs:$a), (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, @@ -1987,7 +2358,7 @@ def TEX_3D_F32_F32_GRAD "\\{$gradx0, $gradx1, $gradx2, $gradx2\\}, " "\\{$grady0, $grady1, $grady2, $grady2\\};", []>; -def TEX_3D_I32_I32 +def TEX_3D_S32_S32 : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, @@ -1995,7 +2366,7 @@ def TEX_3D_I32_I32 "tex.3d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, " "[$t, $s, \\{$x, $y, $z, $z\\}];", []>; -def TEX_3D_I32_F32 +def TEX_3D_S32_F32 : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y, @@ -2003,7 +2374,7 @@ def TEX_3D_I32_F32 "tex.3d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " "[$t, $s, \\{$x, $y, $z, $z\\}];", []>; -def TEX_3D_I32_F32_LEVEL +def TEX_3D_S32_F32_LEVEL : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y, @@ -2011,7 +2382,7 @@ def TEX_3D_I32_F32_LEVEL "tex.level.3d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " "[$t, $s, \\{$x, $y, $z, $z\\}], $lod;", []>; -def TEX_3D_I32_F32_GRAD +def TEX_3D_S32_F32_GRAD : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y, @@ -2024,653 +2395,3012 @@ def TEX_3D_I32_F32_GRAD "\\{$gradx0, $gradx1, $gradx2, $gradx2\\}, " "\\{$grady0, $grady1, $grady2, $grady2\\};", []>; - - -// Surface load instructions -def SULD_1D_I8_TRAP - : NVPTXInst<(outs Int16Regs:$r), - (ins Int64Regs:$s, Int32Regs:$x), - "suld.b.1d.b8.trap \\{$r\\}, [$s, \\{$x\\}];", +def TEX_3D_U32_S32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$z), + "tex.3d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$x, $y, $z, $z\\}];", []>; -def SULD_1D_I16_TRAP - : NVPTXInst<(outs Int16Regs:$r), - (ins Int64Regs:$s, Int32Regs:$x), - "suld.b.1d.b16.trap \\{$r\\}, [$s, \\{$x\\}];", +def TEX_3D_U32_F32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y, + Float32Regs:$z), + "tex.3d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$x, $y, $z, $z\\}];", []>; -def SULD_1D_I32_TRAP - : NVPTXInst<(outs Int32Regs:$r), - (ins Int64Regs:$s, Int32Regs:$x), - "suld.b.1d.b32.trap \\{$r\\}, [$s, \\{$x\\}];", +def TEX_3D_U32_F32_LEVEL + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, 
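+// Coordinate padding: per the PTX ISA, tex takes 1-, 2- or 4-component
+// coordinate vectors, so the three-component cases are padded with a
+// repeated lane -- {$x, $y, $z, $z} for 3d, {$l, $x, $y, $y} for a2d --
+// and the extra lane is ignored.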
Float32Regs:$x, Float32Regs:$y, + Float32Regs:$z, Float32Regs:$lod), + "tex.level.3d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$x, $y, $z, $z\\}], $lod;", []>; -def SULD_1D_V2I8_TRAP - : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), - (ins Int64Regs:$s, Int32Regs:$x), - "suld.b.1d.v2.b8.trap \\{$r, $g\\}, [$s, \\{$x\\}];", +def TEX_3D_U32_F32_GRAD + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y, + Float32Regs:$z, + Float32Regs:$gradx0, Float32Regs:$gradx1, + Float32Regs:$gradx2, Float32Regs:$grady0, + Float32Regs:$grady1, Float32Regs:$grady2), + "tex.grad.3d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$x, $y, $z, $z\\}], " + "\\{$gradx0, $gradx1, $gradx2, $gradx2\\}, " + "\\{$grady0, $grady1, $grady2, $grady2\\};", []>; -def SULD_1D_V2I16_TRAP - : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), - (ins Int64Regs:$s, Int32Regs:$x), - "suld.b.1d.v2.b16.trap \\{$r, $g\\}, [$s, \\{$x\\}];", + +def TEX_CUBE_F32_F32 + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, + Float32Regs:$x, Float32Regs:$y, Float32Regs:$z), + "tex.cube.v4.f32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$x, $y, $z, $z\\}];", []>; -def SULD_1D_V2I32_TRAP - : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g), - (ins Int64Regs:$s, Int32Regs:$x), - "suld.b.1d.v2.b32.trap \\{$r, $g\\}, [$s, \\{$x\\}];", +def TEX_CUBE_F32_F32_LEVEL + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, + Float32Regs:$x, Float32Regs:$y, Float32Regs:$z, + Float32Regs:$lod), + "tex.level.cube.v4.f32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$x, $y, $z, $z\\}], $lod;", []>; -def SULD_1D_V4I8_TRAP - : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (ins Int64Regs:$s, Int32Regs:$x), - "suld.b.1d.v4.b8.trap \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];", +def TEX_CUBE_S32_F32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, + Float32Regs:$x, Float32Regs:$y, Float32Regs:$z), + "tex.cube.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$x, $y, $z, $z\\}];", []>; -def SULD_1D_V4I16_TRAP - : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (ins Int64Regs:$s, Int32Regs:$x), - "suld.b.1d.v4.b16.trap \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];", +def TEX_CUBE_S32_F32_LEVEL + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, + Float32Regs:$x, Float32Regs:$y, Float32Regs:$z, + Float32Regs:$lod), + "tex.level.cube.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$x, $y, $z, $z\\}], $lod;", []>; -def SULD_1D_V4I32_TRAP - : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), - (ins Int64Regs:$s, Int32Regs:$x), - "suld.b.1d.v4.b32.trap \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];", +def TEX_CUBE_U32_F32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, + Float32Regs:$x, Float32Regs:$y, Float32Regs:$z), + "tex.cube.v4.u32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$x, $y, $z, $z\\}];", + []>; +def TEX_CUBE_U32_F32_LEVEL + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, + Float32Regs:$x, Float32Regs:$y, Float32Regs:$z, + Float32Regs:$lod), + "tex.level.cube.v4.u32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$x, $y, $z, 
$z\\}], $lod;", []>; -def SULD_1D_ARRAY_I8_TRAP - : NVPTXInst<(outs Int16Regs:$r), - (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), - "suld.b.a1d.b8.trap \\{$r\\}, [$s, \\{$l, $x\\}];", +def TEX_CUBE_ARRAY_F32_F32 + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, + Float32Regs:$x, Float32Regs:$y, Float32Regs:$z), + "tex.acube.v4.f32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$l, $x, $y, $z\\}];", []>; -def SULD_1D_ARRAY_I16_TRAP - : NVPTXInst<(outs Int16Regs:$r), - (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), - "suld.b.a1d.b16.trap \\{$r\\}, [$s, \\{$l, $x\\}];", +def TEX_CUBE_ARRAY_F32_F32_LEVEL + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, + Float32Regs:$x, Float32Regs:$y, Float32Regs:$z, + Float32Regs:$lod), + "tex.level.acube.v4.f32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$l, $x, $y, $z\\}], $lod;", []>; -def SULD_1D_ARRAY_I32_TRAP - : NVPTXInst<(outs Int32Regs:$r), - (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), - "suld.b.a1d.b32.trap \\{$r\\}, [$s, \\{$l, $x\\}];", +def TEX_CUBE_ARRAY_S32_F32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, + Float32Regs:$x, Float32Regs:$y, Float32Regs:$z), + "tex.acube.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$l, $x, $y, $z\\}];", []>; -def SULD_1D_ARRAY_V2I8_TRAP - : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), - (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), - "suld.b.a1d.v2.b8.trap \\{$r, $g\\}, [$s, \\{$l, $x\\}];", +def TEX_CUBE_ARRAY_S32_F32_LEVEL + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, + Float32Regs:$x, Float32Regs:$y, Float32Regs:$z, + Float32Regs:$lod), + "tex.level.acube.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$l, $x, $y, $z\\}], $lod;", []>; -def SULD_1D_ARRAY_V2I16_TRAP - : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), - (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), - "suld.b.a1d.v2.b16.trap \\{$r, $g\\}, [$s, \\{$l, $x\\}];", +def TEX_CUBE_ARRAY_U32_F32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, + Float32Regs:$x, Float32Regs:$y, Float32Regs:$z), + "tex.acube.v4.u32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$l, $x, $y, $z\\}];", []>; -def SULD_1D_ARRAY_V2I32_TRAP - : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g), - (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), - "suld.b.a1d.v2.b32.trap \\{$r, $g\\}, [$s, \\{$l, $x\\}];", +def TEX_CUBE_ARRAY_U32_F32_LEVEL + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, + Float32Regs:$x, Float32Regs:$y, Float32Regs:$z, + Float32Regs:$lod), + "tex.level.acube.v4.u32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$l, $x, $y, $z\\}], $lod;", []>; -def SULD_1D_ARRAY_V4I8_TRAP - : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + +def TLD4_R_2D_F32_F32 + : NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1, + Float32Regs:$v2, Float32Regs:$v3), + (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y), + "tld4.r.2d.v4.f32.f32\t\\{$v0, $v1, $v2, $v3\\}, " + "[$t, $s, \\{$x, $y\\}];", + []>; +def TLD4_G_2D_F32_F32 + : NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1, + Float32Regs:$v2, Float32Regs:$v3), 
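+// tld4 is the PTX gather fetch: it reads the four texels that bilinear
+// filtering of ($x, $y) would touch and returns one component of each
+// (selected by the .r/.g/.b/.a in the mnemonic), one per destination
+// register -- the PTX counterpart of CUDA's tex2Dgather().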
+ (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y), + "tld4.g.2d.v4.f32.f32\t\\{$v0, $v1, $v2, $v3\\}, " + "[$t, $s, \\{$x, $y\\}];", + []>; +def TLD4_B_2D_F32_F32 + : NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1, + Float32Regs:$v2, Float32Regs:$v3), + (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y), + "tld4.b.2d.v4.f32.f32\t\\{$v0, $v1, $v2, $v3\\}, " + "[$t, $s, \\{$x, $y\\}];", + []>; +def TLD4_A_2D_F32_F32 + : NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1, + Float32Regs:$v2, Float32Regs:$v3), + (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y), + "tld4.a.2d.v4.f32.f32\t\\{$v0, $v1, $v2, $v3\\}, " + "[$t, $s, \\{$x, $y\\}];", + []>; +def TLD4_R_2D_S32_F32 + : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1, + Int32Regs:$v2, Int32Regs:$v3), + (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y), + "tld4.r.2d.v4.s32.f32\t\\{$v0, $v1, $v2, $v3\\}, " + "[$t, $s, \\{$x, $y\\}];", + []>; +def TLD4_G_2D_S32_F32 + : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1, + Int32Regs:$v2, Int32Regs:$v3), + (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y), + "tld4.g.2d.v4.s32.f32\t\\{$v0, $v1, $v2, $v3\\}, " + "[$t, $s, \\{$x, $y\\}];", + []>; +def TLD4_B_2D_S32_F32 + : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1, + Int32Regs:$v2, Int32Regs:$v3), + (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y), + "tld4.b.2d.v4.s32.f32\t\\{$v0, $v1, $v2, $v3\\}, " + "[$t, $s, \\{$x, $y\\}];", + []>; +def TLD4_A_2D_S32_F32 + : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1, + Int32Regs:$v2, Int32Regs:$v3), + (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y), + "tld4.a.2d.v4.s32.f32\t\\{$v0, $v1, $v2, $v3\\}, " + "[$t, $s, \\{$x, $y\\}];", + []>; +def TLD4_R_2D_U32_F32 + : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1, + Int32Regs:$v2, Int32Regs:$v3), + (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y), + "tld4.r.2d.v4.u32.f32\t\\{$v0, $v1, $v2, $v3\\}, " + "[$t, $s, \\{$x, $y\\}];", + []>; +def TLD4_G_2D_U32_F32 + : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1, + Int32Regs:$v2, Int32Regs:$v3), + (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y), + "tld4.g.2d.v4.u32.f32\t\\{$v0, $v1, $v2, $v3\\}, " + "[$t, $s, \\{$x, $y\\}];", + []>; +def TLD4_B_2D_U32_F32 + : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1, + Int32Regs:$v2, Int32Regs:$v3), + (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y), + "tld4.b.2d.v4.u32.f32\t\\{$v0, $v1, $v2, $v3\\}, " + "[$t, $s, \\{$x, $y\\}];", + []>; +def TLD4_A_2D_U32_F32 + : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1, + Int32Regs:$v2, Int32Regs:$v3), + (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y), + "tld4.a.2d.v4.u32.f32\t\\{$v0, $v1, $v2, $v3\\}, " + "[$t, $s, \\{$x, $y\\}];", + []>; +} + + +// texmode_unified +let IsTex = 1, IsTexModeUnified = 1 in { +// Texture fetch instructions using handles +def TEX_UNIFIED_1D_F32_S32 + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Int32Regs:$x), + "tex.1d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, [$t, \\{$x\\}];", + []>; +def TEX_UNIFIED_1D_F32_F32 + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Float32Regs:$x), + "tex.1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, [$t, \\{$x\\}];", + []>; +def TEX_UNIFIED_1D_F32_F32_LEVEL + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$lod), + 
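+// The _UNIFIED forms implement texmode_unified: sampler state travels with
+// the texture handle itself, so these ins lists drop the separate
+// Int64Regs:$s sampler operand that the texmode_independent forms above
+// require; the tex.* mnemonics are otherwise unchanged.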
"tex.level.1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$x\\}], $lod;", + []>; +def TEX_UNIFIED_1D_F32_F32_GRAD + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Float32Regs:$x, + Float32Regs:$gradx, Float32Regs:$grady), + "tex.grad.1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$x\\}], \\{$gradx\\}, \\{$grady\\};", + []>; +def TEX_UNIFIED_1D_S32_S32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int32Regs:$x), + "tex.1d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, [$t, \\{$x\\}];", + []>; +def TEX_UNIFIED_1D_S32_F32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Float32Regs:$x), + "tex.1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, [$t, \\{$x\\}];", + []>; +def TEX_UNIFIED_1D_S32_F32_LEVEL + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Float32Regs:$x, + Float32Regs:$lod), + "tex.level.1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$x\\}], $lod;", + []>; +def TEX_UNIFIED_1D_S32_F32_GRAD + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Float32Regs:$x, + Float32Regs:$gradx, Float32Regs:$grady), + "tex.grad.1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$x\\}], \\{$gradx\\}, \\{$grady\\};", + []>; +def TEX_UNIFIED_1D_U32_S32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int32Regs:$x), + "tex.1d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, [$t, \\{$x\\}];", + []>; +def TEX_UNIFIED_1D_U32_F32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Float32Regs:$x), + "tex.1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, [$t, \\{$x\\}];", + []>; +def TEX_UNIFIED_1D_U32_F32_LEVEL + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Float32Regs:$x, + Float32Regs:$lod), + "tex.level.1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$x\\}], $lod;", + []>; +def TEX_UNIFIED_1D_U32_F32_GRAD + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Float32Regs:$x, + Float32Regs:$gradx, Float32Regs:$grady), + "tex.grad.1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$x\\}], \\{$gradx\\}, \\{$grady\\};", + []>; + +def TEX_UNIFIED_1D_ARRAY_F32_S32 + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Int32Regs:$l, Int32Regs:$x), + "tex.a1d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$l, $x\\}];", + []>; +def TEX_UNIFIED_1D_ARRAY_F32_F32 + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x), + "tex.a1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$l, $x\\}];", + []>; +def TEX_UNIFIED_1D_ARRAY_F32_F32_LEVEL + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x, + Float32Regs:$lod), + "tex.level.a1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$l, $x\\}], $lod;", + []>; +def TEX_UNIFIED_1D_ARRAY_F32_F32_GRAD + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x, + Float32Regs:$gradx, Float32Regs:$grady), + "tex.grad.a1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$l, $x\\}], \\{$gradx\\}, \\{$grady\\};", + []>; +def TEX_UNIFIED_1D_ARRAY_S32_S32 + : NVPTXInst<(outs Int32Regs:$r, 
Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int32Regs:$l, Int32Regs:$x), + "tex.a1d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$l, $x\\}];", + []>; +def TEX_UNIFIED_1D_ARRAY_S32_F32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x), + "tex.a1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$l, $x\\}];", + []>; +def TEX_UNIFIED_1D_ARRAY_S32_F32_LEVEL + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x, + Float32Regs:$lod), + "tex.level.a1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$l, $x\\}], $lod;", + []>; +def TEX_UNIFIED_1D_ARRAY_S32_F32_GRAD + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x, + Float32Regs:$gradx, Float32Regs:$grady), + "tex.grad.a1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$l, $x\\}], \\{$gradx\\}, \\{$grady\\};", + []>; +def TEX_UNIFIED_1D_ARRAY_U32_S32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int32Regs:$l, Int32Regs:$x), + "tex.a1d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$l, $x\\}];", + []>; +def TEX_UNIFIED_1D_ARRAY_U32_F32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x), + "tex.a1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$l, $x\\}];", + []>; +def TEX_UNIFIED_1D_ARRAY_U32_F32_LEVEL + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x, + Float32Regs:$lod), + "tex.level.a1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$l, $x\\}], $lod;", + []>; +def TEX_UNIFIED_1D_ARRAY_U32_F32_GRAD + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x, + Float32Regs:$gradx, Float32Regs:$grady), + "tex.grad.a1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$l, $x\\}], \\{$gradx\\}, \\{$grady\\};", + []>; + +def TEX_UNIFIED_2D_F32_S32 + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Int32Regs:$x, Int32Regs:$y), + "tex.2d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$x, $y\\}];", + []>; +def TEX_UNIFIED_2D_F32_F32 + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y), + "tex.2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$x, $y\\}];", + []>; +def TEX_UNIFIED_2D_F32_F32_LEVEL + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y, + Float32Regs:$lod), + "tex.level.2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$x, $y\\}], $lod;", + []>; +def TEX_UNIFIED_2D_F32_F32_GRAD + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y, + Float32Regs:$gradx0, Float32Regs:$gradx1, + Float32Regs:$grady0, Float32Regs:$grady1), + "tex.grad.2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$x, $y\\}], \\{$gradx0, $gradx1\\}, " + "\\{$grady0, $grady1\\};", + []>; +def TEX_UNIFIED_2D_S32_S32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int32Regs:$x, Int32Regs:$y), + "tex.2d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$x, $y\\}];", + []>; +def TEX_UNIFIED_2D_S32_F32 
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y), + "tex.2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$x, $y\\}];", + []>; +def TEX_UNIFIED_2D_S32_F32_LEVEL + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y, + Float32Regs:$lod), + "tex.level.2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$x, $y\\}], $lod;", + []>; +def TEX_UNIFIED_2D_S32_F32_GRAD + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y, + Float32Regs:$gradx0, Float32Regs:$gradx1, + Float32Regs:$grady0, Float32Regs:$grady1), + "tex.grad.2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$x, $y\\}], \\{$gradx0, $gradx1\\}, " + "\\{$grady0, $grady1\\};", + []>; +def TEX_UNIFIED_2D_U32_S32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int32Regs:$x, Int32Regs:$y), + "tex.2d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$x, $y\\}];", + []>; +def TEX_UNIFIED_2D_U32_F32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y), + "tex.2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$x, $y\\}];", + []>; +def TEX_UNIFIED_2D_U32_F32_LEVEL + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y, + Float32Regs:$lod), + "tex.level.2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$x, $y\\}], $lod;", + []>; +def TEX_UNIFIED_2D_U32_F32_GRAD + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y, + Float32Regs:$gradx0, Float32Regs:$gradx1, + Float32Regs:$grady0, Float32Regs:$grady1), + "tex.grad.2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$x, $y\\}], \\{$gradx0, $gradx1\\}, " + "\\{$grady0, $grady1\\};", + []>; + +def TEX_UNIFIED_2D_ARRAY_F32_S32 + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Int32Regs:$l, Int32Regs:$x, + Int32Regs:$y), + "tex.a2d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$l, $x, $y, $y\\}];", + []>; +def TEX_UNIFIED_2D_ARRAY_F32_F32 + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x, + Float32Regs:$y), + "tex.a2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$l, $x, $y, $y\\}];", + []>; +def TEX_UNIFIED_2D_ARRAY_F32_F32_LEVEL + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x, + Float32Regs:$y, Float32Regs:$lod), + "tex.level.a2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$l, $x, $y, $y\\}], $lod;", + []>; +def TEX_UNIFIED_2D_ARRAY_F32_F32_GRAD + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x, + Float32Regs:$y, Float32Regs:$gradx0, Float32Regs:$gradx1, + Float32Regs:$grady0, Float32Regs:$grady1), + "tex.grad.a2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$l, $x, $y, $y\\}], \\{$gradx0, $gradx1\\}, " + "\\{$grady0, $grady1\\};", + []>; +def TEX_UNIFIED_2D_ARRAY_S32_S32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int32Regs:$l, Int32Regs:$x, + Int32Regs:$y), + "tex.a2d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$l, 
$x, $y, $y\\}];", + []>; +def TEX_UNIFIED_2D_ARRAY_S32_F32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x, + Float32Regs:$y), + "tex.a2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$l, $x, $y, $y\\}];", + []>; +def TEX_UNIFIED_2D_ARRAY_S32_F32_LEVEL + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x, + Float32Regs:$y, Float32Regs:$lod), + "tex.level.a2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$l, $x, $y, $y\\}], $lod;", + []>; +def TEX_UNIFIED_2D_ARRAY_S32_F32_GRAD + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x, + Float32Regs:$y, + Float32Regs:$gradx0, Float32Regs:$gradx1, + Float32Regs:$grady0, Float32Regs:$grady1), + "tex.grad.a2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$l, $x, $y, $y\\}], \\{$gradx0, $gradx1\\}, " + "\\{$grady0, $grady1\\};", + []>; +def TEX_UNIFIED_2D_ARRAY_U32_S32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int32Regs:$l, Int32Regs:$x, + Int32Regs:$y), + "tex.a2d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$l, $x, $y, $y\\}];", + []>; +def TEX_UNIFIED_2D_ARRAY_U32_F32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x, + Float32Regs:$y), + "tex.a2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$l, $x, $y, $y\\}];", + []>; +def TEX_UNIFIED_2D_ARRAY_U32_F32_LEVEL + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x, + Float32Regs:$y, Float32Regs:$lod), + "tex.level.a2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$l, $x, $y, $y\\}], $lod;", + []>; +def TEX_UNIFIED_2D_ARRAY_U32_F32_GRAD + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x, + Float32Regs:$y, + Float32Regs:$gradx0, Float32Regs:$gradx1, + Float32Regs:$grady0, Float32Regs:$grady1), + "tex.grad.a2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$l, $x, $y, $y\\}], \\{$gradx0, $gradx1\\}, " + "\\{$grady0, $grady1\\};", + []>; + +def TEX_UNIFIED_3D_F32_S32 + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$z), + "tex.3d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$x, $y, $z, $z\\}];", + []>; +def TEX_UNIFIED_3D_F32_F32 + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y, + Float32Regs:$z), + "tex.3d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$x, $y, $z, $z\\}];", + []>; +def TEX_UNIFIED_3D_F32_F32_LEVEL + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y, + Float32Regs:$z, Float32Regs:$lod), + "tex.level.3d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$x, $y, $z, $z\\}], $lod;", + []>; +def TEX_UNIFIED_3D_F32_F32_GRAD + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y, + Float32Regs:$z, + Float32Regs:$gradx0, Float32Regs:$gradx1, + Float32Regs:$gradx2, Float32Regs:$grady0, + Float32Regs:$grady1, Float32Regs:$grady2), + "tex.grad.3d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$x, $y, $z, $z\\}], " + 
"\\{$gradx0, $gradx1, $gradx2, $gradx2\\}, " + "\\{$grady0, $grady1, $grady2, $grady2\\};", + []>; +def TEX_UNIFIED_3D_S32_S32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$z), + "tex.3d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$x, $y, $z, $z\\}];", + []>; +def TEX_UNIFIED_3D_S32_F32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y, + Float32Regs:$z), + "tex.3d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$x, $y, $z, $z\\}];", + []>; +def TEX_UNIFIED_3D_S32_F32_LEVEL + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y, + Float32Regs:$z, Float32Regs:$lod), + "tex.level.3d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$x, $y, $z, $z\\}], $lod;", + []>; +def TEX_UNIFIED_3D_S32_F32_GRAD + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y, + Float32Regs:$z, + Float32Regs:$gradx0, Float32Regs:$gradx1, + Float32Regs:$gradx2, Float32Regs:$grady0, + Float32Regs:$grady1, Float32Regs:$grady2), + "tex.grad.3d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$x, $y, $z, $z\\}], " + "\\{$gradx0, $gradx1, $gradx2, $gradx2\\}, " + "\\{$grady0, $grady1, $grady2, $grady2\\};", + []>; +def TEX_UNIFIED_3D_U32_S32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$z), + "tex.3d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$x, $y, $z, $z\\}];", + []>; +def TEX_UNIFIED_3D_U32_F32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y, + Float32Regs:$z), + "tex.3d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$x, $y, $z, $z\\}];", + []>; +def TEX_UNIFIED_3D_U32_F32_LEVEL + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y, + Float32Regs:$z, Float32Regs:$lod), + "tex.level.3d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$x, $y, $z, $z\\}], $lod;", + []>; +def TEX_UNIFIED_3D_U32_F32_GRAD + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y, + Float32Regs:$z, + Float32Regs:$gradx0, Float32Regs:$gradx1, + Float32Regs:$gradx2, Float32Regs:$grady0, + Float32Regs:$grady1, Float32Regs:$grady2), + "tex.grad.3d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$x, $y, $z, $z\\}], " + "\\{$gradx0, $gradx1, $gradx2, $gradx2\\}, " + "\\{$grady0, $grady1, $grady2, $grady2\\};", + []>; + +def TEX_UNIFIED_CUBE_F32_F32 + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, + Float32Regs:$x, Float32Regs:$y, Float32Regs:$z), + "tex.cube.v4.f32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$x, $y, $z, $z\\}];", + []>; +def TEX_UNIFIED_CUBE_F32_F32_LEVEL + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, + Float32Regs:$x, Float32Regs:$y, Float32Regs:$z, + Float32Regs:$lod), + "tex.level.cube.v4.f32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$x, $y, $z, $z\\}], $lod;", + []>; +def TEX_UNIFIED_CUBE_S32_F32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, + Float32Regs:$x, Float32Regs:$y, Float32Regs:$z), + 
"tex.cube.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$x, $y, $z, $z\\}];", + []>; +def TEX_UNIFIED_CUBE_S32_F32_LEVEL + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, + Float32Regs:$x, Float32Regs:$y, Float32Regs:$z, + Float32Regs:$lod), + "tex.level.cube.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$x, $y, $z, $z\\}], $lod;", + []>; +def TEX_UNIFIED_CUBE_U32_F32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, + Float32Regs:$x, Float32Regs:$y, Float32Regs:$z), + "tex.cube.v4.u32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$x, $y, $z, $z\\}];", + []>; +def TEX_UNIFIED_CUBE_U32_F32_LEVEL + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, + Float32Regs:$x, Float32Regs:$y, Float32Regs:$z, + Float32Regs:$lod), + "tex.level.cube.v4.u32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$x, $y, $z, $z\\}], $lod;", + []>; + +def TEX_UNIFIED_CUBE_ARRAY_F32_F32 + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Int32Regs:$l, + Float32Regs:$x, Float32Regs:$y, Float32Regs:$z), + "tex.acube.v4.f32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$l, $x, $y, $z\\}];", + []>; +def TEX_UNIFIED_CUBE_ARRAY_F32_F32_LEVEL + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Int32Regs:$l, + Float32Regs:$x, Float32Regs:$y, Float32Regs:$z, + Float32Regs:$lod), + "tex.level.acube.v4.f32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$l, $x, $y, $z\\}], $lod;", + []>; +def TEX_UNIFIED_CUBE_ARRAY_S32_F32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int32Regs:$l, + Float32Regs:$x, Float32Regs:$y, Float32Regs:$z), + "tex.acube.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$l, $x, $y, $z\\}];", + []>; +def TEX_UNIFIED_CUBE_ARRAY_S32_F32_LEVEL + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int32Regs:$l, + Float32Regs:$x, Float32Regs:$y, Float32Regs:$z, + Float32Regs:$lod), + "tex.level.acube.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$l, $x, $y, $z\\}], $lod;", + []>; +def TEX_UNIFIED_CUBE_ARRAY_U32_F32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int32Regs:$l, + Float32Regs:$x, Float32Regs:$y, Float32Regs:$z), + "tex.acube.v4.u32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$l, $x, $y, $z\\}];", + []>; +def TEX_UNIFIED_CUBE_ARRAY_U32_F32_LEVEL + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int32Regs:$l, + Float32Regs:$x, Float32Regs:$y, Float32Regs:$z, + Float32Regs:$lod), + "tex.level.acube.v4.u32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$l, $x, $y, $z\\}], $lod;", + []>; + +def TLD4_UNIFIED_R_2D_F32_F32 + : NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1, + Float32Regs:$v2, Float32Regs:$v3), + (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y), + "tld4.r.2d.v4.f32.f32\t\\{$v0, $v1, $v2, $v3\\}, " + "[$t, \\{$x, $y\\}];", + []>; +def TLD4_UNIFIED_G_2D_F32_F32 + : NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1, + Float32Regs:$v2, Float32Regs:$v3), + (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y), + "tld4.g.2d.v4.f32.f32\t\\{$v0, $v1, $v2, $v3\\}, " + "[$t, \\{$x, $y\\}];", + []>; +def TLD4_UNIFIED_B_2D_F32_F32 + : NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1, + Float32Regs:$v2, Float32Regs:$v3), + (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y), + 
"tld4.b.2d.v4.f32.f32\t\\{$v0, $v1, $v2, $v3\\}, " + "[$t, \\{$x, $y\\}];", + []>; +def TLD4_UNIFIED_A_2D_F32_F32 + : NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1, + Float32Regs:$v2, Float32Regs:$v3), + (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y), + "tld4.a.2d.v4.f32.f32\t\\{$v0, $v1, $v2, $v3\\}, " + "[$t, \\{$x, $y\\}];", + []>; +def TLD4_UNIFIED_R_2D_S32_F32 + : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1, + Int32Regs:$v2, Int32Regs:$v3), + (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y), + "tld4.r.2d.v4.s32.f32\t\\{$v0, $v1, $v2, $v3\\}, " + "[$t, \\{$x, $y\\}];", + []>; +def TLD4_UNIFIED_G_2D_S32_F32 + : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1, + Int32Regs:$v2, Int32Regs:$v3), + (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y), + "tld4.g.2d.v4.s32.f32\t\\{$v0, $v1, $v2, $v3\\}, " + "[$t, \\{$x, $y\\}];", + []>; +def TLD4_UNIFIED_B_2D_S32_F32 + : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1, + Int32Regs:$v2, Int32Regs:$v3), + (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y), + "tld4.b.2d.v4.s32.f32\t\\{$v0, $v1, $v2, $v3\\}, " + "[$t, \\{$x, $y\\}];", + []>; +def TLD4_UNIFIED_A_2D_S32_F32 + : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1, + Int32Regs:$v2, Int32Regs:$v3), + (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y), + "tld4.a.2d.v4.s32.f32\t\\{$v0, $v1, $v2, $v3\\}, " + "[$t, \\{$x, $y\\}];", + []>; +def TLD4_UNIFIED_R_2D_U32_F32 + : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1, + Int32Regs:$v2, Int32Regs:$v3), + (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y), + "tld4.r.2d.v4.u32.f32\t\\{$v0, $v1, $v2, $v3\\}, " + "[$t, \\{$x, $y\\}];", + []>; +def TLD4_UNIFIED_G_2D_U32_F32 + : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1, + Int32Regs:$v2, Int32Regs:$v3), + (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y), + "tld4.g.2d.v4.u32.f32\t\\{$v0, $v1, $v2, $v3\\}, " + "[$t, \\{$x, $y\\}];", + []>; +def TLD4_UNIFIED_B_2D_U32_F32 + : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1, + Int32Regs:$v2, Int32Regs:$v3), + (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y), + "tld4.b.2d.v4.u32.f32\t\\{$v0, $v1, $v2, $v3\\}, " + "[$t, \\{$x, $y\\}];", + []>; +def TLD4_UNIFIED_A_2D_U32_F32 + : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1, + Int32Regs:$v2, Int32Regs:$v3), + (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y), + "tld4.a.2d.v4.u32.f32\t\\{$v0, $v1, $v2, $v3\\}, " + "[$t, \\{$x, $y\\}];", + []>; +} + + + +//=== Surface load instructions +// .clamp variant +let IsSuld = 1 in { +def SULD_1D_I8_CLAMP + : NVPTXInst<(outs Int16Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x), + "suld.b.1d.b8.clamp \\{$r\\}, [$s, \\{$x\\}];", + []>; +def SULD_1D_I16_CLAMP + : NVPTXInst<(outs Int16Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x), + "suld.b.1d.b16.clamp \\{$r\\}, [$s, \\{$x\\}];", + []>; +def SULD_1D_I32_CLAMP + : NVPTXInst<(outs Int32Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x), + "suld.b.1d.b32.clamp \\{$r\\}, [$s, \\{$x\\}];", + []>; +def SULD_1D_I64_CLAMP + : NVPTXInst<(outs Int64Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x), + "suld.b.1d.b64.clamp \\{$r\\}, [$s, \\{$x\\}];", + []>; + +def SULD_1D_ARRAY_I8_CLAMP + : NVPTXInst<(outs Int16Regs:$r), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "suld.b.a1d.b8.clamp \\{$r\\}, [$s, \\{$l, $x\\}];", + []>; +def SULD_1D_ARRAY_I16_CLAMP + : NVPTXInst<(outs Int16Regs:$r), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "suld.b.a1d.b16.clamp \\{$r\\}, [$s, \\{$l, $x\\}];", + []>; +def SULD_1D_ARRAY_I32_CLAMP + : NVPTXInst<(outs Int32Regs:$r), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "suld.b.a1d.b32.clamp 
\\{$r\\}, [$s, \\{$l, $x\\}];", + []>; +def SULD_1D_ARRAY_I64_CLAMP + : NVPTXInst<(outs Int64Regs:$r), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "suld.b.a1d.b64.clamp \\{$r\\}, [$s, \\{$l, $x\\}];", + []>; + +def SULD_2D_I8_CLAMP + : NVPTXInst<(outs Int16Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "suld.b.2d.b8.clamp \\{$r\\}, [$s, \\{$x, $y\\}];", + []>; +def SULD_2D_I16_CLAMP + : NVPTXInst<(outs Int16Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "suld.b.2d.b16.clamp \\{$r\\}, [$s, \\{$x, $y\\}];", + []>; +def SULD_2D_I32_CLAMP + : NVPTXInst<(outs Int32Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "suld.b.2d.b32.clamp \\{$r\\}, [$s, \\{$x, $y\\}];", + []>; +def SULD_2D_I64_CLAMP + : NVPTXInst<(outs Int64Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "suld.b.2d.b64.clamp \\{$r\\}, [$s, \\{$x, $y\\}];", + []>; + +def SULD_2D_ARRAY_I8_CLAMP + : NVPTXInst<(outs Int16Regs:$r), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), + "suld.b.a2d.b8.clamp \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];", + []>; +def SULD_2D_ARRAY_I16_CLAMP + : NVPTXInst<(outs Int16Regs:$r), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), + "suld.b.a2d.b16.clamp \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];", + []>; +def SULD_2D_ARRAY_I32_CLAMP + : NVPTXInst<(outs Int32Regs:$r), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), + "suld.b.a2d.b32.clamp \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];", + []>; +def SULD_2D_ARRAY_I64_CLAMP + : NVPTXInst<(outs Int64Regs:$r), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), + "suld.b.a2d.b64.clamp \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];", + []>; + +def SULD_3D_I8_CLAMP + : NVPTXInst<(outs Int16Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), + "suld.b.3d.b8.clamp \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];", + []>; +def SULD_3D_I16_CLAMP + : NVPTXInst<(outs Int16Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), + "suld.b.3d.b16.clamp \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];", + []>; +def SULD_3D_I32_CLAMP + : NVPTXInst<(outs Int32Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), + "suld.b.3d.b32.clamp \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];", + []>; +def SULD_3D_I64_CLAMP + : NVPTXInst<(outs Int64Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), + "suld.b.3d.b64.clamp \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];", + []>; +} + +let IsSuld = 2 in { +def SULD_1D_V2I8_CLAMP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x), + "suld.b.1d.v2.b8.clamp \\{$r, $g\\}, [$s, \\{$x\\}];", + []>; +def SULD_1D_V2I16_CLAMP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x), + "suld.b.1d.v2.b16.clamp \\{$r, $g\\}, [$s, \\{$x\\}];", + []>; +def SULD_1D_V2I32_CLAMP + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x), + "suld.b.1d.v2.b32.clamp \\{$r, $g\\}, [$s, \\{$x\\}];", + []>; +def SULD_1D_V2I64_CLAMP + : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x), + "suld.b.1d.v2.b64.clamp \\{$r, $g\\}, [$s, \\{$x\\}];", + []>; + +def SULD_1D_ARRAY_V2I8_CLAMP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "suld.b.a1d.v2.b8.clamp \\{$r, $g\\}, [$s, \\{$l, $x\\}];", + []>; +def SULD_1D_ARRAY_V2I16_CLAMP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "suld.b.a1d.v2.b16.clamp \\{$r, $g\\}, 
[$s, \\{$l, $x\\}];", + []>; +def SULD_1D_ARRAY_V2I32_CLAMP + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "suld.b.a1d.v2.b32.clamp \\{$r, $g\\}, [$s, \\{$l, $x\\}];", + []>; +def SULD_1D_ARRAY_V2I64_CLAMP + : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "suld.b.a1d.v2.b64.clamp \\{$r, $g\\}, [$s, \\{$l, $x\\}];", + []>; + +def SULD_2D_V2I8_CLAMP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "suld.b.2d.v2.b8.clamp \\{$r, $g\\}, [$s, \\{$x, $y\\}];", + []>; +def SULD_2D_V2I16_CLAMP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "suld.b.2d.v2.b16.clamp \\{$r, $g\\}, [$s, \\{$x, $y\\}];", + []>; +def SULD_2D_V2I32_CLAMP + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "suld.b.2d.v2.b32.clamp \\{$r, $g\\}, [$s, \\{$x, $y\\}];", + []>; +def SULD_2D_V2I64_CLAMP + : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "suld.b.2d.v2.b64.clamp \\{$r, $g\\}, [$s, \\{$x, $y\\}];", + []>; + +def SULD_2D_ARRAY_V2I8_CLAMP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), + "suld.b.a2d.v2.b8.clamp \\{$r, $g\\}, " + "[$s, \\{$l, $x, $y, $y\\}];", + []>; +def SULD_2D_ARRAY_V2I16_CLAMP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), + "suld.b.a2d.v2.b16.clamp \\{$r, $g\\}, " + "[$s, \\{$l, $x, $y, $y\\}];", + []>; +def SULD_2D_ARRAY_V2I32_CLAMP + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), + "suld.b.a2d.v2.b32.clamp \\{$r, $g\\}, " + "[$s, \\{$l, $x, $y, $y\\}];", + []>; +def SULD_2D_ARRAY_V2I64_CLAMP + : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), + "suld.b.a2d.v2.b64.clamp \\{$r, $g\\}, " + "[$s, \\{$l, $x, $y, $y\\}];", + []>; + +def SULD_3D_V2I8_CLAMP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), + "suld.b.3d.v2.b8.clamp \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];", + []>; +def SULD_3D_V2I16_CLAMP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), + "suld.b.3d.v2.b16.clamp \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];", + []>; +def SULD_3D_V2I32_CLAMP + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), + "suld.b.3d.v2.b32.clamp \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];", + []>; +def SULD_3D_V2I64_CLAMP + : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), + "suld.b.3d.v2.b64.clamp \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];", + []>; +} + +let IsSuld = 3 in { +def SULD_1D_V4I8_CLAMP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (ins Int64Regs:$s, Int32Regs:$x), + "suld.b.1d.v4.b8.clamp \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];", + []>; +def SULD_1D_V4I16_CLAMP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (ins Int64Regs:$s, Int32Regs:$x), + "suld.b.1d.v4.b16.clamp \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];", + []>; +def SULD_1D_V4I32_CLAMP + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$s, Int32Regs:$x), + 
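+// The suffix after the bit width picks the out-of-bounds policy: .clamp
+// clamps the coordinate into the surface bounds, while the .trap forms
+// below fault on an out-of-range access (PTX also defines a .zero mode).
+// The IsSuld = 1/2/3 wrappers mark the scalar, v2 and v4 forms.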
"suld.b.1d.v4.b32.clamp \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];", + []>; + +def SULD_1D_ARRAY_V4I8_CLAMP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "suld.b.a1d.v4.b8.clamp \\{$r, $g, $b, $a\\}, " + "[$s, \\{$l, $x\\}];", + []>; +def SULD_1D_ARRAY_V4I16_CLAMP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "suld.b.a1d.v4.b16.clamp \\{$r, $g, $b, $a\\}, " + "[$s, \\{$l, $x\\}];", + []>; +def SULD_1D_ARRAY_V4I32_CLAMP + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "suld.b.a1d.v4.b32.clamp \\{$r, $g, $b, $a\\}, " + "[$s, \\{$l, $x\\}];", + []>; + +def SULD_2D_V4I8_CLAMP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "suld.b.2d.v4.b8.clamp \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];", + []>; +def SULD_2D_V4I16_CLAMP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "suld.b.2d.v4.b16.clamp \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];", + []>; +def SULD_2D_V4I32_CLAMP + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "suld.b.2d.v4.b32.clamp \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];", + []>; + +def SULD_2D_ARRAY_V4I8_CLAMP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), + "suld.b.a2d.v4.b8.clamp \\{$r, $g, $b, $a\\}, " + "[$s, \\{$l, $x, $y, $y\\}];", + []>; +def SULD_2D_ARRAY_V4I16_CLAMP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), + "suld.b.a2d.v4.b16.clamp \\{$r, $g, $b, $a\\}, " + "[$s, \\{$l, $x, $y, $y\\}];", + []>; +def SULD_2D_ARRAY_V4I32_CLAMP + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), + "suld.b.a2d.v4.b32.clamp \\{$r, $g, $b, $a\\}, " + "[$s, \\{$l, $x, $y, $y\\}];", + []>; + + +def SULD_3D_V4I8_CLAMP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), + "suld.b.3d.v4.b8.clamp \\{$r, $g, $b, $a\\}, " + "[$s, \\{$x, $y, $z, $z\\}];", + []>; +def SULD_3D_V4I16_CLAMP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), + "suld.b.3d.v4.b16.clamp \\{$r, $g, $b, $a\\}, " + "[$s, \\{$x, $y, $z, $z\\}];", + []>; +def SULD_3D_V4I32_CLAMP + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), + "suld.b.3d.v4.b32.clamp \\{$r, $g, $b, $a\\}, " + "[$s, \\{$x, $y, $z, $z\\}];", + []>; +} + + +// .trap variant +let IsSuld = 1 in { +def SULD_1D_I8_TRAP + : NVPTXInst<(outs Int16Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x), + "suld.b.1d.b8.trap \\{$r\\}, [$s, \\{$x\\}];", + []>; +def SULD_1D_I16_TRAP + : NVPTXInst<(outs Int16Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x), + "suld.b.1d.b16.trap \\{$r\\}, [$s, \\{$x\\}];", + []>; +def SULD_1D_I32_TRAP + : NVPTXInst<(outs Int32Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x), + "suld.b.1d.b32.trap \\{$r\\}, [$s, \\{$x\\}];", + []>; +def 
SULD_1D_I64_TRAP + : NVPTXInst<(outs Int64Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x), + "suld.b.1d.b64.trap \\{$r\\}, [$s, \\{$x\\}];", + []>; + +def SULD_1D_ARRAY_I8_TRAP + : NVPTXInst<(outs Int16Regs:$r), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "suld.b.a1d.b8.trap \\{$r\\}, [$s, \\{$l, $x\\}];", + []>; +def SULD_1D_ARRAY_I16_TRAP + : NVPTXInst<(outs Int16Regs:$r), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "suld.b.a1d.b16.trap \\{$r\\}, [$s, \\{$l, $x\\}];", + []>; +def SULD_1D_ARRAY_I32_TRAP + : NVPTXInst<(outs Int32Regs:$r), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "suld.b.a1d.b32.trap \\{$r\\}, [$s, \\{$l, $x\\}];", + []>; +def SULD_1D_ARRAY_I64_TRAP + : NVPTXInst<(outs Int64Regs:$r), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "suld.b.a1d.b64.trap \\{$r\\}, [$s, \\{$l, $x\\}];", + []>; + +def SULD_2D_I8_TRAP + : NVPTXInst<(outs Int16Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "suld.b.2d.b8.trap \\{$r\\}, [$s, \\{$x, $y\\}];", + []>; +def SULD_2D_I16_TRAP + : NVPTXInst<(outs Int16Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "suld.b.2d.b16.trap \\{$r\\}, [$s, \\{$x, $y\\}];", + []>; +def SULD_2D_I32_TRAP + : NVPTXInst<(outs Int32Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "suld.b.2d.b32.trap \\{$r\\}, [$s, \\{$x, $y\\}];", + []>; +def SULD_2D_I64_TRAP + : NVPTXInst<(outs Int64Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "suld.b.2d.b64.trap \\{$r\\}, [$s, \\{$x, $y\\}];", + []>; + +def SULD_2D_ARRAY_I8_TRAP + : NVPTXInst<(outs Int16Regs:$r), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), + "suld.b.a2d.b8.trap \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];", + []>; +def SULD_2D_ARRAY_I16_TRAP + : NVPTXInst<(outs Int16Regs:$r), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), + "suld.b.a2d.b16.trap \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];", + []>; +def SULD_2D_ARRAY_I32_TRAP + : NVPTXInst<(outs Int32Regs:$r), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), + "suld.b.a2d.b32.trap \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];", + []>; +def SULD_2D_ARRAY_I64_TRAP + : NVPTXInst<(outs Int64Regs:$r), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), + "suld.b.a2d.b64.trap \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];", + []>; + +def SULD_3D_I8_TRAP + : NVPTXInst<(outs Int16Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), + "suld.b.3d.b8.trap \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];", + []>; +def SULD_3D_I16_TRAP + : NVPTXInst<(outs Int16Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), + "suld.b.3d.b16.trap \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];", + []>; +def SULD_3D_I32_TRAP + : NVPTXInst<(outs Int32Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), + "suld.b.3d.b32.trap \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];", + []>; +def SULD_3D_I64_TRAP + : NVPTXInst<(outs Int64Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), + "suld.b.3d.b64.trap \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];", + []>; +} + +let IsSuld = 2 in { +def SULD_1D_V2I8_TRAP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x), + "suld.b.1d.v2.b8.trap \\{$r, $g\\}, [$s, \\{$x\\}];", + []>; +def SULD_1D_V2I16_TRAP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x), + "suld.b.1d.v2.b16.trap \\{$r, $g\\}, [$s, \\{$x\\}];", + []>; +def SULD_1D_V2I32_TRAP + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x), + 
"suld.b.1d.v2.b32.trap \\{$r, $g\\}, [$s, \\{$x\\}];", + []>; +def SULD_1D_V2I64_TRAP + : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x), + "suld.b.1d.v2.b64.trap \\{$r, $g\\}, [$s, \\{$x\\}];", + []>; + +def SULD_1D_ARRAY_V2I8_TRAP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "suld.b.a1d.v2.b8.trap \\{$r, $g\\}, [$s, \\{$l, $x\\}];", + []>; +def SULD_1D_ARRAY_V2I16_TRAP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "suld.b.a1d.v2.b16.trap \\{$r, $g\\}, [$s, \\{$l, $x\\}];", + []>; +def SULD_1D_ARRAY_V2I32_TRAP + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "suld.b.a1d.v2.b32.trap \\{$r, $g\\}, [$s, \\{$l, $x\\}];", + []>; +def SULD_1D_ARRAY_V2I64_TRAP + : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "suld.b.a1d.v2.b64.trap \\{$r, $g\\}, [$s, \\{$l, $x\\}];", + []>; + +def SULD_2D_V2I8_TRAP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "suld.b.2d.v2.b8.trap \\{$r, $g\\}, [$s, \\{$x, $y\\}];", + []>; +def SULD_2D_V2I16_TRAP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "suld.b.2d.v2.b16.trap \\{$r, $g\\}, [$s, \\{$x, $y\\}];", + []>; +def SULD_2D_V2I32_TRAP + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "suld.b.2d.v2.b32.trap \\{$r, $g\\}, [$s, \\{$x, $y\\}];", + []>; +def SULD_2D_V2I64_TRAP + : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "suld.b.2d.v2.b64.trap \\{$r, $g\\}, [$s, \\{$x, $y\\}];", + []>; + +def SULD_2D_ARRAY_V2I8_TRAP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), + "suld.b.a2d.v2.b8.trap \\{$r, $g\\}, " + "[$s, \\{$l, $x, $y, $y\\}];", + []>; +def SULD_2D_ARRAY_V2I16_TRAP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), + "suld.b.a2d.v2.b16.trap \\{$r, $g\\}, " + "[$s, \\{$l, $x, $y, $y\\}];", + []>; +def SULD_2D_ARRAY_V2I32_TRAP + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), + "suld.b.a2d.v2.b32.trap \\{$r, $g\\}, " + "[$s, \\{$l, $x, $y, $y\\}];", + []>; +def SULD_2D_ARRAY_V2I64_TRAP + : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), + "suld.b.a2d.v2.b64.trap \\{$r, $g\\}, " + "[$s, \\{$l, $x, $y, $y\\}];", + []>; + +def SULD_3D_V2I8_TRAP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), + "suld.b.3d.v2.b8.trap \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];", + []>; +def SULD_3D_V2I16_TRAP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), + "suld.b.3d.v2.b16.trap \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];", + []>; +def SULD_3D_V2I32_TRAP + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), + "suld.b.3d.v2.b32.trap \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];", + []>; +def SULD_3D_V2I64_TRAP + : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), + "suld.b.3d.v2.b64.trap \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];", + []>; +} + +let 
IsSuld = 3 in { +def SULD_1D_V4I8_TRAP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (ins Int64Regs:$s, Int32Regs:$x), + "suld.b.1d.v4.b8.trap \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];", + []>; +def SULD_1D_V4I16_TRAP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (ins Int64Regs:$s, Int32Regs:$x), + "suld.b.1d.v4.b16.trap \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];", + []>; +def SULD_1D_V4I32_TRAP + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$s, Int32Regs:$x), + "suld.b.1d.v4.b32.trap \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];", + []>; + +def SULD_1D_ARRAY_V4I8_TRAP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), "suld.b.a1d.v4.b8.trap \\{$r, $g, $b, $a\\}, " "[$s, \\{$l, $x\\}];", []>; -def SULD_1D_ARRAY_V4I16_TRAP - : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), - "suld.b.a1d.v4.b16.trap \\{$r, $g, $b, $a\\}, " - "[$s, \\{$l, $x\\}];", +def SULD_1D_ARRAY_V4I16_TRAP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "suld.b.a1d.v4.b16.trap \\{$r, $g, $b, $a\\}, " + "[$s, \\{$l, $x\\}];", + []>; +def SULD_1D_ARRAY_V4I32_TRAP + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "suld.b.a1d.v4.b32.trap \\{$r, $g, $b, $a\\}, " + "[$s, \\{$l, $x\\}];", + []>; + +def SULD_2D_V4I8_TRAP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "suld.b.2d.v4.b8.trap \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];", + []>; +def SULD_2D_V4I16_TRAP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "suld.b.2d.v4.b16.trap \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];", + []>; +def SULD_2D_V4I32_TRAP + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "suld.b.2d.v4.b32.trap \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];", + []>; + +def SULD_2D_ARRAY_V4I8_TRAP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), + "suld.b.a2d.v4.b8.trap \\{$r, $g, $b, $a\\}, " + "[$s, \\{$l, $x, $y, $y\\}];", + []>; +def SULD_2D_ARRAY_V4I16_TRAP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), + "suld.b.a2d.v4.b16.trap \\{$r, $g, $b, $a\\}, " + "[$s, \\{$l, $x, $y, $y\\}];", + []>; +def SULD_2D_ARRAY_V4I32_TRAP + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), + "suld.b.a2d.v4.b32.trap \\{$r, $g, $b, $a\\}, " + "[$s, \\{$l, $x, $y, $y\\}];", + []>; + + +def SULD_3D_V4I8_TRAP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), + "suld.b.3d.v4.b8.trap \\{$r, $g, $b, $a\\}, " + "[$s, \\{$x, $y, $z, $z\\}];", + []>; +def SULD_3D_V4I16_TRAP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), + "suld.b.3d.v4.b16.trap \\{$r, $g, $b, $a\\}, " + "[$s, \\{$x, $y, $z, $z\\}];", + 
[]>; +def SULD_3D_V4I32_TRAP + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), + "suld.b.3d.v4.b32.trap \\{$r, $g, $b, $a\\}, " + "[$s, \\{$x, $y, $z, $z\\}];", + []>; +} + +// .zero variant +let IsSuld = 1 in { +def SULD_1D_I8_ZERO + : NVPTXInst<(outs Int16Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x), + "suld.b.1d.b8.zero \\{$r\\}, [$s, \\{$x\\}];", + []>; +def SULD_1D_I16_ZERO + : NVPTXInst<(outs Int16Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x), + "suld.b.1d.b16.zero \\{$r\\}, [$s, \\{$x\\}];", + []>; +def SULD_1D_I32_ZERO + : NVPTXInst<(outs Int32Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x), + "suld.b.1d.b32.zero \\{$r\\}, [$s, \\{$x\\}];", + []>; +def SULD_1D_I64_ZERO + : NVPTXInst<(outs Int64Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x), + "suld.b.1d.b64.zero \\{$r\\}, [$s, \\{$x\\}];", + []>; + +def SULD_1D_ARRAY_I8_ZERO + : NVPTXInst<(outs Int16Regs:$r), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "suld.b.a1d.b8.zero \\{$r\\}, [$s, \\{$l, $x\\}];", + []>; +def SULD_1D_ARRAY_I16_ZERO + : NVPTXInst<(outs Int16Regs:$r), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "suld.b.a1d.b16.zero \\{$r\\}, [$s, \\{$l, $x\\}];", + []>; +def SULD_1D_ARRAY_I32_ZERO + : NVPTXInst<(outs Int32Regs:$r), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "suld.b.a1d.b32.zero \\{$r\\}, [$s, \\{$l, $x\\}];", + []>; +def SULD_1D_ARRAY_I64_ZERO + : NVPTXInst<(outs Int64Regs:$r), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "suld.b.a1d.b64.zero \\{$r\\}, [$s, \\{$l, $x\\}];", + []>; + +def SULD_2D_I8_ZERO + : NVPTXInst<(outs Int16Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "suld.b.2d.b8.zero \\{$r\\}, [$s, \\{$x, $y\\}];", + []>; +def SULD_2D_I16_ZERO + : NVPTXInst<(outs Int16Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "suld.b.2d.b16.zero \\{$r\\}, [$s, \\{$x, $y\\}];", + []>; +def SULD_2D_I32_ZERO + : NVPTXInst<(outs Int32Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "suld.b.2d.b32.zero \\{$r\\}, [$s, \\{$x, $y\\}];", + []>; +def SULD_2D_I64_ZERO + : NVPTXInst<(outs Int64Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "suld.b.2d.b64.zero \\{$r\\}, [$s, \\{$x, $y\\}];", + []>; + +def SULD_2D_ARRAY_I8_ZERO + : NVPTXInst<(outs Int16Regs:$r), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), + "suld.b.a2d.b8.zero \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];", + []>; +def SULD_2D_ARRAY_I16_ZERO + : NVPTXInst<(outs Int16Regs:$r), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), + "suld.b.a2d.b16.zero \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];", + []>; +def SULD_2D_ARRAY_I32_ZERO + : NVPTXInst<(outs Int32Regs:$r), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), + "suld.b.a2d.b32.zero \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];", + []>; +def SULD_2D_ARRAY_I64_ZERO + : NVPTXInst<(outs Int64Regs:$r), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), + "suld.b.a2d.b64.zero \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];", + []>; + +def SULD_3D_I8_ZERO + : NVPTXInst<(outs Int16Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), + "suld.b.3d.b8.zero \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];", + []>; +def SULD_3D_I16_ZERO + : NVPTXInst<(outs Int16Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), + "suld.b.3d.b16.zero \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];", + []>; +def SULD_3D_I32_ZERO + : NVPTXInst<(outs Int32Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, 
Int32Regs:$z), + "suld.b.3d.b32.zero \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];", + []>; +def SULD_3D_I64_ZERO + : NVPTXInst<(outs Int64Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), + "suld.b.3d.b64.zero \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];", + []>; +} + +let IsSuld = 2 in { +def SULD_1D_V2I8_ZERO + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x), + "suld.b.1d.v2.b8.zero \\{$r, $g\\}, [$s, \\{$x\\}];", + []>; +def SULD_1D_V2I16_ZERO + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x), + "suld.b.1d.v2.b16.zero \\{$r, $g\\}, [$s, \\{$x\\}];", + []>; +def SULD_1D_V2I32_ZERO + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x), + "suld.b.1d.v2.b32.zero \\{$r, $g\\}, [$s, \\{$x\\}];", + []>; +def SULD_1D_V2I64_ZERO + : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x), + "suld.b.1d.v2.b64.zero \\{$r, $g\\}, [$s, \\{$x\\}];", + []>; + +def SULD_1D_ARRAY_V2I8_ZERO + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "suld.b.a1d.v2.b8.zero \\{$r, $g\\}, [$s, \\{$l, $x\\}];", + []>; +def SULD_1D_ARRAY_V2I16_ZERO + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "suld.b.a1d.v2.b16.zero \\{$r, $g\\}, [$s, \\{$l, $x\\}];", + []>; +def SULD_1D_ARRAY_V2I32_ZERO + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "suld.b.a1d.v2.b32.zero \\{$r, $g\\}, [$s, \\{$l, $x\\}];", + []>; +def SULD_1D_ARRAY_V2I64_ZERO + : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "suld.b.a1d.v2.b64.zero \\{$r, $g\\}, [$s, \\{$l, $x\\}];", + []>; + +def SULD_2D_V2I8_ZERO + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "suld.b.2d.v2.b8.zero \\{$r, $g\\}, [$s, \\{$x, $y\\}];", + []>; +def SULD_2D_V2I16_ZERO + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "suld.b.2d.v2.b16.zero \\{$r, $g\\}, [$s, \\{$x, $y\\}];", + []>; +def SULD_2D_V2I32_ZERO + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "suld.b.2d.v2.b32.zero \\{$r, $g\\}, [$s, \\{$x, $y\\}];", + []>; +def SULD_2D_V2I64_ZERO + : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "suld.b.2d.v2.b64.zero \\{$r, $g\\}, [$s, \\{$x, $y\\}];", + []>; + +def SULD_2D_ARRAY_V2I8_ZERO + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), + "suld.b.a2d.v2.b8.zero \\{$r, $g\\}, " + "[$s, \\{$l, $x, $y, $y\\}];", + []>; +def SULD_2D_ARRAY_V2I16_ZERO + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), + "suld.b.a2d.v2.b16.zero \\{$r, $g\\}, " + "[$s, \\{$l, $x, $y, $y\\}];", + []>; +def SULD_2D_ARRAY_V2I32_ZERO + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), + "suld.b.a2d.v2.b32.zero \\{$r, $g\\}, " + "[$s, \\{$l, $x, $y, $y\\}];", + []>; +def SULD_2D_ARRAY_V2I64_ZERO + : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), + "suld.b.a2d.v2.b64.zero \\{$r, $g\\}, " + "[$s, \\{$l, $x, $y, $y\\}];", + []>; + +def SULD_3D_V2I8_ZERO + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x, 
Int32Regs:$y, Int32Regs:$z), + "suld.b.3d.v2.b8.zero \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];", + []>; +def SULD_3D_V2I16_ZERO + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), + "suld.b.3d.v2.b16.zero \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];", + []>; +def SULD_3D_V2I32_ZERO + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), + "suld.b.3d.v2.b32.zero \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];", + []>; +def SULD_3D_V2I64_ZERO + : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), + "suld.b.3d.v2.b64.zero \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];", + []>; +} + +let IsSuld = 3 in { +def SULD_1D_V4I8_ZERO + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (ins Int64Regs:$s, Int32Regs:$x), + "suld.b.1d.v4.b8.zero \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];", + []>; +def SULD_1D_V4I16_ZERO + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (ins Int64Regs:$s, Int32Regs:$x), + "suld.b.1d.v4.b16.zero \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];", + []>; +def SULD_1D_V4I32_ZERO + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$s, Int32Regs:$x), + "suld.b.1d.v4.b32.zero \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];", + []>; + +def SULD_1D_ARRAY_V4I8_ZERO + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "suld.b.a1d.v4.b8.zero \\{$r, $g, $b, $a\\}, " + "[$s, \\{$l, $x\\}];", + []>; +def SULD_1D_ARRAY_V4I16_ZERO + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "suld.b.a1d.v4.b16.zero \\{$r, $g, $b, $a\\}, " + "[$s, \\{$l, $x\\}];", + []>; +def SULD_1D_ARRAY_V4I32_ZERO + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "suld.b.a1d.v4.b32.zero \\{$r, $g, $b, $a\\}, " + "[$s, \\{$l, $x\\}];", + []>; + +def SULD_2D_V4I8_ZERO + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "suld.b.2d.v4.b8.zero \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];", + []>; +def SULD_2D_V4I16_ZERO + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "suld.b.2d.v4.b16.zero \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];", + []>; +def SULD_2D_V4I32_ZERO + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "suld.b.2d.v4.b32.zero \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];", + []>; + +def SULD_2D_ARRAY_V4I8_ZERO + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), + "suld.b.a2d.v4.b8.zero \\{$r, $g, $b, $a\\}, " + "[$s, \\{$l, $x, $y, $y\\}];", + []>; +def SULD_2D_ARRAY_V4I16_ZERO + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), + "suld.b.a2d.v4.b16.zero \\{$r, $g, $b, $a\\}, " + "[$s, \\{$l, $x, $y, $y\\}];", + []>; +def SULD_2D_ARRAY_V4I32_ZERO + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), + "suld.b.a2d.v4.b32.zero \\{$r, $g, $b, $a\\}, " + "[$s, \\{$l, $x, 
$y, $y\\}];", + []>; + + +def SULD_3D_V4I8_ZERO + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), + "suld.b.3d.v4.b8.zero \\{$r, $g, $b, $a\\}, " + "[$s, \\{$x, $y, $z, $z\\}];", + []>; +def SULD_3D_V4I16_ZERO + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), + "suld.b.3d.v4.b16.zero \\{$r, $g, $b, $a\\}, " + "[$s, \\{$x, $y, $z, $z\\}];", + []>; +def SULD_3D_V4I32_ZERO + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), + "suld.b.3d.v4.b32.zero \\{$r, $g, $b, $a\\}, " + "[$s, \\{$x, $y, $z, $z\\}];", + []>; +} + +//----------------------------------- +// Texture Query Intrinsics +//----------------------------------- + +let IsSurfTexQuery = 1 in { +def TXQ_CHANNEL_ORDER + : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), + "txq.channel_order.b32 \t$d, [$a];", + []>; +def TXQ_CHANNEL_DATA_TYPE + : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), + "txq.channel_data_type.b32 \t$d, [$a];", + []>; +def TXQ_WIDTH + : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), + "txq.width.b32 \t$d, [$a];", + []>; +def TXQ_HEIGHT + : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), + "txq.height.b32 \t$d, [$a];", + []>; +def TXQ_DEPTH + : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), + "txq.depth.b32 \t$d, [$a];", + []>; +def TXQ_ARRAY_SIZE + : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), + "txq.array_size.b32 \t$d, [$a];", + []>; +def TXQ_NUM_SAMPLES + : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), + "txq.num_samples.b32 \t$d, [$a];", + []>; +def TXQ_NUM_MIPMAP_LEVELS + : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), + "txq.num_mipmap_levels.b32 \t$d, [$a];", + []>; +} + +def : Pat<(int_nvvm_txq_channel_order Int64Regs:$a), + (TXQ_CHANNEL_ORDER Int64Regs:$a)>; +def : Pat<(int_nvvm_txq_channel_data_type Int64Regs:$a), + (TXQ_CHANNEL_DATA_TYPE Int64Regs:$a)>; +def : Pat<(int_nvvm_txq_width Int64Regs:$a), + (TXQ_WIDTH Int64Regs:$a)>; +def : Pat<(int_nvvm_txq_height Int64Regs:$a), + (TXQ_HEIGHT Int64Regs:$a)>; +def : Pat<(int_nvvm_txq_depth Int64Regs:$a), + (TXQ_DEPTH Int64Regs:$a)>; +def : Pat<(int_nvvm_txq_array_size Int64Regs:$a), + (TXQ_ARRAY_SIZE Int64Regs:$a)>; +def : Pat<(int_nvvm_txq_num_samples Int64Regs:$a), + (TXQ_NUM_SAMPLES Int64Regs:$a)>; +def : Pat<(int_nvvm_txq_num_mipmap_levels Int64Regs:$a), + (TXQ_NUM_MIPMAP_LEVELS Int64Regs:$a)>; + + +//----------------------------------- +// Surface Query Intrinsics +//----------------------------------- + +let IsSurfTexQuery = 1 in { +def SUQ_CHANNEL_ORDER + : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), + "suq.channel_order.b32 \t$d, [$a];", + []>; +def SUQ_CHANNEL_DATA_TYPE + : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), + "suq.channel_data_type.b32 \t$d, [$a];", + []>; +def SUQ_WIDTH + : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), + "suq.width.b32 \t$d, [$a];", + []>; +def SUQ_HEIGHT + : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), + "suq.height.b32 \t$d, [$a];", + []>; +def SUQ_DEPTH + : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), + "suq.depth.b32 \t$d, [$a];", + []>; +def SUQ_ARRAY_SIZE + : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), + "suq.array_size.b32 \t$d, [$a];", + []>; +} + +def : Pat<(int_nvvm_suq_channel_order Int64Regs:$a), + (SUQ_CHANNEL_ORDER Int64Regs:$a)>; +def : Pat<(int_nvvm_suq_channel_data_type 
Int64Regs:$a), + (SUQ_CHANNEL_DATA_TYPE Int64Regs:$a)>; +def : Pat<(int_nvvm_suq_width Int64Regs:$a), + (SUQ_WIDTH Int64Regs:$a)>; +def : Pat<(int_nvvm_suq_height Int64Regs:$a), + (SUQ_HEIGHT Int64Regs:$a)>; +def : Pat<(int_nvvm_suq_depth Int64Regs:$a), + (SUQ_DEPTH Int64Regs:$a)>; +def : Pat<(int_nvvm_suq_array_size Int64Regs:$a), + (SUQ_ARRAY_SIZE Int64Regs:$a)>; + + +//===- Handle Query -------------------------------------------------------===// + +// TODO: These intrinsics are not yet finalized, pending PTX ISA design work +def ISTYPEP_SAMPLER + : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a), + "istypep.samplerref \t$d, $a;", + [(set Int1Regs:$d, (int_nvvm_istypep_sampler Int64Regs:$a))]>; +def ISTYPEP_SURFACE + : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a), + "istypep.surfref \t$d, $a;", + [(set Int1Regs:$d, (int_nvvm_istypep_surface Int64Regs:$a))]>; +def ISTYPEP_TEXTURE + : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a), + "istypep.texref \t$d, $a;", + [(set Int1Regs:$d, (int_nvvm_istypep_texture Int64Regs:$a))]>; + +//===- Surface Stores -----------------------------------------------------===// + +let IsSust = 1 in { +// Unformatted +// .clamp variant +def SUST_B_1D_B8_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r), + "sust.b.1d.b8.clamp \t[$s, \\{$x\\}], \\{$r\\};", + []>; +def SUST_B_1D_B16_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r), + "sust.b.1d.b16.clamp \t[$s, \\{$x\\}], \\{$r\\};", + []>; +def SUST_B_1D_B32_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r), + "sust.b.1d.b32.clamp \t[$s, \\{$x\\}], \\{$r\\};", + []>; +def SUST_B_1D_B64_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int64Regs:$r), + "sust.b.1d.b64.clamp \t[$s, \\{$x\\}], \\{$r\\};", + []>; +def SUST_B_1D_V2B8_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), + "sust.b.1d.v2.b8.clamp \t[$s, \\{$x\\}], \\{$r, $g\\};", + []>; +def SUST_B_1D_V2B16_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), + "sust.b.1d.v2.b16.clamp \t[$s, \\{$x\\}], \\{$r, $g\\};", + []>; +def SUST_B_1D_V2B32_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g), + "sust.b.1d.v2.b32.clamp \t[$s, \\{$x\\}], \\{$r, $g\\};", + []>; +def SUST_B_1D_V2B64_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g), + "sust.b.1d.v2.b64.clamp \t[$s, \\{$x\\}], \\{$r, $g\\};", + []>; +def SUST_B_1D_V4B8_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g, + Int16Regs:$b, Int16Regs:$a), + "sust.b.1d.v4.b8.clamp \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};", + []>; +def SUST_B_1D_V4B16_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g, + Int16Regs:$b, Int16Regs:$a), + "sust.b.1d.v4.b16.clamp \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};", + []>; +def SUST_B_1D_V4B32_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + "sust.b.1d.v4.b32.clamp \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};", + []>; + + +def SUST_B_1D_ARRAY_B8_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r), + "sust.b.a1d.b8.clamp \t[$s, \\{$idx, $x\\}], \\{$r\\};", + []>; +def SUST_B_1D_ARRAY_B16_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r), + "sust.b.a1d.b16.clamp \t[$s, \\{$idx, $x\\}], \\{$r\\};", 
+ []>; +def SUST_B_1D_ARRAY_B32_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r), + "sust.b.a1d.b32.clamp \t[$s, \\{$idx, $x\\}], \\{$r\\};", + []>; +def SUST_B_1D_ARRAY_B64_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int64Regs:$r), + "sust.b.a1d.b64.clamp \t[$s, \\{$idx, $x\\}], \\{$r\\};", + []>; +def SUST_B_1D_ARRAY_V2B8_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r, + Int16Regs:$g), + "sust.b.a1d.v2.b8.clamp \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};", + []>; +def SUST_B_1D_ARRAY_V2B16_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r, + Int16Regs:$g), + "sust.b.a1d.v2.b16.clamp \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};", + []>; +def SUST_B_1D_ARRAY_V2B32_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r, + Int32Regs:$g), + "sust.b.a1d.v2.b32.clamp \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};", + []>; +def SUST_B_1D_ARRAY_V2B64_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int64Regs:$r, + Int64Regs:$g), + "sust.b.a1d.v2.b64.clamp \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};", + []>; +def SUST_B_1D_ARRAY_V4B8_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r, + Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + "sust.b.a1d.v4.b8.clamp \t[$s, \\{$idx, $x\\}], " + "\\{$r, $g, $b, $a\\};", + []>; +def SUST_B_1D_ARRAY_V4B16_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r, + Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + "sust.b.a1d.v4.b16.clamp \t[$s, \\{$idx, $x\\}], " + "\\{$r, $g, $b, $a\\};", + []>; +def SUST_B_1D_ARRAY_V4B32_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r, + Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + "sust.b.a1d.v4.b32.clamp \t[$s, \\{$idx, $x\\}], " + "\\{$r, $g, $b, $a\\};", + []>; + + +def SUST_B_2D_B8_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), + "sust.b.2d.b8.clamp \t[$s, \\{$x, $y\\}], \\{$r\\};", + []>; +def SUST_B_2D_B16_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), + "sust.b.2d.b16.clamp \t[$s, \\{$x, $y\\}], \\{$r\\};", + []>; +def SUST_B_2D_B32_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r), + "sust.b.2d.b32.clamp \t[$s, \\{$x, $y\\}], \\{$r\\};", + []>; +def SUST_B_2D_B64_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r), + "sust.b.2d.b64.clamp \t[$s, \\{$x, $y\\}], \\{$r\\};", + []>; +def SUST_B_2D_V2B8_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, + Int16Regs:$g), + "sust.b.2d.v2.b8.clamp \t[$s, \\{$x, $y\\}], \\{$r, $g\\};", + []>; +def SUST_B_2D_V2B16_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, + Int16Regs:$g), + "sust.b.2d.v2.b16.clamp \t[$s, \\{$x, $y\\}], \\{$r, $g\\};", + []>; +def SUST_B_2D_V2B32_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, + Int32Regs:$g), + "sust.b.2d.v2.b32.clamp \t[$s, \\{$x, $y\\}], \\{$r, $g\\};", + []>; +def SUST_B_2D_V2B64_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, + Int64Regs:$g), + "sust.b.2d.v2.b64.clamp \t[$s, \\{$x, $y\\}], \\{$r, $g\\};", + []>; +def SUST_B_2D_V4B8_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, 
Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, + Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + "sust.b.2d.v4.b8.clamp \t[$s, \\{$x, $y\\}], " + "\\{$r, $g, $b, $a\\};", + []>; +def SUST_B_2D_V4B16_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, + Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + "sust.b.2d.v4.b16.clamp \t[$s, \\{$x, $y\\}], " + "\\{$r, $g, $b, $a\\};", + []>; +def SUST_B_2D_V4B32_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, + Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + "sust.b.2d.v4.b32.clamp \t[$s, \\{$x, $y\\}], " + "\\{$r, $g, $b, $a\\};", + []>; + + +def SUST_B_2D_ARRAY_B8_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r), + "sust.b.a2d.b8.clamp \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};", + []>; +def SUST_B_2D_ARRAY_B16_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r), + "sust.b.a2d.b16.clamp \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};", + []>; +def SUST_B_2D_ARRAY_B32_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$r), + "sust.b.a2d.b32.clamp \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};", + []>; +def SUST_B_2D_ARRAY_B64_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int64Regs:$r), + "sust.b.a2d.b64.clamp \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};", + []>; +def SUST_B_2D_ARRAY_V2B8_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g), + "sust.b.a2d.v2.b8.clamp \t[$s, \\{$idx, $x, $y, $y\\}], " + "\\{$r, $g\\};", + []>; +def SUST_B_2D_ARRAY_V2B16_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g), + "sust.b.a2d.v2.b16.clamp \t[$s, \\{$idx, $x, $y, $y\\}], " + "\\{$r, $g\\};", + []>; +def SUST_B_2D_ARRAY_V2B32_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$r, Int32Regs:$g), + "sust.b.a2d.v2.b32.clamp \t[$s, \\{$idx, $x, $y, $y\\}], " + "\\{$r, $g\\};", + []>; +def SUST_B_2D_ARRAY_V2B64_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int64Regs:$r, Int64Regs:$g), + "sust.b.a2d.v2.b64.clamp \t[$s, \\{$idx, $x, $y, $y\\}], " + "\\{$r, $g\\};", + []>; +def SUST_B_2D_ARRAY_V4B8_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + "sust.b.a2d.v4.b8.clamp \t[$s, \\{$idx, $x, $y, $y\\}], " + "\\{$r, $g, $b, $a\\};", + []>; +def SUST_B_2D_ARRAY_V4B16_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + "sust.b.a2d.v4.b16.clamp \t[$s, \\{$idx, $x, $y, $y\\}], " + "\\{$r, $g, $b, $a\\};", + []>; +def SUST_B_2D_ARRAY_V4B32_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + "sust.b.a2d.v4.b32.clamp \t[$s, \\{$idx, $x, $y, $y\\}], " + "\\{$r, $g, $b, $a\\};", + []>; + + +def SUST_B_3D_B8_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r), + "sust.b.3d.b8.clamp \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};", + []>; +def SUST_B_3D_B16_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, 
Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r), + "sust.b.3d.b16.clamp \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};", + []>; +def SUST_B_3D_B32_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int32Regs:$r), + "sust.b.3d.b32.clamp \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};", + []>; +def SUST_B_3D_B64_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int64Regs:$r), + "sust.b.3d.b64.clamp \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};", + []>; +def SUST_B_3D_V2B8_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g), + "sust.b.3d.v2.b8.clamp \t[$s, \\{$x, $y, $z, $z\\}], " + "\\{$r, $g\\};", + []>; +def SUST_B_3D_V2B16_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g), + "sust.b.3d.v2.b16.clamp \t[$s, \\{$x, $y, $z, $z\\}], " + "\\{$r, $g\\};", + []>; +def SUST_B_3D_V2B32_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int32Regs:$r, Int32Regs:$g), + "sust.b.3d.v2.b32.clamp \t[$s, \\{$x, $y, $z, $z\\}], " + "\\{$r, $g\\};", + []>; +def SUST_B_3D_V2B64_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int64Regs:$r, Int64Regs:$g), + "sust.b.3d.v2.b64.clamp \t[$s, \\{$x, $y, $z, $z\\}], " + "\\{$r, $g\\};", + []>; +def SUST_B_3D_V4B8_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + "sust.b.3d.v4.b8.clamp \t[$s, \\{$x, $y, $z, $z\\}], " + "\\{$r, $g, $b, $a\\};", + []>; +def SUST_B_3D_V4B16_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + "sust.b.3d.v4.b16.clamp \t[$s, \\{$x, $y, $z, $z\\}], " + "\\{$r, $g, $b, $a\\};", + []>; +def SUST_B_3D_V4B32_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + "sust.b.3d.v4.b32.clamp \t[$s, \\{$x, $y, $z, $z\\}], " + "\\{$r, $g, $b, $a\\};", + []>; + + +// .trap variant +def SUST_B_1D_B8_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r), + "sust.b.1d.b8.trap \t[$s, \\{$x\\}], \\{$r\\};", + []>; +def SUST_B_1D_B16_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r), + "sust.b.1d.b16.trap \t[$s, \\{$x\\}], \\{$r\\};", []>; -def SULD_1D_ARRAY_V4I32_TRAP - : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), - (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), - "suld.b.a1d.v4.b32.trap \\{$r, $g, $b, $a\\}, " - "[$s, \\{$l, $x\\}];", +def SUST_B_1D_B32_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r), + "sust.b.1d.b32.trap \t[$s, \\{$x\\}], \\{$r\\};", + []>; +def SUST_B_1D_B64_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int64Regs:$r), + "sust.b.1d.b64.trap \t[$s, \\{$x\\}], \\{$r\\};", + []>; +def SUST_B_1D_V2B8_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), + "sust.b.1d.v2.b8.trap \t[$s, \\{$x\\}], \\{$r, $g\\};", + []>; +def SUST_B_1D_V2B16_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), + "sust.b.1d.v2.b16.trap \t[$s, \\{$x\\}], \\{$r, $g\\};", + []>; +def SUST_B_1D_V2B32_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g), + 
"sust.b.1d.v2.b32.trap \t[$s, \\{$x\\}], \\{$r, $g\\};", + []>; +def SUST_B_1D_V2B64_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g), + "sust.b.1d.v2.b64.trap \t[$s, \\{$x\\}], \\{$r, $g\\};", + []>; +def SUST_B_1D_V4B8_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g, + Int16Regs:$b, Int16Regs:$a), + "sust.b.1d.v4.b8.trap \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};", + []>; +def SUST_B_1D_V4B16_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g, + Int16Regs:$b, Int16Regs:$a), + "sust.b.1d.v4.b16.trap \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};", + []>; +def SUST_B_1D_V4B32_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + "sust.b.1d.v4.b32.trap \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};", []>; -def SULD_2D_I8_TRAP - : NVPTXInst<(outs Int16Regs:$r), - (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), - "suld.b.2d.b8.trap \\{$r\\}, [$s, \\{$x, $y\\}];", + +def SUST_B_1D_ARRAY_B8_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r), + "sust.b.a1d.b8.trap \t[$s, \\{$idx, $x\\}], \\{$r\\};", []>; -def SULD_2D_I16_TRAP - : NVPTXInst<(outs Int16Regs:$r), - (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), - "suld.b.2d.b16.trap \\{$r\\}, [$s, \\{$x, $y\\}];", +def SUST_B_1D_ARRAY_B16_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r), + "sust.b.a1d.b16.trap \t[$s, \\{$idx, $x\\}], \\{$r\\};", []>; -def SULD_2D_I32_TRAP - : NVPTXInst<(outs Int32Regs:$r), - (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), - "suld.b.2d.b32.trap \\{$r\\}, [$s, \\{$x, $y\\}];", +def SUST_B_1D_ARRAY_B32_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r), + "sust.b.a1d.b32.trap \t[$s, \\{$idx, $x\\}], \\{$r\\};", []>; -def SULD_2D_V2I8_TRAP - : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), - (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), - "suld.b.2d.v2.b8.trap \\{$r, $g\\}, [$s, \\{$x, $y\\}];", +def SUST_B_1D_ARRAY_B64_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int64Regs:$r), + "sust.b.a1d.b64.trap \t[$s, \\{$idx, $x\\}], \\{$r\\};", []>; -def SULD_2D_V2I16_TRAP - : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), - (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), - "suld.b.2d.v2.b16.trap \\{$r, $g\\}, [$s, \\{$x, $y\\}];", +def SUST_B_1D_ARRAY_V2B8_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r, + Int16Regs:$g), + "sust.b.a1d.v2.b8.trap \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};", []>; -def SULD_2D_V2I32_TRAP - : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g), - (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), - "suld.b.2d.v2.b32.trap \\{$r, $g\\}, [$s, \\{$x, $y\\}];", +def SUST_B_1D_ARRAY_V2B16_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r, + Int16Regs:$g), + "sust.b.a1d.v2.b16.trap \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};", []>; -def SULD_2D_V4I8_TRAP - : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), - "suld.b.2d.v4.b8.trap \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];", +def SUST_B_1D_ARRAY_V2B32_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r, + Int32Regs:$g), + "sust.b.a1d.v2.b32.trap \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};", []>; -def SULD_2D_V4I16_TRAP - : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, 
Int16Regs:$b, Int16Regs:$a), - (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), - "suld.b.2d.v4.b16.trap \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];", +def SUST_B_1D_ARRAY_V2B64_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int64Regs:$r, + Int64Regs:$g), + "sust.b.a1d.v2.b64.trap \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};", []>; -def SULD_2D_V4I32_TRAP - : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), - (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), - "suld.b.2d.v4.b32.trap \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];", +def SUST_B_1D_ARRAY_V4B8_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r, + Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + "sust.b.a1d.v4.b8.trap \t[$s, \\{$idx, $x\\}], " + "\\{$r, $g, $b, $a\\};", + []>; +def SUST_B_1D_ARRAY_V4B16_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r, + Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + "sust.b.a1d.v4.b16.trap \t[$s, \\{$idx, $x\\}], " + "\\{$r, $g, $b, $a\\};", + []>; +def SUST_B_1D_ARRAY_V4B32_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r, + Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + "sust.b.a1d.v4.b32.trap \t[$s, \\{$idx, $x\\}], " + "\\{$r, $g, $b, $a\\};", []>; -def SULD_2D_ARRAY_I8_TRAP - : NVPTXInst<(outs Int16Regs:$r), - (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), - "suld.b.a2d.b8.trap \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];", + +def SUST_B_2D_B8_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), + "sust.b.2d.b8.trap \t[$s, \\{$x, $y\\}], \\{$r\\};", []>; -def SULD_2D_ARRAY_I16_TRAP - : NVPTXInst<(outs Int16Regs:$r), - (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), - "suld.b.a2d.b16.trap \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];", +def SUST_B_2D_B16_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), + "sust.b.2d.b16.trap \t[$s, \\{$x, $y\\}], \\{$r\\};", []>; -def SULD_2D_ARRAY_I32_TRAP - : NVPTXInst<(outs Int32Regs:$r), - (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), - "suld.b.a2d.b32.trap \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];", +def SUST_B_2D_B32_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r), + "sust.b.2d.b32.trap \t[$s, \\{$x, $y\\}], \\{$r\\};", []>; -def SULD_2D_ARRAY_V2I8_TRAP - : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), - (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), - "suld.b.a2d.v2.b8.trap \\{$r, $g\\}, " - "[$s, \\{$l, $x, $y, $y\\}];", +def SUST_B_2D_B64_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r), + "sust.b.2d.b64.trap \t[$s, \\{$x, $y\\}], \\{$r\\};", []>; -def SULD_2D_ARRAY_V2I16_TRAP - : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), - (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), - "suld.b.a2d.v2.b16.trap \\{$r, $g\\}, " - "[$s, \\{$l, $x, $y, $y\\}];", +def SUST_B_2D_V2B8_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, + Int16Regs:$g), + "sust.b.2d.v2.b8.trap \t[$s, \\{$x, $y\\}], \\{$r, $g\\};", []>; -def SULD_2D_ARRAY_V2I32_TRAP - : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g), - (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), - "suld.b.a2d.v2.b32.trap \\{$r, $g\\}, " - "[$s, \\{$l, $x, $y, $y\\}];", +def SUST_B_2D_V2B16_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, + Int16Regs:$g), + 
"sust.b.2d.v2.b16.trap \t[$s, \\{$x, $y\\}], \\{$r, $g\\};", []>; -def SULD_2D_ARRAY_V4I8_TRAP - : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), - "suld.b.a2d.v4.b8.trap \\{$r, $g, $b, $a\\}, " - "[$s, \\{$l, $x, $y, $y\\}];", +def SUST_B_2D_V2B32_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, + Int32Regs:$g), + "sust.b.2d.v2.b32.trap \t[$s, \\{$x, $y\\}], \\{$r, $g\\};", []>; -def SULD_2D_ARRAY_V4I16_TRAP - : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), - "suld.b.a2d.v4.b16.trap \\{$r, $g, $b, $a\\}, " - "[$s, \\{$l, $x, $y, $y\\}];", +def SUST_B_2D_V2B64_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, + Int64Regs:$g), + "sust.b.2d.v2.b64.trap \t[$s, \\{$x, $y\\}], \\{$r, $g\\};", []>; -def SULD_2D_ARRAY_V4I32_TRAP - : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), - (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), - "suld.b.a2d.v4.b32.trap \\{$r, $g, $b, $a\\}, " - "[$s, \\{$l, $x, $y, $y\\}];", +def SUST_B_2D_V4B8_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, + Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + "sust.b.2d.v4.b8.trap \t[$s, \\{$x, $y\\}], " + "\\{$r, $g, $b, $a\\};", + []>; +def SUST_B_2D_V4B16_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, + Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + "sust.b.2d.v4.b16.trap \t[$s, \\{$x, $y\\}], " + "\\{$r, $g, $b, $a\\};", + []>; +def SUST_B_2D_V4B32_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, + Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + "sust.b.2d.v4.b32.trap \t[$s, \\{$x, $y\\}], " + "\\{$r, $g, $b, $a\\};", []>; -def SULD_3D_I8_TRAP - : NVPTXInst<(outs Int16Regs:$r), - (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), - "suld.b.3d.b8.trap \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];", + +def SUST_B_2D_ARRAY_B8_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r), + "sust.b.a2d.b8.trap \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};", + []>; +def SUST_B_2D_ARRAY_B16_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r), + "sust.b.a2d.b16.trap \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};", + []>; +def SUST_B_2D_ARRAY_B32_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$r), + "sust.b.a2d.b32.trap \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};", []>; -def SULD_3D_I16_TRAP - : NVPTXInst<(outs Int16Regs:$r), - (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), - "suld.b.3d.b16.trap \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];", +def SUST_B_2D_ARRAY_B64_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int64Regs:$r), + "sust.b.a2d.b64.trap \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};", []>; -def SULD_3D_I32_TRAP - : NVPTXInst<(outs Int32Regs:$r), - (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), - "suld.b.3d.b32.trap \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];", +def SUST_B_2D_ARRAY_V2B8_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g), + "sust.b.a2d.v2.b8.trap \t[$s, \\{$idx, $x, $y, $y\\}], " + "\\{$r, $g\\};", []>; -def 
SULD_3D_V2I8_TRAP - : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), - (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), - "suld.b.3d.v2.b8.trap \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];", +def SUST_B_2D_ARRAY_V2B16_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g), + "sust.b.a2d.v2.b16.trap \t[$s, \\{$idx, $x, $y, $y\\}], " + "\\{$r, $g\\};", []>; -def SULD_3D_V2I16_TRAP - : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), - (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), - "suld.b.3d.v2.b16.trap \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];", +def SUST_B_2D_ARRAY_V2B32_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$r, Int32Regs:$g), + "sust.b.a2d.v2.b32.trap \t[$s, \\{$idx, $x, $y, $y\\}], " + "\\{$r, $g\\};", []>; -def SULD_3D_V2I32_TRAP - : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g), - (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), - "suld.b.3d.v2.b32.trap \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];", +def SUST_B_2D_ARRAY_V2B64_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int64Regs:$r, Int64Regs:$g), + "sust.b.a2d.v2.b64.trap \t[$s, \\{$idx, $x, $y, $y\\}], " + "\\{$r, $g\\};", []>; -def SULD_3D_V4I8_TRAP - : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), - "suld.b.3d.v4.b8.trap \\{$r, $g, $b, $a\\}, " - "[$s, \\{$x, $y, $z, $z\\}];", +def SUST_B_2D_ARRAY_V4B8_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + "sust.b.a2d.v4.b8.trap \t[$s, \\{$idx, $x, $y, $y\\}], " + "\\{$r, $g, $b, $a\\};", []>; -def SULD_3D_V4I16_TRAP - : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), - "suld.b.3d.v4.b16.trap \\{$r, $g, $b, $a\\}, " - "[$s, \\{$x, $y, $z, $z\\}];", +def SUST_B_2D_ARRAY_V4B16_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + "sust.b.a2d.v4.b16.trap \t[$s, \\{$idx, $x, $y, $y\\}], " + "\\{$r, $g, $b, $a\\};", []>; -def SULD_3D_V4I32_TRAP - : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), - (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), - "suld.b.3d.v4.b32.trap \\{$r, $g, $b, $a\\}, " - "[$s, \\{$x, $y, $z, $z\\}];", +def SUST_B_2D_ARRAY_V4B32_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + "sust.b.a2d.v4.b32.trap \t[$s, \\{$idx, $x, $y, $y\\}], " + "\\{$r, $g, $b, $a\\};", []>; -//----------------------------------- -// Texture Query Intrinsics -//----------------------------------- -def TXQ_CHANNEL_ORDER - : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), - "txq.channel_order.b32 \t$d, [$a];", - []>; -def TXQ_CHANNEL_DATA_TYPE - : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), - "txq.channel_data_type.b32 \t$d, [$a];", - []>; -def TXQ_WIDTH - : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), - "txq.width.b32 \t$d, [$a];", - []>; -def TXQ_HEIGHT - : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), - "txq.height.b32 \t$d, [$a];", +def SUST_B_3D_B8_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + 
Int16Regs:$r), + "sust.b.3d.b8.trap \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};", []>; -def TXQ_DEPTH - : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), - "txq.depth.b32 \t$d, [$a];", +def SUST_B_3D_B16_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r), + "sust.b.3d.b16.trap \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};", []>; -def TXQ_ARRAY_SIZE - : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), - "txq.array_size.b32 \t$d, [$a];", +def SUST_B_3D_B32_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int32Regs:$r), + "sust.b.3d.b32.trap \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};", []>; -def TXQ_NUM_SAMPLES - : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), - "txq.num_samples.b32 \t$d, [$a];", +def SUST_B_3D_B64_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int64Regs:$r), + "sust.b.3d.b64.trap \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};", []>; -def TXQ_NUM_MIPMAP_LEVELS - : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), - "txq.num_mipmap_levels.b32 \t$d, [$a];", +def SUST_B_3D_V2B8_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g), + "sust.b.3d.v2.b8.trap \t[$s, \\{$x, $y, $z, $z\\}], " + "\\{$r, $g\\};", []>; - -def : Pat<(int_nvvm_txq_channel_order Int64Regs:$a), - (TXQ_CHANNEL_ORDER Int64Regs:$a)>; -def : Pat<(int_nvvm_txq_channel_data_type Int64Regs:$a), - (TXQ_CHANNEL_DATA_TYPE Int64Regs:$a)>; -def : Pat<(int_nvvm_txq_width Int64Regs:$a), - (TXQ_WIDTH Int64Regs:$a)>; -def : Pat<(int_nvvm_txq_height Int64Regs:$a), - (TXQ_HEIGHT Int64Regs:$a)>; -def : Pat<(int_nvvm_txq_depth Int64Regs:$a), - (TXQ_DEPTH Int64Regs:$a)>; -def : Pat<(int_nvvm_txq_array_size Int64Regs:$a), - (TXQ_ARRAY_SIZE Int64Regs:$a)>; -def : Pat<(int_nvvm_txq_num_samples Int64Regs:$a), - (TXQ_NUM_SAMPLES Int64Regs:$a)>; -def : Pat<(int_nvvm_txq_num_mipmap_levels Int64Regs:$a), - (TXQ_NUM_MIPMAP_LEVELS Int64Regs:$a)>; - - -//----------------------------------- -// Surface Query Intrinsics -//----------------------------------- -def SUQ_CHANNEL_ORDER - : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), - "suq.channel_order.b32 \t$d, [$a];", +def SUST_B_3D_V2B16_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g), + "sust.b.3d.v2.b16.trap \t[$s, \\{$x, $y, $z, $z\\}], " + "\\{$r, $g\\};", []>; -def SUQ_CHANNEL_DATA_TYPE - : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), - "suq.channel_data_type.b32 \t$d, [$a];", +def SUST_B_3D_V2B32_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int32Regs:$r, Int32Regs:$g), + "sust.b.3d.v2.b32.trap \t[$s, \\{$x, $y, $z, $z\\}], " + "\\{$r, $g\\};", []>; -def SUQ_WIDTH - : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), - "suq.width.b32 \t$d, [$a];", +def SUST_B_3D_V2B64_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int64Regs:$r, Int64Regs:$g), + "sust.b.3d.v2.b64.trap \t[$s, \\{$x, $y, $z, $z\\}], " + "\\{$r, $g\\};", []>; -def SUQ_HEIGHT - : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), - "suq.height.b32 \t$d, [$a];", +def SUST_B_3D_V4B8_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + "sust.b.3d.v4.b8.trap \t[$s, \\{$x, $y, $z, $z\\}], " + "\\{$r, $g, $b, $a\\};", []>; -def SUQ_DEPTH - : NVPTXInst<(outs 
Int32Regs:$d), (ins Int64Regs:$a), - "suq.depth.b32 \t$d, [$a];", +def SUST_B_3D_V4B16_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + "sust.b.3d.v4.b16.trap \t[$s, \\{$x, $y, $z, $z\\}], " + "\\{$r, $g, $b, $a\\};", []>; -def SUQ_ARRAY_SIZE - : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), - "suq.array_size.b32 \t$d, [$a];", +def SUST_B_3D_V4B32_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + "sust.b.3d.v4.b32.trap \t[$s, \\{$x, $y, $z, $z\\}], " + "\\{$r, $g, $b, $a\\};", []>; -def : Pat<(int_nvvm_suq_channel_order Int64Regs:$a), - (SUQ_CHANNEL_ORDER Int64Regs:$a)>; -def : Pat<(int_nvvm_suq_channel_data_type Int64Regs:$a), - (SUQ_CHANNEL_DATA_TYPE Int64Regs:$a)>; -def : Pat<(int_nvvm_suq_width Int64Regs:$a), - (SUQ_WIDTH Int64Regs:$a)>; -def : Pat<(int_nvvm_suq_height Int64Regs:$a), - (SUQ_HEIGHT Int64Regs:$a)>; -def : Pat<(int_nvvm_suq_depth Int64Regs:$a), - (SUQ_DEPTH Int64Regs:$a)>; -def : Pat<(int_nvvm_suq_array_size Int64Regs:$a), - (SUQ_ARRAY_SIZE Int64Regs:$a)>; - - -//===- Handle Query -------------------------------------------------------===// - -// TODO: These intrinsics are not yet finalized, pending PTX ISA design work -def ISTYPEP_SAMPLER - : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a), - "istypep.samplerref \t$d, $a;", - [(set Int1Regs:$d, (int_nvvm_istypep_sampler Int64Regs:$a))]>; -def ISTYPEP_SURFACE - : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a), - "istypep.surfref \t$d, $a;", - [(set Int1Regs:$d, (int_nvvm_istypep_surface Int64Regs:$a))]>; -def ISTYPEP_TEXTURE - : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a), - "istypep.texref \t$d, $a;", - [(set Int1Regs:$d, (int_nvvm_istypep_texture Int64Regs:$a))]>; - -//===- Surface Stores -----------------------------------------------------===// - -// Unformatted -def SUST_B_1D_B8_TRAP +// .zero variant +def SUST_B_1D_B8_ZERO : NVPTXInst<(outs), (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r), - "sust.b.1d.b8.trap \t[$s, \\{$x\\}], \\{$r\\};", + "sust.b.1d.b8.zero \t[$s, \\{$x\\}], \\{$r\\};", []>; -def SUST_B_1D_B16_TRAP +def SUST_B_1D_B16_ZERO : NVPTXInst<(outs), (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r), - "sust.b.1d.b16.trap \t[$s, \\{$x\\}], \\{$r\\};", + "sust.b.1d.b16.zero \t[$s, \\{$x\\}], \\{$r\\};", []>; -def SUST_B_1D_B32_TRAP +def SUST_B_1D_B32_ZERO : NVPTXInst<(outs), (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r), - "sust.b.1d.b32.trap \t[$s, \\{$x\\}], \\{$r\\};", + "sust.b.1d.b32.zero \t[$s, \\{$x\\}], \\{$r\\};", []>; -def SUST_B_1D_V2B8_TRAP +def SUST_B_1D_B64_ZERO + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int64Regs:$r), + "sust.b.1d.b64.zero \t[$s, \\{$x\\}], \\{$r\\};", + []>; +def SUST_B_1D_V2B8_ZERO : NVPTXInst<(outs), (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), - "sust.b.1d.v2.b8.trap \t[$s, \\{$x\\}], \\{$r, $g\\};", + "sust.b.1d.v2.b8.zero \t[$s, \\{$x\\}], \\{$r, $g\\};", []>; -def SUST_B_1D_V2B16_TRAP +def SUST_B_1D_V2B16_ZERO : NVPTXInst<(outs), (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), - "sust.b.1d.v2.b16.trap \t[$s, \\{$x\\}], \\{$r, $g\\};", + "sust.b.1d.v2.b16.zero \t[$s, \\{$x\\}], \\{$r, $g\\};", []>; -def SUST_B_1D_V2B32_TRAP +def SUST_B_1D_V2B32_ZERO : NVPTXInst<(outs), (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g), - "sust.b.1d.v2.b32.trap \t[$s, \\{$x\\}], \\{$r, $g\\};", + "sust.b.1d.v2.b32.zero 
\t[$s, \\{$x\\}], \\{$r, $g\\};", []>; -def SUST_B_1D_V4B8_TRAP +def SUST_B_1D_V2B64_ZERO + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g), + "sust.b.1d.v2.b64.zero \t[$s, \\{$x\\}], \\{$r, $g\\};", + []>; +def SUST_B_1D_V4B8_ZERO : NVPTXInst<(outs), (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - "sust.b.1d.v4.b8.trap \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};", + "sust.b.1d.v4.b8.zero \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};", []>; -def SUST_B_1D_V4B16_TRAP +def SUST_B_1D_V4B16_ZERO : NVPTXInst<(outs), (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - "sust.b.1d.v4.b16.trap \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};", + "sust.b.1d.v4.b16.zero \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};", []>; -def SUST_B_1D_V4B32_TRAP +def SUST_B_1D_V4B32_ZERO : NVPTXInst<(outs), (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), - "sust.b.1d.v4.b32.trap \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};", + "sust.b.1d.v4.b32.zero \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};", []>; -def SUST_B_1D_ARRAY_B8_TRAP +def SUST_B_1D_ARRAY_B8_ZERO : NVPTXInst<(outs), (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r), - "sust.b.a1d.b8.trap \t[$s, \\{$idx, $x\\}], \\{$r\\};", + "sust.b.a1d.b8.zero \t[$s, \\{$idx, $x\\}], \\{$r\\};", []>; -def SUST_B_1D_ARRAY_B16_TRAP +def SUST_B_1D_ARRAY_B16_ZERO : NVPTXInst<(outs), (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r), - "sust.b.a1d.b16.trap \t[$s, \\{$idx, $x\\}], \\{$r\\};", + "sust.b.a1d.b16.zero \t[$s, \\{$idx, $x\\}], \\{$r\\};", []>; -def SUST_B_1D_ARRAY_B32_TRAP +def SUST_B_1D_ARRAY_B32_ZERO : NVPTXInst<(outs), (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r), - "sust.b.a1d.b32.trap \t[$s, \\{$idx, $x\\}], \\{$r\\};", + "sust.b.a1d.b32.zero \t[$s, \\{$idx, $x\\}], \\{$r\\};", []>; -def SUST_B_1D_ARRAY_V2B8_TRAP +def SUST_B_1D_ARRAY_B64_ZERO + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int64Regs:$r), + "sust.b.a1d.b64.zero \t[$s, \\{$idx, $x\\}], \\{$r\\};", + []>; +def SUST_B_1D_ARRAY_V2B8_ZERO : NVPTXInst<(outs), (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), - "sust.b.a1d.v2.b8.trap \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};", + "sust.b.a1d.v2.b8.zero \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};", []>; -def SUST_B_1D_ARRAY_V2B16_TRAP +def SUST_B_1D_ARRAY_V2B16_ZERO : NVPTXInst<(outs), (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), - "sust.b.a1d.v2.b16.trap \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};", + "sust.b.a1d.v2.b16.zero \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};", []>; -def SUST_B_1D_ARRAY_V2B32_TRAP +def SUST_B_1D_ARRAY_V2B32_ZERO : NVPTXInst<(outs), (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g), - "sust.b.a1d.v2.b32.trap \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};", + "sust.b.a1d.v2.b32.zero \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};", []>; -def SUST_B_1D_ARRAY_V4B8_TRAP +def SUST_B_1D_ARRAY_V2B64_ZERO + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int64Regs:$r, + Int64Regs:$g), + "sust.b.a1d.v2.b64.zero \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};", + []>; +def SUST_B_1D_ARRAY_V4B8_ZERO : NVPTXInst<(outs), (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - "sust.b.a1d.v4.b8.trap \t[$s, \\{$idx, $x\\}], " + "sust.b.a1d.v4.b8.zero \t[$s, \\{$idx, $x\\}], " "\\{$r, $g, $b, $a\\};", []>; -def SUST_B_1D_ARRAY_V4B16_TRAP 
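A note on the semantics threaded through these definitions: the suffix on each suld.b.*/sust.b.* mnemonic selects the PTX out-of-bounds policy for the surface access. .trap faults on an out-of-range coordinate, .clamp redirects the access to the nearest in-range location, and .zero returns zero for an out-of-range load and silently drops an out-of-range store. Below is a minimal CUDA sketch (not part of this patch) of the kind of device code that ends up exercising the suld.b.2d.b32.* and sust.b.2d.b32.* forms; the surface object is assumed to have been created by the host over a 2D uint32_t cudaArray, and scale_rows is a name invented here for illustration.

    #include <cuda_runtime.h>
    #include <stdint.h>

    __global__ void scale_rows(cudaSurfaceObject_t surf, int width, int height)
    {
        int x = blockIdx.x * blockDim.x + threadIdx.x;
        int y = blockIdx.y * blockDim.y + threadIdx.y;
        if (x >= width || y >= height)
            return;
        int xb = x * (int)sizeof(uint32_t);  // 2D surface accesses take x in bytes

        // cudaBoundaryModeClamp: an out-of-range read is clamped to the
        // nearest valid element (the suld.b.2d.b32.clamp case).
        uint32_t v = surf2Dread<uint32_t>(surf, xb, y, cudaBoundaryModeClamp);

        // cudaBoundaryModeZero: an out-of-range store is dropped
        // (the sust.b.2d.b32.zero case defined in this hunk).
        surf2Dwrite(2u * v, surf, xb, y, cudaBoundaryModeZero);
    }

With the guard in place every access here stays in range, so the boundary modes only matter if the launch grid overshoots width/height; cudaBoundaryModeTrap, the CUDA-side default, corresponds to the .trap variants.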
+def SUST_B_1D_ARRAY_V4B16_ZERO : NVPTXInst<(outs), (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - "sust.b.a1d.v4.b16.trap \t[$s, \\{$idx, $x\\}], " + "sust.b.a1d.v4.b16.zero \t[$s, \\{$idx, $x\\}], " "\\{$r, $g, $b, $a\\};", []>; -def SUST_B_1D_ARRAY_V4B32_TRAP +def SUST_B_1D_ARRAY_V4B32_ZERO : NVPTXInst<(outs), (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), - "sust.b.a1d.v4.b32.trap \t[$s, \\{$idx, $x\\}], " + "sust.b.a1d.v4.b32.zero \t[$s, \\{$idx, $x\\}], " "\\{$r, $g, $b, $a\\};", []>; -def SUST_B_2D_B8_TRAP +def SUST_B_2D_B8_ZERO : NVPTXInst<(outs), (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), - "sust.b.2d.b8.trap \t[$s, \\{$x, $y\\}], \\{$r\\};", + "sust.b.2d.b8.zero \t[$s, \\{$x, $y\\}], \\{$r\\};", []>; -def SUST_B_2D_B16_TRAP +def SUST_B_2D_B16_ZERO : NVPTXInst<(outs), (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), - "sust.b.2d.b16.trap \t[$s, \\{$x, $y\\}], \\{$r\\};", + "sust.b.2d.b16.zero \t[$s, \\{$x, $y\\}], \\{$r\\};", []>; -def SUST_B_2D_B32_TRAP +def SUST_B_2D_B32_ZERO : NVPTXInst<(outs), (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r), - "sust.b.2d.b32.trap \t[$s, \\{$x, $y\\}], \\{$r\\};", + "sust.b.2d.b32.zero \t[$s, \\{$x, $y\\}], \\{$r\\};", []>; -def SUST_B_2D_V2B8_TRAP +def SUST_B_2D_B64_ZERO + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r), + "sust.b.2d.b64.zero \t[$s, \\{$x, $y\\}], \\{$r\\};", + []>; +def SUST_B_2D_V2B8_ZERO : NVPTXInst<(outs), (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g), - "sust.b.2d.v2.b8.trap \t[$s, \\{$x, $y\\}], \\{$r, $g\\};", + "sust.b.2d.v2.b8.zero \t[$s, \\{$x, $y\\}], \\{$r, $g\\};", []>; -def SUST_B_2D_V2B16_TRAP +def SUST_B_2D_V2B16_ZERO : NVPTXInst<(outs), (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g), - "sust.b.2d.v2.b16.trap \t[$s, \\{$x, $y\\}], \\{$r, $g\\};", + "sust.b.2d.v2.b16.zero \t[$s, \\{$x, $y\\}], \\{$r, $g\\};", []>; -def SUST_B_2D_V2B32_TRAP +def SUST_B_2D_V2B32_ZERO : NVPTXInst<(outs), (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g), - "sust.b.2d.v2.b32.trap \t[$s, \\{$x, $y\\}], \\{$r, $g\\};", + "sust.b.2d.v2.b32.zero \t[$s, \\{$x, $y\\}], \\{$r, $g\\};", []>; -def SUST_B_2D_V4B8_TRAP +def SUST_B_2D_V2B64_ZERO + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, + Int64Regs:$g), + "sust.b.2d.v2.b64.zero \t[$s, \\{$x, $y\\}], \\{$r, $g\\};", + []>; +def SUST_B_2D_V4B8_ZERO : NVPTXInst<(outs), (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - "sust.b.2d.v4.b8.trap \t[$s, \\{$x, $y\\}], " + "sust.b.2d.v4.b8.zero \t[$s, \\{$x, $y\\}], " "\\{$r, $g, $b, $a\\};", []>; -def SUST_B_2D_V4B16_TRAP +def SUST_B_2D_V4B16_ZERO : NVPTXInst<(outs), (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - "sust.b.2d.v4.b16.trap \t[$s, \\{$x, $y\\}], " + "sust.b.2d.v4.b16.zero \t[$s, \\{$x, $y\\}], " "\\{$r, $g, $b, $a\\};", []>; -def SUST_B_2D_V4B32_TRAP +def SUST_B_2D_V4B32_ZERO : NVPTXInst<(outs), (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), - "sust.b.2d.v4.b32.trap \t[$s, \\{$x, $y\\}], " + "sust.b.2d.v4.b32.zero \t[$s, \\{$x, $y\\}], " "\\{$r, $g, $b, $a\\};", []>; -def SUST_B_2D_ARRAY_B8_TRAP +def SUST_B_2D_ARRAY_B8_ZERO : NVPTXInst<(outs), (ins 
Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), - "sust.b.a2d.b8.trap \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};", + "sust.b.a2d.b8.zero \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};", []>; -def SUST_B_2D_ARRAY_B16_TRAP +def SUST_B_2D_ARRAY_B16_ZERO : NVPTXInst<(outs), (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), - "sust.b.a2d.b16.trap \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};", + "sust.b.a2d.b16.zero \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};", []>; -def SUST_B_2D_ARRAY_B32_TRAP +def SUST_B_2D_ARRAY_B32_ZERO : NVPTXInst<(outs), (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r), - "sust.b.a2d.b32.trap \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};", + "sust.b.a2d.b32.zero \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};", []>; -def SUST_B_2D_ARRAY_V2B8_TRAP +def SUST_B_2D_ARRAY_B64_ZERO + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int64Regs:$r), + "sust.b.a2d.b64.zero \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};", + []>; +def SUST_B_2D_ARRAY_V2B8_ZERO : NVPTXInst<(outs), (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g), - "sust.b.a2d.v2.b8.trap \t[$s, \\{$idx, $x, $y, $y\\}], " + "sust.b.a2d.v2.b8.zero \t[$s, \\{$idx, $x, $y, $y\\}], " "\\{$r, $g\\};", []>; -def SUST_B_2D_ARRAY_V2B16_TRAP +def SUST_B_2D_ARRAY_V2B16_ZERO : NVPTXInst<(outs), (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g), - "sust.b.a2d.v2.b16.trap \t[$s, \\{$idx, $x, $y, $y\\}], " + "sust.b.a2d.v2.b16.zero \t[$s, \\{$idx, $x, $y, $y\\}], " "\\{$r, $g\\};", []>; -def SUST_B_2D_ARRAY_V2B32_TRAP +def SUST_B_2D_ARRAY_V2B32_ZERO : NVPTXInst<(outs), (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g), - "sust.b.a2d.v2.b32.trap \t[$s, \\{$idx, $x, $y, $y\\}], " + "sust.b.a2d.v2.b32.zero \t[$s, \\{$idx, $x, $y, $y\\}], " "\\{$r, $g\\};", []>; -def SUST_B_2D_ARRAY_V4B8_TRAP +def SUST_B_2D_ARRAY_V2B64_ZERO + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int64Regs:$r, Int64Regs:$g), + "sust.b.a2d.v2.b64.zero \t[$s, \\{$idx, $x, $y, $y\\}], " + "\\{$r, $g\\};", + []>; +def SUST_B_2D_ARRAY_V4B8_ZERO : NVPTXInst<(outs), (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - "sust.b.a2d.v4.b8.trap \t[$s, \\{$idx, $x, $y, $y\\}], " + "sust.b.a2d.v4.b8.zero \t[$s, \\{$idx, $x, $y, $y\\}], " "\\{$r, $g, $b, $a\\};", []>; -def SUST_B_2D_ARRAY_V4B16_TRAP +def SUST_B_2D_ARRAY_V4B16_ZERO : NVPTXInst<(outs), (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - "sust.b.a2d.v4.b16.trap \t[$s, \\{$idx, $x, $y, $y\\}], " + "sust.b.a2d.v4.b16.zero \t[$s, \\{$idx, $x, $y, $y\\}], " "\\{$r, $g, $b, $a\\};", []>; -def SUST_B_2D_ARRAY_V4B32_TRAP +def SUST_B_2D_ARRAY_V4B32_ZERO : NVPTXInst<(outs), (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), - "sust.b.a2d.v4.b32.trap \t[$s, \\{$idx, $x, $y, $y\\}], " + "sust.b.a2d.v4.b32.zero \t[$s, \\{$idx, $x, $y, $y\\}], " "\\{$r, $g, $b, $a\\};", []>; -def SUST_B_3D_B8_TRAP +def SUST_B_3D_B8_ZERO : NVPTXInst<(outs), (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, Int16Regs:$r), - "sust.b.3d.b8.trap \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};", + "sust.b.3d.b8.zero \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};", []>; -def SUST_B_3D_B16_TRAP 
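// The repeated trailing coordinate in these address vectors is deliberate:
// for the a2d and 3d geometries PTX expects a four-component address vector
// whose last element is ignored, so {$idx, $x, $y, $y} and {$x, $y, $z, $z}
// simply pad with a copy of the last real coordinate. A 3d b32 store, for
// instance, should come out roughly as:
//   sust.b.3d.b32.zero [%rd1, {%r2, %r3, %r4, %r4}], {%r5};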
+def SUST_B_3D_B16_ZERO : NVPTXInst<(outs), (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, Int16Regs:$r), - "sust.b.3d.b16.trap \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};", + "sust.b.3d.b16.zero \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};", []>; -def SUST_B_3D_B32_TRAP +def SUST_B_3D_B32_ZERO : NVPTXInst<(outs), (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, Int32Regs:$r), - "sust.b.3d.b32.trap \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};", + "sust.b.3d.b32.zero \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};", []>; -def SUST_B_3D_V2B8_TRAP +def SUST_B_3D_B64_ZERO + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int64Regs:$r), + "sust.b.3d.b64.zero \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};", + []>; +def SUST_B_3D_V2B8_ZERO : NVPTXInst<(outs), (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, Int16Regs:$r, Int16Regs:$g), - "sust.b.3d.v2.b8.trap \t[$s, \\{$x, $y, $z, $z\\}], " + "sust.b.3d.v2.b8.zero \t[$s, \\{$x, $y, $z, $z\\}], " "\\{$r, $g\\};", []>; -def SUST_B_3D_V2B16_TRAP +def SUST_B_3D_V2B16_ZERO : NVPTXInst<(outs), (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, Int16Regs:$r, Int16Regs:$g), - "sust.b.3d.v2.b16.trap \t[$s, \\{$x, $y, $z, $z\\}], " + "sust.b.3d.v2.b16.zero \t[$s, \\{$x, $y, $z, $z\\}], " "\\{$r, $g\\};", []>; -def SUST_B_3D_V2B32_TRAP +def SUST_B_3D_V2B32_ZERO : NVPTXInst<(outs), (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, Int32Regs:$r, Int32Regs:$g), - "sust.b.3d.v2.b32.trap \t[$s, \\{$x, $y, $z, $z\\}], " + "sust.b.3d.v2.b32.zero \t[$s, \\{$x, $y, $z, $z\\}], " "\\{$r, $g\\};", []>; -def SUST_B_3D_V4B8_TRAP +def SUST_B_3D_V2B64_ZERO + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int64Regs:$r, Int64Regs:$g), + "sust.b.3d.v2.b64.zero \t[$s, \\{$x, $y, $z, $z\\}], " + "\\{$r, $g\\};", + []>; +def SUST_B_3D_V4B8_ZERO : NVPTXInst<(outs), (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - "sust.b.3d.v4.b8.trap \t[$s, \\{$x, $y, $z, $z\\}], " + "sust.b.3d.v4.b8.zero \t[$s, \\{$x, $y, $z, $z\\}], " "\\{$r, $g, $b, $a\\};", []>; -def SUST_B_3D_V4B16_TRAP +def SUST_B_3D_V4B16_ZERO : NVPTXInst<(outs), (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - "sust.b.3d.v4.b16.trap \t[$s, \\{$x, $y, $z, $z\\}], " + "sust.b.3d.v4.b16.zero \t[$s, \\{$x, $y, $z, $z\\}], " "\\{$r, $g, $b, $a\\};", []>; -def SUST_B_3D_V4B32_TRAP +def SUST_B_3D_V4B32_ZERO : NVPTXInst<(outs), (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), - "sust.b.3d.v4.b32.trap \t[$s, \\{$x, $y, $z, $z\\}], " + "sust.b.3d.v4.b32.zero \t[$s, \\{$x, $y, $z, $z\\}], " "\\{$r, $g, $b, $a\\};", []>; + + // Formatted def SUST_P_1D_B8_TRAP @@ -2957,12 +5687,341 @@ def SUST_P_3D_V4B32_TRAP "sust.p.3d.v4.b32.trap \t[$s, \\{$x, $y, $z, $z\\}], " "\\{$r, $g, $b, $a\\};", []>; +} + +// Surface store instruction patterns +// I'm not sure why we can't just include these in the instruction definitions, +// but TableGen complains of type errors :( + +// .clamp variant +def : Pat<(int_nvvm_sust_b_1d_i8_clamp + Int64Regs:$s, Int32Regs:$x, Int16Regs:$r), + (SUST_B_1D_B8_CLAMP Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_1d_i16_clamp + Int64Regs:$s, Int32Regs:$x, Int16Regs:$r), + (SUST_B_1D_B16_CLAMP Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_1d_i32_clamp 
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$r), + (SUST_B_1D_B32_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_1d_i64_clamp + Int64Regs:$s, Int32Regs:$x, Int64Regs:$r), + (SUST_B_1D_B64_CLAMP Int64Regs:$s, Int32Regs:$x, Int64Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_1d_v2i8_clamp + Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), + (SUST_B_1D_V2B8_CLAMP Int64Regs:$s, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_1d_v2i16_clamp + Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), + (SUST_B_1D_V2B16_CLAMP Int64Regs:$s, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_1d_v2i32_clamp + Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g), + (SUST_B_1D_V2B32_CLAMP Int64Regs:$s, Int32Regs:$x, + Int32Regs:$r, Int32Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_1d_v2i64_clamp + Int64Regs:$s, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g), + (SUST_B_1D_V2B64_CLAMP Int64Regs:$s, Int32Regs:$x, + Int64Regs:$r, Int64Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_1d_v4i8_clamp + Int64Regs:$s, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (SUST_B_1D_V4B8_CLAMP Int64Regs:$s, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; + +def : Pat<(int_nvvm_sust_b_1d_v4i16_clamp + Int64Regs:$s, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (SUST_B_1D_V4B16_CLAMP Int64Regs:$s, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; + +def : Pat<(int_nvvm_sust_b_1d_v4i32_clamp + Int64Regs:$s, Int32Regs:$x, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + (SUST_B_1D_V4B32_CLAMP Int64Regs:$s, Int32Regs:$x, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; + + + +def : Pat<(int_nvvm_sust_b_1d_array_i8_clamp + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r), + (SUST_B_1D_ARRAY_B8_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int16Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_1d_array_i16_clamp + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r), + (SUST_B_1D_ARRAY_B16_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int16Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_1d_array_i32_clamp + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r), + (SUST_B_1D_ARRAY_B32_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int32Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_1d_array_i64_clamp + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int64Regs:$r), + (SUST_B_1D_ARRAY_B64_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int64Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_1d_array_v2i8_clamp + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), + (SUST_B_1D_ARRAY_V2B8_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_1d_array_v2i16_clamp + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), + (SUST_B_1D_ARRAY_V2B16_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_1d_array_v2i32_clamp + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g), + (SUST_B_1D_ARRAY_V2B32_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int32Regs:$r, Int32Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_1d_array_v2i64_clamp + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g), + (SUST_B_1D_ARRAY_V2B64_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int64Regs:$r, Int64Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_1d_array_v4i8_clamp + Int64Regs:$s, Int32Regs:$l, 
Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (SUST_B_1D_ARRAY_V4B8_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; + +def : Pat<(int_nvvm_sust_b_1d_array_v4i16_clamp + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (SUST_B_1D_ARRAY_V4B16_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; + +def : Pat<(int_nvvm_sust_b_1d_array_v4i32_clamp + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + (SUST_B_1D_ARRAY_V4B32_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; + + + +def : Pat<(int_nvvm_sust_b_2d_i8_clamp + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), + (SUST_B_2D_B8_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_2d_i16_clamp + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), + (SUST_B_2D_B16_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_2d_i32_clamp + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r), + (SUST_B_2D_B32_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_2d_i64_clamp + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r), + (SUST_B_2D_B64_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int64Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_2d_v2i8_clamp + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g), + (SUST_B_2D_V2B8_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_2d_v2i16_clamp + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g), + (SUST_B_2D_V2B16_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_2d_v2i32_clamp + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g), + (SUST_B_2D_V2B32_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$r, Int32Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_2d_v2i64_clamp + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, Int64Regs:$g), + (SUST_B_2D_V2B64_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int64Regs:$r, Int64Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_2d_v4i8_clamp + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (SUST_B_2D_V4B8_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; + +def : Pat<(int_nvvm_sust_b_2d_v4i16_clamp + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (SUST_B_2D_V4B16_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; + +def : Pat<(int_nvvm_sust_b_2d_v4i32_clamp + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + (SUST_B_2D_V4B32_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; + + + +def : Pat<(int_nvvm_sust_b_2d_array_i8_clamp + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), + (SUST_B_2D_ARRAY_B8_CLAMP Int64Regs:$s, + Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_2d_array_i16_clamp + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, 
Int16Regs:$r), + (SUST_B_2D_ARRAY_B16_CLAMP Int64Regs:$s, + Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_2d_array_i32_clamp + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r), + (SUST_B_2D_ARRAY_B32_CLAMP Int64Regs:$s, + Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_2d_array_i64_clamp + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r), + (SUST_B_2D_ARRAY_B64_CLAMP Int64Regs:$s, + Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int64Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_2d_array_v2i8_clamp + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g), + (SUST_B_2D_ARRAY_V2B8_CLAMP Int64Regs:$s, Int32Regs:$l, + Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_2d_array_v2i16_clamp + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g), + (SUST_B_2D_ARRAY_V2B16_CLAMP Int64Regs:$s, Int32Regs:$l, + Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_2d_array_v2i32_clamp + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, + Int32Regs:$g), + (SUST_B_2D_ARRAY_V2B32_CLAMP Int64Regs:$s, Int32Regs:$l, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_2d_array_v2i64_clamp + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, + Int64Regs:$g), + (SUST_B_2D_ARRAY_V2B64_CLAMP Int64Regs:$s, Int32Regs:$l, + Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, Int64Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_2d_array_v4i8_clamp + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (SUST_B_2D_ARRAY_V4B8_CLAMP Int64Regs:$s, + Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; + +def : Pat<(int_nvvm_sust_b_2d_array_v4i16_clamp + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (SUST_B_2D_ARRAY_V4B16_CLAMP Int64Regs:$s, + Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; + +def : Pat<(int_nvvm_sust_b_2d_array_v4i32_clamp + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + (SUST_B_2D_ARRAY_V4B32_CLAMP Int64Regs:$s, Int32Regs:$l, + Int32Regs:$x, Int32Regs:$y, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; + + + +def : Pat<(int_nvvm_sust_b_3d_i8_clamp + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r), + (SUST_B_3D_B8_CLAMP Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_3d_i16_clamp + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r), + (SUST_B_3D_B16_CLAMP Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r)>; +def : Pat<(int_nvvm_sust_b_3d_i32_clamp + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int32Regs:$r), + (SUST_B_3D_B32_CLAMP Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int32Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_3d_i64_clamp + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int64Regs:$r), + (SUST_B_3D_B64_CLAMP Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int64Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_3d_v2i8_clamp + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, 
Int16Regs:$g), + (SUST_B_3D_V2B8_CLAMP Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_3d_v2i16_clamp + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g), + (SUST_B_3D_V2B16_CLAMP Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_3d_v2i32_clamp + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int32Regs:$r, Int32Regs:$g), + (SUST_B_3D_V2B32_CLAMP Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int32Regs:$r, Int32Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_3d_v2i64_clamp + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int64Regs:$r, Int64Regs:$g), + (SUST_B_3D_V2B64_CLAMP Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int64Regs:$r, Int64Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_3d_v4i8_clamp + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (SUST_B_3D_V4B8_CLAMP Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; + +def : Pat<(int_nvvm_sust_b_3d_v4i16_clamp + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (SUST_B_3D_V4B16_CLAMP Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; + +def : Pat<(int_nvvm_sust_b_3d_v4i32_clamp + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + (SUST_B_3D_V4B32_CLAMP Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; -// Surface store instruction patterns -// I'm not sure why we can't just include these in the instruction definitions, -// but TableGen complains of type errors :( +// .trap variant def : Pat<(int_nvvm_sust_b_1d_i8_trap Int64Regs:$s, Int32Regs:$x, Int16Regs:$r), (SUST_B_1D_B8_TRAP Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>; @@ -2975,6 +6034,10 @@ def : Pat<(int_nvvm_sust_b_1d_i32_trap Int64Regs:$s, Int32Regs:$x, Int32Regs:$r), (SUST_B_1D_B32_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$r)>; +def : Pat<(int_nvvm_sust_b_1d_i64_trap + Int64Regs:$s, Int32Regs:$x, Int64Regs:$r), + (SUST_B_1D_B64_TRAP Int64Regs:$s, Int32Regs:$x, Int64Regs:$r)>; + def : Pat<(int_nvvm_sust_b_1d_v2i8_trap Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), (SUST_B_1D_V2B8_TRAP Int64Regs:$s, Int32Regs:$x, @@ -2990,6 +6053,11 @@ def : Pat<(int_nvvm_sust_b_1d_v2i32_trap (SUST_B_1D_V2B32_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g)>; +def : Pat<(int_nvvm_sust_b_1d_v2i64_trap + Int64Regs:$s, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g), + (SUST_B_1D_V2B64_TRAP Int64Regs:$s, Int32Regs:$x, + Int64Regs:$r, Int64Regs:$g)>; + def : Pat<(int_nvvm_sust_b_1d_v4i8_trap Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), @@ -3025,6 +6093,11 @@ def : Pat<(int_nvvm_sust_b_1d_array_i32_trap (SUST_B_1D_ARRAY_B32_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r)>; +def : Pat<(int_nvvm_sust_b_1d_array_i64_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int64Regs:$r), + (SUST_B_1D_ARRAY_B64_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int64Regs:$r)>; + def : Pat<(int_nvvm_sust_b_1d_array_v2i8_trap Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), (SUST_B_1D_ARRAY_V2B8_TRAP Int64Regs:$s, 
Int32Regs:$l, Int32Regs:$x, @@ -3040,6 +6113,11 @@ def : Pat<(int_nvvm_sust_b_1d_array_v2i32_trap (SUST_B_1D_ARRAY_V2B32_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g)>; +def : Pat<(int_nvvm_sust_b_1d_array_v2i64_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g), + (SUST_B_1D_ARRAY_V2B64_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int64Regs:$r, Int64Regs:$g)>; + def : Pat<(int_nvvm_sust_b_1d_array_v4i8_trap Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), @@ -3075,6 +6153,11 @@ def : Pat<(int_nvvm_sust_b_2d_i32_trap (SUST_B_2D_B32_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r)>; +def : Pat<(int_nvvm_sust_b_2d_i64_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r), + (SUST_B_2D_B64_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int64Regs:$r)>; + def : Pat<(int_nvvm_sust_b_2d_v2i8_trap Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g), (SUST_B_2D_V2B8_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, @@ -3090,6 +6173,11 @@ def : Pat<(int_nvvm_sust_b_2d_v2i32_trap (SUST_B_2D_V2B32_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g)>; +def : Pat<(int_nvvm_sust_b_2d_v2i64_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, Int64Regs:$g), + (SUST_B_2D_V2B64_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int64Regs:$r, Int64Regs:$g)>; + def : Pat<(int_nvvm_sust_b_2d_v4i8_trap Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), @@ -3128,6 +6216,12 @@ def : Pat<(int_nvvm_sust_b_2d_array_i32_trap Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r)>; +def : Pat<(int_nvvm_sust_b_2d_array_i64_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r), + (SUST_B_2D_ARRAY_B64_TRAP Int64Regs:$s, + Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int64Regs:$r)>; + def : Pat<(int_nvvm_sust_b_2d_array_v2i8_trap Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g), @@ -3148,6 +6242,12 @@ def : Pat<(int_nvvm_sust_b_2d_array_v2i32_trap (SUST_B_2D_ARRAY_V2B32_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g)>; +def : Pat<(int_nvvm_sust_b_2d_array_v2i64_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, + Int64Regs:$g), + (SUST_B_2D_ARRAY_V2B64_TRAP Int64Regs:$s, Int32Regs:$l, + Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, Int64Regs:$g)>; + def : Pat<(int_nvvm_sust_b_2d_array_v4i8_trap Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), @@ -3192,6 +6292,13 @@ def : Pat<(int_nvvm_sust_b_3d_i32_trap Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, Int32Regs:$r)>; +def : Pat<(int_nvvm_sust_b_3d_i64_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int64Regs:$r), + (SUST_B_3D_B64_TRAP Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int64Regs:$r)>; + def : Pat<(int_nvvm_sust_b_3d_v2i8_trap Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, Int16Regs:$r, Int16Regs:$g), @@ -3213,6 +6320,13 @@ def : Pat<(int_nvvm_sust_b_3d_v2i32_trap Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, Int32Regs:$r, Int32Regs:$g)>; +def : Pat<(int_nvvm_sust_b_3d_v2i64_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int64Regs:$r, Int64Regs:$g), + (SUST_B_3D_V2B64_TRAP Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int64Regs:$r, Int64Regs:$g)>; + def : Pat<(int_nvvm_sust_b_3d_v4i8_trap 
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), @@ -3235,6 +6349,334 @@ def : Pat<(int_nvvm_sust_b_3d_v4i32_trap Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; +// .zero variant +def : Pat<(int_nvvm_sust_b_1d_i8_zero + Int64Regs:$s, Int32Regs:$x, Int16Regs:$r), + (SUST_B_1D_B8_ZERO Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_1d_i16_zero + Int64Regs:$s, Int32Regs:$x, Int16Regs:$r), + (SUST_B_1D_B16_ZERO Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_1d_i32_zero + Int64Regs:$s, Int32Regs:$x, Int32Regs:$r), + (SUST_B_1D_B32_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_1d_i64_zero + Int64Regs:$s, Int32Regs:$x, Int64Regs:$r), + (SUST_B_1D_B64_ZERO Int64Regs:$s, Int32Regs:$x, Int64Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_1d_v2i8_zero + Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), + (SUST_B_1D_V2B8_ZERO Int64Regs:$s, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_1d_v2i16_zero + Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), + (SUST_B_1D_V2B16_ZERO Int64Regs:$s, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_1d_v2i32_zero + Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g), + (SUST_B_1D_V2B32_ZERO Int64Regs:$s, Int32Regs:$x, + Int32Regs:$r, Int32Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_1d_v2i64_zero + Int64Regs:$s, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g), + (SUST_B_1D_V2B64_ZERO Int64Regs:$s, Int32Regs:$x, + Int64Regs:$r, Int64Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_1d_v4i8_zero + Int64Regs:$s, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (SUST_B_1D_V4B8_ZERO Int64Regs:$s, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; + +def : Pat<(int_nvvm_sust_b_1d_v4i16_zero + Int64Regs:$s, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (SUST_B_1D_V4B16_ZERO Int64Regs:$s, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; + +def : Pat<(int_nvvm_sust_b_1d_v4i32_zero + Int64Regs:$s, Int32Regs:$x, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + (SUST_B_1D_V4B32_ZERO Int64Regs:$s, Int32Regs:$x, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; + + + +def : Pat<(int_nvvm_sust_b_1d_array_i8_zero + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r), + (SUST_B_1D_ARRAY_B8_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int16Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_1d_array_i16_zero + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r), + (SUST_B_1D_ARRAY_B16_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int16Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_1d_array_i32_zero + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r), + (SUST_B_1D_ARRAY_B32_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int32Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_1d_array_i64_zero + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int64Regs:$r), + (SUST_B_1D_ARRAY_B64_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int64Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_1d_array_v2i8_zero + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), + (SUST_B_1D_ARRAY_V2B8_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_1d_array_v2i16_zero + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), + (SUST_B_1D_ARRAY_V2B16_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + 
Int16Regs:$r, Int16Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_1d_array_v2i32_zero + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g), + (SUST_B_1D_ARRAY_V2B32_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int32Regs:$r, Int32Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_1d_array_v2i64_zero + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g), + (SUST_B_1D_ARRAY_V2B64_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int64Regs:$r, Int64Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_1d_array_v4i8_zero + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (SUST_B_1D_ARRAY_V4B8_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; + +def : Pat<(int_nvvm_sust_b_1d_array_v4i16_zero + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (SUST_B_1D_ARRAY_V4B16_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; + +def : Pat<(int_nvvm_sust_b_1d_array_v4i32_zero + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + (SUST_B_1D_ARRAY_V4B32_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; + + + +def : Pat<(int_nvvm_sust_b_2d_i8_zero + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), + (SUST_B_2D_B8_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_2d_i16_zero + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), + (SUST_B_2D_B16_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_2d_i32_zero + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r), + (SUST_B_2D_B32_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_2d_i64_zero + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r), + (SUST_B_2D_B64_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int64Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_2d_v2i8_zero + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g), + (SUST_B_2D_V2B8_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_2d_v2i16_zero + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g), + (SUST_B_2D_V2B16_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_2d_v2i32_zero + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g), + (SUST_B_2D_V2B32_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$r, Int32Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_2d_v2i64_zero + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, Int64Regs:$g), + (SUST_B_2D_V2B64_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int64Regs:$r, Int64Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_2d_v4i8_zero + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (SUST_B_2D_V4B8_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; + +def : Pat<(int_nvvm_sust_b_2d_v4i16_zero + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (SUST_B_2D_V4B16_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; + +def : Pat<(int_nvvm_sust_b_2d_v4i32_zero + Int64Regs:$s, Int32Regs:$x, 
Int32Regs:$y, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + (SUST_B_2D_V4B32_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; + + + +def : Pat<(int_nvvm_sust_b_2d_array_i8_zero + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), + (SUST_B_2D_ARRAY_B8_ZERO Int64Regs:$s, + Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_2d_array_i16_zero + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), + (SUST_B_2D_ARRAY_B16_ZERO Int64Regs:$s, + Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_2d_array_i32_zero + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r), + (SUST_B_2D_ARRAY_B32_ZERO Int64Regs:$s, + Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_2d_array_i64_zero + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r), + (SUST_B_2D_ARRAY_B64_ZERO Int64Regs:$s, + Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int64Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_2d_array_v2i8_zero + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g), + (SUST_B_2D_ARRAY_V2B8_ZERO Int64Regs:$s, Int32Regs:$l, + Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_2d_array_v2i16_zero + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g), + (SUST_B_2D_ARRAY_V2B16_ZERO Int64Regs:$s, Int32Regs:$l, + Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_2d_array_v2i32_zero + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, + Int32Regs:$g), + (SUST_B_2D_ARRAY_V2B32_ZERO Int64Regs:$s, Int32Regs:$l, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_2d_array_v2i64_zero + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, + Int64Regs:$g), + (SUST_B_2D_ARRAY_V2B64_ZERO Int64Regs:$s, Int32Regs:$l, + Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, Int64Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_2d_array_v4i8_zero + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (SUST_B_2D_ARRAY_V4B8_ZERO Int64Regs:$s, + Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; + +def : Pat<(int_nvvm_sust_b_2d_array_v4i16_zero + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (SUST_B_2D_ARRAY_V4B16_ZERO Int64Regs:$s, + Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; + +def : Pat<(int_nvvm_sust_b_2d_array_v4i32_zero + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + (SUST_B_2D_ARRAY_V4B32_ZERO Int64Regs:$s, Int32Regs:$l, + Int32Regs:$x, Int32Regs:$y, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; + + + +def : Pat<(int_nvvm_sust_b_3d_i8_zero + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r), + (SUST_B_3D_B8_ZERO Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_3d_i16_zero + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r), + (SUST_B_3D_B16_ZERO Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_3d_i32_zero + 
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int32Regs:$r), + (SUST_B_3D_B32_ZERO Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int32Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_3d_i64_zero + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int64Regs:$r), + (SUST_B_3D_B64_ZERO Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int64Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_3d_v2i8_zero + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g), + (SUST_B_3D_V2B8_ZERO Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_3d_v2i16_zero + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g), + (SUST_B_3D_V2B16_ZERO Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_3d_v2i32_zero + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int32Regs:$r, Int32Regs:$g), + (SUST_B_3D_V2B32_ZERO Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int32Regs:$r, Int32Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_3d_v2i64_zero + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int64Regs:$r, Int64Regs:$g), + (SUST_B_3D_V2B64_ZERO Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int64Regs:$r, Int64Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_3d_v4i8_zero + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (SUST_B_3D_V4B8_ZERO Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; + +def : Pat<(int_nvvm_sust_b_3d_v4i16_zero + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (SUST_B_3D_V4B16_ZERO Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; + +def : Pat<(int_nvvm_sust_b_3d_v4i32_zero + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + (SUST_B_3D_V4B32_ZERO Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; + + def : Pat<(int_nvvm_sust_p_1d_i8_trap diff --git a/lib/Target/NVPTX/NVPTXMCExpr.h b/lib/Target/NVPTX/NVPTXMCExpr.h index 0ee018cc7e5d..554764930a9e 100644 --- a/lib/Target/NVPTX/NVPTXMCExpr.h +++ b/lib/Target/NVPTX/NVPTXMCExpr.h @@ -66,7 +66,7 @@ class NVPTXFloatMCExpr : public MCTargetExpr { const MCAsmLayout *Layout) const override { return false; } - void AddValueSymbols(MCAssembler *) const override {}; + void visitUsedExpr(MCStreamer &Streamer) const override {}; const MCSection *FindAssociatedSection() const override { return nullptr; } diff --git a/lib/Target/NVPTX/NVPTXRegisterInfo.cpp b/lib/Target/NVPTX/NVPTXRegisterInfo.cpp index 62f288b67caa..358ccce39818 100644 --- a/lib/Target/NVPTX/NVPTXRegisterInfo.cpp +++ b/lib/Target/NVPTX/NVPTXRegisterInfo.cpp @@ -53,9 +53,9 @@ std::string getNVPTXRegClassStr(TargetRegisterClass const *RC) { return "%f"; } if (RC == &NVPTX::Float64RegsRegClass) { - return "%fl"; + return "%fd"; } else if (RC == &NVPTX::Int64RegsRegClass) { - return "%rl"; + return "%rd"; } else if (RC == &NVPTX::Int32RegsRegClass) { return "%r"; } else if (RC == &NVPTX::Int16RegsRegClass) { diff --git a/lib/Target/NVPTX/NVPTXRegisterInfo.td b/lib/Target/NVPTX/NVPTXRegisterInfo.td index 7a38a66b9227..efcee6b6f2bd 100644 --- 
a/lib/Target/NVPTX/NVPTXRegisterInfo.td +++ b/lib/Target/NVPTX/NVPTXRegisterInfo.td @@ -35,9 +35,9 @@ foreach i = 0-4 in { def P#i : NVPTXReg<"%p"#i>; // Predicate def RS#i : NVPTXReg<"%rs"#i>; // 16-bit def R#i : NVPTXReg<"%r"#i>; // 32-bit - def RL#i : NVPTXReg<"%rl"#i>; // 64-bit + def RL#i : NVPTXReg<"%rd"#i>; // 64-bit def F#i : NVPTXReg<"%f"#i>; // 32-bit float - def FL#i : NVPTXReg<"%fl"#i>; // 64-bit float + def FL#i : NVPTXReg<"%fd"#i>; // 64-bit float // Arguments def ia#i : NVPTXReg<"%ia"#i>; @@ -46,6 +46,10 @@ foreach i = 0-4 in { def da#i : NVPTXReg<"%da"#i>; } +foreach i = 0-31 in { + def ENVREG#i : NVPTXReg<"%envreg"#i>; +} + //===----------------------------------------------------------------------===// // Register classes //===----------------------------------------------------------------------===// @@ -61,4 +65,5 @@ def Float32ArgRegs : NVPTXRegClass<[f32], 32, (add (sequence "fa%u", 0, 4))>; def Float64ArgRegs : NVPTXRegClass<[f64], 64, (add (sequence "da%u", 0, 4))>; // Read NVPTXRegisterInfo.cpp to see how VRFrame and VRDepot are used. -def SpecialRegs : NVPTXRegClass<[i32], 32, (add VRFrame, VRDepot)>; +def SpecialRegs : NVPTXRegClass<[i32], 32, (add VRFrame, VRDepot, + (sequence "ENVREG%u", 0, 31))>; diff --git a/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp b/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp index afd53a6a84cd..20d4e272341e 100644 --- a/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp +++ b/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp @@ -15,6 +15,7 @@ #include "NVPTX.h" #include "NVPTXMachineFunctionInfo.h" +#include "NVPTXSubtarget.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -32,10 +33,16 @@ class NVPTXReplaceImageHandles : public MachineFunctionPass { public: NVPTXReplaceImageHandles(); - bool runOnMachineFunction(MachineFunction &MF) override; + bool runOnMachineFunction(MachineFunction &MF); + + virtual const char *getPassName() const { + return "NVPTX Replace Image Handles"; + } private: bool processInstr(MachineInstr &MI); void replaceImageHandle(MachineOperand &Op, MachineFunction &MF); + bool findIndexForHandle(MachineOperand &Op, MachineFunction &MF, + unsigned &Idx); }; } @@ -65,242 +72,43 @@ bool NVPTXReplaceImageHandles::runOnMachineFunction(MachineFunction &MF) { E = InstrsToRemove.end(); I != E; ++I) { (*I)->eraseFromParent(); } - return Changed; } bool NVPTXReplaceImageHandles::processInstr(MachineInstr &MI) { MachineFunction &MF = *MI.getParent()->getParent(); - // Check if we have a surface/texture instruction - switch (MI.getOpcode()) { - default: return false; - case NVPTX::TEX_1D_F32_I32: - case NVPTX::TEX_1D_F32_F32: - case NVPTX::TEX_1D_F32_F32_LEVEL: - case NVPTX::TEX_1D_F32_F32_GRAD: - case NVPTX::TEX_1D_I32_I32: - case NVPTX::TEX_1D_I32_F32: - case NVPTX::TEX_1D_I32_F32_LEVEL: - case NVPTX::TEX_1D_I32_F32_GRAD: - case NVPTX::TEX_1D_ARRAY_F32_I32: - case NVPTX::TEX_1D_ARRAY_F32_F32: - case NVPTX::TEX_1D_ARRAY_F32_F32_LEVEL: - case NVPTX::TEX_1D_ARRAY_F32_F32_GRAD: - case NVPTX::TEX_1D_ARRAY_I32_I32: - case NVPTX::TEX_1D_ARRAY_I32_F32: - case NVPTX::TEX_1D_ARRAY_I32_F32_LEVEL: - case NVPTX::TEX_1D_ARRAY_I32_F32_GRAD: - case NVPTX::TEX_2D_F32_I32: - case NVPTX::TEX_2D_F32_F32: - case NVPTX::TEX_2D_F32_F32_LEVEL: - case NVPTX::TEX_2D_F32_F32_GRAD: - case NVPTX::TEX_2D_I32_I32: - case NVPTX::TEX_2D_I32_F32: - case NVPTX::TEX_2D_I32_F32_LEVEL: - case NVPTX::TEX_2D_I32_F32_GRAD: - case NVPTX::TEX_2D_ARRAY_F32_I32: - case 
NVPTX::TEX_2D_ARRAY_F32_F32: - case NVPTX::TEX_2D_ARRAY_F32_F32_LEVEL: - case NVPTX::TEX_2D_ARRAY_F32_F32_GRAD: - case NVPTX::TEX_2D_ARRAY_I32_I32: - case NVPTX::TEX_2D_ARRAY_I32_F32: - case NVPTX::TEX_2D_ARRAY_I32_F32_LEVEL: - case NVPTX::TEX_2D_ARRAY_I32_F32_GRAD: - case NVPTX::TEX_3D_F32_I32: - case NVPTX::TEX_3D_F32_F32: - case NVPTX::TEX_3D_F32_F32_LEVEL: - case NVPTX::TEX_3D_F32_F32_GRAD: - case NVPTX::TEX_3D_I32_I32: - case NVPTX::TEX_3D_I32_F32: - case NVPTX::TEX_3D_I32_F32_LEVEL: - case NVPTX::TEX_3D_I32_F32_GRAD: { + const MCInstrDesc &MCID = MI.getDesc(); + + if (MCID.TSFlags & NVPTXII::IsTexFlag) { // This is a texture fetch, so operand 4 is a texref and operand 5 is // a samplerref MachineOperand &TexHandle = MI.getOperand(4); - MachineOperand &SampHandle = MI.getOperand(5); - replaceImageHandle(TexHandle, MF); - replaceImageHandle(SampHandle, MF); - - return true; - } - case NVPTX::SULD_1D_I8_TRAP: - case NVPTX::SULD_1D_I16_TRAP: - case NVPTX::SULD_1D_I32_TRAP: - case NVPTX::SULD_1D_ARRAY_I8_TRAP: - case NVPTX::SULD_1D_ARRAY_I16_TRAP: - case NVPTX::SULD_1D_ARRAY_I32_TRAP: - case NVPTX::SULD_2D_I8_TRAP: - case NVPTX::SULD_2D_I16_TRAP: - case NVPTX::SULD_2D_I32_TRAP: - case NVPTX::SULD_2D_ARRAY_I8_TRAP: - case NVPTX::SULD_2D_ARRAY_I16_TRAP: - case NVPTX::SULD_2D_ARRAY_I32_TRAP: - case NVPTX::SULD_3D_I8_TRAP: - case NVPTX::SULD_3D_I16_TRAP: - case NVPTX::SULD_3D_I32_TRAP: { - // This is a V1 surface load, so operand 1 is a surfref - MachineOperand &SurfHandle = MI.getOperand(1); - replaceImageHandle(SurfHandle, MF); + if (!(MCID.TSFlags & NVPTXII::IsTexModeUnifiedFlag)) { + MachineOperand &SampHandle = MI.getOperand(5); + replaceImageHandle(SampHandle, MF); + } return true; - } - case NVPTX::SULD_1D_V2I8_TRAP: - case NVPTX::SULD_1D_V2I16_TRAP: - case NVPTX::SULD_1D_V2I32_TRAP: - case NVPTX::SULD_1D_ARRAY_V2I8_TRAP: - case NVPTX::SULD_1D_ARRAY_V2I16_TRAP: - case NVPTX::SULD_1D_ARRAY_V2I32_TRAP: - case NVPTX::SULD_2D_V2I8_TRAP: - case NVPTX::SULD_2D_V2I16_TRAP: - case NVPTX::SULD_2D_V2I32_TRAP: - case NVPTX::SULD_2D_ARRAY_V2I8_TRAP: - case NVPTX::SULD_2D_ARRAY_V2I16_TRAP: - case NVPTX::SULD_2D_ARRAY_V2I32_TRAP: - case NVPTX::SULD_3D_V2I8_TRAP: - case NVPTX::SULD_3D_V2I16_TRAP: - case NVPTX::SULD_3D_V2I32_TRAP: { - // This is a V2 surface load, so operand 2 is a surfref - MachineOperand &SurfHandle = MI.getOperand(2); - - replaceImageHandle(SurfHandle, MF); + } else if (MCID.TSFlags & NVPTXII::IsSuldMask) { + unsigned VecSize = + 1 << (((MCID.TSFlags & NVPTXII::IsSuldMask) >> NVPTXII::IsSuldShift) - 1); - return true; - } - case NVPTX::SULD_1D_V4I8_TRAP: - case NVPTX::SULD_1D_V4I16_TRAP: - case NVPTX::SULD_1D_V4I32_TRAP: - case NVPTX::SULD_1D_ARRAY_V4I8_TRAP: - case NVPTX::SULD_1D_ARRAY_V4I16_TRAP: - case NVPTX::SULD_1D_ARRAY_V4I32_TRAP: - case NVPTX::SULD_2D_V4I8_TRAP: - case NVPTX::SULD_2D_V4I16_TRAP: - case NVPTX::SULD_2D_V4I32_TRAP: - case NVPTX::SULD_2D_ARRAY_V4I8_TRAP: - case NVPTX::SULD_2D_ARRAY_V4I16_TRAP: - case NVPTX::SULD_2D_ARRAY_V4I32_TRAP: - case NVPTX::SULD_3D_V4I8_TRAP: - case NVPTX::SULD_3D_V4I16_TRAP: - case NVPTX::SULD_3D_V4I32_TRAP: { - // This is a V4 surface load, so operand 4 is a surfref - MachineOperand &SurfHandle = MI.getOperand(4); + // For a surface load of vector size N, the Nth operand will be the surfref + MachineOperand &SurfHandle = MI.getOperand(VecSize); replaceImageHandle(SurfHandle, MF); return true; - } - case NVPTX::SUST_B_1D_B8_TRAP: - case NVPTX::SUST_B_1D_B16_TRAP: - case NVPTX::SUST_B_1D_B32_TRAP: - case NVPTX::SUST_B_1D_V2B8_TRAP: - case 
NVPTX::SUST_B_1D_V2B16_TRAP: - case NVPTX::SUST_B_1D_V2B32_TRAP: - case NVPTX::SUST_B_1D_V4B8_TRAP: - case NVPTX::SUST_B_1D_V4B16_TRAP: - case NVPTX::SUST_B_1D_V4B32_TRAP: - case NVPTX::SUST_B_1D_ARRAY_B8_TRAP: - case NVPTX::SUST_B_1D_ARRAY_B16_TRAP: - case NVPTX::SUST_B_1D_ARRAY_B32_TRAP: - case NVPTX::SUST_B_1D_ARRAY_V2B8_TRAP: - case NVPTX::SUST_B_1D_ARRAY_V2B16_TRAP: - case NVPTX::SUST_B_1D_ARRAY_V2B32_TRAP: - case NVPTX::SUST_B_1D_ARRAY_V4B8_TRAP: - case NVPTX::SUST_B_1D_ARRAY_V4B16_TRAP: - case NVPTX::SUST_B_1D_ARRAY_V4B32_TRAP: - case NVPTX::SUST_B_2D_B8_TRAP: - case NVPTX::SUST_B_2D_B16_TRAP: - case NVPTX::SUST_B_2D_B32_TRAP: - case NVPTX::SUST_B_2D_V2B8_TRAP: - case NVPTX::SUST_B_2D_V2B16_TRAP: - case NVPTX::SUST_B_2D_V2B32_TRAP: - case NVPTX::SUST_B_2D_V4B8_TRAP: - case NVPTX::SUST_B_2D_V4B16_TRAP: - case NVPTX::SUST_B_2D_V4B32_TRAP: - case NVPTX::SUST_B_2D_ARRAY_B8_TRAP: - case NVPTX::SUST_B_2D_ARRAY_B16_TRAP: - case NVPTX::SUST_B_2D_ARRAY_B32_TRAP: - case NVPTX::SUST_B_2D_ARRAY_V2B8_TRAP: - case NVPTX::SUST_B_2D_ARRAY_V2B16_TRAP: - case NVPTX::SUST_B_2D_ARRAY_V2B32_TRAP: - case NVPTX::SUST_B_2D_ARRAY_V4B8_TRAP: - case NVPTX::SUST_B_2D_ARRAY_V4B16_TRAP: - case NVPTX::SUST_B_2D_ARRAY_V4B32_TRAP: - case NVPTX::SUST_B_3D_B8_TRAP: - case NVPTX::SUST_B_3D_B16_TRAP: - case NVPTX::SUST_B_3D_B32_TRAP: - case NVPTX::SUST_B_3D_V2B8_TRAP: - case NVPTX::SUST_B_3D_V2B16_TRAP: - case NVPTX::SUST_B_3D_V2B32_TRAP: - case NVPTX::SUST_B_3D_V4B8_TRAP: - case NVPTX::SUST_B_3D_V4B16_TRAP: - case NVPTX::SUST_B_3D_V4B32_TRAP: - case NVPTX::SUST_P_1D_B8_TRAP: - case NVPTX::SUST_P_1D_B16_TRAP: - case NVPTX::SUST_P_1D_B32_TRAP: - case NVPTX::SUST_P_1D_V2B8_TRAP: - case NVPTX::SUST_P_1D_V2B16_TRAP: - case NVPTX::SUST_P_1D_V2B32_TRAP: - case NVPTX::SUST_P_1D_V4B8_TRAP: - case NVPTX::SUST_P_1D_V4B16_TRAP: - case NVPTX::SUST_P_1D_V4B32_TRAP: - case NVPTX::SUST_P_1D_ARRAY_B8_TRAP: - case NVPTX::SUST_P_1D_ARRAY_B16_TRAP: - case NVPTX::SUST_P_1D_ARRAY_B32_TRAP: - case NVPTX::SUST_P_1D_ARRAY_V2B8_TRAP: - case NVPTX::SUST_P_1D_ARRAY_V2B16_TRAP: - case NVPTX::SUST_P_1D_ARRAY_V2B32_TRAP: - case NVPTX::SUST_P_1D_ARRAY_V4B8_TRAP: - case NVPTX::SUST_P_1D_ARRAY_V4B16_TRAP: - case NVPTX::SUST_P_1D_ARRAY_V4B32_TRAP: - case NVPTX::SUST_P_2D_B8_TRAP: - case NVPTX::SUST_P_2D_B16_TRAP: - case NVPTX::SUST_P_2D_B32_TRAP: - case NVPTX::SUST_P_2D_V2B8_TRAP: - case NVPTX::SUST_P_2D_V2B16_TRAP: - case NVPTX::SUST_P_2D_V2B32_TRAP: - case NVPTX::SUST_P_2D_V4B8_TRAP: - case NVPTX::SUST_P_2D_V4B16_TRAP: - case NVPTX::SUST_P_2D_V4B32_TRAP: - case NVPTX::SUST_P_2D_ARRAY_B8_TRAP: - case NVPTX::SUST_P_2D_ARRAY_B16_TRAP: - case NVPTX::SUST_P_2D_ARRAY_B32_TRAP: - case NVPTX::SUST_P_2D_ARRAY_V2B8_TRAP: - case NVPTX::SUST_P_2D_ARRAY_V2B16_TRAP: - case NVPTX::SUST_P_2D_ARRAY_V2B32_TRAP: - case NVPTX::SUST_P_2D_ARRAY_V4B8_TRAP: - case NVPTX::SUST_P_2D_ARRAY_V4B16_TRAP: - case NVPTX::SUST_P_2D_ARRAY_V4B32_TRAP: - case NVPTX::SUST_P_3D_B8_TRAP: - case NVPTX::SUST_P_3D_B16_TRAP: - case NVPTX::SUST_P_3D_B32_TRAP: - case NVPTX::SUST_P_3D_V2B8_TRAP: - case NVPTX::SUST_P_3D_V2B16_TRAP: - case NVPTX::SUST_P_3D_V2B32_TRAP: - case NVPTX::SUST_P_3D_V4B8_TRAP: - case NVPTX::SUST_P_3D_V4B16_TRAP: - case NVPTX::SUST_P_3D_V4B32_TRAP: { + } else if (MCID.TSFlags & NVPTXII::IsSustFlag) { // This is a surface store, so operand 0 is a surfref MachineOperand &SurfHandle = MI.getOperand(0); replaceImageHandle(SurfHandle, MF); return true; - } - case NVPTX::TXQ_CHANNEL_ORDER: - case NVPTX::TXQ_CHANNEL_DATA_TYPE: - case NVPTX::TXQ_WIDTH: - case NVPTX::TXQ_HEIGHT: - 
case NVPTX::TXQ_DEPTH: - case NVPTX::TXQ_ARRAY_SIZE: - case NVPTX::TXQ_NUM_SAMPLES: - case NVPTX::TXQ_NUM_MIPMAP_LEVELS: - case NVPTX::SUQ_CHANNEL_ORDER: - case NVPTX::SUQ_CHANNEL_DATA_TYPE: - case NVPTX::SUQ_WIDTH: - case NVPTX::SUQ_HEIGHT: - case NVPTX::SUQ_DEPTH: - case NVPTX::SUQ_ARRAY_SIZE: { + } else if (MCID.TSFlags & NVPTXII::IsSurfTexQueryFlag) { // This is a query, so operand 1 is a surfref/texref MachineOperand &Handle = MI.getOperand(1); @@ -308,22 +116,38 @@ bool NVPTXReplaceImageHandles::processInstr(MachineInstr &MI) { return true; } - } + + return false; } void NVPTXReplaceImageHandles:: replaceImageHandle(MachineOperand &Op, MachineFunction &MF) { + unsigned Idx; + if (findIndexForHandle(Op, MF, Idx)) { + Op.ChangeToImmediate(Idx); + } +} + +bool NVPTXReplaceImageHandles:: +findIndexForHandle(MachineOperand &Op, MachineFunction &MF, unsigned &Idx) { const MachineRegisterInfo &MRI = MF.getRegInfo(); NVPTXMachineFunctionInfo *MFI = MF.getInfo<NVPTXMachineFunctionInfo>(); + + assert(Op.isReg() && "Handle is not in a reg?"); + // Which instruction defines the handle? - MachineInstr *MI = MRI.getVRegDef(Op.getReg()); - assert(MI && "No def for image handle vreg?"); - MachineInstr &TexHandleDef = *MI; + MachineInstr &TexHandleDef = *MRI.getVRegDef(Op.getReg()); switch (TexHandleDef.getOpcode()) { case NVPTX::LD_i64_avar: { // The handle is a parameter value being loaded, replace with the // parameter symbol + const NVPTXSubtarget &ST = MF.getTarget().getSubtarget<NVPTXSubtarget>(); + if (ST.getDrvInterface() == NVPTX::CUDA) { + // For CUDA, we preserve the param loads coming from function arguments + return false; + } + assert(TexHandleDef.getOperand(6).isSymbol() && "Load is not a symbol!"); StringRef Sym = TexHandleDef.getOperand(6).getSymbolName(); std::string ParamBaseName = MF.getName(); @@ -333,19 +157,27 @@ replaceImageHandle(MachineOperand &Op, MachineFunction &MF) { std::string NewSym; raw_string_ostream NewSymStr(NewSym); NewSymStr << MF.getFunction()->getName() << "_param_" << Param; - Op.ChangeToImmediate( - MFI->getImageHandleSymbolIndex(NewSymStr.str().c_str())); + InstrsToRemove.insert(&TexHandleDef); - break; + Idx = MFI->getImageHandleSymbolIndex(NewSymStr.str().c_str()); + return true; } case NVPTX::texsurf_handles: { // The handle is a global variable, replace with the global variable name assert(TexHandleDef.getOperand(1).isGlobal() && "Load is not a global!"); const GlobalValue *GV = TexHandleDef.getOperand(1).getGlobal(); assert(GV->hasName() && "Global sampler must be named!"); - Op.ChangeToImmediate(MFI->getImageHandleSymbolIndex(GV->getName().data())); InstrsToRemove.insert(&TexHandleDef); - break; + Idx = MFI->getImageHandleSymbolIndex(GV->getName().data()); + return true; + } + case NVPTX::nvvm_move_i64: + case TargetOpcode::COPY: { + bool Res = findIndexForHandle(TexHandleDef.getOperand(1), MF, Idx); + if (Res) { + InstrsToRemove.insert(&TexHandleDef); + } + return Res; } default: llvm_unreachable("Unknown instruction operating on handle"); diff --git a/lib/Target/NVPTX/NVPTXSubtarget.cpp b/lib/Target/NVPTX/NVPTXSubtarget.cpp index 8c7df52be344..d5cded218362 100644 --- a/lib/Target/NVPTX/NVPTXSubtarget.cpp +++ b/lib/Target/NVPTX/NVPTXSubtarget.cpp @@ -25,10 +25,41 @@ using namespace llvm; // Pin the vtable to this file.
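// The layout string built by computeDataLayout below decodes, under the
// standard LLVM data-layout grammar, as: "e" = little-endian; "-p:32:32" =
// 32-bit pointers with 32-bit alignment (appended only for the 32-bit target,
// the 64-bit default applying otherwise); "-i64:64" = 64-bit alignment for
// i64; "-v16:16-v32:32" = alignments for 16- and 32-bit vectors; and
// "-n16:32:64" = the native integer widths. The two possible results:
//   32-bit: "e-p:32:32-i64:64-v16:16-v32:32-n16:32:64"
//   64-bit: "e-i64:64-v16:16-v32:32-n16:32:64"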
void NVPTXSubtarget::anchor() {} +static std::string computeDataLayout(bool is64Bit) { + std::string Ret = "e"; + + if (!is64Bit) + Ret += "-p:32:32"; + + Ret += "-i64:64-v16:16-v32:32-n16:32:64"; + + return Ret; +} + +NVPTXSubtarget &NVPTXSubtarget::initializeSubtargetDependencies(StringRef CPU, + StringRef FS) { + // Provide the default CPU if we don't have one. + if (CPU.empty() && FS.size()) + llvm_unreachable("we are not using FeatureStr"); + TargetName = CPU.empty() ? "sm_20" : CPU; + + ParseSubtargetFeatures(TargetName, FS); + + // Set default to PTX 3.2 (CUDA 5.5) + if (PTXVersion == 0) { + PTXVersion = 32; + } + + return *this; +} + NVPTXSubtarget::NVPTXSubtarget(const std::string &TT, const std::string &CPU, - const std::string &FS, bool is64Bit) + const std::string &FS, const TargetMachine &TM, + bool is64Bit) : NVPTXGenSubtargetInfo(TT, CPU, FS), Is64Bit(is64Bit), PTXVersion(0), - SmVersion(20) { + SmVersion(20), DL(computeDataLayout(is64Bit)), + InstrInfo(initializeSubtargetDependencies(CPU, FS)), + TLInfo((NVPTXTargetMachine &)TM), TSInfo(&DL), FrameLowering(*this) { Triple T(TT); @@ -36,26 +67,4 @@ NVPTXSubtarget::NVPTXSubtarget(const std::string &TT, const std::string &CPU, drvInterface = NVPTX::NVCL; else drvInterface = NVPTX::CUDA; - - // Provide the default CPU if none - std::string defCPU = "sm_20"; - - ParseSubtargetFeatures((CPU.empty() ? defCPU : CPU), FS); - - // Get the TargetName from the FS if available - if (FS.empty() && CPU.empty()) - TargetName = defCPU; - else if (!CPU.empty()) - TargetName = CPU; - else - llvm_unreachable("we are not using FeatureStr"); - - // We default to PTX 3.1, but we cannot just default to it in the initializer - // since the attribute parser checks if the given option is >= the default. - // So if we set ptx31 as the default, the ptx30 attribute would never match. - // Instead, we use 0 as the default and manually set 31 if the default is - // used. - if (PTXVersion == 0) { - PTXVersion = 31; - } } diff --git a/lib/Target/NVPTX/NVPTXSubtarget.h b/lib/Target/NVPTX/NVPTXSubtarget.h index 581e5edbcfb9..4c41e4e470dd 100644 --- a/lib/Target/NVPTX/NVPTXSubtarget.h +++ b/lib/Target/NVPTX/NVPTXSubtarget.h @@ -15,6 +15,12 @@ #define NVPTXSUBTARGET_H #include "NVPTX.h" +#include "NVPTXFrameLowering.h" +#include "NVPTXISelLowering.h" +#include "NVPTXInstrInfo.h" +#include "NVPTXRegisterInfo.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/Target/TargetSelectionDAGInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" #include <string> @@ -35,12 +41,30 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo { // SM version x.y is represented as 10*x+y, e.g. 3.1 == 31 unsigned int SmVersion; + const DataLayout DL; // Calculates type size & alignment + NVPTXInstrInfo InstrInfo; + NVPTXTargetLowering TLInfo; + TargetSelectionDAGInfo TSInfo; + + // NVPTX does not have any call stack frame, but need a NVPTX specific + // FrameLowering class because TargetFrameLowering is abstract. + NVPTXFrameLowering FrameLowering; + public: /// This constructor initializes the data members to match that /// of the specified module.
/// NVPTXSubtarget(const std::string &TT, const std::string &CPU, - const std::string &FS, bool is64Bit); + const std::string &FS, const TargetMachine &TM, bool is64Bit); + + const TargetFrameLowering *getFrameLowering() const { return &FrameLowering; } + const NVPTXInstrInfo *getInstrInfo() const { return &InstrInfo; } + const DataLayout *getDataLayout() const { return &DL; } + const NVPTXRegisterInfo *getRegisterInfo() const { + return &InstrInfo.getRegisterInfo(); + } + const NVPTXTargetLowering *getTargetLowering() const { return &TLInfo; } + const TargetSelectionDAGInfo *getSelectionDAGInfo() const { return &TSInfo; } bool hasBrkPt() const { return SmVersion >= 11; } bool hasAtomRedG32() const { return SmVersion >= 11; } @@ -57,15 +81,22 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo { bool hasFMAF32() const { return SmVersion >= 20; } bool hasFMAF64() const { return SmVersion >= 13; } bool hasLDG() const { return SmVersion >= 32; } - bool hasLDU() const { return SmVersion >= 20; } + bool hasLDU() const { return ((SmVersion >= 20) && (SmVersion < 30)); } bool hasGenericLdSt() const { return SmVersion >= 20; } - inline bool hasHWROT32() const { return false; } - inline bool hasSWROT32() const { return true; } + inline bool hasHWROT32() const { return SmVersion >= 32; } + inline bool hasSWROT32() const { + return ((SmVersion >= 20) && (SmVersion < 32)); + } inline bool hasROT32() const { return hasHWROT32() || hasSWROT32(); } inline bool hasROT64() const { return SmVersion >= 20; } bool hasImageHandles() const { - // Currently disabled + // Enable handles for Kepler+, where CUDA supports indirect surfaces and + // textures + if (getDrvInterface() == NVPTX::CUDA) + return (SmVersion >= 30); + + // Disabled, otherwise return false; } bool is64Bit() const { return Is64Bit; } @@ -76,6 +107,7 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo { unsigned getPTXVersion() const { return PTXVersion; } + NVPTXSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS); void ParseSubtargetFeatures(StringRef CPU, StringRef FS); }; diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/lib/Target/NVPTX/NVPTXTargetMachine.cpp index 30583b0ff8c1..069a1b9966f0 100644 --- a/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -66,26 +66,13 @@ extern "C" void LLVMInitializeNVPTXTarget() { *PassRegistry::getPassRegistry()); } -static std::string computeDataLayout(const NVPTXSubtarget &ST) { - std::string Ret = "e"; - - if (!ST.is64Bit()) - Ret += "-p:32:32"; - - Ret += "-i64:64-v16:16-v32:32-n16:32:64"; - - return Ret; -} - -NVPTXTargetMachine::NVPTXTargetMachine( - const Target &T, StringRef TT, StringRef CPU, StringRef FS, - const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL, bool is64bit) +NVPTXTargetMachine::NVPTXTargetMachine(const Target &T, StringRef TT, + StringRef CPU, StringRef FS, + const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL, bool is64bit) : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), - Subtarget(TT, CPU, FS, is64bit), DL(computeDataLayout(Subtarget)), - InstrInfo(*this), TLInfo(*this), TSInfo(&DL), - FrameLowering( - *this, is64bit) /*FrameInfo(TargetFrameInfo::StackGrowsUp, 8, 0)*/ { + Subtarget(TT, CPU, FS, *this, is64bit) { initAsmInfo(); } @@ -119,6 +106,7 @@ class NVPTXPassConfig : public TargetPassConfig { bool addInstSelector() override; bool addPreRegAlloc() override; bool addPostRegAlloc() override; + void 
addMachineSSAOptimization() override; FunctionPass *createTargetRegisterAllocator(bool) override; void addFastRegAlloc(FunctionPass *RegAllocPass) override; @@ -220,3 +208,43 @@ void NVPTXPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) { printAndVerify("After StackSlotColoring"); } + +void NVPTXPassConfig::addMachineSSAOptimization() { + // Pre-ra tail duplication. + if (addPass(&EarlyTailDuplicateID)) + printAndVerify("After Pre-RegAlloc TailDuplicate"); + + // Optimize PHIs before DCE: removing dead PHI cycles may make more + // instructions dead. + addPass(&OptimizePHIsID); + + // This pass merges large allocas. StackSlotColoring is a different pass + // which merges spill slots. + addPass(&StackColoringID); + + // If the target requests it, assign local variables to stack slots relative + // to one another and simplify frame index references where possible. + addPass(&LocalStackSlotAllocationID); + + // With optimization, dead code should already be eliminated. However + // there is one known exception: lowered code for arguments that are only + // used by tail calls, where the tail calls reuse the incoming stack + // arguments directly (see t11 in test/CodeGen/X86/sibcall.ll). + addPass(&DeadMachineInstructionElimID); + printAndVerify("After codegen DCE pass"); + + // Allow targets to insert passes that improve instruction level parallelism, + // like if-conversion. Such passes will typically need dominator trees and + // loop info, just like LICM and CSE below. + if (addILPOpts()) + printAndVerify("After ILP optimizations"); + + addPass(&MachineLICMID); + addPass(&MachineCSEID); + + addPass(&MachineSinkingID); + printAndVerify("After Machine LICM, CSE and Sinking passes"); + + addPass(&PeepholeOptimizerID); + printAndVerify("After codegen peephole optimization pass"); +} diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.h b/lib/Target/NVPTX/NVPTXTargetMachine.h index 2db7c1861761..a7a1c8f4e171 100644 --- a/lib/Target/NVPTX/NVPTXTargetMachine.h +++ b/lib/Target/NVPTX/NVPTXTargetMachine.h @@ -14,13 +14,8 @@ #ifndef NVPTX_TARGETMACHINE_H #define NVPTX_TARGETMACHINE_H -#include "ManagedStringPool.h" -#include "NVPTXFrameLowering.h" -#include "NVPTXISelLowering.h" -#include "NVPTXInstrInfo.h" -#include "NVPTXRegisterInfo.h" #include "NVPTXSubtarget.h" -#include "llvm/IR/DataLayout.h" +#include "ManagedStringPool.h" #include "llvm/Target/TargetFrameLowering.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetSelectionDAGInfo.h" @@ -31,50 +26,37 @@ namespace llvm { /// class NVPTXTargetMachine : public LLVMTargetMachine { NVPTXSubtarget Subtarget; - const DataLayout DL; // Calculates type size & alignment - NVPTXInstrInfo InstrInfo; - NVPTXTargetLowering TLInfo; - TargetSelectionDAGInfo TSInfo; - - // NVPTX does not have any call stack frame, but need a NVPTX specific - // FrameLowering class because TargetFrameLowering is abstract. 
- NVPTXFrameLowering FrameLowering; // Hold Strings that can be free'd all together with NVPTXTargetMachine ManagedStringPool ManagedStrPool; - //bool addCommonCodeGenPasses(PassManagerBase &, CodeGenOpt::Level, - // bool DisableVerify, MCContext *&OutCtx); - public: NVPTXTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OP, bool is64bit); const TargetFrameLowering *getFrameLowering() const override { - return &FrameLowering; + return getSubtargetImpl()->getFrameLowering(); + } + const NVPTXInstrInfo *getInstrInfo() const override { + return getSubtargetImpl()->getInstrInfo(); + } + const DataLayout *getDataLayout() const override { + return getSubtargetImpl()->getDataLayout(); } - const NVPTXInstrInfo *getInstrInfo() const override { return &InstrInfo; } - const DataLayout *getDataLayout() const override { return &DL; } const NVPTXSubtarget *getSubtargetImpl() const override { return &Subtarget; } - const NVPTXRegisterInfo *getRegisterInfo() const override { - return &(InstrInfo.getRegisterInfo()); + return getSubtargetImpl()->getRegisterInfo(); } - NVPTXTargetLowering *getTargetLowering() const override { - return const_cast<NVPTXTargetLowering *>(&TLInfo); + const NVPTXTargetLowering *getTargetLowering() const override { + return getSubtargetImpl()->getTargetLowering(); } const TargetSelectionDAGInfo *getSelectionDAGInfo() const override { - return &TSInfo; + return getSubtargetImpl()->getSelectionDAGInfo(); } - //virtual bool addInstSelector(PassManagerBase &PM, - // CodeGenOpt::Level OptLevel); - - //virtual bool addPreRegAlloc(PassManagerBase &, CodeGenOpt::Level); - ManagedStringPool *getManagedStrPool() const { return const_cast<ManagedStringPool *>(&ManagedStrPool); } diff --git a/lib/Target/NVPTX/NVPTXTargetObjectFile.h b/lib/Target/NVPTX/NVPTXTargetObjectFile.h index 0b438c521a20..ba8086d78880 100644 --- a/lib/Target/NVPTX/NVPTXTargetObjectFile.h +++ b/lib/Target/NVPTX/NVPTXTargetObjectFile.h @@ -87,7 +87,8 @@ class NVPTXTargetObjectFile : public TargetLoweringObjectFile { new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata()); } - const MCSection *getSectionForConstant(SectionKind Kind) const override { + const MCSection *getSectionForConstant(SectionKind Kind, + const Constant *C) const override { return ReadOnlySection; } diff --git a/lib/Target/NVPTX/NVVMReflect.cpp b/lib/Target/NVPTX/NVVMReflect.cpp index cb8bd7260256..a8d6b95ae4ec 100644 --- a/lib/Target/NVPTX/NVVMReflect.cpp +++ b/lib/Target/NVPTX/NVVMReflect.cpp @@ -22,6 +22,7 @@ #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" #include "llvm/Pass.h" @@ -47,17 +48,16 @@ class NVVMReflect : public ModulePass { private: StringMap<int> VarMap; typedef DenseMap<std::string, int>::iterator VarMapIter; - Function *ReflectFunction; public: static char ID; - NVVMReflect() : ModulePass(ID), ReflectFunction(nullptr) { + NVVMReflect() : ModulePass(ID) { initializeNVVMReflectPass(*PassRegistry::getPassRegistry()); VarMap.clear(); } NVVMReflect(const StringMap<int> &Mapping) - : ModulePass(ID), ReflectFunction(nullptr) { + : ModulePass(ID) { initializeNVVMReflectPass(*PassRegistry::getPassRegistry()); for (StringMap<int>::const_iterator I = Mapping.begin(), E = Mapping.end(); I != E; ++I) { @@ -70,6 +70,8 @@ class NVVMReflect : public ModulePass { } bool runOnModule(Module &) override; +private: + bool handleFunction(Function *ReflectFunction); void
setVarMap(); }; } @@ -120,19 +122,7 @@ void NVVMReflect::setVarMap() { } } -bool NVVMReflect::runOnModule(Module &M) { - if (!NVVMReflectEnabled) - return false; - - setVarMap(); - - ReflectFunction = M.getFunction(NVVM_REFLECT_FUNCTION); - - // If reflect function is not used, then there will be - // no entry in the module. - if (!ReflectFunction) - return false; - +bool NVVMReflect::handleFunction(Function *ReflectFunction) { // Validate _reflect function assert(ReflectFunction->isDeclaration() && "_reflect function should not have a body"); @@ -155,13 +145,15 @@ bool NVVMReflect::runOnModule(Module &M) { "Only one operand expect for _reflect function"); // In cuda, we will have an extra constant-to-generic conversion of // the string. - const Value *conv = Reflect->getArgOperand(0); - assert(isa<CallInst>(conv) && "Expected a const-to-gen conversion"); - const CallInst *ConvCall = cast<CallInst>(conv); - const Value *str = ConvCall->getArgOperand(0); - assert(isa<ConstantExpr>(str) && + const Value *Str = Reflect->getArgOperand(0); + if (isa<CallInst>(Str)) { + // CUDA path + const CallInst *ConvCall = cast<CallInst>(Str); + Str = ConvCall->getArgOperand(0); + } + assert(isa<ConstantExpr>(Str) && "Format of _reflect function not recognized"); - const ConstantExpr *GEP = cast<ConstantExpr>(str); + const ConstantExpr *GEP = cast<ConstantExpr>(Str); const Value *Sym = GEP->getOperand(0); assert(isa<Constant>(Sym) && "Format of _reflect function not recognized"); @@ -195,3 +187,36 @@ bool NVVMReflect::runOnModule(Module &M) { ToRemove[i]->eraseFromParent(); return true; } + +bool NVVMReflect::runOnModule(Module &M) { + if (!NVVMReflectEnabled) + return false; + + setVarMap(); + + + bool Res = false; + std::string Name; + Type *Tys[1]; + Type *I8Ty = Type::getInt8Ty(M.getContext()); + Function *ReflectFunction; + + // Check for standard overloaded versions of llvm.nvvm.reflect + + for (unsigned i = 0; i != 5; ++i) { + Tys[0] = PointerType::get(I8Ty, i); + Name = Intrinsic::getName(Intrinsic::nvvm_reflect, Tys); + ReflectFunction = M.getFunction(Name); + if(ReflectFunction != 0) { + Res |= handleFunction(ReflectFunction); + } + } + + ReflectFunction = M.getFunction(NVVM_REFLECT_FUNCTION); + // If reflect function is not used, then there will be + // no entry in the module. + if (ReflectFunction != 0) + Res |= handleFunction(ReflectFunction); + + return Res; +} diff --git a/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp b/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp index 2f562ca7891a..d7066d58709a 100644 --- a/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp +++ b/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp @@ -15,6 +15,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/Twine.h" +#include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrInfo.h" @@ -244,6 +245,8 @@ class PPCAsmParser : public MCTargetAsmParser { bool ParseDirectiveTC(unsigned Size, SMLoc L); bool ParseDirectiveMachine(SMLoc L); bool ParseDarwinDirectiveMachine(SMLoc L); + bool ParseDirectiveAbiVersion(SMLoc L); + bool ParseDirectiveLocalEntry(SMLoc L); bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, OperandVector &Operands, MCStreamer &Out, @@ -1412,6 +1415,10 @@ bool PPCAsmParser::ParseDirective(AsmToken DirectiveID) { return ParseDirectiveTC(isPPC64()?
8 : 4, DirectiveID.getLoc()); if (IDVal == ".machine") return ParseDirectiveMachine(DirectiveID.getLoc()); + if (IDVal == ".abiversion") + return ParseDirectiveAbiVersion(DirectiveID.getLoc()); + if (IDVal == ".localentry") + return ParseDirectiveLocalEntry(DirectiveID.getLoc()); } else { if (IDVal == ".machine") return ParseDarwinDirectiveMachine(DirectiveID.getLoc()); @@ -1534,6 +1541,64 @@ bool PPCAsmParser::ParseDarwinDirectiveMachine(SMLoc L) { return false; } +/// ParseDirectiveAbiVersion +/// ::= .abiversion constant-expression +bool PPCAsmParser::ParseDirectiveAbiVersion(SMLoc L) { + int64_t AbiVersion; + if (getParser().parseAbsoluteExpression(AbiVersion)) { + Error(L, "expected constant expression"); + return false; + } + if (getLexer().isNot(AsmToken::EndOfStatement)) { + Error(L, "unexpected token in directive"); + return false; + } + + PPCTargetStreamer &TStreamer = + *static_cast<PPCTargetStreamer *>( + getParser().getStreamer().getTargetStreamer()); + TStreamer.emitAbiVersion(AbiVersion); + + return false; +} + +/// ParseDirectiveLocalEntry +/// ::= .localentry symbol, expression +bool PPCAsmParser::ParseDirectiveLocalEntry(SMLoc L) { + StringRef Name; + if (getParser().parseIdentifier(Name)) { + Error(L, "expected identifier in directive"); + return false; + } + MCSymbol *Sym = getContext().GetOrCreateSymbol(Name); + + if (getLexer().isNot(AsmToken::Comma)) { + Error(L, "unexpected token in directive"); + return false; + } + Lex(); + + const MCExpr *Expr; + if (getParser().parseExpression(Expr)) { + Error(L, "expected expression"); + return false; + } + + if (getLexer().isNot(AsmToken::EndOfStatement)) { + Error(L, "unexpected token in directive"); + return false; + } + + PPCTargetStreamer &TStreamer = + *static_cast<PPCTargetStreamer *>( + getParser().getStreamer().getTargetStreamer()); + TStreamer.emitLocalEntry(Sym, Expr); + + return false; +} + + + /// Force static initialization. extern "C" void LLVMInitializePowerPCAsmParser() { RegisterMCAsmParser<PPCAsmParser> A(ThePPC32Target); diff --git a/lib/Target/PowerPC/Disassembler/LLVMBuild.txt b/lib/Target/PowerPC/Disassembler/LLVMBuild.txt index c1011ff6a0e0..b0978c227ae9 100644 --- a/lib/Target/PowerPC/Disassembler/LLVMBuild.txt +++ b/lib/Target/PowerPC/Disassembler/LLVMBuild.txt @@ -19,5 +19,5 @@ type = Library name = PowerPCDisassembler parent = PowerPC -required_libraries = MC PowerPCDesc PowerPCInfo Support +required_libraries = MC PowerPCInfo Support add_to_library_groups = PowerPC diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp index 12584bea5a32..c54d5e75bdfd 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp +++ b/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp @@ -9,7 +9,9 @@ #include "MCTargetDesc/PPCMCTargetDesc.h" #include "MCTargetDesc/PPCFixupKinds.h" +#include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCELF.h" #include "llvm/MC/MCELFObjectWriter.h" #include "llvm/MC/MCFixupKindInfo.h" #include "llvm/MC/MCMachObjectWriter.h" @@ -128,6 +130,30 @@ class PPCAsmBackend : public MCAsmBackend { } + void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout, + const MCFixup &Fixup, const MCFragment *DF, + const MCValue &Target, uint64_t &Value, + bool &IsResolved) override { + switch ((PPC::Fixups)Fixup.getKind()) { + default: break; + case PPC::fixup_ppc_br24: + case PPC::fixup_ppc_br24abs: + // If the target symbol has a local entry point we must not attempt + // to resolve the fixup directly.
Emit a relocation and leave + // resolution of the final target address to the linker. + if (const MCSymbolRefExpr *A = Target.getSymA()) { + const MCSymbolData &Data = Asm.getSymbolData(A->getSymbol()); + // The "other" values are stored in the last 6 bits of the second byte. + // The traditional defines for STO values assume the full byte and thus + // the shift to pack it. + unsigned Other = MCELF::getOther(Data) << 2; + if ((Other & ELF::STO_PPC64_LOCAL_MASK) != 0) + IsResolved = false; + } + break; + } + } + bool mayNeedRelaxation(const MCInst &Inst) const override { // FIXME. return false; diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp index cd3b4f453591..e93e95fc0751 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp +++ b/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp @@ -11,6 +11,7 @@ #include "MCTargetDesc/PPCFixupKinds.h" #include "MCTargetDesc/PPCMCExpr.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/MC/MCELF.h" #include "llvm/MC/MCELFObjectWriter.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCValue.h" @@ -30,6 +31,9 @@ namespace { bool IsPCRel) const; unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup, bool IsPCRel) const override; + + bool needsRelocateWithSymbol(const MCSymbolData &SD, + unsigned Type) const override; }; } @@ -83,7 +87,15 @@ unsigned PPCELFObjectWriter::getRelocTypeInner(const MCValue &Target, llvm_unreachable("Unimplemented"); case PPC::fixup_ppc_br24: case PPC::fixup_ppc_br24abs: - Type = ELF::R_PPC_REL24; + switch (Modifier) { + default: llvm_unreachable("Unsupported Modifier"); + case MCSymbolRefExpr::VK_None: + Type = ELF::R_PPC_REL24; + break; + case MCSymbolRefExpr::VK_PLT: + Type = ELF::R_PPC_PLTREL24; + break; + } break; case PPC::fixup_ppc_brcond14: case PPC::fixup_ppc_brcond14abs: @@ -379,6 +391,23 @@ unsigned PPCELFObjectWriter::GetRelocType(const MCValue &Target, return getRelocTypeInner(Target, Fixup, IsPCRel); } +bool PPCELFObjectWriter::needsRelocateWithSymbol(const MCSymbolData &SD, + unsigned Type) const { + switch (Type) { + default: + return false; + + case ELF::R_PPC_REL24: + // If the target symbol has a local entry point, we must keep the + // target symbol to preserve that information for the linker. + // The "other" values are stored in the last 6 bits of the second byte. + // The traditional defines for STO values assume the full byte and thus + // the shift to pack it. + unsigned Other = MCELF::getOther(SD) << 2; + return (Other & ELF::STO_PPC64_LOCAL_MASK) != 0; + } +} + MCObjectWriter *llvm::createPPCELFObjectWriter(raw_ostream &OS, bool Is64Bit, bool IsLittleEndian, diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp index 10d068dc49e6..3ac0aca6b78c 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp +++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp @@ -11,6 +11,7 @@ #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCContext.h" +#include "llvm/MC/MCObjectStreamer.h" using namespace llvm; @@ -127,33 +128,6 @@ PPCMCExpr::EvaluateAsRelocatableImpl(MCValue &Res, return true; } -// FIXME: This basically copies MCObjectStreamer::AddValueSymbols. Perhaps -// that method should be made public? 
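// Editor's note: standalone sketch, not part of the patch. Both hunks
// above reconstruct the full ELF st_other byte from MC's packed form:
// MC keeps only the top six bits of st_other, shifted right by two, so
// shifting left by two restores the byte and lets the spec-level
// STO_PPC64_LOCAL_MASK (0xE0, bits 5-7) apply directly.
#include <cassert>
int main() {
  unsigned StOther = 0x60;         // hypothetical local-entry encoding
  unsigned Packed  = StOther >> 2; // what MCELF::setOther() records
  unsigned Other   = Packed << 2;  // reconstruction used in the patch
  assert((Other & 0xE0) == (StOther & 0xE0));
  // A nonzero field means the symbol has a local entry point, so the
  // br24 fixup must stay unresolved and become a relocation.
}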
-static void AddValueSymbols_(const MCExpr *Value, MCAssembler *Asm) { - switch (Value->getKind()) { - case MCExpr::Target: - llvm_unreachable("Can't handle nested target expr!"); - - case MCExpr::Constant: - break; - - case MCExpr::Binary: { - const MCBinaryExpr *BE = cast<MCBinaryExpr>(Value); - AddValueSymbols_(BE->getLHS(), Asm); - AddValueSymbols_(BE->getRHS(), Asm); - break; - } - - case MCExpr::SymbolRef: - Asm->getOrCreateSymbolData(cast<MCSymbolRefExpr>(Value)->getSymbol()); - break; - - case MCExpr::Unary: - AddValueSymbols_(cast<MCUnaryExpr>(Value)->getSubExpr(), Asm); - break; - } -} - -void PPCMCExpr::AddValueSymbols(MCAssembler *Asm) const { - AddValueSymbols_(getSubExpr(), Asm); +void PPCMCExpr::visitUsedExpr(MCStreamer &Streamer) const { + Streamer.visitUsedExpr(*getSubExpr()); } diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h b/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h index 3421b9157711..bca408507e72 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h +++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h @@ -79,7 +79,7 @@ class PPCMCExpr : public MCTargetExpr { void PrintImpl(raw_ostream &OS) const override; bool EvaluateAsRelocatableImpl(MCValue &Res, const MCAsmLayout *Layout) const override; - void AddValueSymbols(MCAssembler *) const override; + void visitUsedExpr(MCStreamer &Streamer) const override; const MCSection *FindAssociatedSection() const override { return getSubExpr()->FindAssociatedSection(); } diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp index 7057797cf8cb..4c6780ff75a7 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp +++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp @@ -16,12 +16,16 @@ #include "PPCMCAsmInfo.h" #include "PPCTargetStreamer.h" #include "llvm/MC/MCCodeGenInfo.h" +#include "llvm/MC/MCELF.h" +#include "llvm/MC/MCELFStreamer.h" +#include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbol.h" #include "llvm/MC/MachineLocation.h" +#include "llvm/Support/ELF.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FormattedStream.h" #include "llvm/Support/TargetRegistry.h" @@ -125,12 +129,21 @@ class PPCTargetAsmStreamer : public PPCTargetStreamer { void emitMachine(StringRef CPU) override { OS << "\t.machine " << CPU << '\n'; } + virtual void emitAbiVersion(int AbiVersion) override { + OS << "\t.abiversion " << AbiVersion << '\n'; + } + virtual void emitLocalEntry(MCSymbol *S, const MCExpr *LocalOffset) { + OS << "\t.localentry\t" << *S << ", " << *LocalOffset << '\n'; + } }; class PPCTargetELFStreamer : public PPCTargetStreamer { public: PPCTargetELFStreamer(MCStreamer &S) : PPCTargetStreamer(S) {} - void emitTCEntry(const MCSymbol &S) override { + MCELFStreamer &getStreamer() { + return static_cast<MCELFStreamer &>(Streamer); + } + virtual void emitTCEntry(const MCSymbol &S) override { // Creates a R_PPC64_TOC relocation Streamer.EmitSymbolValue(&S, 8); } @@ -138,6 +151,39 @@ class PPCTargetELFStreamer : public PPCTargetStreamer { // FIXME: Is there anything to do in here or does this directive only // limit the parser?
} + virtual void emitAbiVersion(int AbiVersion) override { + MCAssembler &MCA = getStreamer().getAssembler(); + unsigned Flags = MCA.getELFHeaderEFlags(); + Flags &= ~ELF::EF_PPC64_ABI; + Flags |= (AbiVersion & ELF::EF_PPC64_ABI); + MCA.setELFHeaderEFlags(Flags); + } + virtual void emitLocalEntry(MCSymbol *S, const MCExpr *LocalOffset) { + MCAssembler &MCA = getStreamer().getAssembler(); + MCSymbolData &Data = getStreamer().getOrCreateSymbolData(S); + + int64_t Res; + if (!LocalOffset->EvaluateAsAbsolute(Res, MCA)) + report_fatal_error(".localentry expression must be absolute."); + + unsigned Encoded = ELF::encodePPC64LocalEntryOffset(Res); + if (Res != ELF::decodePPC64LocalEntryOffset(Encoded)) + report_fatal_error(".localentry expression cannot be encoded."); + + // The "other" values are stored in the last 6 bits of the second byte. + // The traditional defines for STO values assume the full byte and thus + // the shift to pack it. + unsigned Other = MCELF::getOther(Data) << 2; + Other &= ~ELF::STO_PPC64_LOCAL_MASK; + Other |= Encoded; + MCELF::setOther(Data, Other >> 2); + + // For GAS compatibility, unless we already saw a .abiversion directive, + // set e_flags to indicate ELFv2 ABI. + unsigned Flags = MCA.getELFHeaderEFlags(); + if ((Flags & ELF::EF_PPC64_ABI) == 0) + MCA.setELFHeaderEFlags(Flags | 2); + } }; class PPCTargetMachOStreamer : public PPCTargetStreamer { @@ -150,6 +196,12 @@ class PPCTargetMachOStreamer : public PPCTargetStreamer { // FIXME: We should update the CPUType, CPUSubType in the Object file if // the new values are different from the defaults. } + virtual void emitAbiVersion(int AbiVersion) override { + llvm_unreachable("Unknown pseudo-op: .abiversion"); + } + virtual void emitLocalEntry(MCSymbol *S, const MCExpr *LocalOffset) { + llvm_unreachable("Unknown pseudo-op: .localentry"); + } }; } diff --git a/lib/Target/PowerPC/PPC.h b/lib/Target/PowerPC/PPC.h index c42c5be14bef..ba5fa4f79b4e 100644 --- a/lib/Target/PowerPC/PPC.h +++ b/lib/Target/PowerPC/PPC.h @@ -60,10 +60,11 @@ namespace llvm { // PPC Specific MachineOperand flags. MO_NO_FLAG, - /// MO_DARWIN_STUB - On a symbol operand "FOO", this indicates that the - /// reference is actually to the "FOO$stub" symbol. This is used for calls - /// and jumps to external functions on Tiger and earlier. - MO_DARWIN_STUB = 1, + /// MO_PLT_OR_STUB - On a symbol operand "FOO", this indicates that the + /// reference is actually to the "FOO$stub" or "FOO@plt" symbol. This is + /// used for calls and jumps to external functions on Tiger and earlier, and + /// for PIC calls on Linux and ELF systems. + MO_PLT_OR_STUB = 1, /// MO_PIC_FLAG - If this bit is set, the symbol reference is relative to /// the function's picbase, e.g. lo16(symbol-picbase). 
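// Editor's note: standalone sketch, not part of the patch. The ELF
// streamer's emitAbiVersion above rewrites only the low two bits of
// e_flags; EF_PPC64_ABI is the mask value 3 from llvm/Support/ELF.h.
#include <cassert>
int main() {
  const unsigned EF_PPC64_ABI = 3;
  unsigned Flags = 0x1;  // hypothetical prior e_flags (ELFv1)
  int AbiVersion = 2;    // what ".abiversion 2" requests
  Flags &= ~EF_PPC64_ABI;
  Flags |= (AbiVersion & EF_PPC64_ABI);
  assert((Flags & EF_PPC64_ABI) == 2u); // header now advertises ELFv2
}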
diff --git a/lib/Target/PowerPC/PPC.td b/lib/Target/PowerPC/PPC.td index bd58539c6fc2..a9842b287cbb 100644 --- a/lib/Target/PowerPC/PPC.td +++ b/lib/Target/PowerPC/PPC.td @@ -46,6 +46,7 @@ def DirectivePwr5x: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR5X", "" def DirectivePwr6: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR6", "">; def DirectivePwr6x: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR6X", "">; def DirectivePwr7: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR7", "">; +def DirectivePwr8: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR8", "">; def Feature64Bit : SubtargetFeature<"64bit","Has64BitSupport", "true", "Enable 64-bit instructions">; @@ -285,6 +286,15 @@ def : ProcessorModel<"pwr7", P7Model, FeaturePOPCNTD, FeatureLDBRX, Feature64Bit /*, Feature64BitRegs */, DeprecatedMFTB, DeprecatedDST]>; +def : ProcessorModel<"pwr8", P7Model /* FIXME: Update to P8Model when available */, + [DirectivePwr8, FeatureAltivec, + FeatureMFOCRF, FeatureFCPSGN, FeatureFSqrt, FeatureFRE, + FeatureFRES, FeatureFRSQRTE, FeatureFRSQRTES, + FeatureRecipPrec, FeatureSTFIWX, FeatureLFIWAX, + FeatureFPRND, FeatureFPCVT, FeatureISEL, + FeaturePOPCNTD, FeatureLDBRX, + Feature64Bit /*, Feature64BitRegs */, + DeprecatedMFTB, DeprecatedDST]>; def : Processor<"ppc", G3Itineraries, [Directive32]>; def : ProcessorModel<"ppc64", G5Model, [Directive64, FeatureAltivec, diff --git a/lib/Target/PowerPC/PPCAsmPrinter.cpp b/lib/Target/PowerPC/PPCAsmPrinter.cpp index fd044d951fcc..6f67c598c754 100644 --- a/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -18,6 +18,7 @@ #include "PPC.h" #include "InstPrinter/PPCInstPrinter.h" +#include "PPCMachineFunctionInfo.h" #include "MCTargetDesc/PPCMCExpr.h" #include "MCTargetDesc/PPCPredicates.h" #include "PPCSubtarget.h" @@ -27,10 +28,12 @@ #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringExtras.h" #include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineModuleInfoImpls.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DebugInfo.h" @@ -100,9 +103,11 @@ namespace { } bool doFinalization(Module &M) override; + void EmitStartOfAsmFile(Module &M) override; void EmitFunctionEntryLabel() override; + void EmitFunctionBodyStart() override; void EmitFunctionBodyEnd() override; }; @@ -330,6 +335,66 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { OutStreamer.EmitLabel(PICBase); return; } + case PPC::GetGBRO: { + // Get the offset from the GOT Base Register to the GOT + LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, Subtarget.isDarwin()); + MCSymbol *PICOffset = MF->getInfo<PPCFunctionInfo>()->getPICOffsetSymbol(); + TmpInst.setOpcode(PPC::LWZ); + const MCExpr *Exp = + MCSymbolRefExpr::Create(PICOffset, MCSymbolRefExpr::VK_None, OutContext); + const MCExpr *PB = + MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), + MCSymbolRefExpr::VK_None, + OutContext); + const MCOperand MO = TmpInst.getOperand(1); + TmpInst.getOperand(1) = MCOperand::CreateExpr(MCBinaryExpr::CreateSub(Exp, + PB, + OutContext)); + TmpInst.addOperand(MO); + EmitToStreamer(OutStreamer, TmpInst); + return; + } + case PPC::UpdateGBR: { + // Update the GOT Base Register to point to the GOT.
It may be possible to + // merge this with the PPC::GetGBRO, doing it all in one step. + LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, Subtarget.isDarwin()); + TmpInst.setOpcode(PPC::ADD4); + TmpInst.addOperand(TmpInst.getOperand(0)); + EmitToStreamer(OutStreamer, TmpInst); + return; + } + case PPC::LWZtoc: { + // Transform %X3 = LWZtoc <ga:@min1>, %X2 + LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, Subtarget.isDarwin()); + + // Change the opcode to LWZ, and the global address operand to be a + // reference to the GOT entry we will synthesize later. + TmpInst.setOpcode(PPC::LWZ); + const MachineOperand &MO = MI->getOperand(1); + + // Map symbol -> label of TOC entry + assert(MO.isGlobal() || MO.isCPI() || MO.isJTI()); + MCSymbol *MOSymbol = nullptr; + if (MO.isGlobal()) + MOSymbol = getSymbol(MO.getGlobal()); + else if (MO.isCPI()) + MOSymbol = GetCPISymbol(MO.getIndex()); + else if (MO.isJTI()) + MOSymbol = GetJTISymbol(MO.getIndex()); + + MCSymbol *TOCEntry = lookUpOrCreateTOCEntry(MOSymbol); + + const MCExpr *Exp = + MCSymbolRefExpr::Create(TOCEntry, MCSymbolRefExpr::VK_None, + OutContext); + const MCExpr *PB = + MCSymbolRefExpr::Create(OutContext.GetOrCreateSymbol(Twine(".L.TOC.")), + OutContext); + Exp = MCBinaryExpr::CreateSub(Exp, PB, OutContext); + TmpInst.getOperand(1) = MCOperand::CreateExpr(Exp); + EmitToStreamer(OutStreamer, TmpInst); + return; + } case PPC::LDtocJTI: case PPC::LDtocCPT: case PPC::LDtoc: { @@ -717,10 +782,73 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { EmitToStreamer(OutStreamer, TmpInst); } +void PPCLinuxAsmPrinter::EmitStartOfAsmFile(Module &M) { + if (Subtarget.isELFv2ABI()) { + PPCTargetStreamer *TS = + static_cast<PPCTargetStreamer *>(OutStreamer.getTargetStreamer()); + + if (TS) + TS->emitAbiVersion(2); + } + + if (Subtarget.isPPC64() || TM.getRelocationModel() != Reloc::PIC_) + return AsmPrinter::EmitStartOfAsmFile(M); + + // FIXME: The use of .got2 assumes large GOT model (-fPIC), which is not + // optimal for some cases. We should consider supporting small model (-fpic) + // as well in the future. + assert(TM.getCodeModel() != CodeModel::Small && + "Small code model PIC is currently unsupported."); + OutStreamer.SwitchSection(OutContext.getELFSection(".got2", + ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC, + SectionKind::getReadOnly())); + + MCSymbol *TOCSym = OutContext.GetOrCreateSymbol(Twine(".L.TOC.")); + MCSymbol *CurrentPos = OutContext.CreateTempSymbol(); + + OutStreamer.EmitLabel(CurrentPos); + + // The GOT pointer points to the middle of the GOT, in order to reference the + // entire 64kB range. 0x8000 is the midpoint. + const MCExpr *tocExpr = + MCBinaryExpr::CreateAdd(MCSymbolRefExpr::Create(CurrentPos, OutContext), + MCConstantExpr::Create(0x8000, OutContext), + OutContext); + + OutStreamer.EmitAssignment(TOCSym, tocExpr); + + OutStreamer.SwitchSection(getObjFileLowering().getTextSection()); +} + void PPCLinuxAsmPrinter::EmitFunctionEntryLabel() { - if (!Subtarget.isPPC64()) // linux/ppc32 - Normal entry label. + // linux/ppc32 - Normal entry label.
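// Editor's note: standalone sketch, not part of the patch. The 0x8000
// bias in EmitStartOfAsmFile above centers ".L.TOC." in the 64kB .got2
// section, so a signed 16-bit displacement can reach all of it.
#include <cassert>
#include <cstdint>
int main() {
  const uint32_t GotStart = 0x10010000;        // hypothetical .got2 start
  const uint32_t TocPtr   = GotStart + 0x8000; // value assigned to .L.TOC.
  assert(TocPtr + INT16_MIN == GotStart);          // lowest reachable byte
  assert(TocPtr + INT16_MAX == GotStart + 0xffff); // highest reachable byte
}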
+ if (!Subtarget.isPPC64() && TM.getRelocationModel() != Reloc::PIC_) return AsmPrinter::EmitFunctionEntryLabel(); - + + if (!Subtarget.isPPC64()) { + const PPCFunctionInfo *PPCFI = MF->getInfo<PPCFunctionInfo>(); + if (PPCFI->usesPICBase()) { + MCSymbol *RelocSymbol = PPCFI->getPICOffsetSymbol(); + MCSymbol *PICBase = MF->getPICBaseSymbol(); + OutStreamer.EmitLabel(RelocSymbol); + + const MCExpr *OffsExpr = + MCBinaryExpr::CreateSub( + MCSymbolRefExpr::Create(OutContext.GetOrCreateSymbol(Twine(".L.TOC.")), + OutContext), + MCSymbolRefExpr::Create(PICBase, OutContext), + OutContext); + OutStreamer.EmitValue(OffsExpr, 4); + OutStreamer.EmitLabel(CurrentFnSym); + return; + } else + return AsmPrinter::EmitFunctionEntryLabel(); + } + + // ELFv2 ABI - Normal entry label. + if (Subtarget.isELFv2ABI()) + return AsmPrinter::EmitFunctionEntryLabel(); + // Emit an official procedure descriptor. MCSectionSubPair Current = OutStreamer.getCurrentSection(); const MCSectionELF *Section = OutStreamer.getContext().getELFSection(".opd", @@ -759,8 +887,15 @@ bool PPCLinuxAsmPrinter::doFinalization(Module &M) { PPCTargetStreamer &TS = static_cast<PPCTargetStreamer &>(*OutStreamer.getTargetStreamer()); - if (isPPC64 && !TOC.empty()) { - const MCSectionELF *Section = OutStreamer.getContext().getELFSection(".toc", + if (!TOC.empty()) { + const MCSectionELF *Section; + + if (isPPC64) + Section = OutStreamer.getContext().getELFSection(".toc", + ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC, + SectionKind::getReadOnly()); + else + Section = OutStreamer.getContext().getELFSection(".got2", ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC, SectionKind::getReadOnly()); OutStreamer.SwitchSection(Section); @@ -769,7 +904,10 @@ bool PPCLinuxAsmPrinter::doFinalization(Module &M) { E = TOC.end(); I != E; ++I) { OutStreamer.EmitLabel(I->second); MCSymbol *S = OutContext.GetOrCreateSymbol(I->first->getName()); - TS.emitTCEntry(*S); + if (isPPC64) + TS.emitTCEntry(*S); + else + OutStreamer.EmitSymbolValue(S, 4); } } @@ -795,6 +933,68 @@ bool PPCLinuxAsmPrinter::doFinalization(Module &M) { return AsmPrinter::doFinalization(M); } +/// EmitFunctionBodyStart - Emit a global entry point prefix for ELFv2. +void PPCLinuxAsmPrinter::EmitFunctionBodyStart() { + // In the ELFv2 ABI, in functions that use the TOC register, we need to + // provide two entry points. The ABI guarantees that when calling the + // local entry point, r2 is set up by the caller to contain the TOC base + // for this function, and when calling the global entry point, r12 is set + // up by the caller to hold the address of the global entry point. We + // thus emit a prefix sequence along the following lines: + // + // func: + // # global entry point + // addis r2,r12,(.TOC.-func)@ha + // addi r2,r2,(.TOC.-func)@l + // .localentry func, .-func + // # local entry point, followed by function body + // + // This ensures we have r2 set up correctly while executing the function + // body, no matter which entry point is called. + if (Subtarget.isELFv2ABI() + // Only do all that if the function uses r2 in the first place.
+ && !MF->getRegInfo().use_empty(PPC::X2)) { + + MCSymbol *GlobalEntryLabel = OutContext.CreateTempSymbol(); + OutStreamer.EmitLabel(GlobalEntryLabel); + const MCSymbolRefExpr *GlobalEntryLabelExp = + MCSymbolRefExpr::Create(GlobalEntryLabel, OutContext); + + MCSymbol *TOCSymbol = OutContext.GetOrCreateSymbol(StringRef(".TOC.")); + const MCExpr *TOCDeltaExpr = + MCBinaryExpr::CreateSub(MCSymbolRefExpr::Create(TOCSymbol, OutContext), + GlobalEntryLabelExp, OutContext); + + const MCExpr *TOCDeltaHi = + PPCMCExpr::CreateHa(TOCDeltaExpr, false, OutContext); + EmitToStreamer(OutStreamer, MCInstBuilder(PPC::ADDIS) + .addReg(PPC::X2) + .addReg(PPC::X12) + .addExpr(TOCDeltaHi)); + + const MCExpr *TOCDeltaLo = + PPCMCExpr::CreateLo(TOCDeltaExpr, false, OutContext); + EmitToStreamer(OutStreamer, MCInstBuilder(PPC::ADDI) + .addReg(PPC::X2) + .addReg(PPC::X2) + .addExpr(TOCDeltaLo)); + + MCSymbol *LocalEntryLabel = OutContext.CreateTempSymbol(); + OutStreamer.EmitLabel(LocalEntryLabel); + const MCSymbolRefExpr *LocalEntryLabelExp = + MCSymbolRefExpr::Create(LocalEntryLabel, OutContext); + const MCExpr *LocalOffsetExp = + MCBinaryExpr::CreateSub(LocalEntryLabelExp, + GlobalEntryLabelExp, OutContext); + + PPCTargetStreamer *TS = + static_cast<PPCTargetStreamer *>(OutStreamer.getTargetStreamer()); + + if (TS) + TS->emitLocalEntry(CurrentFnSym, LocalOffsetExp); + } +} + /// EmitFunctionBodyEnd - Print the traceback table before the .size /// directive. /// diff --git a/lib/Target/PowerPC/PPCCallingConv.td b/lib/Target/PowerPC/PPCCallingConv.td index d48164d62160..222760a0cb91 100644 --- a/lib/Target/PowerPC/PPCCallingConv.td +++ b/lib/Target/PowerPC/PPCCallingConv.td @@ -31,13 +31,18 @@ def RetCC_PPC : CallingConv<[ CCIfType<[i32], CCAssignToReg<[R3, R4, R5, R6, R7, R8, R9, R10]>>, CCIfType<[i64], CCAssignToReg<[X3, X4, X5, X6]>>, CCIfType<[i128], CCAssignToReg<[X3, X4, X5, X6]>>, + + // Floating point types returned as "direct" go into F1 .. F8; note that + // only the ELFv2 ABI fully utilizes all these registers. + CCIfType<[f32], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>, + CCIfType<[f64], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>, - CCIfType<[f32], CCAssignToReg<[F1, F2]>>, - CCIfType<[f64], CCAssignToReg<[F1, F2, F3, F4]>>, - - // Vector types are always returned in V2. - CCIfType<[v16i8, v8i16, v4i32, v4f32], CCAssignToReg<[V2]>>, - CCIfType<[v2f64, v2i64], CCAssignToReg<[VSH2]>> + // Vector types returned as "direct" go into V2 .. V9; note that only the + // ELFv2 ABI fully utilizes all these registers.
+ CCIfType<[v16i8, v8i16, v4i32, v4f32], + CCAssignToReg<[V2, V3, V4, V5, V6, V7, V8, V9]>>, + CCIfType<[v2f64, v2i64], + CCAssignToReg<[VSH2, VSH3, VSH4, VSH5, VSH6, VSH7, VSH8, VSH9]>> ]>; @@ -69,10 +74,12 @@ def RetCC_PPC64_ELF_FIS : CallingConv<[ CCIfType<[i32], CCPromoteToType<i64>>, CCIfType<[i64], CCAssignToReg<[X3, X4]>>, CCIfType<[i128], CCAssignToReg<[X3, X4, X5, X6]>>, - CCIfType<[f32], CCAssignToReg<[F1, F2]>>, - CCIfType<[f64], CCAssignToReg<[F1, F2, F3, F4]>>, - CCIfType<[v16i8, v8i16, v4i32, v4f32], CCAssignToReg<[V2]>>, - CCIfType<[v2f64, v2i64], CCAssignToReg<[VSH2]>> + CCIfType<[f32], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>, + CCIfType<[f64], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>, + CCIfType<[v16i8, v8i16, v4i32, v4f32], + CCAssignToReg<[V2, V3, V4, V5, V6, V7, V8, V9]>>, + CCIfType<[v2f64, v2i64], + CCAssignToReg<[VSH2, VSH3, VSH4, VSH5, VSH6, VSH7, VSH8, VSH9]>> ]>; //===----------------------------------------------------------------------===// diff --git a/lib/Target/PowerPC/PPCFastISel.cpp b/lib/Target/PowerPC/PPCFastISel.cpp index 924a07c6cff3..2e524d604789 100644 --- a/lib/Target/PowerPC/PPCFastISel.cpp +++ b/lib/Target/PowerPC/PPCFastISel.cpp @@ -1030,6 +1034,10 @@ bool PPCFastISel::SelectFPToI(const Instruction *I, bool IsSigned) { if (DstVT != MVT::i32 && DstVT != MVT::i64) return false; + // If we don't have FCTIDUZ and we need it, punt to SelectionDAG. + if (DstVT == MVT::i64 && !IsSigned && !PPCSubTarget->hasFPCVT()) + return false; + Value *Src = I->getOperand(0); Type *SrcTy = Src->getType(); if (!isTypeLegal(SrcTy, SrcVT)) @@ -1197,6 +1201,13 @@ bool PPCFastISel::processCallArgs(SmallVectorImpl<Value*> &Args, bool IsVarArg) { SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CC, IsVarArg, *FuncInfo.MF, TM, ArgLocs, *Context); + + // Reserve space for the linkage area on the stack. + bool isELFv2ABI = PPCSubTarget->isELFv2ABI(); + unsigned LinkageSize = PPCFrameLowering::getLinkageSize(true, false, + isELFv2ABI); + CCInfo.AllocateStack(LinkageSize, 8); + CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags, CC_PPC64_ELF_FIS); // Bail out if we can't handle any of the arguments. @@ -1218,6 +1229,14 @@ bool PPCFastISel::processCallArgs(SmallVectorImpl<Value*> &Args, // Get a count of how many bytes are to be pushed onto the stack. NumBytes = CCInfo.getNextStackOffset(); + // The prolog code of the callee may store up to 8 GPR argument registers to + // the stack, allowing va_start to index over them in memory if it is varargs. + // Because we cannot tell if this is needed on the caller side, we have to + // conservatively assume that it is needed. As such, make sure we have at + // least enough stack space for the caller to store the 8 GPRs. + // FIXME: On ELFv2, it may be unnecessary to allocate the parameter area. + NumBytes = std::max(NumBytes, LinkageSize + 64); + // Issue CALLSEQ_START. BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TII.getCallFrameSetupOpcode())) @@ -1482,6 +1501,10 @@ bool PPCFastISel::SelectCall(const Instruction *I) { for (unsigned II = 0, IE = RegArgs.size(); II != IE; ++II) MIB.addReg(RegArgs[II], RegState::Implicit); + // Direct calls in the ELFv2 ABI need the TOC register live into the call. + if (PPCSubTarget->isELFv2ABI()) + MIB.addReg(PPC::X2, RegState::Implicit); + // Add a register mask with the call-preserved registers. Proper // defs for return values will be added by setPhysRegsDeadExcept().
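// Editor's note: illustrative only, not part of the patch. The widened
// RetCC tables above are what allow ELFv2 to return small homogeneous
// aggregates "direct" in registers; under the stated assumption of an
// ELFv2 target, a struct like the one below comes back in F1..F4 rather
// than through an sret pointer. An aggregate with more than eight
// members falls back to indirect return.
struct Quad { float X, Y, Z, W; };
Quad makeQuad() { return {1.0f, 2.0f, 3.0f, 4.0f}; } // F1..F4 on ELFv2
int main() { Quad Q = makeQuad(); return Q.X > 0.0f ? 0 : 1; }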
MIB.addRegMask(TRI.getCallPreservedMask(CC)); diff --git a/lib/Target/PowerPC/PPCFrameLowering.cpp b/lib/Target/PowerPC/PPCFrameLowering.cpp index 9c5e588f2fc2..b2577a9c7cf7 100644 --- a/lib/Target/PowerPC/PPCFrameLowering.cpp +++ b/lib/Target/PowerPC/PPCFrameLowering.cpp @@ -41,30 +41,6 @@ PPCFrameLowering::PPCFrameLowering(const PPCSubtarget &STI) (STI.hasQPX() || STI.isBGQ()) ? 32 : 16, 0), Subtarget(STI) {} -unsigned PPCFrameLowering::getMinCallArgumentsSize(bool isPPC64, - bool isDarwinABI) { - // For the Darwin ABI / 64-bit SVR4 ABI: - // The prolog code of the callee may store up to 8 GPR argument registers to - // the stack, allowing va_start to index over them in memory if its varargs. - // Because we cannot tell if this is needed on the caller side, we have to - // conservatively assume that it is needed. As such, make sure we have at - // least enough stack space for the caller to store the 8 GPRs. - if (isDarwinABI || isPPC64) - return 8 * (isPPC64 ? 8 : 4); - - // 32-bit SVR4 ABI: - // There is no default stack allocated for the 8 first GPR arguments. - return 0; -} - -/// getMinCallFrameSize - Return the minimum size a call frame can be using -/// the PowerPC ABI. -unsigned PPCFrameLowering::getMinCallFrameSize(bool isPPC64, bool isDarwinABI) { - // The call frame needs to be at least big enough for linkage and 8 args. - return PPCFrameLowering::getLinkageSize(isPPC64, isDarwinABI) + - PPCFrameLowering::getMinCallArgumentsSize(isPPC64, isDarwinABI); -} - // With the SVR4 ABI, callee-saved registers have fixed offsets on the stack. const PPCFrameLowering::SpillSlot *PPCFrameLowering::getCalleeSavedSpillSlots( unsigned &NumEntries) const { @@ -422,9 +398,10 @@ unsigned PPCFrameLowering::determineFrameLayout(MachineFunction &MF, // Get the maximum call frame size of all the calls. unsigned maxCallFrameSize = MFI->getMaxCallFrameSize(); - // Maximum call frame needs to be at least big enough for linkage and 8 args. - unsigned minCallFrameSize = getMinCallFrameSize(Subtarget.isPPC64(), - Subtarget.isDarwinABI()); + // Maximum call frame needs to be at least big enough for linkage area. + unsigned minCallFrameSize = getLinkageSize(Subtarget.isPPC64(), + Subtarget.isDarwinABI(), + Subtarget.isELFv2ABI()); maxCallFrameSize = std::max(maxCallFrameSize, minCallFrameSize); // If we have dynamic alloca then maxCallFrameSize needs to be aligned so @@ -485,7 +462,7 @@ void PPCFrameLowering::replaceFPWithRealFP(MachineFunction &MF) const { const PPCRegisterInfo *RegInfo = static_cast<const PPCRegisterInfo *>(MF.getTarget().getRegisterInfo()); bool HasBP = RegInfo->hasBasePointer(MF); - unsigned BPReg = HasBP ? (unsigned) PPC::R30 : FPReg; + unsigned BPReg = HasBP ? (unsigned) RegInfo->getBaseRegister(MF) : FPReg; unsigned BP8Reg = HasBP ? (unsigned) PPC::X30 : FPReg; for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); @@ -530,12 +507,14 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const { DebugLoc dl; bool needsFrameMoves = MMI.hasDebugInfo() || MF.getFunction()->needsUnwindTableEntry(); + bool isPIC = MF.getTarget().getRelocationModel() == Reloc::PIC_; // Get processor type. bool isPPC64 = Subtarget.isPPC64(); // Get the ABI.
bool isDarwinABI = Subtarget.isDarwinABI(); bool isSVR4ABI = Subtarget.isSVR4ABI(); + bool isELFv2ABI = Subtarget.isELFv2ABI(); assert((isDarwinABI || isSVR4ABI) && "Currently only Darwin and SVR4 ABIs are supported for PowerPC."); @@ -570,7 +549,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const { bool HasBP = RegInfo->hasBasePointer(MF); unsigned SPReg = isPPC64 ? PPC::X1 : PPC::R1; - unsigned BPReg = isPPC64 ? PPC::X30 : PPC::R30; + unsigned BPReg = RegInfo->getBaseRegister(MF); unsigned FPReg = isPPC64 ? PPC::X31 : PPC::R31; unsigned LRReg = isPPC64 ? PPC::LR8 : PPC::LR; unsigned ScratchReg = isPPC64 ? PPC::X0 : PPC::R0; @@ -626,7 +605,9 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const { BPOffset = FFI->getObjectOffset(BPIndex); } else { BPOffset = - PPCFrameLowering::getBasePointerSaveOffset(isPPC64, isDarwinABI); + PPCFrameLowering::getBasePointerSaveOffset(isPPC64, + isDarwinABI, + isPIC); } } @@ -647,6 +628,9 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const { "Prologue CR saving supported only in 64-bit mode"); if (!MustSaveCRs.empty()) { // will only occur for PPC64 + // FIXME: In the ELFv2 ABI, we are not required to save all CR fields. + // If only one or two CR fields are clobbered, it could be more + // efficient to use mfocrf to selectively save just those fields. MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(PPC::MFCR8), TempReg); for (unsigned i = 0, e = MustSaveCRs.size(); i != e; ++i) @@ -815,8 +799,12 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const { // For 64-bit SVR4 when we have spilled CRs, the spill location // is SP+8, not a frame-relative slot. if (isSVR4ABI && isPPC64 && (PPC::CR2 <= Reg && Reg <= PPC::CR4)) { + // In the ELFv1 ABI, only CR2 is noted in CFI and stands in for + // the whole CR word. In the ELFv2 ABI, every CR that was + // actually saved gets its own CFI record. + unsigned CRReg = isELFv2ABI? Reg : (unsigned) PPC::CR2; unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset( - nullptr, MRI->getDwarfRegNum(PPC::CR2, true), 8)); + nullptr, MRI->getDwarfRegNum(CRReg, true), 8)); BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex); continue; @@ -863,6 +851,7 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF, // Get the ABI. bool isDarwinABI = Subtarget.isDarwinABI(); bool isSVR4ABI = Subtarget.isSVR4ABI(); + bool isPIC = MF.getTarget().getRelocationModel() == Reloc::PIC_; // Check if the link register (LR) has been saved. PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>(); bool HasBP = RegInfo->hasBasePointer(MF); unsigned SPReg = isPPC64 ? PPC::X1 : PPC::R1; - unsigned BPReg = isPPC64 ? PPC::X30 : PPC::R30; + unsigned BPReg = RegInfo->getBaseRegister(MF); unsigned FPReg = isPPC64 ? PPC::X31 : PPC::R31; unsigned ScratchReg = isPPC64 ? PPC::X0 : PPC::R0; unsigned TempReg = isPPC64 ?
PPC::X12 : PPC::R12; // another scratch reg @@ -914,7 +903,9 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF, BPOffset = FFI->getObjectOffset(BPIndex); } else { BPOffset = - PPCFrameLowering::getBasePointerSaveOffset(isPPC64, isDarwinABI); + PPCFrameLowering::getBasePointerSaveOffset(isPPC64, + isDarwinABI, + isPIC); } } @@ -1091,6 +1082,7 @@ PPCFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, int FPSI = FI->getFramePointerSaveIndex(); bool isPPC64 = Subtarget.isPPC64(); bool isDarwinABI = Subtarget.isDarwinABI(); + bool isPIC = MF.getTarget().getRelocationModel() == Reloc::PIC_; MachineFrameInfo *MFI = MF.getFrameInfo(); // If the frame pointer save index hasn't been defined yet. @@ -1105,7 +1097,7 @@ PPCFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, int BPSI = FI->getBasePointerSaveIndex(); if (!BPSI && RegInfo->hasBasePointer(MF)) { - int BPOffset = getBasePointerSaveOffset(isPPC64, isDarwinABI); + int BPOffset = getBasePointerSaveOffset(isPPC64, isDarwinABI, isPIC); // Allocate the frame index for the base pointer save area. BPSI = MFI->CreateFixedObject(isPPC64? 8 : 4, BPOffset, true); // Save the result. diff --git a/lib/Target/PowerPC/PPCFrameLowering.h b/lib/Target/PowerPC/PPCFrameLowering.h index ca1ca56d08fa..c0c7d248f8d2 100644 --- a/lib/Target/PowerPC/PPCFrameLowering.h +++ b/lib/Target/PowerPC/PPCFrameLowering.h @@ -76,8 +76,8 @@ class PPCFrameLowering: public TargetFrameLowering { /// getTOCSaveOffset - Return the previous frame offset to save the /// TOC register -- 64-bit SVR4 ABI only. - static unsigned getTOCSaveOffset(void) { - return 40; + static unsigned getTOCSaveOffset(bool isELFv2ABI) { + return isELFv2ABI ? 24 : 40; } /// getFramePointerSaveOffset - Return the previous frame offset to save the @@ -97,30 +97,29 @@ class PPCFrameLowering: public TargetFrameLowering { /// getBasePointerSaveOffset - Return the previous frame offset to save the /// base pointer. - static unsigned getBasePointerSaveOffset(bool isPPC64, bool isDarwinABI) { + static unsigned getBasePointerSaveOffset(bool isPPC64, + bool isDarwinABI, + bool isPIC) { if (isDarwinABI) return isPPC64 ? -16U : -8U; // SVR4 ABI: First slot in the general register save area. - return isPPC64 ? -16U : -8U; + return isPPC64 ? -16U : isPIC ? -12U : -8U; } /// getLinkageSize - Return the size of the PowerPC ABI linkage area. /// - static unsigned getLinkageSize(bool isPPC64, bool isDarwinABI) { + static unsigned getLinkageSize(bool isPPC64, bool isDarwinABI, + bool isELFv2ABI) { if (isDarwinABI || isPPC64) - return 6 * (isPPC64 ? 8 : 4); + return (isELFv2ABI ? 4 : 6) * (isPPC64 ? 8 : 4); // SVR4 ABI: return 8; } - /// getMinCallArgumentsSize - Return the size of the minium PowerPC ABI - /// argument area. - static unsigned getMinCallArgumentsSize(bool isPPC64, bool isDarwinABI); const SpillSlot * getCalleeSavedSpillSlots(unsigned &NumEntries) const override; - static unsigned getMinCallFrameSize(bool isPPC64, bool isDarwinABI); }; } // End llvm namespace diff --git a/lib/Target/PowerPC/PPCHazardRecognizers.cpp b/lib/Target/PowerPC/PPCHazardRecognizers.cpp index d0315f94834f..d9b242cad265 100644 --- a/lib/Target/PowerPC/PPCHazardRecognizers.cpp +++ b/lib/Target/PowerPC/PPCHazardRecognizers.cpp @@ -162,7 +162,8 @@ unsigned PPCDispatchGroupSBHazardRecognizer::PreEmitNoops(SUnit *SU) { unsigned Directive = DAG->TM.getSubtarget<PPCSubtarget>().getDarwinDirective(); // If we're using a special group-terminating nop, then we need only one.
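// Editor's note: standalone recap, not part of the patch. The new
// PPCFrameLowering helpers above shrink the 64-bit linkage area from six
// doublewords (ELFv1, TOC save at offset 40) to four (ELFv2, TOC save at
// offset 24); this mirrors the getLinkageSize hunk exactly.
#include <cassert>
static unsigned linkageSize(bool isPPC64, bool isDarwinABI, bool isELFv2ABI) {
  if (isDarwinABI || isPPC64)
    return (isELFv2ABI ? 4 : 6) * (isPPC64 ? 8 : 4);
  return 8; // 32-bit SVR4
}
int main() {
  assert(linkageSize(true, false, false) == 48); // ELFv1
  assert(linkageSize(true, false, true) == 32);  // ELFv2
}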
- if (Directive == PPC::DIR_PWR6 || Directive == PPC::DIR_PWR7) + if (Directive == PPC::DIR_PWR6 || Directive == PPC::DIR_PWR7 || + Directive == PPC::DIR_PWR8) return 1; return 5 - CurSlots; @@ -223,7 +224,7 @@ void PPCDispatchGroupSBHazardRecognizer::EmitNoop() { // If the group has now filled all of its slots, or if we're using a special // group-terminating nop, the group is complete. if (Directive == PPC::DIR_PWR6 || Directive == PPC::DIR_PWR7 || - CurSlots == 6) { + Directive == PPC::DIR_PWR8 || CurSlots == 6) { CurGroup.clear(); CurSlots = CurBranches = 0; } else { diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index 4881b3fbb7ac..a9e146281b87 100644 --- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -14,6 +14,7 @@ #include "PPC.h" #include "MCTargetDesc/PPCPredicates.h" +#include "PPCMachineFunctionInfo.h" #include "PPCTargetMachine.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -275,9 +276,21 @@ SDNode *PPCDAGToDAGISel::getGlobalBaseReg() { DebugLoc dl; if (PPCLowering->getPointerTy() == MVT::i32) { - GlobalBaseReg = RegInfo->createVirtualRegister(&PPC::GPRC_NOR0RegClass); + if (PPCSubTarget->isTargetELF()) + GlobalBaseReg = PPC::R30; + else + GlobalBaseReg = + RegInfo->createVirtualRegister(&PPC::GPRC_NOR0RegClass); BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MovePCtoLR)); BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MFLR), GlobalBaseReg); + if (PPCSubTarget->isTargetELF()) { + unsigned TempReg = RegInfo->createVirtualRegister(&PPC::GPRCRegClass); + BuildMI(FirstMBB, MBBI, dl, + TII.get(PPC::GetGBRO), TempReg).addReg(GlobalBaseReg); + BuildMI(FirstMBB, MBBI, dl, + TII.get(PPC::UpdateGBR)).addReg(GlobalBaseReg).addReg(TempReg); + MF->getInfo<PPCFunctionInfo>()->setUsesPICBase(true); + } } else { GlobalBaseReg = RegInfo->createVirtualRegister(&PPC::G8RC_NOX0RegClass); BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MovePCtoLR8)); @@ -1445,7 +1458,13 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) { return CurDAG->SelectNodeTo(N, Reg, MVT::Other, Chain); } case PPCISD::TOC_ENTRY: { - assert (PPCSubTarget->isPPC64() && "Only supported for 64-bit ABI"); + if (PPCSubTarget->isSVR4ABI() && !PPCSubTarget->isPPC64()) { + SDValue GA = N->getOperand(0); + return CurDAG->getMachineNode(PPC::LWZtoc, dl, MVT::i32, GA, + N->getOperand(1)); + } + assert (PPCSubTarget->isPPC64() && + "Only supported for 64-bit ABI and 32-bit SVR4"); // For medium and large code model, we generate two instructions as // described below. Otherwise we allow SelectCodeCommon to handle this, diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp index f6884d5a2716..d699e0fdd829 100644 --- a/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/lib/Target/PowerPC/PPCISelLowering.cpp @@ -1331,7 +1331,13 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp, if ((LHSKnownZero.getZExtValue()|~(uint64_t)imm) == ~0ULL) { // If all of the bits are known zero on the LHS or RHS, the add won't // carry.
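// Editor's note: standalone sketch, not part of the patch. The predicate
// just above proves the add cannot carry: if at every bit position at
// least one addend is known to be zero, then a + b == a | b.
#include <cassert>
#include <cstdint>
static bool addCannotCarry(uint64_t KnownZeroLHS, uint64_t Imm) {
  return (KnownZeroLHS | ~Imm) == ~0ULL; // same test as in the hunk
}
int main() {
  assert(addCannotCarry(0xFFFFull, 0xFFFFull)); // imm bits all known clear in LHS
  assert(!addCannotCarry(0x0ull, 0x1ull));      // low bit could carry
}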
- Base = N.getOperand(0); + if (FrameIndexSDNode *FI = + dyn_cast<FrameIndexSDNode>(N.getOperand(0))) { + Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType()); + fixupFuncForFI(DAG, FI->getIndex(), N.getValueType()); + } else { + Base = N.getOperand(0); + } Disp = DAG.getTargetConstant(imm, N.getValueType()); return true; } @@ -1491,10 +1497,9 @@ static bool GetLabelAccessInfo(const TargetMachine &TM, unsigned &HiOpFlags, HiOpFlags = PPCII::MO_HA; LoOpFlags = PPCII::MO_LO; - // Don't use the pic base if not in PIC relocation model. Or if we are on a - // non-darwin platform. We don't support PIC on other platforms yet. - bool isPIC = TM.getRelocationModel() == Reloc::PIC_ && - TM.getSubtarget<PPCSubtarget>().isDarwin(); + // Don't use the pic base if not in PIC relocation model. + bool isPIC = TM.getRelocationModel() == Reloc::PIC_; + if (isPIC) { HiOpFlags |= PPCII::MO_PIC_FLAG; LoOpFlags |= PPCII::MO_PIC_FLAG; @@ -1550,6 +1555,15 @@ SDValue PPCTargetLowering::LowerConstantPool(SDValue Op, unsigned MOHiFlag, MOLoFlag; bool isPIC = GetLabelAccessInfo(DAG.getTarget(), MOHiFlag, MOLoFlag); + + if (isPIC && Subtarget.isSVR4ABI()) { + SDValue GA = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), + PPCII::MO_PIC_FLAG); + SDLoc DL(CP); + return DAG.getNode(PPCISD::TOC_ENTRY, DL, MVT::i32, GA, + DAG.getNode(PPCISD::GlobalBaseReg, DL, PtrVT)); + } + SDValue CPIHi = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0, MOHiFlag); SDValue CPILo = @@ -1571,6 +1585,15 @@ SDValue PPCTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { unsigned MOHiFlag, MOLoFlag; bool isPIC = GetLabelAccessInfo(DAG.getTarget(), MOHiFlag, MOLoFlag); + + if (isPIC && Subtarget.isSVR4ABI()) { + SDValue GA = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, + PPCII::MO_PIC_FLAG); + SDLoc DL(GA); + return DAG.getNode(PPCISD::TOC_ENTRY, SDLoc(JT), PtrVT, GA, + DAG.getNode(PPCISD::GlobalBaseReg, DL, PtrVT)); + } + SDValue JTIHi = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, MOHiFlag); SDValue JTILo = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, MOLoFlag); return LowerLabelRef(JTIHi, JTILo, isPIC, DAG); @@ -1700,6 +1723,14 @@ SDValue PPCTargetLowering::LowerGlobalAddress(SDValue Op, unsigned MOHiFlag, MOLoFlag; bool isPIC = GetLabelAccessInfo(DAG.getTarget(), MOHiFlag, MOLoFlag, GV); + if (isPIC && Subtarget.isSVR4ABI()) { + SDValue GA = DAG.getTargetGlobalAddress(GV, DL, PtrVT, + GSDN->getOffset(), + PPCII::MO_PIC_FLAG); + return DAG.getNode(PPCISD::TOC_ENTRY, DL, MVT::i32, GA, + DAG.getNode(PPCISD::GlobalBaseReg, DL, MVT::i32)); + } + SDValue GAHi = DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset(), MOHiFlag); SDValue GALo = @@ -1936,7 +1967,8 @@ SDValue PPCTargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl).setChain(Chain) .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()), - DAG.getExternalSymbol("__trampoline_setup", PtrVT), &Args, 0); + DAG.getExternalSymbol("__trampoline_setup", PtrVT), + std::move(Args), 0); std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); return CallResult.second; @@ -2126,11 +2158,116 @@ static unsigned CalculateStackSlotSize(EVT ArgVT, ISD::ArgFlagsTy Flags, unsigned ArgSize = ArgVT.getStoreSize(); if (Flags.isByVal()) ArgSize = Flags.getByValSize(); - ArgSize = ((ArgSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; + + // Round up to multiples of the pointer size, except for array members, + // which are always packed.
+ if (!Flags.isInConsecutiveRegs()) + ArgSize = ((ArgSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; return ArgSize; } +/// CalculateStackSlotAlignment - Calculates the alignment of this argument +/// on the stack. +static unsigned CalculateStackSlotAlignment(EVT ArgVT, EVT OrigVT, + ISD::ArgFlagsTy Flags, + unsigned PtrByteSize) { + unsigned Align = PtrByteSize; + + // Altivec parameters are padded to a 16 byte boundary. + if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 || + ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 || + ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64) + Align = 16; + + // ByVal parameters are aligned as requested. + if (Flags.isByVal()) { + unsigned BVAlign = Flags.getByValAlign(); + if (BVAlign > PtrByteSize) { + if (BVAlign % PtrByteSize != 0) + llvm_unreachable( + "ByVal alignment is not a multiple of the pointer size"); + + Align = BVAlign; + } + } + + // Array members are always packed to their original alignment. + if (Flags.isInConsecutiveRegs()) { + // If the array member was split into multiple registers, the first + // needs to be aligned to the size of the full type. (Except for + // ppcf128, which is only aligned as its f64 components.) + if (Flags.isSplit() && OrigVT != MVT::ppcf128) + Align = OrigVT.getStoreSize(); + else + Align = ArgVT.getStoreSize(); + } + + return Align; +} + +/// CalculateStackSlotUsed - Return whether this argument will use its +/// stack slot (instead of being passed in registers). ArgOffset, +/// AvailableFPRs, and AvailableVRs must hold the current argument +/// position, and will be updated to account for this argument. +static bool CalculateStackSlotUsed(EVT ArgVT, EVT OrigVT, + ISD::ArgFlagsTy Flags, + unsigned PtrByteSize, + unsigned LinkageSize, + unsigned ParamAreaSize, + unsigned &ArgOffset, + unsigned &AvailableFPRs, + unsigned &AvailableVRs) { + bool UseMemory = false; + + // Respect alignment of argument on the stack. + unsigned Align = + CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize); + ArgOffset = ((ArgOffset + Align - 1) / Align) * Align; + // If there's no space left in the argument save area, we must + // use memory (this check also catches zero-sized arguments). + if (ArgOffset >= LinkageSize + ParamAreaSize) + UseMemory = true; + + // Allocate argument on the stack. + ArgOffset += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize); + if (Flags.isInConsecutiveRegsLast()) + ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; + // If we overran the argument save area, we must use memory + // (this check catches arguments passed partially in memory) + if (ArgOffset > LinkageSize + ParamAreaSize) + UseMemory = true; + + // However, if the argument is actually passed in an FPR or a VR, + // we don't use memory after all. + if (!Flags.isByVal()) { + if (ArgVT == MVT::f32 || ArgVT == MVT::f64) + if (AvailableFPRs > 0) { + --AvailableFPRs; + return false; + } + if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 || + ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 || + ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64) + if (AvailableVRs > 0) { + --AvailableVRs; + return false; + } + } + + return UseMemory; +} + +/// EnsureStackAlignment - Round stack frame size up from NumBytes to +/// ensure minimum alignment required for target. 
+static unsigned EnsureStackAlignment(const TargetMachine &Target, + unsigned NumBytes) { + unsigned TargetAlign = Target.getFrameLowering()->getStackAlignment(); + unsigned AlignMask = TargetAlign - 1; + NumBytes = (NumBytes + AlignMask) & ~AlignMask; + return NumBytes; +} + SDValue PPCTargetLowering::LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, @@ -2206,7 +2343,8 @@ PPCTargetLowering::LowerFormalArguments_32SVR4( getTargetMachine(), ArgLocs, *DAG.getContext()); // Reserve space for the linkage area on the stack. - CCInfo.AllocateStack(PPCFrameLowering::getLinkageSize(false, false), PtrByteSize); + unsigned LinkageSize = PPCFrameLowering::getLinkageSize(false, false, false); + CCInfo.AllocateStack(LinkageSize, PtrByteSize); CCInfo.AnalyzeFormalArguments(Ins, CC_PPC32_SVR4); @@ -2285,23 +2423,14 @@ PPCTargetLowering::LowerFormalArguments_32SVR4( // Area that is at least reserved in the caller of this function. unsigned MinReservedArea = CCByValInfo.getNextStackOffset(); + MinReservedArea = std::max(MinReservedArea, LinkageSize); // Set the size that is at least reserved in caller of this function. Tail // call optimized function's reserved stack space needs to be aligned so that // taking the difference between two stack areas will result in an aligned // stack. - PPCFunctionInfo *FI = MF.getInfo(); - - MinReservedArea = - std::max(MinReservedArea, - PPCFrameLowering::getMinCallFrameSize(false, false)); - - unsigned TargetAlign = DAG.getMachineFunction().getTarget().getFrameLowering()-> - getStackAlignment(); - unsigned AlignMask = TargetAlign-1; - MinReservedArea = (MinReservedArea + AlignMask) & ~AlignMask; - - FI->setMinReservedArea(MinReservedArea); + MinReservedArea = EnsureStackAlignment(MF.getTarget(), MinReservedArea); + FuncInfo->setMinReservedArea(MinReservedArea); SmallVector MemOps; @@ -2397,32 +2526,6 @@ PPCTargetLowering::extendArgForPPC64(ISD::ArgFlagsTy Flags, EVT ObjectVT, return DAG.getNode(ISD::TRUNCATE, dl, ObjectVT, ArgVal); } -// Set the size that is at least reserved in caller of this function. Tail -// call optimized functions' reserved stack space needs to be aligned so that -// taking the difference between two stack areas will result in an aligned -// stack. -void -PPCTargetLowering::setMinReservedArea(MachineFunction &MF, SelectionDAG &DAG, - unsigned nAltivecParamsAtEnd, - unsigned MinReservedArea, - bool isPPC64) const { - PPCFunctionInfo *FI = MF.getInfo(); - // Add the Altivec parameters at the end, if needed. - if (nAltivecParamsAtEnd) { - MinReservedArea = ((MinReservedArea+15)/16)*16; - MinReservedArea += 16*nAltivecParamsAtEnd; - } - MinReservedArea = - std::max(MinReservedArea, - PPCFrameLowering::getMinCallFrameSize(isPPC64, true)); - unsigned TargetAlign - = DAG.getMachineFunction().getTarget().getFrameLowering()-> - getStackAlignment(); - unsigned AlignMask = TargetAlign-1; - MinReservedArea = (MinReservedArea + AlignMask) & ~AlignMask; - FI->setMinReservedArea(MinReservedArea); -} - SDValue PPCTargetLowering::LowerFormalArguments_64SVR4( SDValue Chain, @@ -2433,6 +2536,8 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( SmallVectorImpl &InVals) const { // TODO: add description of PPC stack frame format, or at least some docs. 
// + bool isELFv2ABI = Subtarget.isELFv2ABI(); + bool isLittleEndian = Subtarget.isLittleEndian(); MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); PPCFunctionInfo *FuncInfo = MF.getInfo(); @@ -2443,9 +2548,8 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( (CallConv == CallingConv::Fast)); unsigned PtrByteSize = 8; - unsigned ArgOffset = PPCFrameLowering::getLinkageSize(true, true); - // Area that is at least reserved in caller of this function. - unsigned MinReservedArea = ArgOffset; + unsigned LinkageSize = PPCFrameLowering::getLinkageSize(true, false, + isELFv2ABI); static const MCPhysReg GPR[] = { PPC::X3, PPC::X4, PPC::X5, PPC::X6, @@ -2467,44 +2571,52 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( const unsigned Num_FPR_Regs = 13; const unsigned Num_VR_Regs = array_lengthof(VR); - unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0; + // Do a first pass over the arguments to determine whether the ABI + // guarantees that our caller has allocated the parameter save area + // on its stack frame. In the ELFv1 ABI, this is always the case; + // in the ELFv2 ABI, it is true if this is a vararg function or if + // any parameter is located in a stack slot. + + bool HasParameterArea = !isELFv2ABI || isVarArg; + unsigned ParamAreaSize = Num_GPR_Regs * PtrByteSize; + unsigned NumBytes = LinkageSize; + unsigned AvailableFPRs = Num_FPR_Regs; + unsigned AvailableVRs = Num_VR_Regs; + for (unsigned i = 0, e = Ins.size(); i != e; ++i) + if (CalculateStackSlotUsed(Ins[i].VT, Ins[i].ArgVT, Ins[i].Flags, + PtrByteSize, LinkageSize, ParamAreaSize, + NumBytes, AvailableFPRs, AvailableVRs)) + HasParameterArea = true; // Add DAG nodes to load the arguments or copy them out of registers. On // entry to a function on PPC, the arguments start after the linkage area, // although the first ones are often in registers. + unsigned ArgOffset = LinkageSize; + unsigned GPR_idx, FPR_idx = 0, VR_idx = 0; SmallVector MemOps; - unsigned nAltivecParamsAtEnd = 0; Function::const_arg_iterator FuncArg = MF.getFunction()->arg_begin(); unsigned CurArgIdx = 0; for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) { SDValue ArgVal; bool needsLoad = false; EVT ObjectVT = Ins[ArgNo].VT; + EVT OrigVT = Ins[ArgNo].ArgVT; unsigned ObjSize = ObjectVT.getStoreSize(); unsigned ArgSize = ObjSize; ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags; std::advance(FuncArg, Ins[ArgNo].OrigArgIndex - CurArgIdx); CurArgIdx = Ins[ArgNo].OrigArgIndex; + /* Respect alignment of argument on the stack. */ + unsigned Align = + CalculateStackSlotAlignment(ObjectVT, OrigVT, Flags, PtrByteSize); + ArgOffset = ((ArgOffset + Align - 1) / Align) * Align; unsigned CurArgOffset = ArgOffset; - // Varargs or 64 bit Altivec parameters are padded to a 16 byte boundary. - if (ObjectVT==MVT::v4f32 || ObjectVT==MVT::v4i32 || - ObjectVT==MVT::v8i16 || ObjectVT==MVT::v16i8 || - ObjectVT==MVT::v2f64 || ObjectVT==MVT::v2i64) { - if (isVarArg) { - MinReservedArea = ((MinReservedArea+15)/16)*16; - MinReservedArea += CalculateStackSlotSize(ObjectVT, - Flags, - PtrByteSize); - } else - nAltivecParamsAtEnd++; - } else - // Calculate min reserved area. - MinReservedArea += CalculateStackSlotSize(Ins[ArgNo].VT, - Flags, - PtrByteSize); + /* Compute GPR index associated with argument offset. */ + GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize; + GPR_idx = std::min(GPR_idx, Num_GPR_Regs); // FIXME the codegen can be much improved in some cases. // We do not have to keep everything in memory. 
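To make the parameter save area bookkeeping introduced above easier to follow, here is a minimal standalone sketch of the same arithmetic. It is not part of the patch; the 32-byte linkage area and the helper name roundUpTo are assumptions used purely for illustration.

    // Sketch of the argument-offset arithmetic used by the first pass above,
    // assuming the 64-bit ELFv2 layout: 8-byte pointers, 32-byte linkage area.
    #include <algorithm>
    #include <cassert>

    static unsigned roundUpTo(unsigned Offset, unsigned Align) {
      return ((Offset + Align - 1) / Align) * Align; // same rounding as above
    }

    int main() {
      const unsigned PtrByteSize = 8;
      const unsigned LinkageSize = 32; // assumed ELFv2 value
      const unsigned Num_GPR_Regs = 8;

      // One i64 argument followed by one 16-byte-aligned vector argument.
      unsigned ArgOffset = LinkageSize;
      ArgOffset += PtrByteSize;             // the i64 occupies one doubleword
      ArgOffset = roundUpTo(ArgOffset, 16); // pad so the vector is aligned

      // Derive the GPR index from the offset, exactly as in the loop above.
      unsigned GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize;
      GPR_idx = std::min(GPR_idx, Num_GPR_Regs);
      assert(GPR_idx == 2); // the vector starts at the third doubleword
      return 0;
    }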
@@ -2526,21 +2638,31 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( continue; } - unsigned BVAlign = Flags.getByValAlign(); - if (BVAlign > 8) { - ArgOffset = ((ArgOffset+BVAlign-1)/BVAlign)*BVAlign; - CurArgOffset = ArgOffset; - } - - // All aggregates smaller than 8 bytes must be passed right-justified. - if (ObjSize < PtrByteSize) - CurArgOffset = CurArgOffset + (PtrByteSize - ObjSize); - // The value of the object is its address. - int FI = MFI->CreateFixedObject(ObjSize, CurArgOffset, true); + // Create a stack object covering all stack doublewords occupied + // by the argument. If the argument is (fully or partially) on + // the stack, or if the argument is fully in registers but the + // caller has allocated the parameter save anyway, we can refer + // directly to the caller's stack frame. Otherwise, create a + // local copy in our own frame. + int FI; + if (HasParameterArea || + ArgSize + ArgOffset > LinkageSize + Num_GPR_Regs * PtrByteSize) + FI = MFI->CreateFixedObject(ArgSize, ArgOffset, true); + else + FI = MFI->CreateStackObject(ArgSize, Align, false); SDValue FIN = DAG.getFrameIndex(FI, PtrVT); - InVals.push_back(FIN); - if (ObjSize < 8) { + // Handle aggregates smaller than 8 bytes. + if (ObjSize < PtrByteSize) { + // The value of the object is its address, which differs from the + // address of the enclosing doubleword on big-endian systems. + SDValue Arg = FIN; + if (!isLittleEndian) { + SDValue ArgOff = DAG.getConstant(PtrByteSize - ObjSize, PtrVT); + Arg = DAG.getNode(ISD::ADD, dl, ArgOff.getValueType(), Arg, ArgOff); + } + InVals.push_back(Arg); + if (GPR_idx != Num_GPR_Regs) { unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); @@ -2549,25 +2671,19 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( if (ObjSize==1 || ObjSize==2 || ObjSize==4) { EVT ObjType = (ObjSize == 1 ? MVT::i8 : (ObjSize == 2 ? MVT::i16 : MVT::i32)); - Store = DAG.getTruncStore(Val.getValue(1), dl, Val, FIN, + Store = DAG.getTruncStore(Val.getValue(1), dl, Val, Arg, MachinePointerInfo(FuncArg), ObjType, false, false, 0); } else { // For sizes that don't fit a truncating store (3, 5, 6, 7), // store the whole register as-is to the parameter save area - // slot. The address of the parameter was already calculated - // above (InVals.push_back(FIN)) to be the right-justified - // offset within the slot. For this store, we need a new - // frame index that points at the beginning of the slot. - int FI = MFI->CreateFixedObject(PtrByteSize, ArgOffset, true); - SDValue FIN = DAG.getFrameIndex(FI, PtrVT); + // slot. Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo(FuncArg), false, false, 0); } MemOps.push_back(Store); - ++GPR_idx; } // Whether we copied from a register or not, advance the offset // into the parameter save area by a full doubleword. @@ -2575,27 +2691,29 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( continue; } + // The value of the object is its address, which is the address of + // its first stack doubleword. + InVals.push_back(FIN); + + // Store whatever pieces of the object are in registers to memory. for (unsigned j = 0; j < ArgSize; j += PtrByteSize) { - // Store whatever pieces of the object are in registers - // to memory. ArgOffset will be the address of the beginning - // of the object. 
- if (GPR_idx != Num_GPR_Regs) { - unsigned VReg; - VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); - int FI = MFI->CreateFixedObject(PtrByteSize, ArgOffset, true); - SDValue FIN = DAG.getFrameIndex(FI, PtrVT); - SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); - SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, - MachinePointerInfo(FuncArg, j), - false, false, 0); - MemOps.push_back(Store); - ++GPR_idx; - ArgOffset += PtrByteSize; - } else { - ArgOffset += ArgSize - j; + if (GPR_idx == Num_GPR_Regs) break; + + unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); + SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); + SDValue Addr = FIN; + if (j) { + SDValue Off = DAG.getConstant(j, PtrVT); + Addr = DAG.getNode(ISD::ADD, dl, Off.getValueType(), Addr, Off); } + SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, Addr, + MachinePointerInfo(FuncArg, j), + false, false, 0); + MemOps.push_back(Store); + ++GPR_idx; } + ArgOffset += ArgSize; continue; } @@ -2604,6 +2722,9 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( case MVT::i1: case MVT::i32: case MVT::i64: + // These can be scalar arguments or elements of an integer array type + // passed directly. Clang may use those instead of "byval" aggregate + // types to avoid forcing arguments to memory unnecessarily. if (GPR_idx != Num_GPR_Regs) { unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); @@ -2612,8 +2733,6 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( // PPC64 passes i8, i16, and i32 values in i64 registers. Promote // value to MVT::i64 and then truncate to the correct register size. ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl); - - ++GPR_idx; } else { needsLoad = true; ArgSize = PtrByteSize; @@ -2623,11 +2742,9 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( case MVT::f32: case MVT::f64: - // Every 8 bytes of argument space consumes one of the GPRs available for - // argument passing. - if (GPR_idx != Num_GPR_Regs) { - ++GPR_idx; - } + // These can be scalar arguments or elements of a float array type + // passed directly. The latter are used to implement ELFv2 homogenous + // float aggregates. if (FPR_idx != Num_FPR_Regs) { unsigned VReg; @@ -2640,12 +2757,32 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT); ++FPR_idx; + } else if (GPR_idx != Num_GPR_Regs) { + // This can only ever happen in the presence of f32 array types, + // since otherwise we never run out of FPRs before running out + // of GPRs. + unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); + ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); + + if (ObjectVT == MVT::f32) { + if ((ArgOffset % PtrByteSize) == (isLittleEndian ? 4 : 0)) + ArgVal = DAG.getNode(ISD::SRL, dl, MVT::i64, ArgVal, + DAG.getConstant(32, MVT::i32)); + ArgVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, ArgVal); + } + + ArgVal = DAG.getNode(ISD::BITCAST, dl, ObjectVT, ArgVal); } else { needsLoad = true; - ArgSize = PtrByteSize; } - ArgOffset += 8; + // When passing an array of floats, the array occupies consecutive + // space in the argument area; only round up to the next doubleword + // at the end of the array. Otherwise, each float takes 8 bytes. + ArgSize = Flags.isInConsecutiveRegs() ? 
ObjSize : PtrByteSize; + ArgOffset += ArgSize; + if (Flags.isInConsecutiveRegsLast()) + ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; break; case MVT::v4f32: case MVT::v4i32: @@ -2653,39 +2790,28 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( case MVT::v16i8: case MVT::v2f64: case MVT::v2i64: - // Note that vector arguments in registers don't reserve stack space, - // except in varargs functions. + // These can be scalar arguments or elements of a vector array type + // passed directly. The latter are used to implement ELFv2 homogenous + // vector aggregates. if (VR_idx != Num_VR_Regs) { unsigned VReg = (ObjectVT == MVT::v2f64 || ObjectVT == MVT::v2i64) ? MF.addLiveIn(VSRH[VR_idx], &PPC::VSHRCRegClass) : MF.addLiveIn(VR[VR_idx], &PPC::VRRCRegClass); ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT); - if (isVarArg) { - while ((ArgOffset % 16) != 0) { - ArgOffset += PtrByteSize; - if (GPR_idx != Num_GPR_Regs) - GPR_idx++; - } - ArgOffset += 16; - GPR_idx = std::min(GPR_idx+4, Num_GPR_Regs); // FIXME correct for ppc64? - } ++VR_idx; } else { - // Vectors are aligned. - ArgOffset = ((ArgOffset+15)/16)*16; - CurArgOffset = ArgOffset; - ArgOffset += 16; needsLoad = true; } + ArgOffset += 16; break; } // We need to load the argument to a virtual register if we determined // above that we ran out of physical registers of the appropriate type. if (needsLoad) { - int FI = MFI->CreateFixedObject(ObjSize, - CurArgOffset + (ArgSize - ObjSize), - isImmutable); + if (ObjSize < ArgSize && !isLittleEndian) + CurArgOffset += ArgSize - ObjSize; + int FI = MFI->CreateFixedObject(ObjSize, CurArgOffset, isImmutable); SDValue FIN = DAG.getFrameIndex(FI, PtrVT); ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, MachinePointerInfo(), false, false, false, 0); @@ -2694,11 +2820,19 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( InVals.push_back(ArgVal); } + // Area that is at least reserved in the caller of this function. + unsigned MinReservedArea; + if (HasParameterArea) + MinReservedArea = std::max(ArgOffset, LinkageSize + 8 * PtrByteSize); + else + MinReservedArea = LinkageSize; + // Set the size that is at least reserved in caller of this function. Tail // call optimized functions' reserved stack space needs to be aligned so that // taking the difference between two stack areas will result in an aligned // stack. - setMinReservedArea(MF, DAG, nAltivecParamsAtEnd, MinReservedArea, true); + MinReservedArea = EnsureStackAlignment(MF.getTarget(), MinReservedArea); + FuncInfo->setMinReservedArea(MinReservedArea); // If the function takes variable number of arguments, make a frame index for // the start of the first vararg value... for expansion of llvm.va_start. @@ -2712,7 +2846,8 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( // If this function is vararg, store any remaining integer argument regs // to their spots on the stack so that they may be loaded by deferencing the // result of va_next. - for (; GPR_idx != Num_GPR_Regs; ++GPR_idx) { + for (GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize; + GPR_idx < Num_GPR_Regs; ++GPR_idx) { unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, @@ -2751,7 +2886,9 @@ PPCTargetLowering::LowerFormalArguments_Darwin( (CallConv == CallingConv::Fast)); unsigned PtrByteSize = isPPC64 ? 
8 : 4; - unsigned ArgOffset = PPCFrameLowering::getLinkageSize(isPPC64, true); + unsigned LinkageSize = PPCFrameLowering::getLinkageSize(isPPC64, true, + false); + unsigned ArgOffset = LinkageSize; // Area that is at least reserved in caller of this function. unsigned MinReservedArea = ArgOffset; @@ -3042,11 +3179,21 @@ PPCTargetLowering::LowerFormalArguments_Darwin( InVals.push_back(ArgVal); } + // Allow for Altivec parameters at the end, if needed. + if (nAltivecParamsAtEnd) { + MinReservedArea = ((MinReservedArea+15)/16)*16; + MinReservedArea += 16*nAltivecParamsAtEnd; + } + + // Area that is at least reserved in the caller of this function. + MinReservedArea = std::max(MinReservedArea, LinkageSize + 8 * PtrByteSize); + // Set the size that is at least reserved in caller of this function. Tail // call optimized functions' reserved stack space needs to be aligned so that // taking the difference between two stack areas will result in an aligned // stack. - setMinReservedArea(MF, DAG, nAltivecParamsAtEnd, MinReservedArea, isPPC64); + MinReservedArea = EnsureStackAlignment(MF.getTarget(), MinReservedArea); + FuncInfo->setMinReservedArea(MinReservedArea); // If the function takes variable number of arguments, make a frame index for // the start of the first vararg value... for expansion of llvm.va_start. @@ -3085,75 +3232,6 @@ PPCTargetLowering::LowerFormalArguments_Darwin( return Chain; } -/// CalculateParameterAndLinkageAreaSize - Get the size of the parameter plus -/// linkage area for the Darwin ABI, or the 64-bit SVR4 ABI. -static unsigned -CalculateParameterAndLinkageAreaSize(SelectionDAG &DAG, - bool isPPC64, - bool isVarArg, - unsigned CC, - const SmallVectorImpl - &Outs, - const SmallVectorImpl &OutVals, - unsigned &nAltivecParamsAtEnd) { - // Count how many bytes are to be pushed on the stack, including the linkage - // area, and parameter passing area. We start with 24/48 bytes, which is - // prereserved space for [SP][CR][LR][3 x unused]. - unsigned NumBytes = PPCFrameLowering::getLinkageSize(isPPC64, true); - unsigned NumOps = Outs.size(); - unsigned PtrByteSize = isPPC64 ? 8 : 4; - - // Add up all the space actually used. - // In 32-bit non-varargs calls, Altivec parameters all go at the end; usually - // they all go in registers, but we must reserve stack space for them for - // possible use by the caller. In varargs or 64-bit calls, parameters are - // assigned stack space in order, with padding so Altivec parameters are - // 16-byte aligned. - nAltivecParamsAtEnd = 0; - for (unsigned i = 0; i != NumOps; ++i) { - ISD::ArgFlagsTy Flags = Outs[i].Flags; - EVT ArgVT = Outs[i].VT; - // Varargs Altivec parameters are padded to a 16 byte boundary. - if (ArgVT==MVT::v4f32 || ArgVT==MVT::v4i32 || - ArgVT==MVT::v8i16 || ArgVT==MVT::v16i8 || - ArgVT==MVT::v2f64 || ArgVT==MVT::v2i64) { - if (!isVarArg && !isPPC64) { - // Non-varargs Altivec parameters go after all the non-Altivec - // parameters; handle those later so we know how much padding we need. - nAltivecParamsAtEnd++; - continue; - } - // Varargs and 64-bit Altivec parameters are padded to 16 byte boundary. - NumBytes = ((NumBytes+15)/16)*16; - } - NumBytes += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize); - } - - // Allow for Altivec parameters at the end, if needed. 
- if (nAltivecParamsAtEnd) { - NumBytes = ((NumBytes+15)/16)*16; - NumBytes += 16*nAltivecParamsAtEnd; - } - - // The prolog code of the callee may store up to 8 GPR argument registers to - // the stack, allowing va_start to index over them in memory if its varargs. - // Because we cannot tell if this is needed on the caller side, we have to - // conservatively assume that it is needed. As such, make sure we have at - // least enough stack space for the caller to store the 8 GPRs. - NumBytes = std::max(NumBytes, - PPCFrameLowering::getMinCallFrameSize(isPPC64, true)); - - // Tail call needs the stack to be aligned. - if (CC == CallingConv::Fast && DAG.getTarget().Options.GuaranteedTailCallOpt){ - unsigned TargetAlign = DAG.getMachineFunction().getTarget(). - getFrameLowering()->getStackAlignment(); - unsigned AlignMask = TargetAlign-1; - NumBytes = (NumBytes + AlignMask) & ~AlignMask; - } - - return NumBytes; -} - /// CalculateTailCallSPDiff - Get the amount the stack pointer has to be /// adjusted to accommodate the arguments for the tailcall. static int CalculateTailCallSPDiff(SelectionDAG& DAG, bool isTailCall, @@ -3422,6 +3500,7 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, bool isPPC64 = Subtarget.isPPC64(); bool isSVR4ABI = Subtarget.isSVR4ABI(); + bool isELFv2ABI = Subtarget.isELFv2ABI(); EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); NodeTys.push_back(MVT::Other); // Returns a chain @@ -3443,15 +3522,18 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, // far-call stubs may be outside relocation limits for a BL instruction. if (!DAG.getTarget().getSubtarget().isJITCodeModel()) { unsigned OpFlags = 0; - if (DAG.getTarget().getRelocationModel() != Reloc::Static && + if ((DAG.getTarget().getRelocationModel() != Reloc::Static && (Subtarget.getTargetTriple().isMacOSX() && Subtarget.getTargetTriple().isMacOSXVersionLT(10, 5)) && (G->getGlobal()->isDeclaration() || - G->getGlobal()->isWeakForLinker())) { + G->getGlobal()->isWeakForLinker())) || + (Subtarget.isTargetELF() && !isPPC64 && + !G->getGlobal()->hasLocalLinkage() && + DAG.getTarget().getRelocationModel() == Reloc::PIC_)) { // PC-relative references to external symbols should go through $stub, // unless we're building with the leopard linker or later, which // automatically synthesizes these stubs. - OpFlags = PPCII::MO_DARWIN_STUB; + OpFlags = PPCII::MO_PLT_OR_STUB; } // If the callee is a GlobalAddress/ExternalSymbol node (quite common, @@ -3467,13 +3549,15 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, if (ExternalSymbolSDNode *S = dyn_cast(Callee)) { unsigned char OpFlags = 0; - if (DAG.getTarget().getRelocationModel() != Reloc::Static && - (Subtarget.getTargetTriple().isMacOSX() && - Subtarget.getTargetTriple().isMacOSXVersionLT(10, 5))) { + if ((DAG.getTarget().getRelocationModel() != Reloc::Static && + (Subtarget.getTargetTriple().isMacOSX() && + Subtarget.getTargetTriple().isMacOSXVersionLT(10, 5))) || + (Subtarget.isTargetELF() && !isPPC64 && + DAG.getTarget().getRelocationModel() == Reloc::PIC_) ) { // PC-relative references to external symbols should go through $stub, // unless we're building with the leopard linker or later, which // automatically synthesizes these stubs. 
- OpFlags = PPCII::MO_DARWIN_STUB; + OpFlags = PPCII::MO_PLT_OR_STUB; } Callee = DAG.getTargetExternalSymbol(S->getSymbol(), Callee.getValueType(), @@ -3486,7 +3570,7 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, // to do the call, we can't use PPCISD::CALL. SDValue MTCTROps[] = {Chain, Callee, InFlag}; - if (isSVR4ABI && isPPC64) { + if (isSVR4ABI && isPPC64 && !isELFv2ABI) { // Function pointers in the 64-bit SVR4 ABI do not point to the function // entry point, but to the function descriptor (the function entry point // address is part of the function descriptor though). @@ -3566,7 +3650,7 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, CallOpc = PPCISD::BCTRL; Callee.setNode(nullptr); // Add use of X11 (holding environment pointer) - if (isSVR4ABI && isPPC64) + if (isSVR4ABI && isPPC64 && !isELFv2ABI) Ops.push_back(DAG.getRegister(PPC::X11, PtrVT)); // Add CTR register as callee so a bctr can be emitted later. if (isTailCall) @@ -3588,6 +3672,10 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, Ops.push_back(DAG.getRegister(RegsToPass[i].first, RegsToPass[i].second.getValueType())); + // Direct calls in the ELFv2 ABI need the TOC register live into the call. + if (Callee.getNode() && isELFv2ABI) + Ops.push_back(DAG.getRegister(PPC::X2, PtrVT)); + return CallOpc; } @@ -3657,6 +3745,8 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, SDLoc dl, int SPDiff, unsigned NumBytes, const SmallVectorImpl &Ins, SmallVectorImpl &InVals) const { + + bool isELFv2ABI = Subtarget.isELFv2ABI(); std::vector NodeTys; SmallVector Ops; unsigned CallOpc = PrepareCall(DAG, Callee, InFlag, Chain, dl, SPDiff, @@ -3732,7 +3822,7 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, SDLoc dl, SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue); EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); SDValue StackPtr = DAG.getRegister(PPC::X1, PtrVT); - unsigned TOCSaveOffset = PPCFrameLowering::getTOCSaveOffset(); + unsigned TOCSaveOffset = PPCFrameLowering::getTOCSaveOffset(isELFv2ABI); SDValue TOCOff = DAG.getIntPtrConstant(TOCSaveOffset); SDValue AddTOC = DAG.getNode(ISD::ADD, dl, MVT::i64, StackPtr, TOCOff); Chain = DAG.getNode(PPCISD::LOAD_TOC, dl, VTs, Chain, AddTOC, InFlag); @@ -3825,7 +3915,8 @@ PPCTargetLowering::LowerCall_32SVR4(SDValue Chain, SDValue Callee, getTargetMachine(), ArgLocs, *DAG.getContext()); // Reserve space for the linkage area on the stack. - CCInfo.AllocateStack(PPCFrameLowering::getLinkageSize(false, false), PtrByteSize); + CCInfo.AllocateStack(PPCFrameLowering::getLinkageSize(false, false, false), + PtrByteSize); if (isVarArg) { // Handle fixed and variable vector arguments differently. @@ -4034,6 +4125,8 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, SDLoc dl, SelectionDAG &DAG, SmallVectorImpl &InVals) const { + bool isELFv2ABI = Subtarget.isELFv2ABI(); + bool isLittleEndian = Subtarget.isLittleEndian(); unsigned NumOps = Outs.size(); EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); @@ -4050,16 +4143,44 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, CallConv == CallingConv::Fast) MF.getInfo()->setHasFastCall(); - unsigned nAltivecParamsAtEnd = 0; - // Count how many bytes are to be pushed on the stack, including the linkage - // area, and parameter passing area. We start with at least 48 bytes, which - // is reserved space for [SP][CR][LR][3 x unused]. 
- // NOTE: For PPC64, nAltivecParamsAtEnd always remains zero as a result
- // of this call.
- unsigned NumBytes =
- CalculateParameterAndLinkageAreaSize(DAG, true, isVarArg, CallConv,
- Outs, OutVals, nAltivecParamsAtEnd);
+ // area, and parameter passing area. On ELFv1, the linkage area is 48 bytes
+ // reserved space for [SP][CR][LR][2 x unused][TOC]; on ELFv2, the linkage
+ // area is 32 bytes reserved space for [SP][CR][LR][TOC].
+ unsigned LinkageSize = PPCFrameLowering::getLinkageSize(true, false,
+ isELFv2ABI);
+ unsigned NumBytes = LinkageSize;
+
+ // Add up all the space actually used.
+ for (unsigned i = 0; i != NumOps; ++i) {
+ ISD::ArgFlagsTy Flags = Outs[i].Flags;
+ EVT ArgVT = Outs[i].VT;
+ EVT OrigVT = Outs[i].ArgVT;
+
+ /* Respect alignment of argument on the stack. */
+ unsigned Align =
+ CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
+ NumBytes = ((NumBytes + Align - 1) / Align) * Align;
+
+ NumBytes += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize);
+ if (Flags.isInConsecutiveRegsLast())
+ NumBytes = ((NumBytes + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
+ }
+
+ unsigned NumBytesActuallyUsed = NumBytes;
+
+ // The prolog code of the callee may store up to 8 GPR argument registers to
+ // the stack, allowing va_start to index over them in memory if it is varargs.
+ // Because we cannot tell if this is needed on the caller side, we have to
+ // conservatively assume that it is needed. As such, make sure we have at
+ // least enough stack space for the caller to store the 8 GPRs.
+ // FIXME: On ELFv2, it may be unnecessary to allocate the parameter area.
+ NumBytes = std::max(NumBytes, LinkageSize + 8 * PtrByteSize);
+
+ // Tail call needs the stack to be aligned.
+ if (getTargetMachine().Options.GuaranteedTailCallOpt &&
+ CallConv == CallingConv::Fast)
+ NumBytes = EnsureStackAlignment(MF.getTarget(), NumBytes);

 // Calculate by how many bytes the stack has to be adjusted in case of tail
 // call optimization.
@@ -4091,8 +4212,8 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
 // memory. Also, if this is a vararg function, floating point operations
 // must be stored to our stack, and loaded into integer regs as well, if
 // any integer regs are available for argument passing.
- unsigned ArgOffset = PPCFrameLowering::getLinkageSize(true, true);
- unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
+ unsigned ArgOffset = LinkageSize;
+ unsigned GPR_idx, FPR_idx = 0, VR_idx = 0;

 static const MCPhysReg GPR[] = {
 PPC::X3, PPC::X4, PPC::X5, PPC::X6,
@@ -4120,6 +4241,17 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
 for (unsigned i = 0; i != NumOps; ++i) {
 SDValue Arg = OutVals[i];
 ISD::ArgFlagsTy Flags = Outs[i].Flags;
+ EVT ArgVT = Outs[i].VT;
+ EVT OrigVT = Outs[i].ArgVT;
+
+ /* Respect alignment of argument on the stack. */
+ unsigned Align =
+ CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
+ ArgOffset = ((ArgOffset + Align - 1) / Align) * Align;
+
+ /* Compute GPR index associated with argument offset. */
+ GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize;
+ GPR_idx = std::min(GPR_idx, NumGPRs);

 // PtrOff will be used to store the current argument to the stack if a
 // register cannot be found for it.
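As a sanity check of the NumBytes computation above, a small worked example. This is hedged: it assumes the ELFv2 32-byte linkage area and is not taken from the patch itself.

    // Outgoing stack area for a call passing two i64 arguments under ELFv2.
    #include <algorithm>
    #include <cassert>

    int main() {
      const unsigned PtrByteSize = 8;
      const unsigned LinkageSize = 32;                   // assumed ELFv2 value
      unsigned NumBytes = LinkageSize + 2 * PtrByteSize; // two i64 args -> 48

      // The callee may dump all 8 GPR argument registers for va_start, so the
      // caller conservatively reserves the full 8-doubleword parameter area.
      NumBytes = std::max(NumBytes, LinkageSize + 8 * PtrByteSize);
      assert(NumBytes == 96);
      return 0;
    }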
@@ -4152,15 +4284,6 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, if (Size == 0) continue; - unsigned BVAlign = Flags.getByValAlign(); - if (BVAlign > 8) { - if (BVAlign % PtrByteSize != 0) - llvm_unreachable( - "ByVal alignment is not a multiple of the pointer size"); - - ArgOffset = ((ArgOffset+BVAlign-1)/BVAlign)*BVAlign; - } - // All aggregates smaller than 8 bytes must be passed right-justified. if (Size==1 || Size==2 || Size==4) { EVT VT = (Size==1) ? MVT::i8 : ((Size==2) ? MVT::i16 : MVT::i32); @@ -4169,7 +4292,7 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, MachinePointerInfo(), VT, false, false, 0); MemOpChains.push_back(Load.getValue(1)); - RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); + RegsToPass.push_back(std::make_pair(GPR[GPR_idx], Load)); ArgOffset += PtrByteSize; continue; @@ -4177,9 +4300,12 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, } if (GPR_idx == NumGPRs && Size < 8) { - SDValue Const = DAG.getConstant(PtrByteSize - Size, - PtrOff.getValueType()); - SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const); + SDValue AddPtr = PtrOff; + if (!isLittleEndian) { + SDValue Const = DAG.getConstant(PtrByteSize - Size, + PtrOff.getValueType()); + AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const); + } Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr, CallSeqStart, Flags, DAG, dl); @@ -4214,8 +4340,11 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, // small aggregates, particularly for packed ones. // FIXME: It would be preferable to use the slot in the // parameter save area instead of a new local variable. - SDValue Const = DAG.getConstant(8 - Size, PtrOff.getValueType()); - SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const); + SDValue AddPtr = PtrOff; + if (!isLittleEndian) { + SDValue Const = DAG.getConstant(8 - Size, PtrOff.getValueType()); + AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const); + } Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr, CallSeqStart, Flags, DAG, dl); @@ -4225,7 +4354,7 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, MachinePointerInfo(), false, false, false, 0); MemOpChains.push_back(Load.getValue(1)); - RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); + RegsToPass.push_back(std::make_pair(GPR[GPR_idx], Load)); // Done with this argument. ArgOffset += PtrByteSize; @@ -4257,8 +4386,11 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, case MVT::i1: case MVT::i32: case MVT::i64: + // These can be scalar arguments or elements of an integer array type + // passed directly. Clang may use those instead of "byval" aggregate + // types to avoid forcing arguments to memory unnecessarily. if (GPR_idx != NumGPRs) { - RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Arg)); + RegsToPass.push_back(std::make_pair(GPR[GPR_idx], Arg)); } else { LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, true, isTailCall, false, MemOpChains, @@ -4267,40 +4399,70 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, ArgOffset += PtrByteSize; break; case MVT::f32: - case MVT::f64: - if (FPR_idx != NumFPRs) { + case MVT::f64: { + // These can be scalar arguments or elements of a float array type + // passed directly. The latter are used to implement ELFv2 homogenous + // float aggregates. 
+
+ // Named arguments go into FPRs first, and once they overflow, the
+ // remaining arguments go into GPRs and then the parameter save area.
+ // Unnamed arguments for vararg functions always go to GPRs and
+ // then the parameter save area. For now, put all arguments to vararg
+ // routines always in both locations (FPR *and* GPR or stack slot).
+ bool NeedGPROrStack = isVarArg || FPR_idx == NumFPRs;
+
+ // First load the argument into the next available FPR.
+ if (FPR_idx != NumFPRs)
 RegsToPass.push_back(std::make_pair(FPR[FPR_idx++], Arg));
- if (isVarArg) {
- // A single float or an aggregate containing only a single float
- // must be passed right-justified in the stack doubleword, and
- // in the GPR, if one is available.
- SDValue StoreOff;
- if (Arg.getSimpleValueType().SimpleTy == MVT::f32) {
- SDValue ConstFour = DAG.getConstant(4, PtrOff.getValueType());
- StoreOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour);
- } else
- StoreOff = PtrOff;
-
- SDValue Store = DAG.getStore(Chain, dl, Arg, StoreOff,
- MachinePointerInfo(), false, false, 0);
- MemOpChains.push_back(Store);
-
- // Float varargs are always shadowed in available integer registers
- if (GPR_idx != NumGPRs) {
- SDValue Load = DAG.getLoad(PtrVT, dl, Store, PtrOff,
- MachinePointerInfo(), false, false,
- false, 0);
- MemOpChains.push_back(Load.getValue(1));
- RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
- }
- } else if (GPR_idx != NumGPRs)
- // If we have any FPRs remaining, we may also have GPRs remaining.
- ++GPR_idx;
+ // Next, load the argument into GPR or stack slot if needed.
+ if (!NeedGPROrStack)
+ ;
+ else if (GPR_idx != NumGPRs) {
+ // In the non-vararg case, this can only ever happen in the
+ // presence of f32 array types, since otherwise we never run
+ // out of FPRs before running out of GPRs.
+ SDValue ArgVal;
+
+ // Double values are always passed in a single GPR.
+ if (Arg.getValueType() != MVT::f32) {
+ ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
+
+ // Non-array float values are extended and passed in a GPR.
+ } else if (!Flags.isInConsecutiveRegs()) {
+ ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg);
+ ArgVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i64, ArgVal);
+
+ // If we have an array of floats, we collect every odd element
+ // together with its predecessor into one GPR.
+ } else if (ArgOffset % PtrByteSize != 0) {
+ SDValue Lo, Hi;
+ Lo = DAG.getNode(ISD::BITCAST, dl, MVT::i32, OutVals[i - 1]);
+ Hi = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg);
+ if (!isLittleEndian)
+ std::swap(Lo, Hi);
+ ArgVal = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
+
+ // The final element, if even, goes into the first half of a GPR.
+ } else if (Flags.isInConsecutiveRegsLast()) {
+ ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg);
+ ArgVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i64, ArgVal);
+ if (!isLittleEndian)
+ ArgVal = DAG.getNode(ISD::SHL, dl, MVT::i64, ArgVal,
+ DAG.getConstant(32, MVT::i32));
+
+ // Non-final even elements are skipped; they will be handled
+ // together with the subsequent argument on the next go-around.
+ } else
+ ArgVal = SDValue();
+
+ if (ArgVal.getNode())
+ RegsToPass.push_back(std::make_pair(GPR[GPR_idx], ArgVal));
 } else {
 // Single-precision floating-point values are mapped to the
 // second (rightmost) word of the stack doubleword.
- if (Arg.getValueType() == MVT::f32) { + if (Arg.getValueType() == MVT::f32 && + !isLittleEndian && !Flags.isInConsecutiveRegs()) { SDValue ConstFour = DAG.getConstant(4, PtrOff.getValueType()); PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour); } @@ -4309,29 +4471,32 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, true, isTailCall, false, MemOpChains, TailCallArguments, dl); } - ArgOffset += 8; + // When passing an array of floats, the array occupies consecutive + // space in the argument area; only round up to the next doubleword + // at the end of the array. Otherwise, each float takes 8 bytes. + ArgOffset += (Arg.getValueType() == MVT::f32 && + Flags.isInConsecutiveRegs()) ? 4 : 8; + if (Flags.isInConsecutiveRegsLast()) + ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; break; + } case MVT::v4f32: case MVT::v4i32: case MVT::v8i16: case MVT::v16i8: case MVT::v2f64: case MVT::v2i64: + // These can be scalar arguments or elements of a vector array type + // passed directly. The latter are used to implement ELFv2 homogenous + // vector aggregates. + + // For a varargs call, named arguments go into VRs or on the stack as + // usual; unnamed arguments always go to the stack or the corresponding + // GPRs when within range. For now, we always put the value in both + // locations (or even all three). if (isVarArg) { - // These go aligned on the stack, or in the corresponding R registers - // when within range. The Darwin PPC ABI doc claims they also go in - // V registers; in fact gcc does this only for arguments that are - // prototyped, not for those that match the ... We do it for all - // arguments, seems to work. - while (ArgOffset % 16 !=0) { - ArgOffset += PtrByteSize; - if (GPR_idx != NumGPRs) - GPR_idx++; - } // We could elide this store in the case where the object fits // entirely in R registers. Maybe later. - PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, - DAG.getConstant(ArgOffset, PtrVT)); SDValue Store = DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo(), false, false, 0); MemOpChains.push_back(Store); @@ -4362,10 +4527,8 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, break; } - // Non-varargs Altivec params generally go in registers, but have - // stack space allocated at the end. + // Non-varargs Altivec params go into VRs or on the stack. if (VR_idx != NumVRs) { - // Doesn't have GPR space allocated. unsigned VReg = (Arg.getSimpleValueType() == MVT::v2f64 || Arg.getSimpleValueType() == MVT::v2i64) ? VSRH[VR_idx] : VR[VR_idx]; @@ -4376,12 +4539,15 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, true, isTailCall, true, MemOpChains, TailCallArguments, dl); - ArgOffset += 16; } + ArgOffset += 16; break; } } + assert(NumBytesActuallyUsed == ArgOffset); + (void)NumBytesActuallyUsed; + if (!MemOpChains.empty()) Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains); @@ -4394,11 +4560,16 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, // Load r2 into a virtual register and store it to the TOC save area. SDValue Val = DAG.getCopyFromReg(Chain, dl, PPC::X2, MVT::i64); // TOC save area offset. 
- unsigned TOCSaveOffset = PPCFrameLowering::getTOCSaveOffset();
+ unsigned TOCSaveOffset = PPCFrameLowering::getTOCSaveOffset(isELFv2ABI);
 SDValue PtrOff = DAG.getIntPtrConstant(TOCSaveOffset);
 SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
 Chain = DAG.getStore(Val.getValue(1), dl, Val, AddPtr,
 MachinePointerInfo(), false, false, 0);
+ // In the ELFv2 ABI, R12 must contain the address of an indirect callee.
+ // This does not mean the MTCTR instruction must use R12; it's easier
+ // to model this as an extra parameter, so do that.
+ if (isELFv2ABI)
+ RegsToPass.push_back(std::make_pair((unsigned)PPC::X12, Callee));
 }

 // Build a sequence of copy-to-reg nodes chained together with token chain
@@ -4446,15 +4617,56 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee,
 CallConv == CallingConv::Fast)
 MF.getInfo<PPCFunctionInfo>()->setHasFastCall();

- unsigned nAltivecParamsAtEnd = 0;
-
 // Count how many bytes are to be pushed on the stack, including the linkage
 // area, and parameter passing area. We start with 24/48 bytes, which is
 // prereserved space for [SP][CR][LR][3 x unused].
- unsigned NumBytes =
- CalculateParameterAndLinkageAreaSize(DAG, isPPC64, isVarArg, CallConv,
- Outs, OutVals,
- nAltivecParamsAtEnd);
+ unsigned LinkageSize = PPCFrameLowering::getLinkageSize(isPPC64, true,
+ false);
+ unsigned NumBytes = LinkageSize;
+
+ // Add up all the space actually used.
+ // In 32-bit non-varargs calls, Altivec parameters all go at the end; usually
+ // they all go in registers, but we must reserve stack space for them for
+ // possible use by the caller. In varargs or 64-bit calls, parameters are
+ // assigned stack space in order, with padding so Altivec parameters are
+ // 16-byte aligned.
+ unsigned nAltivecParamsAtEnd = 0;
+ for (unsigned i = 0; i != NumOps; ++i) {
+ ISD::ArgFlagsTy Flags = Outs[i].Flags;
+ EVT ArgVT = Outs[i].VT;
+ // Varargs Altivec parameters are padded to a 16 byte boundary.
+ if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 ||
+ ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 ||
+ ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64) {
+ if (!isVarArg && !isPPC64) {
+ // Non-varargs Altivec parameters go after all the non-Altivec
+ // parameters; handle those later so we know how much padding we need.
+ nAltivecParamsAtEnd++;
+ continue;
+ }
+ // Varargs and 64-bit Altivec parameters are padded to 16 byte boundary.
+ NumBytes = ((NumBytes+15)/16)*16;
+ }
+ NumBytes += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize);
+ }
+
+ // Allow for Altivec parameters at the end, if needed.
+ if (nAltivecParamsAtEnd) {
+ NumBytes = ((NumBytes+15)/16)*16;
+ NumBytes += 16*nAltivecParamsAtEnd;
+ }
+
+ // The prolog code of the callee may store up to 8 GPR argument registers to
+ // the stack, allowing va_start to index over them in memory if it is varargs.
+ // Because we cannot tell if this is needed on the caller side, we have to
+ // conservatively assume that it is needed. As such, make sure we have at
+ // least enough stack space for the caller to store the 8 GPRs.
+ NumBytes = std::max(NumBytes, LinkageSize + 8 * PtrByteSize);
+
+ // Tail call needs the stack to be aligned.
+ if (getTargetMachine().Options.GuaranteedTailCallOpt &&
+ CallConv == CallingConv::Fast)
+ NumBytes = EnsureStackAlignment(MF.getTarget(), NumBytes);

 // Calculate by how many bytes the stack has to be adjusted in case of tail
 // call optimization.
@@ -4490,7 +4702,7 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee,
 // memory.
Also, if this is a vararg function, floating point operations // must be stored to our stack, and loaded into integer regs as well, if // any integer regs are available for argument passing. - unsigned ArgOffset = PPCFrameLowering::getLinkageSize(isPPC64, true); + unsigned ArgOffset = LinkageSize; unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0; static const MCPhysReg GPR_32[] = { // 32-bit registers. @@ -6639,7 +6851,10 @@ PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr *MI, // Since FP is only updated here but NOT referenced, it's treated as GPR. unsigned FP = (PVT == MVT::i64) ? PPC::X31 : PPC::R31; unsigned SP = (PVT == MVT::i64) ? PPC::X1 : PPC::R1; - unsigned BP = (PVT == MVT::i64) ? PPC::X30 : PPC::R30; + unsigned BP = (PVT == MVT::i64) ? PPC::X30 : + (Subtarget.isSVR4ABI() && + MF->getTarget().getRelocationModel() == Reloc::PIC_ ? + PPC::R29 : PPC::R30); MachineInstrBuilder MIB; @@ -6859,13 +7074,13 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, BB = EmitAtomicBinary(MI, BB, true, PPC::XOR8); else if (MI->getOpcode() == PPC::ATOMIC_LOAD_NAND_I8) - BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::ANDC); + BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::NAND); else if (MI->getOpcode() == PPC::ATOMIC_LOAD_NAND_I16) - BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::ANDC); + BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::NAND); else if (MI->getOpcode() == PPC::ATOMIC_LOAD_NAND_I32) - BB = EmitAtomicBinary(MI, BB, false, PPC::ANDC); + BB = EmitAtomicBinary(MI, BB, false, PPC::NAND); else if (MI->getOpcode() == PPC::ATOMIC_LOAD_NAND_I64) - BB = EmitAtomicBinary(MI, BB, true, PPC::ANDC8); + BB = EmitAtomicBinary(MI, BB, true, PPC::NAND8); else if (MI->getOpcode() == PPC::ATOMIC_LOAD_SUB_I8) BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::SUBF); @@ -7360,10 +7575,9 @@ static bool findConsecutiveLoad(LoadSDNode *LD, SelectionDAG &DAG) { if (!Visited.count(ChainLD->getChain().getNode())) Queue.push_back(ChainLD->getChain().getNode()); } else if (ChainNext->getOpcode() == ISD::TokenFactor) { - for (SDNode::op_iterator O = ChainNext->op_begin(), - OE = ChainNext->op_end(); O != OE; ++O) - if (!Visited.count(O->getNode())) - Queue.push_back(O->getNode()); + for (const SDUse &O : ChainNext->ops()) + if (!Visited.count(O.getNode())) + Queue.push_back(O.getNode()); } else LoadRoots.insert(ChainNext); } @@ -8259,12 +8473,11 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, ++UI; SmallVector Ops; - for (SDNode::op_iterator O = User->op_begin(), - OE = User->op_end(); O != OE; ++O) { - if (*O == Use) + for (const SDUse &O : User->ops()) { + if (O == Use) Ops.push_back(To); else - Ops.push_back(*O); + Ops.push_back(O); } DAG.UpdateNodeOperands(User, Ops); diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h index 2b69208fea7e..ae8c300a4ffa 100644 --- a/lib/Target/PowerPC/PPCISelLowering.h +++ b/lib/Target/PowerPC/PPCISelLowering.h @@ -510,6 +510,20 @@ namespace llvm { FastISel *createFastISel(FunctionLoweringInfo &FuncInfo, const TargetLibraryInfo *LibInfo) const override; + /// \brief Returns true if an argument of type Ty needs to be passed in a + /// contiguous block of registers in calling convention CallConv. + bool functionArgumentNeedsConsecutiveRegisters( + Type *Ty, CallingConv::ID CallConv, bool isVarArg) const override { + // We support any array type as "consecutive" block in the parameter + // save area. 
The element type defines the alignment requirement and + // whether the argument should go in GPRs, FPRs, or VRs if available. + // + // Note that clang uses this capability both to implement the ELFv2 + // homogeneous float/vector aggregate ABI, and to avoid having to use + // "byval" when passing aggregates that might fully fit in registers. + return Ty->isArrayTy(); + } + private: SDValue getFramePointerFrameIndex(SelectionDAG & DAG) const; SDValue getReturnAddrFrameIndex(SelectionDAG & DAG) const; @@ -610,11 +624,6 @@ namespace llvm { extendArgForPPC64(ISD::ArgFlagsTy Flags, EVT ObjectVT, SelectionDAG &DAG, SDValue ArgVal, SDLoc dl) const; - void - setMinReservedArea(MachineFunction &MF, SelectionDAG &DAG, - unsigned nAltivecParamsAtEnd, - unsigned MinReservedArea, bool isPPC64) const; - SDValue LowerFormalArguments_Darwin(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp index 431cfd754997..9bac91d7d412 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -90,7 +90,7 @@ ScheduleHazardRecognizer *PPCInstrInfo::CreateTargetPostRAHazardRecognizer( unsigned Directive = DAG->TM.getSubtarget().getDarwinDirective(); - if (Directive == PPC::DIR_PWR7) + if (Directive == PPC::DIR_PWR7 || Directive == PPC::DIR_PWR8) return new PPCDispatchGroupSBHazardRecognizer(II, DAG); // Most subtargets use a PPC970 recognizer. @@ -146,6 +146,7 @@ int PPCInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, case PPC::DIR_PWR6: case PPC::DIR_PWR6X: case PPC::DIR_PWR7: + case PPC::DIR_PWR8: Latency += 2; break; } @@ -323,6 +324,7 @@ void PPCInstrInfo::insertNoop(MachineBasicBlock &MBB, default: Opcode = PPC::NOP; break; case PPC::DIR_PWR6: Opcode = PPC::NOP_GT_PWR6; break; case PPC::DIR_PWR7: Opcode = PPC::NOP_GT_PWR7; break; + case PPC::DIR_PWR8: Opcode = PPC::NOP_GT_PWR7; break; /* FIXME: Update when P8 InstrScheduling model is ready */ } DebugLoc DL; diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td index c2e3382b3e79..42b740f4fa46 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.td +++ b/lib/Target/PowerPC/PPCInstrInfo.td @@ -57,6 +57,9 @@ def SDT_PPCTC_ret : SDTypeProfile<0, 2, [ SDTCisPtrTy<0>, SDTCisVT<1, i32> ]>; +def tocentry32 : Operand { + let MIOperandInfo = (ops i32imm:$imm); +} //===----------------------------------------------------------------------===// // PowerPC specific DAG Nodes. @@ -2400,6 +2403,18 @@ def LDgotTprelL32: Pseudo<(outs gprc:$rD), (ins s16imm:$disp, gprc_nor0:$reg), def : Pat<(PPCaddTls i32:$in, tglobaltlsaddr:$g), (ADD4TLS $in, tglobaltlsaddr:$g)>; +// Support for Position-independent code +def LWZtoc: Pseudo<(outs gprc:$rD), (ins tocentry32:$disp, gprc:$reg), + "#LWZtoc", + [(set i32:$rD, + (PPCtoc_entry tglobaladdr:$disp, i32:$reg))]>; +// Get Global (GOT) Base Register offset, from the word immediately preceding +// the function label. +def GetGBRO: Pseudo<(outs gprc:$rT), (ins gprc:$rI), "#GetGBRO", []>; +// Update the Global(GOT) Base Register with the above offset. +def UpdateGBR: Pseudo<(outs gprc:$rT), (ins gprc:$rI), "#UpdateGBR", []>; + + // Standard shifts. These are represented separately from the real shifts above // so that we can distinguish between shifts that allow 5-bit and 6-bit shift // amounts. 
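For the functionArgumentNeedsConsecutiveRegisters override added to PPCISelLowering.h above, a hedged source-level example of the kind of aggregate it is meant to catch; the exact IR clang emits here is an assumption, not something stated by the patch.

    // Under ELFv2, clang is expected to lower the parameter below to an LLVM
    // array type such as [4 x float]; each element is then flagged as part of
    // a consecutive-register block and lands in FPRs f1..f4 instead of being
    // forced to memory via "byval".
    struct HFA { float a, b, c, d; }; // homogeneous float aggregate

    float sum(HFA v) {
      return v.a + v.b + v.c + v.d;
    }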
diff --git a/lib/Target/PowerPC/PPCMCInstLower.cpp b/lib/Target/PowerPC/PPCMCInstLower.cpp index f8e84a5731c5..668041371780 100644 --- a/lib/Target/PowerPC/PPCMCInstLower.cpp +++ b/lib/Target/PowerPC/PPCMCInstLower.cpp @@ -13,6 +13,7 @@ //===----------------------------------------------------------------------===// #include "PPC.h" +#include "PPCSubtarget.h" #include "MCTargetDesc/PPCMCExpr.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/Twine.h" @@ -39,12 +40,14 @@ static MCSymbol *GetSymbolFromOperand(const MachineOperand &MO, AsmPrinter &AP){ Mangler *Mang = AP.Mang; const DataLayout *DL = TM.getDataLayout(); MCContext &Ctx = AP.OutContext; + bool isDarwin = TM.getSubtarget().isDarwin(); SmallString<128> Name; StringRef Suffix; - if (MO.getTargetFlags() == PPCII::MO_DARWIN_STUB) - Suffix = "$stub"; - else if (MO.getTargetFlags() & PPCII::MO_NLP_FLAG) + if (MO.getTargetFlags() == PPCII::MO_PLT_OR_STUB) { + if (isDarwin) + Suffix = "$stub"; + } else if (MO.getTargetFlags() & PPCII::MO_NLP_FLAG) Suffix = "$non_lazy_ptr"; if (!Suffix.empty()) @@ -68,7 +71,7 @@ static MCSymbol *GetSymbolFromOperand(const MachineOperand &MO, AsmPrinter &AP){ // If the target flags on the operand changes the name of the symbol, do that // before we return the symbol. - if (MO.getTargetFlags() == PPCII::MO_DARWIN_STUB) { + if (MO.getTargetFlags() == PPCII::MO_PLT_OR_STUB && isDarwin) { MachineModuleInfoImpl::StubValueTy &StubSym = getMachOMMI(AP).getFnStubEntry(Sym); if (StubSym.getPointer()) @@ -136,6 +139,9 @@ static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol, break; } + if (MO.getTargetFlags() == PPCII::MO_PLT_OR_STUB && !isDarwin) + RefKind = MCSymbolRefExpr::VK_PLT; + const MCExpr *Expr = MCSymbolRefExpr::Create(Symbol, RefKind, Ctx); if (!MO.isJTI() && MO.getOffset()) diff --git a/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp b/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp index 6a0aec842be7..9da1b1b5c754 100644 --- a/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp +++ b/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp @@ -8,8 +8,16 @@ //===----------------------------------------------------------------------===// #include "PPCMachineFunctionInfo.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/MC/MCContext.h" +#include "llvm/Target/TargetMachine.h" using namespace llvm; void PPCFunctionInfo::anchor() { } +MCSymbol *PPCFunctionInfo::getPICOffsetSymbol() const { + const DataLayout *DL = MF.getTarget().getDataLayout(); + return MF.getContext().GetOrCreateSymbol(Twine(DL->getPrivateGlobalPrefix())+ + Twine(MF.getFunctionNumber())+"$poff"); +} diff --git a/lib/Target/PowerPC/PPCMachineFunctionInfo.h b/lib/Target/PowerPC/PPCMachineFunctionInfo.h index 33f843dfb432..9a2cec744274 100644 --- a/lib/Target/PowerPC/PPCMachineFunctionInfo.h +++ b/lib/Target/PowerPC/PPCMachineFunctionInfo.h @@ -92,6 +92,12 @@ class PPCFunctionInfo : public MachineFunctionInfo { /// 64-bit SVR4 ABI. SmallVector MustSaveCRs; + /// Hold onto our MachineFunction context. + MachineFunction &MF; + + /// Whether this uses the PIC Base register or not. 
+ bool UsesPICBase; + public: explicit PPCFunctionInfo(MachineFunction &MF) : FramePointerSaveIndex(0), @@ -109,7 +115,9 @@ class PPCFunctionInfo : public MachineFunctionInfo { VarArgsStackOffset(0), VarArgsNumGPR(0), VarArgsNumFPR(0), - CRSpillFrameIndex(0) {} + CRSpillFrameIndex(0), + MF(MF), + UsesPICBase(0) {} int getFramePointerSaveIndex() const { return FramePointerSaveIndex; } void setFramePointerSaveIndex(int Idx) { FramePointerSaveIndex = Idx; } @@ -170,6 +178,11 @@ class PPCFunctionInfo : public MachineFunctionInfo { const SmallVectorImpl & getMustSaveCRs() const { return MustSaveCRs; } void addMustSaveCR(unsigned Reg) { MustSaveCRs.push_back(Reg); } + + void setUsesPICBase(bool uses) { UsesPICBase = uses; } + bool usesPICBase() const { return UsesPICBase; } + + MCSymbol *getPICOffsetSymbol() const; }; } // end of namespace llvm diff --git a/lib/Target/PowerPC/PPCRegisterInfo.cpp b/lib/Target/PowerPC/PPCRegisterInfo.cpp index e333b51b8774..9895ee6267aa 100644 --- a/lib/Target/PowerPC/PPCRegisterInfo.cpp +++ b/lib/Target/PowerPC/PPCRegisterInfo.cpp @@ -199,7 +199,16 @@ BitVector PPCRegisterInfo::getReservedRegs(const MachineFunction &MF) const { if (PPCFI->needsFP(MF)) Reserved.set(PPC::R31); - if (hasBasePointer(MF)) + if (hasBasePointer(MF)) { + if (Subtarget.isSVR4ABI() && !Subtarget.isPPC64() && + MF.getTarget().getRelocationModel() == Reloc::PIC_) + Reserved.set(PPC::R29); + else + Reserved.set(PPC::R30); + } + + if (Subtarget.isSVR4ABI() && !Subtarget.isPPC64() && + MF.getTarget().getRelocationModel() == Reloc::PIC_) Reserved.set(PPC::R30); // Reserve Altivec registers when Altivec is unavailable. @@ -843,7 +852,14 @@ unsigned PPCRegisterInfo::getBaseRegister(const MachineFunction &MF) const { if (!hasBasePointer(MF)) return getFrameRegister(MF); - return Subtarget.isPPC64() ? PPC::X30 : PPC::R30; + if (Subtarget.isPPC64()) + return PPC::X30; + + if (Subtarget.isSVR4ABI() && + MF.getTarget().getRelocationModel() == Reloc::PIC_) + return PPC::R29; + + return PPC::R30; } bool PPCRegisterInfo::hasBasePointer(const MachineFunction &MF) const { @@ -885,16 +901,6 @@ bool PPCRegisterInfo:: needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const { assert(Offset < 0 && "Local offset must be negative"); - unsigned FIOperandNum = 0; - while (!MI->getOperand(FIOperandNum).isFI()) { - ++FIOperandNum; - assert(FIOperandNum < MI->getNumOperands() && - "Instr doesn't have FrameIndex operand!"); - } - - unsigned OffsetOperandNo = getOffsetONFromFION(*MI, FIOperandNum); - Offset += MI->getOperand(OffsetOperandNo).getImm(); - // It's the load/store FI references that cause issues, as it can be difficult // to materialize the offset if it won't fit in the literal field. 
Estimate // based on the size of the local frame and some conservative assumptions @@ -973,10 +979,28 @@ void PPCRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, unsigned OffsetOperandNo = getOffsetONFromFION(MI, FIOperandNum); Offset += MI.getOperand(OffsetOperandNo).getImm(); MI.getOperand(OffsetOperandNo).ChangeToImmediate(Offset); + + MachineBasicBlock &MBB = *MI.getParent(); + MachineFunction &MF = *MBB.getParent(); + const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); + const MCInstrDesc &MCID = MI.getDesc(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + MRI.constrainRegClass(BaseReg, + TII.getRegClass(MCID, FIOperandNum, this, MF)); } bool PPCRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI, int64_t Offset) const { + unsigned FIOperandNum = 0; + while (!MI->getOperand(FIOperandNum).isFI()) { + ++FIOperandNum; + assert(FIOperandNum < MI->getNumOperands() && + "Instr doesn't have FrameIndex operand!"); + } + + unsigned OffsetOperandNo = getOffsetONFromFION(*MI, FIOperandNum); + Offset += MI->getOperand(OffsetOperandNo).getImm(); + return MI->getOpcode() == PPC::DBG_VALUE || // DBG_VALUE is always Reg+Imm (isInt<16>(Offset) && (!usesIXAddr(*MI) || (Offset & 3) == 0)); } diff --git a/lib/Target/PowerPC/PPCSubtarget.cpp b/lib/Target/PowerPC/PPCSubtarget.cpp index 0759200ce353..b51512d335fc 100644 --- a/lib/Target/PowerPC/PPCSubtarget.cpp +++ b/lib/Target/PowerPC/PPCSubtarget.cpp @@ -222,22 +222,6 @@ bool PPCSubtarget::hasLazyResolverStub(const GlobalValue *GV, GV->hasCommonLinkage() || isDecl; } -bool PPCSubtarget::enablePostRAScheduler( - CodeGenOpt::Level OptLevel, - TargetSubtargetInfo::AntiDepBreakMode& Mode, - RegClassVector& CriticalPathRCs) const { - Mode = TargetSubtargetInfo::ANTIDEP_ALL; - - CriticalPathRCs.clear(); - - if (isPPC64()) - CriticalPathRCs.push_back(&PPC::G8RCRegClass); - else - CriticalPathRCs.push_back(&PPC::GPRCRegClass); - - return OptLevel >= CodeGenOpt::Default; -} - // Embedded cores need aggressive scheduling (and some others also benefit). static bool needsAggressiveScheduling(unsigned Directive) { switch (Directive) { @@ -247,6 +231,7 @@ static bool needsAggressiveScheduling(unsigned Directive) { case PPC::DIR_E500mc: case PPC::DIR_E5500: case PPC::DIR_PWR7: + case PPC::DIR_PWR8: return true; } } @@ -258,6 +243,19 @@ bool PPCSubtarget::enableMachineScheduler() const { return needsAggressiveScheduling(DarwinDirective); } +// This overrides the PostRAScheduler bit in the SchedModel for each CPU. +bool PPCSubtarget::enablePostMachineScheduler() const { return true; } + +PPCGenSubtargetInfo::AntiDepBreakMode PPCSubtarget::getAntiDepBreakMode() const { + return TargetSubtargetInfo::ANTIDEP_ALL; +} + +void PPCSubtarget::getCriticalPathRCs(RegClassVector &CriticalPathRCs) const { + CriticalPathRCs.clear(); + CriticalPathRCs.push_back(isPPC64() ? + &PPC::G8RCRegClass : &PPC::GPRCRegClass); +} + void PPCSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, MachineInstr *begin, MachineInstr *end, diff --git a/lib/Target/PowerPC/PPCSubtarget.h b/lib/Target/PowerPC/PPCSubtarget.h index 2d8399e78110..a3cedafb5ef2 100644 --- a/lib/Target/PowerPC/PPCSubtarget.h +++ b/lib/Target/PowerPC/PPCSubtarget.h @@ -56,6 +56,7 @@ namespace PPC { DIR_PWR6, DIR_PWR6X, DIR_PWR7, + DIR_PWR8, DIR_64 }; } @@ -221,18 +222,24 @@ class PPCSubtarget : public PPCGenSubtargetInfo { /// isBGQ - True if this is a BG/Q platform. 
bool isBGQ() const { return TargetTriple.getVendor() == Triple::BGQ; } + bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); } + bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); } + bool isDarwinABI() const { return isDarwin(); } bool isSVR4ABI() const { return !isDarwin(); } - - /// enablePostRAScheduler - True at 'More' optimization. - bool enablePostRAScheduler(CodeGenOpt::Level OptLevel, - TargetSubtargetInfo::AntiDepBreakMode& Mode, - RegClassVector& CriticalPathRCs) const override; + /// FIXME: Should use a command-line option. + bool isELFv2ABI() const { return isPPC64() && isSVR4ABI() && + isLittleEndian(); } bool enableEarlyIfConversion() const override { return hasISEL(); } // Scheduling customization. bool enableMachineScheduler() const override; + // This overrides the PostRAScheduler bit in the SchedModel for each CPU. + bool enablePostMachineScheduler() const override; + AntiDepBreakMode getAntiDepBreakMode() const override; + void getCriticalPathRCs(RegClassVector &CriticalPathRCs) const override; + void overrideSchedPolicy(MachineSchedPolicy &Policy, MachineInstr *begin, MachineInstr *end, diff --git a/lib/Target/PowerPC/PPCTargetMachine.h b/lib/Target/PowerPC/PPCTargetMachine.h index 3cf8063b70ef..4c7029ca7a36 100644 --- a/lib/Target/PowerPC/PPCTargetMachine.h +++ b/lib/Target/PowerPC/PPCTargetMachine.h @@ -14,11 +14,7 @@ #ifndef PPC_TARGETMACHINE_H #define PPC_TARGETMACHINE_H -#include "PPCFrameLowering.h" -#include "PPCISelLowering.h" #include "PPCInstrInfo.h" -#include "PPCJITInfo.h" -#include "PPCSelectionDAGInfo.h" #include "PPCSubtarget.h" #include "llvm/IR/DataLayout.h" #include "llvm/Target/TargetMachine.h" diff --git a/lib/Target/PowerPC/PPCTargetStreamer.h b/lib/Target/PowerPC/PPCTargetStreamer.h index 74b5f4584525..73fb69101353 100644 --- a/lib/Target/PowerPC/PPCTargetStreamer.h +++ b/lib/Target/PowerPC/PPCTargetStreamer.h @@ -19,6 +19,8 @@ class PPCTargetStreamer : public MCTargetStreamer { virtual ~PPCTargetStreamer(); virtual void emitTCEntry(const MCSymbol &S) = 0; virtual void emitMachine(StringRef CPU) = 0; + virtual void emitAbiVersion(int AbiVersion) = 0; + virtual void emitLocalEntry(MCSymbol *S, const MCExpr *LocalOffset) = 0; }; } diff --git a/lib/Target/R600/AMDGPU.h b/lib/Target/R600/AMDGPU.h index f92bde853770..d7e94f75e123 100644 --- a/lib/Target/R600/AMDGPU.h +++ b/lib/Target/R600/AMDGPU.h @@ -39,8 +39,10 @@ FunctionPass *createAMDGPUCFGStructurizerPass(); FunctionPass *createSITypeRewriter(); FunctionPass *createSIAnnotateControlFlowPass(); FunctionPass *createSILowerI1CopiesPass(); +FunctionPass *createSIShrinkInstructionsPass(); FunctionPass *createSILowerControlFlowPass(TargetMachine &tm); FunctionPass *createSIFixSGPRCopiesPass(TargetMachine &tm); +FunctionPass *createSIFixSGPRLiveRangesPass(); FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS); FunctionPass *createSIInsertWaits(TargetMachine &tm); @@ -56,8 +58,20 @@ FunctionPass *createAMDGPUISelDag(TargetMachine &tm); ImmutablePass * createAMDGPUTargetTransformInfoPass(const AMDGPUTargetMachine *TM); +void initializeSIFixSGPRLiveRangesPass(PassRegistry&); +extern char &SIFixSGPRLiveRangesID; + + extern Target TheAMDGPUTarget; +namespace AMDGPU { +enum TargetIndex { + TI_CONSTDATA_START +}; +} + +#define END_OF_TEXT_LABEL_NAME "EndOfTextLabel" + } // End namespace llvm namespace ShaderType { diff --git a/lib/Target/R600/AMDGPU.td b/lib/Target/R600/AMDGPU.td index d3dff531a7f6..5645f1a2322e 100644 --- a/lib/Target/R600/AMDGPU.td +++ 
b/lib/Target/R600/AMDGPU.td @@ -25,6 +25,11 @@ def FeatureIRStructurizer : SubtargetFeature <"disable-irstructurizer", "false", "Disable IR Structurizer">; +def FeaturePromoteAlloca : SubtargetFeature <"promote-alloca", + "EnablePromoteAlloca", + "true", + "Enable promote alloca pass">; + // Target features def FeatureIfCvt : SubtargetFeature <"disable-ifcvt", @@ -32,30 +37,39 @@ def FeatureIfCvt : SubtargetFeature <"disable-ifcvt", "false", "Disable the if conversion pass">; -def FeatureFP64 : SubtargetFeature<"fp64", +def FeatureFP64 : SubtargetFeature<"fp64", "FP64", "true", - "Enable 64bit double precision operations">; + "Enable double precision operations">; + +def FeatureFP64Denormals : SubtargetFeature<"fp64-denormals", + "FP64Denormals", + "true", + "Enable double precision denormal handling", + [FeatureFP64]>; + +// Some instructions do not support denormals despite this flag. Using +// fp32 denormals also causes instructions to run at the double +// precision rate for the device. +def FeatureFP32Denormals : SubtargetFeature<"fp32-denormals", + "FP32Denormals", + "true", + "Enable single precision denormal handling">; def Feature64BitPtr : SubtargetFeature<"64BitPtr", "Is64bit", "true", - "Specify if 64bit addressing should be used.">; - -def Feature32on64BitPtr : SubtargetFeature<"64on32BitPtr", - "Is32on64bit", - "false", - "Specify if 64bit sized pointers with 32bit addressing should be used.">; + "Specify if 64-bit addressing should be used">; def FeatureR600ALUInst : SubtargetFeature<"R600ALUInst", "R600ALUInst", "false", - "Older version of ALU instructions encoding.">; + "Older version of ALU instructions encoding">; def FeatureVertexCache : SubtargetFeature<"HasVertexCache", "HasVertexCache", "true", - "Specify use of dedicated vertex cache.">; + "Specify use of dedicated vertex cache">; def FeatureCaymanISA : SubtargetFeature<"caymanISA", "CaymanISA", @@ -116,10 +130,12 @@ def FeatureNorthernIslands : SubtargetFeatureGeneration<"NORTHERN_ISLANDS", >; def FeatureSouthernIslands : SubtargetFeatureGeneration<"SOUTHERN_ISLANDS", - [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize32768]>; + [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize32768, + FeatureWavefrontSize64]>; def FeatureSeaIslands : SubtargetFeatureGeneration<"SEA_ISLANDS", - [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize65536]>; + [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize65536, + FeatureWavefrontSize64]>; //===----------------------------------------------------------------------===// def AMDGPUInstrInfo : InstrInfo { diff --git a/lib/Target/R600/AMDGPUAsmPrinter.cpp b/lib/Target/R600/AMDGPUAsmPrinter.cpp index 2da7792a88a6..73faaa183581 100644 --- a/lib/Target/R600/AMDGPUAsmPrinter.cpp +++ b/lib/Target/R600/AMDGPUAsmPrinter.cpp @@ -16,7 +16,6 @@ //===----------------------------------------------------------------------===// // - #include "AMDGPUAsmPrinter.h" #include "AMDGPU.h" #include "AMDGPUSubtarget.h" @@ -26,6 +25,7 @@ #include "SIDefines.h" #include "SIMachineFunctionInfo.h" #include "SIRegisterInfo.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCStreamer.h" @@ -36,6 +36,41 @@ using namespace llvm; +// TODO: This should get the default rounding mode from the kernel. We just set +// the default here, but this could change if the OpenCL rounding mode pragmas +// are used. 
+// +// The denormal mode here should match what is reported by the OpenCL runtime +// for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but +// can also be overridden to flush with the -cl-denorms-are-zero compiler flag. +// +// AMD OpenCL only sets flush none and reports CL_FP_DENORM for double +// precision, and leaves single precision to flush all and does not report +// CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports +// CL_FP_DENORM for both. +// +// FIXME: It seems some instructions do not support single precision denormals +// regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, rsq_*f64, sqrt_f32, +// and sin_f32, cos_f32 on most parts). + +// We want to use these instructions, and using fp32 denormals also causes +// instructions to run at the double precision rate for the device, so it's +// probably best to just report no single precision denormals. +static uint32_t getFPMode(const MachineFunction &F) { + const AMDGPUSubtarget& ST = F.getTarget().getSubtarget<AMDGPUSubtarget>(); + // TODO: Is there any real use for the flush in only / flush out only modes? + + uint32_t FP32Denormals = + ST.hasFP32Denormals() ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT; + + uint32_t FP64Denormals = + ST.hasFP64Denormals() ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT; + + return FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) | + FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST) | + FP_DENORM_MODE_SP(FP32Denormals) | + FP_DENORM_MODE_DP(FP64Denormals); +} static AsmPrinter *createAMDGPUAsmPrinterPass(TargetMachine &tm, MCStreamer &Streamer) { @@ -51,6 +86,16 @@ AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM, MCStreamer &Streamer) DisasmEnabled = TM.getSubtarget<AMDGPUSubtarget>().dumpCode(); } +void AMDGPUAsmPrinter::EmitEndOfAsmFile(Module &M) { + + // This label is used to mark the end of the .text section.
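+ // (Presumably this pairs with the TI_CONSTDATA_START target index added in
+ // AMDGPU.h, so that constants placed right after .text can be addressed
+ // relative to this label; an inference from this patch, not a documented
+ // contract.)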
+ const TargetLoweringObjectFile &TLOF = getObjFileLowering(); + OutStreamer.SwitchSection(TLOF.getTextSection()); + MCSymbol *EndOfTextLabel = + OutContext.GetOrCreateSymbol(StringRef(END_OF_TEXT_LABEL_NAME)); + OutStreamer.EmitLabel(EndOfTextLabel); +} + bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { SetupMachineFunction(MF); @@ -93,6 +138,12 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { false); OutStreamer.emitRawComment(" NumVgprs: " + Twine(KernelInfo.NumVGPR), false); + OutStreamer.emitRawComment(" FloatMode: " + Twine(KernelInfo.FloatMode), + false); + OutStreamer.emitRawComment(" IeeeMode: " + Twine(KernelInfo.IEEEMode), + false); + OutStreamer.emitRawComment(" ScratchSize: " + Twine(KernelInfo.ScratchSize), + false); } else { R600MachineFunctionInfo *MFI = MF.getInfo(); OutStreamer.emitRawComment( @@ -123,25 +174,21 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { return false; } -void AMDGPUAsmPrinter::EmitProgramInfoR600(MachineFunction &MF) { +void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) { unsigned MaxGPR = 0; bool killPixel = false; - const R600RegisterInfo * RI = - static_cast(TM.getRegisterInfo()); - R600MachineFunctionInfo *MFI = MF.getInfo(); + const R600RegisterInfo *RI + = static_cast(TM.getRegisterInfo()); + const R600MachineFunctionInfo *MFI = MF.getInfo(); const AMDGPUSubtarget &STM = TM.getSubtarget(); - for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); - BB != BB_E; ++BB) { - MachineBasicBlock &MBB = *BB; - for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); - I != E; ++I) { - MachineInstr &MI = *I; + for (const MachineBasicBlock &MBB : MF) { + for (const MachineInstr &MI : MBB) { if (MI.getOpcode() == AMDGPU::KILLGT) killPixel = true; unsigned numOperands = MI.getNumOperands(); for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) { - MachineOperand & MO = MI.getOperand(op_idx); + const MachineOperand &MO = MI.getOperand(op_idx); if (!MO.isReg()) continue; unsigned HWReg = RI->getEncodingValue(MO.getReg()) & 0xff; @@ -157,7 +204,7 @@ void AMDGPUAsmPrinter::EmitProgramInfoR600(MachineFunction &MF) { unsigned RsrcReg; if (STM.getGeneration() >= AMDGPUSubtarget::EVERGREEN) { // Evergreen / Northern Islands - switch (MFI->ShaderType) { + switch (MFI->getShaderType()) { default: // Fall through case ShaderType::COMPUTE: RsrcReg = R_0288D4_SQ_PGM_RESOURCES_LS; break; case ShaderType::GEOMETRY: RsrcReg = R_028878_SQ_PGM_RESOURCES_GS; break; @@ -166,7 +213,7 @@ void AMDGPUAsmPrinter::EmitProgramInfoR600(MachineFunction &MF) { } } else { // R600 / R700 - switch (MFI->ShaderType) { + switch (MFI->getShaderType()) { default: // Fall through case ShaderType::GEOMETRY: // Fall through case ShaderType::COMPUTE: // Fall through @@ -181,34 +228,29 @@ void AMDGPUAsmPrinter::EmitProgramInfoR600(MachineFunction &MF) { OutStreamer.EmitIntValue(R_02880C_DB_SHADER_CONTROL, 4); OutStreamer.EmitIntValue(S_02880C_KILL_ENABLE(killPixel), 4); - if (MFI->ShaderType == ShaderType::COMPUTE) { + if (MFI->getShaderType() == ShaderType::COMPUTE) { OutStreamer.EmitIntValue(R_0288E8_SQ_LDS_ALLOC, 4); OutStreamer.EmitIntValue(RoundUpToAlignment(MFI->LDSSize, 4) >> 2, 4); } } void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, - MachineFunction &MF) const { + const MachineFunction &MF) const { uint64_t CodeSize = 0; unsigned MaxSGPR = 0; unsigned MaxVGPR = 0; bool VCCUsed = false; - const SIRegisterInfo * RI = - static_cast(TM.getRegisterInfo()); - - for 
(MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); - BB != BB_E; ++BB) { - MachineBasicBlock &MBB = *BB; - for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); - I != E; ++I) { - MachineInstr &MI = *I; + const SIRegisterInfo *RI + = static_cast(TM.getRegisterInfo()); + for (const MachineBasicBlock &MBB : MF) { + for (const MachineInstr &MI : MBB) { // TODO: CodeSize should account for multiple functions. CodeSize += MI.getDesc().Size; unsigned numOperands = MI.getNumOperands(); for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) { - MachineOperand &MO = MI.getOperand(op_idx); + const MachineOperand &MO = MI.getOperand(op_idx); unsigned width = 0; bool isSGPR = false; @@ -280,18 +322,32 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, if (VCCUsed) MaxSGPR += 2; - ProgInfo.CodeLen = CodeSize; - ProgInfo.NumSGPR = MaxSGPR; ProgInfo.NumVGPR = MaxVGPR; + ProgInfo.NumSGPR = MaxSGPR; + + // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode + // register. + ProgInfo.FloatMode = getFPMode(MF); + + // XXX: Not quite sure what this does, but sc seems to unset this. + ProgInfo.IEEEMode = 0; + + // Do not clamp NAN to 0. + ProgInfo.DX10Clamp = 0; + + const MachineFrameInfo *FrameInfo = MF.getFrameInfo(); + ProgInfo.ScratchSize = FrameInfo->estimateStackSize(MF); + + ProgInfo.CodeLen = CodeSize; } -void AMDGPUAsmPrinter::EmitProgramInfoSI(MachineFunction &MF, +void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF, const SIProgramInfo &KernelInfo) { const AMDGPUSubtarget &STM = TM.getSubtarget(); + const SIMachineFunctionInfo *MFI = MF.getInfo(); - SIMachineFunctionInfo *MFI = MF.getInfo(); unsigned RsrcReg; - switch (MFI->ShaderType) { + switch (MFI->getShaderType()) { default: // Fall through case ShaderType::COMPUTE: RsrcReg = R_00B848_COMPUTE_PGM_RSRC1; break; case ShaderType::GEOMETRY: RsrcReg = R_00B228_SPI_SHADER_PGM_RSRC1_GS; break; @@ -299,26 +355,58 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(MachineFunction &MF, case ShaderType::VERTEX: RsrcReg = R_00B128_SPI_SHADER_PGM_RSRC1_VS; break; } - OutStreamer.EmitIntValue(RsrcReg, 4); - OutStreamer.EmitIntValue(S_00B028_VGPRS(KernelInfo.NumVGPR / 4) | - S_00B028_SGPRS(KernelInfo.NumSGPR / 8), 4); - unsigned LDSAlignShift; if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) { - // LDS is allocated in 64 dword blocks + // LDS is allocated in 64 dword blocks. LDSAlignShift = 8; } else { - // LDS is allocated in 128 dword blocks + // LDS is allocated in 128 dword blocks. LDSAlignShift = 9; } + unsigned LDSBlocks = - RoundUpToAlignment(MFI->LDSSize, 1 << LDSAlignShift) >> LDSAlignShift; + RoundUpToAlignment(MFI->LDSSize, 1 << LDSAlignShift) >> LDSAlignShift; + + // Scratch is allocated in 256 dword blocks. + unsigned ScratchAlignShift = 10; + // We need to program the hardware with the amount of scratch memory that + // is used by the entire wave. KernelInfo.ScratchSize is the amount of + // scratch memory used per thread. 
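+ // Worked example with hypothetical numbers: at a wavefront size of 64, a
+ // kernel using 68 bytes of scratch per thread needs 68 * 64 = 4352 bytes per
+ // wave; RoundUpToAlignment(4352, 1 << 10) = 5120 bytes, so ScratchBlocks is
+ // 5120 >> 10 = 5 blocks of 256 dwords.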
+ unsigned ScratchBlocks = + RoundUpToAlignment(KernelInfo.ScratchSize * STM.getWavefrontSize(), + 1 << ScratchAlignShift) >> ScratchAlignShift; + + if (MFI->getShaderType() == ShaderType::COMPUTE) { + OutStreamer.EmitIntValue(R_00B848_COMPUTE_PGM_RSRC1, 4); + + const uint32_t ComputePGMRSrc1 = + S_00B848_VGPRS(KernelInfo.NumVGPR / 4) | + S_00B848_SGPRS(KernelInfo.NumSGPR / 8) | + S_00B848_PRIORITY(KernelInfo.Priority) | + S_00B848_FLOAT_MODE(KernelInfo.FloatMode) | + S_00B848_PRIV(KernelInfo.Priv) | + S_00B848_DX10_CLAMP(KernelInfo.DX10Clamp) | + S_00B848_DEBUG_MODE(KernelInfo.DebugMode) | + S_00B848_IEEE_MODE(KernelInfo.IEEEMode); + + OutStreamer.EmitIntValue(ComputePGMRSrc1, 4); - if (MFI->ShaderType == ShaderType::COMPUTE) { OutStreamer.EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4); - OutStreamer.EmitIntValue(S_00B84C_LDS_SIZE(LDSBlocks), 4); + const uint32_t ComputePGMRSrc2 = + S_00B84C_LDS_SIZE(LDSBlocks) | + S_00B02C_SCRATCH_EN(ScratchBlocks > 0); + + OutStreamer.EmitIntValue(ComputePGMRSrc2, 4); + + OutStreamer.EmitIntValue(R_00B860_COMPUTE_TMPRING_SIZE, 4); + OutStreamer.EmitIntValue(S_00B860_WAVESIZE(ScratchBlocks), 4); + } else { + OutStreamer.EmitIntValue(RsrcReg, 4); + OutStreamer.EmitIntValue(S_00B028_VGPRS(KernelInfo.NumVGPR / 4) | + S_00B028_SGPRS(KernelInfo.NumSGPR / 8), 4); } - if (MFI->ShaderType == ShaderType::PIXEL) { + + if (MFI->getShaderType() == ShaderType::PIXEL) { OutStreamer.EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4); OutStreamer.EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(LDSBlocks), 4); OutStreamer.EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4); diff --git a/lib/Target/R600/AMDGPUAsmPrinter.h b/lib/Target/R600/AMDGPUAsmPrinter.h index 71adc9a4d1f6..19907cfd013e 100644 --- a/lib/Target/R600/AMDGPUAsmPrinter.h +++ b/lib/Target/R600/AMDGPUAsmPrinter.h @@ -16,7 +16,6 @@ #define AMDGPU_ASMPRINTER_H #include "llvm/CodeGen/AsmPrinter.h" -#include #include namespace llvm { @@ -25,24 +24,41 @@ class AMDGPUAsmPrinter : public AsmPrinter { private: struct SIProgramInfo { SIProgramInfo() : - CodeLen(0), + NumVGPR(0), NumSGPR(0), - NumVGPR(0) {} + Priority(0), + FloatMode(0), + Priv(0), + DX10Clamp(0), + DebugMode(0), + IEEEMode(0), + ScratchSize(0), + CodeLen(0) {} + // Fields set in PGM_RSRC1 pm4 packet. + uint32_t NumVGPR; + uint32_t NumSGPR; + uint32_t Priority; + uint32_t FloatMode; + uint32_t Priv; + uint32_t DX10Clamp; + uint32_t DebugMode; + uint32_t IEEEMode; + uint32_t ScratchSize; + + // Bonus information for debugging. uint64_t CodeLen; - unsigned NumSGPR; - unsigned NumVGPR; }; - void getSIProgramInfo(SIProgramInfo &Out, MachineFunction &MF) const; - void findNumUsedRegistersSI(MachineFunction &MF, + void getSIProgramInfo(SIProgramInfo &Out, const MachineFunction &MF) const; + void findNumUsedRegistersSI(const MachineFunction &MF, unsigned &NumSGPR, unsigned &NumVGPR) const; /// \brief Emit register usage information so that the GPU driver /// can correctly set up the GPU state.
- void EmitProgramInfoR600(MachineFunction &MF); - void EmitProgramInfoSI(MachineFunction &MF, const SIProgramInfo &KernelInfo); + void EmitProgramInfoR600(const MachineFunction &MF); + void EmitProgramInfoSI(const MachineFunction &MF, const SIProgramInfo &KernelInfo); public: explicit AMDGPUAsmPrinter(TargetMachine &TM, MCStreamer &Streamer); @@ -56,6 +72,8 @@ class AMDGPUAsmPrinter : public AsmPrinter { /// Implemented in AMDGPUMCInstLower.cpp void EmitInstruction(const MachineInstr *MI) override; + void EmitEndOfAsmFile(Module &M) override; + protected: bool DisasmEnabled; std::vector DisasmLines, HexLines; diff --git a/lib/Target/R600/AMDGPUCallingConv.td b/lib/Target/R600/AMDGPUCallingConv.td index 5f8ad8c3b171..3586c8826908 100644 --- a/lib/Target/R600/AMDGPUCallingConv.td +++ b/lib/Target/R600/AMDGPUCallingConv.td @@ -62,11 +62,11 @@ def CC_AMDGPU : CallingConv<[ CCIf<"State.getTarget().getSubtarget().getGeneration() >= " "AMDGPUSubtarget::SOUTHERN_ISLANDS && " "State.getMachineFunction().getInfo()->"# - "ShaderType == ShaderType::COMPUTE", CCDelegateTo>, + "getShaderType() == ShaderType::COMPUTE", CCDelegateTo>, CCIf<"State.getTarget().getSubtarget().getGeneration() < " "AMDGPUSubtarget::SOUTHERN_ISLANDS && " "State.getMachineFunction().getInfo()->" - "ShaderType == ShaderType::COMPUTE", CCDelegateTo>, + "getShaderType() == ShaderType::COMPUTE", CCDelegateTo>, CCIf<"State.getTarget().getSubtarget()"# ".getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS", CCDelegateTo>, CCIf<"State.getTarget().getSubtarget()"# diff --git a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp index b4e86ce3a1a3..cc17b7ec6183 100644 --- a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp +++ b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp @@ -16,9 +16,13 @@ #include "AMDGPURegisterInfo.h" #include "AMDGPUSubtarget.h" #include "R600InstrInfo.h" +#include "SIDefines.h" #include "SIISelLowering.h" +#include "SIMachineFunctionInfo.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/PseudoSourceValue.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGISel.h" #include "llvm/IR/Function.h" @@ -84,6 +88,17 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel { SDValue& Offset); bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset); bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset); + bool SelectMUBUFAddr64(SDValue Addr, SDValue &Ptr, SDValue &Offset, + SDValue &ImmOffset) const; + bool SelectMUBUFScratch(SDValue Addr, SDValue &RSrc, SDValue &VAddr, + SDValue &SOffset, SDValue &ImmOffset) const; + bool SelectMUBUFAddr32(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, + SDValue &SOffset, SDValue &Offset, SDValue &Offen, + SDValue &Idxen, SDValue &GLC, SDValue &SLC, + SDValue &TFE) const; + + SDNode *SelectADD_SUB_I64(SDNode *N); + SDNode *SelectDIV_SCALE(SDNode *N); // Include the pieces autogenerated from the target description. #include "AMDGPUGenDAGISel.inc" @@ -212,50 +227,13 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { // We are selecting i64 ADD here instead of custom lower it during // DAG legalization, so we can fold some i64 ADDs used for address // calculation into the LOAD and STORE instructions. 
- case ISD::ADD: { + case ISD::ADD: + case ISD::SUB: { if (N->getValueType(0) != MVT::i64 || ST.getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) break; - SDLoc DL(N); - SDValue LHS = N->getOperand(0); - SDValue RHS = N->getOperand(1); - - SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32); - SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32); - - SDNode *Lo0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, - DL, MVT::i32, LHS, Sub0); - SDNode *Hi0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, - DL, MVT::i32, LHS, Sub1); - - SDNode *Lo1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, - DL, MVT::i32, RHS, Sub0); - SDNode *Hi1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, - DL, MVT::i32, RHS, Sub1); - - SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue); - - SmallVector AddLoArgs; - AddLoArgs.push_back(SDValue(Lo0, 0)); - AddLoArgs.push_back(SDValue(Lo1, 0)); - - SDNode *AddLo = CurDAG->getMachineNode( - isCFDepth0() ? AMDGPU::S_ADD_I32 : AMDGPU::V_ADD_I32_e32, - DL, VTList, AddLoArgs); - SDValue Carry = SDValue(AddLo, 1); - SDNode *AddHi = CurDAG->getMachineNode( - isCFDepth0() ? AMDGPU::S_ADDC_U32 : AMDGPU::V_ADDC_U32_e32, - DL, MVT::i32, SDValue(Hi0, 0), SDValue(Hi1, 0), Carry); - - SDValue Args[5] = { - CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, MVT::i32), - SDValue(AddLo,0), - Sub0, - SDValue(AddHi,0), - Sub1, - }; - return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, MVT::i64, Args); + return SelectADD_SUB_I64(N); } case ISD::SCALAR_TO_VECTOR: case AMDGPUISD::BUILD_VERTICAL_VECTOR: @@ -489,6 +467,9 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { PackedOffsetWidth); } + case AMDGPUISD::DIV_SCALE: { + return SelectDIV_SCALE(N); + } } return SelectCode(N); } @@ -682,6 +663,222 @@ bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base, return true; } +SDNode *AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) { + SDLoc DL(N); + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + + bool IsAdd = (N->getOpcode() == ISD::ADD); + + SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32); + SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32); + + SDNode *Lo0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, + DL, MVT::i32, LHS, Sub0); + SDNode *Hi0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, + DL, MVT::i32, LHS, Sub1); + + SDNode *Lo1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, + DL, MVT::i32, RHS, Sub0); + SDNode *Hi1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, + DL, MVT::i32, RHS, Sub1); + + SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue); + SDValue AddLoArgs[] = { SDValue(Lo0, 0), SDValue(Lo1, 0) }; + + + unsigned Opc = IsAdd ? AMDGPU::S_ADD_I32 : AMDGPU::S_SUB_I32; + unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32; + + if (!isCFDepth0()) { + Opc = IsAdd ? AMDGPU::V_ADD_I32_e32 : AMDGPU::V_SUB_I32_e32; + CarryOpc = IsAdd ? 
AMDGPU::V_ADDC_U32_e32 : AMDGPU::V_SUBB_U32_e32; + } + + SDNode *AddLo = CurDAG->getMachineNode(Opc, DL, VTList, AddLoArgs); + SDValue Carry(AddLo, 1); + SDNode *AddHi + = CurDAG->getMachineNode(CarryOpc, DL, MVT::i32, + SDValue(Hi0, 0), SDValue(Hi1, 0), Carry); + + SDValue Args[5] = { + CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, MVT::i32), + SDValue(AddLo,0), + Sub0, + SDValue(AddHi,0), + Sub1, + }; + return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, MVT::i64, Args); +} + +SDNode *AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) { + SDLoc SL(N); + EVT VT = N->getValueType(0); + + assert(VT == MVT::f32 || VT == MVT::f64); + + unsigned Opc + = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64 : AMDGPU::V_DIV_SCALE_F32; + + const SDValue Zero = CurDAG->getTargetConstant(0, MVT::i32); + + SDValue Ops[] = { + N->getOperand(0), + N->getOperand(1), + N->getOperand(2), + Zero, + Zero, + Zero, + Zero + }; + + return CurDAG->SelectNodeTo(N, Opc, VT, MVT::i1, Ops); +} + +static SDValue wrapAddr64Rsrc(SelectionDAG *DAG, SDLoc DL, SDValue Ptr) { + return SDValue(DAG->getMachineNode(AMDGPU::SI_ADDR64_RSRC, DL, MVT::v4i32, + Ptr), 0); +} + +static bool isLegalMUBUFImmOffset(const ConstantSDNode *Imm) { + return isUInt<12>(Imm->getZExtValue()); +} + +bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &Ptr, + SDValue &Offset, + SDValue &ImmOffset) const { + SDLoc DL(Addr); + + if (CurDAG->isBaseWithConstantOffset(Addr)) { + SDValue N0 = Addr.getOperand(0); + SDValue N1 = Addr.getOperand(1); + ConstantSDNode *C1 = cast<ConstantSDNode>(N1); + + if (isLegalMUBUFImmOffset(C1)) { + + if (N0.getOpcode() == ISD::ADD) { + // (add (add N2, N3), C1) + SDValue N2 = N0.getOperand(0); + SDValue N3 = N0.getOperand(1); + Ptr = wrapAddr64Rsrc(CurDAG, DL, N2); + Offset = N3; + ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), MVT::i16); + return true; + } + + // (add N0, C1) + Ptr = wrapAddr64Rsrc(CurDAG, DL, CurDAG->getTargetConstant(0, MVT::i64)); + Offset = N0; + ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), MVT::i16); + return true; + } + } + if (Addr.getOpcode() == ISD::ADD) { + // (add N0, N1) + SDValue N0 = Addr.getOperand(0); + SDValue N1 = Addr.getOperand(1); + Ptr = wrapAddr64Rsrc(CurDAG, DL, N0); + Offset = N1; + ImmOffset = CurDAG->getTargetConstant(0, MVT::i16); + return true; + } + + // default case + Ptr = wrapAddr64Rsrc(CurDAG, DL, CurDAG->getConstant(0, MVT::i64)); + Offset = Addr; + ImmOffset = CurDAG->getTargetConstant(0, MVT::i16); + return true; +} + +/// \brief Return a resource descriptor with the 'Add TID' bit enabled. +/// The TID (Thread ID) is multiplied by the stride value (bits [61:48] +/// of the resource descriptor) to create an offset, which is added to the +/// resource pointer.
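+/// In effect each lane then addresses base + TID * stride + offset, which
+/// keeps per-lane scratch slots from overlapping. (Illustrative summary of
+/// the behavior described above, not a hardware-manual definition.)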
+static SDValue buildScratchRSRC(SelectionDAG *DAG, SDLoc DL, SDValue Ptr) { + + uint64_t Rsrc = AMDGPU::RSRC_DATA_FORMAT | AMDGPU::RSRC_TID_ENABLE | + 0xffffffff; + + SDValue PtrLo = DAG->getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr); + SDValue PtrHi = DAG->getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr); + SDValue DataLo = DAG->getTargetConstant( + Rsrc & APInt::getAllOnesValue(32).getZExtValue(), MVT::i32); + SDValue DataHi = DAG->getTargetConstant(Rsrc >> 32, MVT::i32); + + const SDValue Ops[] = { PtrLo, PtrHi, DataLo, DataHi }; + return SDValue(DAG->getMachineNode(AMDGPU::SI_BUFFER_RSRC, DL, + MVT::v4i32, Ops), 0); +} + +bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc, + SDValue &VAddr, SDValue &SOffset, + SDValue &ImmOffset) const { + + SDLoc DL(Addr); + MachineFunction &MF = CurDAG->getMachineFunction(); + const SIRegisterInfo *TRI = static_cast(MF.getTarget().getRegisterInfo()); + MachineRegisterInfo &MRI = MF.getRegInfo(); + + + unsigned ScratchPtrReg = + TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_PTR); + unsigned ScratchOffsetReg = + TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET); + + Rsrc = buildScratchRSRC(CurDAG, DL, CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL, MRI.getLiveInVirtReg(ScratchPtrReg), MVT::i64)); + SOffset = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL, + MRI.getLiveInVirtReg(ScratchOffsetReg), MVT::i32); + + // (add n0, c1) + if (CurDAG->isBaseWithConstantOffset(Addr)) { + SDValue N1 = Addr.getOperand(1); + ConstantSDNode *C1 = cast(N1); + + if (isLegalMUBUFImmOffset(C1)) { + VAddr = Addr.getOperand(0); + ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), MVT::i16); + return true; + } + } + + // (add FI, n0) + if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) && + isa(Addr.getOperand(0))) { + VAddr = Addr.getOperand(1); + ImmOffset = Addr.getOperand(0); + return true; + } + + // (FI) + if (isa(Addr)) { + VAddr = SDValue(CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32, + CurDAG->getConstant(0, MVT::i32)), 0); + ImmOffset = Addr; + return true; + } + + // (node) + VAddr = Addr; + ImmOffset = CurDAG->getTargetConstant(0, MVT::i16); + return true; +} + +bool AMDGPUDAGToDAGISel::SelectMUBUFAddr32(SDValue Addr, SDValue &SRsrc, + SDValue &VAddr, SDValue &SOffset, + SDValue &Offset, SDValue &Offen, + SDValue &Idxen, SDValue &GLC, + SDValue &SLC, SDValue &TFE) const { + + GLC = CurDAG->getTargetConstant(0, MVT::i1); + SLC = CurDAG->getTargetConstant(0, MVT::i1); + TFE = CurDAG->getTargetConstant(0, MVT::i1); + + Idxen = CurDAG->getTargetConstant(0, MVT::i1); + Offen = CurDAG->getTargetConstant(1, MVT::i1); + + return SelectMUBUFScratch(Addr, SRsrc, VAddr, SOffset, Offset); +} + void AMDGPUDAGToDAGISel::PostprocessISelDAG() { const AMDGPUTargetLowering& Lowering = *static_cast(getTargetLowering()); diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp index dc39bee6511e..ffd6357a9a07 100644 --- a/lib/Target/R600/AMDGPUISelLowering.cpp +++ b/lib/Target/R600/AMDGPUISelLowering.cpp @@ -16,12 +16,11 @@ #include "AMDGPUISelLowering.h" #include "AMDGPU.h" #include "AMDGPUFrameLowering.h" +#include "AMDGPUIntrinsicInfo.h" #include "AMDGPURegisterInfo.h" #include "AMDGPUSubtarget.h" -#include "AMDILIntrinsicInfo.h" #include "R600MachineFunctionInfo.h" #include "SIMachineFunctionInfo.h" -#include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/MachineFunction.h" #include 
"llvm/CodeGen/MachineRegisterInfo.h" @@ -108,8 +107,13 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : Subtarget = &TM.getSubtarget(); - // Initialize target lowering borrowed from AMDIL - InitAMDILLowering(); + setOperationAction(ISD::Constant, MVT::i32, Legal); + setOperationAction(ISD::Constant, MVT::i64, Legal); + setOperationAction(ISD::ConstantFP, MVT::f32, Legal); + setOperationAction(ISD::ConstantFP, MVT::f64, Legal); + + setOperationAction(ISD::BR_JT, MVT::Other, Expand); + setOperationAction(ISD::BRIND, MVT::Other, Expand); // We need to custom lower some of the intrinsics setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); @@ -134,6 +138,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : setOperationAction(ISD::STORE, MVT::v2f32, Promote); AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32); + setOperationAction(ISD::STORE, MVT::i64, Promote); + AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32); + setOperationAction(ISD::STORE, MVT::v4f32, Promote); AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32); @@ -177,6 +184,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : setOperationAction(ISD::LOAD, MVT::v2f32, Promote); AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32); + setOperationAction(ISD::LOAD, MVT::i64, Promote); + AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32); + setOperationAction(ISD::LOAD, MVT::v4f32, Promote); AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32); @@ -231,13 +241,19 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); } + setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); + + setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand); + setTruncStoreAction(MVT::f32, MVT::f16, Expand); + setTruncStoreAction(MVT::f64, MVT::f16, Expand); + const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 }; for (MVT VT : ScalarIntVTs) { setOperationAction(ISD::SREM, VT, Expand); - setOperationAction(ISD::SDIV, VT, Custom); + setOperationAction(ISD::SDIV, VT, Expand); // GPU does not have divrem function for signed or unsigned. - setOperationAction(ISD::SDIVREM, VT, Expand); + setOperationAction(ISD::SDIVREM, VT, Custom); setOperationAction(ISD::UDIVREM, VT, Custom); // GPU does not have [S|U]MUL_LOHI functions as a single instruction. @@ -260,16 +276,20 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : setOperationAction(ISD::ROTL, MVT::i64, Expand); setOperationAction(ISD::ROTR, MVT::i64, Expand); - setOperationAction(ISD::FP_TO_SINT, MVT::i64, Expand); setOperationAction(ISD::MUL, MVT::i64, Expand); setOperationAction(ISD::MULHU, MVT::i64, Expand); setOperationAction(ISD::MULHS, MVT::i64, Expand); - setOperationAction(ISD::SUB, MVT::i64, Expand); setOperationAction(ISD::UDIV, MVT::i32, Expand); setOperationAction(ISD::UREM, MVT::i32, Expand); setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); + if (!Subtarget->hasFFBH()) + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand); + + if (!Subtarget->hasFFBL()) + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand); + static const MVT::SimpleValueType VectorIntTypes[] = { MVT::v2i32, MVT::v4i32 }; @@ -291,16 +311,21 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : setOperationAction(ISD::SINT_TO_FP, VT, Expand); setOperationAction(ISD::UINT_TO_FP, VT, Expand); // TODO: Implement custom UREM / SREM routines. 
- setOperationAction(ISD::SDIV, VT, Custom); + setOperationAction(ISD::SDIV, VT, Expand); setOperationAction(ISD::UDIV, VT, Expand); setOperationAction(ISD::SREM, VT, Expand); setOperationAction(ISD::UREM, VT, Expand); setOperationAction(ISD::SMUL_LOHI, VT, Expand); setOperationAction(ISD::UMUL_LOHI, VT, Expand); - setOperationAction(ISD::SDIVREM, VT, Expand); + setOperationAction(ISD::SDIVREM, VT, Custom); setOperationAction(ISD::UDIVREM, VT, Custom); + setOperationAction(ISD::ADDC, VT, Expand); + setOperationAction(ISD::SUBC, VT, Expand); + setOperationAction(ISD::ADDE, VT, Expand); + setOperationAction(ISD::SUBE, VT, Expand); setOperationAction(ISD::SELECT, VT, Expand); setOperationAction(ISD::VSELECT, VT, Expand); + setOperationAction(ISD::SELECT_CC, VT, Expand); setOperationAction(ISD::XOR, VT, Expand); setOperationAction(ISD::BSWAP, VT, Expand); setOperationAction(ISD::CTPOP, VT, Expand); @@ -308,6 +333,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand); setOperationAction(ISD::CTLZ, VT, Expand); setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand); + setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand); } static const MVT::SimpleValueType FloatVectorTypes[] = { @@ -320,29 +346,43 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : setOperationAction(ISD::FCEIL, VT, Expand); setOperationAction(ISD::FCOS, VT, Expand); setOperationAction(ISD::FDIV, VT, Expand); + setOperationAction(ISD::FEXP2, VT, Expand); + setOperationAction(ISD::FLOG2, VT, Expand); setOperationAction(ISD::FPOW, VT, Expand); setOperationAction(ISD::FFLOOR, VT, Expand); setOperationAction(ISD::FTRUNC, VT, Expand); setOperationAction(ISD::FMUL, VT, Expand); + setOperationAction(ISD::FMA, VT, Expand); setOperationAction(ISD::FRINT, VT, Expand); + setOperationAction(ISD::FNEARBYINT, VT, Expand); setOperationAction(ISD::FSQRT, VT, Expand); setOperationAction(ISD::FSIN, VT, Expand); setOperationAction(ISD::FSUB, VT, Expand); setOperationAction(ISD::FNEG, VT, Expand); setOperationAction(ISD::SELECT, VT, Expand); setOperationAction(ISD::VSELECT, VT, Expand); + setOperationAction(ISD::SELECT_CC, VT, Expand); setOperationAction(ISD::FCOPYSIGN, VT, Expand); + setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand); } + setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom); + setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom); + setTargetDAGCombine(ISD::MUL); setTargetDAGCombine(ISD::SELECT_CC); + setTargetDAGCombine(ISD::STORE); setSchedulingPreference(Sched::RegPressure); setJumpIsExpensive(true); + setSelectIsExpensive(false); + PredictableSelectIsExpensive = false; + // There are no integer divide instructions, and these expand to a pretty // large sequence of instructions. setIntDivIsCheap(false); + setPow2DivIsCheap(false); // TODO: Investigate this when 64-bit divides are implemented. addBypassSlowDiv(64, 32); @@ -361,6 +401,10 @@ MVT AMDGPUTargetLowering::getVectorIdxTy() const { return MVT::i32; } +bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const { + return true; +} + // The backend supports 32 and 64 bit floating point immediates. // FIXME: Why are we reporting vectors of FP immediates as legal? 
bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { @@ -428,6 +472,10 @@ bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const { return Src == MVT::i32 && Dest == MVT::i64; } +bool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const { + return isZExtFree(Val.getValueType(), VT2); +} + bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const { // There aren't really 64-bit registers, but pairs of 32-bit ones and only a // limited number of native 64-bit operations. Shrinking an operation to fit @@ -489,7 +537,6 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, llvm_unreachable("Custom lowering code for this" "instruction is not implemented yet!"); break; - // AMDGPU DAG lowering. case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG); case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG); @@ -498,14 +545,13 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, case ISD::SDIV: return LowerSDIV(Op, DAG); case ISD::SREM: return LowerSREM(Op, DAG); case ISD::UDIVREM: return LowerUDIVREM(Op, DAG); + case ISD::SDIVREM: return LowerSDIVREM(Op, DAG); case ISD::FCEIL: return LowerFCEIL(Op, DAG); case ISD::FTRUNC: return LowerFTRUNC(Op, DAG); case ISD::FRINT: return LowerFRINT(Op, DAG); + case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG); case ISD::FFLOOR: return LowerFFLOOR(Op, DAG); case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); - - // AMDIL DAG lowering. - case ISD::BRCOND: return LowerBRCOND(Op, DAG); } return Op; } @@ -522,95 +568,23 @@ void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N, // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do // nothing here and let the illegal result integer be handled normally. 
return; - case ISD::UDIV: { - SDValue Op = SDValue(N, 0); - SDLoc DL(Op); - EVT VT = Op.getValueType(); - SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), - N->getOperand(0), N->getOperand(1)); - Results.push_back(UDIVREM); - break; - } - case ISD::UREM: { - SDValue Op = SDValue(N, 0); - SDLoc DL(Op); - EVT VT = Op.getValueType(); - SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), - N->getOperand(0), N->getOperand(1)); - Results.push_back(UDIVREM.getValue(1)); - break; - } - case ISD::UDIVREM: { - SDValue Op = SDValue(N, 0); - SDLoc DL(Op); - EVT VT = Op.getValueType(); - EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext()); - - SDValue one = DAG.getConstant(1, HalfVT); - SDValue zero = DAG.getConstant(0, HalfVT); - - //HiLo split - SDValue LHS = N->getOperand(0); - SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, zero); - SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, one); - - SDValue RHS = N->getOperand(1); - SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, zero); - SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, one); - - // Get Speculative values - SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo); - SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo); - - SDValue REM_Hi = zero; - SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, zero, REM_Part, LHS_Hi, ISD::SETEQ); - - SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, zero, DIV_Part, zero, ISD::SETEQ); - SDValue DIV_Lo = zero; - - const unsigned halfBitWidth = HalfVT.getSizeInBits(); - - for (unsigned i = 0; i < halfBitWidth; ++i) { - SDValue POS = DAG.getConstant(halfBitWidth - i - 1, HalfVT); - // Get Value of high bit - SDValue HBit; - if (halfBitWidth == 32 && Subtarget->hasBFE()) { - HBit = DAG.getNode(AMDGPUISD::BFE_U32, DL, HalfVT, LHS_Lo, POS, one); - } else { - HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS); - HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, one); - } - - SDValue Carry = DAG.getNode(ISD::SRL, DL, HalfVT, REM_Lo, - DAG.getConstant(halfBitWidth - 1, HalfVT)); - REM_Hi = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Hi, one); - REM_Hi = DAG.getNode(ISD::OR, DL, HalfVT, REM_Hi, Carry); - - REM_Lo = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Lo, one); - REM_Lo = DAG.getNode(ISD::OR, DL, HalfVT, REM_Lo, HBit); - - - SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi); - - SDValue BIT = DAG.getConstant(1 << (halfBitWidth - i - 1), HalfVT); - SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, zero, ISD::SETGE); - - DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT); - - // Update REM - - SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS); - - REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETGE); - REM_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, zero); - REM_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, one); - } + case ISD::LOAD: { + SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode(); + if (!Node) + return; - SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi); - SDValue DIV = DAG.getNode(ISD::BUILD_PAIR, DL, VT, DIV_Lo, DIV_Hi); - Results.push_back(DIV); - Results.push_back(REM); - break; + Results.push_back(SDValue(Node, 0)); + Results.push_back(SDValue(Node, 1)); + // XXX: LLVM seems not to replace Chain Value inside CustomWidenLowerNode + // function + DAG.ReplaceAllUsesOfValueWith(SDValue(N,1), SDValue(Node, 1)); + return; + } + case ISD::STORE: { + SDValue Lowered = 
LowerSTORE(SDValue(N, 0), DAG); + if (Lowered.getNode()) + Results.push_back(Lowered); + return; } default: return; @@ -837,6 +811,46 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getNode(AMDGPUISD::CLAMP, DL, VT, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + case Intrinsic::AMDGPU_div_scale: { + // 3rd parameter required to be a constant. + const ConstantSDNode *Param = dyn_cast(Op.getOperand(3)); + if (!Param) + return DAG.getUNDEF(VT); + + // Translate to the operands expected by the machine instruction. The + // first parameter must be the same as the first instruction. + SDValue Numerator = Op.getOperand(1); + SDValue Denominator = Op.getOperand(2); + SDValue Src0 = Param->isAllOnesValue() ? Numerator : Denominator; + + return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, VT, + Src0, Denominator, Numerator); + } + + case Intrinsic::AMDGPU_div_fmas: + return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + + case Intrinsic::AMDGPU_div_fixup: + return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + + case Intrinsic::AMDGPU_trig_preop: + return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT, + Op.getOperand(1), Op.getOperand(2)); + + case Intrinsic::AMDGPU_rcp: + return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1)); + + case Intrinsic::AMDGPU_rsq: + return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1)); + + case AMDGPUIntrinsic::AMDGPU_legacy_rsq: + return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1)); + + case Intrinsic::AMDGPU_rsq_clamped: + return DAG.getNode(AMDGPUISD::RSQ_CLAMPED, DL, VT, Op.getOperand(1)); + case AMDGPUIntrinsic::AMDGPU_imax: return DAG.getNode(AMDGPUISD::SMAX, DL, VT, Op.getOperand(1), Op.getOperand(2)); @@ -909,6 +923,8 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case AMDGPUIntrinsic::AMDIL_round_nearest: // Legacy name. return DAG.getNode(ISD::FRINT, DL, VT, Op.getOperand(1)); + case AMDGPUIntrinsic::AMDGPU_trunc: // Legacy name. 
+ return DAG.getNode(ISD::FTRUNC, DL, VT, Op.getOperand(1)); } } @@ -999,22 +1015,36 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue &Op, SelectionDAG &DAG) const { LoadSDNode *Load = dyn_cast(Op); EVT MemEltVT = Load->getMemoryVT().getVectorElementType(); + EVT LoadVT = Op.getValueType(); EVT EltVT = Op.getValueType().getVectorElementType(); EVT PtrVT = Load->getBasePtr().getValueType(); + unsigned NumElts = Load->getMemoryVT().getVectorNumElements(); SmallVector Loads; + SmallVector Chains; + SDLoc SL(Op); for (unsigned i = 0, e = NumElts; i != e; ++i) { SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, Load->getBasePtr(), DAG.getConstant(i * (MemEltVT.getSizeInBits() / 8), PtrVT)); - Loads.push_back(DAG.getExtLoad(Load->getExtensionType(), SL, EltVT, - Load->getChain(), Ptr, - MachinePointerInfo(Load->getMemOperand()->getValue()), - MemEltVT, Load->isVolatile(), Load->isNonTemporal(), - Load->getAlignment())); + + SDValue NewLoad + = DAG.getExtLoad(Load->getExtensionType(), SL, EltVT, + Load->getChain(), Ptr, + MachinePointerInfo(Load->getMemOperand()->getValue()), + MemEltVT, Load->isVolatile(), Load->isNonTemporal(), + Load->getAlignment()); + Loads.push_back(NewLoad.getValue(0)); + Chains.push_back(NewLoad.getValue(1)); } - return DAG.getNode(ISD::BUILD_VECTOR, SL, Op.getValueType(), Loads); + + SDValue Ops[] = { + DAG.getNode(ISD::BUILD_VECTOR, SL, LoadVT, Loads), + DAG.getNode(ISD::TokenFactor, SL, MVT::Other, Chains) + }; + + return DAG.getMergeValues(Ops, SL); } SDValue AMDGPUTargetLowering::MergeVectorStore(const SDValue &Op, @@ -1117,7 +1147,13 @@ SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { Load->getBasePtr(), MemVT, Load->getMemOperand()); - return DAG.getNode(ISD::getExtForLoadExtType(ExtType), DL, VT, ExtLoad32); + + SDValue Ops[] = { + DAG.getNode(ISD::getExtForLoadExtType(ExtType), DL, VT, ExtLoad32), + ExtLoad32.getValue(1) + }; + + return DAG.getMergeValues(Ops, DL); } if (ExtType == ISD::NON_EXTLOAD && VT.getSizeInBits() < 32) { @@ -1131,21 +1167,13 @@ SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, BasePtr, MVT::i8, MMO); - return DAG.getNode(ISD::TRUNCATE, DL, VT, NewLD); - } - // Lower loads constant address space global variable loads - if (Load->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS && - isa( - GetUnderlyingObject(Load->getMemOperand()->getValue()))) { + SDValue Ops[] = { + DAG.getNode(ISD::TRUNCATE, DL, VT, NewLD), + NewLD.getValue(1) + }; - SDValue Ptr = DAG.getZExtOrTrunc(Load->getBasePtr(), DL, - getPointerTy(AMDGPUAS::PRIVATE_ADDRESS)); - Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, - DAG.getConstant(2, MVT::i32)); - return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(), - Load->getChain(), Ptr, - DAG.getTargetConstant(0, MVT::i32), Op.getOperand(2)); + return DAG.getMergeValues(Ops, DL); } if (Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS || @@ -1170,10 +1198,21 @@ SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { EVT MemEltVT = MemVT.getScalarType(); if (ExtType == ISD::SEXTLOAD) { SDValue MemEltVTNode = DAG.getValueType(MemEltVT); - return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode); + + SDValue Ops[] = { + DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode), + Load->getChain() + }; + + return DAG.getMergeValues(Ops, DL); } - return DAG.getZeroExtendInReg(Ret, DL, MemEltVT); + SDValue Ops[] = { + 
DAG.getZeroExtendInReg(Ret, DL, MemEltVT), + Load->getChain() + }; + + return DAG.getMergeValues(Ops, DL); } SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { @@ -1276,7 +1315,8 @@ SDValue AMDGPUTargetLowering::LowerSDIV24(SDValue Op, SelectionDAG &DAG) const { SDValue fb = DAG.getNode(ISD::SINT_TO_FP, DL, FLTTY, ib); // float fq = native_divide(fa, fb); - SDValue fq = DAG.getNode(AMDGPUISD::DIV_INF, DL, FLTTY, fa, fb); + SDValue fq = DAG.getNode(ISD::FMUL, DL, FLTTY, + fa, DAG.getNode(AMDGPUISD::RCP, DL, FLTTY, fb)); // fq = trunc(fq); fq = DAG.getNode(ISD::FTRUNC, DL, FLTTY, fq); @@ -1581,6 +1621,44 @@ SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op, return DAG.getMergeValues(Ops, DL); } +SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + EVT VT = Op.getValueType(); + + SDValue Zero = DAG.getConstant(0, VT); + SDValue NegOne = DAG.getConstant(-1, VT); + + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + + SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT); + SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT); + SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign); + SDValue RSign = LHSign; // Remainder sign is the same as LHS + + LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign); + RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign); + + LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign); + RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign); + + SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS); + SDValue Rem = Div.getValue(1); + + Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign); + Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign); + + Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign); + Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign); + + SDValue Res[2] = { + Div, + Rem + }; + return DAG.getMergeValues(Res, DL); +} + SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const { SDLoc SL(Op); SDValue Src = Op.getOperand(0); @@ -1631,7 +1709,7 @@ SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const { DAG.getConstant(1023, MVT::i32)); // Extract the sign bit. - const SDValue SignBitMask = DAG.getConstant(1ul << 31, MVT::i32); + const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, MVT::i32); SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask); // Extend back to to 64-bits. @@ -1640,7 +1718,8 @@ SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const { SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64); SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src); - const SDValue FractMask = DAG.getConstant((1L << FractBits) - 1, MVT::i64); + const SDValue FractMask + = DAG.getConstant((UINT64_C(1) << FractBits) - 1, MVT::i64); SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp); SDValue Not = DAG.getNOT(SL, Shr, MVT::i64); @@ -1683,6 +1762,13 @@ SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const { return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2); } +SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const { + // FNEARBYINT and FRINT are the same, except in their handling of FP + // exceptions. Those aren't really meaningful for us, and OpenCL only has + // rint, so just treat them as equivalent. 
+ return DAG.getNode(ISD::FRINT, SDLoc(Op), Op.getValueType(), Op.getOperand(0)); +} + SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const { SDLoc SL(Op); SDValue Src = Op.getOperand(0); @@ -1807,41 +1893,96 @@ static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, return DAG.getConstant(Src0 >> Offset, MVT::i32); } +static bool usesAllNormalStores(SDNode *LoadVal) { + for (SDNode::use_iterator I = LoadVal->use_begin(); !I.atEnd(); ++I) { + if (!ISD::isNormalStore(*I)) + return false; + } + + return true; +} + +// If we have a copy of an illegal type, replace it with a load / store of an +// equivalently sized legal type. This avoids intermediate bit pack / unpack +// instructions emitted when handling extloads and truncstores. Ideally we could +// recognize the pack / unpack pattern to eliminate it. +SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + if (!DCI.isBeforeLegalize()) + return SDValue(); + + StoreSDNode *SN = cast<StoreSDNode>(N); + SDValue Value = SN->getValue(); + EVT VT = Value.getValueType(); + + if (isTypeLegal(VT) || SN->isVolatile() || !ISD::isNormalLoad(Value.getNode())) + return SDValue(); + + LoadSDNode *LoadVal = cast<LoadSDNode>(Value); + if (LoadVal->isVolatile() || !usesAllNormalStores(LoadVal)) + return SDValue(); + + EVT MemVT = LoadVal->getMemoryVT(); + + SDLoc SL(N); + SelectionDAG &DAG = DCI.DAG; + EVT LoadVT = getEquivalentMemType(*DAG.getContext(), MemVT); + + SDValue NewLoad = DAG.getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD, + LoadVT, SL, + LoadVal->getChain(), + LoadVal->getBasePtr(), + LoadVal->getOffset(), + LoadVT, + LoadVal->getMemOperand()); + + SDValue CastLoad = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad.getValue(0)); + DCI.CombineTo(LoadVal, CastLoad, NewLoad.getValue(1), false); + + return DAG.getStore(SN->getChain(), SL, NewLoad, + SN->getBasePtr(), SN->getMemOperand()); +} + +SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + EVT VT = N->getValueType(0); + + if (VT.isVector() || VT.getSizeInBits() > 32) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + SDLoc DL(N); + + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + SDValue Mul; + + if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) { + N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32); + N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32); + Mul = DAG.getNode(AMDGPUISD::MUL_U24, DL, MVT::i32, N0, N1); + } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) { + N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32); + N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32); + Mul = DAG.getNode(AMDGPUISD::MUL_I24, DL, MVT::i32, N0, N1); + } else { + return SDValue(); + } + + // We need to use sext even for MUL_U24, because MUL_U24 is used + // for signed multiply of 8 and 16-bit types. + return DAG.getSExtOrTrunc(Mul, DL, VT); +} + SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, - DAGCombinerInfo &DCI) const { + DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; SDLoc DL(N); switch(N->getOpcode()) { default: break; - case ISD::MUL: { - EVT VT = N->getValueType(0); - SDValue N0 = N->getOperand(0); - SDValue N1 = N->getOperand(1); - SDValue Mul; - - // FIXME: Add support for 24-bit multiply with 64-bit output on SI. 
- if (VT.isVector() || VT.getSizeInBits() > 32) - break; - - if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) { - N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32); - N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32); - Mul = DAG.getNode(AMDGPUISD::MUL_U24, DL, MVT::i32, N0, N1); - } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) { - N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32); - N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32); - Mul = DAG.getNode(AMDGPUISD::MUL_I24, DL, MVT::i32, N0, N1); - } else { - break; - } - - // We need to use sext even for MUL_U24, because MUL_U24 is used - // for signed multiply of 8 and 16-bit types. - SDValue Reg = DAG.getSExtOrTrunc(Mul, DL, VT); - - return Reg; - } + case ISD::MUL: + return performMulCombine(N, DCI); case AMDGPUISD::MUL_I24: case AMDGPUISD::MUL_U24: { SDValue N0 = N->getOperand(0); @@ -1932,6 +2073,9 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, break; } + + case ISD::STORE: + return performStoreCombine(N, DCI); } return SDValue(); } @@ -2015,7 +2159,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { // AMDIL DAG nodes NODE_NAME_CASE(CALL); NODE_NAME_CASE(UMUL); - NODE_NAME_CASE(DIV_INF); NODE_NAME_CASE(RET_FLAG); NODE_NAME_CASE(BRANCH_COND); @@ -2029,6 +2172,16 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(FMIN) NODE_NAME_CASE(SMIN) NODE_NAME_CASE(UMIN) + NODE_NAME_CASE(URECIP) + NODE_NAME_CASE(DIV_SCALE) + NODE_NAME_CASE(DIV_FMAS) + NODE_NAME_CASE(DIV_FIXUP) + NODE_NAME_CASE(TRIG_PREOP) + NODE_NAME_CASE(RCP) + NODE_NAME_CASE(RSQ) + NODE_NAME_CASE(RSQ_LEGACY) + NODE_NAME_CASE(RSQ_CLAMPED) + NODE_NAME_CASE(DOT4) NODE_NAME_CASE(BFE_U32) NODE_NAME_CASE(BFE_I32) NODE_NAME_CASE(BFI) @@ -2038,8 +2191,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(MUL_I24) NODE_NAME_CASE(MAD_U24) NODE_NAME_CASE(MAD_I24) - NODE_NAME_CASE(URECIP) - NODE_NAME_CASE(DOT4) NODE_NAME_CASE(EXPORT) NODE_NAME_CASE(CONST_ADDRESS) NODE_NAME_CASE(REGISTER_LOAD) @@ -2055,6 +2206,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(CVT_F32_UBYTE2) NODE_NAME_CASE(CVT_F32_UBYTE3) NODE_NAME_CASE(BUILD_VERTICAL_VECTOR) + NODE_NAME_CASE(CONST_DATA_PTR) NODE_NAME_CASE(STORE_MSKOR) NODE_NAME_CASE(TBUFFER_STORE_FORMAT) } diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h index 5be3070f589b..624d4e0c1967 100644 --- a/lib/Target/R600/AMDGPUISelLowering.h +++ b/lib/Target/R600/AMDGPUISelLowering.h @@ -54,6 +54,7 @@ class AMDGPUTargetLowering : public TargetLowering { SDValue LowerFCEIL(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFRINT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; @@ -63,24 +64,21 @@ class AMDGPUTargetLowering : public TargetLowering { SelectionDAG &DAG) const; SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const; + SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const; + protected: static EVT getEquivalentMemType(LLVMContext &Context, EVT VT); static EVT getEquivalentLoadRegType(LLVMContext &Context, EVT VT); - /// \brief Helper function that adds Reg to the LiveIn list of the DAG's - /// MachineFunction. 
- /// - /// \returns a RegisterSDNode representing Reg. - virtual SDValue CreateLiveInRegister(SelectionDAG &DAG, - const TargetRegisterClass *RC, - unsigned Reg, EVT VT) const; - SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, - SelectionDAG &DAG) const; + virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, + SelectionDAG &DAG) const; /// \brief Split a vector load into multiple scalar loads. SDValue SplitVectorLoad(const SDValue &Op, SelectionDAG &DAG) const; SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const; SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSDIVREM(SDValue Op, SelectionDAG &DAG) const; bool isHWTrueValue(SDValue Op) const; bool isHWFalseValue(SDValue Op) const; @@ -107,10 +105,12 @@ class AMDGPUTargetLowering : public TargetLowering { bool isZExtFree(Type *Src, Type *Dest) const override; bool isZExtFree(EVT Src, EVT Dest) const override; + bool isZExtFree(SDValue Val, EVT VT2) const override; bool isNarrowingProfitable(EVT VT1, EVT VT2) const override; MVT getVectorIdxTy() const override; + bool isSelectSupported(SelectSupportKind) const override; bool isFPImmLegal(const APFloat &Imm, EVT VT) const override; bool ShouldShrinkFPConstant(EVT VT) const override; @@ -154,10 +154,13 @@ class AMDGPUTargetLowering : public TargetLowering { const SelectionDAG &DAG, unsigned Depth = 0) const override; -private: - // Functions defined in AMDILISelLowering.cpp - void InitAMDILLowering(); - SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; + /// \brief Helper function that adds Reg to the LiveIn list of the DAG's + /// MachineFunction. + /// + /// \returns a RegisterSDNode representing Reg. + virtual SDValue CreateLiveInRegister(SelectionDAG &DAG, + const TargetRegisterClass *RC, + unsigned Reg, EVT VT) const; }; namespace AMDGPUISD { @@ -167,13 +170,15 @@ enum { FIRST_NUMBER = ISD::BUILTIN_OP_END, CALL, // Function call based on a single integer UMUL, // 32bit unsigned multiplication - DIV_INF, // Divide with infinity returned on zero divisor RET_FLAG, BRANCH_COND, // End AMDIL ISD Opcodes DWORDADDR, FRACT, CLAMP, + + // SIN_HW, COS_HW - f32 for SI, 1 ULP max error, valid from -100 pi to 100 pi. + // Denormals handled on some parts. COS_HW, SIN_HW, FMAX, @@ -183,6 +188,17 @@ enum { SMIN, UMIN, URECIP, + DIV_SCALE, + DIV_FMAS, + DIV_FIXUP, + TRIG_PREOP, // 1 ULP max error for f64 + + // RCP, RSQ - For f32, 1 ULP max error, no denormal handling. + // For f64, max error 2^29 ULP, handles denormals. + RCP, + RSQ, + RSQ_LEGACY, + RSQ_CLAMPED, DOT4, BFE_U32, // Extract range of bits with zero extension to 32-bits. BFE_I32, // Extract range of bits with sign extension to 32-bits. @@ -218,6 +234,8 @@ enum { /// T2|v.z| | | | /// T3|v.w| | | | BUILD_VERTICAL_VECTOR, + /// Pointer to the start of the shader's constant data. 
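+ /// (Emitted as a TargetIndex machine operand and resolved against the end-of-text symbol; see the AMDGPUMCInstLower.cpp and AMDGPUFixupKinds.h hunks later in this patch.)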
+ CONST_DATA_PTR, FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE, STORE_MSKOR, LOAD_CONSTANT, diff --git a/lib/Target/R600/AMDGPUInstrInfo.h b/lib/Target/R600/AMDGPUInstrInfo.h index ee23020988f3..d5041f558163 100644 --- a/lib/Target/R600/AMDGPUInstrInfo.h +++ b/lib/Target/R600/AMDGPUInstrInfo.h @@ -137,14 +137,6 @@ class AMDGPUInstrInfo : public AMDGPUGenInstrInfo { bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const override; // Helper functions that check the opcode for status information - bool isLoadInst(llvm::MachineInstr *MI) const; - bool isExtLoadInst(llvm::MachineInstr *MI) const; - bool isSWSExtLoadInst(llvm::MachineInstr *MI) const; - bool isSExtLoadInst(llvm::MachineInstr *MI) const; - bool isZExtLoadInst(llvm::MachineInstr *MI) const; - bool isAExtLoadInst(llvm::MachineInstr *MI) const; - bool isStoreInst(llvm::MachineInstr *MI) const; - bool isTruncStoreInst(llvm::MachineInstr *MI) const; bool isRegisterStore(const MachineInstr &MI) const; bool isRegisterLoad(const MachineInstr &MI) const; @@ -152,7 +144,6 @@ class AMDGPUInstrInfo : public AMDGPUGenInstrInfo { // Pure virtual functions to be implemented by sub-classes. //===---------------------------------------------------------------------===// - virtual unsigned getIEQOpcode() const = 0; virtual bool isMov(unsigned opcode) const = 0; /// \brief Calculate the "Indirect Address" for the given \p RegIndex and diff --git a/lib/Target/R600/AMDGPUInstrInfo.td b/lib/Target/R600/AMDGPUInstrInfo.td index 942a9e8ff351..820f1a80d75e 100644 --- a/lib/Target/R600/AMDGPUInstrInfo.td +++ b/lib/Target/R600/AMDGPUInstrInfo.td @@ -19,6 +19,14 @@ def AMDGPUDTIntTernaryOp : SDTypeProfile<1, 3, [ SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisInt<0>, SDTCisInt<3> ]>; +def AMDGPUTrigPreOp : SDTypeProfile<1, 2, + [SDTCisSameAs<0, 1>, SDTCisFP<0>, SDTCisInt<2>] +>; + +def AMDGPUDivScaleOp : SDTypeProfile<2, 3, + [SDTCisFP<0>, SDTCisInt<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisSameAs<0, 4>] +>; + //===----------------------------------------------------------------------===// // AMDGPU DAG Nodes // @@ -26,9 +34,24 @@ def AMDGPUDTIntTernaryOp : SDTypeProfile<1, 3, [ // The argument to this node is a dword address. def AMDGPUdwordaddr : SDNode<"AMDGPUISD::DWORDADDR", SDTIntUnaryOp>; +def AMDGPUcos : SDNode<"AMDGPUISD::COS_HW", SDTFPUnaryOp>; +def AMDGPUsin : SDNode<"AMDGPUISD::SIN_HW", SDTFPUnaryOp>; + // out = a - floor(a) def AMDGPUfract : SDNode<"AMDGPUISD::FRACT", SDTFPUnaryOp>; +// out = 1.0 / a +def AMDGPUrcp : SDNode<"AMDGPUISD::RCP", SDTFPUnaryOp>; + +// out = 1.0 / sqrt(a) +def AMDGPUrsq : SDNode<"AMDGPUISD::RSQ", SDTFPUnaryOp>; + +// out = 1.0 / sqrt(a) +def AMDGPUrsq_legacy : SDNode<"AMDGPUISD::RSQ_LEGACY", SDTFPUnaryOp>; + +// out = 1.0 / sqrt(a) result clamped to +/- max_float. +def AMDGPUrsq_clamped : SDNode<"AMDGPUISD::RSQ_CLAMPED", SDTFPUnaryOp>; + // out = max(a, b) a and b are floats def AMDGPUfmax : SDNode<"AMDGPUISD::FMAX", SDTFPBinOp, [SDNPCommutative, SDNPAssociative] >; @@ -78,6 +101,21 @@ def AMDGPUcvt_f32_ubyte3 : SDNode<"AMDGPUISD::CVT_F32_UBYTE3", // e is rounding error def AMDGPUurecip : SDNode<"AMDGPUISD::URECIP", SDTIntUnaryOp>; +// Special case divide preop and flags. +def AMDGPUdiv_scale : SDNode<"AMDGPUISD::DIV_SCALE", AMDGPUDivScaleOp>; + +// Special case divide FMA with scale and flags (src0 = Quotient, +// src1 = Denominator, src2 = Numerator). +def AMDGPUdiv_fmas : SDNode<"AMDGPUISD::DIV_FMAS", SDTFPTernaryOp>; + +// Single or double precision division fixup. 
+// Special case divide fixup and flags (src0 = Quotient, src1 = +// Denominator, src2 = Numerator). +def AMDGPUdiv_fixup : SDNode<"AMDGPUISD::DIV_FIXUP", SDTFPTernaryOp>; + +// Look up 2.0 / pi src0 with segment select src1[4:0] +def AMDGPUtrig_preop : SDNode<"AMDGPUISD::TRIG_PREOP", AMDGPUTrigPreOp>; + def AMDGPUregister_load : SDNode<"AMDGPUISD::REGISTER_LOAD", SDTypeProfile<1, 2, [SDTCisPtrTy<1>, SDTCisInt<2>]>, [SDNPHasChain, SDNPMayLoad]>; diff --git a/lib/Target/R600/AMDGPUInstructions.td b/lib/Target/R600/AMDGPUInstructions.td index 8bfc11cd468c..cd3560378e57 100644 --- a/lib/Target/R600/AMDGPUInstructions.td +++ b/lib/Target/R600/AMDGPUInstructions.td @@ -34,9 +34,15 @@ class AMDGPUShaderInst <dag outs, dag ins, string asm, list<dag> pattern> } +def FP32Denormals : Predicate<"Subtarget.hasFP32Denormals()">; +def FP64Denormals : Predicate<"Subtarget.hasFP64Denormals()">; +def UnsafeFPMath : Predicate<"TM.Options.UnsafeFPMath">; + def InstFlag : OperandWithDefaultOps <i32, (ops (i32 0))>; def ADDRIndirect : ComplexPattern<iPTR, 2, "SelectADDRIndirect", [], []>; +let OperandType = "OPERAND_IMMEDIATE" in { + def u32imm : Operand<i32> { let PrintMethod = "printU32ImmOperand"; } @@ -49,6 +55,8 @@ def u8imm : Operand<i8> { let PrintMethod = "printU8ImmOperand"; } +} // End OperandType = "OPERAND_IMMEDIATE" + //===--------------------------------------------------------------------===// // Custom Operands //===--------------------------------------------------------------------===// @@ -132,6 +140,28 @@ def COND_NULL : PatLeaf < // Load/Store Pattern Fragments //===----------------------------------------------------------------------===// +class PrivateMemOp <dag ops, dag frag> : PatFrag <ops, frag, [{ + return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS; +}]>; + +class PrivateLoad <SDPatternOperator op> : PrivateMemOp < + (ops node:$ptr), (op node:$ptr) +>; + +class PrivateStore <SDPatternOperator op> : PrivateMemOp < + (ops node:$value, node:$ptr), (op node:$value, node:$ptr) +>; + +def extloadi8_private : PrivateLoad <az_extloadi8>; +def sextloadi8_private : PrivateLoad <sextloadi8>; +def extloadi16_private : PrivateLoad <az_extloadi16>; +def sextloadi16_private : PrivateLoad <sextloadi16>; +def load_private : PrivateLoad <load>; + +def truncstorei8_private : PrivateStore <truncstorei8>; +def truncstorei16_private : PrivateStore <truncstorei16>; +def store_private : PrivateStore <store>; + def global_store : PatFrag<(ops node:$val, node:$ptr), (store node:$val, node:$ptr), [{ return isGlobalStore(dyn_cast<StoreSDNode>(N)); @@ -519,6 +549,23 @@ multiclass Expand24UBitOps<Instruction MulInst, Instruction AddInst> { >; } +class RcpPat<Instruction RcpInst, ValueType vt> : Pat < + (fdiv FP_ONE, vt:$src), + (RcpInst $src) +>; + +multiclass RsqPat<Instruction RsqInst, ValueType vt> { + def : Pat < + (fdiv FP_ONE, (fsqrt vt:$src)), + (RsqInst $src) + >; + + def : Pat < + (AMDGPUrcp (fsqrt vt:$src)), + (RsqInst $src) + >; +} + include "R600Instructions.td" include "R700Instructions.td" include "EvergreenInstructions.td" diff --git a/lib/Target/R600/AMDILIntrinsicInfo.cpp b/lib/Target/R600/AMDGPUIntrinsicInfo.cpp similarity index 60% rename from lib/Target/R600/AMDILIntrinsicInfo.cpp rename to lib/Target/R600/AMDGPUIntrinsicInfo.cpp index fab4a3b8961a..58916a995496 100644 --- a/lib/Target/R600/AMDILIntrinsicInfo.cpp +++ b/lib/Target/R600/AMDGPUIntrinsicInfo.cpp @@ -1,4 +1,4 @@ -//===- AMDILIntrinsicInfo.cpp - AMDGPU Intrinsic Information ------*- C++ -*-===// +//===- AMDGPUIntrinsicInfo.cpp - AMDGPU Intrinsic Information ---*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -12,7 +12,7 @@ // //===-----------------------------------------------------------------------===// -#include "AMDILIntrinsicInfo.h" +#include "AMDGPUIntrinsicInfo.h" #include "AMDGPUSubtarget.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Intrinsics.h" @@ -24,14 +24,12 @@ using namespace llvm; #include 
"AMDGPUGenIntrinsics.inc" #undef GET_LLVM_INTRINSIC_FOR_GCC_BUILTIN -AMDGPUIntrinsicInfo::AMDGPUIntrinsicInfo(TargetMachine *tm) - : TargetIntrinsicInfo() { -} +AMDGPUIntrinsicInfo::AMDGPUIntrinsicInfo(TargetMachine *tm) + : TargetIntrinsicInfo() {} -std::string -AMDGPUIntrinsicInfo::getName(unsigned int IntrID, Type **Tys, - unsigned int numTys) const { - static const char* const names[] = { +std::string AMDGPUIntrinsicInfo::getName(unsigned IntrID, Type **Tys, + unsigned numTys) const { + static const char *const names[] = { #define GET_INTRINSIC_NAME_TABLE #include "AMDGPUGenIntrinsics.inc" #undef GET_INTRINSIC_NAME_TABLE @@ -40,23 +38,23 @@ AMDGPUIntrinsicInfo::getName(unsigned int IntrID, Type **Tys, if (IntrID < Intrinsic::num_intrinsics) { return nullptr; } - assert(IntrID < AMDGPUIntrinsic::num_AMDGPU_intrinsics - && "Invalid intrinsic ID"); + assert(IntrID < AMDGPUIntrinsic::num_AMDGPU_intrinsics && + "Invalid intrinsic ID"); std::string Result(names[IntrID - Intrinsic::num_intrinsics]); return Result; } -unsigned int -AMDGPUIntrinsicInfo::lookupName(const char *Name, unsigned int Len) const { +unsigned AMDGPUIntrinsicInfo::lookupName(const char *Name, + unsigned Len) const { if (!StringRef(Name, Len).startswith("llvm.")) return 0; // All intrinsics start with 'llvm.' #define GET_FUNCTION_RECOGNIZER #include "AMDGPUGenIntrinsics.inc" #undef GET_FUNCTION_RECOGNIZER - AMDGPUIntrinsic::ID IntrinsicID - = (AMDGPUIntrinsic::ID)Intrinsic::not_intrinsic; + AMDGPUIntrinsic::ID IntrinsicID = + (AMDGPUIntrinsic::ID)Intrinsic::not_intrinsic; IntrinsicID = getIntrinsicForGCCBuiltin("AMDGPU", Name); if (IntrinsicID != (AMDGPUIntrinsic::ID)Intrinsic::not_intrinsic) { @@ -65,17 +63,15 @@ AMDGPUIntrinsicInfo::lookupName(const char *Name, unsigned int Len) const { return 0; } -bool -AMDGPUIntrinsicInfo::isOverloaded(unsigned id) const { - // Overload Table +bool AMDGPUIntrinsicInfo::isOverloaded(unsigned id) const { +// Overload Table #define GET_INTRINSIC_OVERLOAD_TABLE #include "AMDGPUGenIntrinsics.inc" #undef GET_INTRINSIC_OVERLOAD_TABLE } -Function* -AMDGPUIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID, - Type **Tys, - unsigned numTys) const { +Function *AMDGPUIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID, + Type **Tys, + unsigned numTys) const { llvm_unreachable("Not implemented"); } diff --git a/lib/Target/R600/AMDILIntrinsicInfo.h b/lib/Target/R600/AMDGPUIntrinsicInfo.h similarity index 64% rename from lib/Target/R600/AMDILIntrinsicInfo.h rename to lib/Target/R600/AMDGPUIntrinsicInfo.h index 924275aec2da..5be68a217da5 100644 --- a/lib/Target/R600/AMDILIntrinsicInfo.h +++ b/lib/Target/R600/AMDGPUIntrinsicInfo.h @@ -1,4 +1,4 @@ -//===- AMDILIntrinsicInfo.h - AMDGPU Intrinsic Information ------*- C++ -*-===// +//===- AMDGPUIntrinsicInfo.h - AMDGPU Intrinsic Information ------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -11,8 +11,8 @@ /// \brief Interface for the AMDGPU Implementation of the Intrinsic Info class. 
// //===-----------------------------------------------------------------------===// -#ifndef AMDIL_INTRINSICS_H -#define AMDIL_INTRINSICS_H +#ifndef AMDGPU_INTRINSICINFO_H +#define AMDGPU_INTRINSICINFO_H #include "llvm/IR/Intrinsics.h" #include "llvm/Target/TargetIntrinsicInfo.h" @@ -34,16 +34,15 @@ enum ID { class AMDGPUIntrinsicInfo : public TargetIntrinsicInfo { public: AMDGPUIntrinsicInfo(TargetMachine *tm); - std::string getName(unsigned int IntrId, Type **Tys = nullptr, - unsigned int numTys = 0) const override; - unsigned int lookupName(const char *Name, unsigned int Len) const override; - bool isOverloaded(unsigned int IID) const override; - Function *getDeclaration(Module *M, unsigned int ID, + std::string getName(unsigned IntrId, Type **Tys = nullptr, + unsigned numTys = 0) const override; + unsigned lookupName(const char *Name, unsigned Len) const override; + bool isOverloaded(unsigned IID) const override; + Function *getDeclaration(Module *M, unsigned ID, Type **Tys = nullptr, - unsigned int numTys = 0) const override; + unsigned numTys = 0) const override; }; } // end namespace llvm -#endif // AMDIL_INTRINSICS_H - +#endif // AMDGPU_INTRINSICINFO_H diff --git a/lib/Target/R600/AMDGPUIntrinsics.td b/lib/Target/R600/AMDGPUIntrinsics.td index 6dc7612d46fb..eee9c29038d0 100644 --- a/lib/Target/R600/AMDGPUIntrinsics.td +++ b/lib/Target/R600/AMDGPUIntrinsics.td @@ -13,9 +13,6 @@ let TargetPrefix = "AMDGPU", isTarget = 1 in { - def int_AMDGPU_load_const : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_load_imm : Intrinsic<[llvm_v4f32_ty], [llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_reserve_reg : Intrinsic<[], [llvm_i32_ty], [IntrNoMem]>; def int_AMDGPU_store_output : Intrinsic<[], [llvm_float_ty, llvm_i32_ty], []>; def int_AMDGPU_swizzle : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty], [IntrNoMem]>; def int_AMDGPU_abs : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; @@ -24,14 +21,20 @@ let TargetPrefix = "AMDGPU", isTarget = 1 in { def int_AMDGPU_div : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; def int_AMDGPU_fract : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; def int_AMDGPU_clamp : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; + + // This is named backwards (instead of rsq_legacy) so we don't have + // to define it with the public builtins intrinsics. This is a + // workaround for how intrinsic names are parsed. If the name is + // llvm.AMDGPU.rsq.legacy, the parser assumes that you meant + // llvm.AMDGPU.rsq.{f32 | f64} and incorrectly mangled the name. 
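+ // (So "llvm.AMDGPU.legacy.rsq" is matched verbatim, while "llvm.AMDGPU.rsq.legacy" would have been parsed as a mangled overload of llvm.AMDGPU.rsq.)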
+ def int_AMDGPU_legacy_rsq : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_dp4 : Intrinsic<[llvm_float_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>; def int_AMDGPU_kill : Intrinsic<[], [llvm_float_ty], []>; def int_AMDGPU_kilp : Intrinsic<[], [], []>; def int_AMDGPU_lrp : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>; def int_AMDGPU_mul : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; def int_AMDGPU_pow : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_AMDGPU_rcp : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; - def int_AMDGPU_rsq : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; def int_AMDGPU_seq : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; def int_AMDGPU_sgt : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; def int_AMDGPU_sge : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; @@ -69,7 +72,7 @@ let TargetPrefix = "AMDGPU", isTarget = 1 in { def int_AMDGPU_barrier_global : Intrinsic<[], [], []>; } -// Legacy names for compatability. +// Legacy names for compatibility. let TargetPrefix = "AMDIL", isTarget = 1 in { def int_AMDIL_abs : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], [IntrNoMem]>; def int_AMDIL_fraction : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; diff --git a/lib/Target/R600/AMDGPUMCInstLower.cpp b/lib/Target/R600/AMDGPUMCInstLower.cpp index ac82e88c9266..ce5c41ceb267 100644 --- a/lib/Target/R600/AMDGPUMCInstLower.cpp +++ b/lib/Target/R600/AMDGPUMCInstLower.cpp @@ -22,7 +22,9 @@ #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/GlobalVariable.h" #include "llvm/MC/MCCodeEmitter.h" +#include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCObjectStreamer.h" @@ -77,6 +79,20 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { case MachineOperand::MO_MachineBasicBlock: MCOp = MCOperand::CreateExpr(MCSymbolRefExpr::Create( MO.getMBB()->getSymbol(), Ctx)); + break; + case MachineOperand::MO_GlobalAddress: { + const GlobalValue *GV = MO.getGlobal(); + MCSymbol *Sym = Ctx.GetOrCreateSymbol(StringRef(GV->getName())); + MCOp = MCOperand::CreateExpr(MCSymbolRefExpr::Create(Sym, Ctx)); + break; + } + case MachineOperand::MO_TargetIndex: { + assert(MO.getIndex() == AMDGPU::TI_CONSTDATA_START); + MCSymbol *Sym = Ctx.GetOrCreateSymbol(StringRef(END_OF_TEXT_LABEL_NAME)); + const MCSymbolRefExpr *Expr = MCSymbolRefExpr::Create(Sym, Ctx); + MCOp = MCOperand::CreateExpr(Expr); + break; + } } OutMI.addOperand(MCOp); } diff --git a/lib/Target/R600/AMDGPUMCInstLower.h b/lib/Target/R600/AMDGPUMCInstLower.h index 2b7f1e3074fe..58fe34d32d31 100644 --- a/lib/Target/R600/AMDGPUMCInstLower.h +++ b/lib/Target/R600/AMDGPUMCInstLower.h @@ -14,9 +14,9 @@ namespace llvm { class AMDGPUSubtarget; -class MCInst; -class MCContext; class MachineInstr; +class MCContext; +class MCInst; class AMDGPUMCInstLower { diff --git a/lib/Target/R600/AMDGPUMachineFunction.cpp b/lib/Target/R600/AMDGPUMachineFunction.cpp index 14171f46020a..90af80113ece 100644 --- a/lib/Target/R600/AMDGPUMachineFunction.cpp +++ b/lib/Target/R600/AMDGPUMachineFunction.cpp @@ -10,9 +10,9 @@ static const char *const ShaderTypeAttribute = "ShaderType"; void AMDGPUMachineFunction::anchor() {} 
AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) : - MachineFunctionInfo() { - ShaderType = ShaderType::COMPUTE; - LDSSize = 0; + MachineFunctionInfo(), + ShaderType(ShaderType::COMPUTE), + LDSSize(0) { AttributeSet Set = MF.getFunction()->getAttributes(); Attribute A = Set.getAttribute(AttributeSet::FunctionIndex, ShaderTypeAttribute); diff --git a/lib/Target/R600/AMDGPUMachineFunction.h b/lib/Target/R600/AMDGPUMachineFunction.h index fea0b39e91e5..0854d588eeba 100644 --- a/lib/Target/R600/AMDGPUMachineFunction.h +++ b/lib/Target/R600/AMDGPUMachineFunction.h @@ -20,14 +20,19 @@ namespace llvm { class AMDGPUMachineFunction : public MachineFunctionInfo { virtual void anchor(); + unsigned ShaderType; + public: AMDGPUMachineFunction(const MachineFunction &MF); - unsigned ShaderType; /// A map to keep track of local memory objects and their offsets within /// the local memory space. std::map<const GlobalValue *, unsigned> LocalMemoryObjects; /// Number of bytes in the LDS that are being used. unsigned LDSSize; + + unsigned getShaderType() const { + return ShaderType; + } }; } diff --git a/lib/Target/R600/AMDGPUPromoteAlloca.cpp b/lib/Target/R600/AMDGPUPromoteAlloca.cpp index 2d3a7fdd4feb..218750d445e6 100644 --- a/lib/Target/R600/AMDGPUPromoteAlloca.cpp +++ b/lib/Target/R600/AMDGPUPromoteAlloca.cpp @@ -129,6 +129,22 @@ static Value* GEPToVectorIndex(GetElementPtrInst *GEP) { return GEP->getOperand(2); } +// Is this an instruction handled below that we can turn into a vector op? +// +// TODO: Check isTriviallyVectorizable for calls and handle other +// instructions. +static bool canVectorizeInst(Instruction *Inst) { + switch (Inst->getOpcode()) { + case Instruction::Load: + case Instruction::Store: + case Instruction::BitCast: + case Instruction::AddrSpaceCast: + return true; + default: + return false; + } +} + static bool tryPromoteAllocaToVector(AllocaInst *Alloca) { Type *AllocaTy = Alloca->getAllocatedType(); @@ -149,6 +165,9 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) { for (User *AllocaUser : Alloca->users()) { GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(AllocaUser); if (!GEP) { + if (!canVectorizeInst(cast<Instruction>(AllocaUser))) + return false; + WorkList.push_back(AllocaUser); continue; } @@ -158,20 +177,23 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) { // If we can't compute a vector index from this GEP, then we can't // promote this alloca to vector. 
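// (GEPToVectorIndex above only matches the simple form, roughly getelementptr [N x T]* %alloca, i32 0, i32 %idx, and returns %idx.)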
if (!Index) { - DEBUG(dbgs() << " Cannot compute vector index for GEP " << *GEP << "\n"); + DEBUG(dbgs() << " Cannot compute vector index for GEP " << *GEP << '\n'); return false; } GEPVectorIdx[GEP] = Index; for (User *GEPUser : AllocaUser->users()) { + if (!canVectorizeInst(cast<Instruction>(GEPUser))) + return false; + WorkList.push_back(GEPUser); } } VectorType *VectorTy = arrayTypeToVecType(AllocaTy); - DEBUG(dbgs() << " Converting alloca to vector "; AllocaTy->dump(); - dbgs() << " -> "; VectorTy->dump(); dbgs() << "\n"); + DEBUG(dbgs() << " Converting alloca to vector " + << *AllocaTy << " -> " << *VectorTy << '\n'); for (std::vector<Value*>::iterator I = WorkList.begin(), E = WorkList.end(); I != E; ++I) { @@ -201,12 +223,12 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) { break; } case Instruction::BitCast: + case Instruction::AddrSpaceCast: break; default: Inst->dump(); - llvm_unreachable("Do not know how to replace this instruction " - "with vector op"); + llvm_unreachable("Inconsistency in instructions promotable to vector"); } } return true; @@ -233,7 +255,7 @@ void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) { // First try to replace the alloca with a vector Type *AllocaTy = I.getAllocatedType(); - DEBUG(dbgs() << "Trying to promote " << I); + DEBUG(dbgs() << "Trying to promote " << I << '\n'); if (tryPromoteAllocaToVector(&I)) return; diff --git a/lib/Target/R600/AMDGPURegisterInfo.h b/lib/Target/R600/AMDGPURegisterInfo.h index 4731595d4f71..46aa7a17dfca 100644 --- a/lib/Target/R600/AMDGPURegisterInfo.h +++ b/lib/Target/R600/AMDGPURegisterInfo.h @@ -51,7 +51,7 @@ struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo { unsigned getSubRegFromChannel(unsigned Channel) const; const MCPhysReg* getCalleeSavedRegs(const MachineFunction *MF) const override; - void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, + virtual void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, unsigned FIOperandNum, RegScavenger *RS) const override; unsigned getFrameRegister(const MachineFunction &MF) const override; diff --git a/lib/Target/R600/AMDGPUSubtarget.cpp b/lib/Target/R600/AMDGPUSubtarget.cpp index 4fd43905b210..e3c2a50ab828 100644 --- a/lib/Target/R600/AMDGPUSubtarget.cpp +++ b/lib/Target/R600/AMDGPUSubtarget.cpp @@ -15,6 +15,9 @@ #include "AMDGPUSubtarget.h" #include "R600InstrInfo.h" #include "SIInstrInfo.h" +#include "llvm/ADT/SmallString.h" + +#include "llvm/ADT/SmallString.h" using namespace llvm; @@ -25,101 +28,62 @@ using namespace llvm; #define GET_SUBTARGETINFO_CTOR #include "AMDGPUGenSubtargetInfo.inc" -AMDGPUSubtarget::AMDGPUSubtarget(StringRef TT, StringRef CPU, StringRef FS) : - AMDGPUGenSubtargetInfo(TT, CPU, FS), DumpCode(false) { - InstrItins = getInstrItineraryForCPU(CPU); +AMDGPUSubtarget::AMDGPUSubtarget(StringRef TT, StringRef GPU, StringRef FS) : + AMDGPUGenSubtargetInfo(TT, GPU, FS), + DevName(GPU), + Is64bit(false), + DumpCode(false), + R600ALUInst(false), + HasVertexCache(false), + TexVTXClauseSize(0), + Gen(AMDGPUSubtarget::R600), + FP64(false), + FP64Denormals(false), + FP32Denormals(false), + CaymanISA(false), + EnableIRStructurizer(true), + EnablePromoteAlloca(false), + EnableIfCvt(true), + WavefrontSize(0), + CFALUBug(false), + LocalMemorySize(0), + InstrItins(getInstrItineraryForCPU(GPU)) { + // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be + // enabled, but some instructions do not respect them and they run at the + // double precision rate, so don't enable by default. 
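+ // (The defaults are prepended to the user feature string below, so an explicit "-fp64-denormals" in FS still overrides them, assuming the usual last-occurrence-wins parsing of feature strings.)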
+ // + // We want to be able to turn these off, but making this a subtarget feature + // for SI has the unhelpful behavior that it unsets everything else if you + // disable it. + + SmallString<256> FullFS("+promote-alloca,+fp64-denormals,"); + FullFS += FS; - // Default card - StringRef GPU = CPU; - Is64bit = false; - HasVertexCache = false; - TexVTXClauseSize = 0; - Gen = AMDGPUSubtarget::R600; - FP64 = false; - CaymanISA = false; - EnableIRStructurizer = true; - EnableIfCvt = true; - WavefrontSize = 0; - CFALUBug = false; - LocalMemorySize = 0; - ParseSubtargetFeatures(GPU, FS); - DevName = GPU; + ParseSubtargetFeatures(GPU, FullFS); if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { InstrInfo.reset(new R600InstrInfo(*this)); + + // FIXME: I don't think Evergreen has any useful support for + // denormals, but should be checked. Should we issue a warning somewhere if + // someone tries to enable these? + FP32Denormals = false; + FP64Denormals = false; } else { InstrInfo.reset(new SIInstrInfo(*this)); } } -bool -AMDGPUSubtarget::is64bit() const { - return Is64bit; -} -bool -AMDGPUSubtarget::hasVertexCache() const { - return HasVertexCache; -} -short -AMDGPUSubtarget::getTexVTXClauseSize() const { - return TexVTXClauseSize; -} -enum AMDGPUSubtarget::Generation -AMDGPUSubtarget::getGeneration() const { - return Gen; -} -bool -AMDGPUSubtarget::hasHWFP64() const { - return FP64; -} -bool -AMDGPUSubtarget::hasCaymanISA() const { - return CaymanISA; -} -bool -AMDGPUSubtarget::IsIRStructurizerEnabled() const { - return EnableIRStructurizer; -} -bool -AMDGPUSubtarget::isIfCvtEnabled() const { - return EnableIfCvt; -} -unsigned -AMDGPUSubtarget::getWavefrontSize() const { - return WavefrontSize; -} -unsigned -AMDGPUSubtarget::getStackEntrySize() const { +unsigned AMDGPUSubtarget::getStackEntrySize() const { assert(getGeneration() <= NORTHERN_ISLANDS); switch(getWavefrontSize()) { case 16: return 8; case 32: - if (hasCaymanISA()) - return 4; - else - return 8; + return hasCaymanISA() ? 
4 : 8; case 64: return 4; default: llvm_unreachable("Illegal wavefront size."); } } -bool -AMDGPUSubtarget::hasCFAluBug() const { - assert(getGeneration() <= NORTHERN_ISLANDS); - return CFALUBug; -} -int -AMDGPUSubtarget::getLocalMemorySize() const { - return LocalMemorySize; -} -bool -AMDGPUSubtarget::isTargetELF() const { - return false; -} - -std::string -AMDGPUSubtarget::getDeviceName() const { - return DevName; -} diff --git a/lib/Target/R600/AMDGPUSubtarget.h b/lib/Target/R600/AMDGPUSubtarget.h index 9c78f35df3e2..a844b37b6be5 100644 --- a/lib/Target/R600/AMDGPUSubtarget.h +++ b/lib/Target/R600/AMDGPUSubtarget.h @@ -44,15 +44,17 @@ class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo { private: std::string DevName; bool Is64bit; - bool Is32on64bit; bool DumpCode; bool R600ALUInst; bool HasVertexCache; short TexVTXClauseSize; - enum Generation Gen; + Generation Gen; bool FP64; + bool FP64Denormals; + bool FP32Denormals; bool CaymanISA; bool EnableIRStructurizer; + bool EnablePromoteAlloca; bool EnableIfCvt; unsigned WavefrontSize; bool CFALUBug; @@ -66,15 +68,44 @@ class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo { const AMDGPUInstrInfo *getInstrInfo() const { return InstrInfo.get(); } - const InstrItineraryData &getInstrItineraryData() const { return InstrItins; } + + const InstrItineraryData &getInstrItineraryData() const { + return InstrItins; + } + void ParseSubtargetFeatures(StringRef CPU, StringRef FS); - bool is64bit() const; - bool hasVertexCache() const; - short getTexVTXClauseSize() const; - enum Generation getGeneration() const; - bool hasHWFP64() const; - bool hasCaymanISA() const; + bool is64bit() const { + return Is64bit; + } + + bool hasVertexCache() const { + return HasVertexCache; + } + + short getTexVTXClauseSize() const { + return TexVTXClauseSize; + } + + Generation getGeneration() const { + return Gen; + } + + bool hasHWFP64() const { + return FP64; + } + + bool hasCaymanISA() const { + return CaymanISA; + } + + bool hasFP32Denormals() const { + return FP32Denormals; + } + + bool hasFP64Denormals() const { + return FP64Denormals; + } bool hasBFE() const { return (getGeneration() >= EVERGREEN); @@ -92,8 +123,10 @@ class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo { if (Size == 32) return (getGeneration() >= EVERGREEN); - assert(Size == 64); - return (getGeneration() >= SOUTHERN_ISLANDS); + if (Size == 64) + return (getGeneration() >= SOUTHERN_ISLANDS); + + return false; } bool hasMulU24() const { @@ -105,23 +138,60 @@ class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo { hasCaymanISA()); } - bool IsIRStructurizerEnabled() const; - bool isIfCvtEnabled() const; - unsigned getWavefrontSize() const; + bool hasFFBL() const { + return (getGeneration() >= EVERGREEN); + } + + bool hasFFBH() const { + return (getGeneration() >= EVERGREEN); + } + + bool IsIRStructurizerEnabled() const { + return EnableIRStructurizer; + } + + bool isPromoteAllocaEnabled() const { + return EnablePromoteAlloca; + } + + bool isIfCvtEnabled() const { + return EnableIfCvt; + } + + unsigned getWavefrontSize() const { + return WavefrontSize; + } + unsigned getStackEntrySize() const; - bool hasCFAluBug() const; - int getLocalMemorySize() const; + + bool hasCFAluBug() const { + assert(getGeneration() <= NORTHERN_ISLANDS); + return CFALUBug; + } + + int getLocalMemorySize() const { + return LocalMemorySize; + } bool enableMachineScheduler() const override { return getGeneration() <= NORTHERN_ISLANDS; } // Helper functions to simplify if statements - bool isTargetELF() const; - std::string 
getDeviceName() const; - bool dumpCode() const { return DumpCode; } - bool r600ALUEncoding() const { return R600ALUInst; } + bool isTargetELF() const { + return false; + } + + StringRef getDeviceName() const { + return DevName; + } + bool dumpCode() const { + return DumpCode; + } + bool r600ALUEncoding() const { + return R600ALUInst; + } }; } // End namespace llvm diff --git a/lib/Target/R600/AMDGPUTargetMachine.cpp b/lib/Target/R600/AMDGPUTargetMachine.cpp index be1eceaaa00d..56ba719e6863 100644 --- a/lib/Target/R600/AMDGPUTargetMachine.cpp +++ b/lib/Target/R600/AMDGPUTargetMachine.cpp @@ -33,7 +33,6 @@ #include "llvm/Transforms/Scalar.h" #include <llvm/CodeGen/Passes.h> - using namespace llvm; extern "C" void LLVMInitializeR600Target() { @@ -53,7 +52,7 @@ static std::string computeDataLayout(const AMDGPUSubtarget &ST) { std::string Ret = "e-p:32:32"; if (ST.is64bit()) { - // 32-bit private, local, and region pointers. 64-bit global and constant. + // 32-bit local and region pointers. 64-bit private, global, and constant. Ret += "-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64"; } @@ -137,8 +136,11 @@ void AMDGPUTargetMachine::addAnalysisPasses(PassManagerBase &PM) { void AMDGPUPassConfig::addCodeGenPrepare() { const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(); - addPass(createAMDGPUPromoteAlloca(ST)); - addPass(createSROAPass()); + if (ST.isPromoteAllocaEnabled()) { + addPass(createAMDGPUPromoteAlloca(ST)); + addPass(createSROAPass()); + } + TargetPassConfig::addCodeGenPrepare(); } @@ -174,6 +176,9 @@ bool AMDGPUPassConfig::addPreRegAlloc() { // SIFixSGPRCopies can generate a lot of duplicate instructions, // so we need to run MachineCSE afterwards. addPass(&MachineCSEID); + addPass(createSIShrinkInstructionsPass()); + initializeSIFixSGPRLiveRangesPass(*PassRegistry::getPassRegistry()); + insertPass(&RegisterCoalescerID, &SIFixSGPRLiveRangesID); } return false; } @@ -181,6 +186,7 @@ bool AMDGPUPassConfig::addPostRegAlloc() { const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(); + addPass(createSIShrinkInstructionsPass()); if (ST.getGeneration() > AMDGPUSubtarget::NORTHERN_ISLANDS) { addPass(createSIInsertWaits(*TM)); } diff --git a/lib/Target/R600/AMDGPUTargetMachine.h b/lib/Target/R600/AMDGPUTargetMachine.h index 2eb36a3366f2..3bb15beb6bf1 100644 --- a/lib/Target/R600/AMDGPUTargetMachine.h +++ b/lib/Target/R600/AMDGPUTargetMachine.h @@ -17,8 +17,8 @@ #include "AMDGPUFrameLowering.h" #include "AMDGPUInstrInfo.h" +#include "AMDGPUIntrinsicInfo.h" #include "AMDGPUSubtarget.h" -#include "AMDILIntrinsicInfo.h" #include "R600ISelLowering.h" #include "llvm/IR/DataLayout.h" diff --git a/lib/Target/R600/AMDGPUTargetTransformInfo.cpp b/lib/Target/R600/AMDGPUTargetTransformInfo.cpp index ea78f431588f..88934b65876e 100644 --- a/lib/Target/R600/AMDGPUTargetTransformInfo.cpp +++ b/lib/Target/R600/AMDGPUTargetTransformInfo.cpp @@ -77,6 +77,12 @@ class AMDGPUTTI final : public ImmutablePass, public TargetTransformInfo { void getUnrollingPreferences(Loop *L, UnrollingPreferences &UP) const override; + PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) const override; + + unsigned getNumberOfRegisters(bool Vector) const override; + unsigned getRegisterBitWidth(bool Vector) const override; + unsigned getMaximumUnrollFactor() const override; + /// @} }; @@ -95,14 +101,12 @@ bool AMDGPUTTI::hasBranchDivergence() const { return true; } void AMDGPUTTI::getUnrollingPreferences(Loop *L, UnrollingPreferences &UP) const { - for (Loop::block_iterator BI = L->block_begin(), BE = 
L->block_end(); - BI != BE; ++BI) { - BasicBlock *BB = *BI; - for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); - I != E; ++I) { - const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I); - if (!GEP) + for (const BasicBlock *BB : L->getBlocks()) { + for (const Instruction &I : *BB) { + const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I); + if (!GEP || GEP->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) continue; + const Value *Ptr = GEP->getPointerOperand(); const AllocaInst *Alloca = dyn_cast<AllocaInst>(GetUnderlyingObject(Ptr)); if (Alloca) { @@ -121,3 +125,29 @@ void AMDGPUTTI::getUnrollingPreferences(Loop *L, } } } + +AMDGPUTTI::PopcntSupportKind +AMDGPUTTI::getPopcntSupport(unsigned TyWidth) const { + assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2"); + return ST->hasBCNT(TyWidth) ? PSK_FastHardware : PSK_Software; +} + +unsigned AMDGPUTTI::getNumberOfRegisters(bool Vec) const { + if (Vec) + return 0; + + // Number of VGPRs on SI. + if (ST->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) + return 256; + + return 4 * 128; // XXX - 4 channels. Should these count as vector instead? +} + +unsigned AMDGPUTTI::getRegisterBitWidth(bool) const { + return 32; +} + +unsigned AMDGPUTTI::getMaximumUnrollFactor() const { + // Semi-arbitrary large amount. + return 64; +} diff --git a/lib/Target/R600/AMDILISelLowering.cpp b/lib/Target/R600/AMDILISelLowering.cpp deleted file mode 100644 index fa48e65be4e6..000000000000 --- a/lib/Target/R600/AMDILISelLowering.cpp +++ /dev/null @@ -1,96 +0,0 @@ -//===-- AMDILISelLowering.cpp - AMDIL DAG Lowering Implementation ---------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//==-----------------------------------------------------------------------===// -// -/// \file -/// \brief TargetLowering functions borrowed from AMDIL. 
-// -//===----------------------------------------------------------------------===// - -#include "AMDGPUISelLowering.h" -#include "AMDGPUSubtarget.h" -#include "llvm/CodeGen/SelectionDAG.h" - -using namespace llvm; - -//===----------------------------------------------------------------------===// -// TargetLowering Class Implementation Begins -//===----------------------------------------------------------------------===// -void AMDGPUTargetLowering::InitAMDILLowering() { - static const MVT::SimpleValueType types[] = { - MVT::i32, - MVT::f32, - MVT::f64, - MVT::i64, - MVT::v4f32, - MVT::v4i32, - MVT::v2f32, - MVT::v2i32 - }; - - static const MVT::SimpleValueType FloatTypes[] = { - MVT::f32, - MVT::f64 - }; - - static const MVT::SimpleValueType VectorTypes[] = { - MVT::v4f32, - MVT::v4i32, - MVT::v2f32, - MVT::v2i32 - }; - - const AMDGPUSubtarget &STM = getTargetMachine().getSubtarget(); - - for (MVT VT : types) { - setOperationAction(ISD::SUBE, VT, Expand); - setOperationAction(ISD::SUBC, VT, Expand); - setOperationAction(ISD::ADDE, VT, Expand); - setOperationAction(ISD::ADDC, VT, Expand); - setOperationAction(ISD::BRCOND, VT, Custom); - setOperationAction(ISD::BR_JT, VT, Expand); - setOperationAction(ISD::BRIND, VT, Expand); - } - - for (MVT VT : FloatTypes) { - setOperationAction(ISD::FP_ROUND_INREG, VT, Expand); - } - - for (MVT VT : VectorTypes) { - setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand); - setOperationAction(ISD::SELECT_CC, VT, Expand); - } - - if (STM.hasHWFP64()) { - setOperationAction(ISD::ConstantFP, MVT::f64, Legal); - setOperationAction(ISD::FABS, MVT::f64, Expand); - } - - setOperationAction(ISD::SUBC, MVT::Other, Expand); - setOperationAction(ISD::ADDE, MVT::Other, Expand); - setOperationAction(ISD::ADDC, MVT::Other, Expand); - setOperationAction(ISD::BRCOND, MVT::Other, Custom); - setOperationAction(ISD::BR_JT, MVT::Other, Expand); - setOperationAction(ISD::BRIND, MVT::Other, Expand); - - setOperationAction(ISD::Constant, MVT::i32, Legal); - setOperationAction(ISD::Constant, MVT::i64, Legal); - setOperationAction(ISD::ConstantFP, MVT::f32, Legal); - - setPow2DivIsCheap(false); - setSelectIsExpensive(true); // FIXME: This makes no sense at all -} - -SDValue AMDGPUTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { - SDValue Chain = Op.getOperand(0); - SDValue Cond = Op.getOperand(1); - SDValue Jump = Op.getOperand(2); - - return DAG.getNode(AMDGPUISD::BRANCH_COND, SDLoc(Op), Op.getValueType(), - Chain, Jump, Cond); -} diff --git a/lib/Target/R600/CMakeLists.txt b/lib/Target/R600/CMakeLists.txt index be2ca06ef34d..49a7f8aa18c8 100644 --- a/lib/Target/R600/CMakeLists.txt +++ b/lib/Target/R600/CMakeLists.txt @@ -13,10 +13,9 @@ add_public_tablegen_target(AMDGPUCommonTableGen) add_llvm_target(R600CodeGen AMDILCFGStructurizer.cpp - AMDILIntrinsicInfo.cpp - AMDILISelLowering.cpp AMDGPUAsmPrinter.cpp AMDGPUFrameLowering.cpp + AMDGPUIntrinsicInfo.cpp AMDGPUISelDAGToDAG.cpp AMDGPUMCInstLower.cpp AMDGPUMachineFunction.cpp @@ -41,6 +40,7 @@ add_llvm_target(R600CodeGen R600TextureIntrinsicsReplacer.cpp SIAnnotateControlFlow.cpp SIFixSGPRCopies.cpp + SIFixSGPRLiveRanges.cpp SIInsertWaits.cpp SIInstrInfo.cpp SIISelLowering.cpp @@ -48,6 +48,7 @@ add_llvm_target(R600CodeGen SILowerI1Copies.cpp SIMachineFunctionInfo.cpp SIRegisterInfo.cpp + SIShrinkInstructions.cpp SITypeRewriter.cpp ) diff --git a/lib/Target/R600/EvergreenInstructions.td b/lib/Target/R600/EvergreenInstructions.td index dcb7e982c7fc..484e52250d1b 100644 --- 
a/lib/Target/R600/EvergreenInstructions.td +++ b/lib/Target/R600/EvergreenInstructions.td @@ -328,6 +328,9 @@ defm CUBE_eg : CUBE_Common<0xC0>; def BCNT_INT : R600_1OP_Helper <0xAA, "BCNT_INT", ctpop, VecALU>; +def FFBH_UINT : R600_1OP_Helper <0xAB, "FFBH_UINT", ctlz_zero_undef, VecALU>; +def FFBL_INT : R600_1OP_Helper <0xAC, "FFBL_INT", cttz_zero_undef, VecALU>; + let hasSideEffects = 1 in { def MOVA_INT_eg : R600_1OP <0xCC, "MOVA_INT", [], VecALU>; } diff --git a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp index 11ae09102188..0927040cb5bc 100644 --- a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp +++ b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp @@ -99,9 +99,9 @@ void AMDGPUInstPrinter::printRegOperand(unsigned reg, raw_ostream &O) { return; } - // The low 8 bits encoding value is the register index, for both VGPRs and - // SGPRs. - unsigned RegIdx = MRI.getEncodingValue(reg) & ((1 << 8) - 1); + // The low 8 bits of the encoding value is the register index, for both VGPRs + // and SGPRs. + unsigned RegIdx = MRI.getEncodingValue(reg) & ((1 << 8) - 1); if (NumRegs == 1) { O << Type << RegIdx; return; @@ -216,13 +216,8 @@ void AMDGPUInstPrinter::printClamp(const MCInst *MI, unsigned OpNo, void AMDGPUInstPrinter::printLiteral(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - union Literal { - float f; - int32_t i; - } L; - - L.i = MI->getOperand(OpNo).getImm(); - O << L.i << "(" << L.f << ")"; + int32_t Imm = MI->getOperand(OpNo).getImm(); + O << Imm << '(' << BitsToFloat(Imm) << ')'; } void AMDGPUInstPrinter::printLast(const MCInst *MI, unsigned OpNo, diff --git a/lib/Target/R600/LLVMBuild.txt b/lib/Target/R600/LLVMBuild.txt index 408ed758dbed..908872b55cd2 100644 --- a/lib/Target/R600/LLVMBuild.txt +++ b/lib/Target/R600/LLVMBuild.txt @@ -28,5 +28,5 @@ has_asmprinter = 1 type = Library name = R600CodeGen parent = R600 -required_libraries = Analysis AsmPrinter CodeGen Core MC R600AsmPrinter R600Desc R600Info Scalar SelectionDAG Support Target TransformUtils +required_libraries = Analysis AsmPrinter CodeGen Core MC R600AsmPrinter R600Desc R600Info Scalar SelectionDAG Support Target add_to_library_groups = R600 diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp index 489cec742bca..d55f27b04554 100644 --- a/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp +++ b/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp @@ -9,9 +9,11 @@ //===----------------------------------------------------------------------===// #include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "MCTargetDesc/AMDGPUFixupKinds.h" #include "llvm/ADT/StringRef.h" #include "llvm/MC/MCAsmBackend.h" #include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCFixupKindInfo.h" #include "llvm/MC/MCObjectWriter.h" #include "llvm/MC/MCValue.h" #include "llvm/Support/TargetRegistry.h" @@ -43,7 +45,7 @@ class AMDGPUAsmBackend : public MCAsmBackend { AMDGPUAsmBackend(const Target &T) : MCAsmBackend() {} - unsigned getNumFixupKinds() const override { return 0; }; + unsigned getNumFixupKinds() const override { return AMDGPU::NumTargetFixupKinds; }; void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, uint64_t Value, bool IsPCRel) const override; bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, @@ -58,6 +60,8 @@ class AMDGPUAsmBackend : public MCAsmBackend { bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override { return true; } + + const MCFixupKindInfo 
&getFixupKindInfo(MCFixupKind Kind) const override; }; } //End anonymous namespace @@ -73,9 +77,43 @@ void AMDGPUAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, uint64_t Value, bool IsPCRel) const { - uint16_t *Dst = (uint16_t*)(Data + Fixup.getOffset()); - assert(Fixup.getKind() == FK_PCRel_4); - *Dst = (Value - 4) / 4; + switch ((unsigned)Fixup.getKind()) { + default: llvm_unreachable("Unknown fixup kind"); + case AMDGPU::fixup_si_sopp_br: { + uint16_t *Dst = (uint16_t*)(Data + Fixup.getOffset()); + *Dst = (Value - 4) / 4; + break; + } + + case AMDGPU::fixup_si_rodata: { + uint32_t *Dst = (uint32_t*)(Data + Fixup.getOffset()); + *Dst = Value; + break; + } + + case AMDGPU::fixup_si_end_of_text: { + uint32_t *Dst = (uint32_t*)(Data + Fixup.getOffset()); + // The value points to the last instruction in the text section, so we + // need to add 4 bytes to get to the start of the constants. + *Dst = Value + 4; + break; + } + } +} + +const MCFixupKindInfo &AMDGPUAsmBackend::getFixupKindInfo( + MCFixupKind Kind) const { + const static MCFixupKindInfo Infos[AMDGPU::NumTargetFixupKinds] = { + // name offset bits flags + { "fixup_si_sopp_br", 0, 16, MCFixupKindInfo::FKF_IsPCRel }, + { "fixup_si_rodata", 0, 32, 0 }, + { "fixup_si_end_of_text", 0, 32, MCFixupKindInfo::FKF_IsPCRel } + }; + + if (Kind < FirstTargetFixupKind) + return MCAsmBackend::getFixupKindInfo(Kind); + + return Infos[Kind - FirstTargetFixupKind]; } //===----------------------------------------------------------------------===// diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUELFObjectWriter.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUELFObjectWriter.cpp index 53b0e85751de..5fb94d5914d4 100644 --- a/lib/Target/R600/MCTargetDesc/AMDGPUELFObjectWriter.cpp +++ b/lib/Target/R600/MCTargetDesc/AMDGPUELFObjectWriter.cpp @@ -10,6 +10,7 @@ #include "AMDGPUMCTargetDesc.h" #include "llvm/MC/MCELFObjectWriter.h" +#include "llvm/MC/MCFixup.h" using namespace llvm; @@ -21,7 +22,7 @@ class AMDGPUELFObjectWriter : public MCELFObjectTargetWriter { protected: unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup, bool IsPCRel) const override { - llvm_unreachable("Not implemented"); + return Fixup.getKind(); } }; diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUFixupKinds.h b/lib/Target/R600/MCTargetDesc/AMDGPUFixupKinds.h new file mode 100644 index 000000000000..4b12e548a56f --- /dev/null +++ b/lib/Target/R600/MCTargetDesc/AMDGPUFixupKinds.h @@ -0,0 +1,34 @@ +//===-- AMDGPUFixupKinds.h - AMDGPU Specific Fixup Entries ------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_AMDGPUFIXUPKINDS_H +#define LLVM_AMDGPUFIXUPKINDS_H + +#include "llvm/MC/MCFixup.h" + +namespace llvm { +namespace AMDGPU { +enum Fixups { + /// 16-bit PC relative fixup for SOPP branch instructions. 
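+ /// (The simm16 field counts 4-byte words from the instruction following the branch, which is why applyFixup above encodes it as (Value - 4) / 4.)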
+ fixup_si_sopp_br = FirstTargetFixupKind, + + /// fixup for global addresses with constant initializers + fixup_si_rodata, + + /// fixup for offset from instruction to end of text section + fixup_si_end_of_text, + + // Marker + LastTargetFixupKind, + NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind +}; +} +} + +#endif // LLVM_AMDGPUFIXUPKINDS_H diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h b/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h index 6a5cd67bc0dc..d5e432de564c 100644 --- a/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h +++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h @@ -37,6 +37,12 @@ class AMDGPUMCCodeEmitter : public MCCodeEmitter { const MCSubtargetInfo &STI) const { return 0; } + + virtual unsigned getSOPPBrEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + return 0; + } }; } // End namespace llvm diff --git a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp index 5e7cefed0ace..dc1344fb8d3f 100644 --- a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp +++ b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp @@ -172,17 +172,13 @@ uint64_t R600MCCodeEmitter::getMachineOpValue(const MCInst &MI, SmallVectorImpl &Fixup, const MCSubtargetInfo &STI) const { if (MO.isReg()) { - if (HAS_NATIVE_OPERANDS(MCII.get(MI.getOpcode()).TSFlags)) { + if (HAS_NATIVE_OPERANDS(MCII.get(MI.getOpcode()).TSFlags)) return MRI.getEncodingValue(MO.getReg()); - } else { - return getHWReg(MO.getReg()); - } - } else if (MO.isImm()) { - return MO.getImm(); - } else { - assert(0); - return 0; + return getHWReg(MO.getReg()); } + + assert(MO.isImm()); + return MO.getImm(); } #include "AMDGPUGenMCCodeEmitter.inc" diff --git a/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp index ee021115ded2..78776c11d75d 100644 --- a/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp +++ b/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp @@ -13,8 +13,10 @@ // //===----------------------------------------------------------------------===// +#include "AMDGPU.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "MCTargetDesc/AMDGPUMCCodeEmitter.h" +#include "MCTargetDesc/AMDGPUFixupKinds.h" #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCFixup.h" @@ -39,6 +41,7 @@ class SIMCCodeEmitter : public AMDGPUMCCodeEmitter { void operator=(const SIMCCodeEmitter &) LLVM_DELETED_FUNCTION; const MCInstrInfo &MCII; const MCRegisterInfo &MRI; + MCContext &Ctx; /// \brief Can this operand also contain immediate values? bool isSrcOperand(const MCInstrDesc &Desc, unsigned OpNo) const; @@ -49,7 +52,7 @@ class SIMCCodeEmitter : public AMDGPUMCCodeEmitter { public: SIMCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri, MCContext &ctx) - : MCII(mcii), MRI(mri) { } + : MCII(mcii), MRI(mri), Ctx(ctx) { } ~SIMCCodeEmitter() { } @@ -62,6 +65,12 @@ class SIMCCodeEmitter : public AMDGPUMCCodeEmitter { uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO, SmallVectorImpl &Fixups, const MCSubtargetInfo &STI) const override; + + /// \brief Use a fixup to encode the simm16 field for SOPP branch + /// instructions. 
+ unsigned getSOPPBrEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const override; }; } // End anonymous namespace @@ -90,6 +99,8 @@ uint32_t SIMCCodeEmitter::getLitEncoding(const MCOperand &MO) const { Imm.I = MO.getImm(); else if (MO.isFPImm()) Imm.F = MO.getFPImm(); + else if (MO.isExpr()) + return 255; else return ~0; @@ -157,8 +168,13 @@ void SIMCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS, IntFloatUnion Imm; if (Op.isImm()) Imm.I = Op.getImm(); - else + else if (Op.isFPImm()) Imm.F = Op.getFPImm(); + else { + assert(Op.isExpr()); + // This will be replaced with a fixup value. + Imm.I = 0; + } for (unsigned j = 0; j < 4; j++) { OS.write((uint8_t) ((Imm.I >> (8 * j)) & 0xff)); @@ -169,6 +185,21 @@ void SIMCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS, } } +unsigned SIMCCodeEmitter::getSOPPBrEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpNo); + + if (MO.isExpr()) { + const MCExpr *Expr = MO.getExpr(); + MCFixupKind Kind = (MCFixupKind)AMDGPU::fixup_si_sopp_br; + Fixups.push_back(MCFixup::Create(0, Expr, Kind, MI.getLoc())); + return 0; + } + + return getMachineOpValue(MI, MO, Fixups, STI); +} + uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI, const MCOperand &MO, SmallVectorImpl &Fixups, @@ -177,10 +208,19 @@ uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI, return MRI.getEncodingValue(MO.getReg()); if (MO.isExpr()) { - const MCExpr *Expr = MO.getExpr(); - MCFixupKind Kind = MCFixupKind(FK_PCRel_4); - Fixups.push_back(MCFixup::Create(0, Expr, Kind, MI.getLoc())); - return 0; + const MCSymbolRefExpr *Expr = cast(MO.getExpr()); + MCFixupKind Kind; + const MCSymbol *Sym = + Ctx.GetOrCreateSymbol(StringRef(END_OF_TEXT_LABEL_NAME)); + + if (&Expr->getSymbol() == Sym) { + // Add the offset to the beginning of the constant values. + Kind = (MCFixupKind)AMDGPU::fixup_si_end_of_text; + } else { + // This is used for constant data stored in .rodata. 
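+ // (AMDGPUELFObjectWriter::GetRelocType currently just returns the fixup kind, so these fixup numbers double as the ELF relocation types for now.)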
+      Kind = (MCFixupKind)AMDGPU::fixup_si_rodata;
+    }
+    Fixups.push_back(MCFixup::Create(4, Expr, Kind, MI.getLoc()));
   }
 
   // Figure out the operand number, needed for isSrcOperand check
diff --git a/lib/Target/R600/R600ControlFlowFinalizer.cpp b/lib/Target/R600/R600ControlFlowFinalizer.cpp
index d98a6dbb37bd..e37767a0719d 100644
--- a/lib/Target/R600/R600ControlFlowFinalizer.cpp
+++ b/lib/Target/R600/R600ControlFlowFinalizer.cpp
@@ -481,14 +481,14 @@ class R600ControlFlowFinalizer : public MachineFunctionPass {
     TRI=static_cast<const R600RegisterInfo *>(MF.getTarget().getRegisterInfo());
     R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
 
-    CFStack CFStack(ST, MFI->ShaderType);
+    CFStack CFStack(ST, MFI->getShaderType());
     for (MachineFunction::iterator MB = MF.begin(), ME = MF.end(); MB != ME;
         ++MB) {
       MachineBasicBlock &MBB = *MB;
       unsigned CfCount = 0;
       std::vector<std::pair<unsigned, std::set<MachineInstr *> > > LoopStack;
       std::vector<MachineInstr *> IfThenElseStack;
-      if (MFI->ShaderType == 1) {
+      if (MFI->getShaderType() == ShaderType::VERTEX) {
         BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()),
             getHWInstrDesc(CF_CALL_FS));
         CfCount++;
diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp
index f0e13e56d8ff..52315bf0f338 100644
--- a/lib/Target/R600/R600ISelLowering.cpp
+++ b/lib/Target/R600/R600ISelLowering.cpp
@@ -13,12 +13,13 @@
 //===----------------------------------------------------------------------===//
 
 #include "R600ISelLowering.h"
-#include "AMDILIntrinsicInfo.h"
 #include "AMDGPUFrameLowering.h"
+#include "AMDGPUIntrinsicInfo.h"
 #include "AMDGPUSubtarget.h"
 #include "R600Defines.h"
 #include "R600InstrInfo.h"
 #include "R600MachineFunctionInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -68,6 +69,7 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
 
   setOperationAction(ISD::BR_CC, MVT::i32, Expand);
   setOperationAction(ISD::BR_CC, MVT::f32, Expand);
+  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
 
   setOperationAction(ISD::FSUB, MVT::f32, Expand);
 
@@ -81,6 +83,8 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
   setOperationAction(ISD::SETCC, MVT::i32, Expand);
   setOperationAction(ISD::SETCC, MVT::f32, Expand);
   setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);
+  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
+  setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
 
   setOperationAction(ISD::SELECT, MVT::i32, Expand);
   setOperationAction(ISD::SELECT, MVT::f32, Expand);
@@ -152,10 +156,14 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
   setTargetDAGCombine(ISD::SELECT_CC);
   setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
 
+  setOperationAction(ISD::SUB, MVT::i64, Expand);
+
   // These should be replaced by UDIVREM, but it does not happen automatically
   // during Type Legalization
   setOperationAction(ISD::UDIV, MVT::i64, Custom);
   setOperationAction(ISD::UREM, MVT::i64, Custom);
+  setOperationAction(ISD::SDIV, MVT::i64, Custom);
+  setOperationAction(ISD::SREM, MVT::i64, Custom);
 
   // We don't have 64-bit shifts. Thus we need either SHX i64 or SHX_PARTS i32
   // to be Legal/Custom in order to avoid library calls.
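Aside on the shift comment above: since R600 lacks native 64-bit shifts, an i64 shift has to be stitched together from 32-bit halves, which is what the PARTS nodes expand to. The following self-contained C++ sketch shows the computation for a logical shift-left; the function name and the scalar setting are illustrative, not code from this patch, and it assumes the shift amount is already reduced below 64, as ISD::SHL requires.

#include <cassert>
#include <cstdint>
#include <cstdio>

// Sketch: a 64-bit logical shift-left built from two 32-bit halves, the
// same splitting a SHL_PARTS-style expansion performs.
static uint64_t shl64_from_parts(uint32_t lo, uint32_t hi, unsigned amt) {
  assert(amt < 64 && "shift amount must already be in range");
  uint32_t outLo, outHi;
  if (amt == 0) {
    outLo = lo;
    outHi = hi;
  } else if (amt < 32) {
    outLo = lo << amt;
    outHi = (hi << amt) | (lo >> (32 - amt)); // carry bits across the halves
  } else {
    // The low half shifts entirely into the high half.
    outLo = 0;
    outHi = lo << (amt - 32);
  }
  return ((uint64_t)outHi << 32) | outLo;
}

int main() {
  uint64_t v = 0x00000001deadbeefULL;
  for (unsigned amt : {0u, 4u, 31u, 32u, 40u}) {
    bool ok = shl64_from_parts((uint32_t)v, (uint32_t)(v >> 32), amt) == (v << amt);
    printf("amt=%u ok=%d\n", amt, ok);
  }
  return 0;
}

The amt >= 32 case is why branch-free expansions of this pattern typically select between two recombinations rather than emitting straight-line shifts.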
@@ -165,6 +173,14 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) : setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); + const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 }; + for (MVT VT : ScalarIntVTs) { + setOperationAction(ISD::ADDC, VT, Expand); + setOperationAction(ISD::SUBC, VT, Expand); + setOperationAction(ISD::ADDE, VT, Expand); + setOperationAction(ISD::SUBE, VT, Expand); + } + setBooleanContents(ZeroOrNegativeOneBooleanContent); setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); setSchedulingPreference(Sched::Source); @@ -565,7 +581,15 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const case ISD::FSIN: return LowerTrig(Op, DAG); case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); case ISD::STORE: return LowerSTORE(Op, DAG); - case ISD::LOAD: return LowerLOAD(Op, DAG); + case ISD::LOAD: { + SDValue Result = LowerLOAD(Op, DAG); + assert((!Result.getNode() || + Result.getNode()->getNumValues() == 2) && + "Load should return a value and a chain"); + return Result; + } + + case ISD::BRCOND: return LowerBRCOND(Op, DAG); case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG); case ISD::INTRINSIC_VOID: { SDValue Chain = Op.getOperand(0); @@ -800,6 +824,9 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const case Intrinsic::r600_read_tidig_z: return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, AMDGPU::T0_Z, VT); + case Intrinsic::AMDGPU_rsq: + // XXX - I'm assuming SI's RSQ_LEGACY matches R600's behavior. + return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1)); } // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode()) break; @@ -815,21 +842,135 @@ void R600TargetLowering::ReplaceNodeResults(SDNode *N, default: AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG); return; - case ISD::FP_TO_UINT: Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG)); - return; - case ISD::LOAD: { - SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode(); - Results.push_back(SDValue(Node, 0)); - Results.push_back(SDValue(Node, 1)); - // XXX: LLVM seems not to replace Chain Value inside CustomWidenLowerNode - // function - DAG.ReplaceAllUsesOfValueWith(SDValue(N,1), SDValue(Node, 1)); + case ISD::FP_TO_UINT: + if (N->getValueType(0) == MVT::i1) { + Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG)); + return; + } + // Fall-through. Since we don't care about out of bounds values + // we can use FP_TO_SINT for uints too. The DAGLegalizer code for uint + // considers some extra cases which are not necessary here. 
+ case ISD::FP_TO_SINT: { + SDValue Result; + if (expandFP_TO_SINT(N, Result, DAG)) + Results.push_back(Result); return; } - case ISD::STORE: - SDNode *Node = LowerSTORE(SDValue(N, 0), DAG).getNode(); - Results.push_back(SDValue(Node, 0)); - return; + case ISD::UDIV: { + SDValue Op = SDValue(N, 0); + SDLoc DL(Op); + EVT VT = Op.getValueType(); + SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), + N->getOperand(0), N->getOperand(1)); + Results.push_back(UDIVREM); + break; + } + case ISD::UREM: { + SDValue Op = SDValue(N, 0); + SDLoc DL(Op); + EVT VT = Op.getValueType(); + SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), + N->getOperand(0), N->getOperand(1)); + Results.push_back(UDIVREM.getValue(1)); + break; + } + case ISD::SDIV: { + SDValue Op = SDValue(N, 0); + SDLoc DL(Op); + EVT VT = Op.getValueType(); + SDValue SDIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(VT, VT), + N->getOperand(0), N->getOperand(1)); + Results.push_back(SDIVREM); + break; + } + case ISD::SREM: { + SDValue Op = SDValue(N, 0); + SDLoc DL(Op); + EVT VT = Op.getValueType(); + SDValue SDIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(VT, VT), + N->getOperand(0), N->getOperand(1)); + Results.push_back(SDIVREM.getValue(1)); + break; + } + case ISD::SDIVREM: { + SDValue Op = SDValue(N, 1); + SDValue RES = LowerSDIVREM(Op, DAG); + Results.push_back(RES); + Results.push_back(RES.getValue(1)); + break; + } + case ISD::UDIVREM: { + SDValue Op = SDValue(N, 0); + SDLoc DL(Op); + EVT VT = Op.getValueType(); + EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext()); + + SDValue one = DAG.getConstant(1, HalfVT); + SDValue zero = DAG.getConstant(0, HalfVT); + + //HiLo split + SDValue LHS = N->getOperand(0); + SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, zero); + SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, one); + + SDValue RHS = N->getOperand(1); + SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, zero); + SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, one); + + // Get Speculative values + SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo); + SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo); + + SDValue REM_Hi = zero; + SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, zero, REM_Part, LHS_Hi, ISD::SETEQ); + + SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, zero, DIV_Part, zero, ISD::SETEQ); + SDValue DIV_Lo = zero; + + const unsigned halfBitWidth = HalfVT.getSizeInBits(); + + for (unsigned i = 0; i < halfBitWidth; ++i) { + SDValue POS = DAG.getConstant(halfBitWidth - i - 1, HalfVT); + // Get Value of high bit + SDValue HBit; + if (halfBitWidth == 32 && Subtarget->hasBFE()) { + HBit = DAG.getNode(AMDGPUISD::BFE_U32, DL, HalfVT, LHS_Lo, POS, one); + } else { + HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS); + HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, one); + } + + SDValue Carry = DAG.getNode(ISD::SRL, DL, HalfVT, REM_Lo, + DAG.getConstant(halfBitWidth - 1, HalfVT)); + REM_Hi = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Hi, one); + REM_Hi = DAG.getNode(ISD::OR, DL, HalfVT, REM_Hi, Carry); + + REM_Lo = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Lo, one); + REM_Lo = DAG.getNode(ISD::OR, DL, HalfVT, REM_Lo, HBit); + + + SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi); + + SDValue BIT = DAG.getConstant(1 << (halfBitWidth - i - 1), HalfVT); + SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, zero, ISD::SETGE); + + DIV_Lo = DAG.getNode(ISD::OR, DL, 
HalfVT, DIV_Lo, realBIT);
+
+      // Update REM
+
+      SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
+
+      REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETGE);
+      REM_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, zero);
+      REM_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, one);
+    }
+
+    SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi);
+    SDValue DIV = DAG.getNode(ISD::BUILD_PAIR, DL, VT, DIV_Lo, DIV_Hi);
+    Results.push_back(DIV);
+    Results.push_back(REM);
+    break;
+  }
   }
 }
@@ -1386,6 +1527,19 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
     return DAG.getMergeValues(Ops, DL);
   }
 
+  // Lower loads whose underlying object is a global variable in the
+  // constant address space.
+  if (LoadNode->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
+      isa<GlobalVariable>(
+          GetUnderlyingObject(LoadNode->getMemOperand()->getValue()))) {
+
+    SDValue Ptr = DAG.getZExtOrTrunc(LoadNode->getBasePtr(), DL,
+        getPointerTy(AMDGPUAS::PRIVATE_ADDRESS));
+    Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
+        DAG.getConstant(2, MVT::i32));
+    return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op->getVTList(),
+                       LoadNode->getChain(), Ptr,
+                       DAG.getTargetConstant(0, MVT::i32), Op.getOperand(2));
+  }
 
   if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) {
     SDValue MergedValues[2] = {
@@ -1517,6 +1671,15 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
   return DAG.getMergeValues(Ops, DL);
 }
 
+SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
+  SDValue Chain = Op.getOperand(0);
+  SDValue Cond  = Op.getOperand(1);
+  SDValue Jump  = Op.getOperand(2);
+
+  return DAG.getNode(AMDGPUISD::BRANCH_COND, SDLoc(Op), Op.getValueType(),
+                     Chain, Jump, Cond);
+}
+
 /// XXX Only kernel functions are supported, so we can assume for now that
 /// every function is a kernel function, but in the future we should use
 /// separate calling conventions for kernel and non-kernel functions.
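For readers tracing the ISD::UDIVREM expansion above: it is classic restoring division, with a speculative divide of the high halves when the divisor fits in 32 bits, followed by one quotient bit per iteration while dividend bits shift into the running remainder. Below is a standalone C++ model of the same loop on plain integers. It is a sketch under two assumptions the DAG code also makes implicitly: the divisor is non-zero, and it stays below 2^63 so the shifted remainder cannot overflow its 64-bit container. All names are mine, not the patch's.

#include <cstdint>
#include <cstdio>

// Model of the shift-and-subtract loop built as DAG nodes above.
static void udivrem64(uint64_t lhs, uint64_t rhs, uint64_t &div, uint64_t &rem) {
  uint32_t lhsLo = (uint32_t)lhs, lhsHi = (uint32_t)(lhs >> 32);
  uint32_t rhsLo = (uint32_t)rhs, rhsHi = (uint32_t)(rhs >> 32);

  // "Get Speculative values": if RHS fits in 32 bits, divide the high
  // halves up front; otherwise the quotient's high half is zero.
  uint64_t remAcc = (rhsHi == 0) ? (lhsHi % rhsLo) : lhsHi;
  uint32_t divHi  = (rhsHi == 0) ? (lhsHi / rhsLo) : 0;
  uint32_t divLo  = 0;

  for (unsigned i = 0; i < 32; ++i) {
    unsigned pos = 32 - i - 1;
    uint32_t hbit = (lhsLo >> pos) & 1; // next dividend bit (the BFE_U32 path)
    remAcc = (remAcc << 1) | hbit;      // shift it into the remainder
    if (remAcc >= rhs) {                // the getSelectCC(..., SETGE) pair
      divLo |= 1u << pos;
      remAcc -= rhs;
    }
  }
  div = ((uint64_t)divHi << 32) | divLo;
  rem = remAcc;
}

int main() {
  uint64_t d, r;
  udivrem64(0xdeadbeefcafef00dULL, 0x12345ULL, d, r);
  printf("div ok=%d rem ok=%d\n",
         d == 0xdeadbeefcafef00dULL / 0x12345ULL,
         r == 0xdeadbeefcafef00dULL % 0x12345ULL);
  return 0;
}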
@@ -1531,7 +1694,7 @@ SDValue R600TargetLowering::LowerFormalArguments(
   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
                  getTargetMachine(), ArgLocs, *DAG.getContext());
   MachineFunction &MF = DAG.getMachineFunction();
-  unsigned ShaderType = MF.getInfo<R600MachineFunctionInfo>()->ShaderType;
+  unsigned ShaderType = MF.getInfo<R600MachineFunctionInfo>()->getShaderType();
 
   SmallVector<ISD::InputArg, 8> LocalIns;
 
@@ -2050,9 +2213,8 @@ SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
   SDValue FakeOp;
 
   std::vector<SDValue> Ops;
-  for(SDNode::op_iterator I = Node->op_begin(), E = Node->op_end();
-      I != E; ++I)
-    Ops.push_back(*I);
+  for (const SDUse &I : Node->ops())
+    Ops.push_back(I);
 
   if (Opcode == AMDGPU::DOT_4) {
     int OperandIdx[] = {
diff --git a/lib/Target/R600/R600ISelLowering.h b/lib/Target/R600/R600ISelLowering.h
index 381642aa600c..d22c8c98a542 100644
--- a/lib/Target/R600/R600ISelLowering.h
+++ b/lib/Target/R600/R600ISelLowering.h
@@ -59,6 +59,7 @@ class R600TargetLowering : public AMDGPUTargetLowering {
   SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSHLParts(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSRXParts(SDValue Op, SelectionDAG &DAG) const;
diff --git a/lib/Target/R600/R600InstrInfo.cpp b/lib/Target/R600/R600InstrInfo.cpp
index 3972e2f03730..99920b7761a7 100644
--- a/lib/Target/R600/R600InstrInfo.cpp
+++ b/lib/Target/R600/R600InstrInfo.cpp
@@ -92,10 +92,6 @@ bool R600InstrInfo::isLegalToSplitMBBAt(MachineBasicBlock &MBB,
   return true;
 }
 
-unsigned R600InstrInfo::getIEQOpcode() const {
-  return AMDGPU::SETE_INT;
-}
-
 bool R600InstrInfo::isMov(unsigned Opcode) const {
 
 
@@ -209,8 +205,10 @@ bool R600InstrInfo::usesVertexCache(unsigned Opcode) const {
 }
 
 bool R600InstrInfo::usesVertexCache(const MachineInstr *MI) const {
-  const R600MachineFunctionInfo *MFI = MI->getParent()->getParent()->getInfo<R600MachineFunctionInfo>();
-  return MFI->ShaderType != ShaderType::COMPUTE && usesVertexCache(MI->getOpcode());
+  const MachineFunction *MF = MI->getParent()->getParent();
+  const R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
+  return MFI->getShaderType() != ShaderType::COMPUTE &&
+         usesVertexCache(MI->getOpcode());
 }
 
 bool R600InstrInfo::usesTextureCache(unsigned Opcode) const {
@@ -218,9 +216,11 @@ bool R600InstrInfo::usesTextureCache(unsigned Opcode) const {
 }
 
 bool R600InstrInfo::usesTextureCache(const MachineInstr *MI) const {
-  const R600MachineFunctionInfo *MFI = MI->getParent()->getParent()->getInfo<R600MachineFunctionInfo>();
-  return (MFI->ShaderType == ShaderType::COMPUTE && usesVertexCache(MI->getOpcode())) ||
-         usesTextureCache(MI->getOpcode());
+  const MachineFunction *MF = MI->getParent()->getParent();
+  const R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
+  return (MFI->getShaderType() == ShaderType::COMPUTE &&
+          usesVertexCache(MI->getOpcode())) ||
+          usesTextureCache(MI->getOpcode());
 }
 
 bool R600InstrInfo::mustBeLastInClause(unsigned Opcode) const {
@@ -319,7 +319,7 @@ R600InstrInfo::getSrcs(MachineInstr *MI) const {
       Result.push_back(std::pair<MachineOperand *, int64_t>(&MO, Sel));
       continue;
     }
-
+
   }
   return Result;
 }
diff --git a/lib/Target/R600/R600InstrInfo.h b/lib/Target/R600/R600InstrInfo.h
index 45a57d367b8b..1c3cb637a178 100644
--- a/lib/Target/R600/R600InstrInfo.h
+++ b/lib/Target/R600/R600InstrInfo.h
@@ -152,7 +152,6 @@ namespace llvm {
   /// instruction slots within an instruction group.
bool isVector(const MachineInstr &MI) const; - unsigned getIEQOpcode() const override; bool isMov(unsigned Opcode) const override; DFAPacketizer *CreateTargetScheduleState(const TargetMachine *TM, diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td index 58c704d8ec85..704507d368ec 100644 --- a/lib/Target/R600/R600Instructions.td +++ b/lib/Target/R600/R600Instructions.td @@ -216,7 +216,7 @@ class R600_REDUCTION inst, dag ins, string asm, list pattern, def TEX_SHADOW : PatLeaf< (imm), [{uint32_t TType = (uint32_t)N->getZExtValue(); - return (TType >= 6 && TType <= 8) || (TType >= 11 && TType <= 13); + return (TType >= 6 && TType <= 8) || TType == 13; }] >; @@ -721,14 +721,11 @@ def SETNE_DX10 : R600_2OP < >; def FRACT : R600_1OP_Helper <0x10, "FRACT", AMDGPUfract>; -def TRUNC : R600_1OP_Helper <0x11, "TRUNC", int_AMDGPU_trunc>; +def TRUNC : R600_1OP_Helper <0x11, "TRUNC", ftrunc>; def CEIL : R600_1OP_Helper <0x12, "CEIL", fceil>; def RNDNE : R600_1OP_Helper <0x13, "RNDNE", frint>; def FLOOR : R600_1OP_Helper <0x14, "FLOOR", ffloor>; -// Add also ftrunc intrinsic pattern -def : Pat<(ftrunc f32:$src0), (TRUNC $src0)>; - def MOV : R600_1OP <0x19, "MOV", []>; let isPseudo = 1, isCodeGenOnly = 1, usesCustomInserter = 1 in { @@ -1082,18 +1079,21 @@ class RECIP_UINT_Common inst> : R600_1OP_Helper < let Itinerary = TransALU; } +// Clamped to maximum. class RECIPSQRT_CLAMPED_Common inst> : R600_1OP_Helper < - inst, "RECIPSQRT_CLAMPED", int_AMDGPU_rsq + inst, "RECIPSQRT_CLAMPED", AMDGPUrsq_clamped > { let Itinerary = TransALU; } -class RECIPSQRT_IEEE_Common inst> : R600_1OP < - inst, "RECIPSQRT_IEEE", [] +class RECIPSQRT_IEEE_Common inst> : R600_1OP_Helper < + inst, "RECIPSQRT_IEEE", AMDGPUrsq_legacy > { let Itinerary = TransALU; } +// TODO: There is also RECIPSQRT_FF which clamps to zero. 
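On the RECIPSQRT renamings in the hunk above: RECIPSQRT_IEEE (now fed by AMDGPUrsq_legacy) is an ordinary reciprocal-square-root approximation, and my reading of the _CLAMPED variant (now AMDGPUrsq_clamped, matching the new "Clamped to maximum." comment) is that it clamps the infinity produced at x == 0 to the largest finite float so downstream arithmetic stays finite. A scalar C++ model, under that assumption only:

#include <cfloat>
#include <cmath>
#include <cstdio>

// Plain 1/sqrt(x): this is the IEEE-flavored behavior.
static float rsq_ieee(float x) { return 1.0f / sqrtf(x); }

// Assumed semantics of the CLAMPED flavor: infinities from x == 0 are
// clamped into the finite range. Treat as an illustration, not ISA truth.
static float rsq_clamped(float x) {
  float r = rsq_ieee(x);
  if (std::isinf(r))
    return r > 0.0f ? FLT_MAX : -FLT_MAX;
  return r;
}

int main() {
  printf("rsq_ieee(4)    = %g\n", rsq_ieee(4.0f));     // 0.5
  printf("rsq_ieee(0)    = %g\n", rsq_ieee(0.0f));     // inf
  printf("rsq_clamped(0) = %g\n", rsq_clamped(0.0f));  // FLT_MAX
  return 0;
}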
+ class SIN_Common inst> : R600_1OP < inst, "SIN", [(set f32:$dst, (SIN_HW f32:$src0))]>{ let Trig = 1; diff --git a/lib/Target/R600/SIAnnotateControlFlow.cpp b/lib/Target/R600/SIAnnotateControlFlow.cpp index d6e445136faf..91eb60beb6bb 100644 --- a/lib/Target/R600/SIAnnotateControlFlow.cpp +++ b/lib/Target/R600/SIAnnotateControlFlow.cpp @@ -65,7 +65,6 @@ class SIAnnotateControlFlow : public FunctionPass { DominatorTree *DT; StackVector Stack; - SSAUpdater PhiInserter; bool isTopOfStack(BasicBlock *BB); @@ -81,7 +80,7 @@ class SIAnnotateControlFlow : public FunctionPass { void insertElse(BranchInst *Term); - void handleLoopCondition(Value *Cond); + Value *handleLoopCondition(Value *Cond, PHINode *Broken); void handleLoop(BranchInst *Term); @@ -177,7 +176,7 @@ bool SIAnnotateControlFlow::isElse(PHINode *Phi) { } else { if (Phi->getIncomingValue(i) != BoolFalse) return false; - + } } return true; @@ -204,20 +203,26 @@ void SIAnnotateControlFlow::insertElse(BranchInst *Term) { } /// \brief Recursively handle the condition leading to a loop -void SIAnnotateControlFlow::handleLoopCondition(Value *Cond) { +Value *SIAnnotateControlFlow::handleLoopCondition(Value *Cond, PHINode *Broken) { if (PHINode *Phi = dyn_cast(Cond)) { + BasicBlock *Parent = Phi->getParent(); + PHINode *NewPhi = PHINode::Create(Int64, 0, "", &Parent->front()); + Value *Ret = NewPhi; // Handle all non-constant incoming values first for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) { Value *Incoming = Phi->getIncomingValue(i); - if (isa(Incoming)) + BasicBlock *From = Phi->getIncomingBlock(i); + if (isa(Incoming)) { + NewPhi->addIncoming(Broken, From); continue; + } Phi->setIncomingValue(i, BoolFalse); - handleLoopCondition(Incoming); + Value *PhiArg = handleLoopCondition(Incoming, Broken); + NewPhi->addIncoming(PhiArg, From); } - BasicBlock *Parent = Phi->getParent(); BasicBlock *IDom = DT->getNode(Parent)->getIDom()->getBlock(); for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) { @@ -230,33 +235,28 @@ void SIAnnotateControlFlow::handleLoopCondition(Value *Cond) { if (From == IDom) { CallInst *OldEnd = dyn_cast(Parent->getFirstInsertionPt()); if (OldEnd && OldEnd->getCalledFunction() == EndCf) { - Value *Args[] = { - OldEnd->getArgOperand(0), - PhiInserter.GetValueAtEndOfBlock(Parent) - }; - Value *Ret = CallInst::Create(ElseBreak, Args, "", OldEnd); - PhiInserter.AddAvailableValue(Parent, Ret); + Value *Args[] = { OldEnd->getArgOperand(0), NewPhi }; + Ret = CallInst::Create(ElseBreak, Args, "", OldEnd); continue; } } - TerminatorInst *Insert = From->getTerminator(); - Value *Arg = PhiInserter.GetValueAtEndOfBlock(From); - Value *Ret = CallInst::Create(Break, Arg, "", Insert); - PhiInserter.AddAvailableValue(From, Ret); + Value *PhiArg = CallInst::Create(Break, Broken, "", Insert); + NewPhi->setIncomingValue(i, PhiArg); } eraseIfUnused(Phi); + return Ret; } else if (Instruction *Inst = dyn_cast(Cond)) { BasicBlock *Parent = Inst->getParent(); TerminatorInst *Insert = Parent->getTerminator(); - Value *Args[] = { Cond, PhiInserter.GetValueAtEndOfBlock(Parent) }; - Value *Ret = CallInst::Create(IfBreak, Args, "", Insert); - PhiInserter.AddAvailableValue(Parent, Ret); + Value *Args[] = { Cond, Broken }; + return CallInst::Create(IfBreak, Args, "", Insert); } else { llvm_unreachable("Unhandled loop condition!"); } + return 0; } /// \brief Handle a back edge (loop) @@ -264,15 +264,11 @@ void SIAnnotateControlFlow::handleLoop(BranchInst *Term) { BasicBlock *Target = Term->getSuccessor(1); PHINode 
*Broken = PHINode::Create(Int64, 0, "", &Target->front()); - PhiInserter.Initialize(Int64, ""); - PhiInserter.AddAvailableValue(Target, Broken); - Value *Cond = Term->getCondition(); Term->setCondition(BoolTrue); - handleLoopCondition(Cond); + Value *Arg = handleLoopCondition(Cond, Broken); BasicBlock *BB = Term->getParent(); - Value *Arg = PhiInserter.GetValueAtEndOfBlock(BB); for (pred_iterator PI = pred_begin(Target), PE = pred_end(Target); PI != PE; ++PI) { diff --git a/lib/Target/R600/SIDefines.h b/lib/Target/R600/SIDefines.h index 2cbce282cbe7..b7e7a2d000b3 100644 --- a/lib/Target/R600/SIDefines.h +++ b/lib/Target/R600/SIDefines.h @@ -32,7 +32,61 @@ enum { #define S_00B028_VGPRS(x) (((x) & 0x3F) << 0) #define S_00B028_SGPRS(x) (((x) & 0x0F) << 6) #define R_00B84C_COMPUTE_PGM_RSRC2 0x00B84C +#define S_00B02C_SCRATCH_EN(x) (((x) & 0x1) << 0) #define S_00B84C_LDS_SIZE(x) (((x) & 0x1FF) << 15) #define R_0286CC_SPI_PS_INPUT_ENA 0x0286CC + +#define R_00B848_COMPUTE_PGM_RSRC1 0x00B848 +#define S_00B848_VGPRS(x) (((x) & 0x3F) << 0) +#define G_00B848_VGPRS(x) (((x) >> 0) & 0x3F) +#define C_00B848_VGPRS 0xFFFFFFC0 +#define S_00B848_SGPRS(x) (((x) & 0x0F) << 6) +#define G_00B848_SGPRS(x) (((x) >> 6) & 0x0F) +#define C_00B848_SGPRS 0xFFFFFC3F +#define S_00B848_PRIORITY(x) (((x) & 0x03) << 10) +#define G_00B848_PRIORITY(x) (((x) >> 10) & 0x03) +#define C_00B848_PRIORITY 0xFFFFF3FF +#define S_00B848_FLOAT_MODE(x) (((x) & 0xFF) << 12) +#define G_00B848_FLOAT_MODE(x) (((x) >> 12) & 0xFF) +#define C_00B848_FLOAT_MODE 0xFFF00FFF +#define S_00B848_PRIV(x) (((x) & 0x1) << 20) +#define G_00B848_PRIV(x) (((x) >> 20) & 0x1) +#define C_00B848_PRIV 0xFFEFFFFF +#define S_00B848_DX10_CLAMP(x) (((x) & 0x1) << 21) +#define G_00B848_DX10_CLAMP(x) (((x) >> 21) & 0x1) +#define C_00B848_DX10_CLAMP 0xFFDFFFFF +#define S_00B848_DEBUG_MODE(x) (((x) & 0x1) << 22) +#define G_00B848_DEBUG_MODE(x) (((x) >> 22) & 0x1) +#define C_00B848_DEBUG_MODE 0xFFBFFFFF +#define S_00B848_IEEE_MODE(x) (((x) & 0x1) << 23) +#define G_00B848_IEEE_MODE(x) (((x) >> 23) & 0x1) +#define C_00B848_IEEE_MODE 0xFF7FFFFF + + +// Helpers for setting FLOAT_MODE +#define FP_ROUND_ROUND_TO_NEAREST 0 +#define FP_ROUND_ROUND_TO_INF 1 +#define FP_ROUND_ROUND_TO_NEGINF 2 +#define FP_ROUND_ROUND_TO_ZERO 3 + +// Bits 3:0 control rounding mode. 1:0 control single precision, 3:2 double +// precision. +#define FP_ROUND_MODE_SP(x) ((x) & 0x3) +#define FP_ROUND_MODE_DP(x) (((x) & 0x3) << 2) + +#define FP_DENORM_FLUSH_IN_FLUSH_OUT 0 +#define FP_DENORM_FLUSH_OUT 1 +#define FP_DENORM_FLUSH_IN 2 +#define FP_DENORM_FLUSH_NONE 3 + + +// Bits 7:4 control denormal handling. 5:4 control single precision, 6:7 double +// precision. +#define FP_DENORM_MODE_SP(x) (((x) & 0x3) << 4) +#define FP_DENORM_MODE_DP(x) (((x) & 0x3) << 6) + +#define R_00B860_COMPUTE_TMPRING_SIZE 0x00B860 +#define S_00B860_WAVESIZE(x) (((x) & 0x1FFF) << 12) + #endif // SIDEFINES_H_ diff --git a/lib/Target/R600/SIFixSGPRLiveRanges.cpp b/lib/Target/R600/SIFixSGPRLiveRanges.cpp new file mode 100644 index 000000000000..7d116eef396c --- /dev/null +++ b/lib/Target/R600/SIFixSGPRLiveRanges.cpp @@ -0,0 +1,110 @@ +//===-- SIFixSGPRLiveRanges.cpp - Fix SGPR live ranges ----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// SALU instructions ignore control flow, so we need to modify the live ranges
+/// of the registers they define.
+///
+/// The strategy is to view the entire program as if it were a single basic
+/// block and calculate the intervals accordingly. We implement this
+/// by walking the list of segments for each LiveRange and setting the
+/// end of each segment equal to the start of the segment that immediately
+/// follows it.
+
+#include "AMDGPU.h"
+#include "SIRegisterInfo.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-fix-sgpr-live-ranges"
+
+namespace {
+
+class SIFixSGPRLiveRanges : public MachineFunctionPass {
+public:
+  static char ID;
+
+public:
+  SIFixSGPRLiveRanges() : MachineFunctionPass(ID) {
+    initializeSIFixSGPRLiveRangesPass(*PassRegistry::getPassRegistry());
+  }
+
+  virtual bool runOnMachineFunction(MachineFunction &MF) override;
+
+  virtual const char *getPassName() const override {
+    return "SI Fix SGPR live ranges";
+  }
+
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<LiveIntervals>();
+    AU.addPreserved<LiveIntervals>();
+    AU.addPreserved<SlotIndexes>();
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS_BEGIN(SIFixSGPRLiveRanges, DEBUG_TYPE,
+                      "SI Fix SGPR Live Ranges", false, false)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_END(SIFixSGPRLiveRanges, DEBUG_TYPE,
+                    "SI Fix SGPR Live Ranges", false, false)
+
+char SIFixSGPRLiveRanges::ID = 0;
+
+char &llvm::SIFixSGPRLiveRangesID = SIFixSGPRLiveRanges::ID;
+
+FunctionPass *llvm::createSIFixSGPRLiveRangesPass() {
+  return new SIFixSGPRLiveRanges();
+}
+
+bool SIFixSGPRLiveRanges::runOnMachineFunction(MachineFunction &MF) {
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(
+      MF.getTarget().getRegisterInfo());
+  LiveIntervals *LIS = &getAnalysis<LiveIntervals>();
+
+  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
+       BI != BE; ++BI) {
+
+    MachineBasicBlock &MBB = *BI;
+    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
+         I != E; ++I) {
+      MachineInstr &MI = *I;
+      // Skip instructions that read EXEC; those already respect control flow.
+      MachineOperand *ExecUse = MI.findRegisterUseOperand(AMDGPU::EXEC);
+      if (ExecUse)
+        continue;
+
+      for (const MachineOperand &Def : MI.operands()) {
+        if (!Def.isReg() || !Def.isDef() ||
+            !TargetRegisterInfo::isVirtualRegister(Def.getReg()))
+          continue;
+
+        const TargetRegisterClass *RC = MRI.getRegClass(Def.getReg());
+
+        if (!TRI->isSGPRClass(RC))
+          continue;
+        LiveInterval &LI = LIS->getInterval(Def.getReg());
+        for (unsigned i = 0, e = LI.size() - 1; i != e; ++i) {
+          LiveRange::Segment &Seg = LI.segments[i];
+          LiveRange::Segment &Next = LI.segments[i + 1];
+          Seg.end = Next.start;
+        }
+      }
+    }
+  }
+
+  return false;
+}
diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp
index 4e61d5b03aa6..5a148a24810a 100644
--- a/lib/Target/R600/SIISelLowering.cpp
+++ b/lib/Target/R600/SIISelLowering.cpp
@@ -12,10 +12,16 @@
 //
 //===----------------------------------------------------------------------===//
 
+#ifdef _MSC_VER
+// Provide M_PI.
+#define _USE_MATH_DEFINES
+#include <cmath>
+#endif
+
 #include "SIISelLowering.h"
 #include "AMDGPU.h"
+#include "AMDGPUIntrinsicInfo.h"
 #include "AMDGPUSubtarget.h"
-#include "AMDILIntrinsicInfo.h"
 #include "SIInstrInfo.h"
 #include "SIMachineFunctionInfo.h"
 #include "SIRegisterInfo.h"
@@ -77,9 +83,13 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
   setOperationAction(ISD::ADD, MVT::i32, Legal);
   setOperationAction(ISD::ADDC, MVT::i32, Legal);
   setOperationAction(ISD::ADDE, MVT::i32, Legal);
+  setOperationAction(ISD::SUBC, MVT::i32, Legal);
+  setOperationAction(ISD::SUBE, MVT::i32, Legal);
+
+  setOperationAction(ISD::FSIN, MVT::f32, Custom);
+  setOperationAction(ISD::FCOS, MVT::f32, Custom);
 
   // We need to custom lower vector stores from local memory
-  setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
   setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
   setOperationAction(ISD::LOAD, MVT::v16i32, Custom);
 
   setOperationAction(ISD::STORE, MVT::v8i32, Custom);
   setOperationAction(ISD::STORE, MVT::v16i32, Custom);
@@ -87,16 +97,8 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
-  // We need to custom lower loads/stores from private memory
-  setOperationAction(ISD::LOAD, MVT::i32, Custom);
-  setOperationAction(ISD::LOAD, MVT::i64, Custom);
-  setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
-  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
-  setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
-
   setOperationAction(ISD::STORE, MVT::i1, Custom);
   setOperationAction(ISD::STORE, MVT::i32, Custom);
-  setOperationAction(ISD::STORE, MVT::i64, Custom);
   setOperationAction(ISD::STORE, MVT::v2i32, Custom);
   setOperationAction(ISD::STORE, MVT::v4i32, Custom);
 
@@ -136,6 +138,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
+  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
 
   setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
   setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Custom);
@@ -164,6 +167,9 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
 
   setOperationAction(ISD::LOAD, MVT::i1, Custom);
 
+  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Expand);
+  setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
+
   setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
   setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
   setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
@@ -178,6 +184,9 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
     MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32
   };
 
+  setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
+  setOperationAction(ISD::SELECT, MVT::i1, Promote);
+
   for (MVT VT : VecTypes) {
     for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
       switch(Op) {
@@ -212,9 +221,12 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
     setOperationAction(ISD::FRINT, MVT::f64, Legal);
   }
 
-  // FIXME: This should be removed and handled the same was as f32 fneg. Source
+  // FIXME: These should be removed and handled the same way as f32 fneg. Source
   // modifiers also work for the double instructions.
   setOperationAction(ISD::FNEG, MVT::f64, Expand);
+  setOperationAction(ISD::FABS, MVT::f64, Expand);
+
+  setOperationAction(ISD::FDIV, MVT::f32, Custom);
 
   setTargetDAGCombine(ISD::SELECT_CC);
   setTargetDAGCombine(ISD::SETCC);
@@ -246,15 +258,6 @@ bool SITargetLowering::allowsUnalignedMemoryAccesses(EVT VT,
   // see what for specifically.
The wording everywhere else seems to be the // same. - // 3.6.4 - Operations using pairs of VGPRs (for example: double-floats) have - // no alignment restrictions. - if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) { - // Using any pair of GPRs should be the same as any other pair. - if (IsFast) - *IsFast = true; - return VT.bitsGE(MVT::i64); - } - // XXX - The only mention I see of this in the ISA manual is for LDS direct // reads the "byte address and must be dword aligned". Is it also true for the // normal loads and stores? @@ -263,13 +266,18 @@ bool SITargetLowering::allowsUnalignedMemoryAccesses(EVT VT, // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the // byte-address are ignored, thus forcing Dword alignment. + // This applies to private, global, and constant memory. if (IsFast) *IsFast = true; return VT.bitsGT(MVT::i32); } -bool SITargetLowering::shouldSplitVectorType(EVT VT) const { - return VT.getScalarType().bitsLE(MVT::i16); +TargetLoweringBase::LegalizeTypeAction +SITargetLowering::getPreferredVectorAction(EVT VT) const { + if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16)) + return TypeSplitVector; + + return TargetLoweringBase::getPreferredVectorAction(VT); } bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, @@ -318,7 +326,7 @@ SDValue SITargetLowering::LowerFormalArguments( const ISD::InputArg &Arg = Ins[i]; // First check if it's a PS input addr - if (Info->ShaderType == ShaderType::PIXEL && !Arg.Flags.isInReg() && + if (Info->getShaderType() == ShaderType::PIXEL && !Arg.Flags.isInReg() && !Arg.Flags.isByVal()) { assert((PSInputNum <= 15) && "Too many PS inputs!"); @@ -334,7 +342,7 @@ SDValue SITargetLowering::LowerFormalArguments( } // Second split vertices into their elements - if (Info->ShaderType != ShaderType::COMPUTE && Arg.VT.isVector()) { + if (Info->getShaderType() != ShaderType::COMPUTE && Arg.VT.isVector()) { ISD::InputArg NewArg = Arg; NewArg.Flags.setSplit(); NewArg.VT = Arg.VT.getVectorElementType(); @@ -350,7 +358,7 @@ SDValue SITargetLowering::LowerFormalArguments( NewArg.PartOffset += NewArg.VT.getStoreSize(); } - } else if (Info->ShaderType != ShaderType::COMPUTE) { + } else if (Info->getShaderType() != ShaderType::COMPUTE) { Splits.push_back(Arg); } } @@ -360,20 +368,26 @@ SDValue SITargetLowering::LowerFormalArguments( getTargetMachine(), ArgLocs, *DAG.getContext()); // At least one interpolation mode must be enabled or else the GPU will hang. 
- if (Info->ShaderType == ShaderType::PIXEL && (Info->PSInputAddr & 0x7F) == 0) { + if (Info->getShaderType() == ShaderType::PIXEL && + (Info->PSInputAddr & 0x7F) == 0) { Info->PSInputAddr |= 1; CCInfo.AllocateReg(AMDGPU::VGPR0); CCInfo.AllocateReg(AMDGPU::VGPR1); } // The pointer to the list of arguments is stored in SGPR0, SGPR1 - if (Info->ShaderType == ShaderType::COMPUTE) { + // The pointer to the scratch buffer is stored in SGPR2, SGPR3 + if (Info->getShaderType() == ShaderType::COMPUTE) { + Info->NumUserSGPRs = 4; CCInfo.AllocateReg(AMDGPU::SGPR0); CCInfo.AllocateReg(AMDGPU::SGPR1); + CCInfo.AllocateReg(AMDGPU::SGPR2); + CCInfo.AllocateReg(AMDGPU::SGPR3); MF.addLiveIn(AMDGPU::SGPR0_SGPR1, &AMDGPU::SReg_64RegClass); + MF.addLiveIn(AMDGPU::SGPR2_SGPR3, &AMDGPU::SReg_64RegClass); } - if (Info->ShaderType == ShaderType::COMPUTE) { + if (Info->getShaderType() == ShaderType::COMPUTE) { getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins, Splits); } @@ -485,6 +499,36 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( MI->eraseFromParent(); break; } + case AMDGPU::SI_BUFFER_RSRC: { + unsigned SuperReg = MI->getOperand(0).getReg(); + unsigned Args[4]; + for (unsigned i = 0, e = 4; i < e; ++i) { + MachineOperand &Arg = MI->getOperand(i + 1); + + if (Arg.isReg()) { + Args[i] = Arg.getReg(); + continue; + } + + assert(Arg.isImm()); + unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), Reg) + .addImm(Arg.getImm()); + Args[i] = Reg; + } + BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::REG_SEQUENCE), + SuperReg) + .addReg(Args[0]) + .addImm(AMDGPU::sub0) + .addReg(Args[1]) + .addImm(AMDGPU::sub1) + .addReg(Args[2]) + .addImm(AMDGPU::sub2) + .addReg(Args[3]) + .addImm(AMDGPU::sub3); + MI->eraseFromParent(); + break; + } case AMDGPU::V_SUB_F64: { unsigned DestReg = MI->getOperand(0).getReg(); BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_ADD_F64), DestReg) @@ -596,25 +640,21 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { SIMachineFunctionInfo *MFI = MF.getInfo(); switch (Op.getOpcode()) { default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); + case ISD::FrameIndex: return LowerFrameIndex(Op, DAG); case ISD::BRCOND: return LowerBRCOND(Op, DAG); case ISD::LOAD: { - LoadSDNode *Load = dyn_cast(Op); - if (Op.getValueType().isVector() && - (Load->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS || - Load->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS || - (Load->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && - Op.getValueType().getVectorNumElements() > 4))) { - SDValue MergedValues[2] = { - SplitVectorLoad(Op, DAG), - Load->getChain() - }; - return DAG.getMergeValues(MergedValues, SDLoc(Op)); - } else { - return LowerLOAD(Op, DAG); - } + SDValue Result = LowerLOAD(Op, DAG); + assert((!Result.getNode() || + Result.getNode()->getNumValues() == 2) && + "Load should return a value and a chain"); + return Result; } + case ISD::FSIN: + case ISD::FCOS: + return LowerTrig(Op, DAG); case ISD::SELECT: return LowerSELECT(Op, DAG); + case ISD::FDIV: return LowerFDIV(Op, DAG); case ISD::STORE: return LowerSTORE(Op, DAG); case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG); case ISD::INTRINSIC_WO_CHAIN: { @@ -622,8 +662,6 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { cast(Op.getOperand(0))->getZExtValue(); EVT VT = Op.getValueType(); SDLoc DL(Op); - //XXX: Hardcoded we only use two to store 
the pointer to the parameters.
-    unsigned NumUserSGPRs = 2;
   switch (IntrinsicID) {
   default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
   case Intrinsic::r600_read_ngroups_x:
@@ -646,13 +684,13 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
     return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 32, false);
   case Intrinsic::r600_read_tgid_x:
     return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
-      AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 0), VT);
+      AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 0), VT);
   case Intrinsic::r600_read_tgid_y:
     return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
-      AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 1), VT);
+      AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 1), VT);
   case Intrinsic::r600_read_tgid_z:
     return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
-      AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 2), VT);
+      AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 2), VT);
   case Intrinsic::r600_read_tidig_x:
     return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass,
                                 AMDGPU::VGPR0, VT);
@@ -746,6 +784,21 @@ static SDNode *findUser(SDValue Value, unsigned Opcode) {
   return nullptr;
 }
 
+SDValue SITargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  const SIInstrInfo *TII =
+      static_cast<const SIInstrInfo *>(getTargetMachine().getInstrInfo());
+  const SIRegisterInfo &TRI = TII->getRegisterInfo();
+  FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Op);
+  unsigned FrameIndex = FINode->getIndex();
+
+  CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
+    TRI.getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET), MVT::i32);
+
+  return DAG.getTargetFrameIndex(FrameIndex, MVT::i32);
+}
+
 /// This transforms the control flow intrinsics to get the branch destination as
 /// last parameter, also switches branch target with BR if the need arises
 SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
@@ -824,47 +877,57 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
   return Chain;
 }
 
-SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
-  SDLoc DL(Op);
-  LoadSDNode *Load = cast<LoadSDNode>(Op);
-  SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG);
-  SDValue MergedValues[2];
-  MergedValues[1] = Load->getChain();
-  if (Ret.getNode()) {
-    MergedValues[0] = Ret;
-    return DAG.getMergeValues(MergedValues, DL);
-  }
+SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
+                                             SDValue Op,
+                                             SelectionDAG &DAG) const {
+  GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
 
-  if (Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
-    return SDValue();
-  }
+  if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
+    return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
 
-  EVT MemVT = Load->getMemoryVT();
+  SDLoc DL(GSD);
+  const GlobalValue *GV = GSD->getGlobal();
+  MVT PtrVT = getPointerTy(GSD->getAddressSpace());
 
-  assert(!MemVT.isVector() && "Private loads should be scalarized");
-  assert(!MemVT.isFloatingPoint() && "FP loads should be promoted to int");
+  SDValue Ptr = DAG.getNode(AMDGPUISD::CONST_DATA_PTR, DL, PtrVT);
+  SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32);
 
-  SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(),
-                            DAG.getConstant(2, MVT::i32));
-  Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
-                    Load->getChain(), Ptr,
-                    DAG.getTargetConstant(0, MVT::i32),
-                    Op.getOperand(2));
-  if (MemVT.getSizeInBits() == 64) {
-    SDValue IncPtr = DAG.getNode(ISD::ADD, DL, MVT::i32,
Ptr, - DAG.getConstant(1, MVT::i32)); + SDValue PtrLo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Ptr, + DAG.getConstant(0, MVT::i32)); + SDValue PtrHi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Ptr, + DAG.getConstant(1, MVT::i32)); + + SDValue Lo = DAG.getNode(ISD::ADDC, DL, DAG.getVTList(MVT::i32, MVT::Glue), + PtrLo, GA); + SDValue Hi = DAG.getNode(ISD::ADDE, DL, DAG.getVTList(MVT::i32, MVT::Glue), + PtrHi, DAG.getConstant(0, MVT::i32), + SDValue(Lo.getNode(), 1)); + return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Lo, Hi); +} - SDValue LoadUpper = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32, - Load->getChain(), IncPtr, - DAG.getTargetConstant(0, MVT::i32), - Op.getOperand(2)); +SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + LoadSDNode *Load = cast(Op); - Ret = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ret, LoadUpper); + if (Op.getValueType().isVector()) { + assert(Op.getValueType().getVectorElementType() == MVT::i32 && + "Custom lowering for non-i32 vectors hasn't been implemented."); + unsigned NumElements = Op.getValueType().getVectorNumElements(); + assert(NumElements != 2 && "v2 loads are supported for all address spaces."); + switch (Load->getAddressSpace()) { + default: break; + case AMDGPUAS::GLOBAL_ADDRESS: + case AMDGPUAS::PRIVATE_ADDRESS: + // v4 loads are supported for private and global memory. + if (NumElements <= 4) + break; + // fall-through + case AMDGPUAS::LOCAL_ADDRESS: + return SplitVectorLoad(Op, DAG); + } } - MergedValues[0] = Ret; - return DAG.getMergeValues(MergedValues, DL); - + return AMDGPUTargetLowering::LowerLOAD(Op, DAG); } SDValue SITargetLowering::LowerSampleIntrinsic(unsigned Opcode, @@ -903,11 +966,117 @@ SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Res); } +// Catch division cases where we can use shortcuts with rcp and rsq +// instructions. +SDValue SITargetLowering::LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const { + SDLoc SL(Op); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + EVT VT = Op.getValueType(); + bool Unsafe = DAG.getTarget().Options.UnsafeFPMath; + + if (const ConstantFPSDNode *CLHS = dyn_cast(LHS)) { + if ((Unsafe || (VT == MVT::f32 && !Subtarget->hasFP32Denormals())) && + CLHS->isExactlyValue(1.0)) { + // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to + // the CI documentation has a worst case error of 1 ulp. + // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to + // use it as long as we aren't trying to use denormals. + + // 1.0 / sqrt(x) -> rsq(x) + // + // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP + // error seems really high at 2^29 ULP. + if (RHS.getOpcode() == ISD::FSQRT) + return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0)); + + // 1.0 / x -> rcp(x) + return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS); + } + } + + if (Unsafe) { + // Turn into multiply by the reciprocal. + // x / y -> x * (1.0 / y) + SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS); + return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip); + } + + return SDValue(); +} + +SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { + SDValue FastLowered = LowerFastFDIV(Op, DAG); + if (FastLowered.getNode()) + return FastLowered; + + // This uses v_rcp_f32 which does not handle denormals. Let this hit a + // selection error for now rather than do something incorrect. 
+ if (Subtarget->hasFP32Denormals()) + return SDValue(); + + SDLoc SL(Op); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + + SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS); + + const APFloat K0Val(BitsToFloat(0x6f800000)); + const SDValue K0 = DAG.getConstantFP(K0Val, MVT::f32); + + const APFloat K1Val(BitsToFloat(0x2f800000)); + const SDValue K1 = DAG.getConstantFP(K1Val, MVT::f32); + + const SDValue One = DAG.getTargetConstantFP(1.0, MVT::f32); + + EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f32); + + SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT); + + SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One); + + r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3); + + SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1); + + SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0); + + return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul); +} + +SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const { + return SDValue(); +} + +SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + + if (VT == MVT::f32) + return LowerFDIV32(Op, DAG); + + if (VT == MVT::f64) + return LowerFDIV64(Op, DAG); + + llvm_unreachable("Unexpected type for fdiv"); +} + SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); StoreSDNode *Store = cast(Op); EVT VT = Store->getMemoryVT(); + // These stores are legal. + if (Store->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && + VT.isVector() && VT.getVectorNumElements() == 2 && + VT.getVectorElementType() == MVT::i32) + return SDValue(); + + if (Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) { + if (VT.isVector() && VT.getVectorNumElements() > 4) + return SplitVectorStore(Op, DAG); + return SDValue(); + } + SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG); if (Ret.getNode()) return Ret; @@ -920,67 +1089,24 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32), Store->getBasePtr(), MVT::i1, Store->getMemOperand()); - if (Store->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) - return SDValue(); + return SDValue(); +} - SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Store->getBasePtr(), - DAG.getConstant(2, MVT::i32)); - SDValue Chain = Store->getChain(); - SmallVector Values; - - if (Store->isTruncatingStore()) { - unsigned Mask = 0; - if (Store->getMemoryVT() == MVT::i8) { - Mask = 0xff; - } else if (Store->getMemoryVT() == MVT::i16) { - Mask = 0xffff; - } - SDValue Dst = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32, - Chain, Store->getBasePtr(), - DAG.getConstant(0, MVT::i32)); - SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, Store->getBasePtr(), - DAG.getConstant(0x3, MVT::i32)); - SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx, - DAG.getConstant(3, MVT::i32)); - SDValue MaskedValue = DAG.getNode(ISD::AND, DL, MVT::i32, Store->getValue(), - DAG.getConstant(Mask, MVT::i32)); - SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32, - MaskedValue, ShiftAmt); - SDValue RotrAmt = DAG.getNode(ISD::SUB, DL, MVT::i32, - DAG.getConstant(32, MVT::i32), ShiftAmt); - SDValue DstMask = DAG.getNode(ISD::ROTR, DL, MVT::i32, - DAG.getConstant(Mask, MVT::i32), - RotrAmt); - Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask); - Dst = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue); - - Values.push_back(Dst); - } else if (VT == MVT::i64) { - for (unsigned i = 0; i < 2; ++i) 
{
-      Values.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32,
-                       Store->getValue(), DAG.getConstant(i, MVT::i32)));
-    }
-  } else if (VT == MVT::i128) {
-    for (unsigned i = 0; i < 2; ++i) {
-      for (unsigned j = 0; j < 2; ++j) {
-        Values.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32,
-            DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64,
-                Store->getValue(), DAG.getConstant(i, MVT::i32)),
-            DAG.getConstant(j, MVT::i32)));
-      }
-    }
-  } else {
-    Values.push_back(Store->getValue());
-  }
+SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+  SDValue Arg = Op.getOperand(0);
+  SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, SDLoc(Op), VT,
+        DAG.getNode(ISD::FMUL, SDLoc(Op), VT, Arg,
+          DAG.getConstantFP(0.5 / M_PI, VT)));
 
-  for (unsigned i = 0; i < Values.size(); ++i) {
-    SDValue PartPtr = DAG.getNode(ISD::ADD, DL, MVT::i32,
-                                  Ptr, DAG.getConstant(i, MVT::i32));
-    Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
-                        Chain, Values[i], PartPtr,
-                        DAG.getTargetConstant(0, MVT::i32));
+  switch (Op.getOpcode()) {
+  case ISD::FCOS:
+    return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, FractPart);
+  case ISD::FSIN:
+    return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, FractPart);
+  default:
+    llvm_unreachable("Wrong trig opcode");
   }
-  return Chain;
 }
 
 //===----------------------------------------------------------------------===//
@@ -1085,20 +1211,6 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
 
   switch (N->getOpcode()) {
     default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
-    case ISD::SELECT_CC: {
-      ConstantSDNode *True, *False;
-      // i1 selectcc(l, r, -1, 0, cc) -> i1 setcc(l, r, cc)
-      if ((True = dyn_cast<ConstantSDNode>(N->getOperand(2)))
-          && (False = dyn_cast<ConstantSDNode>(N->getOperand(3)))
-          && True->isAllOnesValue()
-          && False->isNullValue()
-          && VT == MVT::i1) {
-        return DAG.getNode(ISD::SETCC, DL, VT, N->getOperand(0),
-                           N->getOperand(1), N->getOperand(4));
-
-      }
-      break;
-    }
     case ISD::SETCC: {
       SDValue Arg0 = N->getOperand(0);
       SDValue Arg1 = N->getOperand(1);
@@ -1319,9 +1431,19 @@ void SITargetLowering::ensureSRegLimit(SelectionDAG &DAG, SDValue &Operand,
   // This is a conservative approach. It is possible that we can't determine the
   // correct register class and copy too often, but better safe than sorry.
-  SDValue RC = DAG.getTargetConstant(RegClass, MVT::i32);
-  SDNode *Node = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS, SDLoc(),
-                                    Operand.getValueType(), Operand, RC);
+
+  SDNode *Node;
+  // We can't use COPY_TO_REGCLASS with FrameIndex arguments.
+  if (isa<FrameIndexSDNode>(Operand)) {
+    unsigned Opcode = Operand.getValueType() == MVT::i32 ?
+                                    AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+    Node = DAG.getMachineNode(Opcode, SDLoc(), Operand.getValueType(),
+                              Operand);
+  } else {
+    SDValue RC = DAG.getTargetConstant(RegClass, MVT::i32);
+    Node = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS, SDLoc(),
+                              Operand.getValueType(), Operand, RC);
+  }
   Operand = SDValue(Node, 0);
 }
 
@@ -1415,6 +1537,14 @@ SDNode *SITargetLowering::foldOperands(MachineSDNode *Node,
         ensureSRegLimit(DAG, Ops[i], RegClass, ScalarSlotUsed);
       }
       continue;
+    } else {
+      // If it's not a VSrc or SSrc operand, check if we have a GlobalAddress.
+      // These will be lowered to immediates, so we will need to insert a MOV.
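Two lowerings in this hunk are easier to follow with scalar models. First, LowerFDIV32 (earlier in this file's diff): K0 is the bit pattern of 2^96 and K1 of 2^-32, so when |rhs| exceeds 2^96 the divisor is pre-scaled by 2^-32 before the reciprocal, whose result would otherwise flush to zero as a denormal, and the same factor is folded back into the quotient. A runnable C++ sketch, with an exact reciprocal standing in for the approximate v_rcp_f32:

#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstring>

// Stand-in for v_rcp_f32; the real instruction is an approximation that
// flushes denormal results, which is exactly what the scaling guards against.
static float rcp(float x) { return 1.0f / x; }

static float bits_to_float(uint32_t bits) {
  float f;
  memcpy(&f, &bits, sizeof(f));
  return f;
}

// Mirrors LowerFDIV32: pre-scale a huge divisor, then fold the scale back in.
static float fdiv32(float lhs, float rhs) {
  const float K0 = bits_to_float(0x6f800000u); // 2^96
  const float K1 = bits_to_float(0x2f800000u); // 2^-32
  float scale = (fabsf(rhs) > K0) ? K1 : 1.0f;
  // scale * (lhs * rcp(rhs * scale)) == lhs / rhs, but rhs * scale keeps the
  // intermediate reciprocal comfortably inside the normal float range.
  return scale * (lhs * rcp(rhs * scale));
}

int main() {
  printf("%g\n", fdiv32(1.0f, 3.0f));  // ~0.333333
  printf("%g\n", fdiv32(1.0f, 1e38f)); // ~1e-38; the scaling path fires here
  return 0;
}

Second, LowerTrig just above: it multiplies radians by 0.5/pi (the reason this patch pulls in M_PI, with the _USE_MATH_DEFINES guard for MSVC), takes the fractional part, and hands that to the hardware SIN/COS. Assuming SIN_HW reads its operand as a fraction of a full period, which is what the 0.5/M_PI factor implies, the equivalent scalar computation is:

#define _USE_MATH_DEFINES // for M_PI on MSVC, mirroring the guard in this patch
#include <cmath>
#include <cstdio>

// Model of AMDGPUISD::SIN_HW under the stated assumption about its domain.
static float sin_hw_model(float revs) {
  return sinf(revs * 2.0f * (float)M_PI);
}

// Mirrors SITargetLowering::LowerTrig: radians -> revolutions -> FRACT -> SIN_HW.
static float lowered_sin(float x) {
  float revs = x * (float)(0.5 / M_PI); // the ISD::FMUL by 0.5/M_PI
  float fract = revs - floorf(revs);    // AMDGPUISD::FRACT
  return sin_hw_model(fract);           // AMDGPUISD::SIN_HW
}

int main() {
  for (float x : {0.5f, 3.0f, 100.0f})
    printf("x=%g lowered=%g libm=%g\n", x, lowered_sin(x), sinf(x));
  return 0;
}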
+      if (isa<GlobalAddressSDNode>(Ops[i])) {
+        SDNode *Node = DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, SDLoc(),
+                                    Operand.getValueType(), Operand);
+        Ops[i] = SDValue(Node, 0);
+      }
     }
 
     if (i == 1 && DescRev && fitsRegClass(DAG, Ops[0], RegClass)) {
diff --git a/lib/Target/R600/SIISelLowering.h b/lib/Target/R600/SIISelLowering.h
index 2f97a9ada8f0..d106d4abb187 100644
--- a/lib/Target/R600/SIISelLowering.h
+++ b/lib/Target/R600/SIISelLowering.h
@@ -25,9 +25,17 @@ class SITargetLowering : public AMDGPUTargetLowering {
                          SDValue Chain, unsigned Offset, bool Signed) const;
   SDValue LowerSampleIntrinsic(unsigned Opcode, const SDValue &Op,
                                SelectionDAG &DAG) const;
+  SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op,
+                             SelectionDAG &DAG) const override;
+  SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFDIV32(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFDIV64(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFDIV(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
 
   bool foldImm(SDValue &Operand, int32_t &Immediate,
@@ -50,7 +58,9 @@ class SITargetLowering : public AMDGPUTargetLowering {
   SITargetLowering(TargetMachine &tm);
   bool allowsUnalignedMemoryAccesses(EVT VT, unsigned AS,
                                      bool *IsFast) const override;
-  bool shouldSplitVectorType(EVT VT) const override;
+
+  TargetLoweringBase::LegalizeTypeAction
+  getPreferredVectorAction(EVT VT) const override;
 
   bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                          Type *Ty) const override;
diff --git a/lib/Target/R600/SIInsertWaits.cpp b/lib/Target/R600/SIInsertWaits.cpp
index a17fed7e7ea7..7dfc31bdfa01 100644
--- a/lib/Target/R600/SIInsertWaits.cpp
+++ b/lib/Target/R600/SIInsertWaits.cpp
@@ -273,17 +273,17 @@ bool SIInsertWaits::insertWait(MachineBasicBlock &MBB,
       continue;
 
     NeedWait = true;
-
+
     if (Ordered[i]) {
       unsigned Value = LastIssued.Array[i] - Required.Array[i];
 
-      // adjust the value to the real hardware posibilities
+      // Adjust the value to the real hardware possibilities.
       Counts.Array[i] = std::min(Value, WaitCounts.Array[i]);
 
     } else
       Counts.Array[i] = 0;
 
-    // Remember on what we have waited on
+    // Remember what we have waited on.
    WaitedOn.Array[i] = LastIssued.Array[i] - Counts.Array[i];
   }
 
@@ -341,6 +341,8 @@ Counters SIInsertWaits::handleOperands(MachineInstr &MI) {
   return Result;
 }
 
+// FIXME: Insert waits listed in Table 4.2 "Required User-Inserted Wait States"
+// around other non-memory instructions.
bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) { bool Changes = false; diff --git a/lib/Target/R600/SIInstrFormats.td b/lib/Target/R600/SIInstrFormats.td index 7cae9fc0d0eb..00e69ddbeea4 100644 --- a/lib/Target/R600/SIInstrFormats.td +++ b/lib/Target/R600/SIInstrFormats.td @@ -37,36 +37,35 @@ class InstSI pattern> : let TSFlags{9} = SALU; } -class Enc32 pattern> : - InstSI { +class Enc32 { field bits<32> Inst; - let Size = 4; + int Size = 4; } -class Enc64 pattern> : - InstSI { +class Enc64 { field bits<64> Inst; - let Size = 8; + int Size = 8; } class VOP3Common pattern> : - Enc64 { + InstSI { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; let UseNamedOperandTable = 1; let VOP3 = 1; + + int Size = 8; } //===----------------------------------------------------------------------===// // Scalar operations //===----------------------------------------------------------------------===// -class SOP1 op, dag outs, dag ins, string asm, list pattern> : - Enc32 { +class SOP1e op> : Enc32 { bits<7> SDST; bits<8> SSRC0; @@ -75,16 +74,10 @@ class SOP1 op, dag outs, dag ins, string asm, list pattern> : let Inst{15-8} = op; let Inst{22-16} = SDST; let Inst{31-23} = 0x17d; //encoding; - - let mayLoad = 0; - let mayStore = 0; - let hasSideEffects = 0; - let SALU = 1; } -class SOP2 op, dag outs, dag ins, string asm, list pattern> : - Enc32 { - +class SOP2e op> : Enc32 { + bits<7> SDST; bits<8> SSRC0; bits<8> SSRC1; @@ -94,15 +87,9 @@ class SOP2 op, dag outs, dag ins, string asm, list pattern> : let Inst{22-16} = SDST; let Inst{29-23} = op; let Inst{31-30} = 0x2; // encoding - - let mayLoad = 0; - let mayStore = 0; - let hasSideEffects = 0; - let SALU = 1; } -class SOPC op, dag outs, dag ins, string asm, list pattern> : - Enc32 { +class SOPCe op> : Enc32 { bits<8> SSRC0; bits<8> SSRC1; @@ -111,62 +98,90 @@ class SOPC op, dag outs, dag ins, string asm, list pattern> : let Inst{15-8} = SSRC1; let Inst{22-16} = op; let Inst{31-23} = 0x17e; - - let DisableEncoding = "$dst"; - let mayLoad = 0; - let mayStore = 0; - let hasSideEffects = 0; - let SALU = 1; } -class SOPK op, dag outs, dag ins, string asm, list pattern> : - Enc32 { +class SOPKe op> : Enc32 { bits <7> SDST; bits <16> SIMM16; - + let Inst{15-0} = SIMM16; let Inst{22-16} = SDST; let Inst{27-23} = op; let Inst{31-28} = 0xb; //encoding - - let mayLoad = 0; - let mayStore = 0; - let hasSideEffects = 0; - let SALU = 1; } -class SOPP op, dag ins, string asm, list pattern> : Enc32 < - (outs), - ins, - asm, - pattern > { +class SOPPe op> : Enc32 { - bits <16> SIMM16; + bits <16> simm16; - let Inst{15-0} = SIMM16; + let Inst{15-0} = simm16; let Inst{22-16} = op; let Inst{31-23} = 0x17f; // encoding - - let mayLoad = 0; - let mayStore = 0; - let hasSideEffects = 0; - let SALU = 1; } -class SMRD op, bits<1> imm, dag outs, dag ins, string asm, - list pattern> : Enc32 { +class SMRDe op, bits<1> imm> : Enc32 { bits<7> SDST; bits<7> SBASE; bits<8> OFFSET; - + let Inst{7-0} = OFFSET; let Inst{8} = imm; let Inst{14-9} = SBASE{6-1}; let Inst{21-15} = SDST; let Inst{26-22} = op; let Inst{31-27} = 0x18; //encoding +} + +class SOP1 op, dag outs, dag ins, string asm, list pattern> : + InstSI, SOP1e { + + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + let SALU = 1; +} + +class SOP2 op, dag outs, dag ins, string asm, list pattern> : + InstSI , SOP2e { + + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + let SALU = 1; +} + +class SOPC op, dag outs, dag ins, string asm, list pattern> : + InstSI, SOPCe { + + let 
DisableEncoding = "$dst"; + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + let SALU = 1; +} + +class SOPK op, dag outs, dag ins, string asm, list pattern> : + InstSI , SOPKe { + + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + let SALU = 1; +} + +class SOPP op, dag ins, string asm, list pattern> : + InstSI <(outs), ins, asm, pattern >, SOPPe { + + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + let SALU = 1; +} + +class SMRD op, bits<1> imm, dag outs, dag ins, string asm, + list pattern> : InstSI, SMRDe { let LGKM_CNT = 1; let SMRD = 1; @@ -175,49 +190,32 @@ class SMRD op, bits<1> imm, dag outs, dag ins, string asm, //===----------------------------------------------------------------------===// // Vector ALU operations //===----------------------------------------------------------------------===// - -let Uses = [EXEC] in { -class VOP1 op, dag outs, dag ins, string asm, list pattern> : - Enc32 { +class VOP1e op> : Enc32 { bits<8> VDST; bits<9> SRC0; - + let Inst{8-0} = SRC0; let Inst{16-9} = op; let Inst{24-17} = VDST; let Inst{31-25} = 0x3f; //encoding - - let mayLoad = 0; - let mayStore = 0; - let hasSideEffects = 0; - let UseNamedOperandTable = 1; - let VOP1 = 1; } -class VOP2 op, dag outs, dag ins, string asm, list pattern> : - Enc32 { +class VOP2e op> : Enc32 { bits<8> VDST; bits<9> SRC0; bits<8> VSRC1; - + let Inst{8-0} = SRC0; let Inst{16-9} = VSRC1; let Inst{24-17} = VDST; let Inst{30-25} = op; let Inst{31} = 0x0; //encoding - - let mayLoad = 0; - let mayStore = 0; - let hasSideEffects = 0; - let UseNamedOperandTable = 1; - let VOP2 = 1; } -class VOP3 op, dag outs, dag ins, string asm, list pattern> : - VOP3Common { +class VOP3e op> : Enc64 { bits<8> dst; bits<2> src0_modifiers; @@ -243,11 +241,9 @@ class VOP3 op, dag outs, dag ins, string asm, list pattern> : let Inst{61} = src0_modifiers{0}; let Inst{62} = src1_modifiers{0}; let Inst{63} = src2_modifiers{0}; - } -class VOP3b op, dag outs, dag ins, string asm, list pattern> : - VOP3Common { +class VOP3be op> : Enc64 { bits<8> dst; bits<2> src0_modifiers; @@ -270,11 +266,9 @@ class VOP3b op, dag outs, dag ins, string asm, list pattern> : let Inst{61} = src0_modifiers{0}; let Inst{62} = src1_modifiers{0}; let Inst{63} = src2_modifiers{0}; - } -class VOPC op, dag ins, string asm, list pattern> : - Enc32 <(outs VCCReg:$dst), ins, asm, pattern> { +class VOPCe op> : Enc32 { bits<9> SRC0; bits<8> VSRC1; @@ -283,16 +277,9 @@ class VOPC op, dag ins, string asm, list pattern> : let Inst{16-9} = VSRC1; let Inst{24-17} = op; let Inst{31-25} = 0x3e; - - let DisableEncoding = "$dst"; - let mayLoad = 0; - let mayStore = 0; - let hasSideEffects = 0; - let VOPC = 1; } -class VINTRP op, dag outs, dag ins, string asm, list pattern> : - Enc32 { +class VINTRPe op> : Enc32 { bits<8> VDST; bits<8> VSRC; @@ -305,22 +292,9 @@ class VINTRP op, dag outs, dag ins, string asm, list pattern> : let Inst{17-16} = op; let Inst{25-18} = VDST; let Inst{31-26} = 0x32; // encoding - - let neverHasSideEffects = 1; - let mayLoad = 1; - let mayStore = 0; } -} // End Uses = [EXEC] - -//===----------------------------------------------------------------------===// -// Vector I/O operations -//===----------------------------------------------------------------------===// - -let Uses = [EXEC] in { - -class DS op, dag outs, dag ins, string asm, list pattern> : - Enc64 { +class DSe op> : Enc64 { bits<8> vdst; bits<1> gds; @@ -339,12 +313,9 @@ class DS op, dag outs, dag ins, string asm, list pattern> : let Inst{47-40} = 
data0; let Inst{55-48} = data1; let Inst{63-56} = vdst; - - let LGKM_CNT = 1; } -class MUBUF op, dag outs, dag ins, string asm, list pattern> : - Enc64 { +class MUBUFe op> : Enc64 { bits<12> offset; bits<1> offen; @@ -373,16 +344,9 @@ class MUBUF op, dag outs, dag ins, string asm, list pattern> : let Inst{54} = slc; let Inst{55} = tfe; let Inst{63-56} = soffset; - - let VM_CNT = 1; - let EXP_CNT = 1; - - let neverHasSideEffects = 1; - let UseNamedOperandTable = 1; } -class MTBUF op, dag outs, dag ins, string asm, list pattern> : - Enc64 { +class MTBUFe op> : Enc64 { bits<8> VDATA; bits<12> OFFSET; @@ -413,15 +377,9 @@ class MTBUF op, dag outs, dag ins, string asm, list pattern> : let Inst{54} = SLC; let Inst{55} = TFE; let Inst{63-56} = SOFFSET; - - let VM_CNT = 1; - let EXP_CNT = 1; - - let neverHasSideEffects = 1; } -class MIMG op, dag outs, dag ins, string asm, list pattern> : - Enc64 { +class MIMGe op> : Enc64 { bits<8> VDATA; bits<4> DMASK; @@ -434,7 +392,7 @@ class MIMG op, dag outs, dag ins, string asm, list pattern> : bits<1> SLC; bits<8> VADDR; bits<7> SRSRC; - bits<7> SSAMP; + bits<7> SSAMP; let Inst{11-8} = DMASK; let Inst{12} = UNORM; @@ -450,18 +408,9 @@ class MIMG op, dag outs, dag ins, string asm, list pattern> : let Inst{47-40} = VDATA; let Inst{52-48} = SRSRC{6-2}; let Inst{57-53} = SSAMP{6-2}; - - let VM_CNT = 1; - let EXP_CNT = 1; - let MIMG = 1; } -def EXP : Enc64< - (outs), - (ins i32imm:$en, i32imm:$tgt, i32imm:$compr, i32imm:$done, i32imm:$vm, - VReg_32:$src0, VReg_32:$src1, VReg_32:$src2, VReg_32:$src3), - "EXP $en, $tgt, $compr, $done, $vm, $src0, $src1, $src2, $src3", - [] > { +class EXPe : Enc64 { bits<4> EN; bits<6> TGT; @@ -483,6 +432,102 @@ def EXP : Enc64< let Inst{47-40} = VSRC1; let Inst{55-48} = VSRC2; let Inst{63-56} = VSRC3; +} + +let Uses = [EXEC] in { + +class VOP1 op, dag outs, dag ins, string asm, list pattern> : + InstSI , VOP1e { + + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + let UseNamedOperandTable = 1; + let VOP1 = 1; +} + +class VOP2 op, dag outs, dag ins, string asm, list pattern> : + InstSI , VOP2e { + + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + let UseNamedOperandTable = 1; + let VOP2 = 1; +} + +class VOP3 op, dag outs, dag ins, string asm, list pattern> : + VOP3Common , VOP3e; + +class VOP3b op, dag outs, dag ins, string asm, list pattern> : + VOP3Common , VOP3be; + +class VOPC op, dag ins, string asm, list pattern> : + InstSI <(outs VCCReg:$dst), ins, asm, pattern>, VOPCe { + + let DisableEncoding = "$dst"; + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + let UseNamedOperandTable = 1; + let VOPC = 1; +} + +class VINTRP op, dag outs, dag ins, string asm, list pattern> : + InstSI , VINTRPe { + + let neverHasSideEffects = 1; + let mayLoad = 1; + let mayStore = 0; +} + +} // End Uses = [EXEC] + +//===----------------------------------------------------------------------===// +// Vector I/O operations +//===----------------------------------------------------------------------===// + +let Uses = [EXEC] in { + +class DS op, dag outs, dag ins, string asm, list pattern> : + InstSI , DSe { + + let LGKM_CNT = 1; +} + +class MUBUF op, dag outs, dag ins, string asm, list pattern> : + InstSI, MUBUFe { + + let VM_CNT = 1; + let EXP_CNT = 1; + + let neverHasSideEffects = 1; + let UseNamedOperandTable = 1; +} + +class MTBUF op, dag outs, dag ins, string asm, list pattern> : + InstSI, MTBUFe { + + let VM_CNT = 1; + let EXP_CNT = 1; + + let neverHasSideEffects = 1; +} + +class MIMG op, dag outs, 
dag ins, string asm, list pattern> : + InstSI , MIMGe { + + let VM_CNT = 1; + let EXP_CNT = 1; + let MIMG = 1; +} + +def EXP : InstSI< + (outs), + (ins i32imm:$en, i32imm:$tgt, i32imm:$compr, i32imm:$done, i32imm:$vm, + VReg_32:$src0, VReg_32:$src1, VReg_32:$src2, VReg_32:$src3), + "EXP $en, $tgt, $compr, $done, $vm, $src0, $src1, $src2, $src3", + [] >, EXPe { let EXP_CNT = 1; } diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp index f5b82d53ba7a..8c3af77e0235 100644 --- a/lib/Target/R600/SIInstrInfo.cpp +++ b/lib/Target/R600/SIInstrInfo.cpp @@ -361,6 +361,26 @@ bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { MI->eraseFromParent(); break; } + case AMDGPU::SI_CONSTDATA_PTR: { + unsigned Reg = MI->getOperand(0).getReg(); + unsigned RegLo = RI.getSubReg(Reg, AMDGPU::sub0); + unsigned RegHi = RI.getSubReg(Reg, AMDGPU::sub1); + + BuildMI(MBB, MI, DL, get(AMDGPU::S_GETPC_B64), Reg); + + // Add 32-bit offset from this instruction to the start of the constant data. + BuildMI(MBB, MI, DL, get(AMDGPU::S_ADD_I32), RegLo) + .addReg(RegLo) + .addTargetIndex(AMDGPU::TI_CONSTDATA_START) + .addReg(AMDGPU::SCC, RegState::Define | RegState::Implicit); + BuildMI(MBB, MI, DL, get(AMDGPU::S_ADDC_U32), RegHi) + .addReg(RegHi) + .addImm(0) + .addReg(AMDGPU::SCC, RegState::Define | RegState::Implicit) + .addReg(AMDGPU::SCC, RegState::Implicit); + MI->eraseFromParent(); + break; + } } return true; } @@ -524,6 +544,38 @@ bool SIInstrInfo::isLiteralConstant(const MachineOperand &MO) const { return (MO.isImm() || MO.isFPImm()) && !isInlineConstant(MO); } +static bool compareMachineOp(const MachineOperand &Op0, + const MachineOperand &Op1) { + if (Op0.getType() != Op1.getType()) + return false; + + switch (Op0.getType()) { + case MachineOperand::MO_Register: + return Op0.getReg() == Op1.getReg(); + case MachineOperand::MO_Immediate: + return Op0.getImm() == Op1.getImm(); + case MachineOperand::MO_FPImmediate: + return Op0.getFPImm() == Op1.getFPImm(); + default: + llvm_unreachable("Didn't expect to be comparing these operand types"); + } +} + +bool SIInstrInfo::isImmOperandLegal(const MachineInstr *MI, unsigned OpNo, + const MachineOperand &MO) const { + const MCOperandInfo &OpInfo = get(MI->getOpcode()).OpInfo[OpNo]; + + assert(MO.isImm() || MO.isFPImm()); + + if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE) + return true; + + if (OpInfo.RegClass < 0) + return false; + + return RI.regClassCanUseImmediate(OpInfo.RegClass); +} + bool SIInstrInfo::verifyInstruction(const MachineInstr *MI, StringRef &ErrInfo) const { uint16_t Opcode = MI->getOpcode(); @@ -542,10 +594,21 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI, // Make sure the register classes are correct for (unsigned i = 0, e = Desc.getNumOperands(); i != e; ++i) { switch (Desc.OpInfo[i].OperandType) { - case MCOI::OPERAND_REGISTER: + case MCOI::OPERAND_REGISTER: { + int RegClass = Desc.OpInfo[i].RegClass; + if (!RI.regClassCanUseImmediate(RegClass) && + (MI->getOperand(i).isImm() || MI->getOperand(i).isFPImm())) { + ErrInfo = "Expected register, but got immediate"; + return false; + } + } break; case MCOI::OPERAND_IMMEDIATE: - if (!MI->getOperand(i).isImm() && !MI->getOperand(i).isFPImm()) { + // Check if this operand is an immediate. + // FrameIndex operands will be replaced by immediates, so they are + // allowed. 
+        if (!MI->getOperand(i).isImm() && !MI->getOperand(i).isFPImm() &&
+            !MI->getOperand(i).isFI()) {
          ErrInfo = "Expected immediate, but got non-immediate";
          return false;
        }
@@ -630,6 +693,22 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
       return false;
     }
   }
+
+  // Verify misc. restrictions on specific instructions.
+  if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 ||
+      Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) {
+    const MachineOperand &Src0 = MI->getOperand(2);
+    const MachineOperand &Src1 = MI->getOperand(3);
+    const MachineOperand &Src2 = MI->getOperand(4);
+    if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
+      if (!compareMachineOp(Src0, Src1) &&
+          !compareMachineOp(Src0, Src2)) {
+        ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
+        return false;
+      }
+    }
+  }
+
   return true;
 }
@@ -1558,3 +1639,12 @@ void SIInstrInfo::reserveIndirectRegisters(BitVector &Reserved,
   for (int Index = std::max(0, Begin - 15); Index <= End; ++Index)
     Reserved.set(AMDGPU::VReg_512RegClass.getRegister(Index));
 }
+
+const MachineOperand *SIInstrInfo::getNamedOperand(const MachineInstr& MI,
+                                                   unsigned OperandName) const {
+  int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
+  if (Idx == -1)
+    return nullptr;
+
+  return &MI.getOperand(Idx);
+}
diff --git a/lib/Target/R600/SIInstrInfo.h b/lib/Target/R600/SIInstrInfo.h
index 4c204d877809..13ab4843fdad 100644
--- a/lib/Target/R600/SIInstrInfo.h
+++ b/lib/Target/R600/SIInstrInfo.h
@@ -89,10 +89,6 @@ class SIInstrInfo : public AMDGPUInstrInfo {
   bool isTriviallyReMaterializable(const MachineInstr *MI,
                                    AliasAnalysis *AA = nullptr) const;
-  unsigned getIEQOpcode() const override {
-    llvm_unreachable("Unimplemented");
-  }
-
   MachineInstr *buildMovInstr(MachineBasicBlock *MBB,
                               MachineBasicBlock::iterator I,
                               unsigned DstReg, unsigned SrcReg) const override;
@@ -110,6 +106,9 @@ class SIInstrInfo : public AMDGPUInstrInfo {
   bool isInlineConstant(const MachineOperand &MO) const;
   bool isLiteralConstant(const MachineOperand &MO) const;
+  bool isImmOperandLegal(const MachineInstr *MI, unsigned OpNo,
+                         const MachineOperand &MO) const;
+
   bool verifyInstruction(const MachineInstr *MI,
                          StringRef &ErrInfo) const override;
@@ -175,17 +174,23 @@ class SIInstrInfo : public AMDGPUInstrInfo {
                               unsigned SavReg, unsigned IndexReg) const;
   void insertNOPs(MachineBasicBlock::iterator MI, int Count) const;
+
+  /// \brief Returns the operand named \p OperandName. If \p MI does not have
+  /// an operand named \c OperandName, this function returns nullptr.
+ const MachineOperand *getNamedOperand(const MachineInstr& MI, + unsigned OperandName) const; }; namespace AMDGPU { int getVOPe64(uint16_t Opcode); + int getVOPe32(uint16_t Opcode); int getCommuteRev(uint16_t Opcode); int getCommuteOrig(uint16_t Opcode); int getMCOpcode(uint16_t Opcode, unsigned Gen); const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL; - + const uint64_t RSRC_TID_ENABLE = 1LL << 55; } // End namespace AMDGPU diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td index eb9746779374..b0ac20f558d0 100644 --- a/lib/Target/R600/SIInstrInfo.td +++ b/lib/Target/R600/SIInstrInfo.td @@ -57,6 +57,10 @@ def SIsampleb : SDSample<"AMDGPUISD::SAMPLEB">; def SIsampled : SDSample<"AMDGPUISD::SAMPLED">; def SIsamplel : SDSample<"AMDGPUISD::SAMPLEL">; +def SIconstdata_ptr : SDNode< + "AMDGPUISD::CONST_DATA_PTR", SDTypeProfile <1, 0, [SDTCisVT<0, i64>]> +>; + // Transformation function, extract the lower 32bit of a 64bit immediate def LO32 : SDNodeXFormgetTargetConstant(N->getZExtValue() & 0xffffffff, MVT::i32); @@ -142,10 +146,27 @@ class SGPRImm : PatLeaf; +//===----------------------------------------------------------------------===// +// Custom Operands +//===----------------------------------------------------------------------===// + def FRAMEri32 : Operand { let MIOperandInfo = (ops i32:$ptr, i32imm:$index); } +def sopp_brtarget : Operand { + let EncoderMethod = "getSOPPBrEncoding"; + let OperandType = "OPERAND_PCREL"; +} + +//===----------------------------------------------------------------------===// +// Complex patterns +//===----------------------------------------------------------------------===// + +def MUBUFAddr32 : ComplexPattern; +def MUBUFAddr64 : ComplexPattern; +def MUBUFScratch : ComplexPattern; + //===----------------------------------------------------------------------===// // SI assembler operands //===----------------------------------------------------------------------===// @@ -263,16 +284,54 @@ class SIMCInstr { int Subtarget = subtarget; } +class VOP3_Pseudo pattern, string opName> : + VOP3Common , + VOP , + SIMCInstr { + let isPseudo = 1; +} + +class VOP3_Real_si op, dag outs, dag ins, string asm, string opName> : + VOP3 , + SIMCInstr; + multiclass VOP3_m op, dag outs, dag ins, string asm, list pattern, string opName> { - def "" : VOP3Common , VOP , - SIMCInstr { - let isPseudo = 1; - } + def "" : VOP3_Pseudo ; + + def _si : VOP3_Real_si ; + +} + +multiclass VOP3_1_m op, dag outs, dag ins, string asm, + list pattern, string opName> { + + def "" : VOP3_Pseudo ; - def _si : VOP3 , SIMCInstr; + let src1 = 0, src1_modifiers = 0, src2 = 0, src2_modifiers = 0 in { + def _si : VOP3_Real_si < + {1, 1, op{6}, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}}, + outs, ins, asm, opName + >; + + } // src1 = 0, src1_modifiers = 0, src2 = 0, src2_modifiers = 0 +} + +multiclass VOP3_2_m op, dag outs, dag ins, string asm, + list pattern, string opName, string revOp> { + + def "" : VOP3_Pseudo ; + + let src2 = 0, src2_modifiers = 0 in { + + def _si : VOP3_Real_si < + {1, 0, 0, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}}, + outs, ins, asm, opName>, + VOP2_REV; + + } // src2 = 0, src2_modifiers = 0 } // This must always be right before the operand being input modified. 
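The VOP3_m multiclasses above introduce the pseudo/real split used throughout this patch: a generation-independent VOP3_Pseudo carries the selection patterns, and a VOP3_Real_si twin carries the SI encoding, the two tied together through SIMCInstr and the getMCOpcode table declared in the SIInstrInfo.h hunk. A minimal C++ sketch of how such a table is typically consumed when lowering a MachineInstr opcode to its MC form; resolveMCOpcode and the value passed for Gen are illustrative assumptions, not code from this patch:

#include <cstdint>

namespace AMDGPU {
// TableGen-generated mapping, declared in SIInstrInfo.h above.
int getMCOpcode(uint16_t Opcode, unsigned Gen);
}

// Resolve a generation-independent pseudo opcode to the real encoding for
// the selected subtarget generation.
static unsigned resolveMCOpcode(uint16_t PseudoOpc, unsigned Gen) {
  int MCOpc = AMDGPU::getMCOpcode(PseudoOpc, Gen);
  // -1 means the mapping has no entry: the opcode is already a real
  // instruction and can be emitted unchanged.
  return MCOpc == -1 ? PseudoOpc : static_cast<unsigned>(MCOpc);
}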
@@ -288,15 +347,11 @@ multiclass VOP1_Helper op, RegisterClass drc, RegisterClass src, opName#"_e32 $dst, $src0", pattern >, VOP ; - def _e64 : VOP3 < - {1, 1, op{6}, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}}, + defm _e64 : VOP3_1_m < + op, (outs drc:$dst), (ins InputMods:$src0_modifiers, src:$src0, i32imm:$clamp, i32imm:$omod), - opName#"_e64 $dst, $src0_modifiers, $clamp, $omod", [] - >, VOP { - let src1 = SIOperand.ZERO; - let src2 = SIOperand.ZERO; - } + opName#"_e64 $dst, $src0_modifiers, $clamp, $omod", [], opName>; } multiclass VOP1_32 op, string opName, list pattern> @@ -318,16 +373,14 @@ multiclass VOP2_Helper op, RegisterClass vrc, RegisterClass arc, opName#"_e32 $dst, $src0, $src1", pattern >, VOP , VOP2_REV; - def _e64 : VOP3 < - {1, 0, 0, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}}, + defm _e64 : VOP3_2_m < + op, (outs vrc:$dst), (ins InputMods:$src0_modifiers, arc:$src0, InputMods:$src1_modifiers, arc:$src1, i32imm:$clamp, i32imm:$omod), - opName#"_e64 $dst, $src0_modifiers, $src1_modifiers, $clamp, $omod", [] - >, VOP , VOP2_REV { - let src2 = SIOperand.ZERO; - } + opName#"_e64 $dst, $src0_modifiers, $src1_modifiers, $clamp, $omod", [], + opName, revOp>; } multiclass VOP2_32 op, string opName, list pattern, @@ -354,7 +407,8 @@ multiclass VOP2b_32 op, string opName, list pattern, i32imm:$clamp, i32imm:$omod), opName#"_e64 $dst, $src0_modifiers, $src1_modifiers, $clamp, $omod", [] >, VOP , VOP2_REV { - let src2 = SIOperand.ZERO; + let src2 = 0; + let src2_modifiers = 0; /* the VOP2 variant puts the carry out into VCC, the VOP3 variant can write it into any SGPR. We currently don't use the carry out, so for now hardcode it to VCC as well */ @@ -368,7 +422,7 @@ multiclass VOPC_Helper op, RegisterClass vrc, RegisterClass arc, op, (ins arc:$src0, vrc:$src1), opName#"_e32 $dst, $src0, $src1", [] >, VOP { - let Defs = !if(defExec, [VCC, EXEC], [VCC]); + let Defs = !if(defExec, [EXEC], []); } def _e64 : VOP3 < @@ -383,7 +437,7 @@ multiclass VOPC_Helper op, RegisterClass vrc, RegisterClass arc, ) >, VOP { let Defs = !if(defExec, [EXEC], []); - let src2 = SIOperand.ZERO; + let src2 = 0; let src2_modifiers = 0; } } @@ -418,7 +472,8 @@ class VOP3_64_32 op, string opName, list pattern> : VOP3 < opName#" $dst, $src0, $src1", pattern >, VOP { - let src2 = SIOperand.ZERO; + let src2 = 0; + let src2_modifiers = 0; let src0_modifiers = 0; let clamp = 0; let omod = 0; @@ -433,6 +488,22 @@ class VOP3_64 op, string opName, list pattern> : VOP3 < opName#" $dst, $src0_modifiers, $src1_modifiers, $src2_modifiers, $clamp, $omod", pattern >, VOP ; + +class VOP3b_Helper op, RegisterClass vrc, RegisterClass arc, + string opName, list pattern> : VOP3 < + op, (outs vrc:$dst0, SReg_64:$dst1), + (ins arc:$src0, arc:$src1, arc:$src2, + InstFlag:$abs, InstFlag:$clamp, InstFlag:$omod, InstFlag:$neg), + opName#" $dst0, $dst1, $src0, $src1, $src2, $abs, $clamp, $omod, $neg", pattern +>, VOP ; + + +class VOP3b_64 op, string opName, list pattern> : + VOP3b_Helper ; + +class VOP3b_32 op, string opName, list pattern> : + VOP3b_Helper ; + //===----------------------------------------------------------------------===// // Vector I/O classes //===----------------------------------------------------------------------===// @@ -554,26 +625,28 @@ class MTBUF_Store_Helper op, string asm, RegisterClass regClass> : MTBU let mayLoad = 0; } -multiclass MUBUF_Load_Helper op, string asm, RegisterClass regClass> { +multiclass MUBUF_Load_Helper op, string asm, RegisterClass regClass, + ValueType load_vt = i32, + SDPatternOperator ld 
= null_frag> { let lds = 0, mayLoad = 1 in { let addr64 = 0 in { - let offen = 0, idxen = 0 in { + let offen = 0, idxen = 0, vaddr = 0 in { def _OFFSET : MUBUF ; } - let offen = 1, idxen = 0, offset = 0 in { + let offen = 1, idxen = 0 in { def _OFFEN : MUBUF ; + asm#" $vdata, $srsrc + $vaddr + $soffset + $offset, glc=$glc, slc=$slc, tfe=$tfe", []>; } let offen = 0, idxen = 1 in { @@ -596,29 +669,47 @@ multiclass MUBUF_Load_Helper op, string asm, RegisterClass regClass> { let offen = 0, idxen = 0, addr64 = 1, glc = 0, slc = 0, tfe = 0, soffset = 128 /* ZERO */ in { def _ADDR64 : MUBUF ; + asm#" $vdata, $srsrc + $vaddr + $offset", + [(set load_vt:$vdata, (ld (MUBUFAddr64 v4i32:$srsrc, + i64:$vaddr, u16imm:$offset)))]>; } } } -class MUBUF_Store_Helper op, string name, RegisterClass vdataClass> : - MUBUF { +multiclass MUBUF_Store_Helper op, string name, RegisterClass vdataClass, + ValueType store_vt, SDPatternOperator st> { - let mayLoad = 0; - let mayStore = 1; + def "" : MUBUF < + op, (outs), + (ins vdataClass:$vdata, SReg_128:$srsrc, VReg_32:$vaddr, SSrc_32:$soffset, + u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$slc, + i1imm:$tfe), + name#" $vdata, $srsrc, $vaddr, $soffset, $offset $offen $idxen $glc $slc $tfe", + [] + > { + let addr64 = 0; + } + + def _ADDR64 : MUBUF < + op, (outs), + (ins vdataClass:$vdata, SReg_128:$srsrc, VReg_64:$vaddr, u16imm:$offset), + name#" $vdata, $srsrc + $vaddr + $offset", + [(st store_vt:$vdata, + (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, u16imm:$offset))]> { - // Encoding - let offen = 0; - let idxen = 0; - let glc = 0; - let addr64 = 1; - let lds = 0; - let slc = 0; - let tfe = 0; - let soffset = 128; // ZERO + let mayLoad = 0; + let mayStore = 1; + + // Encoding + let offen = 0; + let idxen = 0; + let glc = 0; + let addr64 = 1; + let lds = 0; + let slc = 0; + let tfe = 0; + let soffset = 128; // ZERO + } } class MTBUF_Load_Helper op, string asm, RegisterClass regClass> : MTBUF < @@ -712,6 +803,53 @@ multiclass MIMG_Sampler op, string asm> { defm _V4 : MIMG_Sampler_Src_Helper; } +class MIMG_Gather_Helper op, string asm, + RegisterClass dst_rc, + RegisterClass src_rc> : MIMG < + op, + (outs dst_rc:$vdata), + (ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128, + i1imm:$tfe, i1imm:$lwe, i1imm:$slc, src_rc:$vaddr, + SReg_256:$srsrc, SReg_128:$ssamp), + asm#" $vdata, $dmask, $unorm, $glc, $da, $r128," + #" $tfe, $lwe, $slc, $vaddr, $srsrc, $ssamp", + []> { + let mayLoad = 1; + let mayStore = 0; + + // DMASK was repurposed for GATHER4. 4 components are always + // returned and DMASK works like a swizzle - it selects + // the component to fetch. The only useful DMASK values are + // 1=red, 2=green, 4=blue, 8=alpha. (e.g. 1 returns + // (red,red,red,red) etc.) The ISA document doesn't mention + // this. 
+ // Therefore, disable all code which updates DMASK by setting these two: + let MIMG = 0; + let hasPostISelHook = 0; +} + +multiclass MIMG_Gather_Src_Helper op, string asm, + RegisterClass dst_rc, + int channels> { + def _V1 : MIMG_Gather_Helper , + MIMG_Mask; + def _V2 : MIMG_Gather_Helper , + MIMG_Mask; + def _V4 : MIMG_Gather_Helper , + MIMG_Mask; + def _V8 : MIMG_Gather_Helper , + MIMG_Mask; + def _V16 : MIMG_Gather_Helper , + MIMG_Mask; +} + +multiclass MIMG_Gather op, string asm> { + defm _V1 : MIMG_Gather_Src_Helper; + defm _V2 : MIMG_Gather_Src_Helper; + defm _V3 : MIMG_Gather_Src_Helper; + defm _V4 : MIMG_Gather_Src_Helper; +} + //===----------------------------------------------------------------------===// // Vector instruction mappings //===----------------------------------------------------------------------===// @@ -725,6 +863,15 @@ def getVOPe64 : InstrMapping { let ValueCols = [["8"]]; } +// Maps an opcode in e64 form to its e32 equivalent +def getVOPe32 : InstrMapping { + let FilterClass = "VOP"; + let RowFields = ["OpName"]; + let ColFields = ["Size"]; + let KeyCol = ["8"]; + let ValueCols = [["4"]]; +} + // Maps an original opcode to its commuted version def getCommuteRev : InstrMapping { let FilterClass = "VOP2_REV"; diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td index 428e49c6431c..aecd847a2ba1 100644 --- a/lib/Target/R600/SIInstructions.td +++ b/lib/Target/R600/SIInstructions.td @@ -139,7 +139,11 @@ def S_SEXT_I32_I16 : SOP1_32 <0x0000001a, "S_SEXT_I32_I16", ////def S_BITSET0_B64 : SOP1_BITSET0 <0x0000001c, "S_BITSET0_B64", []>; ////def S_BITSET1_B32 : SOP1_BITSET1 <0x0000001d, "S_BITSET1_B32", []>; ////def S_BITSET1_B64 : SOP1_BITSET1 <0x0000001e, "S_BITSET1_B64", []>; -def S_GETPC_B64 : SOP1_64 <0x0000001f, "S_GETPC_B64", []>; +def S_GETPC_B64 : SOP1 < + 0x0000001f, (outs SReg_64:$dst), (ins), "S_GETPC_B64 $dst", [] +> { + let SSRC0 = 0; +} def S_SETPC_B64 : SOP1_64 <0x00000020, "S_SETPC_B64", []>; def S_SWAPPC_B64 : SOP1_64 <0x00000021, "S_SWAPPC_B64", []>; def S_RFE_B64 : SOP1_64 <0x00000022, "S_RFE_B64", []>; @@ -365,56 +369,56 @@ def S_GETREG_REGRD_B32 : SOPK_32 <0x00000014, "S_GETREG_REGRD_B32", []>; // SOPP Instructions //===----------------------------------------------------------------------===// -def S_NOP : SOPP <0x00000000, (ins i16imm:$SIMM16), "S_NOP $SIMM16", []>; +def S_NOP : SOPP <0x00000000, (ins i16imm:$simm16), "S_NOP $simm16", []>; let isTerminator = 1 in { def S_ENDPGM : SOPP <0x00000001, (ins), "S_ENDPGM", [(IL_retflag)]> { - let SIMM16 = 0; + let simm16 = 0; let isBarrier = 1; let hasCtrlDep = 1; } let isBranch = 1 in { def S_BRANCH : SOPP < - 0x00000002, (ins brtarget:$target), "S_BRANCH $target", - [(br bb:$target)]> { + 0x00000002, (ins sopp_brtarget:$simm16), "S_BRANCH $simm16", + [(br bb:$simm16)]> { let isBarrier = 1; } let DisableEncoding = "$scc" in { def S_CBRANCH_SCC0 : SOPP < - 0x00000004, (ins brtarget:$target, SCCReg:$scc), - "S_CBRANCH_SCC0 $target", [] + 0x00000004, (ins sopp_brtarget:$simm16, SCCReg:$scc), + "S_CBRANCH_SCC0 $simm16", [] >; def S_CBRANCH_SCC1 : SOPP < - 0x00000005, (ins brtarget:$target, SCCReg:$scc), - "S_CBRANCH_SCC1 $target", + 0x00000005, (ins sopp_brtarget:$simm16, SCCReg:$scc), + "S_CBRANCH_SCC1 $simm16", [] >; } // End DisableEncoding = "$scc" def S_CBRANCH_VCCZ : SOPP < - 0x00000006, (ins brtarget:$target, VCCReg:$vcc), - "S_CBRANCH_VCCZ $target", + 0x00000006, (ins sopp_brtarget:$simm16, VCCReg:$vcc), + "S_CBRANCH_VCCZ $simm16", [] >; def S_CBRANCH_VCCNZ : SOPP < - 
0x00000007, (ins brtarget:$target, VCCReg:$vcc), - "S_CBRANCH_VCCNZ $target", + 0x00000007, (ins sopp_brtarget:$simm16, VCCReg:$vcc), + "S_CBRANCH_VCCNZ $simm16", [] >; let DisableEncoding = "$exec" in { def S_CBRANCH_EXECZ : SOPP < - 0x00000008, (ins brtarget:$target, EXECReg:$exec), - "S_CBRANCH_EXECZ $target", + 0x00000008, (ins sopp_brtarget:$simm16, EXECReg:$exec), + "S_CBRANCH_EXECZ $simm16", [] >; def S_CBRANCH_EXECNZ : SOPP < - 0x00000009, (ins brtarget:$target, EXECReg:$exec), - "S_CBRANCH_EXECNZ $target", + 0x00000009, (ins sopp_brtarget:$simm16, EXECReg:$exec), + "S_CBRANCH_EXECNZ $simm16", [] >; } // End DisableEncoding = "$exec" @@ -427,7 +431,7 @@ let hasSideEffects = 1 in { def S_BARRIER : SOPP <0x0000000a, (ins), "S_BARRIER", [(int_AMDGPU_barrier_local)] > { - let SIMM16 = 0; + let simm16 = 0; let isBarrier = 1; let hasCtrlDep = 1; let mayLoad = 1; @@ -846,32 +850,46 @@ defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Load_Helper <0x00000003, "BUFFER_LOAD_FORMA //def BUFFER_STORE_FORMAT_XY : MUBUF_ <0x00000005, "BUFFER_STORE_FORMAT_XY", []>; //def BUFFER_STORE_FORMAT_XYZ : MUBUF_ <0x00000006, "BUFFER_STORE_FORMAT_XYZ", []>; //def BUFFER_STORE_FORMAT_XYZW : MUBUF_ <0x00000007, "BUFFER_STORE_FORMAT_XYZW", []>; -defm BUFFER_LOAD_UBYTE : MUBUF_Load_Helper <0x00000008, "BUFFER_LOAD_UBYTE", VReg_32>; -defm BUFFER_LOAD_SBYTE : MUBUF_Load_Helper <0x00000009, "BUFFER_LOAD_SBYTE", VReg_32>; -defm BUFFER_LOAD_USHORT : MUBUF_Load_Helper <0x0000000a, "BUFFER_LOAD_USHORT", VReg_32>; -defm BUFFER_LOAD_SSHORT : MUBUF_Load_Helper <0x0000000b, "BUFFER_LOAD_SSHORT", VReg_32>; -defm BUFFER_LOAD_DWORD : MUBUF_Load_Helper <0x0000000c, "BUFFER_LOAD_DWORD", VReg_32>; -defm BUFFER_LOAD_DWORDX2 : MUBUF_Load_Helper <0x0000000d, "BUFFER_LOAD_DWORDX2", VReg_64>; -defm BUFFER_LOAD_DWORDX4 : MUBUF_Load_Helper <0x0000000e, "BUFFER_LOAD_DWORDX4", VReg_128>; +defm BUFFER_LOAD_UBYTE : MUBUF_Load_Helper < + 0x00000008, "BUFFER_LOAD_UBYTE", VReg_32, i32, az_extloadi8_global +>; +defm BUFFER_LOAD_SBYTE : MUBUF_Load_Helper < + 0x00000009, "BUFFER_LOAD_SBYTE", VReg_32, i32, sextloadi8_global +>; +defm BUFFER_LOAD_USHORT : MUBUF_Load_Helper < + 0x0000000a, "BUFFER_LOAD_USHORT", VReg_32, i32, az_extloadi16_global +>; +defm BUFFER_LOAD_SSHORT : MUBUF_Load_Helper < + 0x0000000b, "BUFFER_LOAD_SSHORT", VReg_32, i32, sextloadi16_global +>; +defm BUFFER_LOAD_DWORD : MUBUF_Load_Helper < + 0x0000000c, "BUFFER_LOAD_DWORD", VReg_32, i32, global_load +>; +defm BUFFER_LOAD_DWORDX2 : MUBUF_Load_Helper < + 0x0000000d, "BUFFER_LOAD_DWORDX2", VReg_64, v2i32, global_load +>; +defm BUFFER_LOAD_DWORDX4 : MUBUF_Load_Helper < + 0x0000000e, "BUFFER_LOAD_DWORDX4", VReg_128, v4i32, global_load +>; -def BUFFER_STORE_BYTE : MUBUF_Store_Helper < - 0x00000018, "BUFFER_STORE_BYTE", VReg_32 +defm BUFFER_STORE_BYTE : MUBUF_Store_Helper < + 0x00000018, "BUFFER_STORE_BYTE", VReg_32, i32, truncstorei8_global >; -def BUFFER_STORE_SHORT : MUBUF_Store_Helper < - 0x0000001a, "BUFFER_STORE_SHORT", VReg_32 +defm BUFFER_STORE_SHORT : MUBUF_Store_Helper < + 0x0000001a, "BUFFER_STORE_SHORT", VReg_32, i32, truncstorei16_global >; -def BUFFER_STORE_DWORD : MUBUF_Store_Helper < - 0x0000001c, "BUFFER_STORE_DWORD", VReg_32 +defm BUFFER_STORE_DWORD : MUBUF_Store_Helper < + 0x0000001c, "BUFFER_STORE_DWORD", VReg_32, i32, global_store >; -def BUFFER_STORE_DWORDX2 : MUBUF_Store_Helper < - 0x0000001d, "BUFFER_STORE_DWORDX2", VReg_64 +defm BUFFER_STORE_DWORDX2 : MUBUF_Store_Helper < + 0x0000001d, "BUFFER_STORE_DWORDX2", VReg_64, v2i32, global_store >; -def BUFFER_STORE_DWORDX4 : 
MUBUF_Store_Helper < - 0x0000001e, "BUFFER_STORE_DWORDX4", VReg_128 +defm BUFFER_STORE_DWORDX4 : MUBUF_Store_Helper < + 0x0000001e, "BUFFER_STORE_DWORDX4", VReg_128, v4i32, global_store >; //def BUFFER_ATOMIC_SWAP : MUBUF_ <0x00000030, "BUFFER_ATOMIC_SWAP", []>; //def BUFFER_ATOMIC_CMPSWAP : MUBUF_ <0x00000031, "BUFFER_ATOMIC_CMPSWAP", []>; @@ -955,71 +973,71 @@ defm IMAGE_GET_RESINFO : MIMG_NoSampler <0x0000000e, "IMAGE_GET_RESINFO">; //def IMAGE_ATOMIC_FCMPSWAP : MIMG_NoPattern_ <"IMAGE_ATOMIC_FCMPSWAP", 0x0000001d>; //def IMAGE_ATOMIC_FMIN : MIMG_NoPattern_ <"IMAGE_ATOMIC_FMIN", 0x0000001e>; //def IMAGE_ATOMIC_FMAX : MIMG_NoPattern_ <"IMAGE_ATOMIC_FMAX", 0x0000001f>; -defm IMAGE_SAMPLE : MIMG_Sampler <0x00000020, "IMAGE_SAMPLE">; -//def IMAGE_SAMPLE_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_CL", 0x00000021>; -defm IMAGE_SAMPLE_D : MIMG_Sampler <0x00000022, "IMAGE_SAMPLE_D">; -//def IMAGE_SAMPLE_D_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_D_CL", 0x00000023>; -defm IMAGE_SAMPLE_L : MIMG_Sampler <0x00000024, "IMAGE_SAMPLE_L">; -defm IMAGE_SAMPLE_B : MIMG_Sampler <0x00000025, "IMAGE_SAMPLE_B">; -//def IMAGE_SAMPLE_B_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_B_CL", 0x00000026>; -//def IMAGE_SAMPLE_LZ : MIMG_NoPattern_ <"IMAGE_SAMPLE_LZ", 0x00000027>; -defm IMAGE_SAMPLE_C : MIMG_Sampler <0x00000028, "IMAGE_SAMPLE_C">; -//def IMAGE_SAMPLE_C_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CL", 0x00000029>; -defm IMAGE_SAMPLE_C_D : MIMG_Sampler <0x0000002a, "IMAGE_SAMPLE_C_D">; -//def IMAGE_SAMPLE_C_D_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_D_CL", 0x0000002b>; -defm IMAGE_SAMPLE_C_L : MIMG_Sampler <0x0000002c, "IMAGE_SAMPLE_C_L">; -defm IMAGE_SAMPLE_C_B : MIMG_Sampler <0x0000002d, "IMAGE_SAMPLE_C_B">; -//def IMAGE_SAMPLE_C_B_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_B_CL", 0x0000002e>; -//def IMAGE_SAMPLE_C_LZ : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_LZ", 0x0000002f>; -//def IMAGE_SAMPLE_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_O", 0x00000030>; -//def IMAGE_SAMPLE_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_CL_O", 0x00000031>; -//def IMAGE_SAMPLE_D_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_D_O", 0x00000032>; -//def IMAGE_SAMPLE_D_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_D_CL_O", 0x00000033>; -//def IMAGE_SAMPLE_L_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_L_O", 0x00000034>; -//def IMAGE_SAMPLE_B_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_B_O", 0x00000035>; -//def IMAGE_SAMPLE_B_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_B_CL_O", 0x00000036>; -//def IMAGE_SAMPLE_LZ_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_LZ_O", 0x00000037>; -//def IMAGE_SAMPLE_C_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_O", 0x00000038>; -//def IMAGE_SAMPLE_C_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CL_O", 0x00000039>; -//def IMAGE_SAMPLE_C_D_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_D_O", 0x0000003a>; -//def IMAGE_SAMPLE_C_D_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_D_CL_O", 0x0000003b>; -//def IMAGE_SAMPLE_C_L_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_L_O", 0x0000003c>; -//def IMAGE_SAMPLE_C_B_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_B_O", 0x0000003d>; -//def IMAGE_SAMPLE_C_B_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_B_CL_O", 0x0000003e>; -//def IMAGE_SAMPLE_C_LZ_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_LZ_O", 0x0000003f>; -//def IMAGE_GATHER4 : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4", 0x00000040>; -//def IMAGE_GATHER4_CL : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_CL", 0x00000041>; -//def IMAGE_GATHER4_L : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_L", 0x00000044>; -//def IMAGE_GATHER4_B : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_B", 0x00000045>; -//def IMAGE_GATHER4_B_CL : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_B_CL", 0x00000046>; -//def 
IMAGE_GATHER4_LZ : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_LZ", 0x00000047>; -//def IMAGE_GATHER4_C : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C", 0x00000048>; -//def IMAGE_GATHER4_C_CL : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_CL", 0x00000049>; -//def IMAGE_GATHER4_C_L : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_L", 0x0000004c>; -//def IMAGE_GATHER4_C_B : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_B", 0x0000004d>; -//def IMAGE_GATHER4_C_B_CL : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_B_CL", 0x0000004e>; -//def IMAGE_GATHER4_C_LZ : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_LZ", 0x0000004f>; -//def IMAGE_GATHER4_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_O", 0x00000050>; -//def IMAGE_GATHER4_CL_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_CL_O", 0x00000051>; -//def IMAGE_GATHER4_L_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_L_O", 0x00000054>; -//def IMAGE_GATHER4_B_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_B_O", 0x00000055>; -//def IMAGE_GATHER4_B_CL_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_B_CL_O", 0x00000056>; -//def IMAGE_GATHER4_LZ_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_LZ_O", 0x00000057>; -//def IMAGE_GATHER4_C_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_O", 0x00000058>; -//def IMAGE_GATHER4_C_CL_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_CL_O", 0x00000059>; -//def IMAGE_GATHER4_C_L_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_L_O", 0x0000005c>; -//def IMAGE_GATHER4_C_B_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_B_O", 0x0000005d>; -//def IMAGE_GATHER4_C_B_CL_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_B_CL_O", 0x0000005e>; -//def IMAGE_GATHER4_C_LZ_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_LZ_O", 0x0000005f>; -//def IMAGE_GET_LOD : MIMG_NoPattern_ <"IMAGE_GET_LOD", 0x00000060>; -//def IMAGE_SAMPLE_CD : MIMG_NoPattern_ <"IMAGE_SAMPLE_CD", 0x00000068>; -//def IMAGE_SAMPLE_CD_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_CD_CL", 0x00000069>; -//def IMAGE_SAMPLE_C_CD : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CD", 0x0000006a>; -//def IMAGE_SAMPLE_C_CD_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CD_CL", 0x0000006b>; -//def IMAGE_SAMPLE_CD_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_CD_O", 0x0000006c>; -//def IMAGE_SAMPLE_CD_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_CD_CL_O", 0x0000006d>; -//def IMAGE_SAMPLE_C_CD_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CD_O", 0x0000006e>; -//def IMAGE_SAMPLE_C_CD_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CD_CL_O", 0x0000006f>; +defm IMAGE_SAMPLE : MIMG_Sampler <0x00000020, "IMAGE_SAMPLE">; +defm IMAGE_SAMPLE_CL : MIMG_Sampler <0x00000021, "IMAGE_SAMPLE_CL">; +defm IMAGE_SAMPLE_D : MIMG_Sampler <0x00000022, "IMAGE_SAMPLE_D">; +defm IMAGE_SAMPLE_D_CL : MIMG_Sampler <0x00000023, "IMAGE_SAMPLE_D_CL">; +defm IMAGE_SAMPLE_L : MIMG_Sampler <0x00000024, "IMAGE_SAMPLE_L">; +defm IMAGE_SAMPLE_B : MIMG_Sampler <0x00000025, "IMAGE_SAMPLE_B">; +defm IMAGE_SAMPLE_B_CL : MIMG_Sampler <0x00000026, "IMAGE_SAMPLE_B_CL">; +defm IMAGE_SAMPLE_LZ : MIMG_Sampler <0x00000027, "IMAGE_SAMPLE_LZ">; +defm IMAGE_SAMPLE_C : MIMG_Sampler <0x00000028, "IMAGE_SAMPLE_C">; +defm IMAGE_SAMPLE_C_CL : MIMG_Sampler <0x00000029, "IMAGE_SAMPLE_C_CL">; +defm IMAGE_SAMPLE_C_D : MIMG_Sampler <0x0000002a, "IMAGE_SAMPLE_C_D">; +defm IMAGE_SAMPLE_C_D_CL : MIMG_Sampler <0x0000002b, "IMAGE_SAMPLE_C_D_CL">; +defm IMAGE_SAMPLE_C_L : MIMG_Sampler <0x0000002c, "IMAGE_SAMPLE_C_L">; +defm IMAGE_SAMPLE_C_B : MIMG_Sampler <0x0000002d, "IMAGE_SAMPLE_C_B">; +defm IMAGE_SAMPLE_C_B_CL : MIMG_Sampler <0x0000002e, "IMAGE_SAMPLE_C_B_CL">; +defm IMAGE_SAMPLE_C_LZ : MIMG_Sampler <0x0000002f, "IMAGE_SAMPLE_C_LZ">; +defm IMAGE_SAMPLE_O : MIMG_Sampler 
<0x00000030, "IMAGE_SAMPLE_O">; +defm IMAGE_SAMPLE_CL_O : MIMG_Sampler <0x00000031, "IMAGE_SAMPLE_CL_O">; +defm IMAGE_SAMPLE_D_O : MIMG_Sampler <0x00000032, "IMAGE_SAMPLE_D_O">; +defm IMAGE_SAMPLE_D_CL_O : MIMG_Sampler <0x00000033, "IMAGE_SAMPLE_D_CL_O">; +defm IMAGE_SAMPLE_L_O : MIMG_Sampler <0x00000034, "IMAGE_SAMPLE_L_O">; +defm IMAGE_SAMPLE_B_O : MIMG_Sampler <0x00000035, "IMAGE_SAMPLE_B_O">; +defm IMAGE_SAMPLE_B_CL_O : MIMG_Sampler <0x00000036, "IMAGE_SAMPLE_B_CL_O">; +defm IMAGE_SAMPLE_LZ_O : MIMG_Sampler <0x00000037, "IMAGE_SAMPLE_LZ_O">; +defm IMAGE_SAMPLE_C_O : MIMG_Sampler <0x00000038, "IMAGE_SAMPLE_C_O">; +defm IMAGE_SAMPLE_C_CL_O : MIMG_Sampler <0x00000039, "IMAGE_SAMPLE_C_CL_O">; +defm IMAGE_SAMPLE_C_D_O : MIMG_Sampler <0x0000003a, "IMAGE_SAMPLE_C_D_O">; +defm IMAGE_SAMPLE_C_D_CL_O : MIMG_Sampler <0x0000003b, "IMAGE_SAMPLE_C_D_CL_O">; +defm IMAGE_SAMPLE_C_L_O : MIMG_Sampler <0x0000003c, "IMAGE_SAMPLE_C_L_O">; +defm IMAGE_SAMPLE_C_B_O : MIMG_Sampler <0x0000003d, "IMAGE_SAMPLE_C_B_O">; +defm IMAGE_SAMPLE_C_B_CL_O : MIMG_Sampler <0x0000003e, "IMAGE_SAMPLE_C_B_CL_O">; +defm IMAGE_SAMPLE_C_LZ_O : MIMG_Sampler <0x0000003f, "IMAGE_SAMPLE_C_LZ_O">; +defm IMAGE_GATHER4 : MIMG_Gather <0x00000040, "IMAGE_GATHER4">; +defm IMAGE_GATHER4_CL : MIMG_Gather <0x00000041, "IMAGE_GATHER4_CL">; +defm IMAGE_GATHER4_L : MIMG_Gather <0x00000044, "IMAGE_GATHER4_L">; +defm IMAGE_GATHER4_B : MIMG_Gather <0x00000045, "IMAGE_GATHER4_B">; +defm IMAGE_GATHER4_B_CL : MIMG_Gather <0x00000046, "IMAGE_GATHER4_B_CL">; +defm IMAGE_GATHER4_LZ : MIMG_Gather <0x00000047, "IMAGE_GATHER4_LZ">; +defm IMAGE_GATHER4_C : MIMG_Gather <0x00000048, "IMAGE_GATHER4_C">; +defm IMAGE_GATHER4_C_CL : MIMG_Gather <0x00000049, "IMAGE_GATHER4_C_CL">; +defm IMAGE_GATHER4_C_L : MIMG_Gather <0x0000004c, "IMAGE_GATHER4_C_L">; +defm IMAGE_GATHER4_C_B : MIMG_Gather <0x0000004d, "IMAGE_GATHER4_C_B">; +defm IMAGE_GATHER4_C_B_CL : MIMG_Gather <0x0000004e, "IMAGE_GATHER4_C_B_CL">; +defm IMAGE_GATHER4_C_LZ : MIMG_Gather <0x0000004f, "IMAGE_GATHER4_C_LZ">; +defm IMAGE_GATHER4_O : MIMG_Gather <0x00000050, "IMAGE_GATHER4_O">; +defm IMAGE_GATHER4_CL_O : MIMG_Gather <0x00000051, "IMAGE_GATHER4_CL_O">; +defm IMAGE_GATHER4_L_O : MIMG_Gather <0x00000054, "IMAGE_GATHER4_L_O">; +defm IMAGE_GATHER4_B_O : MIMG_Gather <0x00000055, "IMAGE_GATHER4_B_O">; +defm IMAGE_GATHER4_B_CL_O : MIMG_Gather <0x00000056, "IMAGE_GATHER4_B_CL_O">; +defm IMAGE_GATHER4_LZ_O : MIMG_Gather <0x00000057, "IMAGE_GATHER4_LZ_O">; +defm IMAGE_GATHER4_C_O : MIMG_Gather <0x00000058, "IMAGE_GATHER4_C_O">; +defm IMAGE_GATHER4_C_CL_O : MIMG_Gather <0x00000059, "IMAGE_GATHER4_C_CL_O">; +defm IMAGE_GATHER4_C_L_O : MIMG_Gather <0x0000005c, "IMAGE_GATHER4_C_L_O">; +defm IMAGE_GATHER4_C_B_O : MIMG_Gather <0x0000005d, "IMAGE_GATHER4_C_B_O">; +defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather <0x0000005e, "IMAGE_GATHER4_C_B_CL_O">; +defm IMAGE_GATHER4_C_LZ_O : MIMG_Gather <0x0000005f, "IMAGE_GATHER4_C_LZ_O">; +defm IMAGE_GET_LOD : MIMG_Sampler <0x00000060, "IMAGE_GET_LOD">; +defm IMAGE_SAMPLE_CD : MIMG_Sampler <0x00000068, "IMAGE_SAMPLE_CD">; +defm IMAGE_SAMPLE_CD_CL : MIMG_Sampler <0x00000069, "IMAGE_SAMPLE_CD_CL">; +defm IMAGE_SAMPLE_C_CD : MIMG_Sampler <0x0000006a, "IMAGE_SAMPLE_C_CD">; +defm IMAGE_SAMPLE_C_CD_CL : MIMG_Sampler <0x0000006b, "IMAGE_SAMPLE_C_CD_CL">; +defm IMAGE_SAMPLE_CD_O : MIMG_Sampler <0x0000006c, "IMAGE_SAMPLE_CD_O">; +defm IMAGE_SAMPLE_CD_CL_O : MIMG_Sampler <0x0000006d, "IMAGE_SAMPLE_CD_CL_O">; +defm IMAGE_SAMPLE_C_CD_O : MIMG_Sampler <0x0000006e, "IMAGE_SAMPLE_C_CD_O">; +defm 
IMAGE_SAMPLE_C_CD_CL_O : MIMG_Sampler <0x0000006f, "IMAGE_SAMPLE_C_CD_CL_O">; //def IMAGE_RSRC256 : MIMG_NoPattern_RSRC256 <"IMAGE_RSRC256", 0x0000007e>; //def IMAGE_SAMPLER : MIMG_NoPattern_ <"IMAGE_SAMPLER", 0x0000007f>; @@ -1064,8 +1082,12 @@ defm V_CVT_I32_F32 : VOP1_32 <0x00000008, "V_CVT_I32_F32", [(set i32:$dst, (fp_to_sint f32:$src0))] >; defm V_MOV_FED_B32 : VOP1_32 <0x00000009, "V_MOV_FED_B32", []>; -////def V_CVT_F16_F32 : VOP1_F16 <0x0000000a, "V_CVT_F16_F32", []>; -//defm V_CVT_F32_F16 : VOP1_32 <0x0000000b, "V_CVT_F32_F16", []>; +defm V_CVT_F16_F32 : VOP1_32 <0x0000000a, "V_CVT_F16_F32", + [(set i32:$dst, (fp_to_f16 f32:$src0))] +>; +defm V_CVT_F32_F16 : VOP1_32 <0x0000000b, "V_CVT_F32_F16", + [(set f32:$dst, (f16_to_fp i32:$src0))] +>; //defm V_CVT_RPI_I32_F32 : VOP1_32 <0x0000000c, "V_CVT_RPI_I32_F32", []>; //defm V_CVT_FLR_I32_F32 : VOP1_32 <0x0000000d, "V_CVT_FLR_I32_F32", []>; //defm V_CVT_OFF_F32_I4 : VOP1_32 <0x0000000e, "V_CVT_OFF_F32_I4", []>; @@ -1098,7 +1120,7 @@ defm V_FRACT_F32 : VOP1_32 <0x00000020, "V_FRACT_F32", [(set f32:$dst, (AMDGPUfract f32:$src0))] >; defm V_TRUNC_F32 : VOP1_32 <0x00000021, "V_TRUNC_F32", - [(set f32:$dst, (int_AMDGPU_trunc f32:$src0))] + [(set f32:$dst, (ftrunc f32:$src0))] >; defm V_CEIL_F32 : VOP1_32 <0x00000022, "V_CEIL_F32", [(set f32:$dst, (fceil f32:$src0))] @@ -1116,36 +1138,45 @@ defm V_LOG_CLAMP_F32 : VOP1_32 <0x00000026, "V_LOG_CLAMP_F32", []>; defm V_LOG_F32 : VOP1_32 <0x00000027, "V_LOG_F32", [(set f32:$dst, (flog2 f32:$src0))] >; + defm V_RCP_CLAMP_F32 : VOP1_32 <0x00000028, "V_RCP_CLAMP_F32", []>; defm V_RCP_LEGACY_F32 : VOP1_32 <0x00000029, "V_RCP_LEGACY_F32", []>; defm V_RCP_F32 : VOP1_32 <0x0000002a, "V_RCP_F32", - [(set f32:$dst, (fdiv FP_ONE, f32:$src0))] + [(set f32:$dst, (AMDGPUrcp f32:$src0))] >; defm V_RCP_IFLAG_F32 : VOP1_32 <0x0000002b, "V_RCP_IFLAG_F32", []>; -defm V_RSQ_CLAMP_F32 : VOP1_32 <0x0000002c, "V_RSQ_CLAMP_F32", []>; +defm V_RSQ_CLAMP_F32 : VOP1_32 <0x0000002c, "V_RSQ_CLAMP_F32", + [(set f32:$dst, (AMDGPUrsq_clamped f32:$src0))] +>; defm V_RSQ_LEGACY_F32 : VOP1_32 < 0x0000002d, "V_RSQ_LEGACY_F32", - [(set f32:$dst, (int_AMDGPU_rsq f32:$src0))] + [(set f32:$dst, (AMDGPUrsq_legacy f32:$src0))] >; defm V_RSQ_F32 : VOP1_32 <0x0000002e, "V_RSQ_F32", - [(set f32:$dst, (fdiv FP_ONE, (fsqrt f32:$src0)))] + [(set f32:$dst, (AMDGPUrsq f32:$src0))] >; defm V_RCP_F64 : VOP1_64 <0x0000002f, "V_RCP_F64", - [(set f64:$dst, (fdiv FP_ONE, f64:$src0))] + [(set f64:$dst, (AMDGPUrcp f64:$src0))] >; defm V_RCP_CLAMP_F64 : VOP1_64 <0x00000030, "V_RCP_CLAMP_F64", []>; defm V_RSQ_F64 : VOP1_64 <0x00000031, "V_RSQ_F64", - [(set f64:$dst, (fdiv FP_ONE, (fsqrt f64:$src0)))] + [(set f64:$dst, (AMDGPUrsq f64:$src0))] +>; +defm V_RSQ_CLAMP_F64 : VOP1_64 <0x00000032, "V_RSQ_CLAMP_F64", + [(set f64:$dst, (AMDGPUrsq_clamped f64:$src0))] >; -defm V_RSQ_CLAMP_F64 : VOP1_64 <0x00000032, "V_RSQ_CLAMP_F64", []>; defm V_SQRT_F32 : VOP1_32 <0x00000033, "V_SQRT_F32", [(set f32:$dst, (fsqrt f32:$src0))] >; defm V_SQRT_F64 : VOP1_64 <0x00000034, "V_SQRT_F64", [(set f64:$dst, (fsqrt f64:$src0))] >; -defm V_SIN_F32 : VOP1_32 <0x00000035, "V_SIN_F32", []>; -defm V_COS_F32 : VOP1_32 <0x00000036, "V_COS_F32", []>; +defm V_SIN_F32 : VOP1_32 <0x00000035, "V_SIN_F32", + [(set f32:$dst, (AMDGPUsin f32:$src0))] +>; +defm V_COS_F32 : VOP1_32 <0x00000036, "V_COS_F32", + [(set f32:$dst, (AMDGPUcos f32:$src0))] +>; defm V_NOT_B32 : VOP1_32 <0x00000037, "V_NOT_B32", []>; defm V_BFREV_B32 : VOP1_32 <0x00000038, "V_BFREV_B32", []>; defm V_FFBH_U32 : VOP1_32 
<0x00000039, "V_FFBH_U32", []>; @@ -1417,8 +1448,12 @@ defm V_MULLIT_F32 : VOP3_32 <0x00000150, "V_MULLIT_F32", []>; //def V_SAD_U16 : VOP3_U16 <0x0000015c, "V_SAD_U16", []>; defm V_SAD_U32 : VOP3_32 <0x0000015d, "V_SAD_U32", []>; ////def V_CVT_PK_U8_F32 : VOP3_U8 <0x0000015e, "V_CVT_PK_U8_F32", []>; -defm V_DIV_FIXUP_F32 : VOP3_32 <0x0000015f, "V_DIV_FIXUP_F32", []>; -def V_DIV_FIXUP_F64 : VOP3_64 <0x00000160, "V_DIV_FIXUP_F64", []>; +defm V_DIV_FIXUP_F32 : VOP3_32 <0x0000015f, "V_DIV_FIXUP_F32", + [(set f32:$dst, (AMDGPUdiv_fixup f32:$src0, f32:$src1, f32:$src2))] +>; +def V_DIV_FIXUP_F64 : VOP3_64 <0x00000160, "V_DIV_FIXUP_F64", + [(set f64:$dst, (AMDGPUdiv_fixup f64:$src0, f64:$src1, f64:$src2))] +>; def V_LSHL_B64 : VOP3_64_32 <0x00000161, "V_LSHL_B64", [(set i64:$dst, (shl i64:$src0, i32:$src1))] @@ -1450,14 +1485,23 @@ defm V_MUL_HI_I32 : VOP3_32 <0x0000016c, "V_MUL_HI_I32", []>; } // isCommutable = 1 -defm V_DIV_SCALE_F32 : VOP3_32 <0x0000016d, "V_DIV_SCALE_F32", []>; -def V_DIV_SCALE_F64 : VOP3_64 <0x0000016e, "V_DIV_SCALE_F64", []>; -defm V_DIV_FMAS_F32 : VOP3_32 <0x0000016f, "V_DIV_FMAS_F32", []>; -def V_DIV_FMAS_F64 : VOP3_64 <0x00000170, "V_DIV_FMAS_F64", []>; +def V_DIV_SCALE_F32 : VOP3b_32 <0x0000016d, "V_DIV_SCALE_F32", []>; + +// Double precision division pre-scale. +def V_DIV_SCALE_F64 : VOP3b_64 <0x0000016e, "V_DIV_SCALE_F64", []>; + +defm V_DIV_FMAS_F32 : VOP3_32 <0x0000016f, "V_DIV_FMAS_F32", + [(set f32:$dst, (AMDGPUdiv_fmas f32:$src0, f32:$src1, f32:$src2))] +>; +def V_DIV_FMAS_F64 : VOP3_64 <0x00000170, "V_DIV_FMAS_F64", + [(set f64:$dst, (AMDGPUdiv_fmas f64:$src0, f64:$src1, f64:$src2))] +>; //def V_MSAD_U8 : VOP3_U8 <0x00000171, "V_MSAD_U8", []>; //def V_QSAD_U8 : VOP3_U8 <0x00000172, "V_QSAD_U8", []>; //def V_MQSAD_U8 : VOP3_U8 <0x00000173, "V_MQSAD_U8", []>; -def V_TRIG_PREOP_F64 : VOP3_64 <0x00000174, "V_TRIG_PREOP_F64", []>; +def V_TRIG_PREOP_F64 : VOP3_64_32 <0x00000174, "V_TRIG_PREOP_F64", + [(set f64:$dst, (AMDGPUtrig_preop f64:$src0, i32:$src1))] +>; //===----------------------------------------------------------------------===// // Pseudo Instructions @@ -1481,6 +1525,11 @@ def V_OR_I1 : InstSI < [(set i1:$dst, (or i1:$src0, i1:$src1))] >; +def V_XOR_I1 : InstSI < + (outs VReg_1:$dst), (ins VReg_1:$src0, VReg_1:$src1), "", + [(set i1:$dst, (xor i1:$src0, i1:$src1))] +>; + // SI pseudo instructions. These are used by the CFG structurizer pass // and should be lowered to ISA instructions prior to codegen. @@ -1614,7 +1663,13 @@ let usesCustomInserter = 1 in { // constant that can be used with the ADDR64 MUBUF instructions. 
def SI_ADDR64_RSRC : InstSI < (outs SReg_128:$srsrc), - (ins SReg_64:$ptr), + (ins SSrc_64:$ptr), + "", [] +>; + +def SI_BUFFER_RSRC : InstSI < + (outs SReg_128:$srsrc), + (ins SReg_32:$ptr_lo, SReg_32:$ptr_hi, SSrc_32:$data_lo, SSrc_32:$data_hi), "", [] >; @@ -1622,7 +1677,7 @@ def V_SUB_F64 : InstSI < (outs VReg_64:$dst), (ins VReg_64:$src0, VReg_64:$src1), "V_SUB_F64 $dst, $src0, $src1", - [] + [(set f64:$dst, (fsub f64:$src0, f64:$src1))] >; } // end usesCustomInserter @@ -1649,6 +1704,16 @@ defm SI_SPILL_S128 : SI_SPILL_SGPR ; defm SI_SPILL_S256 : SI_SPILL_SGPR ; defm SI_SPILL_S512 : SI_SPILL_SGPR ; +let Defs = [SCC] in { + +def SI_CONSTDATA_PTR : InstSI < + (outs SReg_64:$dst), + (ins), + "", [(set SReg_64:$dst, (i64 SIconstdata_ptr))] +>; + +} // End Defs = [SCC] + } // end IsCodeGenOnly, isPseudo } // end SubtargetPredicate = SI @@ -1679,11 +1744,6 @@ def : Pat < $src0, $src1, $src2, $src3) >; -def : Pat < - (f64 (fsub f64:$src0, f64:$src1)), - (V_SUB_F64 $src0, $src1) ->; - //===----------------------------------------------------------------------===// // SMRD Patterns //===----------------------------------------------------------------------===// @@ -1711,7 +1771,6 @@ multiclass SMRD_Pattern { defm : SMRD_Pattern ; defm : SMRD_Pattern ; -defm : SMRD_Pattern ; defm : SMRD_Pattern ; defm : SMRD_Pattern ; defm : SMRD_Pattern ; @@ -1730,15 +1789,36 @@ def : Pat < (S_BUFFER_LOAD_DWORD_SGPR $sbase, (S_MOV_B32 imm:$offset)) >; +} // Predicates = [isSI] in { + +//===----------------------------------------------------------------------===// +// SOP1 Patterns +//===----------------------------------------------------------------------===// + +let Predicates = [isSI, isCFDepth0] in { + +def : Pat < + (i64 (ctpop i64:$src)), + (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)), + (S_BCNT1_I32_B64 $src), sub0), + (S_MOV_B32 0), sub1) +>; + //===----------------------------------------------------------------------===// // SOP2 Patterns //===----------------------------------------------------------------------===// +// V_ADD_I32_e32/S_ADD_I32 produces carry in VCC/SCC. For the vector +// case, the sgpr-copies pass will fix this to use the vector version. 
def : Pat < - (i1 (xor i1:$src0, i1:$src1)), - (S_XOR_B64 $src0, $src1) + (i32 (addc i32:$src0, i32:$src1)), + (S_ADD_I32 $src0, $src1) >; +} // Predicates = [isSI, isCFDepth0] + +let Predicates = [isSI] in { + //===----------------------------------------------------------------------===// // SOPP Patterns //===----------------------------------------------------------------------===// @@ -1748,19 +1828,32 @@ def : Pat < (S_BARRIER) >; +//===----------------------------------------------------------------------===// +// VOP1 Patterns +//===----------------------------------------------------------------------===// + +let Predicates = [UnsafeFPMath] in { +def : RcpPat; +defm : RsqPat; +defm : RsqPat; +} + //===----------------------------------------------------------------------===// // VOP2 Patterns //===----------------------------------------------------------------------===// -def : Pat < - (or i64:$src0, i64:$src1), +class BinOp64Pat : Pat < + (node i64:$src0, i64:$src1), (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)), - (V_OR_B32_e32 (EXTRACT_SUBREG i64:$src0, sub0), + (inst (EXTRACT_SUBREG i64:$src0, sub0), (EXTRACT_SUBREG i64:$src1, sub0)), sub0), - (V_OR_B32_e32 (EXTRACT_SUBREG i64:$src0, sub1), + (inst (EXTRACT_SUBREG i64:$src0, sub1), (EXTRACT_SUBREG i64:$src1, sub1)), sub1) >; +def : BinOp64Pat ; +def : BinOp64Pat ; + class SextInReg : Pat < (sext_inreg i32:$src0, vt), (V_ASHRREV_I32_e32 ShiftAmt, (V_LSHLREV_B32_e32 ShiftAmt, $src0)) @@ -1769,10 +1862,164 @@ class SextInReg : Pat < def : SextInReg ; def : SextInReg ; +def : Pat < + (i32 (add (i32 (ctpop i32:$popcnt)), i32:$val)), + (V_BCNT_U32_B32_e32 $popcnt, $val) +>; + +def : Pat < + (i32 (ctpop i32:$popcnt)), + (V_BCNT_U32_B32_e64 $popcnt, 0, 0, 0) +>; + +def : Pat < + (i64 (ctpop i64:$src)), + (INSERT_SUBREG + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), + (V_BCNT_U32_B32_e32 (EXTRACT_SUBREG $src, sub1), + (V_BCNT_U32_B32_e64 (EXTRACT_SUBREG $src, sub0), 0, 0, 0)), + sub0), + (V_MOV_B32_e32 0), sub1) +>; + +def : Pat < + (addc i32:$src0, i32:$src1), + (V_ADD_I32_e32 $src0, $src1) +>; + /********** ======================= **********/ /********** Image sampling patterns **********/ /********** ======================= **********/ +// Image + sampler +class SampleRawPattern : Pat < + (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, i32:$dmask, i32:$unorm, + i32:$r128, i32:$da, i32:$glc, i32:$slc, i32:$tfe, i32:$lwe), + (opcode (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $da), + (as_i1imm $r128), (as_i1imm $tfe), (as_i1imm $lwe), (as_i1imm $slc), + $addr, $rsrc, $sampler) +>; + +multiclass SampleRawPatterns { + def : SampleRawPattern(opcode # _V4_V1), i32>; + def : SampleRawPattern(opcode # _V4_V2), v2i32>; + def : SampleRawPattern(opcode # _V4_V4), v4i32>; + def : SampleRawPattern(opcode # _V4_V8), v8i32>; + def : SampleRawPattern(opcode # _V4_V16), v16i32>; +} + +// Image only +class ImagePattern : Pat < + (name vt:$addr, v8i32:$rsrc, i32:$dmask, i32:$unorm, + i32:$r128, i32:$da, i32:$glc, i32:$slc, i32:$tfe, i32:$lwe), + (opcode (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $da), + (as_i1imm $r128), (as_i1imm $tfe), (as_i1imm $lwe), (as_i1imm $slc), + $addr, $rsrc) +>; + +multiclass ImagePatterns { + def : ImagePattern(opcode # _V4_V1), i32>; + def : ImagePattern(opcode # _V4_V2), v2i32>; + def : ImagePattern(opcode # _V4_V4), v4i32>; +} + +// Basic sample +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; 
+defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; + +// Sample with comparison +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; + +// Sample with offsets +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; + +// Sample with comparison and offsets +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; + +// Gather opcodes +// Only the variants which make sense are defined. +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; + +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; + +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; + +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; + +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; + +def : ImagePattern; +defm : ImagePatterns; +defm : ImagePatterns; + /* SIsample for simple 1D texture lookup */ def : Pat < (SIsample i32:$addr, v32i8:$rsrc, v4i32:$sampler, imm), @@ -2127,26 +2374,11 @@ def : Pat < (V_MUL_LEGACY_F32_e32 $src0, (V_RCP_LEGACY_F32_e32 $src1)) >; -def : Pat< - (fdiv f32:$src0, f32:$src1), - (V_MUL_F32_e32 $src0, (V_RCP_F32_e32 $src1)) ->; - def : Pat< (fdiv f64:$src0, f64:$src1), (V_MUL_F64 $src0, (V_RCP_F64_e32 $src1), (i64 0)) >; -def : Pat < - (fcos f32:$src0), - (V_COS_F32_e32 (V_MUL_F32_e32 $src0, (V_MOV_B32_e32 CONST.TWO_PI_INV))) ->; - -def : Pat < - (fsin f32:$src0), - (V_SIN_F32_e32 (V_MUL_F32_e32 $src0, (V_MOV_B32_e32 CONST.TWO_PI_INV))) ->; - def : Pat < (int_AMDGPU_cube v4f32:$src), (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), @@ -2184,7 +2416,7 @@ def : Ext32Pat ; // Offset in an 32Bit VGPR def : Pat < (SIload_constant v4i32:$sbase, i32:$voff), - (BUFFER_LOAD_DWORD_OFFEN $sbase, $voff, 0, 0, 0, 0) + (BUFFER_LOAD_DWORD_OFFEN $sbase, $voff, 0, 0, 0, 0, 0) >; // The multiplication scales from [0,1] to the unsigned integer range @@ -2257,7 +2489,7 @@ defm : DSReadPat ; defm : DSReadPat ; defm : DSReadPat ; defm : DSReadPat ; -defm : DSReadPat ; +defm : DSReadPat ; multiclass DSWritePat { def : Pat < @@ -2274,7 +2506,7 @@ multiclass DSWritePat { defm : DSWritePat ; defm : DSWritePat ; defm : DSWritePat ; -defm : DSWritePat ; +defm : DSWritePat ; multiclass DSAtomicRetPat { def : 
Pat < @@ -2368,99 +2600,53 @@ defm : DSAtomicCmpXChg; //===----------------------------------------------------------------------===// multiclass MUBUFLoad_Pattern { - def : Pat < - (vt (global_ld (mubuf_vaddr_offset i64:$ptr, i64:$offset, IMM12bit:$imm_offset))), - (Instr_ADDR64 (SI_ADDR64_RSRC $ptr), $offset, (as_i16imm $imm_offset)) - >; - - def : Pat < - (vt (global_ld (add i64:$ptr, (i64 IMM12bit:$offset)))), - (Instr_ADDR64 (SI_ADDR64_RSRC (i64 0)), $ptr, (as_i16imm $offset)) - >; - - def : Pat < - (vt (global_ld i64:$ptr)), - (Instr_ADDR64 (SI_ADDR64_RSRC (i64 0)), $ptr, 0) - >; - - def : Pat < - (vt (global_ld (add i64:$ptr, i64:$offset))), - (Instr_ADDR64 (SI_ADDR64_RSRC $ptr), $offset, 0) - >; - + PatFrag constant_ld> { def : Pat < (vt (constant_ld (add i64:$ptr, i64:$offset))), (Instr_ADDR64 (SI_ADDR64_RSRC $ptr), $offset, 0) >; -} - -defm : MUBUFLoad_Pattern ; -defm : MUBUFLoad_Pattern ; -defm : MUBUFLoad_Pattern ; -defm : MUBUFLoad_Pattern ; -defm : MUBUFLoad_Pattern ; -defm : MUBUFLoad_Pattern ; -defm : MUBUFLoad_Pattern ; -defm : MUBUFLoad_Pattern ; -defm : MUBUFLoad_Pattern ; - -multiclass MUBUFStore_Pattern { - - def : Pat < - (st vt:$value, (mubuf_vaddr_offset i64:$ptr, i64:$offset, IMM12bit:$imm_offset)), - (Instr $value, (SI_ADDR64_RSRC $ptr), $offset, (as_i16imm $imm_offset)) - >; - def : Pat < - (st vt:$value, (add i64:$ptr, IMM12bit:$offset)), - (Instr $value, (SI_ADDR64_RSRC (i64 0)), $ptr, (as_i16imm $offset)) - >; +} - def : Pat < - (st vt:$value, i64:$ptr), - (Instr $value, (SI_ADDR64_RSRC (i64 0)), $ptr, 0) - >; +defm : MUBUFLoad_Pattern ; +defm : MUBUFLoad_Pattern ; +defm : MUBUFLoad_Pattern ; +defm : MUBUFLoad_Pattern ; +defm : MUBUFLoad_Pattern ; +defm : MUBUFLoad_Pattern ; +defm : MUBUFLoad_Pattern ; - def : Pat < - (st vt:$value, (add i64:$ptr, i64:$offset)), - (Instr $value, (SI_ADDR64_RSRC $ptr), $offset, 0) - >; -} +class MUBUFScratchLoadPat : Pat < + (vt (ld (MUBUFScratch v4i32:$srsrc, i32:$vaddr, + i32:$soffset, u16imm:$offset))), + (Instr $srsrc, $vaddr, $soffset, $offset, 0, 0, 0) +>; -defm : MUBUFStore_Pattern ; -defm : MUBUFStore_Pattern ; -defm : MUBUFStore_Pattern ; -defm : MUBUFStore_Pattern ; -defm : MUBUFStore_Pattern ; -defm : MUBUFStore_Pattern ; +def : MUBUFScratchLoadPat ; +def : MUBUFScratchLoadPat ; +def : MUBUFScratchLoadPat ; +def : MUBUFScratchLoadPat ; +def : MUBUFScratchLoadPat ; +def : MUBUFScratchLoadPat ; +def : MUBUFScratchLoadPat ; // BUFFER_LOAD_DWORD*, addr64=0 multiclass MUBUF_Load_Dword { def : Pat < - (vt (int_SI_buffer_load_dword v4i32:$rsrc, i32:$vaddr, i32:$soffset, + (vt (int_SI_buffer_load_dword v4i32:$rsrc, (i32 imm), i32:$soffset, imm:$offset, 0, 0, imm:$glc, imm:$slc, imm:$tfe)), - (offset $rsrc, $vaddr, (as_i16imm $offset), $soffset, (as_i1imm $glc), + (offset $rsrc, (as_i16imm $offset), $soffset, (as_i1imm $glc), (as_i1imm $slc), (as_i1imm $tfe)) >; def : Pat < (vt (int_SI_buffer_load_dword v4i32:$rsrc, i32:$vaddr, i32:$soffset, - imm, 1, 0, imm:$glc, imm:$slc, + imm:$offset, 1, 0, imm:$glc, imm:$slc, imm:$tfe)), - (offen $rsrc, $vaddr, $soffset, (as_i1imm $glc), (as_i1imm $slc), + (offen $rsrc, $vaddr, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc), (as_i1imm $tfe)) >; @@ -2488,6 +2674,34 @@ defm : MUBUF_Load_Dword ; +class MUBUFScratchStorePat : Pat < + (st vt:$value, (MUBUFAddr32 v4i32:$srsrc, i32:$vaddr, i32:$soffset, + u16imm:$offset, i1imm:$offen, i1imm:$idxen, + i1imm:$glc, i1imm:$slc, i1imm:$tfe)), + (Instr $value, $srsrc, $vaddr, $soffset, $offset, $offen, $idxen, + $glc, $slc, $tfe) +>; 
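MUBUFScratchStorePat above simply forwards the pieces matched by the MUBUFAddr32 complex pattern straight into the MUBUF operand list; it is instantiated for each store width immediately below. As a first-order model of what those operands mean at execution time, the effective address combines the resource-descriptor base, the scalar offset, the per-lane VGPR offset, and the 12-bit immediate offset. A hedged C++ sketch of that sum, deliberately ignoring the scratch swizzling and striding rules (see the SI ISA document for the exact semantics); all names here are illustrative:

#include <cstdint>

// First-order model of a MUBUF effective address: base address taken from
// the v4i32 resource descriptor ($srsrc), plus the scalar offset ($soffset),
// plus the per-lane VGPR offset ($vaddr), plus the 12-bit immediate
// ($offset). Scratch swizzling is omitted.
static uint64_t mubufEffectiveAddress(uint64_t RsrcBase, uint32_t SOffset,
                                      uint32_t VAddr, uint16_t ImmOffset) {
  return RsrcBase + SOffset + VAddr + (ImmOffset & 0xfffu);
}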
+ +def : MUBUFScratchStorePat ; +def : MUBUFScratchStorePat ; +def : MUBUFScratchStorePat ; +def : MUBUFScratchStorePat ; +def : MUBUFScratchStorePat ; + +/* +class MUBUFStore_Pattern : Pat < + (st vt:$value, (MUBUFScratch v4i32:$srsrc, i64:$vaddr, u16imm:$offset)), + (Instr $value, $srsrc, $vaddr, $offset) +>; + +def : MUBUFStore_Pattern ; +def : MUBUFStore_Pattern ; +def : MUBUFStore_Pattern ; +def : MUBUFStore_Pattern ; +def : MUBUFStore_Pattern ; + +*/ + //===----------------------------------------------------------------------===// // MTBUF Patterns //===----------------------------------------------------------------------===// @@ -2703,25 +2917,6 @@ def : Pat < (V_CMP_EQ_I32_e64 (V_AND_B32_e32 (i32 1), $a), 1) >; -// V_ADD_I32_e32/S_ADD_I32 produces carry in VCC/SCC. For the vector -// case, the sgpr-copies pass will fix this to use the vector version. -def : Pat < - (i32 (addc i32:$src0, i32:$src1)), - (S_ADD_I32 $src0, $src1) ->; - -def : Pat < - (i32 (add (i32 (ctpop i32:$popcnt)), i32:$val)), - (V_BCNT_U32_B32_e32 $popcnt, $val) ->; - -def : Pat < - (i64 (ctpop i64:$src)), - (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)), - (S_BCNT1_I32_B64 $src), sub0), - (S_MOV_B32 0), sub1) ->; - //============================================================================// // Miscellaneous Optimization Patterns //============================================================================// diff --git a/lib/Target/R600/SIIntrinsics.td b/lib/Target/R600/SIIntrinsics.td index 00e32c03a99e..027a0a2f5167 100644 --- a/lib/Target/R600/SIIntrinsics.td +++ b/lib/Target/R600/SIIntrinsics.td @@ -54,15 +54,132 @@ let TargetPrefix = "SI", isTarget = 1 in { def int_SI_sendmsg : Intrinsic <[], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + // Fully-flexible SAMPLE instruction. + class SampleRaw : Intrinsic < + [llvm_v4f32_ty], // vdata(VGPR) + [llvm_anyint_ty, // vaddr(VGPR) + llvm_v8i32_ty, // rsrc(SGPR) + llvm_v4i32_ty, // sampler(SGPR) + llvm_i32_ty, // dmask(imm) + llvm_i32_ty, // unorm(imm) + llvm_i32_ty, // r128(imm) + llvm_i32_ty, // da(imm) + llvm_i32_ty, // glc(imm) + llvm_i32_ty, // slc(imm) + llvm_i32_ty, // tfe(imm) + llvm_i32_ty], // lwe(imm) + [IntrNoMem]>; + + // Image instruction without a sampler. 
+ class Image : Intrinsic <
+ [llvm_v4f32_ty], // vdata(VGPR)
+ [llvm_anyint_ty, // vaddr(VGPR)
+ llvm_v8i32_ty, // rsrc(SGPR)
+ llvm_i32_ty, // dmask(imm)
+ llvm_i32_ty, // unorm(imm)
+ llvm_i32_ty, // r128(imm)
+ llvm_i32_ty, // da(imm)
+ llvm_i32_ty, // glc(imm)
+ llvm_i32_ty, // slc(imm)
+ llvm_i32_ty, // tfe(imm)
+ llvm_i32_ty], // lwe(imm)
+ [IntrNoMem]>;
+
+ // Basic sample
+ def int_SI_image_sample : SampleRaw;
+ def int_SI_image_sample_cl : SampleRaw;
+ def int_SI_image_sample_d : SampleRaw;
+ def int_SI_image_sample_d_cl : SampleRaw;
+ def int_SI_image_sample_l : SampleRaw;
+ def int_SI_image_sample_b : SampleRaw;
+ def int_SI_image_sample_b_cl : SampleRaw;
+ def int_SI_image_sample_lz : SampleRaw;
+ def int_SI_image_sample_cd : SampleRaw;
+ def int_SI_image_sample_cd_cl : SampleRaw;
+
+ // Sample with comparison
+ def int_SI_image_sample_c : SampleRaw;
+ def int_SI_image_sample_c_cl : SampleRaw;
+ def int_SI_image_sample_c_d : SampleRaw;
+ def int_SI_image_sample_c_d_cl : SampleRaw;
+ def int_SI_image_sample_c_l : SampleRaw;
+ def int_SI_image_sample_c_b : SampleRaw;
+ def int_SI_image_sample_c_b_cl : SampleRaw;
+ def int_SI_image_sample_c_lz : SampleRaw;
+ def int_SI_image_sample_c_cd : SampleRaw;
+ def int_SI_image_sample_c_cd_cl : SampleRaw;
+
+ // Sample with offsets
+ def int_SI_image_sample_o : SampleRaw;
+ def int_SI_image_sample_cl_o : SampleRaw;
+ def int_SI_image_sample_d_o : SampleRaw;
+ def int_SI_image_sample_d_cl_o : SampleRaw;
+ def int_SI_image_sample_l_o : SampleRaw;
+ def int_SI_image_sample_b_o : SampleRaw;
+ def int_SI_image_sample_b_cl_o : SampleRaw;
+ def int_SI_image_sample_lz_o : SampleRaw;
+ def int_SI_image_sample_cd_o : SampleRaw;
+ def int_SI_image_sample_cd_cl_o : SampleRaw;
+
+ // Sample with comparison and offsets
+ def int_SI_image_sample_c_o : SampleRaw;
+ def int_SI_image_sample_c_cl_o : SampleRaw;
+ def int_SI_image_sample_c_d_o : SampleRaw;
+ def int_SI_image_sample_c_d_cl_o : SampleRaw;
+ def int_SI_image_sample_c_l_o : SampleRaw;
+ def int_SI_image_sample_c_b_o : SampleRaw;
+ def int_SI_image_sample_c_b_cl_o : SampleRaw;
+ def int_SI_image_sample_c_lz_o : SampleRaw;
+ def int_SI_image_sample_c_cd_o : SampleRaw;
+ def int_SI_image_sample_c_cd_cl_o : SampleRaw;
+
+ // Basic gather4
+ def int_SI_gather4 : SampleRaw;
+ def int_SI_gather4_cl : SampleRaw;
+ def int_SI_gather4_l : SampleRaw;
+ def int_SI_gather4_b : SampleRaw;
+ def int_SI_gather4_b_cl : SampleRaw;
+ def int_SI_gather4_lz : SampleRaw;
+
+ // Gather4 with comparison
+ def int_SI_gather4_c : SampleRaw;
+ def int_SI_gather4_c_cl : SampleRaw;
+ def int_SI_gather4_c_l : SampleRaw;
+ def int_SI_gather4_c_b : SampleRaw;
+ def int_SI_gather4_c_b_cl : SampleRaw;
+ def int_SI_gather4_c_lz : SampleRaw;
+
+ // Gather4 with offsets
+ def int_SI_gather4_o : SampleRaw;
+ def int_SI_gather4_cl_o : SampleRaw;
+ def int_SI_gather4_l_o : SampleRaw;
+ def int_SI_gather4_b_o : SampleRaw;
+ def int_SI_gather4_b_cl_o : SampleRaw;
+ def int_SI_gather4_lz_o : SampleRaw;
+
+ // Gather4 with comparison and offsets
+ def int_SI_gather4_c_o : SampleRaw;
+ def int_SI_gather4_c_cl_o : SampleRaw;
+ def int_SI_gather4_c_l_o : SampleRaw;
+ def int_SI_gather4_c_b_o : SampleRaw;
+ def int_SI_gather4_c_b_cl_o : SampleRaw;
+ def int_SI_gather4_c_lz_o : SampleRaw;
+
+ def int_SI_getlod : SampleRaw;
+
+ // Image intrinsics.
+ def int_SI_image_load : Image;
+ def int_SI_image_load_mip : Image;
+ def int_SI_getresinfo : Image;
+
+ // Deprecated image and sample intrinsics.
class Sample : Intrinsic <[llvm_v4f32_ty], [llvm_anyvector_ty, llvm_v32i8_ty, llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>; def int_SI_sample : Sample; def int_SI_sampleb : Sample; def int_SI_sampled : Sample; def int_SI_samplel : Sample; - def int_SI_imageload : Intrinsic <[llvm_v4i32_ty], [llvm_anyvector_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>; - def int_SI_resinfo : Intrinsic <[llvm_v4i32_ty], [llvm_i32_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>; /* Interpolation Intrinsics */ diff --git a/lib/Target/R600/SILowerControlFlow.cpp b/lib/Target/R600/SILowerControlFlow.cpp index 9f5ff29ad93a..75b5a5e027ff 100644 --- a/lib/Target/R600/SILowerControlFlow.cpp +++ b/lib/Target/R600/SILowerControlFlow.cpp @@ -147,7 +147,7 @@ void SILowerControlFlowPass::SkipIfDead(MachineInstr &MI) { MachineBasicBlock &MBB = *MI.getParent(); DebugLoc DL = MI.getDebugLoc(); - if (MBB.getParent()->getInfo()->ShaderType != + if (MBB.getParent()->getInfo()->getShaderType() != ShaderType::PIXEL || !shouldSkip(&MBB, &MBB.getParent()->back())) return; @@ -298,11 +298,13 @@ void SILowerControlFlowPass::Kill(MachineInstr &MI) { DebugLoc DL = MI.getDebugLoc(); const MachineOperand &Op = MI.getOperand(0); - // Kill is only allowed in pixel / geometry shaders - assert(MBB.getParent()->getInfo()->ShaderType == - ShaderType::PIXEL || - MBB.getParent()->getInfo()->ShaderType == - ShaderType::GEOMETRY); +#ifndef NDEBUG + const SIMachineFunctionInfo *MFI + = MBB.getParent()->getInfo(); + // Kill is only allowed in pixel / geometry shaders. + assert(MFI->getShaderType() == ShaderType::PIXEL || + MFI->getShaderType() == ShaderType::GEOMETRY); +#endif // Clear this thread from the exec mask if the operand is negative if ((Op.isImm() || Op.isFPImm())) { @@ -540,7 +542,7 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) { InitM0ForLDS(MBB.getFirstNonPHI()); } - if (NeedWQM && MFI->ShaderType == ShaderType::PIXEL) { + if (NeedWQM && MFI->getShaderType() == ShaderType::PIXEL) { MachineBasicBlock &MBB = MF.front(); BuildMI(MBB, MBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WQM_B64), AMDGPU::EXEC).addReg(AMDGPU::EXEC); diff --git a/lib/Target/R600/SILowerI1Copies.cpp b/lib/Target/R600/SILowerI1Copies.cpp index 738c90b30e54..db19235995be 100644 --- a/lib/Target/R600/SILowerI1Copies.cpp +++ b/lib/Target/R600/SILowerI1Copies.cpp @@ -102,6 +102,12 @@ bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) { continue; } + if (MI.getOpcode() == AMDGPU::V_XOR_I1) { + I1Defs.push_back(MI.getOperand(0).getReg()); + MI.setDesc(TII->get(AMDGPU::V_XOR_B32_e32)); + continue; + } + if (MI.getOpcode() != AMDGPU::COPY || !TargetRegisterInfo::isVirtualRegister(MI.getOperand(0).getReg()) || !TargetRegisterInfo::isVirtualRegister(MI.getOperand(1).getReg())) diff --git a/lib/Target/R600/SIMachineFunctionInfo.cpp b/lib/Target/R600/SIMachineFunctionInfo.cpp index af609958129c..c53a7e10d548 100644 --- a/lib/Target/R600/SIMachineFunctionInfo.cpp +++ b/lib/Target/R600/SIMachineFunctionInfo.cpp @@ -27,7 +27,8 @@ void SIMachineFunctionInfo::anchor() {} SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) : AMDGPUMachineFunction(MF), PSInputAddr(0), - SpillTracker() { } + SpillTracker(), + NumUserSGPRs(0) { } static unsigned createLaneVGPR(MachineRegisterInfo &MRI, MachineFunction *MF) { unsigned VGPR = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass); @@ -62,8 +63,10 @@ static unsigned createLaneVGPR(MachineRegisterInfo &MRI, MachineFunction *MF) { return VGPR; } } - 
MF->getFunction()->getContext().emitError( - "Could not found S_ENGPGM instrtuction."); + + LLVMContext &Ctx = MF->getFunction()->getContext(); + Ctx.emitError("Could not find S_ENDPGM instruction."); + return VGPR; } diff --git a/lib/Target/R600/SIMachineFunctionInfo.h b/lib/Target/R600/SIMachineFunctionInfo.h index 96e619bde8d6..9684d285cec2 100644 --- a/lib/Target/R600/SIMachineFunctionInfo.h +++ b/lib/Target/R600/SIMachineFunctionInfo.h @@ -59,6 +59,7 @@ class SIMachineFunctionInfo : public AMDGPUMachineFunction { SIMachineFunctionInfo(const MachineFunction &MF); unsigned PSInputAddr; struct RegSpillTracker SpillTracker; + unsigned NumUserSGPRs; }; } // End namespace llvm diff --git a/lib/Target/R600/SIRegisterInfo.cpp b/lib/Target/R600/SIRegisterInfo.cpp index 519a7ba63b26..2a9a2ac5dd61 100644 --- a/lib/Target/R600/SIRegisterInfo.cpp +++ b/lib/Target/R600/SIRegisterInfo.cpp @@ -16,6 +16,10 @@ #include "SIRegisterInfo.h" #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" +#include "SIMachineFunctionInfo.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/RegisterScavenging.h" using namespace llvm; @@ -27,8 +31,6 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); Reserved.set(AMDGPU::EXEC); Reserved.set(AMDGPU::INDIRECT_BASE_ADDR); - const SIInstrInfo *TII = static_cast(ST.getInstrInfo()); - TII->reserveIndirectRegisters(Reserved, MF); return Reserved; } @@ -37,6 +39,30 @@ unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, return RC->getNumRegs(); } +bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const { + return Fn.getFrameInfo()->hasStackObjects(); +} + +void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, + int SPAdj, unsigned FIOperandNum, + RegScavenger *RS) const { + MachineFunction *MF = MI->getParent()->getParent(); + MachineFrameInfo *FrameInfo = MF->getFrameInfo(); + const SIInstrInfo *TII = static_cast(ST.getInstrInfo()); + MachineOperand &FIOp = MI->getOperand(FIOperandNum); + int Index = MI->getOperand(FIOperandNum).getIndex(); + int64_t Offset = FrameInfo->getObjectOffset(Index); + + FIOp.ChangeToImmediate(Offset); + if (!TII->isImmOperandLegal(MI, FIOperandNum, FIOp)) { + unsigned TmpReg = RS->scavengeRegister(&AMDGPU::VReg_32RegClass, MI, SPAdj); + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), + TII->get(AMDGPU::V_MOV_B32_e32), TmpReg) + .addImm(Offset); + FIOp.ChangeToRegister(TmpReg, false); + } +} + const TargetRegisterClass * SIRegisterInfo::getCFGStructurizerRegClass( MVT VT) const { switch(VT.SimpleTy) { @@ -125,3 +151,38 @@ unsigned SIRegisterInfo::getPhysRegSubReg(unsigned Reg, unsigned Index = getHWRegIndex(Reg); return SubRC->getRegister(Index + Channel); } + +bool SIRegisterInfo::regClassCanUseImmediate(int RCID) const { + switch (RCID) { + default: return false; + case AMDGPU::SSrc_32RegClassID: + case AMDGPU::SSrc_64RegClassID: + case AMDGPU::VSrc_32RegClassID: + case AMDGPU::VSrc_64RegClassID: + return true; + } +} + +bool SIRegisterInfo::regClassCanUseImmediate( + const TargetRegisterClass *RC) const { + return regClassCanUseImmediate(RC->getID()); +} + +unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF, + enum PreloadedValue Value) const { + + const SIMachineFunctionInfo *MFI = MF.getInfo(); + switch (Value) { + case SIRegisterInfo::TGID_X: + return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 0); + case 
SIRegisterInfo::TGID_Y: + return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 1); + case SIRegisterInfo::TGID_Z: + return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 2); + case SIRegisterInfo::SCRATCH_WAVE_OFFSET: + return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 4); + case SIRegisterInfo::SCRATCH_PTR: + return AMDGPU::SGPR2_SGPR3; + } + llvm_unreachable("unexpected preloaded value type"); +} diff --git a/lib/Target/R600/SIRegisterInfo.h b/lib/Target/R600/SIRegisterInfo.h index 6bcf2f015f02..5d0235c0f427 100644 --- a/lib/Target/R600/SIRegisterInfo.h +++ b/lib/Target/R600/SIRegisterInfo.h @@ -29,6 +29,12 @@ struct SIRegisterInfo : public AMDGPURegisterInfo { unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override; + bool requiresRegisterScavenging(const MachineFunction &Fn) const override; + + void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, + unsigned FIOperandNum, + RegScavenger *RS) const override; + /// \brief get the register class of the specified type to use in the /// CFGStructurizer const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const override; @@ -60,6 +66,27 @@ struct SIRegisterInfo : public AMDGPURegisterInfo { /// \returns The sub-register of Reg that is in Channel. unsigned getPhysRegSubReg(unsigned Reg, const TargetRegisterClass *SubRC, unsigned Channel) const; + + /// \returns True if operands defined with this register class can accept + /// inline immediates. + bool regClassCanUseImmediate(int RCID) const; + + /// \returns True if operands defined with this register class can accept + /// inline immediates. + bool regClassCanUseImmediate(const TargetRegisterClass *RC) const; + + enum PreloadedValue { + TGID_X, + TGID_Y, + TGID_Z, + SCRATCH_WAVE_OFFSET, + SCRATCH_PTR + }; + + /// \brief Returns the physical register that \p Value is stored in. + unsigned getPreloadedValue(const MachineFunction &MF, + enum PreloadedValue Value) const; + }; } // End namespace llvm diff --git a/lib/Target/R600/SIRegisterInfo.td b/lib/Target/R600/SIRegisterInfo.td index f1f01deaf361..8974b6300625 100644 --- a/lib/Target/R600/SIRegisterInfo.td +++ b/lib/Target/R600/SIRegisterInfo.td @@ -168,7 +168,7 @@ def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, i1], 64, (add SGPR_64Regs, VCCReg, EXECReg) >; -def SReg_128 : RegisterClass<"AMDGPU", [v4i32], 128, (add SGPR_128)>; +def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8], 128, (add SGPR_128)>; def SReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 256, (add SGPR_256)>; diff --git a/lib/Target/R600/SIShrinkInstructions.cpp b/lib/Target/R600/SIShrinkInstructions.cpp new file mode 100644 index 000000000000..362a5c1e4e07 --- /dev/null +++ b/lib/Target/R600/SIShrinkInstructions.cpp @@ -0,0 +1,189 @@ +//===-- SIShrinkInstructions.cpp - Shrink Instructions --------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +/// The pass tries to use the 32-bit encoding for instructions when possible. 
+//===----------------------------------------------------------------------===//
+//
+
+#include "AMDGPU.h"
+#include "SIInstrInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetMachine.h"
+
+#define DEBUG_TYPE "si-shrink-instructions"
+
+STATISTIC(NumInstructionsShrunk,
+ "Number of 64-bit instructions reduced to 32-bit.");
+
+namespace llvm {
+ void initializeSIShrinkInstructionsPass(PassRegistry&);
+}
+
+using namespace llvm;
+
+namespace {
+
+class SIShrinkInstructions : public MachineFunctionPass {
+public:
+ static char ID;
+
+public:
+ SIShrinkInstructions() : MachineFunctionPass(ID) {
+ }
+
+ virtual bool runOnMachineFunction(MachineFunction &MF) override;
+
+ virtual const char *getPassName() const override {
+ return "SI Shrink Instructions";
+ }
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS_BEGIN(SIShrinkInstructions, DEBUG_TYPE,
+ "SI Shrink Instructions", false, false)
+INITIALIZE_PASS_END(SIShrinkInstructions, DEBUG_TYPE,
+ "SI Shrink Instructions", false, false)
+
+char SIShrinkInstructions::ID = 0;
+
+FunctionPass *llvm::createSIShrinkInstructionsPass() {
+ return new SIShrinkInstructions();
+}
+
+static bool isVGPR(const MachineOperand *MO, const SIRegisterInfo &TRI,
+ const MachineRegisterInfo &MRI) {
+ if (!MO->isReg())
+ return false;
+
+ if (TargetRegisterInfo::isVirtualRegister(MO->getReg()))
+ return TRI.hasVGPRs(MRI.getRegClass(MO->getReg()));
+
+ return TRI.hasVGPRs(TRI.getPhysRegClass(MO->getReg()));
+}
+
+static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII,
+ const SIRegisterInfo &TRI,
+ const MachineRegisterInfo &MRI) {
+
+ const MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
+ // Can't shrink instructions with three operands.
+ if (Src2)
+ return false;
+
+ const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+ const MachineOperand *Src1Mod =
+ TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
+
+ if (Src1 && (!isVGPR(Src1, TRI, MRI) || Src1Mod->getImm() != 0))
+ return false;
+
+ // We don't need to check src0, all input types are legal, so just make
+ // sure src0 isn't using any modifiers.
+ const MachineOperand *Src0Mod =
+ TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
+ if (Src0Mod && Src0Mod->getImm() != 0)
+ return false;
+
+ // Check output modifiers
+ const MachineOperand *Omod = TII->getNamedOperand(MI, AMDGPU::OpName::omod);
+ if (Omod && Omod->getImm() != 0)
+ return false;
+
+ const MachineOperand *Clamp = TII->getNamedOperand(MI, AMDGPU::OpName::clamp);
+ return !Clamp || Clamp->getImm() == 0;
+}
+
+bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(
+ MF.getTarget().getInstrInfo());
+ const SIRegisterInfo &TRI = TII->getRegisterInfo();
+ std::vector<unsigned> I1Defs;
+
+ for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
+ BI != BE; ++BI) {
+
+ MachineBasicBlock &MBB = *BI;
+ MachineBasicBlock::iterator I, Next;
+ for (I = MBB.begin(); I != MBB.end(); I = Next) {
+ Next = std::next(I);
+ MachineInstr &MI = *I;
+
+ int Op32 = AMDGPU::getVOPe32(MI.getOpcode());
+
+ if (Op32 == -1)
+ continue;
+
+ if (!canShrink(MI, TII, TRI, MRI)) {
+ // Try commuting the instruction and see if that enables us to shrink
+ // it.
+ if (!MI.isCommutable() || !TII->commuteInstruction(&MI) ||
+ !canShrink(MI, TII, TRI, MRI))
+ continue;
+ }
+
+ if (TII->isVOPC(Op32)) {
+ unsigned DstReg = MI.getOperand(0).getReg();
+ if (TargetRegisterInfo::isVirtualRegister(DstReg)) {
+ // VOPC instructions can only write to the VCC register. We can't
+ // force them to use VCC here, because the register allocator
+ // has trouble with sequences like this, which cause the allocator
+ // to run out of registers if vreg0 and vreg1 belong to the VCCReg
+ // register class:
+ // vreg0 = VOPC;
+ // vreg1 = VOPC;
+ // S_AND_B64 vreg0, vreg1
+ //
+ // So, instead of forcing the instruction to write to VCC, we provide a
+ // hint to the register allocator to use VCC and then we
+ // will run this pass again after RA and shrink it if it outputs to
+ // VCC.
+ MRI.setRegAllocationHint(MI.getOperand(0).getReg(), 0, AMDGPU::VCC); + continue; + } + if (DstReg != AMDGPU::VCC) + continue; + } + + // We can shrink this instruction + DEBUG(dbgs() << "Shrinking "; MI.dump(); dbgs() << "\n";); + + MachineInstrBuilder MIB = + BuildMI(MBB, I, MI.getDebugLoc(), TII->get(Op32)); + + // dst + MIB.addOperand(MI.getOperand(0)); + + MIB.addOperand(*TII->getNamedOperand(MI, AMDGPU::OpName::src0)); + + const MachineOperand *Src1 = + TII->getNamedOperand(MI, AMDGPU::OpName::src1); + if (Src1) + MIB.addOperand(*Src1); + + for (const MachineOperand &MO : MI.implicit_operands()) + MIB.addOperand(MO); + + DEBUG(dbgs() << "e32 MI = "; MI.dump(); dbgs() << "\n";); + ++NumInstructionsShrunk; + MI.eraseFromParent(); + } + } + return false; +} diff --git a/lib/Target/R600/SITypeRewriter.cpp b/lib/Target/R600/SITypeRewriter.cpp index a0b6907dd7d4..367963aebb00 100644 --- a/lib/Target/R600/SITypeRewriter.cpp +++ b/lib/Target/R600/SITypeRewriter.cpp @@ -119,8 +119,7 @@ void SITypeRewriter::visitCallInst(CallInst &I) { Type::getInt32Ty(I.getContext())){ Type *ElementTy = Arg->getType()->getVectorElementType(); std::string TypeName = "i32"; - InsertElementInst *Def = dyn_cast(Arg); - assert(Def); + InsertElementInst *Def = cast(Arg); Args.push_back(Def->getOperand(1)); Types.push_back(ElementTy); std::string VecTypeName = "v1" + TypeName; diff --git a/lib/Target/Sparc/InstPrinter/SparcInstPrinter.cpp b/lib/Target/Sparc/InstPrinter/SparcInstPrinter.cpp index 261fb3838d37..5975a517994a 100644 --- a/lib/Target/Sparc/InstPrinter/SparcInstPrinter.cpp +++ b/lib/Target/Sparc/InstPrinter/SparcInstPrinter.cpp @@ -173,6 +173,6 @@ void SparcInstPrinter::printCCOperand(const MCInst *MI, int opNum, bool SparcInstPrinter::printGetPCX(const MCInst *MI, unsigned opNum, raw_ostream &O) { - assert(0 && "FIXME: Implement SparcInstPrinter::printGetPCX."); + llvm_unreachable("FIXME: Implement SparcInstPrinter::printGetPCX."); return true; } diff --git a/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp b/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp index 0fbac218cb6b..dcd81e3d6249 100644 --- a/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp +++ b/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp @@ -196,7 +196,7 @@ namespace { const MCRelaxableFragment *DF, const MCAsmLayout &Layout) const override { // FIXME. 
- assert(0 && "fixupNeedsRelaxation() unimplemented"); + llvm_unreachable("fixupNeedsRelaxation() unimplemented"); return false; } void relaxInstruction(const MCInst &Inst, MCInst &Res) const override { diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp b/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp index 6875fc653541..df66ca9006b8 100644 --- a/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp +++ b/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp @@ -43,7 +43,8 @@ SparcELFMCAsmInfo::SparcELFMCAsmInfo(StringRef TT) { SunStyleELFSectionSwitchSyntax = true; UsesELFSectionDirectiveForBSS = true; - if (TheTriple.getOS() == llvm::Triple::Solaris) + if (TheTriple.getOS() == llvm::Triple::Solaris || + TheTriple.getOS() == llvm::Triple::OpenBSD) UseIntegratedAssembler = true; } diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp b/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp index b19ad7b45ca6..eea9626c17b0 100644 --- a/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp +++ b/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp @@ -133,7 +133,7 @@ getMachineOpValue(const MCInst &MI, const MCOperand &MO, if (Expr->EvaluateAsAbsolute(Res)) return Res; - assert(0 && "Unhandled expression!"); + llvm_unreachable("Unhandled expression!"); return 0; } diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp b/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp index 3ccdd038fb33..7f01ab06879f 100644 --- a/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp +++ b/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp @@ -16,6 +16,7 @@ #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCELF.h" +#include "llvm/MC/MCObjectStreamer.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Object/ELF.h" @@ -219,35 +220,6 @@ void SparcMCExpr::fixELFSymbolsInTLSFixups(MCAssembler &Asm) const { fixELFSymbolsInTLSFixupsImpl(getSubExpr(), Asm); } -// FIXME: This basically copies MCObjectStreamer::AddValueSymbols. Perhaps -// that method should be made public? -// FIXME: really do above: now that at least three other backends are using it. 
-static void AddValueSymbolsImpl(const MCExpr *Value, MCAssembler *Asm) { - switch (Value->getKind()) { - case MCExpr::Target: - llvm_unreachable("Can't handle nested target expr!"); - break; - - case MCExpr::Constant: - break; - - case MCExpr::Binary: { - const MCBinaryExpr *BE = cast(Value); - AddValueSymbolsImpl(BE->getLHS(), Asm); - AddValueSymbolsImpl(BE->getRHS(), Asm); - break; - } - - case MCExpr::SymbolRef: - Asm->getOrCreateSymbolData(cast(Value)->getSymbol()); - break; - - case MCExpr::Unary: - AddValueSymbolsImpl(cast(Value)->getSubExpr(), Asm); - break; - } -} - -void SparcMCExpr::AddValueSymbols(MCAssembler *Asm) const { - AddValueSymbolsImpl(getSubExpr(), Asm); +void SparcMCExpr::visitUsedExpr(MCStreamer &Streamer) const { + Streamer.visitUsedExpr(*getSubExpr()); } diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h b/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h index 78dd945e2277..f0d0ef363ad8 100644 --- a/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h +++ b/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h @@ -88,7 +88,7 @@ class SparcMCExpr : public MCTargetExpr { void PrintImpl(raw_ostream &OS) const override; bool EvaluateAsRelocatableImpl(MCValue &Res, const MCAsmLayout *Layout) const override; - void AddValueSymbols(MCAssembler *) const override; + void visitUsedExpr(MCStreamer &Streamer) const override; const MCSection *FindAssociatedSection() const override { return getSubExpr()->FindAssociatedSection(); } diff --git a/lib/Target/Sparc/SparcFrameLowering.cpp b/lib/Target/Sparc/SparcFrameLowering.cpp index a37da94df1a9..3cdfda3e059a 100644 --- a/lib/Target/Sparc/SparcFrameLowering.cpp +++ b/lib/Target/Sparc/SparcFrameLowering.cpp @@ -14,6 +14,7 @@ #include "SparcFrameLowering.h" #include "SparcInstrInfo.h" #include "SparcMachineFunctionInfo.h" +#include "SparcSubtarget.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -32,6 +33,9 @@ DisableLeafProc("disable-sparc-leaf-proc", cl::desc("Disable Sparc leaf procedure optimization."), cl::Hidden); +SparcFrameLowering::SparcFrameLowering(const SparcSubtarget &ST) + : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, + ST.is64Bit() ? 16 : 8, 0, ST.is64Bit() ? 
16 : 8) {} void SparcFrameLowering::emitSPAdjustment(MachineFunction &MF, MachineBasicBlock &MBB, @@ -99,7 +103,9 @@ void SparcFrameLowering::emitPrologue(MachineFunction &MF) const { SAVEri = SP::ADDri; SAVErr = SP::ADDrr; } - NumBytes = - SubTarget.getAdjustedFrameSize(NumBytes); + NumBytes = + -MF.getTarget().getSubtarget().getAdjustedFrameSize( + NumBytes); emitSPAdjustment(MF, MBB, MBBI, NumBytes, SAVErr, SAVEri); MachineModuleInfo &MMI = MF.getMMI(); @@ -162,7 +168,8 @@ void SparcFrameLowering::emitEpilogue(MachineFunction &MF, if (NumBytes == 0) return; - NumBytes = SubTarget.getAdjustedFrameSize(NumBytes); + NumBytes = MF.getTarget().getSubtarget().getAdjustedFrameSize( + NumBytes); emitSPAdjustment(MF, MBB, MBBI, NumBytes, SP::ADDrr, SP::ADDri); } diff --git a/lib/Target/Sparc/SparcFrameLowering.h b/lib/Target/Sparc/SparcFrameLowering.h index bda7b7cd185e..a7d1b8902dcd 100644 --- a/lib/Target/Sparc/SparcFrameLowering.h +++ b/lib/Target/Sparc/SparcFrameLowering.h @@ -15,19 +15,14 @@ #define SPARC_FRAMEINFO_H #include "Sparc.h" -#include "SparcSubtarget.h" #include "llvm/Target/TargetFrameLowering.h" namespace llvm { - class SparcSubtarget; +class SparcSubtarget; class SparcFrameLowering : public TargetFrameLowering { - const SparcSubtarget &SubTarget; public: - explicit SparcFrameLowering(const SparcSubtarget &ST) - : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, - ST.is64Bit() ? 16 : 8, 0, ST.is64Bit() ? 16 : 8), - SubTarget(ST) {} + explicit SparcFrameLowering(const SparcSubtarget &ST); /// emitProlog/emitEpilog - These methods insert prolog and epilog code into /// the function. diff --git a/lib/Target/Sparc/SparcISelLowering.cpp b/lib/Target/Sparc/SparcISelLowering.cpp index ef614667ee6d..990f52a97275 100644 --- a/lib/Target/Sparc/SparcISelLowering.cpp +++ b/lib/Target/Sparc/SparcISelLowering.cpp @@ -2030,7 +2030,7 @@ SparcTargetLowering::LowerF128Op(SDValue Op, SelectionDAG &DAG, } TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(SDLoc(Op)).setChain(Chain) - .setCallee(CallingConv::C, RetTyABI, Callee, &Args, 0); + .setCallee(CallingConv::C, RetTyABI, Callee, std::move(Args), 0); std::pair CallInfo = LowerCallTo(CLI); @@ -2086,7 +2086,7 @@ SparcTargetLowering::LowerF128Compare(SDValue LHS, SDValue RHS, TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(DL).setChain(Chain) - .setCallee(CallingConv::C, RetTy, Callee, &Args, 0); + .setCallee(CallingConv::C, RetTy, Callee, std::move(Args), 0); std::pair CallInfo = LowerCallTo(CLI); diff --git a/lib/Target/Sparc/SparcSelectionDAGInfo.cpp b/lib/Target/Sparc/SparcSelectionDAGInfo.cpp index c2b897c6081a..a308fc5e739e 100644 --- a/lib/Target/Sparc/SparcSelectionDAGInfo.cpp +++ b/lib/Target/Sparc/SparcSelectionDAGInfo.cpp @@ -11,13 +11,13 @@ // //===----------------------------------------------------------------------===// -#include "SparcTargetMachine.h" +#include "SparcSelectionDAGInfo.h" using namespace llvm; #define DEBUG_TYPE "sparc-selectiondag-info" -SparcSelectionDAGInfo::SparcSelectionDAGInfo(const SparcTargetMachine &TM) - : TargetSelectionDAGInfo(TM.getDataLayout()) { +SparcSelectionDAGInfo::SparcSelectionDAGInfo(const DataLayout &DL) + : TargetSelectionDAGInfo(&DL) { } SparcSelectionDAGInfo::~SparcSelectionDAGInfo() { diff --git a/lib/Target/Sparc/SparcSelectionDAGInfo.h b/lib/Target/Sparc/SparcSelectionDAGInfo.h index dcd42037253d..2346f4109dcb 100644 --- a/lib/Target/Sparc/SparcSelectionDAGInfo.h +++ b/lib/Target/Sparc/SparcSelectionDAGInfo.h @@ -22,7 +22,7 @@ class SparcTargetMachine; class 
SparcSelectionDAGInfo : public TargetSelectionDAGInfo { public: - explicit SparcSelectionDAGInfo(const SparcTargetMachine &TM); + explicit SparcSelectionDAGInfo(const DataLayout &DL); ~SparcSelectionDAGInfo(); }; diff --git a/lib/Target/Sparc/SparcSubtarget.cpp b/lib/Target/Sparc/SparcSubtarget.cpp index e38fb02c9a89..eea0c8c33c6a 100644 --- a/lib/Target/Sparc/SparcSubtarget.cpp +++ b/lib/Target/Sparc/SparcSubtarget.cpp @@ -26,20 +26,44 @@ using namespace llvm; void SparcSubtarget::anchor() { } -SparcSubtarget::SparcSubtarget(const std::string &TT, const std::string &CPU, - const std::string &FS, bool is64Bit) : - SparcGenSubtargetInfo(TT, CPU, FS), - IsV9(false), - V8DeprecatedInsts(false), - IsVIS(false), - Is64Bit(is64Bit), - HasHardQuad(false), - UsePopc(false) { +static std::string computeDataLayout(const SparcSubtarget &ST) { + // Sparc is big endian. + std::string Ret = "E-m:e"; + + // Some ABIs have 32bit pointers. + if (!ST.is64Bit()) + Ret += "-p:32:32"; + + // Alignments for 64 bit integers. + Ret += "-i64:64"; + + // On SparcV9 128 floats are aligned to 128 bits, on others only to 64. + // On SparcV9 registers can hold 64 or 32 bits, on others only 32. + if (ST.is64Bit()) + Ret += "-n32:64"; + else + Ret += "-f128:64-n32"; + + if (ST.is64Bit()) + Ret += "-S128"; + else + Ret += "-S64"; + + return Ret; +} + +SparcSubtarget &SparcSubtarget::initializeSubtargetDependencies(StringRef CPU, + StringRef FS) { + IsV9 = false; + V8DeprecatedInsts = false; + IsVIS = false; + HasHardQuad = false; + UsePopc = false; // Determine default and user specified characteristics std::string CPUName = CPU; if (CPUName.empty()) - CPUName = (is64Bit) ? "v9" : "v8"; + CPUName = (Is64Bit) ? "v9" : "v8"; // Parse features string. ParseSubtargetFeatures(CPUName, FS); @@ -47,8 +71,16 @@ SparcSubtarget::SparcSubtarget(const std::string &TT, const std::string &CPU, // Popc is a v9-only instruction. 
if (!IsV9) UsePopc = false; + + return *this; } +SparcSubtarget::SparcSubtarget(const std::string &TT, const std::string &CPU, + const std::string &FS, TargetMachine &TM, + bool is64Bit) + : SparcGenSubtargetInfo(TT, CPU, FS), Is64Bit(is64Bit), + DL(computeDataLayout(initializeSubtargetDependencies(CPU, FS))), + InstrInfo(*this), TLInfo(TM), TSInfo(DL), FrameLowering(*this) {} int SparcSubtarget::getAdjustedFrameSize(int frameSize) const { diff --git a/lib/Target/Sparc/SparcSubtarget.h b/lib/Target/Sparc/SparcSubtarget.h index 4025622be8e4..a3357786cded 100644 --- a/lib/Target/Sparc/SparcSubtarget.h +++ b/lib/Target/Sparc/SparcSubtarget.h @@ -14,6 +14,13 @@ #ifndef SPARC_SUBTARGET_H #define SPARC_SUBTARGET_H +#include "SparcFrameLowering.h" +#include "SparcInstrInfo.h" +#include "SparcISelLowering.h" +#include "SparcJITInfo.h" +#include "SparcSelectionDAGInfo.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/Target/TargetFrameLowering.h" #include "llvm/Target/TargetSubtargetInfo.h" #include @@ -31,10 +38,26 @@ class SparcSubtarget : public SparcGenSubtargetInfo { bool Is64Bit; bool HasHardQuad; bool UsePopc; + const DataLayout DL; // Calculates type size & alignment + SparcInstrInfo InstrInfo; + SparcTargetLowering TLInfo; + SparcSelectionDAGInfo TSInfo; + SparcFrameLowering FrameLowering; + SparcJITInfo JITInfo; public: SparcSubtarget(const std::string &TT, const std::string &CPU, - const std::string &FS, bool is64bit); + const std::string &FS, TargetMachine &TM, bool is64bit); + + const SparcInstrInfo *getInstrInfo() const { return &InstrInfo; } + const TargetFrameLowering *getFrameLowering() const { return &FrameLowering; } + const SparcRegisterInfo *getRegisterInfo() const { + return &InstrInfo.getRegisterInfo(); + } + const SparcTargetLowering *getTargetLowering() const { return &TLInfo; } + const SparcSelectionDAGInfo *getSelectionDAGInfo() const { return &TSInfo; } + SparcJITInfo *getJITInfo() { return &JITInfo; } + const DataLayout *getDataLayout() const { return &DL; } bool isV9() const { return IsV9; } bool isVIS() const { return IsVIS; } @@ -47,6 +70,7 @@ class SparcSubtarget : public SparcGenSubtargetInfo { /// ParseSubtargetFeatures - Parses features string setting specified /// subtarget options. Definition of function is auto generated by tblgen. void ParseSubtargetFeatures(StringRef CPU, StringRef FS); + SparcSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS); bool is64Bit() const { return Is64Bit; } diff --git a/lib/Target/Sparc/SparcTargetMachine.cpp b/lib/Target/Sparc/SparcTargetMachine.cpp index 2469d9312c16..0130face3ff6 100644 --- a/lib/Target/Sparc/SparcTargetMachine.cpp +++ b/lib/Target/Sparc/SparcTargetMachine.cpp @@ -23,32 +23,6 @@ extern "C" void LLVMInitializeSparcTarget() { RegisterTargetMachine Y(TheSparcV9Target); } -static std::string computeDataLayout(const SparcSubtarget &ST) { - // Sparc is big endian. - std::string Ret = "E-m:e"; - - // Some ABIs have 32bit pointers. - if (!ST.is64Bit()) - Ret += "-p:32:32"; - - // Alignments for 64 bit integers. - Ret += "-i64:64"; - - // On SparcV9 128 floats are aligned to 128 bits, on others only to 64. - // On SparcV9 registers can hold 64 or 32 bits, on others only 32. 
- if (ST.is64Bit()) - Ret += "-n32:64"; - else - Ret += "-f128:64-n32"; - - if (ST.is64Bit()) - Ret += "-S128"; - else - Ret += "-S64"; - - return Ret; -} - /// SparcTargetMachine ctor - Create an ILP32 architecture model /// SparcTargetMachine::SparcTargetMachine(const Target &T, StringRef TT, @@ -58,11 +32,7 @@ SparcTargetMachine::SparcTargetMachine(const Target &T, StringRef TT, CodeGenOpt::Level OL, bool is64bit) : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), - Subtarget(TT, CPU, FS, is64bit), - DL(computeDataLayout(Subtarget)), - InstrInfo(Subtarget), - TLInfo(*this), TSInfo(*this), - FrameLowering(Subtarget) { + Subtarget(TT, CPU, FS, *this, is64bit) { initAsmInfo(); } diff --git a/lib/Target/Sparc/SparcTargetMachine.h b/lib/Target/Sparc/SparcTargetMachine.h index 7d043388e8cf..03b513746dfe 100644 --- a/lib/Target/Sparc/SparcTargetMachine.h +++ b/lib/Target/Sparc/SparcTargetMachine.h @@ -14,50 +14,40 @@ #ifndef SPARCTARGETMACHINE_H #define SPARCTARGETMACHINE_H -#include "SparcFrameLowering.h" -#include "SparcISelLowering.h" #include "SparcInstrInfo.h" -#include "SparcJITInfo.h" -#include "SparcSelectionDAGInfo.h" #include "SparcSubtarget.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/Target/TargetFrameLowering.h" #include "llvm/Target/TargetMachine.h" namespace llvm { class SparcTargetMachine : public LLVMTargetMachine { SparcSubtarget Subtarget; - const DataLayout DL; // Calculates type size & alignment - SparcInstrInfo InstrInfo; - SparcTargetLowering TLInfo; - SparcSelectionDAGInfo TSInfo; - SparcFrameLowering FrameLowering; - SparcJITInfo JITInfo; public: SparcTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL, bool is64bit); - const SparcInstrInfo *getInstrInfo() const override { return &InstrInfo; } - const TargetFrameLowering *getFrameLowering() const override { - return &FrameLowering; + const SparcInstrInfo *getInstrInfo() const override { + return getSubtargetImpl()->getInstrInfo(); + } + const TargetFrameLowering *getFrameLowering() const override { + return getSubtargetImpl()->getFrameLowering(); } const SparcSubtarget *getSubtargetImpl() const override { return &Subtarget; } const SparcRegisterInfo *getRegisterInfo() const override { - return &InstrInfo.getRegisterInfo(); + return getSubtargetImpl()->getRegisterInfo(); } - const SparcTargetLowering* getTargetLowering() const override { - return &TLInfo; + const SparcTargetLowering *getTargetLowering() const override { + return getSubtargetImpl()->getTargetLowering(); } - const SparcSelectionDAGInfo* getSelectionDAGInfo() const override { - return &TSInfo; + const SparcSelectionDAGInfo *getSelectionDAGInfo() const override { + return getSubtargetImpl()->getSelectionDAGInfo(); } - SparcJITInfo *getJITInfo() override { - return &JITInfo; + SparcJITInfo *getJITInfo() override { return Subtarget.getJITInfo(); } + const DataLayout *getDataLayout() const override { + return getSubtargetImpl()->getDataLayout(); } - const DataLayout *getDataLayout() const override { return &DL; } // Pass Pipeline Configuration TargetPassConfig *createPassConfig(PassManagerBase &PM) override; diff --git a/lib/Target/SystemZ/SystemZCallingConv.td b/lib/Target/SystemZ/SystemZCallingConv.td index c4f641e7bdec..fb0d1d8a3fe7 100644 --- a/lib/Target/SystemZ/SystemZCallingConv.td +++ b/lib/Target/SystemZ/SystemZCallingConv.td @@ -13,7 +13,7 @@ class CCIfExtend : CCIf<"ArgFlags.isSExt() || ArgFlags.isZExt()", A>; 
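Note that computeDataLayout moved verbatim from SparcTargetMachine.cpp into SparcSubtarget.cpp, so the layout strings are unchanged by this refactoring. A minimal standalone mirror of its logic, handy for checking the two results; the expected strings in the comments follow directly from the branches shown above:

#include <iostream>
#include <string>

// Mirror of the moved computeDataLayout: big-endian, ELF mangling, then
// pointer width, i64 alignment, native integer widths, and stack alignment.
static std::string sparcDataLayout(bool Is64Bit) {
  std::string Ret = "E-m:e";
  if (!Is64Bit)
    Ret += "-p:32:32";
  Ret += "-i64:64";
  Ret += Is64Bit ? "-n32:64" : "-f128:64-n32";
  Ret += Is64Bit ? "-S128" : "-S64";
  return Ret;
}

int main() {
  std::cout << sparcDataLayout(false) << '\n'; // E-m:e-p:32:32-i64:64-f128:64-n32-S64
  std::cout << sparcDataLayout(true) << '\n';  // E-m:e-i64:64-n32:64-S128
}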
//===----------------------------------------------------------------------===// -// SVR4 return value calling convention +// z/Linux return value calling convention //===----------------------------------------------------------------------===// def RetCC_SystemZ : CallingConv<[ // Promote i32 to i64 if it has an explicit extension type. @@ -39,7 +39,7 @@ def RetCC_SystemZ : CallingConv<[ ]>; //===----------------------------------------------------------------------===// -// SVR4 argument calling conventions +// z/Linux argument calling conventions //===----------------------------------------------------------------------===// def CC_SystemZ : CallingConv<[ // Promote i32 to i64 if it has an explicit extension type. @@ -63,3 +63,9 @@ def CC_SystemZ : CallingConv<[ // Other arguments are passed in 8-byte-aligned 8-byte stack slots. CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>> ]>; + +//===----------------------------------------------------------------------===// +// z/Linux callee-saved registers +//===----------------------------------------------------------------------===// +def CSR_SystemZ : CalleeSavedRegs<(add (sequence "R%dD", 6, 15), + (sequence "F%dD", 8, 15))>; diff --git a/lib/Target/SystemZ/SystemZFrameLowering.cpp b/lib/Target/SystemZ/SystemZFrameLowering.cpp index 65f3caf64e4c..055dbe914995 100644 --- a/lib/Target/SystemZ/SystemZFrameLowering.cpp +++ b/lib/Target/SystemZ/SystemZFrameLowering.cpp @@ -10,8 +10,9 @@ #include "SystemZFrameLowering.h" #include "SystemZCallingConv.h" #include "SystemZInstrBuilder.h" +#include "SystemZInstrInfo.h" #include "SystemZMachineFunctionInfo.h" -#include "SystemZTargetMachine.h" +#include "SystemZRegisterInfo.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/RegisterScavenging.h" @@ -44,11 +45,9 @@ static const TargetFrameLowering::SpillSlot SpillOffsetTable[] = { }; } // end anonymous namespace -SystemZFrameLowering::SystemZFrameLowering(const SystemZTargetMachine &tm, - const SystemZSubtarget &sti) - : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 8, - -SystemZMC::CallFrameSize, 8), - TM(tm), STI(sti) { +SystemZFrameLowering::SystemZFrameLowering() + : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 8, + -SystemZMC::CallFrameSize, 8) { // Create a mapping from register number to save slot offset. RegSpillOffsets.grow(SystemZ::NUM_TARGET_REGS); for (unsigned I = 0, E = array_lengthof(SpillOffsetTable); I != E; ++I) @@ -108,9 +107,8 @@ processFunctionBeforeCalleeSavedScan(MachineFunction &MF, // instruction, or an implicit one that comes between the explicit start // and end registers. static void addSavedGPR(MachineBasicBlock &MBB, MachineInstrBuilder &MIB, - const SystemZTargetMachine &TM, unsigned GPR64, bool IsImplicit) { - const SystemZRegisterInfo *RI = TM.getRegisterInfo(); + const TargetRegisterInfo *RI = MBB.getParent()->getTarget().getRegisterInfo(); unsigned GPR32 = RI->getSubReg(GPR64, SystemZ::subreg_l32); bool IsLive = MBB.isLiveIn(GPR64) || MBB.isLiveIn(GPR32); if (!IsLive || !IsImplicit) { @@ -176,8 +174,8 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB, MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(SystemZ::STMG)); // Add the explicit register operands. - addSavedGPR(MBB, MIB, TM, LowGPR, false); - addSavedGPR(MBB, MIB, TM, HighGPR, false); + addSavedGPR(MBB, MIB, LowGPR, false); + addSavedGPR(MBB, MIB, HighGPR, false); // Add the address. 
MIB.addReg(SystemZ::R15D).addImm(StartOffset); @@ -187,13 +185,13 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB, for (unsigned I = 0, E = CSI.size(); I != E; ++I) { unsigned Reg = CSI[I].getReg(); if (SystemZ::GR64BitRegClass.contains(Reg)) - addSavedGPR(MBB, MIB, TM, Reg, true); + addSavedGPR(MBB, MIB, Reg, true); } // ...likewise GPR varargs. if (IsVarArg) for (unsigned I = ZFI->getVarArgsFirstGPR(); I < SystemZ::NumArgGPRs; ++I) - addSavedGPR(MBB, MIB, TM, SystemZ::ArgGPRs[I], true); + addSavedGPR(MBB, MIB, SystemZ::ArgGPRs[I], true); } // Save FPRs in the normal TargetInstrInfo way. diff --git a/lib/Target/SystemZ/SystemZFrameLowering.h b/lib/Target/SystemZ/SystemZFrameLowering.h index 70e25fb243b2..4d5fe6dce62d 100644 --- a/lib/Target/SystemZ/SystemZFrameLowering.h +++ b/lib/Target/SystemZ/SystemZFrameLowering.h @@ -10,7 +10,6 @@ #ifndef SYSTEMZFRAMELOWERING_H #define SYSTEMZFRAMELOWERING_H -#include "SystemZSubtarget.h" #include "llvm/ADT/IndexedMap.h" #include "llvm/Target/TargetFrameLowering.h" @@ -21,13 +20,8 @@ class SystemZSubtarget; class SystemZFrameLowering : public TargetFrameLowering { IndexedMap RegSpillOffsets; -protected: - const SystemZTargetMachine &TM; - const SystemZSubtarget &STI; - public: - SystemZFrameLowering(const SystemZTargetMachine &tm, - const SystemZSubtarget &sti); + SystemZFrameLowering(); // Override TargetFrameLowering. bool isFPCloseToIncomingSP() const override { return false; } diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp index 6fe1fb9f7d3d..00c65f5bba6b 100644 --- a/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -80,9 +80,9 @@ static MachineOperand earlyUseOperand(MachineOperand Op) { return Op; } -SystemZTargetLowering::SystemZTargetLowering(SystemZTargetMachine &tm) - : TargetLowering(tm, new TargetLoweringObjectFileELF()), - Subtarget(*tm.getSubtargetImpl()), TM(tm) { +SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &tm) + : TargetLowering(tm, new TargetLoweringObjectFileELF()), + Subtarget(tm.getSubtarget()) { MVT PtrVT = getPointerTy(); // Set up the register classes. @@ -673,11 +673,13 @@ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, MachineRegisterInfo &MRI = MF.getRegInfo(); SystemZMachineFunctionInfo *FuncInfo = MF.getInfo(); - auto *TFL = static_cast(TM.getFrameLowering()); + auto *TFL = static_cast( + DAG.getTarget().getFrameLowering()); // Assign locations to all of the incoming arguments. SmallVector ArgLocs; - CCState CCInfo(CallConv, IsVarArg, MF, TM, ArgLocs, *DAG.getContext()); + CCState CCInfo(CallConv, IsVarArg, MF, DAG.getTarget(), ArgLocs, + *DAG.getContext()); CCInfo.AnalyzeFormalArguments(Ins, CC_SystemZ); unsigned NumFixedGPRs = 0; @@ -815,7 +817,8 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI, // Analyze the operands of the call, assigning locations to each operand. SmallVector ArgLocs; - CCState ArgCCInfo(CallConv, IsVarArg, MF, TM, ArgLocs, *DAG.getContext()); + CCState ArgCCInfo(CallConv, IsVarArg, MF, DAG.getTarget(), ArgLocs, + *DAG.getContext()); ArgCCInfo.AnalyzeCallOperands(Outs, CC_SystemZ); // We don't support GuaranteedTailCallOpt, only automatically-detected @@ -911,6 +914,12 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI, Ops.push_back(DAG.getRegister(RegsToPass[I].first, RegsToPass[I].second.getValueType())); + // Add a register mask operand representing the call-preserved registers. 
+ const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); + const uint32_t *Mask = TRI->getCallPreservedMask(CallConv); + assert(Mask && "Missing call preserved mask for calling convention"); + Ops.push_back(DAG.getRegisterMask(Mask)); + // Glue the call to the argument copies, if any. if (Glue.getNode()) Ops.push_back(Glue); @@ -931,7 +940,8 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI, // Assign locations to each value returned by this call. SmallVector RetLocs; - CCState RetCCInfo(CallConv, IsVarArg, MF, TM, RetLocs, *DAG.getContext()); + CCState RetCCInfo(CallConv, IsVarArg, MF, DAG.getTarget(), RetLocs, + *DAG.getContext()); RetCCInfo.AnalyzeCallResult(Ins, RetCC_SystemZ); // Copy all of the result registers out of their specified physreg. @@ -962,7 +972,8 @@ SystemZTargetLowering::LowerReturn(SDValue Chain, // Assign locations to each returned value. SmallVector RetLocs; - CCState RetCCInfo(CallConv, IsVarArg, MF, TM, RetLocs, *DAG.getContext()); + CCState RetCCInfo(CallConv, IsVarArg, MF, DAG.getTarget(), RetLocs, + *DAG.getContext()); RetCCInfo.AnalyzeReturn(Outs, RetCC_SystemZ); // Quick exit for void returns @@ -1786,8 +1797,8 @@ SDValue SystemZTargetLowering::lowerGlobalAddress(GlobalAddressSDNode *Node, const GlobalValue *GV = Node->getGlobal(); int64_t Offset = Node->getOffset(); EVT PtrVT = getPointerTy(); - Reloc::Model RM = TM.getRelocationModel(); - CodeModel::Model CM = TM.getCodeModel(); + Reloc::Model RM = DAG.getTarget().getRelocationModel(); + CodeModel::Model CM = DAG.getTarget().getCodeModel(); SDValue Result; if (Subtarget.isPC32DBLSymbol(GV, RM, CM)) { @@ -1824,7 +1835,7 @@ SDValue SystemZTargetLowering::lowerGlobalTLSAddress(GlobalAddressSDNode *Node, SDLoc DL(Node); const GlobalValue *GV = Node->getGlobal(); EVT PtrVT = getPointerTy(); - TLSModel::Model model = TM.getTLSModel(GV); + TLSModel::Model model = DAG.getTarget().getTLSModel(GV); if (model != TLSModel::LocalExec) llvm_unreachable("only local-exec TLS mode supported"); @@ -2287,9 +2298,9 @@ SDValue SystemZTargetLowering::lowerATOMIC_LOAD_SUB(SDValue Op, // Use an addition if the operand is constant and either LAA(G) is // available or the negative value is in the range of A(G)FHI. int64_t Value = (-Op2->getAPIntValue()).getSExtValue(); - if (isInt<32>(Value) || TM.getSubtargetImpl()->hasInterlockedAccess1()) + if (isInt<32>(Value) || Subtarget.hasInterlockedAccess1()) NegSrc2 = DAG.getConstant(Value, MemVT); - } else if (TM.getSubtargetImpl()->hasInterlockedAccess1()) + } else if (Subtarget.hasInterlockedAccess1()) // Use LAA(G) if available. 
NegSrc2 = DAG.getNode(ISD::SUB, DL, MemVT, DAG.getConstant(0, MemVT), Src2); @@ -2602,7 +2613,8 @@ static unsigned forceReg(MachineInstr *MI, MachineOperand &Base, MachineBasicBlock * SystemZTargetLowering::emitSelect(MachineInstr *MI, MachineBasicBlock *MBB) const { - const SystemZInstrInfo *TII = TM.getInstrInfo(); + const SystemZInstrInfo *TII = static_cast( + MBB->getParent()->getTarget().getInstrInfo()); unsigned DestReg = MI->getOperand(0).getReg(); unsigned TrueReg = MI->getOperand(1).getReg(); @@ -2650,7 +2662,8 @@ SystemZTargetLowering::emitCondStore(MachineInstr *MI, MachineBasicBlock *MBB, unsigned StoreOpcode, unsigned STOCOpcode, bool Invert) const { - const SystemZInstrInfo *TII = TM.getInstrInfo(); + const SystemZInstrInfo *TII = static_cast( + MBB->getParent()->getTarget().getInstrInfo()); unsigned SrcReg = MI->getOperand(0).getReg(); MachineOperand Base = MI->getOperand(1); @@ -2665,7 +2678,7 @@ SystemZTargetLowering::emitCondStore(MachineInstr *MI, // Use STOCOpcode if possible. We could use different store patterns in // order to avoid matching the index register, but the performance trade-offs // might be more complicated in that case. - if (STOCOpcode && !IndexReg && TM.getSubtargetImpl()->hasLoadStoreOnCond()) { + if (STOCOpcode && !IndexReg && Subtarget.hasLoadStoreOnCond()) { if (Invert) CCMask ^= CCValid; BuildMI(*MBB, MI, DL, TII->get(STOCOpcode)) @@ -2717,8 +2730,9 @@ SystemZTargetLowering::emitAtomicLoadBinary(MachineInstr *MI, unsigned BinOpcode, unsigned BitSize, bool Invert) const { - const SystemZInstrInfo *TII = TM.getInstrInfo(); MachineFunction &MF = *MBB->getParent(); + const SystemZInstrInfo *TII = + static_cast(MF.getTarget().getInstrInfo()); MachineRegisterInfo &MRI = MF.getRegInfo(); bool IsSubWord = (BitSize < 32); @@ -2840,8 +2854,9 @@ SystemZTargetLowering::emitAtomicLoadMinMax(MachineInstr *MI, unsigned CompareOpcode, unsigned KeepOldMask, unsigned BitSize) const { - const SystemZInstrInfo *TII = TM.getInstrInfo(); MachineFunction &MF = *MBB->getParent(); + const SystemZInstrInfo *TII = + static_cast(MF.getTarget().getInstrInfo()); MachineRegisterInfo &MRI = MF.getRegInfo(); bool IsSubWord = (BitSize < 32); @@ -2951,8 +2966,9 @@ SystemZTargetLowering::emitAtomicLoadMinMax(MachineInstr *MI, MachineBasicBlock * SystemZTargetLowering::emitAtomicCmpSwapW(MachineInstr *MI, MachineBasicBlock *MBB) const { - const SystemZInstrInfo *TII = TM.getInstrInfo(); MachineFunction &MF = *MBB->getParent(); + const SystemZInstrInfo *TII = + static_cast(MF.getTarget().getInstrInfo()); MachineRegisterInfo &MRI = MF.getRegInfo(); // Extract the operands. Base can be a register or a frame index. 
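The lowerATOMIC_LOAD_SUB hunk above keeps the existing strategy: an atomic subtraction of a constant C becomes an atomic addition of -C when -C is encodable in A(G)FHI's signed 32-bit immediate or when the interlocked-access facility supplies LAA(G); non-constant operands take the negate-then-LAA(G) path only when that facility exists. A small standalone sketch of the constant-operand decision; the function name is illustrative, and the real code negates via APInt, which also covers the INT64_MIN corner this sketch ignores:

#include <cstdint>
#include <limits>

// Prefer "atomic add of -C" over "atomic sub of C" when the negated
// constant is usable, mirroring the condition in lowerATOMIC_LOAD_SUB.
bool useAddOfNegatedConstant(int64_t C, bool HasInterlockedAccess1) {
  int64_t Neg = -C; // caveat: overflows for INT64_MIN; APInt handles that
  bool FitsSigned32 = Neg >= std::numeric_limits<int32_t>::min() &&
                      Neg <= std::numeric_limits<int32_t>::max();
  return FitsSigned32 || HasInterlockedAccess1;
}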
@@ -3067,8 +3083,9 @@ MachineBasicBlock * SystemZTargetLowering::emitExt128(MachineInstr *MI, MachineBasicBlock *MBB, bool ClearEven, unsigned SubReg) const { - const SystemZInstrInfo *TII = TM.getInstrInfo(); MachineFunction &MF = *MBB->getParent(); + const SystemZInstrInfo *TII = + static_cast(MF.getTarget().getInstrInfo()); MachineRegisterInfo &MRI = MF.getRegInfo(); DebugLoc DL = MI->getDebugLoc(); @@ -3098,8 +3115,9 @@ MachineBasicBlock * SystemZTargetLowering::emitMemMemWrapper(MachineInstr *MI, MachineBasicBlock *MBB, unsigned Opcode) const { - const SystemZInstrInfo *TII = TM.getInstrInfo(); MachineFunction &MF = *MBB->getParent(); + const SystemZInstrInfo *TII = + static_cast(MF.getTarget().getInstrInfo()); MachineRegisterInfo &MRI = MF.getRegInfo(); DebugLoc DL = MI->getDebugLoc(); @@ -3267,8 +3285,9 @@ MachineBasicBlock * SystemZTargetLowering::emitStringWrapper(MachineInstr *MI, MachineBasicBlock *MBB, unsigned Opcode) const { - const SystemZInstrInfo *TII = TM.getInstrInfo(); MachineFunction &MF = *MBB->getParent(); + const SystemZInstrInfo *TII = + static_cast(MF.getTarget().getInstrInfo()); MachineRegisterInfo &MRI = MF.getRegInfo(); DebugLoc DL = MI->getDebugLoc(); diff --git a/lib/Target/SystemZ/SystemZISelLowering.h b/lib/Target/SystemZ/SystemZISelLowering.h index bceb25e036e7..e21b0501933f 100644 --- a/lib/Target/SystemZ/SystemZISelLowering.h +++ b/lib/Target/SystemZ/SystemZISelLowering.h @@ -198,7 +198,7 @@ class SystemZTargetMachine; class SystemZTargetLowering : public TargetLowering { public: - explicit SystemZTargetLowering(SystemZTargetMachine &TM); + explicit SystemZTargetLowering(const TargetMachine &TM); // Override TargetLowering. MVT getScalarShiftAmountTy(EVT LHSTy) const override { @@ -249,7 +249,6 @@ class SystemZTargetLowering : public TargetLowering { private: const SystemZSubtarget &Subtarget; - const SystemZTargetMachine &TM; // Implement LowerOperation for individual opcodes. SDValue lowerSETCC(SDValue Op, SelectionDAG &DAG) const; diff --git a/lib/Target/SystemZ/SystemZInstrFP.td b/lib/Target/SystemZ/SystemZInstrFP.td index a1e782cdfd77..e8841e131324 100644 --- a/lib/Target/SystemZ/SystemZInstrFP.td +++ b/lib/Target/SystemZ/SystemZInstrFP.td @@ -133,6 +133,13 @@ def LEDBR : UnaryRRE<"ledb", 0xB344, fround, FP32, FP64>; def LEXBR : UnaryRRE<"lexb", 0xB346, null_frag, FP128, FP128>; def LDXBR : UnaryRRE<"ldxb", 0xB345, null_frag, FP128, FP128>; +def LEDBRA : UnaryRRF4<"ledbra", 0xB344, FP32, FP64>, + Requires<[FeatureFPExtension]>; +def LEXBRA : UnaryRRF4<"lexbra", 0xB346, FP128, FP128>, + Requires<[FeatureFPExtension]>; +def LDXBRA : UnaryRRF4<"ldxbra", 0xB345, FP128, FP128>, + Requires<[FeatureFPExtension]>; + def : Pat<(f32 (fround FP128:$src)), (EXTRACT_SUBREG (LEXBR FP128:$src), subreg_hh32)>; def : Pat<(f64 (fround FP128:$src)), diff --git a/lib/Target/SystemZ/SystemZInstrFormats.td b/lib/Target/SystemZ/SystemZInstrFormats.td index add675a22ccc..9f59a1c8e7e3 100644 --- a/lib/Target/SystemZ/SystemZInstrFormats.td +++ b/lib/Target/SystemZ/SystemZInstrFormats.td @@ -511,34 +511,24 @@ class InstSS op, dag outs, dag ins, string asmstr, list pattern> // to store. Other stored registers are added as implicit uses. // // Unary: -// One register output operand and one input operand. The input -// operand may be a register, immediate or memory. +// One register output operand and one input operand. // // Binary: -// One register output operand and two input operands. 
The first -// input operand is always a register and the second may be a register, -// immediate or memory. -// -// Shift: -// One register output operand and two input operands. The first -// input operand is a register and the second has the same form as -// an address (although it isn't actually used to address memory). +// One register output operand and two input operands. // // Compare: -// Two input operands. The first operand is always a register, -// the second may be a register, immediate or memory. +// Two input operands and an implicit CC output operand. // // Ternary: -// One register output operand and three register input operands. +// One register output operand and three input operands. // // LoadAndOp: -// One output operand and two input operands. The first input operand -// is a register and the second is an address. +// One output operand and two input operands, one of which is an address. +// The instruction both reads from and writes to the address. // // CmpSwap: -// One output operand and three input operands. The first two -// operands are registers and the third is an address. The instruction -// both reads from and writes to the address. +// One output operand and three input operands, one of which is an address. +// The instruction both reads from and writes to the address. // // RotateSelect: // One output operand and five input operands. The first two operands @@ -691,7 +681,7 @@ class CondStoreRSY opcode, class AsmCondStoreRSY opcode, RegisterOperand cls, bits<5> bytes, AddressingMode mode = bdaddr20only> - : InstRSY, Requires<[FeatureLoadStoreOnCond]> { let mayStore = 1; @@ -730,7 +720,7 @@ class UnaryRRE opcode, SDPatternOperator operator, class UnaryRRF opcode, RegisterOperand cls1, RegisterOperand cls2> - : InstRRF { let OpKey = mnemonic ## cls1; let OpType = "reg"; @@ -739,7 +729,7 @@ class UnaryRRF opcode, RegisterOperand cls1, class UnaryRRF4 opcode, RegisterOperand cls1, RegisterOperand cls2> - : InstRRF; // These instructions are generated by if conversion. The old value of R1 @@ -757,7 +747,7 @@ class CondUnaryRRF opcode, RegisterOperand cls1, // mask is the third operand rather than being part of the mnemonic. 
class AsmCondUnaryRRF opcode, RegisterOperand cls1, RegisterOperand cls2> - : InstRRF, Requires<[FeatureLoadStoreOnCond]> { let Constraints = "$R1 = $R1src"; @@ -823,7 +813,7 @@ class CondUnaryRSY opcode, class AsmCondUnaryRSY opcode, RegisterOperand cls, bits<5> bytes, AddressingMode mode = bdaddr20only> - : InstRSY, Requires<[FeatureLoadStoreOnCond]> { let mayLoad = 1; @@ -993,6 +983,33 @@ class BinaryRIL opcode, SDPatternOperator operator, let DisableEncoding = "$R1src"; } +class BinaryRS opcode, SDPatternOperator operator, + RegisterOperand cls> + : InstRS { + let R3 = 0; + let Constraints = "$R1 = $R1src"; + let DisableEncoding = "$R1src"; +} + +class BinaryRSY opcode, SDPatternOperator operator, + RegisterOperand cls> + : InstRSY; + +multiclass BinaryRSAndK opcode1, bits<16> opcode2, + SDPatternOperator operator, RegisterOperand cls> { + let NumOpsKey = mnemonic in { + let NumOpsValue = "3" in + def K : BinaryRSY, + Requires<[FeatureDistinctOps]>; + let NumOpsValue = "2", isConvertibleToThreeAddress = 1 in + def "" : BinaryRS; + } +} + class BinaryRX opcode, SDPatternOperator operator, RegisterOperand cls, SDPatternOperator load, bits<5> bytes, AddressingMode mode = bdxaddr12only> @@ -1077,33 +1094,6 @@ multiclass BinarySIPair siOpcode, } } -class ShiftRS opcode, SDPatternOperator operator, - RegisterOperand cls> - : InstRS { - let R3 = 0; - let Constraints = "$R1 = $R1src"; - let DisableEncoding = "$R1src"; -} - -class ShiftRSY opcode, SDPatternOperator operator, - RegisterOperand cls> - : InstRSY; - -multiclass ShiftRSAndK opcode1, bits<16> opcode2, - SDPatternOperator operator, RegisterOperand cls> { - let NumOpsKey = mnemonic in { - let NumOpsValue = "3" in - def K : ShiftRSY, - Requires<[FeatureDistinctOps]>; - let NumOpsValue = "2", isConvertibleToThreeAddress = 1 in - def "" : ShiftRS; - } -} - class CompareRR opcode, SDPatternOperator operator, RegisterOperand cls1, RegisterOperand cls2> : InstRR rsOpcode, bits<16> rsyOpcode, class RotateSelectRIEf opcode, RegisterOperand cls1, RegisterOperand cls2> : InstRIEf { let Constraints = "$R1 = $R1src"; let DisableEncoding = "$R1src"; } class PrefetchRXY opcode, SDPatternOperator operator> - : InstRXY; + [(operator imm32zx4:$R1, bdxaddr20only:$XBD2)]>; class PrefetchRILPC opcode, SDPatternOperator operator> - : InstRIL { + [(operator imm32zx4:$R1, pcrel32:$I2)]> { // We want PC-relative addresses to be tried ahead of BD and BDX addresses. // However, BDXs have two extra operands and are therefore 6 units more // complex. @@ -1450,7 +1441,8 @@ class StoreRXYPseudo : Pseudo<(outs cls1:$R1), - (ins cls1:$R1src, cls2:$R2, uimm8:$I3, uimm8:$I4, uimm8zx6:$I5), + (ins cls1:$R1src, cls2:$R2, imm32zx8:$I3, imm32zx8:$I4, + imm32zx6:$I5), []> { let Constraints = "$R1 = $R1src"; let DisableEncoding = "$R1src"; @@ -1460,9 +1452,9 @@ class RotateSelectRIEfPseudo // the value of the PSW's 2-bit condition code field. 
class SelectWrapper : Pseudo<(outs cls:$dst), - (ins cls:$src1, cls:$src2, uimm8zx4:$valid, uimm8zx4:$cc), + (ins cls:$src1, cls:$src2, imm32zx4:$valid, imm32zx4:$cc), [(set cls:$dst, (z_select_ccmask cls:$src1, cls:$src2, - uimm8zx4:$valid, uimm8zx4:$cc))]> { + imm32zx4:$valid, imm32zx4:$cc))]> { let usesCustomInserter = 1; // Although the instructions used by these nodes do not in themselves // change CC, the insertion requires new blocks, and CC cannot be live @@ -1476,14 +1468,14 @@ multiclass CondStores { let Defs = [CC], Uses = [CC], usesCustomInserter = 1 in { def "" : Pseudo<(outs), - (ins cls:$new, mode:$addr, uimm8zx4:$valid, uimm8zx4:$cc), + (ins cls:$new, mode:$addr, imm32zx4:$valid, imm32zx4:$cc), [(store (z_select_ccmask cls:$new, (load mode:$addr), - uimm8zx4:$valid, uimm8zx4:$cc), + imm32zx4:$valid, imm32zx4:$cc), mode:$addr)]>; def Inv : Pseudo<(outs), - (ins cls:$new, mode:$addr, uimm8zx4:$valid, uimm8zx4:$cc), + (ins cls:$new, mode:$addr, imm32zx4:$valid, imm32zx4:$cc), [(store (z_select_ccmask (load mode:$addr), cls:$new, - uimm8zx4:$valid, uimm8zx4:$cc), + imm32zx4:$valid, imm32zx4:$cc), mode:$addr)]>; } } @@ -1611,6 +1603,7 @@ class CompareAliasRI : Alias<6, (outs cls1:$R1), - (ins cls1:$R1src, cls2:$R2, uimm8:$I3, uimm8:$I4, uimm8zx6:$I5), []> { + (ins cls1:$R1src, cls2:$R2, imm32zx8:$I3, imm32zx8:$I4, + imm32zx6:$I5), []> { let Constraints = "$R1 = $R1src"; } diff --git a/lib/Target/SystemZ/SystemZInstrInfo.cpp b/lib/Target/SystemZ/SystemZInstrInfo.cpp index 6a18b2dea9ea..f58ab474fbbc 100644 --- a/lib/Target/SystemZ/SystemZInstrInfo.cpp +++ b/lib/Target/SystemZ/SystemZInstrInfo.cpp @@ -40,9 +40,9 @@ static bool isHighReg(unsigned int Reg) { // Pin the vtable to this file. void SystemZInstrInfo::anchor() {} -SystemZInstrInfo::SystemZInstrInfo(SystemZTargetMachine &tm) +SystemZInstrInfo::SystemZInstrInfo(SystemZSubtarget &sti) : SystemZGenInstrInfo(SystemZ::ADJCALLSTACKDOWN, SystemZ::ADJCALLSTACKUP), - RI(tm), TM(tm) { + RI(), STI(sti) { } // MI is a 128-bit load or store. Split it into two 64-bit loads or stores, @@ -488,7 +488,7 @@ SystemZInstrInfo::optimizeCompareInstr(MachineInstr *Compare, bool IsLogical = (Compare->getDesc().TSFlags & SystemZII::IsLogical) != 0; if (Value == 0 && !IsLogical && - removeIPMBasedCompare(Compare, SrcReg, MRI, TM.getRegisterInfo())) + removeIPMBasedCompare(Compare, SrcReg, MRI, &RI)) return true; return false; } @@ -505,7 +505,7 @@ static unsigned getConditionalMove(unsigned Opcode) { bool SystemZInstrInfo::isPredicable(MachineInstr *MI) const { unsigned Opcode = MI->getOpcode(); - if (TM.getSubtargetImpl()->hasLoadStoreOnCond() && + if (STI.hasLoadStoreOnCond() && getConditionalMove(Opcode)) return true; return false; @@ -537,7 +537,7 @@ PredicateInstruction(MachineInstr *MI, unsigned CCMask = Pred[1].getImm(); assert(CCMask > 0 && CCMask < 15 && "Invalid predicate"); unsigned Opcode = MI->getOpcode(); - if (TM.getSubtargetImpl()->hasLoadStoreOnCond()) { + if (STI.hasLoadStoreOnCond()) { if (unsigned CondOpcode = getConditionalMove(Opcode)) { MI->setDesc(get(CondOpcode)); MachineInstrBuilder(*MI->getParent()->getParent(), MI) @@ -685,7 +685,7 @@ SystemZInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, // We prefer to keep the two-operand form where possible both // because it tends to be shorter and because some instructions // have memory forms that can be used during spilling. 
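The SystemZInstrInfo hunks above replace the stored SystemZTargetMachine reference with a SystemZSubtarget reference, so feature queries such as hasLoadStoreOnCond() and hasDistinctOps() no longer detour through TM.getSubtargetImpl(). A condensed sketch of the pattern; class and member names are illustrative, not the real LLVM declarations:

struct Subtarget {
  bool LoadStoreOnCond = false;
  bool hasLoadStoreOnCond() const { return LoadStoreOnCond; }
};

class InstrInfo {
  const Subtarget &STI;           // was: SystemZTargetMachine &TM
public:
  explicit InstrInfo(const Subtarget &S) : STI(S) {}
  bool isPredicable(bool HasCondOpcode) const {
    // was: TM.getSubtargetImpl()->hasLoadStoreOnCond() && ...
    return STI.hasLoadStoreOnCond() && HasCondOpcode;
  }
};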
- if (TM.getSubtargetImpl()->hasDistinctOps()) { + if (STI.hasDistinctOps()) { MachineOperand &Dest = MI->getOperand(0); MachineOperand &Src = MI->getOperand(1); unsigned DestReg = Dest.getReg(); diff --git a/lib/Target/SystemZ/SystemZInstrInfo.h b/lib/Target/SystemZ/SystemZInstrInfo.h index 09aee5d20293..83009cb8d426 100644 --- a/lib/Target/SystemZ/SystemZInstrInfo.h +++ b/lib/Target/SystemZ/SystemZInstrInfo.h @@ -110,9 +110,10 @@ struct Branch { }; } // end namespace SystemZII +class SystemZSubtarget; class SystemZInstrInfo : public SystemZGenInstrInfo { const SystemZRegisterInfo RI; - SystemZTargetMachine &TM; + SystemZSubtarget &STI; void splitMove(MachineBasicBlock::iterator MI, unsigned NewOpcode) const; void splitAdjDynAlloc(MachineBasicBlock::iterator MI) const; @@ -130,7 +131,7 @@ class SystemZInstrInfo : public SystemZGenInstrInfo { virtual void anchor(); public: - explicit SystemZInstrInfo(SystemZTargetMachine &TM); + explicit SystemZInstrInfo(SystemZSubtarget &STI); // Override TargetInstrInfo. unsigned isLoadFromStackSlot(const MachineInstr *MI, diff --git a/lib/Target/SystemZ/SystemZInstrInfo.td b/lib/Target/SystemZ/SystemZInstrInfo.td index e70df92ffe81..f4951ad8e0ac 100644 --- a/lib/Target/SystemZ/SystemZInstrInfo.td +++ b/lib/Target/SystemZ/SystemZInstrInfo.td @@ -63,11 +63,11 @@ let isBranch = 1, isTerminator = 1, Uses = [CC] in { def BRCL : InstRIL<0xC04, (outs), (ins cond4:$valid, cond4:$R1, brtarget32:$I2), "jg$R1\t$I2", []>; } - def AsmBRC : InstRI<0xA74, (outs), (ins uimm8zx4:$R1, brtarget16:$I2), + def AsmBRC : InstRI<0xA74, (outs), (ins imm32zx4:$R1, brtarget16:$I2), "brc\t$R1, $I2", []>; - def AsmBRCL : InstRIL<0xC04, (outs), (ins uimm8zx4:$R1, brtarget32:$I2), + def AsmBRCL : InstRIL<0xC04, (outs), (ins imm32zx4:$R1, brtarget32:$I2), "brcl\t$R1, $I2", []>; - def AsmBCR : InstRR<0x07, (outs), (ins uimm8zx4:$R1, GR64:$R2), + def AsmBCR : InstRR<0x07, (outs), (ins imm32zx4:$R1, GR64:$R2), "bcr\t$R1, $R2", []>; } @@ -109,7 +109,7 @@ multiclass CompareBranches { } let isCodeGenOnly = 1 in defm C : CompareBranches; -defm AsmC : CompareBranches; +defm AsmC : CompareBranches; // Define AsmParser mnemonics for each general condition-code mask // (integer or floating-point) @@ -233,9 +233,7 @@ defm CondStore64 : CondStores; def CallBASR : Alias<2, (outs), (ins ADDR64:$R2, variable_ops), @@ -855,7 +853,7 @@ let Defs = [CC] in { } // AND to memory - defm NI : BinarySIPair<"ni", 0x94, 0xEB54, null_frag, uimm8>; + defm NI : BinarySIPair<"ni", 0x94, 0xEB54, null_frag, imm32zx8>; // Block AND. let mayLoad = 1, mayStore = 1 in @@ -912,7 +910,7 @@ let Defs = [CC] in { } // OR to memory - defm OI : BinarySIPair<"oi", 0x96, 0xEB56, null_frag, uimm8>; + defm OI : BinarySIPair<"oi", 0x96, 0xEB56, null_frag, imm32zx8>; // Block OR. let mayLoad = 1, mayStore = 1 in @@ -952,7 +950,7 @@ let Defs = [CC] in { } // XOR to memory - defm XI : BinarySIPair<"xi", 0x97, 0xEB57, null_frag, uimm8>; + defm XI : BinarySIPair<"xi", 0x97, 0xEB57, null_frag, imm32zx8>; // Block XOR. let mayLoad = 1, mayStore = 1 in @@ -1015,26 +1013,26 @@ def DLG : BinaryRXY<"dlg", 0xE387, z_udivrem64, GR128, load, 8>; // Shift left. let neverHasSideEffects = 1 in { - defm SLL : ShiftRSAndK<"sll", 0x89, 0xEBDF, shl, GR32>; - def SLLG : ShiftRSY<"sllg", 0xEB0D, shl, GR64>; + defm SLL : BinaryRSAndK<"sll", 0x89, 0xEBDF, shl, GR32>; + def SLLG : BinaryRSY<"sllg", 0xEB0D, shl, GR64>; } // Logical shift right. 
let neverHasSideEffects = 1 in { - defm SRL : ShiftRSAndK<"srl", 0x88, 0xEBDE, srl, GR32>; - def SRLG : ShiftRSY<"srlg", 0xEB0C, srl, GR64>; + defm SRL : BinaryRSAndK<"srl", 0x88, 0xEBDE, srl, GR32>; + def SRLG : BinaryRSY<"srlg", 0xEB0C, srl, GR64>; } // Arithmetic shift right. let Defs = [CC], CCValues = 0xE, CompareZeroCCMask = 0xE in { - defm SRA : ShiftRSAndK<"sra", 0x8A, 0xEBDC, sra, GR32>; - def SRAG : ShiftRSY<"srag", 0xEB0A, sra, GR64>; + defm SRA : BinaryRSAndK<"sra", 0x8A, 0xEBDC, sra, GR32>; + def SRAG : BinaryRSY<"srag", 0xEB0A, sra, GR64>; } // Rotate left. let neverHasSideEffects = 1 in { - def RLL : ShiftRSY<"rll", 0xEB1D, rotl, GR32>; - def RLLG : ShiftRSY<"rllg", 0xEB1C, rotl, GR64>; + def RLL : BinaryRSY<"rll", 0xEB1D, rotl, GR32>; + def RLLG : BinaryRSY<"rllg", 0xEB1C, rotl, GR64>; } // Rotate second operand left and inserted selected bits into first operand. @@ -1403,15 +1401,15 @@ def : Pat<(sub GR64:$src1, (azextloadi32 bdxaddr20only:$addr)), // Optimize sign-extended 1/0 selects to -1/0 selects. This is important // for vector legalization. -def : Pat<(sra (shl (i32 (z_select_ccmask 1, 0, uimm8zx4:$valid, uimm8zx4:$cc)), +def : Pat<(sra (shl (i32 (z_select_ccmask 1, 0, imm32zx4:$valid, imm32zx4:$cc)), (i32 31)), (i32 31)), - (Select32 (LHI -1), (LHI 0), uimm8zx4:$valid, uimm8zx4:$cc)>; -def : Pat<(sra (shl (i64 (anyext (i32 (z_select_ccmask 1, 0, uimm8zx4:$valid, - uimm8zx4:$cc)))), + (Select32 (LHI -1), (LHI 0), imm32zx4:$valid, imm32zx4:$cc)>; +def : Pat<(sra (shl (i64 (anyext (i32 (z_select_ccmask 1, 0, imm32zx4:$valid, + imm32zx4:$cc)))), (i32 63)), (i32 63)), - (Select64 (LGHI -1), (LGHI 0), uimm8zx4:$valid, uimm8zx4:$cc)>; + (Select64 (LGHI -1), (LGHI 0), imm32zx4:$valid, imm32zx4:$cc)>; // Peepholes for turning scalar operations into block operations. defm : BlockLoadStore; def S32Imm : ImmediateAsmOperand<"S32Imm">; def U32Imm : ImmediateAsmOperand<"U32Imm">; -//===----------------------------------------------------------------------===// -// 8-bit immediates -//===----------------------------------------------------------------------===// - -def uimm8zx4 : Immediate(N->getZExtValue()); -}], NOOP_SDNodeXForm, "U4Imm">; - -def uimm8zx6 : Immediate(N->getZExtValue()); -}], NOOP_SDNodeXForm, "U6Imm">; - -def simm8 : Immediate; -def uimm8 : Immediate; - //===----------------------------------------------------------------------===// // i32 immediates //===----------------------------------------------------------------------===// @@ -241,6 +226,14 @@ def imm32lh16c : Immediate; // Short immediates +def imm32zx4 : Immediate(N->getZExtValue()); +}], NOOP_SDNodeXForm, "U4Imm">; + +def imm32zx6 : Immediate(N->getZExtValue()); +}], NOOP_SDNodeXForm, "U6Imm">; + def imm32sx8 : Immediate(N->getSExtValue()); }], SIMM8, "S8Imm">; @@ -470,13 +463,13 @@ def AccessReg : AsmOperandClass { let Name = "AccessReg"; let ParserMethod = "parseAccessReg"; } -def access_reg : ImmediategetZExtValue() < 16; }], +def access_reg : ImmediategetZExtValue() < 16; }], NOOP_SDNodeXForm, "AccessReg"> { let ParserMatchClass = AccessReg; } // A 4-bit condition-code mask. 
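The imm32zx4/imm32zx6 operands that replace uimm8zx4/uimm8zx6 above accept an i32 immediate whose value fits in 4 or 6 bits, tested with isUInt<N> from llvm/Support/MathExtras.h. A self-contained stand-in (isUIntN here is a local re-creation, not the LLVM template):

#include <cassert>
#include <cstdint>

template <unsigned N> bool isUIntN(uint64_t Value) {
  return Value < (UINT64_C(1) << N);
}

int main() {
  assert(isUIntN<4>(15) && !isUIntN<4>(16));  // imm32zx4: 4-bit CC masks
  assert(isUIntN<6>(63) && !isUIntN<6>(64));  // imm32zx6: 6-bit rotate amounts
  return 0;
}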
-def cond4 : PatLeaf<(i8 imm), [{ return (N->getZExtValue() < 16); }]>, - Operand { +def cond4 : PatLeaf<(i32 imm), [{ return (N->getZExtValue() < 16); }]>, + Operand { let PrintMethod = "printCond4Operand"; } diff --git a/lib/Target/SystemZ/SystemZOperators.td b/lib/Target/SystemZ/SystemZOperators.td index a3919618f8f2..c70e662db427 100644 --- a/lib/Target/SystemZ/SystemZOperators.td +++ b/lib/Target/SystemZ/SystemZOperators.td @@ -19,14 +19,14 @@ def SDT_ZICmp : SDTypeProfile<0, 3, [SDTCisSameAs<0, 1>, SDTCisVT<2, i32>]>; def SDT_ZBRCCMask : SDTypeProfile<0, 3, - [SDTCisVT<0, i8>, - SDTCisVT<1, i8>, + [SDTCisVT<0, i32>, + SDTCisVT<1, i32>, SDTCisVT<2, OtherVT>]>; def SDT_ZSelectCCMask : SDTypeProfile<1, 4, [SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, - SDTCisVT<3, i8>, - SDTCisVT<4, i8>]>; + SDTCisVT<3, i32>, + SDTCisVT<4, i32>]>; def SDT_ZWrapPtr : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>; @@ -37,7 +37,7 @@ def SDT_ZWrapOffset : SDTypeProfile<1, 2, def SDT_ZAdjDynAlloc : SDTypeProfile<1, 0, [SDTCisVT<0, i64>]>; def SDT_ZExtractAccess : SDTypeProfile<1, 1, [SDTCisVT<0, i32>, - SDTCisVT<1, i8>]>; + SDTCisVT<1, i32>]>; def SDT_ZGR128Binary32 : SDTypeProfile<1, 2, [SDTCisVT<0, untyped>, SDTCisVT<1, untyped>, @@ -77,7 +77,7 @@ def SDT_ZString : SDTypeProfile<1, 3, SDTCisVT<3, i32>]>; def SDT_ZI32Intrinsic : SDTypeProfile<1, 0, [SDTCisVT<0, i32>]>; def SDT_ZPrefetch : SDTypeProfile<0, 2, - [SDTCisVT<0, i8>, + [SDTCisVT<0, i32>, SDTCisPtrTy<1>]>; //===----------------------------------------------------------------------===// diff --git a/lib/Target/SystemZ/SystemZPatterns.td b/lib/Target/SystemZ/SystemZPatterns.td index c0f94ecbe2c9..e307f8a888ee 100644 --- a/lib/Target/SystemZ/SystemZPatterns.td +++ b/lib/Target/SystemZ/SystemZPatterns.td @@ -101,15 +101,15 @@ multiclass CondStores64 { def : Pat<(store (z_select_ccmask GR64:$new, (load mode:$addr), - uimm8zx4:$valid, uimm8zx4:$cc), + imm32zx4:$valid, imm32zx4:$cc), mode:$addr), (insn (EXTRACT_SUBREG GR64:$new, subreg_l32), mode:$addr, - uimm8zx4:$valid, uimm8zx4:$cc)>; + imm32zx4:$valid, imm32zx4:$cc)>; def : Pat<(store (z_select_ccmask (load mode:$addr), GR64:$new, - uimm8zx4:$valid, uimm8zx4:$cc), + imm32zx4:$valid, imm32zx4:$cc), mode:$addr), (insninv (EXTRACT_SUBREG GR64:$new, subreg_l32), mode:$addr, - uimm8zx4:$valid, uimm8zx4:$cc)>; + imm32zx4:$valid, imm32zx4:$cc)>; } // Try to use MVC instruction INSN for a load of type LOAD followed by a store diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.cpp b/lib/Target/SystemZ/SystemZRegisterInfo.cpp index a04d703d09fa..f03bcc412d51 100644 --- a/lib/Target/SystemZ/SystemZRegisterInfo.cpp +++ b/lib/Target/SystemZ/SystemZRegisterInfo.cpp @@ -7,31 +7,29 @@ // //===----------------------------------------------------------------------===// +#include "SystemZInstrInfo.h" #include "SystemZRegisterInfo.h" -#include "SystemZTargetMachine.h" +#include "SystemZSubtarget.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Target/TargetFrameLowering.h" using namespace llvm; #define GET_REGINFO_TARGET_DESC #include "SystemZGenRegisterInfo.inc" -SystemZRegisterInfo::SystemZRegisterInfo(SystemZTargetMachine &tm) - : SystemZGenRegisterInfo(SystemZ::R14D), TM(tm) {} +SystemZRegisterInfo::SystemZRegisterInfo() + : SystemZGenRegisterInfo(SystemZ::R14D) {} -const MCPhysReg* +const MCPhysReg * SystemZRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { - static const MCPhysReg CalleeSavedRegs[] = { - SystemZ::R6D, 
SystemZ::R7D, SystemZ::R8D, SystemZ::R9D, - SystemZ::R10D, SystemZ::R11D, SystemZ::R12D, SystemZ::R13D, - SystemZ::R14D, SystemZ::R15D, - SystemZ::F8D, SystemZ::F9D, SystemZ::F10D, SystemZ::F11D, - SystemZ::F12D, SystemZ::F13D, SystemZ::F14D, SystemZ::F15D, - 0 - }; - - return CalleeSavedRegs; + return CSR_SystemZ_SaveList; +} + +const uint32_t * +SystemZRegisterInfo::getCallPreservedMask(CallingConv::ID CC) const { + return CSR_SystemZ_RegMask; } BitVector @@ -63,7 +61,8 @@ SystemZRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, MachineBasicBlock &MBB = *MI->getParent(); MachineFunction &MF = *MBB.getParent(); - auto *TII = static_cast(TM.getInstrInfo()); + auto *TII = + static_cast(MF.getTarget().getInstrInfo()); const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); DebugLoc DL = MI->getDebugLoc(); diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.h b/lib/Target/SystemZ/SystemZRegisterInfo.h index e236f712e7d8..9bffa467a15d 100644 --- a/lib/Target/SystemZ/SystemZRegisterInfo.h +++ b/lib/Target/SystemZ/SystemZRegisterInfo.h @@ -29,15 +29,9 @@ inline unsigned odd128(bool Is32bit) { } } // end namespace SystemZ -class SystemZSubtarget; -class SystemZInstrInfo; - struct SystemZRegisterInfo : public SystemZGenRegisterInfo { -private: - SystemZTargetMachine &TM; - public: - SystemZRegisterInfo(SystemZTargetMachine &tm); + SystemZRegisterInfo(); // Override TargetRegisterInfo.h. bool requiresRegisterScavenging(const MachineFunction &MF) const override { @@ -51,6 +45,7 @@ struct SystemZRegisterInfo : public SystemZGenRegisterInfo { } const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF = nullptr) const override; + const uint32_t *getCallPreservedMask(CallingConv::ID CC) const override; BitVector getReservedRegs(const MachineFunction &MF) const override; void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, unsigned FIOperandNum, diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.td b/lib/Target/SystemZ/SystemZRegisterInfo.td index 93d7c8375b3d..47ac20dae78a 100644 --- a/lib/Target/SystemZ/SystemZRegisterInfo.td +++ b/lib/Target/SystemZ/SystemZRegisterInfo.td @@ -119,6 +119,29 @@ defm ADDR128 : SystemZRegClass<"ADDR128", untyped, 128, (sub GR128Bit, R0Q)>; // Floating-point registers //===----------------------------------------------------------------------===// +// Maps FPR register numbers to their DWARF encoding. 
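The DwarfMapping records defined just below spell out s390x's interleaved DWARF numbering for the FPRs: F0/F2/F4/F6 map to 16-19, F1/F3/F5/F7 to 20-23, F8/F10/F12/F14 to 24-27, and F9/F11/F13/F15 to 28-31. The same table admits a closed form, shown here as a hypothetical C++ helper for cross-checking:

#include <cassert>

// I is the FPR number (0-15); the result is its DWARF register number.
static int fprDwarfId(int I) {
  return 16 + (I >= 8 ? 8 : 0) + (I & 1 ? 4 : 0) + (I % 8) / 2;
}

int main() {
  assert(fprDwarfId(0) == 16 && fprDwarfId(6) == 19);   // F0,F2,F4,F6
  assert(fprDwarfId(1) == 20 && fprDwarfId(7) == 23);   // F1,F3,F5,F7
  assert(fprDwarfId(8) == 24 && fprDwarfId(15) == 31);  // high halves
  return 0;
}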
+class DwarfMapping { int Id = id; } + +def F0Dwarf : DwarfMapping<16>; +def F2Dwarf : DwarfMapping<17>; +def F4Dwarf : DwarfMapping<18>; +def F6Dwarf : DwarfMapping<19>; + +def F1Dwarf : DwarfMapping<20>; +def F3Dwarf : DwarfMapping<21>; +def F5Dwarf : DwarfMapping<22>; +def F7Dwarf : DwarfMapping<23>; + +def F8Dwarf : DwarfMapping<24>; +def F10Dwarf : DwarfMapping<25>; +def F12Dwarf : DwarfMapping<26>; +def F14Dwarf : DwarfMapping<27>; + +def F9Dwarf : DwarfMapping<28>; +def F11Dwarf : DwarfMapping<29>; +def F13Dwarf : DwarfMapping<30>; +def F15Dwarf : DwarfMapping<31>; + // Lower 32 bits of one of the 16 64-bit floating-point registers class FPR32 num, string n> : SystemZReg { let HWEncoding = num; @@ -142,7 +165,7 @@ class FPR128 num, string n, FPR64 low, FPR64 high> foreach I = 0-15 in { def F#I#S : FPR32; def F#I#D : FPR64("F"#I#"S")>, - DwarfRegNum<[!add(I, 16)]>; + DwarfRegNum<[!cast("F"#I#"Dwarf").Id]>; } foreach I = [0, 1, 4, 5, 8, 9, 12, 13] in { diff --git a/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp b/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp index 528227bd3c0a..a3cba64b9ed2 100644 --- a/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp +++ b/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp @@ -18,10 +18,8 @@ using namespace llvm; #define DEBUG_TYPE "systemz-selectiondag-info" -SystemZSelectionDAGInfo:: -SystemZSelectionDAGInfo(const SystemZTargetMachine &TM) - : TargetSelectionDAGInfo(TM.getDataLayout()) { -} +SystemZSelectionDAGInfo::SystemZSelectionDAGInfo(const DataLayout &DL) + : TargetSelectionDAGInfo(&DL) {} SystemZSelectionDAGInfo::~SystemZSelectionDAGInfo() { } diff --git a/lib/Target/SystemZ/SystemZSelectionDAGInfo.h b/lib/Target/SystemZ/SystemZSelectionDAGInfo.h index 79e7fab20c18..e9de146af1d6 100644 --- a/lib/Target/SystemZ/SystemZSelectionDAGInfo.h +++ b/lib/Target/SystemZ/SystemZSelectionDAGInfo.h @@ -22,7 +22,7 @@ class SystemZTargetMachine; class SystemZSelectionDAGInfo : public TargetSelectionDAGInfo { public: - explicit SystemZSelectionDAGInfo(const SystemZTargetMachine &TM); + explicit SystemZSelectionDAGInfo(const DataLayout &DL); ~SystemZSelectionDAGInfo(); SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc DL, SDValue Chain, diff --git a/lib/Target/SystemZ/SystemZSubtarget.cpp b/lib/Target/SystemZ/SystemZSubtarget.cpp index a011157dcdfc..e160bc86f225 100644 --- a/lib/Target/SystemZ/SystemZSubtarget.cpp +++ b/lib/Target/SystemZ/SystemZSubtarget.cpp @@ -20,16 +20,11 @@ using namespace llvm; #define GET_SUBTARGETINFO_CTOR #include "SystemZGenSubtargetInfo.inc" -// Pin the vtabel to this file. +// Pin the vtable to this file. void SystemZSubtarget::anchor() {} -SystemZSubtarget::SystemZSubtarget(const std::string &TT, - const std::string &CPU, - const std::string &FS) - : SystemZGenSubtargetInfo(TT, CPU, FS), HasDistinctOps(false), - HasLoadStoreOnCond(false), HasHighWord(false), HasFPExtension(false), - HasFastSerialization(false), HasInterlockedAccess1(false), - TargetTriple(TT) { +SystemZSubtarget & +SystemZSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) { std::string CPUName = CPU; if (CPUName.empty()) CPUName = "generic"; @@ -37,11 +32,26 @@ SystemZSubtarget::SystemZSubtarget(const std::string &TT, if (CPUName == "generic") CPUName = sys::getHostCPUName(); #endif - // Parse features string. 
ParseSubtargetFeatures(CPUName, FS); + return *this; } +SystemZSubtarget::SystemZSubtarget(const std::string &TT, + const std::string &CPU, + const std::string &FS, + const TargetMachine &TM) + : SystemZGenSubtargetInfo(TT, CPU, FS), HasDistinctOps(false), + HasLoadStoreOnCond(false), HasHighWord(false), HasFPExtension(false), + HasFastSerialization(false), HasInterlockedAccess1(false), + TargetTriple(TT), + // Make sure that global data has at least 16 bits of alignment by + // default, so that we can refer to it using LARL. We don't have any + // special requirements for stack variables though. + DL("E-m:e-i1:8:16-i8:8:16-i64:64-f128:64-a:8:16-n32:64"), + InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM), + TSInfo(DL), FrameLowering() {} + // Return true if GV binds locally under reloc model RM. static bool bindsLocally(const GlobalValue *GV, Reloc::Model RM) { // For non-PIC, all symbols bind locally. diff --git a/lib/Target/SystemZ/SystemZSubtarget.h b/lib/Target/SystemZ/SystemZSubtarget.h index ffca2d8113a8..4e8c710bdefd 100644 --- a/lib/Target/SystemZ/SystemZSubtarget.h +++ b/lib/Target/SystemZ/SystemZSubtarget.h @@ -14,6 +14,12 @@ #ifndef SYSTEMZSUBTARGET_H #define SYSTEMZSUBTARGET_H +#include "SystemZFrameLowering.h" +#include "SystemZISelLowering.h" +#include "SystemZInstrInfo.h" +#include "SystemZRegisterInfo.h" +#include "SystemZSelectionDAGInfo.h" +#include "llvm/IR/DataLayout.h" #include "llvm/ADT/Triple.h" #include "llvm/Target/TargetSubtargetInfo.h" #include @@ -37,10 +43,26 @@ class SystemZSubtarget : public SystemZGenSubtargetInfo { private: Triple TargetTriple; - + const DataLayout DL; + SystemZInstrInfo InstrInfo; + SystemZTargetLowering TLInfo; + SystemZSelectionDAGInfo TSInfo; + SystemZFrameLowering FrameLowering; + + SystemZSubtarget &initializeSubtargetDependencies(StringRef CPU, + StringRef FS); public: SystemZSubtarget(const std::string &TT, const std::string &CPU, - const std::string &FS); + const std::string &FS, const TargetMachine &TM); + + const TargetFrameLowering *getFrameLowering() const { return &FrameLowering; } + const SystemZInstrInfo *getInstrInfo() const { return &InstrInfo; } + const DataLayout *getDataLayout() const { return &DL; } + const SystemZRegisterInfo *getRegisterInfo() const { + return &InstrInfo.getRegisterInfo(); + } + const SystemZTargetLowering *getTargetLowering() const { return &TLInfo; } + const TargetSelectionDAGInfo *getSelectionDAGInfo() const { return &TSInfo; } // This is important for reducing register pressure in vector code. bool useAA() const override { return true; } diff --git a/lib/Target/SystemZ/SystemZTargetMachine.cpp b/lib/Target/SystemZ/SystemZTargetMachine.cpp index 1fca067ad2e2..0122e99f8a77 100644 --- a/lib/Target/SystemZ/SystemZTargetMachine.cpp +++ b/lib/Target/SystemZ/SystemZTargetMachine.cpp @@ -22,17 +22,10 @@ extern "C" void LLVMInitializeSystemZTarget() { SystemZTargetMachine::SystemZTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, const TargetOptions &Options, - Reloc::Model RM, - CodeModel::Model CM, + Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) - : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), - Subtarget(TT, CPU, FS), - // Make sure that global data has at least 16 bits of alignment by default, - // so that we can refer to it using LARL. We don't have any special - // requirements for stack variables though. 
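The constructor above uses the initializeSubtargetDependencies idiom: members such as InstrInfo must not be constructed until the CPU/feature string has been parsed, so the parsing helper returns *this and is invoked inside the member initializer list, relying on declaration order to have the feature bits ready first. A minimal sketch under made-up names:

#include <string>

class Subtarget {
  bool HasDistinctOps = false;        // feature bits, initialized first

  Subtarget &initializeSubtargetDependencies(const std::string &FS) {
    HasDistinctOps = FS.find("+distinct-ops") != std::string::npos;
    return *this;                     // usable in the initializer list below
  }

  struct InstrInfoT {                 // stands in for SystemZInstrInfo
    explicit InstrInfoT(const Subtarget &) {}
  } InstrInfo;                        // declared after the feature bits

public:
  explicit Subtarget(const std::string &FS)
      : InstrInfo(initializeSubtargetDependencies(FS)) {}
};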
- DL("E-m:e-i1:8:16-i8:8:16-i64:64-f128:64-a:8:16-n32:64"), - InstrInfo(*this), TLInfo(*this), TSInfo(*this), - FrameLowering(*this, Subtarget) { + : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), + Subtarget(TT, CPU, FS, *this) { initAsmInfo(); } diff --git a/lib/Target/SystemZ/SystemZTargetMachine.h b/lib/Target/SystemZ/SystemZTargetMachine.h index 1db717b7126d..ded07e912443 100644 --- a/lib/Target/SystemZ/SystemZTargetMachine.h +++ b/lib/Target/SystemZ/SystemZTargetMachine.h @@ -15,25 +15,15 @@ #ifndef SYSTEMZTARGETMACHINE_H #define SYSTEMZTARGETMACHINE_H -#include "SystemZFrameLowering.h" -#include "SystemZISelLowering.h" -#include "SystemZInstrInfo.h" -#include "SystemZRegisterInfo.h" -#include "SystemZSelectionDAGInfo.h" #include "SystemZSubtarget.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/Target/TargetFrameLowering.h" #include "llvm/Target/TargetMachine.h" namespace llvm { +class TargetFrameLowering; + class SystemZTargetMachine : public LLVMTargetMachine { SystemZSubtarget Subtarget; - const DataLayout DL; - SystemZInstrInfo InstrInfo; - SystemZTargetLowering TLInfo; - SystemZSelectionDAGInfo TSInfo; - SystemZFrameLowering FrameLowering; public: SystemZTargetMachine(const Target &T, StringRef TT, StringRef CPU, @@ -43,25 +33,25 @@ class SystemZTargetMachine : public LLVMTargetMachine { // Override TargetMachine. const TargetFrameLowering *getFrameLowering() const override { - return &FrameLowering; + return getSubtargetImpl()->getFrameLowering(); } const SystemZInstrInfo *getInstrInfo() const override { - return &InstrInfo; + return getSubtargetImpl()->getInstrInfo(); } const SystemZSubtarget *getSubtargetImpl() const override { return &Subtarget; } const DataLayout *getDataLayout() const override { - return &DL; + return getSubtargetImpl()->getDataLayout(); } const SystemZRegisterInfo *getRegisterInfo() const override { - return &InstrInfo.getRegisterInfo(); + return getSubtargetImpl()->getRegisterInfo(); } const SystemZTargetLowering *getTargetLowering() const override { - return &TLInfo; + return getSubtargetImpl()->getTargetLowering(); } const TargetSelectionDAGInfo *getSelectionDAGInfo() const override { - return &TSInfo; + return getSubtargetImpl()->getSelectionDAGInfo(); } // Override LLVMTargetMachine diff --git a/lib/Target/TargetLoweringObjectFile.cpp b/lib/Target/TargetLoweringObjectFile.cpp index 39e045919ab0..2569e922641d 100644 --- a/lib/Target/TargetLoweringObjectFile.cpp +++ b/lib/Target/TargetLoweringObjectFile.cpp @@ -297,7 +297,8 @@ TargetLoweringObjectFile::SelectSectionForGlobal(const GlobalValue *GV, /// specified size and relocation information, return a section that it /// should be placed in. 
const MCSection * -TargetLoweringObjectFile::getSectionForConstant(SectionKind Kind) const { +TargetLoweringObjectFile::getSectionForConstant(SectionKind Kind, + const Constant *C) const { if (Kind.isReadOnly() && ReadOnlySection != nullptr) return ReadOnlySection; diff --git a/lib/Target/TargetSubtargetInfo.cpp b/lib/Target/TargetSubtargetInfo.cpp index 0c388f8fb26c..386a813b057f 100644 --- a/lib/Target/TargetSubtargetInfo.cpp +++ b/lib/Target/TargetSubtargetInfo.cpp @@ -39,21 +39,21 @@ bool TargetSubtargetInfo::useMachineScheduler() const { return enableMachineScheduler(); } +bool TargetSubtargetInfo::enableAtomicExpandLoadLinked() const { + return true; +} + bool TargetSubtargetInfo::enableMachineScheduler() const { return false; } -bool TargetSubtargetInfo::enablePostMachineScheduler() const { - return false; +bool TargetSubtargetInfo::enableRALocalReassignment( + CodeGenOpt::Level OptLevel) const { + return true; } -bool TargetSubtargetInfo::enablePostRAScheduler( - CodeGenOpt::Level OptLevel, - AntiDepBreakMode& Mode, - RegClassVector& CriticalPathRCs) const { - Mode = ANTIDEP_NONE; - CriticalPathRCs.clear(); - return false; +bool TargetSubtargetInfo::enablePostMachineScheduler() const { + return getSchedModel()->PostRAScheduler; } bool TargetSubtargetInfo::useAA() const { diff --git a/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp b/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp index 623479361c29..a365f62190d2 100644 --- a/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp +++ b/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp @@ -37,8 +37,8 @@ bool IsStackReg(unsigned Reg) { } std::string FuncName(unsigned AccessSize, bool IsWrite) { - return std::string("__sanitizer_sanitize_") + (IsWrite ? "store" : "load") + - (utostr(AccessSize)); + return std::string("__asan_report_") + (IsWrite ? "store" : "load") + + utostr(AccessSize); } class X86AddressSanitizer : public X86AsmInstrumentation { @@ -47,17 +47,19 @@ class X86AddressSanitizer : public X86AsmInstrumentation { virtual ~X86AddressSanitizer() {} // X86AsmInstrumentation implementation: - virtual void InstrumentInstruction(const MCInst &Inst, - OperandVector &Operands, MCContext &Ctx, - const MCInstrInfo &MII, - MCStreamer &Out) override { + virtual void InstrumentInstruction( + const MCInst &Inst, OperandVector &Operands, MCContext &Ctx, + const MCInstrInfo &MII, MCStreamer &Out) override { InstrumentMOV(Inst, Operands, Ctx, MII, Out); } // Should be implemented differently in x86_32 and x86_64 subclasses. 
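The FuncName change above switches the assembly instrumentation from a private "__sanitizer_sanitize_" callback to the standard ASan runtime entry points, e.g. __asan_report_load4 and __asan_report_store8. A standalone re-creation of that mangling (std::to_string replaces LLVM's utostr):

#include <cassert>
#include <string>

static std::string asanReportName(unsigned AccessSize, bool IsWrite) {
  return std::string("__asan_report_") + (IsWrite ? "store" : "load") +
         std::to_string(AccessSize);
}

int main() {
  assert(asanReportName(4, false) == "__asan_report_load4");
  assert(asanReportName(8, true) == "__asan_report_store8");
  return 0;
}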
- virtual void InstrumentMemOperandImpl(X86Operand &Op, unsigned AccessSize, - bool IsWrite, MCContext &Ctx, - MCStreamer &Out) = 0; + virtual void InstrumentMemOperandSmallImpl( + X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx, + MCStreamer &Out) = 0; + virtual void InstrumentMemOperandLargeImpl( + X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx, + MCStreamer &Out) = 0; void InstrumentMemOperand(MCParsedAsmOperand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx, MCStreamer &Out); @@ -67,14 +69,15 @@ class X86AddressSanitizer : public X86AsmInstrumentation { Out.EmitInstruction(Inst, STI); } + void EmitLabel(MCStreamer &Out, MCSymbol *Label) { Out.EmitLabel(Label); } + protected: const MCSubtargetInfo &STI; }; -void X86AddressSanitizer::InstrumentMemOperand(MCParsedAsmOperand &Op, - unsigned AccessSize, - bool IsWrite, MCContext &Ctx, - MCStreamer &Out) { +void X86AddressSanitizer::InstrumentMemOperand( + MCParsedAsmOperand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx, + MCStreamer &Out) { assert(Op.isMem() && "Op should be a memory operand."); assert((AccessSize & (AccessSize - 1)) == 0 && AccessSize <= 16 && "AccessSize should be a power of two, less or equal than 16."); @@ -84,13 +87,16 @@ void X86AddressSanitizer::InstrumentMemOperand(MCParsedAsmOperand &Op, if (IsStackReg(MemOp.getMemBaseReg()) || IsStackReg(MemOp.getMemIndexReg())) return; - InstrumentMemOperandImpl(MemOp, AccessSize, IsWrite, Ctx, Out); + // FIXME: take into account load/store alignment. + if (AccessSize < 8) + InstrumentMemOperandSmallImpl(MemOp, AccessSize, IsWrite, Ctx, Out); + else + InstrumentMemOperandLargeImpl(MemOp, AccessSize, IsWrite, Ctx, Out); } -void X86AddressSanitizer::InstrumentMOV(const MCInst &Inst, - OperandVector &Operands, MCContext &Ctx, - const MCInstrInfo &MII, - MCStreamer &Out) { +void X86AddressSanitizer::InstrumentMOV( + const MCInst &Inst, OperandVector &Operands, MCContext &Ctx, + const MCInstrInfo &MII, MCStreamer &Out) { // Access size in bytes. 
unsigned AccessSize = 0; @@ -136,22 +142,46 @@ void X86AddressSanitizer::InstrumentMOV(const MCInst &Inst, class X86AddressSanitizer32 : public X86AddressSanitizer { public: + static const long kShadowOffset = 0x20000000; + X86AddressSanitizer32(const MCSubtargetInfo &STI) : X86AddressSanitizer(STI) {} virtual ~X86AddressSanitizer32() {} - virtual void InstrumentMemOperandImpl(X86Operand &Op, unsigned AccessSize, - bool IsWrite, MCContext &Ctx, - MCStreamer &Out) override; + virtual void InstrumentMemOperandSmallImpl( + X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx, + MCStreamer &Out) override; + virtual void InstrumentMemOperandLargeImpl( + X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx, + MCStreamer &Out) override; + + private: + void EmitCallAsanReport(MCContext &Ctx, MCStreamer &Out, unsigned AccessSize, + bool IsWrite, unsigned AddressReg) { + EmitInstruction(Out, MCInstBuilder(X86::CLD)); + EmitInstruction(Out, MCInstBuilder(X86::MMX_EMMS)); + + EmitInstruction(Out, MCInstBuilder(X86::AND64ri8).addReg(X86::ESP) + .addReg(X86::ESP).addImm(-16)); + EmitInstruction(Out, MCInstBuilder(X86::PUSH32r).addReg(AddressReg)); + + + const std::string& Fn = FuncName(AccessSize, IsWrite); + MCSymbol *FnSym = Ctx.GetOrCreateSymbol(StringRef(Fn)); + const MCSymbolRefExpr *FnExpr = + MCSymbolRefExpr::Create(FnSym, MCSymbolRefExpr::VK_PLT, Ctx); + EmitInstruction(Out, MCInstBuilder(X86::CALLpcrel32).addExpr(FnExpr)); + } }; -void X86AddressSanitizer32::InstrumentMemOperandImpl(X86Operand &Op, - unsigned AccessSize, - bool IsWrite, - MCContext &Ctx, - MCStreamer &Out) { - // FIXME: emit .cfi directives for correct stack unwinding. +void X86AddressSanitizer32::InstrumentMemOperandSmallImpl( + X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx, + MCStreamer &Out) { EmitInstruction(Out, MCInstBuilder(X86::PUSH32r).addReg(X86::EAX)); + EmitInstruction(Out, MCInstBuilder(X86::PUSH32r).addReg(X86::ECX)); + EmitInstruction(Out, MCInstBuilder(X86::PUSH32r).addReg(X86::EDX)); + EmitInstruction(Out, MCInstBuilder(X86::PUSHF32)); + { MCInst Inst; Inst.setOpcode(X86::LEA32r); @@ -159,50 +189,175 @@ void X86AddressSanitizer32::InstrumentMemOperandImpl(X86Operand &Op, Op.addMemOperands(Inst, 5); EmitInstruction(Out, Inst); } - EmitInstruction(Out, MCInstBuilder(X86::PUSH32r).addReg(X86::EAX)); + + EmitInstruction( + Out, MCInstBuilder(X86::MOV32rr).addReg(X86::ECX).addReg(X86::EAX)); + EmitInstruction(Out, MCInstBuilder(X86::SHR32ri).addReg(X86::ECX) + .addReg(X86::ECX).addImm(3)); + { - const std::string Func = FuncName(AccessSize, IsWrite); - const MCSymbol *FuncSym = Ctx.GetOrCreateSymbol(StringRef(Func)); - const MCSymbolRefExpr *FuncExpr = - MCSymbolRefExpr::Create(FuncSym, MCSymbolRefExpr::VK_PLT, Ctx); - EmitInstruction(Out, MCInstBuilder(X86::CALLpcrel32).addExpr(FuncExpr)); + MCInst Inst; + Inst.setOpcode(X86::MOV8rm); + Inst.addOperand(MCOperand::CreateReg(X86::CL)); + const MCExpr *Disp = MCConstantExpr::Create(kShadowOffset, Ctx); + std::unique_ptr Op( + X86Operand::CreateMem(0, Disp, X86::ECX, 0, 1, SMLoc(), SMLoc())); + Op->addMemOperands(Inst, 5); + EmitInstruction(Out, Inst); + } + + EmitInstruction(Out, + MCInstBuilder(X86::TEST8rr).addReg(X86::CL).addReg(X86::CL)); + MCSymbol *DoneSym = Ctx.CreateTempSymbol(); + const MCExpr *DoneExpr = MCSymbolRefExpr::Create(DoneSym, Ctx); + EmitInstruction(Out, MCInstBuilder(X86::JE_4).addExpr(DoneExpr)); + + EmitInstruction( + Out, MCInstBuilder(X86::MOV32rr).addReg(X86::EDX).addReg(X86::EAX)); + 
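A C++ model of the check that the emitted 32-bit "small access" sequence performs (this mirrors the MOV8rm/TEST8rr/AND32ri/CMP32rr/JL_4 instructions below; it is a description of the logic, not code meant to run outside an ASan-mapped process). Each shadow byte covers an 8-byte granule, and a nonzero shadow value K means only the first K bytes of the granule are addressable:

#include <cstdint>

static const uintptr_t kShadowOffset = 0x20000000;  // 32-bit mapping above

static bool addressIsPoisoned(uintptr_t Addr, unsigned AccessSize) {
  int8_t Shadow = *reinterpret_cast<int8_t *>((Addr >> 3) + kShadowOffset);
  if (Shadow == 0)
    return false;                                // whole granule valid
  // Offset of the last byte touched within its 8-byte granule.
  int8_t Last = static_cast<int8_t>((Addr & 7) + AccessSize - 1);
  return Last >= Shadow;                         // the CMP32rr/JL_4 pair
}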
EmitInstruction(Out, MCInstBuilder(X86::AND32ri).addReg(X86::EDX) + .addReg(X86::EDX).addImm(7)); + + switch (AccessSize) { + case 1: + break; + case 2: { + MCInst Inst; + Inst.setOpcode(X86::LEA32r); + Inst.addOperand(MCOperand::CreateReg(X86::EDX)); + + const MCExpr *Disp = MCConstantExpr::Create(1, Ctx); + std::unique_ptr Op( + X86Operand::CreateMem(0, Disp, X86::EDX, 0, 1, SMLoc(), SMLoc())); + Op->addMemOperands(Inst, 5); + EmitInstruction(Out, Inst); + break; } + case 4: + EmitInstruction(Out, MCInstBuilder(X86::ADD32ri8).addReg(X86::EDX) + .addReg(X86::EDX).addImm(3)); + break; + default: + assert(false && "Incorrect access size"); + break; + } + + EmitInstruction( + Out, MCInstBuilder(X86::MOVSX32rr8).addReg(X86::ECX).addReg(X86::CL)); + EmitInstruction( + Out, MCInstBuilder(X86::CMP32rr).addReg(X86::EDX).addReg(X86::ECX)); + EmitInstruction(Out, MCInstBuilder(X86::JL_4).addExpr(DoneExpr)); + + EmitCallAsanReport(Ctx, Out, AccessSize, IsWrite, X86::EAX); + EmitLabel(Out, DoneSym); + + EmitInstruction(Out, MCInstBuilder(X86::POPF32)); + EmitInstruction(Out, MCInstBuilder(X86::POP32r).addReg(X86::EDX)); + EmitInstruction(Out, MCInstBuilder(X86::POP32r).addReg(X86::ECX)); EmitInstruction(Out, MCInstBuilder(X86::POP32r).addReg(X86::EAX)); +} + +void X86AddressSanitizer32::InstrumentMemOperandLargeImpl( + X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx, + MCStreamer &Out) { + EmitInstruction(Out, MCInstBuilder(X86::PUSH32r).addReg(X86::EAX)); + EmitInstruction(Out, MCInstBuilder(X86::PUSH32r).addReg(X86::ECX)); + EmitInstruction(Out, MCInstBuilder(X86::PUSHF32)); + + { + MCInst Inst; + Inst.setOpcode(X86::LEA32r); + Inst.addOperand(MCOperand::CreateReg(X86::EAX)); + Op.addMemOperands(Inst, 5); + EmitInstruction(Out, Inst); + } + EmitInstruction( + Out, MCInstBuilder(X86::MOV32rr).addReg(X86::ECX).addReg(X86::EAX)); + EmitInstruction(Out, MCInstBuilder(X86::SHR32ri).addReg(X86::ECX) + .addReg(X86::ECX).addImm(3)); + { + MCInst Inst; + switch (AccessSize) { + case 8: + Inst.setOpcode(X86::CMP8mi); + break; + case 16: + Inst.setOpcode(X86::CMP16mi); + break; + default: + assert(false && "Incorrect access size"); + break; + } + const MCExpr *Disp = MCConstantExpr::Create(kShadowOffset, Ctx); + std::unique_ptr Op( + X86Operand::CreateMem(0, Disp, X86::ECX, 0, 1, SMLoc(), SMLoc())); + Op->addMemOperands(Inst, 5); + Inst.addOperand(MCOperand::CreateImm(0)); + EmitInstruction(Out, Inst); + } + MCSymbol *DoneSym = Ctx.CreateTempSymbol(); + const MCExpr *DoneExpr = MCSymbolRefExpr::Create(DoneSym, Ctx); + EmitInstruction(Out, MCInstBuilder(X86::JE_4).addExpr(DoneExpr)); + + EmitCallAsanReport(Ctx, Out, AccessSize, IsWrite, X86::EAX); + EmitLabel(Out, DoneSym); + + EmitInstruction(Out, MCInstBuilder(X86::POPF32)); + EmitInstruction(Out, MCInstBuilder(X86::POP32r).addReg(X86::ECX)); EmitInstruction(Out, MCInstBuilder(X86::POP32r).addReg(X86::EAX)); } class X86AddressSanitizer64 : public X86AddressSanitizer { public: + static const long kShadowOffset = 0x7fff8000; + X86AddressSanitizer64(const MCSubtargetInfo &STI) : X86AddressSanitizer(STI) {} virtual ~X86AddressSanitizer64() {} - virtual void InstrumentMemOperandImpl(X86Operand &Op, unsigned AccessSize, - bool IsWrite, MCContext &Ctx, - MCStreamer &Out) override; -}; - -void X86AddressSanitizer64::InstrumentMemOperandImpl(X86Operand &Op, - unsigned AccessSize, - bool IsWrite, - MCContext &Ctx, - MCStreamer &Out) { - // FIXME: emit .cfi directives for correct stack unwinding. 
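For the "large" (8- and 16-byte) path, the granule-tail arithmetic is unnecessary: an 8-byte access is valid only if its whole shadow byte is zero, and a 16-byte access only if both of its shadow bytes are zero, which is why the sequence compares one or two shadow bytes directly against an immediate 0 (CMP8mi/CMP16mi). The same logic as a C++ model, under the 32-bit shadow offset used above:

#include <cstdint>

static const uintptr_t kShadowOffset = 0x20000000;  // 32-bit mapping

static bool largeAccessIsPoisoned(uintptr_t Addr, unsigned AccessSize) {
  uintptr_t ShadowAddr = (Addr >> 3) + kShadowOffset;
  if (AccessSize == 8)                  // one shadow byte covers the access
    return *reinterpret_cast<int8_t *>(ShadowAddr) != 0;   // CMP8mi, 0
  return *reinterpret_cast<int16_t *>(ShadowAddr) != 0;    // CMP16mi, 0
}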
+ virtual void InstrumentMemOperandSmallImpl( + X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx, + MCStreamer &Out) override; + virtual void InstrumentMemOperandLargeImpl( + X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx, + MCStreamer &Out) override; - // Set %rsp below current red zone (128 bytes wide) using LEA instruction to - // preserve flags. - { +private: + void EmitAdjustRSP(MCContext &Ctx, MCStreamer &Out, long Offset) { MCInst Inst; Inst.setOpcode(X86::LEA64r); Inst.addOperand(MCOperand::CreateReg(X86::RSP)); - const MCExpr *Disp = MCConstantExpr::Create(-128, Ctx); + const MCExpr *Disp = MCConstantExpr::Create(Offset, Ctx); std::unique_ptr Op( X86Operand::CreateMem(0, Disp, X86::RSP, 0, 1, SMLoc(), SMLoc())); Op->addMemOperands(Inst, 5); EmitInstruction(Out, Inst); } + + void EmitCallAsanReport(MCContext &Ctx, MCStreamer &Out, unsigned AccessSize, + bool IsWrite) { + EmitInstruction(Out, MCInstBuilder(X86::CLD)); + EmitInstruction(Out, MCInstBuilder(X86::MMX_EMMS)); + + EmitInstruction(Out, MCInstBuilder(X86::AND64ri8).addReg(X86::RSP) + .addReg(X86::RSP).addImm(-16)); + + const std::string& Fn = FuncName(AccessSize, IsWrite); + MCSymbol *FnSym = Ctx.GetOrCreateSymbol(StringRef(Fn)); + const MCSymbolRefExpr *FnExpr = + MCSymbolRefExpr::Create(FnSym, MCSymbolRefExpr::VK_PLT, Ctx); + EmitInstruction(Out, MCInstBuilder(X86::CALL64pcrel32).addExpr(FnExpr)); + } +}; + +void X86AddressSanitizer64::InstrumentMemOperandSmallImpl( + X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx, + MCStreamer &Out) { + EmitAdjustRSP(Ctx, Out, -128); + EmitInstruction(Out, MCInstBuilder(X86::PUSH64r).addReg(X86::RAX)); + EmitInstruction(Out, MCInstBuilder(X86::PUSH64r).addReg(X86::RCX)); EmitInstruction(Out, MCInstBuilder(X86::PUSH64r).addReg(X86::RDI)); + EmitInstruction(Out, MCInstBuilder(X86::PUSHF64)); { MCInst Inst; Inst.setOpcode(X86::LEA64r); @@ -210,27 +365,119 @@ void X86AddressSanitizer64::InstrumentMemOperandImpl(X86Operand &Op, Op.addMemOperands(Inst, 5); EmitInstruction(Out, Inst); } + EmitInstruction( + Out, MCInstBuilder(X86::MOV64rr).addReg(X86::RAX).addReg(X86::RDI)); + EmitInstruction(Out, MCInstBuilder(X86::SHR64ri).addReg(X86::RAX) + .addReg(X86::RAX).addImm(3)); { - const std::string Func = FuncName(AccessSize, IsWrite); - const MCSymbol *FuncSym = Ctx.GetOrCreateSymbol(StringRef(Func)); - const MCSymbolRefExpr *FuncExpr = - MCSymbolRefExpr::Create(FuncSym, MCSymbolRefExpr::VK_PLT, Ctx); - EmitInstruction(Out, MCInstBuilder(X86::CALL64pcrel32).addExpr(FuncExpr)); + MCInst Inst; + Inst.setOpcode(X86::MOV8rm); + Inst.addOperand(MCOperand::CreateReg(X86::AL)); + const MCExpr *Disp = MCConstantExpr::Create(kShadowOffset, Ctx); + std::unique_ptr Op( + X86Operand::CreateMem(0, Disp, X86::RAX, 0, 1, SMLoc(), SMLoc())); + Op->addMemOperands(Inst, 5); + EmitInstruction(Out, Inst); } + + EmitInstruction(Out, + MCInstBuilder(X86::TEST8rr).addReg(X86::AL).addReg(X86::AL)); + MCSymbol *DoneSym = Ctx.CreateTempSymbol(); + const MCExpr *DoneExpr = MCSymbolRefExpr::Create(DoneSym, Ctx); + EmitInstruction(Out, MCInstBuilder(X86::JE_4).addExpr(DoneExpr)); + + EmitInstruction( + Out, MCInstBuilder(X86::MOV32rr).addReg(X86::ECX).addReg(X86::EDI)); + EmitInstruction(Out, MCInstBuilder(X86::AND32ri).addReg(X86::ECX) + .addReg(X86::ECX).addImm(7)); + + switch (AccessSize) { + case 1: + break; + case 2: { + MCInst Inst; + Inst.setOpcode(X86::LEA32r); + Inst.addOperand(MCOperand::CreateReg(X86::ECX)); + + const MCExpr *Disp = MCConstantExpr::Create(1, 
Ctx); + std::unique_ptr Op( + X86Operand::CreateMem(0, Disp, X86::ECX, 0, 1, SMLoc(), SMLoc())); + Op->addMemOperands(Inst, 5); + EmitInstruction(Out, Inst); + break; + } + case 4: + EmitInstruction(Out, MCInstBuilder(X86::ADD32ri8).addReg(X86::ECX) + .addReg(X86::ECX).addImm(3)); + break; + default: + assert(false && "Incorrect access size"); + break; + } + + EmitInstruction( + Out, MCInstBuilder(X86::MOVSX32rr8).addReg(X86::EAX).addReg(X86::AL)); + EmitInstruction( + Out, MCInstBuilder(X86::CMP32rr).addReg(X86::ECX).addReg(X86::EAX)); + EmitInstruction(Out, MCInstBuilder(X86::JL_4).addExpr(DoneExpr)); + + EmitCallAsanReport(Ctx, Out, AccessSize, IsWrite); + EmitLabel(Out, DoneSym); + + EmitInstruction(Out, MCInstBuilder(X86::POPF64)); EmitInstruction(Out, MCInstBuilder(X86::POP64r).addReg(X86::RDI)); + EmitInstruction(Out, MCInstBuilder(X86::POP64r).addReg(X86::RCX)); + EmitInstruction(Out, MCInstBuilder(X86::POP64r).addReg(X86::RAX)); + EmitAdjustRSP(Ctx, Out, 128); +} + +void X86AddressSanitizer64::InstrumentMemOperandLargeImpl( + X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx, + MCStreamer &Out) { + EmitAdjustRSP(Ctx, Out, -128); + EmitInstruction(Out, MCInstBuilder(X86::PUSH64r).addReg(X86::RAX)); + EmitInstruction(Out, MCInstBuilder(X86::PUSHF64)); - // Restore old %rsp value. { MCInst Inst; Inst.setOpcode(X86::LEA64r); - Inst.addOperand(MCOperand::CreateReg(X86::RSP)); - - const MCExpr *Disp = MCConstantExpr::Create(128, Ctx); + Inst.addOperand(MCOperand::CreateReg(X86::RAX)); + Op.addMemOperands(Inst, 5); + EmitInstruction(Out, Inst); + } + EmitInstruction(Out, MCInstBuilder(X86::SHR64ri).addReg(X86::RAX) + .addReg(X86::RAX).addImm(3)); + { + MCInst Inst; + switch (AccessSize) { + case 8: + Inst.setOpcode(X86::CMP8mi); + break; + case 16: + Inst.setOpcode(X86::CMP16mi); + break; + default: + assert(false && "Incorrect access size"); + break; + } + const MCExpr *Disp = MCConstantExpr::Create(kShadowOffset, Ctx); std::unique_ptr Op( - X86Operand::CreateMem(0, Disp, X86::RSP, 0, 1, SMLoc(), SMLoc())); + X86Operand::CreateMem(0, Disp, X86::RAX, 0, 1, SMLoc(), SMLoc())); Op->addMemOperands(Inst, 5); + Inst.addOperand(MCOperand::CreateImm(0)); EmitInstruction(Out, Inst); } + + MCSymbol *DoneSym = Ctx.CreateTempSymbol(); + const MCExpr *DoneExpr = MCSymbolRefExpr::Create(DoneSym, Ctx); + EmitInstruction(Out, MCInstBuilder(X86::JE_4).addExpr(DoneExpr)); + + EmitCallAsanReport(Ctx, Out, AccessSize, IsWrite); + EmitLabel(Out, DoneSym); + + EmitInstruction(Out, MCInstBuilder(X86::POPF64)); + EmitInstruction(Out, MCInstBuilder(X86::POP64r).addReg(X86::RAX)); + EmitAdjustRSP(Ctx, Out, 128); } } // End anonymous namespace @@ -238,11 +485,9 @@ void X86AddressSanitizer64::InstrumentMemOperandImpl(X86Operand &Op, X86AsmInstrumentation::X86AsmInstrumentation() {} X86AsmInstrumentation::~X86AsmInstrumentation() {} -void X86AsmInstrumentation::InstrumentInstruction(const MCInst &Inst, - OperandVector &Operands, - MCContext &Ctx, - const MCInstrInfo &MII, - MCStreamer &Out) {} +void X86AsmInstrumentation::InstrumentInstruction( + const MCInst &Inst, OperandVector &Operands, MCContext &Ctx, + const MCInstrInfo &MII, MCStreamer &Out) {} X86AsmInstrumentation * CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions, diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp index b30eeeb0e037..a11a238fc976 100644 --- a/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -235,6 +235,7 @@ class 
X86AsmParser : public MCTargetAsmParser { IES_RSHIFT, IES_PLUS, IES_MINUS, + IES_NOT, IES_MULTIPLY, IES_DIVIDE, IES_LBRAC, @@ -372,6 +373,7 @@ class X86AsmParser : public MCTargetAsmParser { State = IES_ERROR; break; case IES_PLUS: + case IES_NOT: case IES_MULTIPLY: case IES_DIVIDE: case IES_LPAREN: @@ -401,6 +403,19 @@ class X86AsmParser : public MCTargetAsmParser { } PrevState = CurrState; } + void onNot() { + IntelExprState CurrState = State; + switch (State) { + default: + State = IES_ERROR; + break; + case IES_PLUS: + case IES_NOT: + State = IES_NOT; + break; + } + PrevState = CurrState; + } void onRegister(unsigned Reg) { IntelExprState CurrState = State; switch (State) { @@ -438,6 +453,7 @@ class X86AsmParser : public MCTargetAsmParser { break; case IES_PLUS: case IES_MINUS: + case IES_NOT: State = IES_INTEGER; Sym = SymRef; SymName = SymRefName; @@ -453,6 +469,7 @@ class X86AsmParser : public MCTargetAsmParser { break; case IES_PLUS: case IES_MINUS: + case IES_NOT: case IES_OR: case IES_AND: case IES_LSHIFT: @@ -476,11 +493,22 @@ class X86AsmParser : public MCTargetAsmParser { PrevState == IES_OR || PrevState == IES_AND || PrevState == IES_LSHIFT || PrevState == IES_RSHIFT || PrevState == IES_MULTIPLY || PrevState == IES_DIVIDE || - PrevState == IES_LPAREN || PrevState == IES_LBRAC) && + PrevState == IES_LPAREN || PrevState == IES_LBRAC || + PrevState == IES_NOT) && CurrState == IES_MINUS) { // Unary minus. No need to pop the minus operand because it was never // pushed. IC.pushOperand(IC_IMM, -TmpInt); // Push -Imm. + } else if ((PrevState == IES_PLUS || PrevState == IES_MINUS || + PrevState == IES_OR || PrevState == IES_AND || + PrevState == IES_LSHIFT || PrevState == IES_RSHIFT || + PrevState == IES_MULTIPLY || PrevState == IES_DIVIDE || + PrevState == IES_LPAREN || PrevState == IES_LBRAC || + PrevState == IES_NOT) && + CurrState == IES_NOT) { + // Unary not. No need to pop the not operand because it was never + // pushed. + IC.pushOperand(IC_IMM, ~TmpInt); // Push ~Imm. } else { IC.pushOperand(IC_IMM, TmpInt); } @@ -561,6 +589,7 @@ class X86AsmParser : public MCTargetAsmParser { break; case IES_PLUS: case IES_MINUS: + case IES_NOT: case IES_OR: case IES_AND: case IES_LSHIFT: @@ -568,13 +597,14 @@ class X86AsmParser : public MCTargetAsmParser { case IES_MULTIPLY: case IES_DIVIDE: case IES_LPAREN: - // FIXME: We don't handle this type of unary minus, yet. + // FIXME: We don't handle this type of unary minus or not, yet. if ((PrevState == IES_PLUS || PrevState == IES_MINUS || PrevState == IES_OR || PrevState == IES_AND || PrevState == IES_LSHIFT || PrevState == IES_RSHIFT || PrevState == IES_MULTIPLY || PrevState == IES_DIVIDE || - PrevState == IES_LPAREN || PrevState == IES_LBRAC) && - CurrState == IES_MINUS) { + PrevState == IES_LPAREN || PrevState == IES_LBRAC || + PrevState == IES_NOT) && + (CurrState == IES_MINUS || CurrState == IES_NOT)) { State = IES_ERROR; break; } @@ -666,6 +696,8 @@ class X86AsmParser : public MCTargetAsmParser { unsigned &ErrorInfo, bool MatchingInlineAsm) override; + virtual bool OmitRegisterFromClobberLists(unsigned RegNo) override; + /// doSrcDstMatch - Returns true if operands are matching in their /// word size (%si and %di, %esi and %edi, etc.). Order depends on /// the parsing mode (Intel vs. AT&T). 
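The IES_NOT state added above lets the Intel-syntax expression evaluator handle a unary "~" the same way it already handles unary minus: the operand is never pushed, so the folded value ~Imm is pushed instead. A tiny standalone analogue of that folding (foldUnary is a hypothetical helper):

#include <cassert>
#include <cstdint>

static int64_t foldUnary(char Op, int64_t Imm) {
  switch (Op) {
  case '-': return -Imm;   // unary minus: push -Imm
  case '~': return ~Imm;   // unary not:   push ~Imm
  default:  return Imm;
  }
}

int main() {
  assert(foldUnary('~', 15) == ~INT64_C(15));  // e.g. "mov eax, ~15"
  return 0;
}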
@@ -1061,7 +1093,8 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { if (SM.getStopOnLBrac() && getLexer().getKind() == AsmToken::LBrac) break; - switch (getLexer().getKind()) { + AsmToken::TokenKind TK = getLexer().getKind(); + switch (TK) { default: { if (SM.isValidEndState()) { Done = true; @@ -1073,13 +1106,14 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { Done = true; break; } + case AsmToken::String: case AsmToken::Identifier: { // This could be a register or a symbolic displacement. unsigned TmpReg; const MCExpr *Val; SMLoc IdentLoc = Tok.getLoc(); StringRef Identifier = Tok.getString(); - if(!ParseRegister(TmpReg, IdentLoc, End)) { + if (TK != AsmToken::String && !ParseRegister(TmpReg, IdentLoc, End)) { SM.onRegister(TmpReg); UpdateLocLex = false; break; @@ -1139,6 +1173,7 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { } case AsmToken::Plus: SM.onPlus(); break; case AsmToken::Minus: SM.onMinus(); break; + case AsmToken::Tilde: SM.onNot(); break; case AsmToken::Star: SM.onStar(); break; case AsmToken::Slash: SM.onDivide(); break; case AsmToken::Pipe: SM.onOr(); break; @@ -1521,7 +1556,7 @@ std::unique_ptr X86AsmParser::ParseIntelOperand() { // Immediate. if (getLexer().is(AsmToken::Integer) || getLexer().is(AsmToken::Minus) || - getLexer().is(AsmToken::LParen)) { + getLexer().is(AsmToken::Tilde) || getLexer().is(AsmToken::LParen)) { AsmToken StartTok = Tok; IntelExprStateMachine SM(/*Imm=*/0, /*StopOnLBrac=*/true, /*AddImmPrefix=*/false); @@ -1631,6 +1666,8 @@ bool X86AsmParser::HandleAVX512Operand(OperandVector &Operands, // Recognize only reasonable suffixes. const char *BroadcastPrimitive = StringSwitch(getLexer().getTok().getIdentifier()) + .Case("to2", "{1to2}") + .Case("to4", "{1to4}") .Case("to8", "{1to8}") .Case("to16", "{1to16}") .Default(nullptr); @@ -2487,6 +2524,9 @@ bool X86AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, return true; } +bool X86AsmParser::OmitRegisterFromClobberLists(unsigned RegNo) { + return X86MCRegisterClasses[X86::SEGMENT_REGRegClassID].contains(RegNo); +} bool X86AsmParser::ParseDirective(AsmToken DirectiveID) { StringRef IDVal = DirectiveID.getIdentifier(); diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt index c54fbc1dc806..a09767e1eaff 100644 --- a/lib/Target/X86/CMakeLists.txt +++ b/lib/Target/X86/CMakeLists.txt @@ -14,6 +14,7 @@ add_public_tablegen_target(X86CommonTableGen) set(sources X86AsmPrinter.cpp + X86AtomicExpandPass.cpp X86CodeEmitter.cpp X86FastISel.cpp X86FloatingPoint.cpp diff --git a/lib/Target/X86/Disassembler/X86Disassembler.cpp b/lib/Target/X86/Disassembler/X86Disassembler.cpp index c36672578b44..521bd21b81c6 100644 --- a/lib/Target/X86/Disassembler/X86Disassembler.cpp +++ b/lib/Target/X86/Disassembler/X86Disassembler.cpp @@ -717,7 +717,7 @@ static bool translateOperand(MCInst &mcInst, const OperandSpecifier &operand, return false; case ENCODING_WRITEMASK: return translateMaskRegister(mcInst, insn.writemask); - case ENCODING_RM: + CASE_ENCODING_RM: return translateRM(mcInst, operand, insn, Dis); case ENCODING_CB: case ENCODING_CW: diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp index 804606d917ba..ab3d1f774bc7 100644 --- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp +++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp @@ -1488,7 +1488,7 @@ static int fixupReg(struct 
InternalInstruction *insn, if (!valid) return -1; break; - case ENCODING_RM: + CASE_ENCODING_RM: if (insn->eaBase >= insn->eaRegBase) { insn->eaBase = (EABase)fixupRMValue(insn, (OperandType)op->type, @@ -1620,7 +1620,8 @@ static int readVVVV(struct InternalInstruction* insn) { int vvvv; if (insn->vectorExtensionType == TYPE_EVEX) - vvvv = vvvvFromEVEX3of4(insn->vectorExtensionPrefix[2]); + vvvv = (v2FromEVEX4of4(insn->vectorExtensionPrefix[3]) << 4 | + vvvvFromEVEX3of4(insn->vectorExtensionPrefix[2])); else if (insn->vectorExtensionType == TYPE_VEX_3B) vvvv = vvvvFromVEX3of3(insn->vectorExtensionPrefix[2]); else if (insn->vectorExtensionType == TYPE_VEX_2B) @@ -1680,11 +1681,14 @@ static int readOperands(struct InternalInstruction* insn) { case ENCODING_DI: break; case ENCODING_REG: - case ENCODING_RM: + CASE_ENCODING_RM: if (readModRM(insn)) return -1; if (fixupReg(insn, &Op)) return -1; + // Apply the AVX512 compressed displacement scaling factor. + if (Op.encoding != ENCODING_REG && insn->eaDisplacement == EA_DISP_8) + insn->displacement *= 1 << (Op.encoding - ENCODING_RM); break; case ENCODING_CB: case ENCODING_CW: diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h index f59e0b6a8aa1..13a7b557b440 100644 --- a/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h +++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h @@ -265,7 +265,7 @@ enum attributeBits { ENUM_ENTRY(IC_EVEX_L2_W_KZ, 3, "requires EVEX_KZ, L2 and W") \ ENUM_ENTRY(IC_EVEX_L2_W_XS_KZ, 4, "requires EVEX_KZ, L2, W and XS prefix") \ ENUM_ENTRY(IC_EVEX_L2_W_XD_KZ, 4, "requires EVEX_KZ, L2, W and XD prefix") \ - ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_KZ, 4, "requires EVEX_KZ, L2, W and OpSize") + ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_KZ, 4, "requires EVEX_KZ, L2, W and OpSize") #define ENUM_ENTRY(n, r, d) n, enum InstructionContext { @@ -325,11 +325,26 @@ enum ModRMDecisionType { }; #undef ENUM_ENTRY +#define CASE_ENCODING_RM \ + case ENCODING_RM: \ + case ENCODING_RM_CD2: \ + case ENCODING_RM_CD4: \ + case ENCODING_RM_CD8: \ + case ENCODING_RM_CD16: \ + case ENCODING_RM_CD32: \ + case ENCODING_RM_CD64 + // Physical encodings of instruction operands. 
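The decoder change above ("insn->displacement *= 1 << (Op.encoding - ENCODING_RM)") depends on the new ENCODING_RM_CD* enumerators being declared consecutively after ENCODING_RM, so the AVX512 compressed-disp8 scale factor falls out of the enumerator distance. A reduced model:

#include <cassert>

enum OperandEncoding { ENCODING_RM, ENCODING_RM_CD2, ENCODING_RM_CD4,
                       ENCODING_RM_CD8, ENCODING_RM_CD16, ENCODING_RM_CD32,
                       ENCODING_RM_CD64 };

static int scaleDisp8(OperandEncoding Enc, int Disp8) {
  return Disp8 * (1 << (Enc - ENCODING_RM));  // CD2 -> x2, CD4 -> x4, ...
}

int main() {
  assert(scaleDisp8(ENCODING_RM, 5) == 5);     // plain RM: unscaled
  assert(scaleDisp8(ENCODING_RM_CD8, -1) == -8);  // EVEX disp8*N
  return 0;
}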
#define ENCODINGS \ ENUM_ENTRY(ENCODING_NONE, "") \ ENUM_ENTRY(ENCODING_REG, "Register operand in ModR/M byte.") \ ENUM_ENTRY(ENCODING_RM, "R/M operand in ModR/M byte.") \ + ENUM_ENTRY(ENCODING_RM_CD2, "R/M operand with CDisp scaling of 2") \ + ENUM_ENTRY(ENCODING_RM_CD4, "R/M operand with CDisp scaling of 4") \ + ENUM_ENTRY(ENCODING_RM_CD8, "R/M operand with CDisp scaling of 8") \ + ENUM_ENTRY(ENCODING_RM_CD16,"R/M operand with CDisp scaling of 16") \ + ENUM_ENTRY(ENCODING_RM_CD32,"R/M operand with CDisp scaling of 32") \ + ENUM_ENTRY(ENCODING_RM_CD64,"R/M operand with CDisp scaling of 64") \ ENUM_ENTRY(ENCODING_VVVV, "Register operand in VEX.vvvv byte.") \ ENUM_ENTRY(ENCODING_WRITEMASK, "Register operand in EVEX.aaa byte.") \ ENUM_ENTRY(ENCODING_CB, "1-byte code offset (possible new CS value)") \ @@ -438,8 +453,12 @@ enum OperandEncoding { ENUM_ENTRY(TYPE_XMM256, "32-byte") \ ENUM_ENTRY(TYPE_XMM512, "64-byte") \ ENUM_ENTRY(TYPE_VK1, "1-bit") \ + ENUM_ENTRY(TYPE_VK2, "2-bit") \ + ENUM_ENTRY(TYPE_VK4, "4-bit") \ ENUM_ENTRY(TYPE_VK8, "8-bit") \ ENUM_ENTRY(TYPE_VK16, "16-bit") \ + ENUM_ENTRY(TYPE_VK32, "32-bit") \ + ENUM_ENTRY(TYPE_VK64, "64-bit") \ ENUM_ENTRY(TYPE_XMM0, "Implicit use of XMM0") \ ENUM_ENTRY(TYPE_SEGMENTREG, "Segment register operand") \ ENUM_ENTRY(TYPE_DEBUGREG, "Debug register operand") \ diff --git a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index bf30a8e66633..23bca0d53d3d 100644 --- a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -73,11 +73,12 @@ class X86ELFObjectWriter : public MCELFObjectTargetWriter { }; class X86AsmBackend : public MCAsmBackend { - StringRef CPU; + const StringRef CPU; bool HasNopl; + const uint64_t MaxNopLength; public: X86AsmBackend(const Target &T, StringRef _CPU) - : MCAsmBackend(), CPU(_CPU) { + : MCAsmBackend(), CPU(_CPU), MaxNopLength(_CPU == "slm" ? 7 : 15) { HasNopl = CPU != "generic" && CPU != "i386" && CPU != "i486" && CPU != "i586" && CPU != "pentium" && CPU != "pentium-mmx" && CPU != "i686" && CPU != "k6" && CPU != "k6-2" && CPU != "k6-3" && @@ -331,7 +332,7 @@ bool X86AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const { // 15 is the longest single nop instruction. Emit as many 15-byte nops as // needed, then emit a nop of the remaining length. do { - const uint8_t ThisNopLength = (uint8_t) std::min(Count, (uint64_t) 15); + const uint8_t ThisNopLength = (uint8_t) std::min(Count, MaxNopLength); const uint8_t Prefixes = ThisNopLength <= 10 ? 
0 : ThisNopLength - 10; for (uint8_t i = 0; i < Prefixes; i++) OW->Write8(0x66); @@ -365,6 +366,17 @@ class ELFX86_32AsmBackend : public ELFX86AsmBackend { } }; +class ELFX86_X32AsmBackend : public ELFX86AsmBackend { +public: + ELFX86_X32AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU) + : ELFX86AsmBackend(T, OSABI, CPU) {} + + MCObjectWriter *createObjectWriter(raw_ostream &OS) const override { + return createX86ELFObjectWriter(OS, /*IsELF64*/ false, OSABI, + ELF::EM_X86_64); + } +}; + class ELFX86_64AsmBackend : public ELFX86AsmBackend { public: ELFX86_64AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU) @@ -717,11 +729,10 @@ class DarwinX86AsmBackend : public X86AsmBackend { }; class DarwinX86_32AsmBackend : public DarwinX86AsmBackend { - bool SupportsCU; public: DarwinX86_32AsmBackend(const Target &T, const MCRegisterInfo &MRI, - StringRef CPU, bool SupportsCU) - : DarwinX86AsmBackend(T, MRI, CPU, false), SupportsCU(SupportsCU) {} + StringRef CPU) + : DarwinX86AsmBackend(T, MRI, CPU, false) {} MCObjectWriter *createObjectWriter(raw_ostream &OS) const override { return createX86MachObjectWriter(OS, /*Is64Bit=*/false, @@ -732,20 +743,16 @@ class DarwinX86_32AsmBackend : public DarwinX86AsmBackend { /// \brief Generate the compact unwind encoding for the CFI instructions. uint32_t generateCompactUnwindEncoding( ArrayRef Instrs) const override { - return SupportsCU ? generateCompactUnwindEncodingImpl(Instrs) : 0; + return generateCompactUnwindEncodingImpl(Instrs); } }; class DarwinX86_64AsmBackend : public DarwinX86AsmBackend { - bool SupportsCU; const MachO::CPUSubTypeX86 Subtype; public: DarwinX86_64AsmBackend(const Target &T, const MCRegisterInfo &MRI, - StringRef CPU, bool SupportsCU, - MachO::CPUSubTypeX86 st) - : DarwinX86AsmBackend(T, MRI, CPU, true), SupportsCU(SupportsCU), - Subtype(st) { - } + StringRef CPU, MachO::CPUSubTypeX86 st) + : DarwinX86AsmBackend(T, MRI, CPU, true), Subtype(st) {} MCObjectWriter *createObjectWriter(raw_ostream &OS) const override { return createX86MachObjectWriter(OS, /*Is64Bit=*/true, @@ -788,7 +795,7 @@ class DarwinX86_64AsmBackend : public DarwinX86AsmBackend { /// \brief Generate the compact unwind encoding for the CFI instructions. uint32_t generateCompactUnwindEncoding( ArrayRef Instrs) const override { - return SupportsCU ? 
generateCompactUnwindEncodingImpl(Instrs) : 0; + return generateCompactUnwindEncodingImpl(Instrs); } }; @@ -801,9 +808,7 @@ MCAsmBackend *llvm::createX86_32AsmBackend(const Target &T, Triple TheTriple(TT); if (TheTriple.isOSBinFormatMachO()) - return new DarwinX86_32AsmBackend(T, MRI, CPU, - TheTriple.isMacOSX() && - !TheTriple.isMacOSXVersionLT(10, 7)); + return new DarwinX86_32AsmBackend(T, MRI, CPU); if (TheTriple.isOSWindows() && !TheTriple.isOSBinFormatELF()) return new WindowsX86AsmBackend(T, false, CPU); @@ -823,14 +828,15 @@ MCAsmBackend *llvm::createX86_64AsmBackend(const Target &T, StringSwitch<MachO::CPUSubTypeX86>(TheTriple.getArchName()) .Case("x86_64h", MachO::CPU_SUBTYPE_X86_64_H) .Default(MachO::CPU_SUBTYPE_X86_64_ALL); - return new DarwinX86_64AsmBackend(T, MRI, CPU, - TheTriple.isMacOSX() && - !TheTriple.isMacOSXVersionLT(10, 7), CS); + return new DarwinX86_64AsmBackend(T, MRI, CPU, CS); } if (TheTriple.isOSWindows() && !TheTriple.isOSBinFormatELF()) return new WindowsX86AsmBackend(T, true, CPU); uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS()); + + if (TheTriple.getEnvironment() == Triple::GNUX32) + return new ELFX86_X32AsmBackend(T, OSABI, CPU); return new ELFX86_64AsmBackend(T, OSABI, CPU); } diff --git a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h index 6aeb1f2c2225..026e4c487d81 100644 --- a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h +++ b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h @@ -520,13 +520,9 @@ namespace X86II { // EVEX_B - Set if this instruction has EVEX.B field set. EVEX_B = 1U << 9, - // EVEX_CD8E - compressed disp8 form, element-size - EVEX_CD8EShift = VEXShift + 10, - EVEX_CD8EMask = 3, - - // EVEX_CD8V - compressed disp8 form, vector-width - EVEX_CD8VShift = EVEX_CD8EShift + 2, - EVEX_CD8VMask = 7, + // The scaling factor for AVX512's 8-bit compressed displacement. + CD8_Scale_Shift = VEXShift + 10, + CD8_Scale_Mask = 127, /// Has3DNow0F0FOpcode - This flag indicates that the instruction uses the /// wacky 0x0F 0x0F prefix for 3DNow! instructions. The manual documents /// storing a classifier in the imm8 field. To simplify our implementation, /// we handle this by storing the classifier in the opcode field and using /// this flag to indicate that the encoder should do the wacky 3DNow! thing. - Has3DNow0F0FOpcode = 1U << 15, + Has3DNow0F0FOpcodeShift = CD8_Scale_Shift + 7, + Has3DNow0F0FOpcode = 1U << (Has3DNow0F0FOpcodeShift - VEXShift), /// MemOp4 - Used to indicate swapping of operand 3 and 4 to be encoded in /// ModRM or I8IMM. This is used for FMA4 and XOP instructions. - MemOp4 = 1U << 16, + MemOp4Shift = Has3DNow0F0FOpcodeShift + 1, + MemOp4 = 1U << (MemOp4Shift - VEXShift), /// Explicitly specified rounding control - EVEX_RC = 1U << 17 + EVEX_RCShift = MemOp4Shift + 1, + EVEX_RC = 1U << (EVEX_RCShift - VEXShift) }; // getBaseOpcodeFor - This function returns the "base" X86 opcode for the diff --git a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp index 39480eaaac16..b1411bc5040d 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp @@ -72,10 +72,10 @@ X86MCAsmInfoDarwin::X86MCAsmInfoDarwin(const Triple &T) { if (T.isMacOSX() && T.isMacOSXVersionLT(10, 6)) HasWeakDefCanBeHiddenDirective = false; - // FIXME: this should not depend on the target OS version, but on the ld64 // version in use. 
From at least >= ld64-97.17 (Xcode 3.2.6) the abs-ified - // FDE relocs may be used. - DwarfFDESymbolsUseAbsDiff = T.isMacOSX() && !T.isMacOSXVersionLT(10, 6); + // Assume ld64 is new enough that the abs-ified FDE relocs may be used + // (actually, must, since otherwise the non-extern relocations we produce + // overwhelm ld64's tiny little mind and it fails). + DwarfFDESymbolsUseAbsDiff = true; UseIntegratedAssembler = true; } @@ -142,8 +142,11 @@ getNonexecutableStackSection(MCContext &Ctx) const { void X86MCAsmInfoMicrosoft::anchor() { } X86MCAsmInfoMicrosoft::X86MCAsmInfoMicrosoft(const Triple &Triple) { - if (Triple.getArch() == Triple::x86_64) + if (Triple.getArch() == Triple::x86_64) { PrivateGlobalPrefix = ".L"; + PointerSize = 8; + ExceptionsType = ExceptionHandling::WinEH; + } AssemblerDialect = AsmWriterFlavor; @@ -157,17 +160,18 @@ X86MCAsmInfoMicrosoft::X86MCAsmInfoMicrosoft(const Triple &Triple) { void X86MCAsmInfoGNUCOFF::anchor() { } X86MCAsmInfoGNUCOFF::X86MCAsmInfoGNUCOFF(const Triple &Triple) { + assert(Triple.isOSWindows() && "Windows is the only supported COFF target"); if (Triple.getArch() == Triple::x86_64) { PrivateGlobalPrefix = ".L"; PointerSize = 8; + ExceptionsType = ExceptionHandling::WinEH; + } else { + ExceptionsType = ExceptionHandling::DwarfCFI; } AssemblerDialect = AsmWriterFlavor; TextAlignFillValue = 0x90; - // Exceptions handling - ExceptionsType = ExceptionHandling::DwarfCFI; - UseIntegratedAssembler = true; } diff --git a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp index 2152b21068f3..075db11027d1 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp @@ -185,42 +185,22 @@ static bool isDisp8(int Value) { /// isCDisp8 - Return true if this signed displacement fits in an 8-bit /// compressed displacement field. static bool isCDisp8(uint64_t TSFlags, int Value, int& CValue) { - assert((TSFlags & X86II::EncodingMask) >> X86II::EncodingShift == X86II::EVEX && + assert(((TSFlags & X86II::EncodingMask) >> + X86II::EncodingShift == X86II::EVEX) && "Compressed 8-bit displacement is only valid for EVEX inst."); - unsigned CD8E = (TSFlags >> X86II::EVEX_CD8EShift) & X86II::EVEX_CD8EMask; - unsigned CD8V = (TSFlags >> X86II::EVEX_CD8VShift) & X86II::EVEX_CD8VMask; - - if (CD8V == 0 && CD8E == 0) { + unsigned CD8_Scale = + (TSFlags >> X86II::CD8_Scale_Shift) & X86II::CD8_Scale_Mask; + if (CD8_Scale == 0) { CValue = Value; return isDisp8(Value); } - - unsigned MemObjSize = 1U << CD8E; - if (CD8V & 4) { - // Fixed vector length - MemObjSize *= 1U << (CD8V & 0x3); - } else { - // Modified vector length - bool EVEX_b = (TSFlags >> X86II::VEXShift) & X86II::EVEX_B; - if (!EVEX_b) { - unsigned EVEX_LL = ((TSFlags >> X86II::VEXShift) & X86II::VEX_L) ? 1 : 0; EVEX_LL += ((TSFlags >> X86II::VEXShift) & X86II::EVEX_L2) ? 
2 : 0; - assert(EVEX_LL < 3 && ""); - - unsigned NumElems = (1U << (EVEX_LL + 4)) / MemObjSize; - NumElems /= 1U << (CD8V & 0x3); - - MemObjSize *= NumElems; - } - } - - unsigned MemObjMask = MemObjSize - 1; - assert((MemObjSize & MemObjMask) == 0 && "Invalid memory object size."); - if (Value & MemObjMask) // Unaligned offset + unsigned Mask = CD8_Scale - 1; + assert((CD8_Scale & Mask) == 0 && "Invalid memory object size."); + if (Value & Mask) // Unaligned offset return false; - Value /= (int)MemObjSize; + Value /= (int)CD8_Scale; bool Ret = (Value == (signed char)Value); if (Ret) diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp index 5e29e5c359ac..3bfad6c71b9b 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp @@ -272,7 +272,8 @@ static MCAsmInfo *createX86MCAsmInfo(const MCRegisterInfo &MRI, StringRef TT) { MAI = new X86ELFMCAsmInfo(TheTriple); } else if (TheTriple.isWindowsMSVCEnvironment()) { MAI = new X86MCAsmInfoMicrosoft(TheTriple); - } else if (TheTriple.isOSCygMing()) { + } else if (TheTriple.isOSCygMing() || + TheTriple.isWindowsItaniumEnvironment()) { MAI = new X86MCAsmInfoGNUCOFF(TheTriple); } else { // The default is ELF. diff --git a/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp b/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp index c62fd0a9390a..6727f5edd26b 100644 --- a/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp +++ b/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp @@ -19,21 +19,21 @@ class X86WinCOFFStreamer : public MCWinCOFFStreamer { raw_ostream &OS) : MCWinCOFFStreamer(C, AB, *CE, OS) { } - void EmitWin64EHHandlerData() override; + void EmitWinEHHandlerData() override; void FinishImpl() override; }; -void X86WinCOFFStreamer::EmitWin64EHHandlerData() { - MCStreamer::EmitWin64EHHandlerData(); +void X86WinCOFFStreamer::EmitWinEHHandlerData() { + MCStreamer::EmitWinEHHandlerData(); // We have to emit the unwind info now, because this directive // actually switches to the .xdata section! - MCWin64EHUnwindEmitter::EmitUnwindInfo(*this, getCurrentW64UnwindInfo()); + MCWin64EHUnwindEmitter::EmitUnwindInfo(*this, getCurrentWinFrameInfo()); } void X86WinCOFFStreamer::FinishImpl() { EmitFrames(nullptr); - EmitW64Tables(); + EmitWindowsUnwindTables(); MCWinCOFFStreamer::FinishImpl(); } diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h index 64e8ea834f47..d5522ed95eb4 100644 --- a/lib/Target/X86/X86.h +++ b/lib/Target/X86/X86.h @@ -24,6 +24,10 @@ class ImmutablePass; class JITCodeEmitter; class X86TargetMachine; +/// createX86AtomicExpandPass - This pass expands atomic operations that cannot +/// be handled natively in terms of a loop using cmpxchg. +FunctionPass *createX86AtomicExpandPass(const X86TargetMachine *TM); + /// createX86ISelDag - This pass converts a legalized DAG into a /// X86-specific DAG, ready for instruction scheduling. 
/// diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td index 0ac801a9b0a0..cd32a0f24234 100644 --- a/lib/Target/X86/X86.td +++ b/lib/Target/X86/X86.td @@ -104,7 +104,15 @@ def FeatureCDI : SubtargetFeature<"avx512cd", "HasCDI", "true", def FeaturePFI : SubtargetFeature<"avx512pf", "HasPFI", "true", "Enable AVX-512 PreFetch Instructions", [FeatureAVX512]>; - +def FeatureDQI : SubtargetFeature<"avx512dq", "HasDQI", "true", + "Enable AVX-512 Doubleword and Quadword Instructions", + [FeatureAVX512]>; +def FeatureBWI : SubtargetFeature<"avx512bw", "HasBWI", "true", + "Enable AVX-512 Byte and Word Instructions", + [FeatureAVX512]>; +def FeatureVLX : SubtargetFeature<"avx512vl", "HasVLX", "true", + "Enable AVX-512 Vector Length eXtensions", + [FeatureAVX512]>; def FeaturePCLMUL : SubtargetFeature<"pclmul", "HasPCLMUL", "true", "Enable packed carry-less multiplication instructions", [FeatureSSE2]>; @@ -273,7 +281,19 @@ def : ProcessorModel<"knl", HaswellModel, FeatureCMPXCHG16B, FeatureFastUAMem, FeaturePOPCNT, FeatureAES, FeaturePCLMUL, FeatureRDRAND, FeatureF16C, FeatureFSGSBase, FeatureMOVBE, FeatureLZCNT, FeatureBMI, - FeatureBMI2, FeatureFMA, FeatureRTM, FeatureHLE]>; + FeatureBMI2, FeatureFMA, FeatureRTM, FeatureHLE, + FeatureSlowIncDec]>; + +// SKX +// FIXME: define SKX model +def : ProcessorModel<"skx", HaswellModel, + [FeatureAVX512, FeatureCDI, + FeatureDQI, FeatureBWI, FeatureVLX, + FeatureCMPXCHG16B, FeatureFastUAMem, FeaturePOPCNT, + FeatureAES, FeaturePCLMUL, FeatureRDRAND, FeatureF16C, + FeatureFSGSBase, FeatureMOVBE, FeatureLZCNT, FeatureBMI, + FeatureBMI2, FeatureFMA, FeatureRTM, FeatureHLE, + FeatureSlowIncDec]>; def : Proc<"k6", [FeatureMMX]>; def : Proc<"k6-2", [Feature3DNow]>; diff --git a/lib/Target/X86/X86AsmPrinter.cpp b/lib/Target/X86/X86AsmPrinter.cpp index 1dca5689adee..57c7a62bd5c1 100644 --- a/lib/Target/X86/X86AsmPrinter.cpp +++ b/lib/Target/X86/X86AsmPrinter.cpp @@ -18,6 +18,7 @@ #include "X86InstrInfo.h" #include "X86MachineFunctionInfo.h" #include "llvm/ADT/SmallString.h" +#include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineModuleInfoImpls.h" #include "llvm/CodeGen/MachineValueType.h" #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" @@ -29,6 +30,7 @@ #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCSectionCOFF.h" #include "llvm/MC/MCSectionMachO.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" @@ -549,6 +551,26 @@ emitNonLazySymbolPointer(MCStreamer &OutStreamer, MCSymbol *StubLabel, 4 /*size*/); } +MCSymbol *X86AsmPrinter::GetCPISymbol(unsigned CPID) const { + if (Subtarget->isTargetKnownWindowsMSVC()) { + const MachineConstantPoolEntry &CPE = + MF->getConstantPool()->getConstants()[CPID]; + if (!CPE.isMachineConstantPoolEntry()) { + SectionKind Kind = CPE.getSectionKind(TM.getDataLayout()); + const Constant *C = CPE.Val.ConstVal; + const MCSectionCOFF *S = cast( + getObjFileLowering().getSectionForConstant(Kind, C)); + if (MCSymbol *Sym = S->getCOMDATSymbol()) { + if (Sym->isUndefined()) + OutStreamer.EmitSymbolAttribute(Sym, MCSA_Global); + return Sym; + } + } + } + + return AsmPrinter::GetCPISymbol(CPID); +} + void X86AsmPrinter::GenerateExportDirective(const MCSymbol *Sym, bool IsData) { SmallString<128> Directive; raw_svector_ostream OS(Directive); diff --git a/lib/Target/X86/X86AsmPrinter.h b/lib/Target/X86/X86AsmPrinter.h index e4eef5dbd796..b1bbe8e41cc0 100644 --- a/lib/Target/X86/X86AsmPrinter.h +++ b/lib/Target/X86/X86AsmPrinter.h @@ -50,6 
+50,9 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter { unsigned AsmVariant, const char *ExtraCode, raw_ostream &OS) override; + /// \brief Return the symbol for the specified constant pool entry. + MCSymbol *GetCPISymbol(unsigned CPID) const override; + bool runOnMachineFunction(MachineFunction &F) override; }; diff --git a/lib/Target/X86/X86AtomicExpandPass.cpp b/lib/Target/X86/X86AtomicExpandPass.cpp new file mode 100644 index 000000000000..3dcadb16760b --- /dev/null +++ b/lib/Target/X86/X86AtomicExpandPass.cpp @@ -0,0 +1,283 @@ +//===-- X86AtomicExpandPass.cpp - Expand illegal atomic instructions -----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains a pass (at IR level) to replace atomic instructions which +// cannot be implemented as a single instruction with cmpxchg-based loops. +// +//===----------------------------------------------------------------------===// + +#include "X86.h" +#include "X86TargetMachine.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/Debug.h" +#include "llvm/Target/TargetLowering.h" +#include "llvm/Target/TargetMachine.h" +using namespace llvm; + +#define DEBUG_TYPE "x86-atomic-expand" + +namespace { + class X86AtomicExpandPass : public FunctionPass { + const X86TargetMachine *TM; + public: + static char ID; // Pass identification, replacement for typeid + explicit X86AtomicExpandPass(const X86TargetMachine *TM) + : FunctionPass(ID), TM(TM) {} + + bool runOnFunction(Function &F) override; + bool expandAtomicInsts(Function &F); + + bool needsCmpXchgNb(Type *MemType); + + /// There are four kinds of atomic operations. Two never need expanding: + /// cmpxchg is what we expand the others *to*, and loads are easily handled + /// by ISelLowering. Atomicrmw and store can need expanding in some + /// circumstances. + bool shouldExpand(Instruction *Inst); + + /// 128-bit atomic stores (64-bit on i686) need to be implemented in terms + /// of trivial cmpxchg16b loops. A simple store isn't necessarily atomic. + bool shouldExpandStore(StoreInst *SI); + + /// Only some atomicrmw instructions need expanding -- some operations + /// (e.g. max) have absolutely no architectural support; some (e.g. or) have + /// limited support but can't return the previous value; some (e.g. add) + /// have complete support in the instruction set. + /// + /// Also, naturally, 128-bit operations always need to be expanded. + bool shouldExpandAtomicRMW(AtomicRMWInst *AI); + + bool expandAtomicRMW(AtomicRMWInst *AI); + bool expandAtomicStore(StoreInst *SI); + }; +} + +char X86AtomicExpandPass::ID = 0; + +FunctionPass *llvm::createX86AtomicExpandPass(const X86TargetMachine *TM) { + return new X86AtomicExpandPass(TM); +} + +bool X86AtomicExpandPass::runOnFunction(Function &F) { + SmallVector<Instruction *, 1> AtomicInsts; + + // Changing control-flow while iterating through it is a bad idea, so gather a + // list of all atomic instructions before we start. 
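+  // (expandAtomicRMW splits the current block via splitBasicBlock, which
+  // would invalidate these iterators if we rewrote while walking the
+  // function; collecting into AtomicInsts first sidesteps that.)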
+ for (BasicBlock &BB : F) + for (Instruction &Inst : BB) { + if (isa<AtomicRMWInst>(&Inst) || + (isa<StoreInst>(&Inst) && cast<StoreInst>(&Inst)->isAtomic())) + AtomicInsts.push_back(&Inst); + } + + bool MadeChange = false; + for (Instruction *Inst : AtomicInsts) { + if (!shouldExpand(Inst)) + continue; + + if (AtomicRMWInst *AI = dyn_cast<AtomicRMWInst>(Inst)) + MadeChange |= expandAtomicRMW(AI); + if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) + MadeChange |= expandAtomicStore(SI); + + assert(MadeChange && "Atomic inst not expanded when it should be?"); + Inst->eraseFromParent(); + } + + return MadeChange; +} + +/// Returns true if the operand type is 1 step up from the native width, and +/// the corresponding cmpxchg8b or cmpxchg16b instruction is available +/// (otherwise we leave them alone to become __sync_fetch_and_... calls). +bool X86AtomicExpandPass::needsCmpXchgNb(llvm::Type *MemType) { + const X86Subtarget &Subtarget = TM->getSubtarget<X86Subtarget>(); + unsigned OpWidth = MemType->getPrimitiveSizeInBits(); + + if (OpWidth == 64) + return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b + if (OpWidth == 128) + return Subtarget.hasCmpxchg16b(); + + return false; +} + +bool X86AtomicExpandPass::shouldExpandAtomicRMW(AtomicRMWInst *AI) { + const X86Subtarget &Subtarget = TM->getSubtarget<X86Subtarget>(); + unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32; + + if (needsCmpXchgNb(AI->getType())) + return true; + + if (AI->getType()->getPrimitiveSizeInBits() > NativeWidth) + return false; + + AtomicRMWInst::BinOp Op = AI->getOperation(); + switch (Op) { + default: + llvm_unreachable("Unknown atomic operation"); + case AtomicRMWInst::Xchg: + case AtomicRMWInst::Add: + case AtomicRMWInst::Sub: + // It's better to use xadd, xsub or xchg for these in all cases. + return false; + case AtomicRMWInst::Or: + case AtomicRMWInst::And: + case AtomicRMWInst::Xor: + // If the atomicrmw's result isn't actually used, we can just add a "lock" + // prefix to a normal instruction for these operations. + return !AI->use_empty(); + case AtomicRMWInst::Nand: + case AtomicRMWInst::Max: + case AtomicRMWInst::Min: + case AtomicRMWInst::UMax: + case AtomicRMWInst::UMin: + // These always require a non-trivial set of data operations on x86. We must + // use a cmpxchg loop. + return true; + } +} + +bool X86AtomicExpandPass::shouldExpandStore(StoreInst *SI) { + if (needsCmpXchgNb(SI->getValueOperand()->getType())) + return true; + + return false; +} + +bool X86AtomicExpandPass::shouldExpand(Instruction *Inst) { + if (AtomicRMWInst *AI = dyn_cast<AtomicRMWInst>(Inst)) + return shouldExpandAtomicRMW(AI); + if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) + return shouldExpandStore(SI); + return false; +} + +/// Emit IR to implement the given atomicrmw operation on values in registers, +/// returning the new value. 
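+/// For example (an illustrative sketch of the emitted IR), "atomicrmw max"
+/// has no direct x86 encoding, so the Max case below becomes a signed
+/// compare plus select on the loaded value:
+///   %cmp = icmp sgt iN %loaded, %incr
+///   %new = select i1 %cmp, iN %loaded, iN %incr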
+static Value *performAtomicOp(AtomicRMWInst::BinOp Op, IRBuilder<> &Builder, + Value *Loaded, Value *Inc) { + Value *NewVal; + switch (Op) { + case AtomicRMWInst::Xchg: + return Inc; + case AtomicRMWInst::Add: + return Builder.CreateAdd(Loaded, Inc, "new"); + case AtomicRMWInst::Sub: + return Builder.CreateSub(Loaded, Inc, "new"); + case AtomicRMWInst::And: + return Builder.CreateAnd(Loaded, Inc, "new"); + case AtomicRMWInst::Nand: + return Builder.CreateNot(Builder.CreateAnd(Loaded, Inc), "new"); + case AtomicRMWInst::Or: + return Builder.CreateOr(Loaded, Inc, "new"); + case AtomicRMWInst::Xor: + return Builder.CreateXor(Loaded, Inc, "new"); + case AtomicRMWInst::Max: + NewVal = Builder.CreateICmpSGT(Loaded, Inc); + return Builder.CreateSelect(NewVal, Loaded, Inc, "new"); + case AtomicRMWInst::Min: + NewVal = Builder.CreateICmpSLE(Loaded, Inc); + return Builder.CreateSelect(NewVal, Loaded, Inc, "new"); + case AtomicRMWInst::UMax: + NewVal = Builder.CreateICmpUGT(Loaded, Inc); + return Builder.CreateSelect(NewVal, Loaded, Inc, "new"); + case AtomicRMWInst::UMin: + NewVal = Builder.CreateICmpULE(Loaded, Inc); + return Builder.CreateSelect(NewVal, Loaded, Inc, "new"); + default: + break; + } + llvm_unreachable("Unknown atomic op"); +} + +bool X86AtomicExpandPass::expandAtomicRMW(AtomicRMWInst *AI) { + AtomicOrdering Order = + AI->getOrdering() == Unordered ? Monotonic : AI->getOrdering(); + Value *Addr = AI->getPointerOperand(); + BasicBlock *BB = AI->getParent(); + Function *F = BB->getParent(); + LLVMContext &Ctx = F->getContext(); + + // Given: atomicrmw some_op iN* %addr, iN %incr ordering + // + // The standard expansion we produce is: + // [...] + // %init_loaded = load atomic iN* %addr + // br label %loop + // loop: + // %loaded = phi iN [ %init_loaded, %entry ], [ %new_loaded, %loop ] + // %new = some_op iN %loaded, %incr + // %pair = cmpxchg iN* %addr, iN %loaded, iN %new + // %new_loaded = extractvalue { iN, i1 } %pair, 0 + // %success = extractvalue { iN, i1 } %pair, 1 + // br i1 %success, label %atomicrmw.end, label %loop + // atomicrmw.end: + // [...] + BasicBlock *ExitBB = BB->splitBasicBlock(AI, "atomicrmw.end"); + BasicBlock *LoopBB = BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB); + + // This grabs the DebugLoc from AI. + IRBuilder<> Builder(AI); + + // The split call above "helpfully" added a branch at the end of BB (to the + // wrong place), but we want a load. It's easiest to just remove + // the branch entirely. + std::prev(BB->end())->eraseFromParent(); + Builder.SetInsertPoint(BB); + LoadInst *InitLoaded = Builder.CreateLoad(Addr); + InitLoaded->setAlignment(AI->getType()->getPrimitiveSizeInBits()); + Builder.CreateBr(LoopBB); + + // Start the main loop block now that we've taken care of the preliminaries. 
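+  // (Note: the "loaded" PHI created just below is built before the cmpxchg
+  // that feeds it; its back-edge incoming value is only wired up once
+  // %new_loaded exists a few lines further down.)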
+ Builder.SetInsertPoint(LoopBB); + PHINode *Loaded = Builder.CreatePHI(AI->getType(), 2, "loaded"); + Loaded->addIncoming(InitLoaded, BB); + + Value *NewVal = + performAtomicOp(AI->getOperation(), Builder, Loaded, AI->getValOperand()); + + Value *Pair = Builder.CreateAtomicCmpXchg( + Addr, Loaded, NewVal, Order, + AtomicCmpXchgInst::getStrongestFailureOrdering(Order)); + Value *NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded"); + Loaded->addIncoming(NewLoaded, LoopBB); + + Value *Success = Builder.CreateExtractValue(Pair, 1, "success"); + Builder.CreateCondBr(Success, ExitBB, LoopBB); + + AI->replaceAllUsesWith(NewLoaded); + + return true; +} + +bool X86AtomicExpandPass::expandAtomicStore(StoreInst *SI) { + // An atomic store might need cmpxchg16b (or 8b on x86) to execute. Express + // this in terms of the usual expansion to "atomicrmw xchg". + IRBuilder<> Builder(SI); + AtomicOrdering Order = + SI->getOrdering() == Unordered ? Monotonic : SI->getOrdering(); + AtomicRMWInst *AI = + Builder.CreateAtomicRMW(AtomicRMWInst::Xchg, SI->getPointerOperand(), + SI->getValueOperand(), Order); + + // Now we have an appropriate swap instruction, lower it as usual. + if (shouldExpandAtomicRMW(AI)) { + expandAtomicRMW(AI); + AI->eraseFromParent(); + return true; + } + + return true; } diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td index 0824d4ed660b..86c01bd64647 100644 --- a/lib/Target/X86/X86CallingConv.td +++ b/lib/Target/X86/X86CallingConv.td @@ -52,7 +52,7 @@ def RetCC_X86Common : CallingConv<[ // 512-bit vectors are returned in ZMM0 and ZMM1, when they fit. ZMM2 and ZMM3 // can only be used by ABI non-compliant code. This vector type is only // supported while using the AVX-512 target feature. - CCIfType<[v16i32, v8i64, v16f32, v8f64], + CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], CCAssignToReg<[ZMM0,ZMM1,ZMM2,ZMM3]>>, // MMX vector types are always returned in MM0. If the target doesn't have @@ -252,7 +252,7 @@ def CC_X86_64_C : CallingConv<[ YMM4, YMM5, YMM6, YMM7]>>>>, // The first 8 512-bit vector arguments are passed in ZMM registers. - CCIfNotVarArg<CCIfType<[v16i32, v8i64, v16f32, v8f64], + CCIfNotVarArg<CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3, ZMM4, ZMM5, ZMM6, ZMM7]>>>>, diff --git a/lib/Target/X86/X86CodeEmitter.cpp b/lib/Target/X86/X86CodeEmitter.cpp index b275a9cc3e48..a3ae7ee315a3 100644 --- a/lib/Target/X86/X86CodeEmitter.cpp +++ b/lib/Target/X86/X86CodeEmitter.cpp @@ -1131,6 +1131,16 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI, case TargetOpcode::IMPLICIT_DEF: case TargetOpcode::KILL: break; + + case X86::SEH_PushReg: + case X86::SEH_SaveReg: + case X86::SEH_SaveXMM: + case X86::SEH_StackAlloc: + case X86::SEH_SetFrame: + case X86::SEH_PushFrame: + case X86::SEH_EndPrologue: + break; + case X86::MOVPC32r: { // This emits the "call" portion of this pseudo instruction. 
MCE.emitByte(BaseOpcode); diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp index 9557d96d5309..2d494b4aeeb6 100644 --- a/lib/Target/X86/X86FastISel.cpp +++ b/lib/Target/X86/X86FastISel.cpp @@ -74,6 +74,8 @@ class X86FastISel final : public FastISel { const LoadInst *LI) override; bool FastLowerArguments() override; + bool FastLowerCall(CallLoweringInfo &CLI) override; + bool FastLowerIntrinsicCall(const IntrinsicInst *II) override; #include "X86GenFastISel.inc" @@ -111,6 +113,12 @@ class X86FastISel final : public FastISel { bool X86SelectDivRem(const Instruction *I); + bool X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I); + + bool X86FastEmitSSESelect(MVT RetVT, const Instruction *I); + + bool X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I); + bool X86SelectSelect(const Instruction *I); bool X86SelectTrunc(const Instruction *I); @@ -118,11 +126,6 @@ class X86FastISel final : public FastISel { bool X86SelectFPExt(const Instruction *I); bool X86SelectFPTrunc(const Instruction *I); - bool X86VisitIntrinsicCall(const IntrinsicInst &I); - bool X86SelectCall(const Instruction *I); - - bool DoSelectCall(const Instruction *I, const char *MemIntName); - const X86InstrInfo *getInstrInfo() const { return getTargetMachine()->getInstrInfo(); } @@ -151,6 +154,9 @@ class X86FastISel final : public FastISel { bool TryEmitSmallMemcpy(X86AddressMode DestAM, X86AddressMode SrcAM, uint64_t Len); + + bool foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I, + const Value *Cond); }; } // end anonymous namespace. @@ -196,7 +202,7 @@ static CmpInst::Predicate optimizeCmpPredicate(const CmpInst *CI) { } static std::pair<X86::CondCode, bool> -getX86ConditonCode(CmpInst::Predicate Predicate) { +getX86ConditionCode(CmpInst::Predicate Predicate) { X86::CondCode CC = X86::COND_INVALID; bool NeedSwap = false; switch (Predicate) { @@ -233,6 +239,97 @@ getX86ConditonCode(CmpInst::Predicate Predicate) { return std::make_pair(CC, NeedSwap); } +static std::pair<unsigned, bool> +getX86SSEConditionCode(CmpInst::Predicate Predicate) { + unsigned CC; + bool NeedSwap = false; + + // SSE Condition code mapping: + // 0 - EQ + // 1 - LT + // 2 - LE + // 3 - UNORD + // 4 - NEQ + // 5 - NLT + // 6 - NLE + // 7 - ORD + switch (Predicate) { + default: llvm_unreachable("Unexpected predicate"); + case CmpInst::FCMP_OEQ: CC = 0; break; + case CmpInst::FCMP_OGT: NeedSwap = true; // fall-through + case CmpInst::FCMP_OLT: CC = 1; break; + case CmpInst::FCMP_OGE: NeedSwap = true; // fall-through + case CmpInst::FCMP_OLE: CC = 2; break; + case CmpInst::FCMP_UNO: CC = 3; break; + case CmpInst::FCMP_UNE: CC = 4; break; + case CmpInst::FCMP_ULE: NeedSwap = true; // fall-through + case CmpInst::FCMP_UGE: CC = 5; break; + case CmpInst::FCMP_ULT: NeedSwap = true; // fall-through + case CmpInst::FCMP_UGT: CC = 6; break; + case CmpInst::FCMP_ORD: CC = 7; break; + case CmpInst::FCMP_UEQ: + case CmpInst::FCMP_ONE: CC = 8; break; + } + + return std::make_pair(CC, NeedSwap); +} + +/// \brief Check if it is possible to fold the condition from the XALU intrinsic /// into the user. The condition code will only be updated on success. 
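+/// For example (illustrative IR), given
+///   %res  = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)
+///   %obit = extractvalue { i32, i1 } %res, 1
+///   br i1 %obit, label %overflow, label %normal
+/// the branch can test the OF flag produced by the add itself (COND_O)
+/// instead of materializing %obit with a setcc and re-testing it.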
+bool X86FastISel::foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I, + const Value *Cond) { + if (!isa<ExtractValueInst>(Cond)) + return false; + + const auto *EV = cast<ExtractValueInst>(Cond); + if (!isa<IntrinsicInst>(EV->getAggregateOperand())) + return false; + + const auto *II = cast<IntrinsicInst>(EV->getAggregateOperand()); + MVT RetVT; + const Function *Callee = II->getCalledFunction(); + Type *RetTy = + cast<StructType>(Callee->getReturnType())->getTypeAtIndex(0U); + if (!isTypeLegal(RetTy, RetVT)) + return false; + + if (RetVT != MVT::i32 && RetVT != MVT::i64) + return false; + + X86::CondCode TmpCC; + switch (II->getIntrinsicID()) { + default: return false; + case Intrinsic::sadd_with_overflow: + case Intrinsic::ssub_with_overflow: + case Intrinsic::smul_with_overflow: + case Intrinsic::umul_with_overflow: TmpCC = X86::COND_O; break; + case Intrinsic::uadd_with_overflow: + case Intrinsic::usub_with_overflow: TmpCC = X86::COND_B; break; + } + + // Check if both instructions are in the same basic block. + if (II->getParent() != I->getParent()) + return false; + + // Make sure nothing is in the way. + BasicBlock::const_iterator Start = I; + BasicBlock::const_iterator End = II; + for (auto Itr = std::prev(Start); Itr != End; --Itr) { + // We only expect extractvalue instructions between the intrinsic and the + // instruction to be selected. + if (!isa<ExtractValueInst>(Itr)) + return false; + + // Check that the extractvalue operand comes from the intrinsic. + const auto *EVI = cast<ExtractValueInst>(Itr); + if (EVI->getAggregateOperand() != II) + return false; + } + + CC = TmpCC; + return true; +} + bool X86FastISel::isTypeLegal(Type *Ty, MVT &VT, bool AllowI1) { EVT evt = TLI.getValueType(Ty, /*HandleUnknown=*/true); if (evt == MVT::Other || !evt.isSimple()) @@ -781,7 +878,7 @@ bool X86FastISel::X86SelectCallAddress(const Value *V, X86AddressMode &AM) { (AM.Base.Reg != 0 || AM.IndexReg != 0)) return false; - // Can't handle DbgLocLImport. + // Can't handle DLL Import. if (GV->hasDLLImportStorageClass()) return false; @@ -1159,8 +1256,8 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) { X86::CondCode CC; bool SwapArgs; - std::tie(CC, SwapArgs) = getX86ConditonCode(Predicate); - assert(CC <= X86::LAST_VALID_COND && "Unexpected conditon code."); + std::tie(CC, SwapArgs) = getX86ConditionCode(Predicate); + assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code."); unsigned Opc = X86::getSETFromCond(CC); if (SwapArgs) @@ -1236,6 +1333,7 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { // Fold the common case of a conditional branch with a comparison // in the same block (values defined on other blocks may not have // initialized registers). + X86::CondCode CC; if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) { if (CI->hasOneUse() && CI->getParent() == I->getParent()) { EVT VT = TLI.getValueType(CI->getOperand(0)->getType()); @@ -1267,9 +1365,9 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { Predicate = CmpInst::getInversePredicate(Predicate); } - // FCMP_OEQ and FCMP_UNE cannot be expressed with a single flag/conditon + // FCMP_OEQ and FCMP_UNE cannot be expressed with a single flag/condition // code check. Instead two branch instructions are required to check all - // the flags. First we change the predicate to a supported conditon code, + // the flags. First we change the predicate to a supported condition code, // which will be the first branch. Later on we will emit the second // branch. 
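      // For instance, after ucomiss the "une" predicate holds when ZF is
      // clear (not equal) or PF is set (unordered), so it decomposes into
      // JNE plus JP, both targeting the true block; "oeq" (ZF set and PF
      // clear) is handled by inverting it into "une" and swapping the
      // successors.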
bool NeedExtraBranch = false; @@ -1283,11 +1381,10 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { break; } - X86::CondCode CC; bool SwapArgs; unsigned BranchOpc; - std::tie(CC, SwapArgs) = getX86ConditonCode(Predicate); - assert(CC <= X86::LAST_VALID_COND && "Unexpected conditon code."); + std::tie(CC, SwapArgs) = getX86ConditionCode(Predicate); + assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code."); BranchOpc = X86::GetCondBranchFromCond(CC); if (SwapArgs) @@ -1315,7 +1412,7 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { FuncInfo.MBB->addSuccessor(TrueMBB, BranchWeight); // Emits an unconditional branch to the FalseBB, obtains the branch - // weight, andd adds it to the successor list. + // weight, and adds it to the successor list. FastEmitBranch(FalseMBB, DbgLoc); return true; @@ -1357,6 +1454,24 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { return true; } } + } else if (foldX86XALUIntrinsic(CC, BI, BI->getCondition())) { + // Fake request the condition, otherwise the intrinsic might be completely + // optimized away. + unsigned TmpReg = getRegForValue(BI->getCondition()); + if (TmpReg == 0) + return false; + + unsigned BranchOpc = X86::GetCondBranchFromCond(CC); + + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BranchOpc)) + .addMBB(TrueMBB); + FastEmitBranch(FalseMBB, DbgLoc); + uint32_t BranchWeight = 0; + if (FuncInfo.BPI) + BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(), + TrueMBB->getBasicBlock()); + FuncInfo.MBB->addSuccessor(TrueMBB, BranchWeight); + return true; } // Otherwise do a clumsy setcc and re-test it. @@ -1611,50 +1726,319 @@ bool X86FastISel::X86SelectDivRem(const Instruction *I) { return true; } -bool X86FastISel::X86SelectSelect(const Instruction *I) { - MVT VT; - if (!isTypeLegal(I->getType(), VT)) +/// \brief Emit a conditional move instruction (if supported) to lower +/// the select. +bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) { + // Check if the subtarget supports these instructions. + if (!Subtarget->hasCMov()) + return false; + + // FIXME: Add support for i8. + if (RetVT < MVT::i16 || RetVT > MVT::i64) return false; - // We only use cmov here, if we don't have a cmov instruction bail. - if (!Subtarget->hasCMov()) return false; + const Value *Cond = I->getOperand(0); + const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT); + bool NeedTest = true; + X86::CondCode CC = X86::COND_NE; - unsigned Opc = 0; - const TargetRegisterClass *RC = nullptr; - if (VT == MVT::i16) { - Opc = X86::CMOVE16rr; - RC = &X86::GR16RegClass; - } else if (VT == MVT::i32) { - Opc = X86::CMOVE32rr; - RC = &X86::GR32RegClass; - } else if (VT == MVT::i64) { - Opc = X86::CMOVE64rr; - RC = &X86::GR64RegClass; - } else { + // Optimize conditions coming from a compare if both instructions are in the + // same basic block (values defined in other basic blocks may not have + // initialized registers). + const auto *CI = dyn_cast<CmpInst>(Cond); + if (CI && (CI->getParent() == I->getParent())) { + CmpInst::Predicate Predicate = optimizeCmpPredicate(CI); + + // FCMP_OEQ and FCMP_UNE cannot be checked with a single instruction. 
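+    // The table below encodes the two-instruction fallbacks: OEQ ("ordered
+    // and equal") holds iff ZF is set and PF is clear, so SETNP and SETE are
+    // combined with TEST; UNE ("unordered or not equal") holds iff PF is set
+    // or ZF is clear, so SETP and SETNE are combined with OR.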
+ static unsigned SETFOpcTable[2][3] = { + { X86::SETNPr, X86::SETEr , X86::TEST8rr }, + { X86::SETPr, X86::SETNEr, X86::OR8rr } + }; + unsigned *SETFOpc = nullptr; + switch (Predicate) { + default: break; + case CmpInst::FCMP_OEQ: + SETFOpc = &SETFOpcTable[0][0]; + Predicate = CmpInst::ICMP_NE; + break; + case CmpInst::FCMP_UNE: + SETFOpc = &SETFOpcTable[1][0]; + Predicate = CmpInst::ICMP_NE; + break; + } + + bool NeedSwap; + std::tie(CC, NeedSwap) = getX86ConditionCode(Predicate); + assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code."); + + const Value *CmpLHS = CI->getOperand(0); + const Value *CmpRHS = CI->getOperand(1); + if (NeedSwap) + std::swap(CmpLHS, CmpRHS); + + EVT CmpVT = TLI.getValueType(CmpLHS->getType()); + // Emit a compare of the LHS and RHS, setting the flags. + if (!X86FastEmitCompare(CmpLHS, CmpRHS, CmpVT)) + return false; + + if (SETFOpc) { + unsigned FlagReg1 = createResultReg(&X86::GR8RegClass); + unsigned FlagReg2 = createResultReg(&X86::GR8RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[0]), + FlagReg1); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[1]), + FlagReg2); + auto const &II = TII.get(SETFOpc[2]); + if (II.getNumDefs()) { + unsigned TmpReg = createResultReg(&X86::GR8RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, TmpReg) + .addReg(FlagReg2).addReg(FlagReg1); + } else { + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II) + .addReg(FlagReg2).addReg(FlagReg1); + } + } + NeedTest = false; + } else if (foldX86XALUIntrinsic(CC, I, Cond)) { + // Fake request the condition, otherwise the intrinsic might be completely + // optimized away. + unsigned TmpReg = getRegForValue(Cond); + if (TmpReg == 0) + return false; + + NeedTest = false; + } + + if (NeedTest) { + // Selects operate on i1, however, CondReg is 8 bits width and may contain + // garbage. Indeed, only the less significant bit is supposed to be + // accurate. If we read more than the lsb, we may see non-zero values + // whereas lsb is zero. Therefore, we have to truncate Op0Reg to i1 for + // the select. This is achieved by performing TEST against 1. + unsigned CondReg = getRegForValue(Cond); + if (CondReg == 0) + return false; + bool CondIsKill = hasTrivialKill(Cond); + + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri)) + .addReg(CondReg, getKillRegState(CondIsKill)).addImm(1); + } + + const Value *LHS = I->getOperand(1); + const Value *RHS = I->getOperand(2); + + unsigned RHSReg = getRegForValue(RHS); + bool RHSIsKill = hasTrivialKill(RHS); + + unsigned LHSReg = getRegForValue(LHS); + bool LHSIsKill = hasTrivialKill(LHS); + + if (!LHSReg || !RHSReg) + return false; + + unsigned Opc = X86::getCMovFromCond(CC, RC->getSize()); + unsigned ResultReg = FastEmitInst_rr(Opc, RC, RHSReg, RHSIsKill, + LHSReg, LHSIsKill); + UpdateValueMap(I, ResultReg); + return true; +} + +/// \brief Emit SSE instructions to lower the select. +/// +/// Try to use SSE1/SSE2 instructions to simulate a select without branches. +/// This lowers fp selects into a CMP/AND/ANDN/OR sequence when the necessary +/// SSE instructions are available. +bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) { + // Optimize conditions coming from a compare if both instructions are in the + // same basic block (values defined in other basic blocks may not have + // initialized registers). 
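+  // (For reference: the emission at the bottom of this function is the
+  // classic branchless SSE select -- CMPSS/CMPSD materializes an all-ones or
+  // all-zeros mask from the predicate, and the result is (mask & LHS) |
+  // (~mask & RHS) via ANDPS, ANDNPS and ORPS.)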
+ const auto *CI = dyn_cast<CmpInst>(I->getOperand(0)); + if (!CI || (CI->getParent() != I->getParent())) return false; + + if (I->getType() != CI->getOperand(0)->getType() || + !((Subtarget->hasSSE1() && RetVT == MVT::f32) || + (Subtarget->hasSSE2() && RetVT == MVT::f64))) + return false; + + const Value *CmpLHS = CI->getOperand(0); + const Value *CmpRHS = CI->getOperand(1); + CmpInst::Predicate Predicate = optimizeCmpPredicate(CI); + + // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x, 0.0. + // We don't have to materialize a zero constant for this case and can just use + // %x again on the RHS. + if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) { + const auto *CmpRHSC = dyn_cast<ConstantFP>(CmpRHS); + if (CmpRHSC && CmpRHSC->isNullValue()) + CmpRHS = CmpLHS; } - unsigned Op0Reg = getRegForValue(I->getOperand(0)); - if (Op0Reg == 0) return false; - unsigned Op1Reg = getRegForValue(I->getOperand(1)); - if (Op1Reg == 0) return false; - unsigned Op2Reg = getRegForValue(I->getOperand(2)); - if (Op2Reg == 0) return false; - - // Selects operate on i1, however, Op0Reg is 8 bits width and may contain - // garbage. Indeed, only the less significant bit is supposed to be accurate. - // If we read more than the lsb, we may see non-zero values whereas lsb - // is zero. Therefore, we have to truncate Op0Reg to i1 for the select. - // This is achieved by performing TEST against 1. - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri)) - .addReg(Op0Reg).addImm(1); - unsigned ResultReg = createResultReg(RC); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) - .addReg(Op1Reg).addReg(Op2Reg); + unsigned CC; + bool NeedSwap; + std::tie(CC, NeedSwap) = getX86SSEConditionCode(Predicate); + if (CC > 7) + return false; + + if (NeedSwap) + std::swap(CmpLHS, CmpRHS); + + static unsigned OpcTable[2][2][4] = { + { { X86::CMPSSrr, X86::FsANDPSrr, X86::FsANDNPSrr, X86::FsORPSrr }, + { X86::VCMPSSrr, X86::VFsANDPSrr, X86::VFsANDNPSrr, X86::VFsORPSrr } }, + { { X86::CMPSDrr, X86::FsANDPDrr, X86::FsANDNPDrr, X86::FsORPDrr }, + { X86::VCMPSDrr, X86::VFsANDPDrr, X86::VFsANDNPDrr, X86::VFsORPDrr } } + }; + + bool HasAVX = Subtarget->hasAVX(); + unsigned *Opc = nullptr; + switch (RetVT.SimpleTy) { + default: return false; + case MVT::f32: Opc = &OpcTable[0][HasAVX][0]; break; + case MVT::f64: Opc = &OpcTable[1][HasAVX][0]; break; + } + + const Value *LHS = I->getOperand(1); + const Value *RHS = I->getOperand(2); + + unsigned LHSReg = getRegForValue(LHS); + bool LHSIsKill = hasTrivialKill(LHS); + + unsigned RHSReg = getRegForValue(RHS); + bool RHSIsKill = hasTrivialKill(RHS); + + unsigned CmpLHSReg = getRegForValue(CmpLHS); + bool CmpLHSIsKill = hasTrivialKill(CmpLHS); + + unsigned CmpRHSReg = getRegForValue(CmpRHS); + bool CmpRHSIsKill = hasTrivialKill(CmpRHS); + + if (!LHSReg || !RHSReg || !CmpLHSReg || !CmpRHSReg) + return false; + + const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT); + unsigned CmpReg = FastEmitInst_rri(Opc[0], RC, CmpLHSReg, CmpLHSIsKill, + CmpRHSReg, CmpRHSIsKill, CC); + unsigned AndReg = FastEmitInst_rr(Opc[1], RC, CmpReg, /*IsKill=*/false, + LHSReg, LHSIsKill); + unsigned AndNReg = FastEmitInst_rr(Opc[2], RC, CmpReg, /*IsKill=*/true, + RHSReg, RHSIsKill); + unsigned ResultReg = FastEmitInst_rr(Opc[3], RC, AndNReg, /*IsKill=*/true, + AndReg, /*IsKill=*/true); + UpdateValueMap(I, ResultReg); + return true; +} + +bool X86FastISel::X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I) { + // These are pseudo CMOV instructions and 
will be later expanded into control- + // flow. + unsigned Opc; + switch (RetVT.SimpleTy) { + default: return false; + case MVT::i8: Opc = X86::CMOV_GR8; break; + case MVT::i16: Opc = X86::CMOV_GR16; break; + case MVT::i32: Opc = X86::CMOV_GR32; break; + case MVT::f32: Opc = X86::CMOV_FR32; break; + case MVT::f64: Opc = X86::CMOV_FR64; break; + } + + const Value *Cond = I->getOperand(0); + X86::CondCode CC = X86::COND_NE; + + // Optimize conditions coming from a compare if both instructions are in the + // same basic block (values defined in other basic blocks may not have + // initialized registers). + const auto *CI = dyn_cast(Cond); + if (CI && (CI->getParent() == I->getParent())) { + bool NeedSwap; + std::tie(CC, NeedSwap) = getX86ConditionCode(CI->getPredicate()); + if (CC > X86::LAST_VALID_COND) + return false; + + const Value *CmpLHS = CI->getOperand(0); + const Value *CmpRHS = CI->getOperand(1); + + if (NeedSwap) + std::swap(CmpLHS, CmpRHS); + + EVT CmpVT = TLI.getValueType(CmpLHS->getType()); + if (!X86FastEmitCompare(CmpLHS, CmpRHS, CmpVT)) + return false; + } else { + unsigned CondReg = getRegForValue(Cond); + if (CondReg == 0) + return false; + bool CondIsKill = hasTrivialKill(Cond); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri)) + .addReg(CondReg, getKillRegState(CondIsKill)).addImm(1); + } + + const Value *LHS = I->getOperand(1); + const Value *RHS = I->getOperand(2); + + unsigned LHSReg = getRegForValue(LHS); + bool LHSIsKill = hasTrivialKill(LHS); + + unsigned RHSReg = getRegForValue(RHS); + bool RHSIsKill = hasTrivialKill(RHS); + + if (!LHSReg || !RHSReg) + return false; + + const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT); + + unsigned ResultReg = + FastEmitInst_rri(Opc, RC, RHSReg, RHSIsKill, LHSReg, LHSIsKill, CC); UpdateValueMap(I, ResultReg); return true; } +bool X86FastISel::X86SelectSelect(const Instruction *I) { + MVT RetVT; + if (!isTypeLegal(I->getType(), RetVT)) + return false; + + // Check if we can fold the select. + if (const auto *CI = dyn_cast(I->getOperand(0))) { + CmpInst::Predicate Predicate = optimizeCmpPredicate(CI); + const Value *Opnd = nullptr; + switch (Predicate) { + default: break; + case CmpInst::FCMP_FALSE: Opnd = I->getOperand(2); break; + case CmpInst::FCMP_TRUE: Opnd = I->getOperand(1); break; + } + // No need for a select anymore - this is an unconditional move. + if (Opnd) { + unsigned OpReg = getRegForValue(Opnd); + if (OpReg == 0) + return false; + bool OpIsKill = hasTrivialKill(Opnd); + const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT); + unsigned ResultReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), ResultReg) + .addReg(OpReg, getKillRegState(OpIsKill)); + UpdateValueMap(I, ResultReg); + return true; + } + } + + // First try to use real conditional move instructions. + if (X86FastEmitCMoveSelect(RetVT, I)) + return true; + + // Try to use a sequence of SSE instructions to simulate a conditional move. + if (X86FastEmitSSESelect(RetVT, I)) + return true; + + // Fall-back to pseudo conditional move instructions, which will be later + // converted to control-flow. + if (X86FastEmitPseudoSelect(RetVT, I)) + return true; + + return false; +} + bool X86FastISel::X86SelectFPExt(const Instruction *I) { // fpext from float to double. 
if (X86ScalarSSEf64 && @@ -1779,8 +2163,8 @@ bool X86FastISel::TryEmitSmallMemcpy(X86AddressMode DestAM, return true; } -static bool isCommutativeIntrinsic(IntrinsicInst const &I) { - switch (I.getIntrinsicID()) { +static bool isCommutativeIntrinsic(IntrinsicInst const *II) { + switch (II->getIntrinsicID()) { case Intrinsic::sadd_with_overflow: case Intrinsic::uadd_with_overflow: case Intrinsic::smul_with_overflow: @@ -1791,12 +2175,12 @@ static bool isCommutativeIntrinsic(IntrinsicInst const &I) { } } -bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { +bool X86FastISel::FastLowerIntrinsicCall(const IntrinsicInst *II) { // FIXME: Handle more intrinsics. - switch (I.getIntrinsicID()) { + switch (II->getIntrinsicID()) { default: return false; case Intrinsic::frameaddress: { - Type *RetTy = I.getCalledFunction()->getReturnType(); + Type *RetTy = II->getCalledFunction()->getReturnType(); MVT VT; if (!isTypeLegal(RetTy, VT)) @@ -1836,7 +2220,7 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { // movq (%rax), %rax // ... unsigned DestReg; - unsigned Depth = cast(I.getOperand(0))->getZExtValue(); + unsigned Depth = cast(II->getOperand(0))->getZExtValue(); while (Depth--) { DestReg = createResultReg(RC); addDirectMem(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, @@ -1844,23 +2228,23 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { SrcReg = DestReg; } - UpdateValueMap(&I, SrcReg); + UpdateValueMap(II, SrcReg); return true; } case Intrinsic::memcpy: { - const MemCpyInst &MCI = cast(I); + const MemCpyInst *MCI = cast(II); // Don't handle volatile or variable length memcpys. - if (MCI.isVolatile()) + if (MCI->isVolatile()) return false; - if (isa(MCI.getLength())) { + if (isa(MCI->getLength())) { // Small memcpy's are common enough that we want to do them // without a call if possible. - uint64_t Len = cast(MCI.getLength())->getZExtValue(); + uint64_t Len = cast(MCI->getLength())->getZExtValue(); if (IsMemcpySmall(Len)) { X86AddressMode DestAM, SrcAM; - if (!X86SelectAddress(MCI.getRawDest(), DestAM) || - !X86SelectAddress(MCI.getRawSource(), SrcAM)) + if (!X86SelectAddress(MCI->getRawDest(), DestAM) || + !X86SelectAddress(MCI->getRawSource(), SrcAM)) return false; TryEmitSmallMemcpy(DestAM, SrcAM, Len); return true; @@ -1868,35 +2252,35 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { } unsigned SizeWidth = Subtarget->is64Bit() ? 64 : 32; - if (!MCI.getLength()->getType()->isIntegerTy(SizeWidth)) + if (!MCI->getLength()->getType()->isIntegerTy(SizeWidth)) return false; - if (MCI.getSourceAddressSpace() > 255 || MCI.getDestAddressSpace() > 255) + if (MCI->getSourceAddressSpace() > 255 || MCI->getDestAddressSpace() > 255) return false; - return DoSelectCall(&I, "memcpy"); + return LowerCallTo(II, "memcpy", II->getNumArgOperands() - 2); } case Intrinsic::memset: { - const MemSetInst &MSI = cast(I); + const MemSetInst *MSI = cast(II); - if (MSI.isVolatile()) + if (MSI->isVolatile()) return false; unsigned SizeWidth = Subtarget->is64Bit() ? 64 : 32; - if (!MSI.getLength()->getType()->isIntegerTy(SizeWidth)) + if (!MSI->getLength()->getType()->isIntegerTy(SizeWidth)) return false; - if (MSI.getDestAddressSpace() > 255) + if (MSI->getDestAddressSpace() > 255) return false; - return DoSelectCall(&I, "memset"); + return LowerCallTo(II, "memset", II->getNumArgOperands() - 2); } case Intrinsic::stackprotector: { // Emit code to store the stack guard onto the stack. 
EVT PtrTy = TLI.getPointerTy(); - const Value *Op1 = I.getArgOperand(0); // The guard's value. - const AllocaInst *Slot = cast(I.getArgOperand(1)); + const Value *Op1 = II->getArgOperand(0); // The guard's value. + const AllocaInst *Slot = cast(II->getArgOperand(1)); MFI.setStackProtectorIndex(FuncInfo.StaticAllocaMap[Slot]); @@ -1907,7 +2291,7 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { return true; } case Intrinsic::dbg_declare: { - const DbgDeclareInst *DI = cast(&I); + const DbgDeclareInst *DI = cast(II); X86AddressMode AM; assert(DI->getAddress() && "Null address should be checked earlier!"); if (!X86SelectAddress(DI->getAddress(), AM)) @@ -1927,13 +2311,13 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { if (!Subtarget->hasSSE1()) return false; - Type *RetTy = I.getCalledFunction()->getReturnType(); + Type *RetTy = II->getCalledFunction()->getReturnType(); MVT VT; if (!isTypeLegal(RetTy, VT)) return false; - // Unfortunatelly we can't use FastEmit_r, because the AVX version of FSQRT + // Unfortunately we can't use FastEmit_r, because the AVX version of FSQRT // is not generated by FastISel yet. // FIXME: Update this code once tablegen can handle it. static const unsigned SqrtOpc[2][2] = { @@ -1949,7 +2333,7 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { case MVT::f64: Opc = SqrtOpc[1][HasAVX]; RC = &X86::FR64RegClass; break; } - const Value *SrcVal = I.getArgOperand(0); + const Value *SrcVal = II->getArgOperand(0); unsigned SrcReg = getRegForValue(SrcVal); if (SrcReg == 0) @@ -1972,7 +2356,7 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { MIB.addReg(SrcReg); - UpdateValueMap(&I, ResultReg); + UpdateValueMap(II, ResultReg); return true; } case Intrinsic::sadd_with_overflow: @@ -1982,8 +2366,8 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { case Intrinsic::smul_with_overflow: case Intrinsic::umul_with_overflow: { // This implements the basic lowering of the xalu with overflow intrinsics - // into add/sub/mul folowed by either seto or setb. - const Function *Callee = I.getCalledFunction(); + // into add/sub/mul followed by either seto or setb. + const Function *Callee = II->getCalledFunction(); auto *Ty = cast(Callee->getReturnType()); Type *RetTy = Ty->getTypeAtIndex(0U); Type *CondTy = Ty->getTypeAtIndex(1); @@ -1995,16 +2379,16 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { if (VT < MVT::i8 || VT > MVT::i64) return false; - const Value *LHS = I.getArgOperand(0); - const Value *RHS = I.getArgOperand(1); + const Value *LHS = II->getArgOperand(0); + const Value *RHS = II->getArgOperand(1); - // Canonicalize immediates to the RHS. + // Canonicalize immediate to the RHS. 
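+    // e.g. (illustrative) "sadd.with.overflow(i32 2, i32 %x)" is handled as
+    // "sadd.with.overflow(i32 %x, i32 2)"; only commutative intrinsics are
+    // swapped.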
if (isa(LHS) && !isa(RHS) && - isCommutativeIntrinsic(I)) + isCommutativeIntrinsic(II)) std::swap(LHS, RHS); unsigned BaseOpc, CondOpc; - switch (I.getIntrinsicID()) { + switch (II->getIntrinsicID()) { default: llvm_unreachable("Unexpected intrinsic!"); case Intrinsic::sadd_with_overflow: BaseOpc = ISD::ADD; CondOpc = X86::SETOr; break; @@ -2015,7 +2399,7 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { case Intrinsic::usub_with_overflow: BaseOpc = ISD::SUB; CondOpc = X86::SETBr; break; case Intrinsic::smul_with_overflow: - BaseOpc = ISD::MUL; CondOpc = X86::SETOr; break; + BaseOpc = X86ISD::SMUL; CondOpc = X86::SETOr; break; case Intrinsic::umul_with_overflow: BaseOpc = X86ISD::UMUL; CondOpc = X86::SETOr; break; } @@ -2043,10 +2427,11 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { RHSIsKill); } - // FastISel doesn't have a pattern for X86::MUL*r. Emit it manually. + // FastISel doesn't have a pattern for all X86::MUL*r and X86::IMUL*r. Emit + // it manually. if (BaseOpc == X86ISD::UMUL && !ResultReg) { static const unsigned MULOpc[] = - { X86::MUL8r, X86::MUL16r, X86::MUL32r, X86::MUL64r }; + { X86::MUL8r, X86::MUL16r, X86::MUL32r, X86::MUL64r }; static const unsigned Reg[] = { X86::AL, X86::AX, X86::EAX, X86::RAX }; // First copy the first operand into RAX, which is an implicit input to // the X86::MUL*r instruction. @@ -2055,6 +2440,21 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { .addReg(LHSReg, getKillRegState(LHSIsKill)); ResultReg = FastEmitInst_r(MULOpc[VT.SimpleTy-MVT::i8], TLI.getRegClassFor(VT), RHSReg, RHSIsKill); + } else if (BaseOpc == X86ISD::SMUL && !ResultReg) { + static const unsigned MULOpc[] = + { X86::IMUL8r, X86::IMUL16rr, X86::IMUL32rr, X86::IMUL64rr }; + if (VT == MVT::i8) { + // Copy the first operand into AL, which is an implicit input to the + // X86::IMUL8r instruction. + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), X86::AL) + .addReg(LHSReg, getKillRegState(LHSIsKill)); + ResultReg = FastEmitInst_r(MULOpc[0], TLI.getRegClassFor(VT), RHSReg, + RHSIsKill); + } else + ResultReg = FastEmitInst_rr(MULOpc[VT.SimpleTy-MVT::i8], + TLI.getRegClassFor(VT), LHSReg, LHSIsKill, + RHSReg, RHSIsKill); } if (!ResultReg) @@ -2065,7 +2465,7 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CondOpc), ResultReg2); - UpdateValueMap(&I, ResultReg, 2); + UpdateValueMap(II, ResultReg, 2); return true; } case Intrinsic::x86_sse_cvttss2si: @@ -2073,7 +2473,7 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { case Intrinsic::x86_sse2_cvttsd2si: case Intrinsic::x86_sse2_cvttsd2si64: { bool IsInputDouble; - switch (I.getIntrinsicID()) { + switch (II->getIntrinsicID()) { default: llvm_unreachable("Unexpected intrinsic."); case Intrinsic::x86_sse_cvttss2si: case Intrinsic::x86_sse_cvttss2si64: @@ -2089,7 +2489,7 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { break; } - Type *RetTy = I.getCalledFunction()->getReturnType(); + Type *RetTy = II->getCalledFunction()->getReturnType(); MVT VT; if (!isTypeLegal(RetTy, VT)) return false; @@ -2109,7 +2509,7 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { } // Check if we can fold insertelement instructions into the convert. 
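  // e.g. (illustrative) "%v = insertelement <4 x float> undef, float %x, i32 0"
  // feeding cvttss2si lets us use %x directly, since the conversion only
  // reads element 0.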
- const Value *Op = I.getArgOperand(0); + const Value *Op = II->getArgOperand(0); while (auto *IE = dyn_cast(Op)) { const Value *Index = IE->getOperand(2); if (!isa(Index)) @@ -2131,7 +2531,7 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) .addReg(Reg); - UpdateValueMap(&I, ResultReg); + UpdateValueMap(II, ResultReg); return true; } } @@ -2232,53 +2632,52 @@ bool X86FastISel::FastLowerArguments() { return true; } -bool X86FastISel::X86SelectCall(const Instruction *I) { - const CallInst *CI = cast(I); - const Value *Callee = CI->getCalledValue(); - - // Can't handle inline asm yet. - if (isa(Callee)) - return false; - - // Handle intrinsic calls. - if (const IntrinsicInst *II = dyn_cast(CI)) - return X86VisitIntrinsicCall(*II); - - // Allow SelectionDAG isel to handle tail calls. - if (cast(I)->isTailCall()) - return false; - - return DoSelectCall(I, nullptr); -} - -static unsigned computeBytesPoppedByCallee(const X86Subtarget &Subtarget, - const ImmutableCallSite &CS) { - if (Subtarget.is64Bit()) +static unsigned computeBytesPoppedByCallee(const X86Subtarget *Subtarget, + CallingConv::ID CC, + ImmutableCallSite *CS) { + if (Subtarget->is64Bit()) return 0; - if (Subtarget.getTargetTriple().isOSMSVCRT()) + if (Subtarget->getTargetTriple().isOSMSVCRT()) return 0; - CallingConv::ID CC = CS.getCallingConv(); - if (CC == CallingConv::Fast || CC == CallingConv::GHC) + if (CC == CallingConv::Fast || CC == CallingConv::GHC || + CC == CallingConv::HiPE) return 0; - if (!CS.paramHasAttr(1, Attribute::StructRet)) + if (CS && !CS->paramHasAttr(1, Attribute::StructRet)) return 0; - if (CS.paramHasAttr(1, Attribute::InReg)) + if (CS && CS->paramHasAttr(1, Attribute::InReg)) return 0; return 4; } -// Select either a call, or an llvm.memcpy/memmove/memset intrinsic -bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) { - const CallInst *CI = cast(I); - const Value *Callee = CI->getCalledValue(); - - // Handle only C and fastcc calling conventions for now. - ImmutableCallSite CS(CI); - CallingConv::ID CC = CS.getCallingConv(); - bool isWin64 = Subtarget->isCallingConvWin64(CC); - if (CC != CallingConv::C && CC != CallingConv::Fast && - CC != CallingConv::X86_FastCall && CC != CallingConv::X86_64_Win64 && - CC != CallingConv::X86_64_SysV) +bool X86FastISel::FastLowerCall(CallLoweringInfo &CLI) { + auto &OutVals = CLI.OutVals; + auto &OutFlags = CLI.OutFlags; + auto &OutRegs = CLI.OutRegs; + auto &Ins = CLI.Ins; + auto &InRegs = CLI.InRegs; + CallingConv::ID CC = CLI.CallConv; + bool &IsTailCall = CLI.IsTailCall; + bool IsVarArg = CLI.IsVarArg; + const Value *Callee = CLI.Callee; + const char *SymName = CLI.SymName; + + bool Is64Bit = Subtarget->is64Bit(); + bool IsWin64 = Subtarget->isCallingConvWin64(CC); + + // Handle only C, fastcc, and webkit_js calling conventions for now. + switch (CC) { + default: return false; + case CallingConv::C: + case CallingConv::Fast: + case CallingConv::WebKit_JS: + case CallingConv::X86_FastCall: + case CallingConv::X86_64_Win64: + case CallingConv::X86_64_SysV: + break; + } + + // Allow SelectionDAG isel to handle tail calls. 
+ if (IsTailCall) return false; // fastcc with -tailcallopt is intended to provide a guaranteed @@ -2286,150 +2685,78 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) { if (CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt) return false; - PointerType *PT = cast(CS.getCalledValue()->getType()); - FunctionType *FTy = cast(PT->getElementType()); - bool isVarArg = FTy->isVarArg(); - // Don't know how to handle Win64 varargs yet. Nothing special needed for - // x86-32. Special handling for x86-64 is implemented. - if (isVarArg && isWin64) + // x86-32. Special handling for x86-64 is implemented. + if (IsVarArg && IsWin64) return false; // Don't know about inalloca yet. - if (CS.hasInAllocaArgument()) + if (CLI.CS && CLI.CS->hasInAllocaArgument()) return false; // Fast-isel doesn't know about callee-pop yet. - if (X86::isCalleePop(CC, Subtarget->is64Bit(), isVarArg, + if (X86::isCalleePop(CC, Subtarget->is64Bit(), IsVarArg, TM.Options.GuaranteedTailCallOpt)) return false; - // Check whether the function can return without sret-demotion. - SmallVector Outs; - GetReturnInfo(I->getType(), CS.getAttributes(), Outs, TLI); - bool CanLowerReturn = TLI.CanLowerReturn(CS.getCallingConv(), - *FuncInfo.MF, FTy->isVarArg(), - Outs, FTy->getContext()); - if (!CanLowerReturn) - return false; - - // Materialize callee address in a register. FIXME: GV address can be - // handled with a CALLpcrel32 instead. - X86AddressMode CalleeAM; - if (!X86SelectCallAddress(Callee, CalleeAM)) - return false; - unsigned CalleeOp = 0; - const GlobalValue *GV = nullptr; - if (CalleeAM.GV != nullptr) { - GV = CalleeAM.GV; - } else if (CalleeAM.Base.Reg != 0) { - CalleeOp = CalleeAM.Base.Reg; - } else - return false; - - // Deal with call operands first. - SmallVector ArgVals; - SmallVector Args; - SmallVector ArgVTs; - SmallVector ArgFlags; - unsigned arg_size = CS.arg_size(); - Args.reserve(arg_size); - ArgVals.reserve(arg_size); - ArgVTs.reserve(arg_size); - ArgFlags.reserve(arg_size); - for (ImmutableCallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end(); - i != e; ++i) { - // If we're lowering a mem intrinsic instead of a regular call, skip the - // last two arguments, which should not passed to the underlying functions. - if (MemIntName && e-i <= 2) - break; - Value *ArgVal = *i; - ISD::ArgFlagsTy Flags; - unsigned AttrInd = i - CS.arg_begin() + 1; - if (CS.paramHasAttr(AttrInd, Attribute::SExt)) - Flags.setSExt(); - if (CS.paramHasAttr(AttrInd, Attribute::ZExt)) - Flags.setZExt(); - - if (CS.paramHasAttr(AttrInd, Attribute::ByVal)) { - PointerType *Ty = cast(ArgVal->getType()); - Type *ElementTy = Ty->getElementType(); - unsigned FrameSize = DL.getTypeAllocSize(ElementTy); - unsigned FrameAlign = CS.getParamAlignment(AttrInd); - if (!FrameAlign) - FrameAlign = TLI.getByValTypeAlignment(ElementTy); - Flags.setByVal(); - Flags.setByValSize(FrameSize); - Flags.setByValAlign(FrameAlign); - if (!IsMemcpySmall(FrameSize)) - return false; - } - - if (CS.paramHasAttr(AttrInd, Attribute::InReg)) - Flags.setInReg(); - if (CS.paramHasAttr(AttrInd, Attribute::Nest)) - Flags.setNest(); - - // If this is an i1/i8/i16 argument, promote to i32 to avoid an extra - // instruction. This is safe because it is common to all fastisel supported - // calling conventions on x86. 
- if (ConstantInt *CI = dyn_cast(ArgVal)) { - if (CI->getBitWidth() == 1 || CI->getBitWidth() == 8 || - CI->getBitWidth() == 16) { + // If this is a constant i1/i8/i16 argument, promote to i32 to avoid an extra + // instruction. This is safe because it is common to all FastISel supported + // calling conventions on x86. + for (int i = 0, e = OutVals.size(); i != e; ++i) { + Value *&Val = OutVals[i]; + ISD::ArgFlagsTy Flags = OutFlags[i]; + if (auto *CI = dyn_cast(Val)) { + if (CI->getBitWidth() < 32) { if (Flags.isSExt()) - ArgVal = ConstantExpr::getSExt(CI,Type::getInt32Ty(CI->getContext())); + Val = ConstantExpr::getSExt(CI, Type::getInt32Ty(CI->getContext())); else - ArgVal = ConstantExpr::getZExt(CI,Type::getInt32Ty(CI->getContext())); + Val = ConstantExpr::getZExt(CI, Type::getInt32Ty(CI->getContext())); } } - unsigned ArgReg; - // Passing bools around ends up doing a trunc to i1 and passing it. // Codegen this as an argument + "and 1". - if (ArgVal->getType()->isIntegerTy(1) && isa(ArgVal) && - cast(ArgVal)->getParent() == I->getParent() && - ArgVal->hasOneUse()) { - ArgVal = cast(ArgVal)->getOperand(0); - ArgReg = getRegForValue(ArgVal); - if (ArgReg == 0) return false; - - MVT ArgVT; - if (!isTypeLegal(ArgVal->getType(), ArgVT)) return false; - - ArgReg = FastEmit_ri(ArgVT, ArgVT, ISD::AND, ArgReg, - ArgVal->hasOneUse(), 1); - } else { - ArgReg = getRegForValue(ArgVal); - } + if (auto *TI = dyn_cast(Val)) { + if (TI->getType()->isIntegerTy(1) && CLI.CS && + (TI->getParent() == CLI.CS->getInstruction()->getParent()) && + TI->hasOneUse()) { + Val = cast(Val)->getOperand(0); + unsigned ResultReg = getRegForValue(Val); + + if (!ResultReg) + return false; - if (ArgReg == 0) return false; + MVT ArgVT; + if (!isTypeLegal(Val->getType(), ArgVT)) + return false; - Type *ArgTy = ArgVal->getType(); - MVT ArgVT; - if (!isTypeLegal(ArgTy, ArgVT)) - return false; - if (ArgVT == MVT::x86mmx) - return false; - unsigned OriginalAlignment = DL.getABITypeAlignment(ArgTy); - Flags.setOrigAlign(OriginalAlignment); + ResultReg = + FastEmit_ri(ArgVT, ArgVT, ISD::AND, ResultReg, Val->hasOneUse(), 1); - Args.push_back(ArgReg); - ArgVals.push_back(ArgVal); - ArgVTs.push_back(ArgVT); - ArgFlags.push_back(Flags); + if (!ResultReg) + return false; + UpdateValueMap(Val, ResultReg); + } + } } // Analyze operands of the call, assigning locations to each operand. SmallVector ArgLocs; - CCState CCInfo(CC, isVarArg, *FuncInfo.MF, TM, ArgLocs, - I->getParent()->getContext()); + CCState CCInfo(CC, IsVarArg, *FuncInfo.MF, TM, ArgLocs, + CLI.RetTy->getContext()); // Allocate shadow area for Win64 - if (isWin64) + if (IsWin64) CCInfo.AllocateStack(32, 8); - CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags, CC_X86); + SmallVector OutVTs; + for (auto *Val : OutVals) { + MVT VT; + if (!isTypeLegal(Val->getType(), VT)) + return false; + OutVTs.push_back(VT); + } + CCInfo.AnalyzeCallOperands(OutVTs, OutFlags, CC_X86); // Get a count of how many bytes are to be pushed on the stack. unsigned NumBytes = CCInfo.getNextStackOffset(); @@ -2439,13 +2766,20 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown)) .addImm(NumBytes); - // Process argument: walk the register/memloc assignments, inserting - // copies / loads. - SmallVector RegArgs; + // Walk the register/memloc assignments, inserting copies/loads. 
+  const X86RegisterInfo *RegInfo =
+      static_cast<const X86RegisterInfo *>(TM.getRegisterInfo());
   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
-    CCValAssign &VA = ArgLocs[i];
-    unsigned Arg = Args[VA.getValNo()];
-    EVT ArgVT = ArgVTs[VA.getValNo()];
+    CCValAssign const &VA = ArgLocs[i];
+    const Value *ArgVal = OutVals[VA.getValNo()];
+    MVT ArgVT = OutVTs[VA.getValNo()];
+
+    if (ArgVT == MVT::x86mmx)
+      return false;
+
+    unsigned ArgReg = getRegForValue(ArgVal);
+    if (!ArgReg)
+      return false;

     // Promote the value if needed.
     switch (VA.getLocInfo()) {
@@ -2453,8 +2787,8 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
     case CCValAssign::SExt: {
       assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() &&
              "Unexpected extend");
-      bool Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(),
-                                       Arg, ArgVT, Arg);
+      bool Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(), ArgReg,
+                                       ArgVT, ArgReg);
       assert(Emitted && "Failed to emit a sext!"); (void)Emitted;
       ArgVT = VA.getLocVT();
       break;
     }
@@ -2462,8 +2796,8 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
     case CCValAssign::ZExt: {
       assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() &&
              "Unexpected extend");
-      bool Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(),
-                                       Arg, ArgVT, Arg);
+      bool Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(), ArgReg,
+                                       ArgVT, ArgReg);
       assert(Emitted && "Failed to emit a zext!"); (void)Emitted;
       ArgVT = VA.getLocVT();
       break;
     }
@@ -2471,66 +2805,67 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
     case CCValAssign::AExt: {
       assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() &&
              "Unexpected extend");
-      bool Emitted = X86FastEmitExtend(ISD::ANY_EXTEND, VA.getLocVT(),
-                                       Arg, ArgVT, Arg);
+      bool Emitted = X86FastEmitExtend(ISD::ANY_EXTEND, VA.getLocVT(), ArgReg,
+                                       ArgVT, ArgReg);
       if (!Emitted)
-        Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(),
-                                    Arg, ArgVT, Arg);
+        Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(), ArgReg,
+                                    ArgVT, ArgReg);
       if (!Emitted)
-        Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(),
-                                    Arg, ArgVT, Arg);
+        Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(), ArgReg,
+                                    ArgVT, ArgReg);
       assert(Emitted && "Failed to emit an aext!"); (void)Emitted;
       ArgVT = VA.getLocVT();
       break;
     }
     case CCValAssign::BCvt: {
-      unsigned BC = FastEmit_r(ArgVT.getSimpleVT(), VA.getLocVT(),
-                               ISD::BITCAST, Arg, /*TODO: Kill=*/false);
-      assert(BC != 0 && "Failed to emit a bitcast!");
-      Arg = BC;
+      ArgReg = FastEmit_r(ArgVT, VA.getLocVT(), ISD::BITCAST, ArgReg,
+                          /*TODO: Kill=*/false);
+      assert(ArgReg && "Failed to emit a bitcast!");
       ArgVT = VA.getLocVT();
       break;
     }
-    case CCValAssign::VExt: 
+    case CCValAssign::VExt:
       // VExt has not been implemented, so this should be impossible to reach
      // for now. However, fall back to Selection DAG isel once implemented.
      return false;
+    case CCValAssign::FPExt:
+      llvm_unreachable("Unexpected loc info!");
    case CCValAssign::Indirect:
      // FIXME: Indirect doesn't need extending, but fast-isel doesn't fully
      // support this.
      return false;
-    case CCValAssign::FPExt:
-      llvm_unreachable("Unexpected loc info!");
     }

     if (VA.isRegLoc()) {
       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
-              TII.get(TargetOpcode::COPY), VA.getLocReg()).addReg(Arg);
-      RegArgs.push_back(VA.getLocReg());
+              TII.get(TargetOpcode::COPY), VA.getLocReg()).addReg(ArgReg);
+      OutRegs.push_back(VA.getLocReg());
     } else {
+      assert(VA.isMemLoc());
       unsigned LocMemOffset = VA.getLocMemOffset();
       X86AddressMode AM;
-      const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
-          getTargetMachine()->getRegisterInfo());
       AM.Base.Reg = RegInfo->getStackRegister();
       AM.Disp = LocMemOffset;
-      const Value *ArgVal = ArgVals[VA.getValNo()];
-      ISD::ArgFlagsTy Flags = ArgFlags[VA.getValNo()];
-
+      ISD::ArgFlagsTy Flags = OutFlags[VA.getValNo()];
+      unsigned Alignment = DL.getABITypeAlignment(ArgVal->getType());
+      MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
+        MachinePointerInfo::getStack(LocMemOffset), MachineMemOperand::MOStore,
+        ArgVT.getStoreSize(), Alignment);
       if (Flags.isByVal()) {
         X86AddressMode SrcAM;
-        SrcAM.Base.Reg = Arg;
-        bool Res = TryEmitSmallMemcpy(AM, SrcAM, Flags.getByValSize());
-        assert(Res && "memcpy length already checked!"); (void)Res;
+        SrcAM.Base.Reg = ArgReg;
+        if (!TryEmitSmallMemcpy(AM, SrcAM, Flags.getByValSize()))
+          return false;
       } else if (isa<ConstantInt>(ArgVal) || isa<ConstantPointerNull>(ArgVal)) {
         // If this is a really simple value, emit this with the Value* version
         // of X86FastEmitStore. If it isn't simple, we don't want to do this,
         // as it can cause us to reevaluate the argument.
-        if (!X86FastEmitStore(ArgVT, ArgVal, AM))
+        if (!X86FastEmitStore(ArgVT, ArgVal, AM, MMO))
           return false;
       } else {
-        if (!X86FastEmitStore(ArgVT, Arg, /*ValIsKill=*/false, AM))
+        bool ValIsKill = hasTrivialKill(ArgVal);
+        if (!X86FastEmitStore(ArgVT, ArgReg, ValIsKill, AM, MMO))
           return false;
       }
     }
@@ -2544,37 +2879,53 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
             TII.get(TargetOpcode::COPY), X86::EBX).addReg(Base);
   }

-  if (Subtarget->is64Bit() && isVarArg && !isWin64) {
+  if (Is64Bit && IsVarArg && !IsWin64) {
+    // From the AMD64 ABI document:
+    //   For calls that may call functions that use varargs or stdargs
+    //   (prototype-less calls or calls to functions containing ellipsis (...)
+    //   in the declaration) %al is used as a hidden argument to specify the
+    //   number of SSE registers used. The contents of %al do not need to match
+    //   exactly the number of registers, but must be an upper bound on the
+    //   number of SSE registers used and is in the range 0 - 8 inclusive.
+    // Count the number of XMM registers allocated.
     static const MCPhysReg XMMArgRegs[] = {
       X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
       X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
     };
     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
+    assert((Subtarget->hasSSE1() || !NumXMMRegs)
+           && "SSE registers cannot be used when SSE is disabled");
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV8ri),
             X86::AL).addImm(NumXMMRegs);
   }

+  // Materialize callee address in a register. FIXME: GV address can be
+  // handled with a CALLpcrel32 instead.
+  X86AddressMode CalleeAM;
+  if (!X86SelectCallAddress(Callee, CalleeAM))
+    return false;
+
+  unsigned CalleeOp = 0;
+  const GlobalValue *GV = nullptr;
+  if (CalleeAM.GV != nullptr) {
+    GV = CalleeAM.GV;
+  } else if (CalleeAM.Base.Reg != 0) {
+    CalleeOp = CalleeAM.Base.Reg;
+  } else
+    return false;
+
   // Issue the call.
   MachineInstrBuilder MIB;
   if (CalleeOp) {
     // Register-indirect call.
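(The %al rule quoted in the comment above is easy to observe with any variadic call that passes an SSE value; a minimal sketch, assuming an x86-64 SysV target:)

#include <cstdio>

int main() {
  // The double travels in %xmm0, so before the call the compiler emits
  // "movb $1, %al" (the MOV8ri of NumXMMRegs above); %al only has to be
  // an upper bound on the number of SSE registers actually used.
  std::printf("%f\n", 1.0);
  return 0;
}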
- unsigned CallOpc; - if (Subtarget->is64Bit()) - CallOpc = X86::CALL64r; - else - CallOpc = X86::CALL32r; + unsigned CallOpc = Is64Bit ? X86::CALL64r : X86::CALL32r; MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CallOpc)) .addReg(CalleeOp); - } else { // Direct call. assert(GV && "Not a direct call"); - unsigned CallOpc; - if (Subtarget->is64Bit()) - CallOpc = X86::CALL64pcrel32; - else - CallOpc = X86::CALLpcrel32; + unsigned CallOpc = Is64Bit ? X86::CALL64pcrel32 : X86::CALLpcrel32; // See if we need any target-specific flags on the GV operand. unsigned char OpFlags = 0; @@ -2597,114 +2948,97 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) { OpFlags = X86II::MO_DARWIN_STUB; } - MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CallOpc)); - if (MemIntName) - MIB.addExternalSymbol(MemIntName, OpFlags); + if (SymName) + MIB.addExternalSymbol(SymName, OpFlags); else MIB.addGlobalAddress(GV, 0, OpFlags); } - // Add a register mask with the call-preserved registers. + // Add a register mask operand representing the call-preserved registers. // Proper defs for return values will be added by setPhysRegsDeadExcept(). - MIB.addRegMask(TRI.getCallPreservedMask(CS.getCallingConv())); + MIB.addRegMask(TRI.getCallPreservedMask(CC)); // Add an implicit use GOT pointer in EBX. if (Subtarget->isPICStyleGOT()) MIB.addReg(X86::EBX, RegState::Implicit); - if (Subtarget->is64Bit() && isVarArg && !isWin64) + if (Is64Bit && IsVarArg && !IsWin64) MIB.addReg(X86::AL, RegState::Implicit); // Add implicit physical register uses to the call. - for (unsigned i = 0, e = RegArgs.size(); i != e; ++i) - MIB.addReg(RegArgs[i], RegState::Implicit); + for (auto Reg : OutRegs) + MIB.addReg(Reg, RegState::Implicit); // Issue CALLSEQ_END + unsigned NumBytesForCalleeToPop = + computeBytesPoppedByCallee(Subtarget, CC, CLI.CS); unsigned AdjStackUp = TII.getCallFrameDestroyOpcode(); - const unsigned NumBytesCallee = computeBytesPoppedByCallee(*Subtarget, CS); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackUp)) - .addImm(NumBytes).addImm(NumBytesCallee); - - // Build info for return calling conv lowering code. - // FIXME: This is practically a copy-paste from TargetLowering::LowerCallTo. - SmallVector Ins; - SmallVector RetTys; - ComputeValueVTs(TLI, I->getType(), RetTys); - for (unsigned i = 0, e = RetTys.size(); i != e; ++i) { - EVT VT = RetTys[i]; - MVT RegisterVT = TLI.getRegisterType(I->getParent()->getContext(), VT); - unsigned NumRegs = TLI.getNumRegisters(I->getParent()->getContext(), VT); - for (unsigned j = 0; j != NumRegs; ++j) { - ISD::InputArg MyFlags; - MyFlags.VT = RegisterVT; - MyFlags.Used = !CS.getInstruction()->use_empty(); - if (CS.paramHasAttr(0, Attribute::SExt)) - MyFlags.Flags.setSExt(); - if (CS.paramHasAttr(0, Attribute::ZExt)) - MyFlags.Flags.setZExt(); - if (CS.paramHasAttr(0, Attribute::InReg)) - MyFlags.Flags.setInReg(); - Ins.push_back(MyFlags); - } - } + .addImm(NumBytes).addImm(NumBytesForCalleeToPop); // Now handle call return values. - SmallVector UsedRegs; SmallVector RVLocs; - CCState CCRetInfo(CC, false, *FuncInfo.MF, TM, RVLocs, - I->getParent()->getContext()); - unsigned ResultReg = FuncInfo.CreateRegs(I->getType()); + CCState CCRetInfo(CC, IsVarArg, *FuncInfo.MF, TM, RVLocs, + CLI.RetTy->getContext()); CCRetInfo.AnalyzeCallResult(Ins, RetCC_X86); + + // Copy all of the result registers out of their specified physreg. 
+ unsigned ResultReg = FuncInfo.CreateRegs(CLI.RetTy); for (unsigned i = 0; i != RVLocs.size(); ++i) { - EVT CopyVT = RVLocs[i].getValVT(); + CCValAssign &VA = RVLocs[i]; + EVT CopyVT = VA.getValVT(); unsigned CopyReg = ResultReg + i; - // If this is a call to a function that returns an fp value on the x87 fp - // stack, but where we prefer to use the value in xmm registers, copy it - // out as F80 and use a truncate to move it from fp stack reg to xmm reg. - if ((RVLocs[i].getLocReg() == X86::ST0 || - RVLocs[i].getLocReg() == X86::ST1)) { - if (isScalarFPTypeInSSEReg(RVLocs[i].getValVT())) { + // If this is x86-64, and we disabled SSE, we can't return FP values + if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) && + ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) { + report_fatal_error("SSE register return with SSE disabled"); + } + + // If this is a call to a function that returns an fp value on the floating + // point stack, we must guarantee the value is popped from the stack, so + // a COPY is not good enough - the copy instruction may be eliminated if the + // return value is not used. We use the FpPOP_RETVAL instruction instead. + if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) { + // If we prefer to use the value in xmm registers, copy it out as f80 and + // use a truncate to move it from fp stack reg to xmm reg. + if (isScalarFPTypeInSSEReg(VA.getValVT())) { CopyVT = MVT::f80; CopyReg = createResultReg(&X86::RFP80RegClass); } BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::FpPOP_RETVAL), CopyReg); + + // Round the f80 to the right size, which also moves it to the appropriate + // xmm register. This is accomplished by storing the f80 value in memory + // and then loading it back. + if (CopyVT != VA.getValVT()) { + EVT ResVT = VA.getValVT(); + unsigned Opc = ResVT == MVT::f32 ? X86::ST_Fp80m32 : X86::ST_Fp80m64; + unsigned MemSize = ResVT.getSizeInBits()/8; + int FI = MFI.CreateStackObject(MemSize, MemSize, false); + addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(Opc)), FI) + .addReg(CopyReg); + Opc = ResVT == MVT::f32 ? X86::MOVSSrm : X86::MOVSDrm; + addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(Opc), ResultReg + i), FI); + } } else { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(TargetOpcode::COPY), - CopyReg).addReg(RVLocs[i].getLocReg()); - UsedRegs.push_back(RVLocs[i].getLocReg()); - } - - if (CopyVT != RVLocs[i].getValVT()) { - // Round the F80 the right size, which also moves to the appropriate xmm - // register. This is accomplished by storing the F80 value in memory and - // then loading it back. Ewww... - EVT ResVT = RVLocs[i].getValVT(); - unsigned Opc = ResVT == MVT::f32 ? X86::ST_Fp80m32 : X86::ST_Fp80m64; - unsigned MemSize = ResVT.getSizeInBits()/8; - int FI = MFI.CreateStackObject(MemSize, MemSize, false); - addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(Opc)), FI) - .addReg(CopyReg); - Opc = ResVT == MVT::f32 ? X86::MOVSSrm : X86::MOVSDrm; - addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(Opc), ResultReg + i), FI); + TII.get(TargetOpcode::COPY), CopyReg).addReg(VA.getLocReg()); + InRegs.push_back(VA.getLocReg()); } } - if (RVLocs.size()) - UpdateValueMap(I, ResultReg, RVLocs.size()); - - // Set all unused physreg defs as dead. 
- static_cast(MIB)->setPhysRegsDeadExcept(UsedRegs, TRI); + CLI.ResultReg = ResultReg; + CLI.NumResultRegs = RVLocs.size(); + CLI.Call = MIB; return true; } - bool X86FastISel::TargetSelectInstruction(const Instruction *I) { switch (I->getOpcode()) { @@ -2722,8 +3056,6 @@ X86FastISel::TargetSelectInstruction(const Instruction *I) { return X86SelectZExt(I); case Instruction::Br: return X86SelectBranch(I); - case Instruction::Call: - return X86SelectCall(I); case Instruction::LShr: case Instruction::AShr: case Instruction::Shl: diff --git a/lib/Target/X86/X86FixupLEAs.cpp b/lib/Target/X86/X86FixupLEAs.cpp index 4be766a19f96..eb9f743226cc 100644 --- a/lib/Target/X86/X86FixupLEAs.cpp +++ b/lib/Target/X86/X86FixupLEAs.cpp @@ -7,9 +7,8 @@ // //===----------------------------------------------------------------------===// // -// This file defines the pass which will find instructions which -// can be re-written as LEA instructions in order to reduce pipeline -// delays for some models of the Intel Atom family. +// This file defines the pass that finds instructions that can be +// re-written as LEA instructions in order to reduce pipeline delays. // //===----------------------------------------------------------------------===// @@ -40,7 +39,7 @@ class FixupLEAPass : public MachineFunctionPass { /// where appropriate. bool processBasicBlock(MachineFunction &MF, MachineFunction::iterator MFI); - const char *getPassName() const override { return "X86 Atom LEA Fixup"; } + const char *getPassName() const override { return "X86 LEA Fixup"; } /// \brief Given a machine register, look for the instruction /// which writes it in the current basic block. If found, diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp index fab0560e3bcd..8c029a8c22d5 100644 --- a/lib/Target/X86/X86FrameLowering.cpp +++ b/lib/Target/X86/X86FrameLowering.cpp @@ -29,6 +29,7 @@ #include "llvm/MC/MCSymbol.h" #include "llvm/Support/CommandLine.h" #include "llvm/Target/TargetOptions.h" +#include "llvm/Support/Debug.h" using namespace llvm; @@ -305,9 +306,10 @@ static bool isEAXLiveIn(MachineFunction &MF) { return false; } -void X86FrameLowering::emitCalleeSavedFrameMoves( - MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc DL, - unsigned FramePtr) const { +void +X86FrameLowering::emitCalleeSavedFrameMoves(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc DL) const { MachineFunction &MF = *MBB.getParent(); MachineFrameInfo *MFI = MF.getFrameInfo(); MachineModuleInfo &MMI = MF.getMMI(); @@ -318,53 +320,11 @@ void X86FrameLowering::emitCalleeSavedFrameMoves( const std::vector &CSI = MFI->getCalleeSavedInfo(); if (CSI.empty()) return; - const X86RegisterInfo *RegInfo = - static_cast(MF.getTarget().getRegisterInfo()); - bool HasFP = hasFP(MF); - - // Calculate amount of bytes used for return address storing. - int stackGrowth = -RegInfo->getSlotSize(); - - // FIXME: This is dirty hack. The code itself is pretty mess right now. - // It should be rewritten from scratch and generalized sometimes. - - // Determine maximum offset (minimum due to stack growth). - int64_t MaxOffset = 0; - for (std::vector::const_iterator - I = CSI.begin(), E = CSI.end(); I != E; ++I) - MaxOffset = std::min(MaxOffset, - MFI->getObjectOffset(I->getFrameIdx())); - // Calculate offsets. - int64_t saveAreaOffset = (HasFP ? 
3 : 2) * stackGrowth;
   for (std::vector<CalleeSavedInfo>::const_iterator
          I = CSI.begin(), E = CSI.end(); I != E; ++I) {
     int64_t Offset = MFI->getObjectOffset(I->getFrameIdx());
     unsigned Reg = I->getReg();
-    Offset = MaxOffset - Offset + saveAreaOffset;
-
-    // Don't output a new machine move if we're re-saving the frame
-    // pointer. This happens when the PrologEpilogInserter has inserted an extra
-    // "PUSH" of the frame pointer -- the "emitPrologue" method automatically
-    // generates one when frame pointers are used. If we generate a "machine
-    // move" for this extra "PUSH", the linker will lose track of the fact that
-    // the frame pointer should have the value of the first "PUSH" when it's
-    // trying to unwind.
-    //
-    // FIXME: This looks inelegant. It's possibly correct, but it's covering up
-    // another bug. I.e., one where we generate a prolog like this:
-    //
-    //   pushl  %ebp
-    //   movl   %esp, %ebp
-    //   pushl  %ebp
-    //   pushl  %esi
-    //   ...
-    //
-    // The immediate re-push of EBP is unnecessary. At the least, it's an
-    // optimization bug. EBP can be used as a scratch register in certain
-    // cases, but probably not when we have a frame pointer.
-    if (HasFP && FramePtr == Reg)
-      continue;

     unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true);
     unsigned CFIIndex =
@@ -396,6 +356,84 @@ static bool usesTheStack(const MachineFunction &MF) {
 /// automatically adjust the stack pointer. Adjust the stack pointer to allocate
 /// space for local variables. Also emit labels used by the exception handler to
 /// generate the exception handling frames.
+
+/*
+  Here's a gist of what gets emitted:
+
+  ; Establish frame pointer, if needed
+  [if needs FP]
+      push  %rbp
+      .cfi_def_cfa_offset 16
+      .cfi_offset %rbp, -16
+      .seh_pushreg %rbp
+      mov  %rsp, %rbp
+      .cfi_def_cfa_register %rbp
+
+  ; Spill general-purpose registers
+  [for all callee-saved GPRs]
+      pushq %<CSReg>
+      [if not needs FP]
+         .cfi_def_cfa_offset (offset from RETADDR)
+      .seh_pushreg %<CSReg>
+
+  ; If the required stack alignment > default stack alignment
+  ; rsp needs to be re-aligned. This creates a "re-alignment gap"
+  ; of unknown size in the stack frame.
+  [if stack needs re-alignment]
+      and  $MASK, %rsp
+
+  ; Allocate space for locals
+  [if target is Windows and allocated space > 4096 bytes]
+      ; Windows needs special care for allocations larger
+      ; than one page.
+      mov $NNN, %rax
+      call ___chkstk_ms/___chkstk
+      sub  %rax, %rsp
+  [else]
+      sub  $NNN, %rsp
+
+  [if needs FP]
+      .seh_stackalloc (size of XMM spill slots)
+      .seh_setframe %rbp, SEHFrameOffset ; = size of all spill slots
+  [else]
+      .seh_stackalloc NNN
+
+  ; Spill XMMs
+  ; Note that while only the Windows 64 ABI specifies XMMs as callee-preserved,
+  ; they may get spilled on any platform if the current function
+  ; calls @llvm.eh.unwind.init
+  [if needs FP]
+      [for all callee-saved XMM registers]
+          movaps  %<xmm reg>, -MMM(%rbp)
+      [for all callee-saved XMM registers]
+          .seh_savexmm %<xmm reg>, (-MMM + SEHFrameOffset)
+              ; i.e. the offset relative to (%rbp - SEHFrameOffset)
+  [else]
+      [for all callee-saved XMM registers]
+          movaps  %<xmm reg>, KKK(%rsp)
+      [for all callee-saved XMM registers]
+          .seh_savexmm %<xmm reg>, KKK
+
+  .seh_endprologue
+
+  [if needs base pointer]
+      mov  %rsp, %rbx
+
+  ; Emit CFI info
+  [if needs FP]
+      [for all callee-saved registers]
+          .cfi_offset %<CSReg>, (offset from %rbp)
+  [else]
+      .cfi_def_cfa_offset (offset from RETADDR)
+      [for all callee-saved registers]
+          .cfi_offset %<CSReg>, (offset from %rsp)
+
+  Notes:
+  - .seh directives are emitted only for Windows 64 ABI
+  - .cfi directives are emitted for all other ABIs
+  - for 32-bit code, substitute %e??
registers for %r?? +*/ + void X86FrameLowering::emitPrologue(MachineFunction &MF) const { MachineBasicBlock &MBB = MF.front(); // Prologue goes in entry BB. MachineBasicBlock::iterator MBBI = MBB.begin(); @@ -406,8 +444,6 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); MachineModuleInfo &MMI = MF.getMMI(); X86MachineFunctionInfo *X86FI = MF.getInfo(); - bool needsFrameMoves = MMI.hasDebugInfo() || - Fn->needsUnwindTableEntry(); uint64_t MaxAlign = MFI->getMaxAlignment(); // Desired stack alignment. uint64_t StackSize = MFI->getStackSize(); // Number of bytes to allocate. bool HasFP = hasFP(MF); @@ -415,6 +451,12 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { bool Is64Bit = STI.is64Bit(); bool IsLP64 = STI.isTarget64BitLP64(); bool IsWin64 = STI.isTargetWin64(); + bool IsWinEH = + MF.getTarget().getMCAsmInfo()->getExceptionHandlingType() == + ExceptionHandling::WinEH; // Not necessarily synonymous with IsWin64. + bool NeedsWinEH = IsWinEH && Fn->needsUnwindTableEntry(); + bool NeedsDwarfCFI = + !IsWinEH && (MMI.hasDebugInfo() || Fn->needsUnwindTableEntry()); bool UseLEA = STI.useLeaForSP(); unsigned StackAlign = getStackAlignment(); unsigned SlotSize = RegInfo->getSlotSize(); @@ -512,7 +554,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { .addReg(FramePtr, RegState::Kill) .setMIFlag(MachineInstr::FrameSetup); - if (needsFrameMoves) { + if (NeedsDwarfCFI) { // Mark the place where EBP/RBP was saved. // Define the current CFA rule to use the provided offset. assert(StackSize); @@ -530,13 +572,19 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { .addCFIIndex(CFIIndex); } + if (NeedsWinEH) { + BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg)) + .addImm(FramePtr) + .setMIFlag(MachineInstr::FrameSetup); + } + // Update EBP with the new base value. BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr), FramePtr) .addReg(StackPtr) .setMIFlag(MachineInstr::FrameSetup); - if (needsFrameMoves) { + if (NeedsDwarfCFI) { // Mark effective beginning of when frame pointer becomes valid. // Define the current CFA to use the EBP/RBP register. unsigned DwarfFramePtr = RegInfo->getDwarfRegNum(FramePtr, true); @@ -546,9 +594,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { .addCFIIndex(CFIIndex); } - // Mark the FramePtr as live-in in every block except the entry. - for (MachineFunction::iterator I = std::next(MF.begin()), E = MF.end(); - I != E; ++I) + // Mark the FramePtr as live-in in every block. + for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) I->addLiveIn(FramePtr); } else { NumBytes = StackSize - X86FI->getCalleeSavedFrameSize(); @@ -562,10 +609,10 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { (MBBI->getOpcode() == X86::PUSH32r || MBBI->getOpcode() == X86::PUSH64r)) { PushedRegs = true; - MBBI->setFlag(MachineInstr::FrameSetup); + unsigned Reg = MBBI->getOperand(0).getReg(); ++MBBI; - if (!HasFP && needsFrameMoves) { + if (!HasFP && NeedsDwarfCFI) { // Mark callee-saved push instruction. // Define the current CFA rule to use the provided offset. 
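(The NeedsDwarfCFI/NeedsWinEH split introduced here means the same prologue gets described in one of two unwind encodings. A sketch of a function whose assembly makes the difference visible; the consume helper is hypothetical and only there to force a real frame:)

extern void consume(char *buf); // hypothetical helper, defined elsewhere

void demo() {
  char buf[64];
  consume(buf);
  // On ELF/Darwin targets the prologue is annotated with .cfi_def_cfa_offset
  // and .cfi_offset directives (the NeedsDwarfCFI path); on Win64 the same
  // pushes and stack allocation are annotated with .seh_pushreg,
  // .seh_stackalloc and .seh_endprologue instead (the NeedsWinEH path).
}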
    assert(StackSize);
@@ -575,16 +622,15 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
           .addCFIIndex(CFIIndex);
       StackOffset += stackGrowth;
     }
+
+    if (NeedsWinEH) {
+      BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg)).addImm(Reg).setMIFlag(
+          MachineInstr::FrameSetup);
+    }
   }

   // Realign stack after we pushed callee-saved registers (so that we'll be
   // able to calculate their offsets from the frame pointer).
-
-  // NOTE: We push the registers before realigning the stack, so
-  // vector callee-saved (xmm) registers may be saved w/o proper
-  // alignment in this way. However, currently these regs are saved in
-  // stack slots (see X86FrameLowering::spillCalleeSavedRegisters()), so
-  // this shouldn't be a problem.
   if (RegInfo->needsStackRealignment(MF)) {
     assert(HasFP && "There should be a frame pointer if stack is realigned.");
     MachineInstr *MI =
@@ -683,23 +729,88 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
       MI->setFlag(MachineInstr::FrameSetup);
       MBB.insert(MBBI, MI);
     }
-  } else if (NumBytes)
+  } else if (NumBytes) {
     emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit, IsLP64,
                  UseLEA, TII, *RegInfo);
+  }
+
+  int SEHFrameOffset = 0;
+  if (NeedsWinEH) {
+    if (HasFP) {
+      // We need to set frame base offset low enough such that all saved
+      // register offsets would be positive relative to it, but we can't
+      // just use NumBytes, because .seh_setframe offset must be <=240.
+      // So we pretend to have only allocated enough space to spill the
+      // non-volatile registers.
+      // We don't care about the rest of stack allocation, because the
+      // unwinder will restore SP to (BP - SEHFrameOffset).
+      for (const CalleeSavedInfo &Info : MFI->getCalleeSavedInfo()) {
+        int offset = MFI->getObjectOffset(Info.getFrameIdx());
+        SEHFrameOffset = std::max(SEHFrameOffset, abs(offset));
+      }
+      SEHFrameOffset += SEHFrameOffset % 16; // ensure alignment
+
+      // This only needs to account for XMM spill slots; GPR slots
+      // are covered by the .seh_pushreg's emitted above.
+      unsigned Size = SEHFrameOffset - X86FI->getCalleeSavedFrameSize();
+      if (Size) {
+        BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_StackAlloc))
+            .addImm(Size)
+            .setMIFlag(MachineInstr::FrameSetup);
+      }
+
+      BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SetFrame))
+          .addImm(FramePtr)
+          .addImm(SEHFrameOffset)
+          .setMIFlag(MachineInstr::FrameSetup);
+    } else {
+      // SP will be the base register for restoring XMMs
+      if (NumBytes) {
+        BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_StackAlloc))
+            .addImm(NumBytes)
+            .setMIFlag(MachineInstr::FrameSetup);
+      }
+    }
+  }
+
+  // Skip the rest of the register spilling code.
+  while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup))
+    ++MBBI;
+
+  // Emit SEH info for non-GPRs.
+  if (NeedsWinEH) {
+    for (const CalleeSavedInfo &Info : MFI->getCalleeSavedInfo()) {
+      unsigned Reg = Info.getReg();
+      if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg))
+        continue;
+      assert(X86::FR64RegClass.contains(Reg) && "Unexpected register class");
+
+      int Offset = getFrameIndexOffset(MF, Info.getFrameIdx());
+      Offset += SEHFrameOffset;
+
+      BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SaveXMM))
+          .addImm(Reg)
+          .addImm(Offset)
+          .setMIFlag(MachineInstr::FrameSetup);
+    }
+
+    BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_EndPrologue))
+        .setMIFlag(MachineInstr::FrameSetup);
+  }

   // If we need a base pointer, set it up here. It's whatever the value
   // of the stack pointer is at this point.
Any variable size objects // will be allocated after this, so we can still use the base pointer // to reference locals. if (RegInfo->hasBasePointer(MF)) { - // Update the frame pointer with the current stack pointer. + // Update the base pointer with the current stack pointer. unsigned Opc = Is64Bit ? X86::MOV64rr : X86::MOV32rr; BuildMI(MBB, MBBI, DL, TII.get(Opc), BasePtr) .addReg(StackPtr) .setMIFlag(MachineInstr::FrameSetup); } - if (( (!HasFP && NumBytes) || PushedRegs) && needsFrameMoves) { + if (((!HasFP && NumBytes) || PushedRegs) && NeedsDwarfCFI) { // Mark end of stack pointer adjustment. if (!HasFP && NumBytes) { // Define the current CFA rule to use the provided offset. @@ -714,7 +825,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { // Emit DWARF info specifying the offsets of the callee-saved registers. if (PushedRegs) - emitCalleeSavedFrameMoves(MBB, MBBI, DL, HasFP ? FramePtr : StackPtr); + emitCalleeSavedFrameMoves(MBB, MBBI, DL); } } @@ -974,48 +1085,97 @@ int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, return getFrameIndexOffset(MF, FI); } -bool X86FrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - const std::vector &CSI, - const TargetRegisterInfo *TRI) const { - if (CSI.empty()) - return false; +bool X86FrameLowering::assignCalleeSavedSpillSlots( + MachineFunction &MF, const TargetRegisterInfo *TRI, + std::vector &CSI) const { + MachineFrameInfo *MFI = MF.getFrameInfo(); + const X86RegisterInfo *RegInfo = + static_cast(MF.getTarget().getRegisterInfo()); + unsigned SlotSize = RegInfo->getSlotSize(); + X86MachineFunctionInfo *X86FI = MF.getInfo(); + + unsigned CalleeSavedFrameSize = 0; + int SpillSlotOffset = getOffsetOfLocalArea() + X86FI->getTCReturnAddrDelta(); + + if (hasFP(MF)) { + // emitPrologue always spills frame register the first thing. + SpillSlotOffset -= SlotSize; + MFI->CreateFixedSpillStackObject(SlotSize, SpillSlotOffset); + + // Since emitPrologue and emitEpilogue will handle spilling and restoring of + // the frame register, we can delete it from CSI list and not have to worry + // about avoiding it later. + unsigned FPReg = RegInfo->getFrameRegister(MF); + for (unsigned i = 0; i < CSI.size(); ++i) { + if (CSI[i].getReg() == FPReg) { + CSI.erase(CSI.begin() + i); + break; + } + } + } + + // Assign slots for GPRs. It increases frame size. + for (unsigned i = CSI.size(); i != 0; --i) { + unsigned Reg = CSI[i - 1].getReg(); + + if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg)) + continue; + + SpillSlotOffset -= SlotSize; + CalleeSavedFrameSize += SlotSize; + + int SlotIndex = MFI->CreateFixedSpillStackObject(SlotSize, SpillSlotOffset); + CSI[i - 1].setFrameIdx(SlotIndex); + } + + X86FI->setCalleeSavedFrameSize(CalleeSavedFrameSize); + + // Assign slots for XMMs. 
+ for (unsigned i = CSI.size(); i != 0; --i) { + unsigned Reg = CSI[i - 1].getReg(); + if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg)) + continue; + + const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg); + // ensure alignment + SpillSlotOffset -= abs(SpillSlotOffset) % RC->getAlignment(); + // spill into slot + SpillSlotOffset -= RC->getSize(); + int SlotIndex = + MFI->CreateFixedSpillStackObject(RC->getSize(), SpillSlotOffset); + CSI[i - 1].setFrameIdx(SlotIndex); + MFI->ensureMaxAlignment(RC->getAlignment()); + } + return true; +} + +bool X86FrameLowering::spillCalleeSavedRegisters( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + const std::vector &CSI, + const TargetRegisterInfo *TRI) const { DebugLoc DL = MBB.findDebugLoc(MI); MachineFunction &MF = *MBB.getParent(); - const X86RegisterInfo *RegInfo = - static_cast(MF.getTarget().getRegisterInfo()); - unsigned SlotSize = RegInfo->getSlotSize(); - unsigned FPReg = TRI->getFrameRegister(MF); - unsigned CalleeFrameSize = 0; - const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); - X86MachineFunctionInfo *X86FI = MF.getInfo(); const X86Subtarget &STI = MF.getTarget().getSubtarget(); // Push GPRs. It increases frame size. unsigned Opc = STI.is64Bit() ? X86::PUSH64r : X86::PUSH32r; for (unsigned i = CSI.size(); i != 0; --i) { - unsigned Reg = CSI[i-1].getReg(); - if (!X86::GR64RegClass.contains(Reg) && - !X86::GR32RegClass.contains(Reg)) + unsigned Reg = CSI[i - 1].getReg(); + + if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg)) continue; // Add the callee-saved register as live-in. It's killed at the spill. MBB.addLiveIn(Reg); - if (Reg == FPReg) - // X86RegisterInfo::emitPrologue will handle spilling of frame register. - continue; - CalleeFrameSize += SlotSize; + BuildMI(MBB, MI, DL, TII.get(Opc)).addReg(Reg, RegState::Kill) .setMIFlag(MachineInstr::FrameSetup); } - X86FI->setCalleeSavedFrameSize(CalleeFrameSize); - // Make XMM regs spilled. X86 does not have ability of push/pop XMM. // It can be done by spilling XMMs to stack frame. - // Note that only Win64 ABI might spill XMMs. for (unsigned i = CSI.size(); i != 0; --i) { unsigned Reg = CSI[i-1].getReg(); if (X86::GR64RegClass.contains(Reg) || @@ -1024,8 +1184,12 @@ bool X86FrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB, // Add the callee-saved register as live-in. It's killed at the spill. MBB.addLiveIn(Reg); const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); - TII.storeRegToStackSlot(MBB, MI, Reg, true, CSI[i-1].getFrameIdx(), - RC, TRI); + + TII.storeRegToStackSlot(MBB, MI, Reg, true, CSI[i - 1].getFrameIdx(), RC, + TRI); + --MI; + MI->setFlag(MachineInstr::FrameSetup); + ++MI; } return true; @@ -1050,22 +1214,19 @@ bool X86FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg)) continue; + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); - TII.loadRegFromStackSlot(MBB, MI, Reg, CSI[i].getFrameIdx(), - RC, TRI); + TII.loadRegFromStackSlot(MBB, MI, Reg, CSI[i].getFrameIdx(), RC, TRI); } // POP GPRs. - unsigned FPReg = TRI->getFrameRegister(MF); unsigned Opc = STI.is64Bit() ? X86::POP64r : X86::POP32r; for (unsigned i = 0, e = CSI.size(); i != e; ++i) { unsigned Reg = CSI[i].getReg(); if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg)) continue; - if (Reg == FPReg) - // X86RegisterInfo::emitEpilogue will handle restoring of frame register. 
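(The slot-assignment arithmetic in assignCalleeSavedSpillSlots above is worth spelling out: offsets grow downwards, so aligning a spill slot means subtracting the remainder of the absolute offset. A standalone sketch with a hypothetical helper name, mirroring the two statements used for the XMM slots:)

#include <cstdlib>

// Align the (negative, downward-growing) offset, then allocate Size bytes,
// as done above when creating fixed XMM spill stack objects.
int allocSpillSlot(int SpillSlotOffset, int Align, int Size) {
  SpillSlotOffset -= std::abs(SpillSlotOffset) % Align; // align down
  SpillSlotOffset -= Size;                              // carve out the slot
  return SpillSlotOffset;
}
// e.g. allocSpillSlot(-40, 16, 16) returns -64: -40 aligns down to -48,
// and the 16-byte slot then occupies [-64, -48).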
- continue; + BuildMI(MBB, MI, DL, TII.get(Opc), Reg); } return true; @@ -1096,22 +1257,6 @@ X86FrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, TailCallReturnAddrDelta - SlotSize, true); } - if (hasFP(MF)) { - assert((TailCallReturnAddrDelta <= 0) && - "The Delta should always be zero or negative"); - const TargetFrameLowering &TFI = *MF.getTarget().getFrameLowering(); - - // Create a frame entry for the EBP register that must be saved. - int FrameIdx = MFI->CreateFixedObject(SlotSize, - -(int)SlotSize + - TFI.getOffsetOfLocalArea() + - TailCallReturnAddrDelta, - true); - assert(FrameIdx == MFI->getObjectIndexBegin() && - "Slot for EBP register must be last in order to be found!"); - (void)FrameIdx; - } - // Spill the BasePtr if it's used. if (RegInfo->hasBasePointer(MF)) MF.getRegInfo().setPhysRegUsed(RegInfo->getBaseRegister()); diff --git a/lib/Target/X86/X86FrameLowering.h b/lib/Target/X86/X86FrameLowering.h index 5c43c1488d4a..5ad3d4dc2e97 100644 --- a/lib/Target/X86/X86FrameLowering.h +++ b/lib/Target/X86/X86FrameLowering.h @@ -27,8 +27,8 @@ class X86FrameLowering : public TargetFrameLowering { : TargetFrameLowering(StackGrowsDown, StackAl, LAO) {} void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, DebugLoc DL, - unsigned FramePtr) const; + MachineBasicBlock::iterator MBBI, + DebugLoc DL) const; /// emitProlog/emitEpilog - These methods insert prolog and epilog code into /// the function. @@ -42,6 +42,11 @@ class X86FrameLowering : public TargetFrameLowering { void processFunctionBeforeCalleeSavedScan(MachineFunction &MF, RegScavenger *RS = nullptr) const override; + bool + assignCalleeSavedSpillSlots(MachineFunction &MF, + const TargetRegisterInfo *TRI, + std::vector &CSI) const override; + bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const std::vector &CSI, diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp index 74386d33990d..ba2f5f645d09 100644 --- a/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -2126,38 +2126,6 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { return getGlobalBaseReg(); - case X86ISD::ATOMOR64_DAG: - case X86ISD::ATOMXOR64_DAG: - case X86ISD::ATOMADD64_DAG: - case X86ISD::ATOMSUB64_DAG: - case X86ISD::ATOMNAND64_DAG: - case X86ISD::ATOMAND64_DAG: - case X86ISD::ATOMMAX64_DAG: - case X86ISD::ATOMMIN64_DAG: - case X86ISD::ATOMUMAX64_DAG: - case X86ISD::ATOMUMIN64_DAG: - case X86ISD::ATOMSWAP64_DAG: { - unsigned Opc; - switch (Opcode) { - default: llvm_unreachable("Impossible opcode"); - case X86ISD::ATOMOR64_DAG: Opc = X86::ATOMOR6432; break; - case X86ISD::ATOMXOR64_DAG: Opc = X86::ATOMXOR6432; break; - case X86ISD::ATOMADD64_DAG: Opc = X86::ATOMADD6432; break; - case X86ISD::ATOMSUB64_DAG: Opc = X86::ATOMSUB6432; break; - case X86ISD::ATOMNAND64_DAG: Opc = X86::ATOMNAND6432; break; - case X86ISD::ATOMAND64_DAG: Opc = X86::ATOMAND6432; break; - case X86ISD::ATOMMAX64_DAG: Opc = X86::ATOMMAX6432; break; - case X86ISD::ATOMMIN64_DAG: Opc = X86::ATOMMIN6432; break; - case X86ISD::ATOMUMAX64_DAG: Opc = X86::ATOMUMAX6432; break; - case X86ISD::ATOMUMIN64_DAG: Opc = X86::ATOMUMIN6432; break; - case X86ISD::ATOMSWAP64_DAG: Opc = X86::ATOMSWAP6432; break; - } - SDNode *RetVal = SelectAtomic64(Node, Opc); - if (RetVal) - return RetVal; - break; - } - case ISD::ATOMIC_LOAD_XOR: case ISD::ATOMIC_LOAD_AND: case ISD::ATOMIC_LOAD_OR: diff --git a/lib/Target/X86/X86ISelLowering.cpp 
b/lib/Target/X86/X86ISelLowering.cpp index 851607eac96e..9e13fd38e13d 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -44,11 +44,13 @@ #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCSymbol.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Target/TargetOptions.h" #include +#include #include using namespace llvm; @@ -56,6 +58,17 @@ using namespace llvm; STATISTIC(NumTailCalls, "Number of tail calls"); +static cl::opt ExperimentalVectorWideningLegalization( + "x86-experimental-vector-widening-legalization", cl::init(false), + cl::desc("Enable an experimental vector type legalization through widening " + "rather than promotion."), + cl::Hidden); + +static cl::opt ExperimentalVectorShuffleLowering( + "x86-experimental-vector-shuffle-lowering", cl::init(false), + cl::desc("Enable an experimental vector shuffle lowering code path."), + cl::Hidden); + // Forward declarations. static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1, SDValue V2); @@ -502,6 +515,25 @@ void X86TargetLowering::resetOperationActions() { } } + // Special handling for half-precision floating point conversions. + // If we don't have F16C support, then lower half float conversions + // into library calls. + if (TM.Options.UseSoftFloat || !Subtarget->hasF16C()) { + setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand); + setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand); + } + + // There's never any support for operations beyond MVT::f32. + setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); + setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand); + setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand); + setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand); + + setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand); + setTruncStoreAction(MVT::f32, MVT::f16, Expand); + setTruncStoreAction(MVT::f64, MVT::f16, Expand); + setTruncStoreAction(MVT::f80, MVT::f16, Expand); + if (Subtarget->hasPOPCNT()) { setOperationAction(ISD::CTPOP , MVT::i8 , Promote); } else { @@ -585,29 +617,13 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::ATOMIC_STORE, VT, Custom); } - if (!Subtarget->is64Bit()) { - setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom); - setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom); - setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom); - setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom); - setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom); - setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom); - setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom); - setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom); - setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i64, Custom); - setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i64, Custom); - setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i64, Custom); - setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i64, Custom); - } - if (Subtarget->hasCmpxchg16b()) { setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom); } // FIXME - use subtarget debug flags - if (!Subtarget->isTargetDarwin() && - !Subtarget->isTargetELF() && - !Subtarget->isTargetCygMing()) { + if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetELF() && + !Subtarget->isTargetCygMing() && !Subtarget->isTargetWin64()) { setOperationAction(ISD::EH_LABEL, MVT::Other, Expand); } @@ -873,7 +889,12 @@ void 
X86TargetLowering::resetOperationActions() { (MVT::SimpleValueType)InnerVT, Expand); setLoadExtAction(ISD::SEXTLOAD, VT, Expand); setLoadExtAction(ISD::ZEXTLOAD, VT, Expand); - setLoadExtAction(ISD::EXTLOAD, VT, Expand); + + // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like types, + // we have to deal with them whether we ask for Expansion or not. Setting + // Expand causes its own optimisation problems though, so leave them legal. + if (VT.getVectorElementType() == MVT::i1) + setLoadExtAction(ISD::EXTLOAD, VT, Expand); } // FIXME: In order to prevent SSE instructions being expanded to MMX ones @@ -1597,6 +1618,16 @@ void X86TargetLowering::resetOperationActions() { setPrefFunctionAlignment(4); // 2^4 bytes. } +TargetLoweringBase::LegalizeTypeAction +X86TargetLowering::getPreferredVectorAction(EVT VT) const { + if (ExperimentalVectorWideningLegalization && + VT.getVectorNumElements() != 1 && + VT.getVectorElementType().getSimpleVT() != MVT::i1) + return TypeWidenVector; + + return TargetLoweringBase::getPreferredVectorAction(VT); +} + EVT X86TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const { if (!VT.isVector()) return Subtarget->hasAVX512() ? MVT::i1: MVT::i8; @@ -3028,7 +3059,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // If a tail called function callee has more arguments than the caller the // caller needs to make sure that there is room to move the RETADDR to. This is // achieved by reserving an area the size of the argument delta right after the -// original REtADDR, but before the saved framepointer or the spilled registers +// original RETADDR, but before the saved framepointer or the spilled registers // e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4) // stack layout: // arg1 @@ -4736,28 +4767,6 @@ bool X86::isZeroNode(SDValue Elt) { return false; } -/// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in -/// their permute mask. -static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp, - SelectionDAG &DAG) { - MVT VT = SVOp->getSimpleValueType(0); - unsigned NumElems = VT.getVectorNumElements(); - SmallVector MaskVec; - - for (unsigned i = 0; i != NumElems; ++i) { - int Idx = SVOp->getMaskElt(i); - if (Idx >= 0) { - if (Idx < (int)NumElems) - Idx += NumElems; - else - Idx -= NumElems; - } - MaskVec.push_back(Idx); - } - return DAG.getVectorShuffle(VT, SDLoc(SVOp), SVOp->getOperand(1), - SVOp->getOperand(0), &MaskVec[0]); -} - /// ShouldXformToMOVHLPS - Return true if the node should be transformed to /// match movhlps. The lower half elements should come from upper half of /// V1 (and in order), and the upper half elements should come from the upper @@ -4843,19 +4852,6 @@ static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2, return true; } -/// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are -/// all the same. -static bool isSplatVector(SDNode *N) { - if (N->getOpcode() != ISD::BUILD_VECTOR) - return false; - - SDValue SplatValue = N->getOperand(0); - for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i) - if (N->getOperand(i) != SplatValue) - return false; - return true; -} - /// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved /// to an zero vector. /// FIXME: move to dag combiner / method on ShuffleVectorSDNode @@ -5764,18 +5760,22 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget, return SDValue(); case ISD::BUILD_VECTOR: { - // The BUILD_VECTOR node must be a splat. 
- if (!isSplatVector(Op.getNode())) + auto *BVOp = cast(Op.getNode()); + BitVector UndefElements; + SDValue Splat = BVOp->getSplatValue(&UndefElements); + + // We need a splat of a single value to use broadcast, and it doesn't + // make any sense if the value is only in one element of the vector. + if (!Splat || (VT.getVectorNumElements() - UndefElements.count()) <= 1) return SDValue(); - Ld = Op.getOperand(0); + Ld = Splat; ConstSplatVal = (Ld.getOpcode() == ISD::Constant || - Ld.getOpcode() == ISD::ConstantFP); + Ld.getOpcode() == ISD::ConstantFP); - // The suspected load node has several users. Make sure that all - // of its users are from the BUILD_VECTOR node. - // Constants may have multiple users. - if (!ConstSplatVal && !Ld->hasNUsesOfValue(VT.getVectorNumElements(), 0)) + // Make sure that all of the users of a non-constant load are from the + // BUILD_VECTOR node. + if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode())) return SDValue(); break; } @@ -6077,21 +6077,35 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const { /// This function only analyzes elements of \p N whose indices are /// in range [BaseIdx, LastIdx). static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode, + SelectionDAG &DAG, unsigned BaseIdx, unsigned LastIdx, SDValue &V0, SDValue &V1) { + EVT VT = N->getValueType(0); + assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!"); - assert(N->getValueType(0).isVector() && - N->getValueType(0).getVectorNumElements() >= LastIdx && + assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx && "Invalid Vector in input!"); bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD); bool CanFold = true; unsigned ExpectedVExtractIdx = BaseIdx; unsigned NumElts = LastIdx - BaseIdx; + V0 = DAG.getUNDEF(VT); + V1 = DAG.getUNDEF(VT); // Check if N implements a horizontal binop. for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) { SDValue Op = N->getOperand(i + BaseIdx); + + // Skip UNDEFs. + if (Op->getOpcode() == ISD::UNDEF) { + // Update the expected vector extract index. + if (i * 2 == NumElts) + ExpectedVExtractIdx = BaseIdx; + ExpectedVExtractIdx += 2; + continue; + } + CanFold = Op->getOpcode() == Opcode && Op->hasOneUse(); if (!CanFold) @@ -6112,12 +6126,15 @@ static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode, unsigned I0 = cast(Op0.getOperand(1))->getZExtValue(); unsigned I1 = cast(Op1.getOperand(1))->getZExtValue(); - - if (i == 0) - V0 = Op0.getOperand(0); - else if (i * 2 == NumElts) { - V1 = Op0.getOperand(0); - ExpectedVExtractIdx = BaseIdx; + + if (i * 2 < NumElts) { + if (V0.getOpcode() == ISD::UNDEF) + V0 = Op0.getOperand(0); + } else { + if (V1.getOpcode() == ISD::UNDEF) + V1 = Op0.getOperand(0); + if (i * 2 == NumElts) + ExpectedVExtractIdx = BaseIdx; } SDValue Expected = (i * 2 < NumElts) ? V0 : V1; @@ -6163,9 +6180,14 @@ static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode, /// Example: /// HADD V0_LO, V1_LO /// HADD V0_HI, V1_HI +/// +/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower +/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to +/// the upper 128-bits of the result. 
static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1, SDLoc DL, SelectionDAG &DAG, - unsigned X86Opcode, bool Mode) { + unsigned X86Opcode, bool Mode, + bool isUndefLO, bool isUndefHI) { EVT VT = V0.getValueType(); assert(VT.is256BitVector() && VT == V1.getValueType() && "Invalid nodes in input!"); @@ -6177,18 +6199,150 @@ static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1, SDValue V1_HI = Extract128BitVector(V1, NumElts/2, DAG, DL); EVT NewVT = V0_LO.getValueType(); - SDValue LO, HI; + SDValue LO = DAG.getUNDEF(NewVT); + SDValue HI = DAG.getUNDEF(NewVT); + if (Mode) { - LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI); - HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI); + // Don't emit a horizontal binop if the result is expected to be UNDEF. + if (!isUndefLO && V0->getOpcode() != ISD::UNDEF) + LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI); + if (!isUndefHI && V1->getOpcode() != ISD::UNDEF) + HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI); } else { - LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO); - HI = DAG.getNode(X86Opcode, DL, NewVT, V1_HI, V1_HI); + // Don't emit a horizontal binop if the result is expected to be UNDEF. + if (!isUndefLO && (V0_LO->getOpcode() != ISD::UNDEF || + V1_LO->getOpcode() != ISD::UNDEF)) + LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO); + + if (!isUndefHI && (V0_HI->getOpcode() != ISD::UNDEF || + V1_HI->getOpcode() != ISD::UNDEF)) + HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI); } return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI); } +/// \brief Try to fold a build_vector that performs an 'addsub' into the +/// sequence of 'vadd + vsub + blendi'. +static SDValue matchAddSub(const BuildVectorSDNode *BV, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + SDLoc DL(BV); + EVT VT = BV->getValueType(0); + unsigned NumElts = VT.getVectorNumElements(); + SDValue InVec0 = DAG.getUNDEF(VT); + SDValue InVec1 = DAG.getUNDEF(VT); + + assert((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v4f32 || + VT == MVT::v2f64) && "build_vector with an invalid type found!"); + + // Don't try to emit a VSELECT that cannot be lowered into a blend. + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT)) + return SDValue(); + + // Odd-numbered elements in the input build vector are obtained from + // adding two integer/float elements. + // Even-numbered elements in the input build vector are obtained from + // subtracting two integer/float elements. + unsigned ExpectedOpcode = ISD::FSUB; + unsigned NextExpectedOpcode = ISD::FADD; + bool AddFound = false; + bool SubFound = false; + + for (unsigned i = 0, e = NumElts; i != e; i++) { + SDValue Op = BV->getOperand(i); + + // Skip 'undef' values. + unsigned Opcode = Op.getOpcode(); + if (Opcode == ISD::UNDEF) { + std::swap(ExpectedOpcode, NextExpectedOpcode); + continue; + } + + // Early exit if we found an unexpected opcode. + if (Opcode != ExpectedOpcode) + return SDValue(); + + SDValue Op0 = Op.getOperand(0); + SDValue Op1 = Op.getOperand(1); + + // Try to match the following pattern: + // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i)) + // Early exit if we cannot match that sequence. 
+ if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + !isa<ConstantSDNode>(Op0.getOperand(1)) || + !isa<ConstantSDNode>(Op1.getOperand(1)) || + Op0.getOperand(1) != Op1.getOperand(1)) + return SDValue(); + + unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue(); + if (I0 != i) + return SDValue(); + + // We found a valid add/sub node. Update the information accordingly. + if (i & 1) + AddFound = true; + else + SubFound = true; + + // Update InVec0 and InVec1. + if (InVec0.getOpcode() == ISD::UNDEF) + InVec0 = Op0.getOperand(0); + if (InVec1.getOpcode() == ISD::UNDEF) + InVec1 = Op1.getOperand(0); + + // Make sure that the operands of each add/sub node always + // come from the same pair of vectors. + if (InVec0 != Op0.getOperand(0)) { + if (ExpectedOpcode == ISD::FSUB) + return SDValue(); + + // FADD is commutable. Try to commute the operands + // and then test again. + std::swap(Op0, Op1); + if (InVec0 != Op0.getOperand(0)) + return SDValue(); + } + + if (InVec1 != Op1.getOperand(0)) + return SDValue(); + + // Update the pair of expected opcodes. + std::swap(ExpectedOpcode, NextExpectedOpcode); + } + + // Don't try to fold this build_vector into a VSELECT if it has + // too many UNDEF operands. + if (AddFound && SubFound && InVec0.getOpcode() != ISD::UNDEF && + InVec1.getOpcode() != ISD::UNDEF) { + // Emit a sequence of vector add and sub followed by a VSELECT. + // The new VSELECT will be lowered into a BLENDI. + // At ISel stage, we pattern-match the sequence 'add + sub + BLENDI' + // and emit a single ADDSUB instruction. + SDValue Sub = DAG.getNode(ExpectedOpcode, DL, VT, InVec0, InVec1); + SDValue Add = DAG.getNode(NextExpectedOpcode, DL, VT, InVec0, InVec1); + + // Construct the VSELECT mask. + EVT MaskVT = VT.changeVectorElementTypeToInteger(); + EVT SVT = MaskVT.getVectorElementType(); + unsigned SVTBits = SVT.getSizeInBits(); + SmallVector<SDValue, 8> Ops; + + for (unsigned i = 0, e = NumElts; i != e; ++i) { + APInt Value = i & 1 ? APInt::getNullValue(SVTBits) : + APInt::getAllOnesValue(SVTBits); + SDValue Constant = DAG.getConstant(Value, SVT); + Ops.push_back(Constant); + } + + SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, MaskVT, Ops); + return DAG.getSelect(DL, VT, Mask, Sub, Add); + } + + return SDValue(); +} + static SDValue PerformBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget) { SDLoc DL(N); @@ -6197,20 +6351,46 @@ static SDValue PerformBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG, BuildVectorSDNode *BV = cast<BuildVectorSDNode>(N); SDValue InVec0, InVec1; + // Try to match an ADDSUB. + if ((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || + (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) { + SDValue Value = matchAddSub(BV, DAG, Subtarget); + if (Value.getNode()) + return Value; + } + // Try to match horizontal ADD/SUB. + unsigned NumUndefsLO = 0; + unsigned NumUndefsHI = 0; + unsigned Half = NumElts/2; + + // Count the number of UNDEF operands in the input build_vector. + for (unsigned i = 0, e = Half; i != e; ++i) + if (BV->getOperand(i)->getOpcode() == ISD::UNDEF) + NumUndefsLO++; + + for (unsigned i = Half, e = NumElts; i != e; ++i) + if (BV->getOperand(i)->getOpcode() == ISD::UNDEF) + NumUndefsHI++; + + // Early exit if this is either a build_vector of all UNDEFs or all the + // operands but one are UNDEF. + if (NumUndefsLO + NumUndefsHI + 1 >= NumElts) + return SDValue(); + if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget->hasSSE3()) { // Try to match an SSE3 float HADD/HSUB.
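+    // For example, a v4f32 build_vector of the form
+    //   ((fadd (extract_vector_elt A, 0), (extract_vector_elt A, 1)),
+    //    (fadd (extract_vector_elt A, 2), (extract_vector_elt A, 3)),
+    //    (fadd (extract_vector_elt B, 0), (extract_vector_elt B, 1)),
+    //    (fadd (extract_vector_elt B, 2), (extract_vector_elt B, 3)))
+    // is matched as a single (fhadd A, B) node.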
- if (isHorizontalBinOp(BV, ISD::FADD, 0, NumElts, InVec0, InVec1)) + if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1)) return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1); - if (isHorizontalBinOp(BV, ISD::FSUB, 0, NumElts, InVec0, InVec1)) + if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1)) return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1); } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget->hasSSSE3()) { // Try to match an SSSE3 integer HADD/HSUB. - if (isHorizontalBinOp(BV, ISD::ADD, 0, NumElts, InVec0, InVec1)) + if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1)) return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1); - if (isHorizontalBinOp(BV, ISD::SUB, 0, NumElts, InVec0, InVec1)) + if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1)) return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1); } @@ -6221,16 +6401,20 @@ static SDValue PerformBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG, // Try to match an AVX horizontal add/sub of packed single/double // precision floating point values from 256-bit vectors. SDValue InVec2, InVec3; - if (isHorizontalBinOp(BV, ISD::FADD, 0, NumElts/2, InVec0, InVec1) && - isHorizontalBinOp(BV, ISD::FADD, NumElts/2, NumElts, InVec2, InVec3) && - InVec0.getNode() == InVec2.getNode() && - InVec1.getNode() == InVec3.getNode()) + if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) && + isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) && + ((InVec0.getOpcode() == ISD::UNDEF || + InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) && + ((InVec1.getOpcode() == ISD::UNDEF || + InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3)) return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1); - if (isHorizontalBinOp(BV, ISD::FSUB, 0, NumElts/2, InVec0, InVec1) && - isHorizontalBinOp(BV, ISD::FSUB, NumElts/2, NumElts, InVec2, InVec3) && - InVec0.getNode() == InVec2.getNode() && - InVec1.getNode() == InVec3.getNode()) + if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) && + isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) && + ((InVec0.getOpcode() == ISD::UNDEF || + InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) && + ((InVec1.getOpcode() == ISD::UNDEF || + InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3)) return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1); } else if (VT == MVT::v8i32 || VT == MVT::v16i16) { // Try to match an AVX2 horizontal add/sub of signed integers. 
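+    // The matching below mirrors the 128-bit case above, except that each
+    // 128-bit half of the build_vector (indices [0, Half) and
+    // [Half, NumElts)) is matched as a separate horizontal operation, and
+    // both halves must read from the same pair of source vectors (an UNDEF
+    // input may stand in for either source).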
@@ -6238,15 +6422,19 @@ static SDValue PerformBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG, unsigned X86Opcode; bool CanFold = true; - if (isHorizontalBinOp(BV, ISD::ADD, 0, NumElts/2, InVec0, InVec1) && - isHorizontalBinOp(BV, ISD::ADD, NumElts/2, NumElts, InVec2, InVec3) && - InVec0.getNode() == InVec2.getNode() && - InVec1.getNode() == InVec3.getNode()) + if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) && + isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) && + ((InVec0.getOpcode() == ISD::UNDEF || + InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) && + ((InVec1.getOpcode() == ISD::UNDEF || + InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3)) X86Opcode = X86ISD::HADD; - else if (isHorizontalBinOp(BV, ISD::SUB, 0, NumElts/2, InVec0, InVec1) && - isHorizontalBinOp(BV, ISD::SUB, NumElts/2, NumElts, InVec2, InVec3) && - InVec0.getNode() == InVec2.getNode() && - InVec1.getNode() == InVec3.getNode()) + else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) && + isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) && + ((InVec0.getOpcode() == ISD::UNDEF || + InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) && + ((InVec1.getOpcode() == ISD::UNDEF || + InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3)) X86Opcode = X86ISD::HSUB; else CanFold = false; @@ -6257,29 +6445,45 @@ static SDValue PerformBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG, if (Subtarget->hasAVX2()) return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1); + // Do not try to expand this build_vector into a pair of horizontal + // add/sub if we can emit a pair of scalar add/sub. + if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half) + return SDValue(); + // Convert this build_vector into a pair of horizontal binop followed by // a concat vector. - return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false); + bool isUndefLO = NumUndefsLO == Half; + bool isUndefHI = NumUndefsHI == Half; + return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false, + isUndefLO, isUndefHI); } } if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 || VT == MVT::v16i16) && Subtarget->hasAVX()) { unsigned X86Opcode; - if (isHorizontalBinOp(BV, ISD::ADD, 0, NumElts, InVec0, InVec1)) + if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1)) X86Opcode = X86ISD::HADD; - else if (isHorizontalBinOp(BV, ISD::SUB, 0, NumElts, InVec0, InVec1)) + else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1)) X86Opcode = X86ISD::HSUB; - else if (isHorizontalBinOp(BV, ISD::FADD, 0, NumElts, InVec0, InVec1)) + else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1)) X86Opcode = X86ISD::FHADD; - else if (isHorizontalBinOp(BV, ISD::FSUB, 0, NumElts, InVec0, InVec1)) + else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1)) X86Opcode = X86ISD::FHSUB; else return SDValue(); + // Don't try to expand this build_vector into a pair of horizontal add/sub + // if we can simply emit a pair of scalar add/sub. + if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half) + return SDValue(); + // Convert this build_vector into two horizontal add/sub followed by // a concat vector. 
- return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true); + bool isUndefLO = NumUndefsLO == Half; + bool isUndefHI = NumUndefsHI == Half; + return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true, + isUndefLO, isUndefHI); } return SDValue(); @@ -6672,127 +6876,1257 @@ static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { return LowerAVXCONCAT_VECTORS(Op, DAG); } -static bool isBlendMask(ArrayRef MaskVals, MVT VT, bool hasSSE41, - bool hasInt256, unsigned *MaskOut = nullptr) { - MVT EltVT = VT.getVectorElementType(); - // There is no blend with immediate in AVX-512. - if (VT.is512BitVector()) - return false; +//===----------------------------------------------------------------------===// +// Vector shuffle lowering +// +// This is an experimental code path for lowering vector shuffles on x86. It is +// designed to handle arbitrary vector shuffles and blends, gracefully +// degrading performance as necessary. It works hard to recognize idiomatic +// shuffles and lower them to optimal instruction patterns without leaving +// a framework that allows reasonably efficient handling of all vector shuffle +// patterns. +//===----------------------------------------------------------------------===// - if (!hasSSE41 || EltVT == MVT::i8) - return false; - if (!hasInt256 && VT == MVT::v16i16) - return false; +/// \brief Tiny helper function to identify a no-op mask. +/// +/// This is a somewhat boring predicate function. It checks whether the mask +/// array input, which is assumed to be a single-input shuffle mask of the kind +/// used by the X86 shuffle instructions (not a fully general +/// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an +/// in-place shuffle are 'no-op's. +static bool isNoopShuffleMask(ArrayRef Mask) { + for (int i = 0, Size = Mask.size(); i < Size; ++i) + if (Mask[i] != -1 && Mask[i] != i) + return false; + return true; +} - unsigned MaskValue = 0; - unsigned NumElems = VT.getVectorNumElements(); - // There are 2 lanes if (NumElems > 8), and 1 lane otherwise. - unsigned NumLanes = (NumElems - 1) / 8 + 1; - unsigned NumElemsInLane = NumElems / NumLanes; +/// \brief Helper function to classify a mask as a single-input mask. +/// +/// This isn't a generic single-input test because in the vector shuffle +/// lowering we canonicalize single inputs to be the first input operand. This +/// means we can more quickly test for a single input by only checking whether +/// an input from the second operand exists. We also assume that the size of +/// mask corresponds to the size of the input vectors which isn't true in the +/// fully general case. +static bool isSingleInputShuffleMask(ArrayRef Mask) { + for (int M : Mask) + if (M >= (int)Mask.size()) + return false; + return true; +} - // Blend for v16i16 should be symetric for the both lanes. - for (unsigned i = 0; i < NumElemsInLane; ++i) { +/// \brief Get a 4-lane 8-bit shuffle immediate for a mask. +/// +/// This helper function produces an 8-bit shuffle immediate corresponding to +/// the ubiquitous shuffle encoding scheme used in x86 instructions for +/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for +/// example. +/// +/// NB: We rely heavily on "undef" masks preserving the input lane. 
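+///
+/// For example, the mask [3, 1, 2, 0] produces the immediate
+/// (3 << 0) | (1 << 2) | (2 << 4) | (0 << 6) = 0x27.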
+static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, + SelectionDAG &DAG) { + assert(Mask.size() == 4 && "Only 4-lane shuffle masks"); + assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!"); + assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!"); + assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!"); + assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!"); - int SndLaneEltIdx = (NumLanes == 2) ? MaskVals[i + NumElemsInLane] : -1; - int EltIdx = MaskVals[i]; + unsigned Imm = 0; + Imm |= (Mask[0] == -1 ? 0 : Mask[0]) << 0; + Imm |= (Mask[1] == -1 ? 1 : Mask[1]) << 2; + Imm |= (Mask[2] == -1 ? 2 : Mask[2]) << 4; + Imm |= (Mask[3] == -1 ? 3 : Mask[3]) << 6; + return DAG.getConstant(Imm, MVT::i8); +} - if ((EltIdx < 0 || EltIdx == (int)i) && - (SndLaneEltIdx < 0 || SndLaneEltIdx == (int)(i + NumElemsInLane))) - continue; +/// \brief Handle lowering of 2-lane 64-bit floating point shuffles. +/// +/// This is the basis function for the 2-lane 64-bit shuffles as we have full +/// support for floating point shuffles but not integer shuffles. These +/// instructions will incur a domain crossing penalty on some chips though so +/// it is better to avoid lowering through this for integer vectors where +/// possible. +static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(Op.getSimpleValueType() == MVT::v2f64 && "Bad shuffle type!"); + assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!"); - if (((unsigned)EltIdx == (i + NumElems)) && - (SndLaneEltIdx < 0 || - (unsigned)SndLaneEltIdx == i + NumElems + NumElemsInLane)) - MaskValue |= (1 << i); - else - return false; + if (isSingleInputShuffleMask(Mask)) { + // Straight shuffle of a single input vector. Simulate this by using the + // single input as both of the "inputs" to this instruction. + unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1); + return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V1, + DAG.getConstant(SHUFPDMask, MVT::i8)); } + assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!"); + assert(Mask[1] >= 2 && "Non-canonicalized blend!"); - if (MaskOut) - *MaskOut = MaskValue; - return true; + unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1); + return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V2, + DAG.getConstant(SHUFPDMask, MVT::i8)); } -// Try to lower a shuffle node into a simple blend instruction. -// This function assumes isBlendMask returns true for this -// SuffleVectorSDNode -static SDValue LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp, - unsigned MaskValue, - const X86Subtarget *Subtarget, - SelectionDAG &DAG) { - MVT VT = SVOp->getSimpleValueType(0); - MVT EltVT = VT.getVectorElementType(); - assert(isBlendMask(SVOp->getMask(), VT, Subtarget->hasSSE41(), - Subtarget->hasInt256() && "Trying to lower a " - "VECTOR_SHUFFLE to a Blend but " - "with the wrong mask")); - SDValue V1 = SVOp->getOperand(0); - SDValue V2 = SVOp->getOperand(1); - SDLoc dl(SVOp); - unsigned NumElems = VT.getVectorNumElements(); +/// \brief Handle lowering of 2-lane 64-bit integer shuffles.
+/// +/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by +/// the integer unit to minimize domain crossing penalties. However, for blends +/// it falls back to the floating point shuffle operation with appropriate bit +/// casting. +static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(Op.getSimpleValueType() == MVT::v2i64 && "Bad shuffle type!"); + assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast(Op); + ArrayRef Mask = SVOp->getMask(); + assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!"); + + if (isSingleInputShuffleMask(Mask)) { + // Straight shuffle of a single input vector. For everything from SSE2 + // onward this has a single fast instruction with no scary immediates. + // We have to map the mask as it is actually a v4i32 shuffle instruction. + V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V1); + int WidenedMask[4] = { + std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1, + std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1}; + return DAG.getNode( + ISD::BITCAST, DL, MVT::v2i64, + DAG.getNode(X86ISD::PSHUFD, SDLoc(Op), MVT::v4i32, V1, + getV4X86ShuffleImm8ForMask(WidenedMask, DAG))); + } + + // We implement this with SHUFPD which is pretty lame because it will likely + // incur 2 cycles of stall for integer vectors on Nehalem and older chips. + // However, all the alternatives are still more cycles and newer chips don't + // have this problem. It would be really nice if x86 had better shuffles here. + V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, V1); + V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, V2); + return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, + DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask)); +} + +/// \brief Lower 4-lane 32-bit floating point shuffles. +/// +/// Uses instructions exclusively from the floating point unit to minimize +/// domain crossing penalties, as these are sufficient to implement all v4f32 +/// shuffles. +static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!"); + assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast(Op); + ArrayRef Mask = SVOp->getMask(); + assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); + + SDValue LowV = V1, HighV = V2; + int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]}; + + int NumV2Elements = + std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }); + + if (NumV2Elements == 0) + // Straight shuffle of a single input vector. We pass the input vector to + // both operands to simulate this with a SHUFPS. + return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1, + getV4X86ShuffleImm8ForMask(Mask, DAG)); + + if (NumV2Elements == 1) { + int V2Index = + std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) - + Mask.begin(); + // Compute the index adjacent to V2Index and in the same half by toggling + // the low bit. + int V2AdjIndex = V2Index ^ 1; + + if (Mask[V2AdjIndex] == -1) { + // Handles all the cases where we have a single V2 element and an undef. 
+ // This will only ever happen in the high lanes because we commute the + // vector otherwise. + if (V2Index < 2) + std::swap(LowV, HighV); + NewMask[V2Index] -= 4; + } else { + // Handle the case where the V2 element ends up adjacent to a V1 element. + // To make this work, blend them together as the first step. + int V1Index = V2AdjIndex; + int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0}; + V2 = DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V2, V1, + getV4X86ShuffleImm8ForMask(BlendMask, DAG)); + + // Now proceed to reconstruct the final blend as we have the necessary + // high or low half formed. + if (V2Index < 2) { + LowV = V2; + HighV = V1; + } else { + HighV = V2; + } + NewMask[V1Index] = 2; // We put the V1 element in V2[2]. + NewMask[V2Index] = 0; // We shifted the V2 element into V2[0]. + } + } else if (NumV2Elements == 2) { + if (Mask[0] < 4 && Mask[1] < 4) { + // Handle the easy case where we have V1 in the low lanes and V2 in the + // high lanes. We never see this reversed because we sort the shuffle. + NewMask[2] -= 4; + NewMask[3] -= 4; + } else { + // We have a mixture of V1 and V2 in both low and high lanes. Rather than + // trying to place elements directly, just blend them and set up the final + // shuffle to place them. - // Convert i32 vectors to floating point if it is not AVX2. - // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors. - MVT BlendVT = VT; - if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) { - BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()), - NumElems); - V1 = DAG.getNode(ISD::BITCAST, dl, VT, V1); - V2 = DAG.getNode(ISD::BITCAST, dl, VT, V2); - } + // The first two blend mask elements are for V1, the second two are for + // V2. + int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1], + Mask[2] < 4 ? Mask[2] : Mask[3], + (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4, + (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4}; + V1 = DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V2, + getV4X86ShuffleImm8ForMask(BlendMask, DAG)); - SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, V1, V2, - DAG.getConstant(MaskValue, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Ret); + // Now we do a normal shuffle of V1 by giving V1 as both operands to + // a blend. + LowV = HighV = V1; + NewMask[0] = Mask[0] < 4 ? 0 : 2; + NewMask[1] = Mask[0] < 4 ? 2 : 0; + NewMask[2] = Mask[2] < 4 ? 1 : 3; + NewMask[3] = Mask[2] < 4 ? 3 : 1; + } + } + return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, LowV, HighV, + getV4X86ShuffleImm8ForMask(NewMask, DAG)); } -/// In vector type \p VT, return true if the element at index \p InputIdx -/// falls on a different 128-bit lane than \p OutputIdx. -static bool ShuffleCrosses128bitLane(MVT VT, unsigned InputIdx, - unsigned OutputIdx) { - unsigned EltSize = VT.getVectorElementType().getSizeInBits(); - return InputIdx * EltSize / 128 != OutputIdx * EltSize / 128; -} +/// \brief Lower 4-lane i32 vector shuffles. +/// +/// We try to handle these with integer-domain shuffles where we can, but for +/// blends we use the floating point domain blend instructions. 
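+///
+/// For example, a single-input mask becomes one PSHUFD, while a two-input
+/// blend is bitcast to v4f32 and lowered through the SHUFPS-based v4f32 path
+/// so that both the blend and any setup shuffles stay in the floating point
+/// domain.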
+static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(Op.getSimpleValueType() == MVT::v4i32 && "Bad shuffle type!"); + assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); + + if (isSingleInputShuffleMask(Mask)) + // Straight shuffle of a single input vector. For everything from SSE2 + // onward this has a single fast instruction with no scary immediates. + return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1, + getV4X86ShuffleImm8ForMask(Mask, DAG)); + + // We implement this with SHUFPS because it can blend from two vectors. + // Because we're going to eventually use SHUFPS, we use SHUFPS even to build + // up the inputs, bypassing domain shift penalties that we would incur if we + // directly used PSHUFD on Nehalem and older. For newer chips, this isn't + // relevant. + return DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, + DAG.getVectorShuffle( + MVT::v4f32, DL, + DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, V1), + DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, V2), Mask)); +} + +/// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2 +/// shuffle lowering, and the most complex part. +/// +/// The lowering strategy is to try to form pairs of input lanes which are +/// targeted at the same half of the final vector, and then use a dword shuffle +/// to place them onto the right half, and finally unpack the paired lanes into +/// their final position. +/// +/// The exact breakdown of how to form these dword pairs and align them on the +/// correct sides is really tricky. See the comments within the function for +/// more of the details. +static SDValue lowerV8I16SingleInputVectorShuffle( + SDLoc DL, SDValue V, MutableArrayRef<int> Mask, + const X86Subtarget *Subtarget, SelectionDAG &DAG) { + assert(V.getSimpleValueType() == MVT::v8i16 && "Bad input type!"); + MutableArrayRef<int> LoMask = Mask.slice(0, 4); + MutableArrayRef<int> HiMask = Mask.slice(4, 4); + + SmallVector<int, 4> LoInputs; + std::copy_if(LoMask.begin(), LoMask.end(), std::back_inserter(LoInputs), + [](int M) { return M >= 0; }); + std::sort(LoInputs.begin(), LoInputs.end()); + LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end()); + SmallVector<int, 4> HiInputs; + std::copy_if(HiMask.begin(), HiMask.end(), std::back_inserter(HiInputs), + [](int M) { return M >= 0; }); + std::sort(HiInputs.begin(), HiInputs.end()); + HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end()); + int NumLToL = + std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin(); + int NumHToL = LoInputs.size() - NumLToL; + int NumLToH = + std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin(); + int NumHToH = HiInputs.size() - NumLToH; + MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL); + MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH); + MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL); + MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH); + + // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all + // such inputs we can swap two of the dwords across the half mark and end up + // with <=2 inputs to each half in each half. Once there, we can fall through + // to the generic code below.
For example: + // + // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h] + // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5] + // + // Before we had 3-1 in the low half and 3-1 in the high half. Afterward, 2-2 + // and 2-2. + auto balanceSides = [&](ArrayRef ThreeInputs, int OneInput, + int ThreeInputHalfSum, int OneInputHalfOffset) { + // Compute the index of dword with only one word among the three inputs in + // a half by taking the sum of the half with three inputs and subtracting + // the sum of the actual three inputs. The difference is the remaining + // slot. + int DWordA = (ThreeInputHalfSum - + std::accumulate(ThreeInputs.begin(), ThreeInputs.end(), 0)) / + 2; + int DWordB = OneInputHalfOffset / 2 + (OneInput / 2 + 1) % 2; + + int PSHUFDMask[] = {0, 1, 2, 3}; + PSHUFDMask[DWordA] = DWordB; + PSHUFDMask[DWordB] = DWordA; + V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, + DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, + DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V), + getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG))); + + // Adjust the mask to match the new locations of A and B. + for (int &M : Mask) + if (M != -1 && M/2 == DWordA) + M = 2 * DWordB + M % 2; + else if (M != -1 && M/2 == DWordB) + M = 2 * DWordA + M % 2; + + // Recurse back into this routine to re-compute state now that this isn't + // a 3 and 1 problem. + return DAG.getVectorShuffle(MVT::v8i16, DL, V, DAG.getUNDEF(MVT::v8i16), + Mask); + }; + if (NumLToL == 3 && NumHToL == 1) + return balanceSides(LToLInputs, HToLInputs[0], 0 + 1 + 2 + 3, 4); + else if (NumLToL == 1 && NumHToL == 3) + return balanceSides(HToLInputs, LToLInputs[0], 4 + 5 + 6 + 7, 0); + else if (NumLToH == 1 && NumHToH == 3) + return balanceSides(HToHInputs, LToHInputs[0], 4 + 5 + 6 + 7, 0); + else if (NumLToH == 3 && NumHToH == 1) + return balanceSides(LToHInputs, HToHInputs[0], 0 + 1 + 2 + 3, 4); + + // At this point there are at most two inputs to the low and high halves from + // each half. That means the inputs can always be grouped into dwords and + // those dwords can then be moved to the correct half with a dword shuffle. + // We use at most one low and one high word shuffle to collect these paired + // inputs into dwords, and finally a dword shuffle to place them. + int PSHUFLMask[4] = {-1, -1, -1, -1}; + int PSHUFHMask[4] = {-1, -1, -1, -1}; + int PSHUFDMask[4] = {-1, -1, -1, -1}; + + // First fix the masks for all the inputs that are staying in their + // original halves. This will then dictate the targets of the cross-half + // shuffles. + auto fixInPlaceInputs = [&PSHUFDMask]( + ArrayRef InPlaceInputs, MutableArrayRef SourceHalfMask, + MutableArrayRef HalfMask, int HalfOffset) { + if (InPlaceInputs.empty()) + return; + if (InPlaceInputs.size() == 1) { + SourceHalfMask[InPlaceInputs[0] - HalfOffset] = + InPlaceInputs[0] - HalfOffset; + PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2; + return; + } -/// Generate a PSHUFB if possible. Selects elements from \p V1 according to -/// \p MaskVals. MaskVals[OutputIdx] = InputIdx specifies that we want to -/// shuffle the element at InputIdx in V1 to OutputIdx in the result. If \p -/// MaskVals refers to elements outside of \p V1 or is undef (-1), insert a -/// zero. 
-static SDValue getPSHUFB(ArrayRef MaskVals, SDValue V1, SDLoc &dl, - SelectionDAG &DAG) { - MVT VT = V1.getSimpleValueType(); - assert(VT.is128BitVector() || VT.is256BitVector()); + assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!"); + SourceHalfMask[InPlaceInputs[0] - HalfOffset] = + InPlaceInputs[0] - HalfOffset; + // Put the second input next to the first so that they are packed into + // a dword. We find the adjacent index by toggling the low bit. + int AdjIndex = InPlaceInputs[0] ^ 1; + SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset; + std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex); + PSHUFDMask[AdjIndex / 2] = AdjIndex / 2; + }; + if (!HToLInputs.empty()) + fixInPlaceInputs(LToLInputs, PSHUFLMask, LoMask, 0); + if (!LToHInputs.empty()) + fixInPlaceInputs(HToHInputs, PSHUFHMask, HiMask, 4); + + // Now gather the cross-half inputs and place them into a free dword of + // their target half. + // FIXME: This operation could almost certainly be simplified dramatically to + // look more like the 3-1 fixing operation. + auto moveInputsToRightHalf = [&PSHUFDMask]( + MutableArrayRef IncomingInputs, ArrayRef ExistingInputs, + MutableArrayRef SourceHalfMask, MutableArrayRef HalfMask, + int SourceOffset, int DestOffset) { + auto isWordClobbered = [](ArrayRef SourceHalfMask, int Word) { + return SourceHalfMask[Word] != -1 && SourceHalfMask[Word] != Word; + }; + auto isDWordClobbered = [&isWordClobbered](ArrayRef SourceHalfMask, + int Word) { + int LowWord = Word & ~1; + int HighWord = Word | 1; + return isWordClobbered(SourceHalfMask, LowWord) || + isWordClobbered(SourceHalfMask, HighWord); + }; - MVT EltVT = VT.getVectorElementType(); - unsigned EltSizeInBytes = EltVT.getSizeInBits() / 8; - unsigned NumElts = VT.getVectorNumElements(); + if (IncomingInputs.empty()) + return; - SmallVector PshufbMask; - for (unsigned OutputIdx = 0; OutputIdx < NumElts; ++OutputIdx) { - int InputIdx = MaskVals[OutputIdx]; - unsigned InputByteIdx; + if (ExistingInputs.empty()) { + // Map any dwords with inputs from them into the right half. + for (int Input : IncomingInputs) { + // If the source half mask maps over the inputs, turn those into + // swaps and use the swapped lane. + if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) { + if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] == -1) { + SourceHalfMask[SourceHalfMask[Input - SourceOffset]] = + Input - SourceOffset; + // We have to swap the uses in our half mask in one sweep. + for (int &M : HalfMask) + if (M == SourceHalfMask[Input - SourceOffset]) + M = Input; + else if (M == Input) + M = SourceHalfMask[Input - SourceOffset] + SourceOffset; + } else { + assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] == + Input - SourceOffset && + "Previous placement doesn't match!"); + } + // Note that this correctly re-maps both when we do a swap and when + // we observe the other side of the swap above. We rely on that to + // avoid swapping the members of the input list directly. + Input = SourceHalfMask[Input - SourceOffset] + SourceOffset; + } - if (InputIdx < 0 || NumElts <= (unsigned)InputIdx) - InputByteIdx = 0x80; - else { - // Cross lane is not allowed. - if (ShuffleCrosses128bitLane(VT, InputIdx, OutputIdx)) - return SDValue(); - InputByteIdx = InputIdx * EltSizeInBytes; - // Index is an byte offset within the 128-bit lane. - InputByteIdx &= 0xf; + // Map the input's dword into the correct half. 
+ if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] == -1) + PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2; + else + assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] == + Input / 2 && + "Previous placement doesn't match!"); + } + + // And just directly shift any other-half mask elements to be same-half + // as we will have mirrored the dword containing the element into the + // same position within that half. + for (int &M : HalfMask) + if (M >= SourceOffset && M < SourceOffset + 4) { + M = M - SourceOffset + DestOffset; + assert(M >= 0 && "This should never wrap below zero!"); + } + return; } - for (unsigned j = 0; j < EltSizeInBytes; ++j) { - PshufbMask.push_back(DAG.getConstant(InputByteIdx, MVT::i8)); - if (InputByteIdx != 0x80) - ++InputByteIdx; + // Ensure we have the input in a viable dword of its current half. This + // is particularly tricky because the original position may be clobbered + // by inputs being moved and *staying* in that half. + if (IncomingInputs.size() == 1) { + if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) { + int InputFixed = std::find(std::begin(SourceHalfMask), + std::end(SourceHalfMask), -1) - + std::begin(SourceHalfMask) + SourceOffset; + SourceHalfMask[InputFixed - SourceOffset] = + IncomingInputs[0] - SourceOffset; + std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0], + InputFixed); + IncomingInputs[0] = InputFixed; + } + } else if (IncomingInputs.size() == 2) { + if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 || + isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) { + int SourceDWordBase = !isDWordClobbered(SourceHalfMask, 0) ? 0 : 2; + assert(!isDWordClobbered(SourceHalfMask, SourceDWordBase) && + "Not all dwords can be clobbered!"); + SourceHalfMask[SourceDWordBase] = IncomingInputs[0] - SourceOffset; + SourceHalfMask[SourceDWordBase + 1] = IncomingInputs[1] - SourceOffset; + for (int &M : HalfMask) + if (M == IncomingInputs[0]) + M = SourceDWordBase + SourceOffset; + else if (M == IncomingInputs[1]) + M = SourceDWordBase + 1 + SourceOffset; + IncomingInputs[0] = SourceDWordBase + SourceOffset; + IncomingInputs[1] = SourceDWordBase + 1 + SourceOffset; + } + } else { + llvm_unreachable("Unhandled input size!"); } - } - MVT ShufVT = MVT::getVectorVT(MVT::i8, PshufbMask.size()); + // Now hoist the DWord down to the right half. + int FreeDWord = (PSHUFDMask[DestOffset / 2] == -1 ? 0 : 1) + DestOffset / 2; + assert(PSHUFDMask[FreeDWord] == -1 && "DWord not free"); + PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2; + for (int Input : IncomingInputs) + std::replace(HalfMask.begin(), HalfMask.end(), Input, + FreeDWord * 2 + Input % 2); + }; + moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, + /*SourceOffset*/ 4, /*DestOffset*/ 0); + moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, + /*SourceOffset*/ 0, /*DestOffset*/ 4); + + // Now enact all the shuffles we've computed to move the inputs into their + // target half. 
+ if (!isNoopShuffleMask(PSHUFLMask)) + V = DAG.getNode(X86ISD::PSHUFLW, DL, MVT::v8i16, V, + getV4X86ShuffleImm8ForMask(PSHUFLMask, DAG)); + if (!isNoopShuffleMask(PSHUFHMask)) + V = DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, V, + getV4X86ShuffleImm8ForMask(PSHUFHMask, DAG)); + if (!isNoopShuffleMask(PSHUFDMask)) + V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, + DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, + DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V), + getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG))); + + // At this point, each half should contain all its inputs, and we can then + // just shuffle them into their final position. + assert(std::count_if(LoMask.begin(), LoMask.end(), + [](int M) { return M >= 4; }) == 0 && + "Failed to lift all the high half inputs to the low mask!"); + assert(std::count_if(HiMask.begin(), HiMask.end(), + [](int M) { return M >= 0 && M < 4; }) == 0 && + "Failed to lift all the low half inputs to the high mask!"); + + // Do a half shuffle for the low mask. + if (!isNoopShuffleMask(LoMask)) + V = DAG.getNode(X86ISD::PSHUFLW, DL, MVT::v8i16, V, + getV4X86ShuffleImm8ForMask(LoMask, DAG)); + + // Do a half shuffle with the high mask after shifting its values down. + for (int &M : HiMask) + if (M >= 0) + M -= 4; + if (!isNoopShuffleMask(HiMask)) + V = DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, V, + getV4X86ShuffleImm8ForMask(HiMask, DAG)); + + return V; +} + +/// \brief Detect whether the mask pattern should be lowered through +/// interleaving. +/// +/// This essentially tests whether viewing the mask as an interleaving of two +/// sub-sequences reduces the cross-input traffic of a blend operation. If so, +/// lowering it through interleaving is a significantly better strategy. +static bool shouldLowerAsInterleaving(ArrayRef Mask) { + int NumEvenInputs[2] = {0, 0}; + int NumOddInputs[2] = {0, 0}; + int NumLoInputs[2] = {0, 0}; + int NumHiInputs[2] = {0, 0}; + for (int i = 0, Size = Mask.size(); i < Size; ++i) { + if (Mask[i] < 0) + continue; + + int InputIdx = Mask[i] >= Size; + + if (i < Size / 2) + ++NumLoInputs[InputIdx]; + else + ++NumHiInputs[InputIdx]; + + if ((i % 2) == 0) + ++NumEvenInputs[InputIdx]; + else + ++NumOddInputs[InputIdx]; + } + + // The minimum number of cross-input results for both the interleaved and + // split cases. If interleaving results in fewer cross-input results, return + // true. + int InterleavedCrosses = std::min(NumEvenInputs[1] + NumOddInputs[0], + NumEvenInputs[0] + NumOddInputs[1]); + int SplitCrosses = std::min(NumLoInputs[1] + NumHiInputs[0], + NumLoInputs[0] + NumHiInputs[1]); + return InterleavedCrosses < SplitCrosses; +} + +/// \brief Blend two v8i16 vectors using a naive unpack strategy. +/// +/// This strategy only works when the inputs from each vector fit into a single +/// half of that vector, and generally there are not so many inputs as to leave +/// the in-place shuffles required highly constrained (and thus expensive). It +/// shifts all the inputs into a single side of both input vectors and then +/// uses an unpack to interleave these inputs in a single vector. At that +/// point, we will fall back on the generic single input shuffle lowering. 
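+///
+/// For example, when every used input already sits in the low half of V1 and
+/// V2, the used words are packed into the low dwords of each input, a single
+/// UNPCKL interleaves the two inputs, and the remaining work is a
+/// single-input v8i16 shuffle of the merged vector.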
+static SDValue lowerV8I16BasicBlendVectorShuffle(SDLoc DL, SDValue V1, + SDValue V2, + MutableArrayRef Mask, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad input type!"); + assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad input type!"); + SmallVector LoV1Inputs, HiV1Inputs, LoV2Inputs, HiV2Inputs; + for (int i = 0; i < 8; ++i) + if (Mask[i] >= 0 && Mask[i] < 4) + LoV1Inputs.push_back(i); + else if (Mask[i] >= 4 && Mask[i] < 8) + HiV1Inputs.push_back(i); + else if (Mask[i] >= 8 && Mask[i] < 12) + LoV2Inputs.push_back(i); + else if (Mask[i] >= 12) + HiV2Inputs.push_back(i); + + int NumV1Inputs = LoV1Inputs.size() + HiV1Inputs.size(); + int NumV2Inputs = LoV2Inputs.size() + HiV2Inputs.size(); + (void)NumV1Inputs; + (void)NumV2Inputs; + assert(NumV1Inputs > 0 && NumV1Inputs <= 3 && "At most 3 inputs supported"); + assert(NumV2Inputs > 0 && NumV2Inputs <= 3 && "At most 3 inputs supported"); + assert(NumV1Inputs + NumV2Inputs <= 4 && "At most 4 combined inputs"); + + bool MergeFromLo = LoV1Inputs.size() + LoV2Inputs.size() >= + HiV1Inputs.size() + HiV2Inputs.size(); + + auto moveInputsToHalf = [&](SDValue V, ArrayRef LoInputs, + ArrayRef HiInputs, bool MoveToLo, + int MaskOffset) { + ArrayRef GoodInputs = MoveToLo ? LoInputs : HiInputs; + ArrayRef BadInputs = MoveToLo ? HiInputs : LoInputs; + if (BadInputs.empty()) + return V; + + int MoveMask[] = {-1, -1, -1, -1, -1, -1, -1, -1}; + int MoveOffset = MoveToLo ? 0 : 4; + + if (GoodInputs.empty()) { + for (int BadInput : BadInputs) { + MoveMask[Mask[BadInput] % 4 + MoveOffset] = Mask[BadInput] - MaskOffset; + Mask[BadInput] = Mask[BadInput] % 4 + MoveOffset + MaskOffset; + } + } else { + if (GoodInputs.size() == 2) { + // If the low inputs are spread across two dwords, pack them into + // a single dword. + MoveMask[Mask[GoodInputs[0]] % 2 + MoveOffset] = + Mask[GoodInputs[0]] - MaskOffset; + MoveMask[Mask[GoodInputs[1]] % 2 + MoveOffset] = + Mask[GoodInputs[1]] - MaskOffset; + Mask[GoodInputs[0]] = Mask[GoodInputs[0]] % 2 + MoveOffset + MaskOffset; + Mask[GoodInputs[1]] = Mask[GoodInputs[0]] % 2 + MoveOffset + MaskOffset; + } else { + // Otherwise pin the low inputs. 
+ for (int GoodInput : GoodInputs) + MoveMask[Mask[GoodInput] - MaskOffset] = Mask[GoodInput] - MaskOffset; + } + + int MoveMaskIdx = + std::find(std::begin(MoveMask) + MoveOffset, std::end(MoveMask), -1) - + std::begin(MoveMask); + assert(MoveMaskIdx >= MoveOffset && "Established above"); + + if (BadInputs.size() == 2) { + assert(MoveMask[MoveMaskIdx] == -1 && "Expected empty slot"); + assert(MoveMask[MoveMaskIdx + 1] == -1 && "Expected empty slot"); + MoveMask[MoveMaskIdx + Mask[BadInputs[0]] % 2] = + Mask[BadInputs[0]] - MaskOffset; + MoveMask[MoveMaskIdx + Mask[BadInputs[1]] % 2] = + Mask[BadInputs[1]] - MaskOffset; + Mask[BadInputs[0]] = MoveMaskIdx + Mask[BadInputs[0]] % 2 + MaskOffset; + Mask[BadInputs[1]] = MoveMaskIdx + Mask[BadInputs[1]] % 2 + MaskOffset; + } else { + assert(BadInputs.size() == 1 && "All sizes handled"); + MoveMask[MoveMaskIdx] = Mask[BadInputs[0]] - MaskOffset; + Mask[BadInputs[0]] = MoveMaskIdx + MaskOffset; + } + } + + return DAG.getVectorShuffle(MVT::v8i16, DL, V, DAG.getUNDEF(MVT::v8i16), + MoveMask); + }; + V1 = moveInputsToHalf(V1, LoV1Inputs, HiV1Inputs, MergeFromLo, + /*MaskOffset*/ 0); + V2 = moveInputsToHalf(V2, LoV2Inputs, HiV2Inputs, MergeFromLo, + /*MaskOffset*/ 8); + + // FIXME: Select an interleaving of the merge of V1 and V2 that minimizes + // cross-half traffic in the final shuffle. + + // Munge the mask to be a single-input mask after the unpack merges the + // results. + for (int &M : Mask) + if (M != -1) + M = 2 * (M % 4) + (M / 8); + + return DAG.getVectorShuffle( + MVT::v8i16, DL, DAG.getNode(MergeFromLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, + DL, MVT::v8i16, V1, V2), + DAG.getUNDEF(MVT::v8i16), Mask); +} + +/// \brief Generic lowering of 8-lane i16 shuffles. +/// +/// This handles both single-input shuffles and combined shuffle/blends with +/// two inputs. The single input shuffles are immediately delegated to +/// a dedicated lowering routine. +/// +/// The blends are lowered in one of three fundamental ways. If there are few +/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle +/// of the input is significantly cheaper when lowered as an interleaving of +/// the two inputs, try to interleave them. Otherwise, blend the low and high +/// halves of the inputs separately (making them have relatively few inputs) +/// and then concatenate them. 
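+///
+/// For example, in the interleaving strategy the even-indexed and odd-indexed
+/// mask elements are lowered as two separate half-populated shuffles whose
+/// results are then re-interleaved with a single UNPCKL.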
+static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(Op.getSimpleValueType() == MVT::v8i16 && "Bad shuffle type!"); + assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast(Op); + ArrayRef OrigMask = SVOp->getMask(); + int MaskStorage[8] = {OrigMask[0], OrigMask[1], OrigMask[2], OrigMask[3], + OrigMask[4], OrigMask[5], OrigMask[6], OrigMask[7]}; + MutableArrayRef Mask(MaskStorage); + + assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); + + auto isV1 = [](int M) { return M >= 0 && M < 8; }; + auto isV2 = [](int M) { return M >= 8; }; + + int NumV1Inputs = std::count_if(Mask.begin(), Mask.end(), isV1); + int NumV2Inputs = std::count_if(Mask.begin(), Mask.end(), isV2); + + if (NumV2Inputs == 0) + return lowerV8I16SingleInputVectorShuffle(DL, V1, Mask, Subtarget, DAG); + + assert(NumV1Inputs > 0 && "All single-input shuffles should be canonicalized " + "to be V1-input shuffles."); + + if (NumV1Inputs + NumV2Inputs <= 4) + return lowerV8I16BasicBlendVectorShuffle(DL, V1, V2, Mask, Subtarget, DAG); + + // Check whether an interleaving lowering is likely to be more efficient. + // This isn't perfect but it is a strong heuristic that tends to work well on + // the kinds of shuffles that show up in practice. + // + // FIXME: Handle 1x, 2x, and 4x interleaving. + if (shouldLowerAsInterleaving(Mask)) { + // FIXME: Figure out whether we should pack these into the low or high + // halves. + + int EMask[8], OMask[8]; + for (int i = 0; i < 4; ++i) { + EMask[i] = Mask[2*i]; + OMask[i] = Mask[2*i + 1]; + EMask[i + 4] = -1; + OMask[i + 4] = -1; + } + + SDValue Evens = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, EMask); + SDValue Odds = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, OMask); + + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, Evens, Odds); + } + + int LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; + int HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; + + for (int i = 0; i < 4; ++i) { + LoBlendMask[i] = Mask[i]; + HiBlendMask[i] = Mask[i + 4]; + } + + SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, LoBlendMask); + SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, HiBlendMask); + LoV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, LoV); + HiV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, HiV); + + return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, + DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, LoV, HiV)); +} + +/// \brief Generic lowering of v16i8 shuffles. +/// +/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to +/// detect any complexity reducing interleaving. If that doesn't help, it uses +/// UNPCK to spread the i8 elements across two i16-element vectors, and uses +/// the existing lowering for v8i16 blends on each half, finally PACK-ing them +/// back together. 
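+///
+/// For example, in the final fallback each input is unpacked against a zero
+/// vector (UNPCKL/UNPCKH) to widen its bytes to words, the low and high
+/// halves are blended as v8i16 shuffles, and a PACKUS narrows the two
+/// results back to a single v16i8 vector.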
+static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(Op.getSimpleValueType() == MVT::v16i8 && "Bad shuffle type!"); + assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast(Op); + ArrayRef OrigMask = SVOp->getMask(); + assert(OrigMask.size() == 16 && "Unexpected mask size for v16 shuffle!"); + int MaskStorage[16] = { + OrigMask[0], OrigMask[1], OrigMask[2], OrigMask[3], + OrigMask[4], OrigMask[5], OrigMask[6], OrigMask[7], + OrigMask[8], OrigMask[9], OrigMask[10], OrigMask[11], + OrigMask[12], OrigMask[13], OrigMask[14], OrigMask[15]}; + MutableArrayRef Mask(MaskStorage); + MutableArrayRef LoMask = Mask.slice(0, 8); + MutableArrayRef HiMask = Mask.slice(8, 8); + + // For single-input shuffles, there are some nicer lowering tricks we can use. + if (isSingleInputShuffleMask(Mask)) { + // Check whether we can widen this to an i16 shuffle by duplicating bytes. + // Notably, this handles splat and partial-splat shuffles more efficiently. + // However, it only makes sense if the pre-duplication shuffle simplifies + // things significantly. Currently, this means we need to be able to + // express the pre-duplication shuffle as an i16 shuffle. + // + // FIXME: We should check for other patterns which can be widened into an + // i16 shuffle as well. + auto canWidenViaDuplication = [](ArrayRef Mask) { + for (int i = 0; i < 16; i += 2) { + if (Mask[i] != Mask[i + 1]) + return false; + } + return true; + }; + auto tryToWidenViaDuplication = [&]() -> SDValue { + if (!canWidenViaDuplication(Mask)) + return SDValue(); + SmallVector LoInputs; + std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(LoInputs), + [](int M) { return M >= 0 && M < 8; }); + std::sort(LoInputs.begin(), LoInputs.end()); + LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), + LoInputs.end()); + SmallVector HiInputs; + std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(HiInputs), + [](int M) { return M >= 8; }); + std::sort(HiInputs.begin(), HiInputs.end()); + HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), + HiInputs.end()); + + bool TargetLo = LoInputs.size() >= HiInputs.size(); + ArrayRef InPlaceInputs = TargetLo ? LoInputs : HiInputs; + ArrayRef MovingInputs = TargetLo ? HiInputs : LoInputs; + + int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1}; + SmallDenseMap LaneMap; + for (int I : InPlaceInputs) { + PreDupI16Shuffle[I/2] = I/2; + LaneMap[I] = I; + } + int j = TargetLo ? 0 : 4, je = j + 4; + for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) { + // Check if j is already a shuffle of this input. This happens when + // there are two adjacent bytes after we move the low one. + if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) { + // If we haven't yet mapped the input, search for a slot into which + // we can map it. + while (j < je && PreDupI16Shuffle[j] != -1) + ++j; + + if (j == je) + // We can't place the inputs into a single half with a simple i16 shuffle, so bail. + return SDValue(); + + // Map this input with the i16 shuffle. + PreDupI16Shuffle[j] = MovingInputs[i] / 2; + } + + // Update the lane map based on the mapping we ended up with. 
+ LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2; + } + V1 = DAG.getNode( + ISD::BITCAST, DL, MVT::v16i8, + DAG.getVectorShuffle(MVT::v8i16, DL, + DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1), + DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle)); + + // Unpack the bytes to form the i16s that will be shuffled into place. + V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL, + MVT::v16i8, V1, V1); + + int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; + for (int i = 0; i < 16; i += 2) { + if (Mask[i] != -1) + PostDupI16Shuffle[i / 2] = LaneMap[Mask[i]] - (TargetLo ? 0 : 8); + assert(PostDupI16Shuffle[i / 2] < 8 && "Invalid v8 shuffle mask!"); + } + return DAG.getNode( + ISD::BITCAST, DL, MVT::v16i8, + DAG.getVectorShuffle(MVT::v8i16, DL, + DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1), + DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle)); + }; + if (SDValue V = tryToWidenViaDuplication()) + return V; + } + + // Check whether an interleaving lowering is likely to be more efficient. + // This isn't perfect but it is a strong heuristic that tends to work well on + // the kinds of shuffles that show up in practice. + // + // FIXME: We need to handle other interleaving widths (i16, i32, ...). + if (shouldLowerAsInterleaving(Mask)) { + // FIXME: Figure out whether we should pack these into the low or high + // halves. + + int EMask[16], OMask[16]; + for (int i = 0; i < 8; ++i) { + EMask[i] = Mask[2*i]; + OMask[i] = Mask[2*i + 1]; + EMask[i + 8] = -1; + OMask[i + 8] = -1; + } + + SDValue Evens = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, EMask); + SDValue Odds = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, OMask); + + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, Evens, Odds); + } + + int V1LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; + int V1HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; + int V2LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; + int V2HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; + + auto buildBlendMasks = [](MutableArrayRef HalfMask, + MutableArrayRef V1HalfBlendMask, + MutableArrayRef V2HalfBlendMask) { + for (int i = 0; i < 8; ++i) + if (HalfMask[i] >= 0 && HalfMask[i] < 16) { + V1HalfBlendMask[i] = HalfMask[i]; + HalfMask[i] = i; + } else if (HalfMask[i] >= 16) { + V2HalfBlendMask[i] = HalfMask[i] - 16; + HalfMask[i] = i + 8; + } + }; + buildBlendMasks(LoMask, V1LoBlendMask, V2LoBlendMask); + buildBlendMasks(HiMask, V1HiBlendMask, V2HiBlendMask); + + SDValue Zero = getZeroVector(MVT::v8i16, Subtarget, DAG, DL); + + auto buildLoAndHiV8s = [&](SDValue V, MutableArrayRef LoBlendMask, + MutableArrayRef HiBlendMask) { + SDValue V1, V2; + // Check if any of the odd lanes in the v16i8 are used. If not, we can mask + // them out and avoid using UNPCK{L,H} to extract the elements of V as + // i16s. + if (std::none_of(LoBlendMask.begin(), LoBlendMask.end(), + [](int M) { return M >= 0 && M % 2 == 1; }) && + std::none_of(HiBlendMask.begin(), HiBlendMask.end(), + [](int M) { return M >= 0 && M % 2 == 1; })) { + // Use a mask to drop the high bytes. + V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V); + V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, V1, + DAG.getConstant(0x00FF, MVT::v8i16)); + + // This will be a single vector shuffle instead of a blend so nuke V2. + V2 = DAG.getUNDEF(MVT::v8i16); + + // Squash the masks to point directly into V1. 
+ for (int &M : LoBlendMask) + if (M >= 0) + M /= 2; + for (int &M : HiBlendMask) + if (M >= 0) + M /= 2; + } else { + // Otherwise just unpack the low half of V into V1 and the high half into + // V2 so that we can blend them as i16s. + V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, + DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero)); + V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, + DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero)); + } + + SDValue BlendedLo = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, LoBlendMask); + SDValue BlendedHi = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, HiBlendMask); + return std::make_pair(BlendedLo, BlendedHi); + }; + SDValue V1Lo, V1Hi, V2Lo, V2Hi; + std::tie(V1Lo, V1Hi) = buildLoAndHiV8s(V1, V1LoBlendMask, V1HiBlendMask); + std::tie(V2Lo, V2Hi) = buildLoAndHiV8s(V2, V2LoBlendMask, V2HiBlendMask); + + SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, V1Lo, V2Lo, LoMask); + SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, V1Hi, V2Hi, HiMask); + + return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV); +} + +/// \brief Dispatching routine to lower various 128-bit x86 vector shuffles. +/// +/// This routine breaks down the specific type of 128-bit shuffle and +/// dispatches to the lowering routines accordingly. +static SDValue lower128BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2, + MVT VT, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + switch (VT.SimpleTy) { + case MVT::v2i64: + return lowerV2I64VectorShuffle(Op, V1, V2, Subtarget, DAG); + case MVT::v2f64: + return lowerV2F64VectorShuffle(Op, V1, V2, Subtarget, DAG); + case MVT::v4i32: + return lowerV4I32VectorShuffle(Op, V1, V2, Subtarget, DAG); + case MVT::v4f32: + return lowerV4F32VectorShuffle(Op, V1, V2, Subtarget, DAG); + case MVT::v8i16: + return lowerV8I16VectorShuffle(Op, V1, V2, Subtarget, DAG); + case MVT::v16i8: + return lowerV16I8VectorShuffle(Op, V1, V2, Subtarget, DAG); + + default: + llvm_unreachable("Unimplemented!"); + } +} + +/// \brief Tiny helper function to test whether adjacent masks are sequential. +static bool areAdjacentMasksSequential(ArrayRef Mask) { + for (int i = 0, Size = Mask.size(); i < Size; i += 2) + if (Mask[i] + 1 != Mask[i+1]) + return false; + + return true; +} + +/// \brief Top-level lowering for x86 vector shuffles. +/// +/// This handles decomposition, canonicalization, and lowering of all x86 +/// vector shuffles. Most of the specific lowering strategies are encapsulated +/// above in helper routines. The canonicalization attempts to widen shuffles +/// to involve fewer lanes of wider elements, consolidate symmetric patterns +/// s.t. only one of the two inputs needs to be tested, etc. +static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + ShuffleVectorSDNode *SVOp = cast(Op); + ArrayRef Mask = SVOp->getMask(); + SDValue V1 = Op.getOperand(0); + SDValue V2 = Op.getOperand(1); + MVT VT = Op.getSimpleValueType(); + int NumElements = VT.getVectorNumElements(); + SDLoc dl(Op); + + assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles"); + + bool V1IsUndef = V1.getOpcode() == ISD::UNDEF; + bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; + if (V1IsUndef && V2IsUndef) + return DAG.getUNDEF(VT); + + // When we create a shuffle node we put the UNDEF node to second operand, + // but in some cases the first operand may be transformed to UNDEF. + // In this case we should just commute the node. 
+ if (V1IsUndef) + return DAG.getCommutedVectorShuffle(*SVOp); + + // Check for non-undef masks pointing at an undef vector and make the masks + // undef as well. This makes it easier to match the shuffle based solely on + // the mask. + if (V2IsUndef) + for (int M : Mask) + if (M >= NumElements) { + SmallVector<int, 8> NewMask(Mask.begin(), Mask.end()); + for (int &M : NewMask) + if (M >= NumElements) + M = -1; + return DAG.getVectorShuffle(VT, dl, V1, V2, NewMask); + } + + // For integer vector shuffles, try to collapse them into a shuffle of fewer + // lanes but wider integers. We cap this to not form integers larger than i64 + // but it might be interesting to form i128 integers to handle flipping the + // low and high halves of AVX 256-bit vectors. + if (VT.isInteger() && VT.getScalarSizeInBits() < 64 && + areAdjacentMasksSequential(Mask)) { + SmallVector<int, 8> NewMask; + for (int i = 0, Size = Mask.size(); i < Size; i += 2) + NewMask.push_back(Mask[i] / 2); + MVT NewVT = + MVT::getVectorVT(MVT::getIntegerVT(VT.getScalarSizeInBits() * 2), + VT.getVectorNumElements() / 2); + V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1); + V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2); + return DAG.getNode(ISD::BITCAST, dl, VT, + DAG.getVectorShuffle(NewVT, dl, V1, V2, NewMask)); + } + + int NumV1Elements = 0, NumUndefElements = 0, NumV2Elements = 0; + for (int M : SVOp->getMask()) + if (M < 0) + ++NumUndefElements; + else if (M < NumElements) + ++NumV1Elements; + else + ++NumV2Elements; + + // Commute the shuffle as needed such that more elements come from V1 than + // V2. This allows us to match the shuffle pattern strictly on how many + // elements come from V1 without handling the symmetric cases. + if (NumV2Elements > NumV1Elements) + return DAG.getCommutedVectorShuffle(*SVOp); + + // When the number of V1 and V2 elements is the same, try to minimize the + // number of uses of V2 in the low half of the vector. + if (NumV1Elements == NumV2Elements) { + int LowV1Elements = 0, LowV2Elements = 0; + for (int M : SVOp->getMask().slice(0, NumElements / 2)) + if (M >= NumElements) + ++LowV2Elements; + else if (M >= 0) + ++LowV1Elements; + if (LowV2Elements > LowV1Elements) + return DAG.getCommutedVectorShuffle(*SVOp); + } + + // For each vector width, delegate to a specialized lowering routine. + if (VT.getSizeInBits() == 128) + return lower128BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG); + + llvm_unreachable("Unimplemented!"); +} + + +//===----------------------------------------------------------------------===// +// Legacy vector shuffle lowering +// +// This code is the legacy code handling vector shuffles until the above +// replaces it in both functionality and performance. +//===----------------------------------------------------------------------===// + +static bool isBlendMask(ArrayRef<int> MaskVals, MVT VT, bool hasSSE41, + bool hasInt256, unsigned *MaskOut = nullptr) { + MVT EltVT = VT.getVectorElementType(); + + // There is no blend with immediate in AVX-512. + if (VT.is512BitVector()) + return false; + + if (!hasSSE41 || EltVT == MVT::i8) + return false; + if (!hasInt256 && VT == MVT::v16i16) + return false; + + unsigned MaskValue = 0; + unsigned NumElems = VT.getVectorNumElements(); + // There are 2 lanes if (NumElems > 8), and 1 lane otherwise. + unsigned NumLanes = (NumElems - 1) / 8 + 1; + unsigned NumElemsInLane = NumElems / NumLanes; + + // Blend for v16i16 should be symmetric for both lanes. + for (unsigned i = 0; i < NumElemsInLane; ++i) { + + int SndLaneEltIdx = (NumLanes == 2) ?
MaskVals[i + NumElemsInLane] : -1;
+    int EltIdx = MaskVals[i];
+
+    if ((EltIdx < 0 || EltIdx == (int)i) &&
+        (SndLaneEltIdx < 0 || SndLaneEltIdx == (int)(i + NumElemsInLane)))
+      continue;
+
+    if (((unsigned)EltIdx == (i + NumElems)) &&
+        (SndLaneEltIdx < 0 ||
+         (unsigned)SndLaneEltIdx == i + NumElems + NumElemsInLane))
+      MaskValue |= (1 << i);
+    else
+      return false;
+  }
+
+  if (MaskOut)
+    *MaskOut = MaskValue;
+  return true;
+}
+
+// Try to lower a shuffle node into a simple blend instruction. This function
+// assumes isBlendMask returns true for this ShuffleVectorSDNode.
+static SDValue LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp,
+                                          unsigned MaskValue,
+                                          const X86Subtarget *Subtarget,
+                                          SelectionDAG &DAG) {
+  MVT VT = SVOp->getSimpleValueType(0);
+  MVT EltVT = VT.getVectorElementType();
+  assert(isBlendMask(SVOp->getMask(), VT, Subtarget->hasSSE41(),
+                     Subtarget->hasInt256()) &&
+         "Trying to lower a VECTOR_SHUFFLE to a Blend but with the wrong mask");
+  SDValue V1 = SVOp->getOperand(0);
+  SDValue V2 = SVOp->getOperand(1);
+  SDLoc dl(SVOp);
+  unsigned NumElems = VT.getVectorNumElements();
+
+  // Convert i32 vectors to floating point if it is not AVX2.
+  // AVX2 introduced the VPBLENDD instruction for 128- and 256-bit vectors.
+  MVT BlendVT = VT;
+  if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
+    BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()),
+                               NumElems);
+    V1 = DAG.getNode(ISD::BITCAST, dl, BlendVT, V1);
+    V2 = DAG.getNode(ISD::BITCAST, dl, BlendVT, V2);
+  }
+
+  SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, V1, V2,
+                            DAG.getConstant(MaskValue, MVT::i32));
+  return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
+}
+
+/// In vector type \p VT, return true if the element at index \p InputIdx
+/// falls on a different 128-bit lane than \p OutputIdx.
+static bool ShuffleCrosses128bitLane(MVT VT, unsigned InputIdx,
+                                     unsigned OutputIdx) {
+  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
+  return InputIdx * EltSize / 128 != OutputIdx * EltSize / 128;
+}
+
+/// Generate a PSHUFB if possible. Selects elements from \p V1 according to
+/// \p MaskVals. MaskVals[OutputIdx] = InputIdx specifies that we want to
+/// shuffle the element at InputIdx in V1 to OutputIdx in the result. If \p
+/// MaskVals refers to elements outside of \p V1 or is undef (-1), insert a
+/// zero.
+static SDValue getPSHUFB(ArrayRef<int> MaskVals, SDValue V1, SDLoc &dl,
+                         SelectionDAG &DAG) {
+  MVT VT = V1.getSimpleValueType();
+  assert(VT.is128BitVector() || VT.is256BitVector());
+
+  MVT EltVT = VT.getVectorElementType();
+  unsigned EltSizeInBytes = EltVT.getSizeInBits() / 8;
+  unsigned NumElts = VT.getVectorNumElements();
+
+  SmallVector<SDValue, 32> PshufbMask;
+  for (unsigned OutputIdx = 0; OutputIdx < NumElts; ++OutputIdx) {
+    int InputIdx = MaskVals[OutputIdx];
+    unsigned InputByteIdx;
+
+    if (InputIdx < 0 || NumElts <= (unsigned)InputIdx)
+      InputByteIdx = 0x80;
+    else {
+      // Cross lane is not allowed.
+      if (ShuffleCrosses128bitLane(VT, InputIdx, OutputIdx))
+        return SDValue();
+      InputByteIdx = InputIdx * EltSizeInBytes;
+      // Index is a byte offset within the 128-bit lane.
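+      // Worked example (editor's sketch, assuming a v16i16 shuffle on AVX2):
+      // element 9 with 2-byte elements starts at byte 18; masking with 0xf
+      // gives byte 2, its offset within the upper 128-bit lane, which is the
+      // per-lane index PSHUFB expects.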
+ InputByteIdx &= 0xf; + } + + for (unsigned j = 0; j < EltSizeInBytes; ++j) { + PshufbMask.push_back(DAG.getConstant(InputByteIdx, MVT::i8)); + if (InputByteIdx != 0x80) + ++InputByteIdx; + } + } + + MVT ShufVT = MVT::getVectorVT(MVT::i8, PshufbMask.size()); if (ShufVT != VT) V1 = DAG.getNode(ISD::BITCAST, dl, ShufVT, V1); return DAG.getNode(X86ISD::PSHUFB, dl, ShufVT, V1, @@ -7731,12 +9065,13 @@ static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl, std::find_if(Mask.begin(), Mask.end(), FromV2Predicate) - Mask.begin(); } + unsigned SrcIndex = Mask[DestIndex] % 4; if (MayFoldLoad(From)) { // Trivial case, when From comes from a load and is only used by the // shuffle. Make it use insertps from the vector that we need from that // load. SDValue NewLoad = - NarrowVectorLoadToElement(cast(From), DestIndex, DAG); + NarrowVectorLoadToElement(cast(From), SrcIndex, DAG); if (!NewLoad.getNode()) return SDValue(); @@ -7757,7 +9092,6 @@ static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl, } // Vector-element-to-vector - unsigned SrcIndex = Mask[DestIndex] % 4; SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4 | SrcIndex << 6); return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, From, InsertpsMask); } @@ -7924,6 +9258,11 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { bool OptForSize = MF.getFunction()->getAttributes(). hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize); + // Check if we should use the experimental vector shuffle lowering. If so, + // delegate completely to that code path. + if (ExperimentalVectorShuffleLowering) + return lowerVectorShuffle(Op, Subtarget, DAG); + assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles"); if (V1IsUndef && V2IsUndef) @@ -7933,7 +9272,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { // but in some cases the first operand may be transformed to UNDEF. // In this case we should just commute the node. if (V1IsUndef) - return CommuteVectorShuffle(SVOp, DAG); + return DAG.getCommutedVectorShuffle(*SVOp); // Vector shuffle lowering takes 3 steps: // @@ -8045,7 +9384,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { if (ShouldXformToMOVHLPS(M, VT) || ShouldXformToMOVLP(V1.getNode(), V2.getNode(), M, VT)) - return CommuteVectorShuffle(SVOp, DAG); + return DAG.getCommutedVectorShuffle(*SVOp); if (isShift) { // No better options. Use a vshldq / vsrldq. @@ -8057,8 +9396,13 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { bool Commuted = false; // FIXME: This should also accept a bitcast of a splat? Be careful, not // 1,1,1,1 -> v8i16 though. - V1IsSplat = isSplatVector(V1.getNode()); - V2IsSplat = isSplatVector(V2.getNode()); + BitVector UndefElements; + if (auto *BVOp = dyn_cast(V1.getNode())) + if (BVOp->getConstantSplatNode(&UndefElements) && UndefElements.none()) + V1IsSplat = true; + if (auto *BVOp = dyn_cast(V2.getNode())) + if (BVOp->getConstantSplatNode(&UndefElements) && UndefElements.none()) + V2IsSplat = true; // Canonicalize the splat or undef, if present, to be on the RHS. 
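+  // For example (editor's sketch, not from the original patch): if V1 is a
+  // constant splat such as <i32 7, i32 7, i32 7, i32 7> and V2 is not, the
+  // shuffle is commuted below so that later matching only needs to handle the
+  // splat-on-the-RHS form.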
if (!V2IsUndef && V1IsSplat && !V2IsSplat) { @@ -8112,7 +9456,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { // Normalize the node to match x86 shuffle ops if needed if (!V2IsUndef && (isSHUFPMask(M, VT, /* Commuted */ true))) - return CommuteVectorShuffle(SVOp, DAG); + return DAG.getCommutedVectorShuffle(*SVOp); // The checks below are all present in isShuffleMaskLegal, but they are // inlined here right now to enable us to directly emit target specific @@ -8134,6 +9478,11 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { getShufflePSHUFLWImmediate(SVOp), DAG); + unsigned MaskValue; + if (isBlendMask(M, VT, Subtarget->hasSSE41(), Subtarget->hasInt256(), + &MaskValue)) + return LowerVECTOR_SHUFFLEtoBlend(SVOp, MaskValue, Subtarget, DAG); + if (isSHUFPMask(M, VT)) return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V2, getShuffleSHUFImmediate(SVOp), DAG); @@ -8171,11 +9520,6 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1, V2, getShuffleVPERM2X128Immediate(SVOp), DAG); - unsigned MaskValue; - if (isBlendMask(M, VT, Subtarget->hasSSE41(), Subtarget->hasInt256(), - &MaskValue)) - return LowerVECTOR_SHUFFLEtoBlend(SVOp, MaskValue, Subtarget, DAG); - if (Subtarget->hasSSE41() && isINSERTPSMask(M, VT)) return getINSERTPS(SVOp, dl, DAG); @@ -12438,11 +13782,37 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { Op.getOperand(1), Op.getOperand(2)); } + case Intrinsic::x86_sse2_packssdw_128: + case Intrinsic::x86_sse2_packsswb_128: + case Intrinsic::x86_avx2_packssdw: + case Intrinsic::x86_avx2_packsswb: + return DAG.getNode(X86ISD::PACKSS, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + + case Intrinsic::x86_sse2_packuswb_128: + case Intrinsic::x86_sse41_packusdw: + case Intrinsic::x86_avx2_packuswb: + case Intrinsic::x86_avx2_packusdw: + return DAG.getNode(X86ISD::PACKUS, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::x86_ssse3_pshuf_b_128: case Intrinsic::x86_avx2_pshuf_b: return DAG.getNode(X86ISD::PSHUFB, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::x86_sse2_pshuf_d: + return DAG.getNode(X86ISD::PSHUFD, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + + case Intrinsic::x86_sse2_pshufl_w: + return DAG.getNode(X86ISD::PSHUFLW, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + + case Intrinsic::x86_sse2_pshufh_w: + return DAG.getNode(X86ISD::PSHUFHW, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::x86_ssse3_psign_b_128: case Intrinsic::x86_ssse3_psign_w_128: case Intrinsic::x86_ssse3_psign_d_128: @@ -12890,6 +14260,51 @@ static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, return SDValue(Res, 0); } +// getReadPerformanceCounter - Handles the lowering of builtin intrinsics that +// read performance monitor counters (x86_rdpmc). +static void getReadPerformanceCounter(SDNode *N, SDLoc DL, + SelectionDAG &DAG, const X86Subtarget *Subtarget, + SmallVectorImpl &Results) { + assert(N->getNumOperands() == 3 && "Unexpected number of operands!"); + SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue LO, HI; + + // The ECX register is used to select the index of the performance counter + // to read. 
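+  // For reference (editor's sketch; the IR below is an assumed example, not
+  // taken from this patch): a call such as
+  //   %v = call i64 @llvm.x86.rdpmc(i32 0)
+  // becomes a CopyToReg of the counter index into ECX, an RDPMC_DAG node, and
+  // CopyFromReg of EDX:EAX (RDX:RAX in 64-bit mode) merged into one i64.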
+ SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX, + N->getOperand(2)); + SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain); + + // Reads the content of a 64-bit performance counter and returns it in the + // registers EDX:EAX. + if (Subtarget->is64Bit()) { + LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1)); + HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64, + LO.getValue(2)); + } else { + LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1)); + HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32, + LO.getValue(2)); + } + Chain = HI.getValue(1); + + if (Subtarget->is64Bit()) { + // The EAX register is loaded with the low-order 32 bits. The EDX register + // is loaded with the supported high-order bits of the counter. + SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI, + DAG.getConstant(32, MVT::i8)); + Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp)); + Results.push_back(Chain); + return; + } + + // Use a buildpair to merge the two 32-bit values into a 64-bit one. + SDValue Ops[] = { LO, HI }; + SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops); + Results.push_back(Pair); + Results.push_back(Chain); +} + // getReadTimeStampCounter - Handles the lowering of builtin intrinsics that // read the time stamp counter (x86_rdtsc and x86_rdtscp). This function is // also used to custom lower READCYCLECOUNTER nodes. @@ -12954,7 +14369,7 @@ static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget, } enum IntrinsicType { - GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDTSC, XTEST + GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST }; struct IntrinsicData { @@ -13048,6 +14463,8 @@ static void InitIntinsicsMap() { IntrinsicData(RDTSC, X86ISD::RDTSC_DAG, 0))); IntrMap.insert(std::make_pair(Intrinsic::x86_rdtscp, IntrinsicData(RDTSC, X86ISD::RDTSCP_DAG, 0))); + IntrMap.insert(std::make_pair(Intrinsic::x86_rdpmc, + IntrinsicData(RDPMC, X86ISD::RDPMC_DAG, 0))); Initialized = true; } @@ -13123,6 +14540,12 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, getReadTimeStampCounter(Op.getNode(), dl, Intr.Opc0, DAG, Subtarget, Results); return DAG.getMergeValues(Results, dl); } + // Read Performance Monitoring Counters. + case RDPMC: { + SmallVector Results; + getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results); + return DAG.getMergeValues(Results, dl); + } // XTEST intrinsics. case XTEST: { SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other); @@ -13711,7 +15134,7 @@ SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) cons CLI.setDebugLoc(dl).setChain(InChain) .setCallee(getLibcallCallingConv(LC), static_cast(MVT::v2i64).getTypeForEVT(*DAG.getContext()), - Callee, &Args, 0) + Callee, std::move(Args), 0) .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned); std::pair CallInfo = LowerCallTo(CLI); @@ -13727,10 +15150,23 @@ static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget, assert((VT == MVT::v4i32 && Subtarget->hasSSE2()) || (VT == MVT::v8i32 && Subtarget->hasInt256())); - // Get the high parts. - const int Mask[] = {1, 2, 3, 4, 5, 6, 7, 8}; - SDValue Hi0 = DAG.getVectorShuffle(VT, dl, Op0, Op0, Mask); - SDValue Hi1 = DAG.getVectorShuffle(VT, dl, Op1, Op1, Mask); + // PMULxD operations multiply each even value (starting at 0) of LHS with + // the related value of RHS and produce a widen result. 
+  // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
+  // => <2 x i64> <ae|cg>
+  //
+  // In other words, to have all the results, we need to perform two PMULxD:
+  // 1. one with the even values.
+  // 2. one with the odd values.
+  // To achieve #2, we need to place the odd values at an even position.
+  //
+  // Place the odd value at an even position (basically, shift all values 1
+  // step to the left):
+  const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1};
+  // <a|b|c|d> => <b|undef|d|undef>
+  SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0, Mask);
+  // <e|f|g|h> => <f|undef|h|undef>
+  SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1, Mask);
 
   // Emit two multiplies, one for the lower 2 ints and one for the higher 2
   // ints.
@@ -13738,16 +15174,41 @@ static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget,
   bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
   unsigned Opcode =
       (!IsSigned || !Subtarget->hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
+  // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
+  // => <2 x i64> <ae|cg>
   SDValue Mul1 = DAG.getNode(ISD::BITCAST, dl, VT,
                              DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
+  // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
+  // => <2 x i64> <bf|dh>
   SDValue Mul2 = DAG.getNode(ISD::BITCAST, dl, VT,
-                             DAG.getNode(Opcode, dl, MulVT, Hi0, Hi1));
+                             DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1));
 
   // Shuffle it back into the right order.
-  const int HighMask[] = {1, 5, 3, 7, 9, 13, 11, 15};
-  SDValue Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
-  const int LowMask[] = {0, 4, 2, 6, 8, 12, 10, 14};
-  SDValue Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
+  // The internal representation is big endian.
+  // In other words, an i64 bitcasted to 2 x i32 has its high part at index 0
+  // and its low part at index 1.
+  // Moreover, we have: Mul1 = <ae|cg> ; Mul2 = <bf|dh>
+  //       Vector index:          0  1 ;          2  3
+  // We want:            <ae|bf|cg|dh>
+  //       Vector index:   0  2  1  3
+  // Since each element is seen as 2 x i32, we get:
+  // high_mask[i] = 2 x vector_index[i]
+  // low_mask[i] = 2 x vector_index[i] + 1
+  // where vector_index = {0, Size/2, 1, Size/2 + 1, ...,
+  //                       Size/2 - 1, Size/2 + Size/2 - 1}
+  // where Size is the number of elements of the final vector.
+  SDValue Highs, Lows;
+  if (VT == MVT::v8i32) {
+    const int HighMask[] = {0, 8, 2, 10, 4, 12, 6, 14};
+    Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
+    const int LowMask[] = {1, 9, 3, 11, 5, 13, 7, 15};
+    Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
+  } else {
+    const int HighMask[] = {0, 4, 2, 6};
+    Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
+    const int LowMask[] = {1, 5, 3, 7};
+    Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
+  }
 
   // If we have a signed multiply but no PMULDQ fix up the high parts of an
   // unsigned multiply.
@@ -13763,7 +15224,9 @@ static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget,
     Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
   }
 
-  return DAG.getNode(ISD::MERGE_VALUES, dl, Op.getValueType(), Highs, Lows);
+  // The low part of a MUL_LOHI is supposed to be the first value and the
+  // high part the second value.
+  return DAG.getNode(ISD::MERGE_VALUES, dl, Op.getValueType(), Lows, Highs);
 }
 
 static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
@@ -13774,10 +15237,9 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
   SDValue Amt = Op.getOperand(1);
 
   // Optimize shl/srl/sra with constant shift amount.
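+  // For instance (editor's sketch, not part of the patch): a splat amount
+  // such as
+  //   shl <4 x i32> %x, <i32 3, i32 3, i32 3, i32 3>
+  // is recognized below and emitted as a single PSLLD-by-immediate.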
- if (isSplatVector(Amt.getNode())) { - SDValue SclrAmt = Amt->getOperand(0); - if (ConstantSDNode *C = dyn_cast(SclrAmt)) { - uint64_t ShiftAmt = C->getZExtValue(); + if (auto *BVAmt = dyn_cast(Amt)) { + if (auto *ShiftConst = BVAmt->getConstantSplatNode()) { + uint64_t ShiftAmt = ShiftConst->getZExtValue(); if (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 || (Subtarget->hasInt256() && @@ -14084,15 +15546,14 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG, static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, SelectionDAG &DAG) { - MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); SDValue R = Op.getOperand(0); SDValue Amt = Op.getOperand(1); SDValue V; - if (!Subtarget->hasSSE2()) - return SDValue(); + assert(VT.isVector() && "Custom lowering only for vector shifts!"); + assert(Subtarget->hasSSE2() && "Only custom lower when we have SSE2!"); V = LowerScalarImmediateShift(Op, DAG, Subtarget); if (V.getNode()) @@ -14711,7 +16172,7 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget, TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl).setChain(DAG.getEntryNode()) - .setCallee(CallingConv::C, RetTy, Callee, &Args, 0); + .setCallee(CallingConv::C, RetTy, Callee, std::move(Args), 0); std::pair CallResult = TLI.LowerCallTo(CLI); @@ -14840,29 +16301,6 @@ static void ReplaceATOMIC_LOAD(SDNode *Node, Results.push_back(Swap.getValue(2)); } -static void -ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl&Results, - SelectionDAG &DAG, unsigned NewOp) { - SDLoc dl(Node); - assert (Node->getValueType(0) == MVT::i64 && - "Only know how to expand i64 atomics"); - - SDValue Chain = Node->getOperand(0); - SDValue In1 = Node->getOperand(1); - SDValue In2L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, - Node->getOperand(2), DAG.getIntPtrConstant(0)); - SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, - Node->getOperand(2), DAG.getIntPtrConstant(1)); - SDValue Ops[] = { Chain, In1, In2L, In2H }; - SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other); - SDValue Result = - DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, MVT::i64, - cast(Node)->getMemOperand()); - SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)}; - Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF)); - Results.push_back(Result.getValue(2)); -} - /// ReplaceNodeResults - Replace a node with an illegal result type /// with a new node built out of custom code. 
void X86TargetLowering::ReplaceNodeResults(SDNode *N, @@ -14947,6 +16385,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, case Intrinsic::x86_rdtscp: return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget, Results); + case Intrinsic::x86_rdpmc: + return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results); } } case ISD::READCYCLECOUNTER: { @@ -15008,57 +16448,20 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, Results.push_back(EFLAGS.getValue(1)); return; } + case ISD::ATOMIC_SWAP: case ISD::ATOMIC_LOAD_ADD: + case ISD::ATOMIC_LOAD_SUB: case ISD::ATOMIC_LOAD_AND: - case ISD::ATOMIC_LOAD_NAND: case ISD::ATOMIC_LOAD_OR: - case ISD::ATOMIC_LOAD_SUB: case ISD::ATOMIC_LOAD_XOR: - case ISD::ATOMIC_LOAD_MAX: + case ISD::ATOMIC_LOAD_NAND: case ISD::ATOMIC_LOAD_MIN: - case ISD::ATOMIC_LOAD_UMAX: + case ISD::ATOMIC_LOAD_MAX: case ISD::ATOMIC_LOAD_UMIN: - case ISD::ATOMIC_SWAP: { - unsigned Opc; - switch (N->getOpcode()) { - default: llvm_unreachable("Unexpected opcode"); - case ISD::ATOMIC_LOAD_ADD: - Opc = X86ISD::ATOMADD64_DAG; - break; - case ISD::ATOMIC_LOAD_AND: - Opc = X86ISD::ATOMAND64_DAG; - break; - case ISD::ATOMIC_LOAD_NAND: - Opc = X86ISD::ATOMNAND64_DAG; - break; - case ISD::ATOMIC_LOAD_OR: - Opc = X86ISD::ATOMOR64_DAG; - break; - case ISD::ATOMIC_LOAD_SUB: - Opc = X86ISD::ATOMSUB64_DAG; - break; - case ISD::ATOMIC_LOAD_XOR: - Opc = X86ISD::ATOMXOR64_DAG; - break; - case ISD::ATOMIC_LOAD_MAX: - Opc = X86ISD::ATOMMAX64_DAG; - break; - case ISD::ATOMIC_LOAD_MIN: - Opc = X86ISD::ATOMMIN64_DAG; - break; - case ISD::ATOMIC_LOAD_UMAX: - Opc = X86ISD::ATOMUMAX64_DAG; - break; - case ISD::ATOMIC_LOAD_UMIN: - Opc = X86ISD::ATOMUMIN64_DAG; - break; - case ISD::ATOMIC_SWAP: - Opc = X86ISD::ATOMSWAP64_DAG; - break; - } - ReplaceATOMIC_BINARY_64(N, Results, DAG, Opc); - return; - } + case ISD::ATOMIC_LOAD_UMAX: + // Delegate to generic TypeLegalization. Situations we can really handle + // should have already been dealt with by X86AtomicExpand.cpp. + break; case ISD::ATOMIC_LOAD: { ReplaceATOMIC_LOAD(N, Results, DAG); return; @@ -15079,6 +16482,13 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, MVT::v2f64, N->getOperand(0)); SDValue ToVecInt = DAG.getNode(ISD::BITCAST, dl, WiderVT, Expanded); + if (ExperimentalVectorWideningLegalization) { + // If we are legalizing vectors by widening, we already have the desired + // legal vector type, just return it. 
+ Results.push_back(ToVecInt); + return; + } + SmallVector Elts; for (unsigned i = 0, e = NumElts; i != e; ++i) Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, @@ -15111,6 +16521,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::CALL: return "X86ISD::CALL"; case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG"; case X86ISD::RDTSCP_DAG: return "X86ISD::RDTSCP_DAG"; + case X86ISD::RDPMC_DAG: return "X86ISD::RDPMC_DAG"; case X86ISD::BT: return "X86ISD::BT"; case X86ISD::CMP: return "X86ISD::CMP"; case X86ISD::COMI: return "X86ISD::COMI"; @@ -15165,12 +16576,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG"; case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG"; case X86ISD::LCMPXCHG16_DAG: return "X86ISD::LCMPXCHG16_DAG"; - case X86ISD::ATOMADD64_DAG: return "X86ISD::ATOMADD64_DAG"; - case X86ISD::ATOMSUB64_DAG: return "X86ISD::ATOMSUB64_DAG"; - case X86ISD::ATOMOR64_DAG: return "X86ISD::ATOMOR64_DAG"; - case X86ISD::ATOMXOR64_DAG: return "X86ISD::ATOMXOR64_DAG"; - case X86ISD::ATOMAND64_DAG: return "X86ISD::ATOMAND64_DAG"; - case X86ISD::ATOMNAND64_DAG: return "X86ISD::ATOMNAND64_DAG"; case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL"; case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD"; case X86ISD::VZEXT: return "X86ISD::VZEXT"; @@ -15211,6 +16616,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::TESTM: return "X86ISD::TESTM"; case X86ISD::TESTNM: return "X86ISD::TESTNM"; case X86ISD::KORTEST: return "X86ISD::KORTEST"; + case X86ISD::PACKSS: return "X86ISD::PACKSS"; + case X86ISD::PACKUS: return "X86ISD::PACKUS"; case X86ISD::PALIGNR: return "X86ISD::PALIGNR"; case X86ISD::PSHUFD: return "X86ISD::PSHUFD"; case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW"; @@ -15467,6 +16874,7 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl &M, return (SVT.getVectorNumElements() == 2 || ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isMOVLMask(M, SVT) || + isMOVHLPSMask(M, SVT) || isSHUFPMask(M, SVT) || isPSHUFDMask(M, SVT) || isPSHUFHWMask(M, SVT, Subtarget->hasInt256()) || @@ -15481,758 +16889,79 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl &M, bool X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl &Mask, - EVT VT) const { - if (!VT.isSimple()) - return false; - - MVT SVT = VT.getSimpleVT(); - unsigned NumElts = SVT.getVectorNumElements(); - // FIXME: This collection of masks seems suspect. - if (NumElts == 2) - return true; - if (NumElts == 4 && SVT.is128BitVector()) { - return (isMOVLMask(Mask, SVT) || - isCommutedMOVLMask(Mask, SVT, true) || - isSHUFPMask(Mask, SVT) || - isSHUFPMask(Mask, SVT, /* Commuted */ true)); - } - return false; -} - -//===----------------------------------------------------------------------===// -// X86 Scheduler Hooks -//===----------------------------------------------------------------------===// - -/// Utility function to emit xbegin specifying the start of an RTM region. 
-static MachineBasicBlock *EmitXBegin(MachineInstr *MI, MachineBasicBlock *MBB, - const TargetInstrInfo *TII) { - DebugLoc DL = MI->getDebugLoc(); - - const BasicBlock *BB = MBB->getBasicBlock(); - MachineFunction::iterator I = MBB; - ++I; - - // For the v = xbegin(), we generate - // - // thisMBB: - // xbegin sinkMBB - // - // mainMBB: - // eax = -1 - // - // sinkMBB: - // v = eax - - MachineBasicBlock *thisMBB = MBB; - MachineFunction *MF = MBB->getParent(); - MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB); - MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB); - MF->insert(I, mainMBB); - MF->insert(I, sinkMBB); - - // Transfer the remainder of BB and its successor edges to sinkMBB. - sinkMBB->splice(sinkMBB->begin(), MBB, - std::next(MachineBasicBlock::iterator(MI)), MBB->end()); - sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); - - // thisMBB: - // xbegin sinkMBB - // # fallthrough to mainMBB - // # abortion to sinkMBB - BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(sinkMBB); - thisMBB->addSuccessor(mainMBB); - thisMBB->addSuccessor(sinkMBB); - - // mainMBB: - // EAX = -1 - BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), X86::EAX).addImm(-1); - mainMBB->addSuccessor(sinkMBB); - - // sinkMBB: - // EAX is live into the sinkMBB - sinkMBB->addLiveIn(X86::EAX); - BuildMI(*sinkMBB, sinkMBB->begin(), DL, - TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg()) - .addReg(X86::EAX); - - MI->eraseFromParent(); - return sinkMBB; -} - -// Get CMPXCHG opcode for the specified data type. -static unsigned getCmpXChgOpcode(EVT VT) { - switch (VT.getSimpleVT().SimpleTy) { - case MVT::i8: return X86::LCMPXCHG8; - case MVT::i16: return X86::LCMPXCHG16; - case MVT::i32: return X86::LCMPXCHG32; - case MVT::i64: return X86::LCMPXCHG64; - default: - break; - } - llvm_unreachable("Invalid operand size!"); -} - -// Get LOAD opcode for the specified data type. -static unsigned getLoadOpcode(EVT VT) { - switch (VT.getSimpleVT().SimpleTy) { - case MVT::i8: return X86::MOV8rm; - case MVT::i16: return X86::MOV16rm; - case MVT::i32: return X86::MOV32rm; - case MVT::i64: return X86::MOV64rm; - default: - break; - } - llvm_unreachable("Invalid operand size!"); -} - -// Get opcode of the non-atomic one from the specified atomic instruction. -static unsigned getNonAtomicOpcode(unsigned Opc) { - switch (Opc) { - case X86::ATOMAND8: return X86::AND8rr; - case X86::ATOMAND16: return X86::AND16rr; - case X86::ATOMAND32: return X86::AND32rr; - case X86::ATOMAND64: return X86::AND64rr; - case X86::ATOMOR8: return X86::OR8rr; - case X86::ATOMOR16: return X86::OR16rr; - case X86::ATOMOR32: return X86::OR32rr; - case X86::ATOMOR64: return X86::OR64rr; - case X86::ATOMXOR8: return X86::XOR8rr; - case X86::ATOMXOR16: return X86::XOR16rr; - case X86::ATOMXOR32: return X86::XOR32rr; - case X86::ATOMXOR64: return X86::XOR64rr; - } - llvm_unreachable("Unhandled atomic-load-op opcode!"); -} - -// Get opcode of the non-atomic one from the specified atomic instruction with -// extra opcode. 
-static unsigned getNonAtomicOpcodeWithExtraOpc(unsigned Opc, - unsigned &ExtraOpc) { - switch (Opc) { - case X86::ATOMNAND8: ExtraOpc = X86::NOT8r; return X86::AND8rr; - case X86::ATOMNAND16: ExtraOpc = X86::NOT16r; return X86::AND16rr; - case X86::ATOMNAND32: ExtraOpc = X86::NOT32r; return X86::AND32rr; - case X86::ATOMNAND64: ExtraOpc = X86::NOT64r; return X86::AND64rr; - case X86::ATOMMAX8: ExtraOpc = X86::CMP8rr; return X86::CMOVL32rr; - case X86::ATOMMAX16: ExtraOpc = X86::CMP16rr; return X86::CMOVL16rr; - case X86::ATOMMAX32: ExtraOpc = X86::CMP32rr; return X86::CMOVL32rr; - case X86::ATOMMAX64: ExtraOpc = X86::CMP64rr; return X86::CMOVL64rr; - case X86::ATOMMIN8: ExtraOpc = X86::CMP8rr; return X86::CMOVG32rr; - case X86::ATOMMIN16: ExtraOpc = X86::CMP16rr; return X86::CMOVG16rr; - case X86::ATOMMIN32: ExtraOpc = X86::CMP32rr; return X86::CMOVG32rr; - case X86::ATOMMIN64: ExtraOpc = X86::CMP64rr; return X86::CMOVG64rr; - case X86::ATOMUMAX8: ExtraOpc = X86::CMP8rr; return X86::CMOVB32rr; - case X86::ATOMUMAX16: ExtraOpc = X86::CMP16rr; return X86::CMOVB16rr; - case X86::ATOMUMAX32: ExtraOpc = X86::CMP32rr; return X86::CMOVB32rr; - case X86::ATOMUMAX64: ExtraOpc = X86::CMP64rr; return X86::CMOVB64rr; - case X86::ATOMUMIN8: ExtraOpc = X86::CMP8rr; return X86::CMOVA32rr; - case X86::ATOMUMIN16: ExtraOpc = X86::CMP16rr; return X86::CMOVA16rr; - case X86::ATOMUMIN32: ExtraOpc = X86::CMP32rr; return X86::CMOVA32rr; - case X86::ATOMUMIN64: ExtraOpc = X86::CMP64rr; return X86::CMOVA64rr; - } - llvm_unreachable("Unhandled atomic-load-op opcode!"); -} - -// Get opcode of the non-atomic one from the specified atomic instruction for -// 64-bit data type on 32-bit target. -static unsigned getNonAtomic6432Opcode(unsigned Opc, unsigned &HiOpc) { - switch (Opc) { - case X86::ATOMAND6432: HiOpc = X86::AND32rr; return X86::AND32rr; - case X86::ATOMOR6432: HiOpc = X86::OR32rr; return X86::OR32rr; - case X86::ATOMXOR6432: HiOpc = X86::XOR32rr; return X86::XOR32rr; - case X86::ATOMADD6432: HiOpc = X86::ADC32rr; return X86::ADD32rr; - case X86::ATOMSUB6432: HiOpc = X86::SBB32rr; return X86::SUB32rr; - case X86::ATOMSWAP6432: HiOpc = X86::MOV32rr; return X86::MOV32rr; - case X86::ATOMMAX6432: HiOpc = X86::SETLr; return X86::SETLr; - case X86::ATOMMIN6432: HiOpc = X86::SETGr; return X86::SETGr; - case X86::ATOMUMAX6432: HiOpc = X86::SETBr; return X86::SETBr; - case X86::ATOMUMIN6432: HiOpc = X86::SETAr; return X86::SETAr; - } - llvm_unreachable("Unhandled atomic-load-op opcode!"); -} - -// Get opcode of the non-atomic one from the specified atomic instruction for -// 64-bit data type on 32-bit target with extra opcode. -static unsigned getNonAtomic6432OpcodeWithExtraOpc(unsigned Opc, - unsigned &HiOpc, - unsigned &ExtraOpc) { - switch (Opc) { - case X86::ATOMNAND6432: - ExtraOpc = X86::NOT32r; - HiOpc = X86::AND32rr; - return X86::AND32rr; - } - llvm_unreachable("Unhandled atomic-load-op opcode!"); -} - -// Get pseudo CMOV opcode from the specified data type. -static unsigned getPseudoCMOVOpc(EVT VT) { - switch (VT.getSimpleVT().SimpleTy) { - case MVT::i8: return X86::CMOV_GR8; - case MVT::i16: return X86::CMOV_GR16; - case MVT::i32: return X86::CMOV_GR32; - default: - break; - } - llvm_unreachable("Unknown CMOV opcode!"); -} - -// EmitAtomicLoadArith - emit the code sequence for pseudo atomic instructions. -// They will be translated into a spin-loop or compare-exchange loop from -// -// ... -// dst = atomic-fetch-op MI.addr, MI.val -// ... -// -// to -// -// ... 
-// t1 = LOAD MI.addr -// loop: -// t4 = phi(t1, t3 / loop) -// t2 = OP MI.val, t4 -// EAX = t4 -// LCMPXCHG [MI.addr], t2, [EAX is implicitly used & defined] -// t3 = EAX -// JNE loop -// sink: -// dst = t3 -// ... -MachineBasicBlock * -X86TargetLowering::EmitAtomicLoadArith(MachineInstr *MI, - MachineBasicBlock *MBB) const { - MachineFunction *MF = MBB->getParent(); - const TargetInstrInfo *TII = MF->getTarget().getInstrInfo(); - DebugLoc DL = MI->getDebugLoc(); - - MachineRegisterInfo &MRI = MF->getRegInfo(); - - const BasicBlock *BB = MBB->getBasicBlock(); - MachineFunction::iterator I = MBB; - ++I; - - assert(MI->getNumOperands() <= X86::AddrNumOperands + 4 && - "Unexpected number of operands"); - - assert(MI->hasOneMemOperand() && - "Expected atomic-load-op to have one memoperand"); - - // Memory Reference - MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); - MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); - - unsigned DstReg, SrcReg; - unsigned MemOpndSlot; - - unsigned CurOp = 0; - - DstReg = MI->getOperand(CurOp++).getReg(); - MemOpndSlot = CurOp; - CurOp += X86::AddrNumOperands; - SrcReg = MI->getOperand(CurOp++).getReg(); - - const TargetRegisterClass *RC = MRI.getRegClass(DstReg); - MVT::SimpleValueType VT = *RC->vt_begin(); - unsigned t1 = MRI.createVirtualRegister(RC); - unsigned t2 = MRI.createVirtualRegister(RC); - unsigned t3 = MRI.createVirtualRegister(RC); - unsigned t4 = MRI.createVirtualRegister(RC); - unsigned PhyReg = getX86SubSuperRegister(X86::EAX, VT); - - unsigned LCMPXCHGOpc = getCmpXChgOpcode(VT); - unsigned LOADOpc = getLoadOpcode(VT); - - // For the atomic load-arith operator, we generate - // - // thisMBB: - // t1 = LOAD [MI.addr] - // mainMBB: - // t4 = phi(t1 / thisMBB, t3 / mainMBB) - // t1 = OP MI.val, EAX - // EAX = t4 - // LCMPXCHG [MI.addr], t1, [EAX is implicitly used & defined] - // t3 = EAX - // JNE mainMBB - // sinkMBB: - // dst = t3 - - MachineBasicBlock *thisMBB = MBB; - MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB); - MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB); - MF->insert(I, mainMBB); - MF->insert(I, sinkMBB); - - MachineInstrBuilder MIB; - - // Transfer the remainder of BB and its successor edges to sinkMBB. - sinkMBB->splice(sinkMBB->begin(), MBB, - std::next(MachineBasicBlock::iterator(MI)), MBB->end()); - sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); - - // thisMBB: - MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), t1); - for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { - MachineOperand NewMO = MI->getOperand(MemOpndSlot + i); - if (NewMO.isReg()) - NewMO.setIsKill(false); - MIB.addOperand(NewMO); - } - for (MachineInstr::mmo_iterator MMOI = MMOBegin; MMOI != MMOEnd; ++MMOI) { - unsigned flags = (*MMOI)->getFlags(); - flags = (flags & ~MachineMemOperand::MOStore) | MachineMemOperand::MOLoad; - MachineMemOperand *MMO = - MF->getMachineMemOperand((*MMOI)->getPointerInfo(), flags, - (*MMOI)->getSize(), - (*MMOI)->getBaseAlignment(), - (*MMOI)->getTBAAInfo(), - (*MMOI)->getRanges()); - MIB.addMemOperand(MMO); - } - - thisMBB->addSuccessor(mainMBB); - - // mainMBB: - MachineBasicBlock *origMainMBB = mainMBB; - - // Add a PHI. 
- MachineInstr *Phi = BuildMI(mainMBB, DL, TII->get(X86::PHI), t4) - .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(mainMBB); - - unsigned Opc = MI->getOpcode(); - switch (Opc) { - default: - llvm_unreachable("Unhandled atomic-load-op opcode!"); - case X86::ATOMAND8: - case X86::ATOMAND16: - case X86::ATOMAND32: - case X86::ATOMAND64: - case X86::ATOMOR8: - case X86::ATOMOR16: - case X86::ATOMOR32: - case X86::ATOMOR64: - case X86::ATOMXOR8: - case X86::ATOMXOR16: - case X86::ATOMXOR32: - case X86::ATOMXOR64: { - unsigned ARITHOpc = getNonAtomicOpcode(Opc); - BuildMI(mainMBB, DL, TII->get(ARITHOpc), t2).addReg(SrcReg) - .addReg(t4); - break; - } - case X86::ATOMNAND8: - case X86::ATOMNAND16: - case X86::ATOMNAND32: - case X86::ATOMNAND64: { - unsigned Tmp = MRI.createVirtualRegister(RC); - unsigned NOTOpc; - unsigned ANDOpc = getNonAtomicOpcodeWithExtraOpc(Opc, NOTOpc); - BuildMI(mainMBB, DL, TII->get(ANDOpc), Tmp).addReg(SrcReg) - .addReg(t4); - BuildMI(mainMBB, DL, TII->get(NOTOpc), t2).addReg(Tmp); - break; - } - case X86::ATOMMAX8: - case X86::ATOMMAX16: - case X86::ATOMMAX32: - case X86::ATOMMAX64: - case X86::ATOMMIN8: - case X86::ATOMMIN16: - case X86::ATOMMIN32: - case X86::ATOMMIN64: - case X86::ATOMUMAX8: - case X86::ATOMUMAX16: - case X86::ATOMUMAX32: - case X86::ATOMUMAX64: - case X86::ATOMUMIN8: - case X86::ATOMUMIN16: - case X86::ATOMUMIN32: - case X86::ATOMUMIN64: { - unsigned CMPOpc; - unsigned CMOVOpc = getNonAtomicOpcodeWithExtraOpc(Opc, CMPOpc); - - BuildMI(mainMBB, DL, TII->get(CMPOpc)) - .addReg(SrcReg) - .addReg(t4); - - if (Subtarget->hasCMov()) { - if (VT != MVT::i8) { - // Native support - BuildMI(mainMBB, DL, TII->get(CMOVOpc), t2) - .addReg(SrcReg) - .addReg(t4); - } else { - // Promote i8 to i32 to use CMOV32 - const TargetRegisterInfo* TRI = MF->getTarget().getRegisterInfo(); - const TargetRegisterClass *RC32 = - TRI->getSubClassWithSubReg(getRegClassFor(MVT::i32), X86::sub_8bit); - unsigned SrcReg32 = MRI.createVirtualRegister(RC32); - unsigned AccReg32 = MRI.createVirtualRegister(RC32); - unsigned Tmp = MRI.createVirtualRegister(RC32); - - unsigned Undef = MRI.createVirtualRegister(RC32); - BuildMI(mainMBB, DL, TII->get(TargetOpcode::IMPLICIT_DEF), Undef); - - BuildMI(mainMBB, DL, TII->get(TargetOpcode::INSERT_SUBREG), SrcReg32) - .addReg(Undef) - .addReg(SrcReg) - .addImm(X86::sub_8bit); - BuildMI(mainMBB, DL, TII->get(TargetOpcode::INSERT_SUBREG), AccReg32) - .addReg(Undef) - .addReg(t4) - .addImm(X86::sub_8bit); - - BuildMI(mainMBB, DL, TII->get(CMOVOpc), Tmp) - .addReg(SrcReg32) - .addReg(AccReg32); - - BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t2) - .addReg(Tmp, 0, X86::sub_8bit); - } - } else { - // Use pseudo select and lower them. - assert((VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32) && - "Invalid atomic-load-op transformation!"); - unsigned SelOpc = getPseudoCMOVOpc(VT); - X86::CondCode CC = X86::getCondFromCMovOpc(CMOVOpc); - assert(CC != X86::COND_INVALID && "Invalid atomic-load-op transformation!"); - MIB = BuildMI(mainMBB, DL, TII->get(SelOpc), t2) - .addReg(SrcReg).addReg(t4) - .addImm(CC); - mainMBB = EmitLoweredSelect(MIB, mainMBB); - // Replace the original PHI node as mainMBB is changed after CMOV - // lowering. - BuildMI(*origMainMBB, Phi, DL, TII->get(X86::PHI), t4) - .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(mainMBB); - Phi->eraseFromParent(); - } - break; - } - } - - // Copy PhyReg back from virtual register. 
- BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), PhyReg) - .addReg(t4); - - MIB = BuildMI(mainMBB, DL, TII->get(LCMPXCHGOpc)); - for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { - MachineOperand NewMO = MI->getOperand(MemOpndSlot + i); - if (NewMO.isReg()) - NewMO.setIsKill(false); - MIB.addOperand(NewMO); - } - MIB.addReg(t2); - MIB.setMemRefs(MMOBegin, MMOEnd); - - // Copy PhyReg back to virtual register. - BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t3) - .addReg(PhyReg); - - BuildMI(mainMBB, DL, TII->get(X86::JNE_4)).addMBB(origMainMBB); - - mainMBB->addSuccessor(origMainMBB); - mainMBB->addSuccessor(sinkMBB); - - // sinkMBB: - BuildMI(*sinkMBB, sinkMBB->begin(), DL, - TII->get(TargetOpcode::COPY), DstReg) - .addReg(t3); + EVT VT) const { + if (!VT.isSimple()) + return false; - MI->eraseFromParent(); - return sinkMBB; + MVT SVT = VT.getSimpleVT(); + unsigned NumElts = SVT.getVectorNumElements(); + // FIXME: This collection of masks seems suspect. + if (NumElts == 2) + return true; + if (NumElts == 4 && SVT.is128BitVector()) { + return (isMOVLMask(Mask, SVT) || + isCommutedMOVLMask(Mask, SVT, true) || + isSHUFPMask(Mask, SVT) || + isSHUFPMask(Mask, SVT, /* Commuted */ true)); + } + return false; } -// EmitAtomicLoadArith6432 - emit the code sequence for pseudo atomic -// instructions. They will be translated into a spin-loop or compare-exchange -// loop from -// -// ... -// dst = atomic-fetch-op MI.addr, MI.val -// ... -// -// to -// -// ... -// t1L = LOAD [MI.addr + 0] -// t1H = LOAD [MI.addr + 4] -// loop: -// t4L = phi(t1L, t3L / loop) -// t4H = phi(t1H, t3H / loop) -// t2L = OP MI.val.lo, t4L -// t2H = OP MI.val.hi, t4H -// EAX = t4L -// EDX = t4H -// EBX = t2L -// ECX = t2H -// LCMPXCHG8B [MI.addr], [ECX:EBX & EDX:EAX are implicitly used and EDX:EAX is implicitly defined] -// t3L = EAX -// t3H = EDX -// JNE loop -// sink: -// dstL = t3L -// dstH = t3H -// ... -MachineBasicBlock * -X86TargetLowering::EmitAtomicLoadArith6432(MachineInstr *MI, - MachineBasicBlock *MBB) const { - MachineFunction *MF = MBB->getParent(); - const TargetInstrInfo *TII = MF->getTarget().getInstrInfo(); - DebugLoc DL = MI->getDebugLoc(); +//===----------------------------------------------------------------------===// +// X86 Scheduler Hooks +//===----------------------------------------------------------------------===// - MachineRegisterInfo &MRI = MF->getRegInfo(); +/// Utility function to emit xbegin specifying the start of an RTM region. 
+static MachineBasicBlock *EmitXBegin(MachineInstr *MI, MachineBasicBlock *MBB, + const TargetInstrInfo *TII) { + DebugLoc DL = MI->getDebugLoc(); const BasicBlock *BB = MBB->getBasicBlock(); MachineFunction::iterator I = MBB; ++I; - assert(MI->getNumOperands() <= X86::AddrNumOperands + 7 && - "Unexpected number of operands"); - - assert(MI->hasOneMemOperand() && - "Expected atomic-load-op32 to have one memoperand"); - - // Memory Reference - MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); - MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); - - unsigned DstLoReg, DstHiReg; - unsigned SrcLoReg, SrcHiReg; - unsigned MemOpndSlot; - - unsigned CurOp = 0; - - DstLoReg = MI->getOperand(CurOp++).getReg(); - DstHiReg = MI->getOperand(CurOp++).getReg(); - MemOpndSlot = CurOp; - CurOp += X86::AddrNumOperands; - SrcLoReg = MI->getOperand(CurOp++).getReg(); - SrcHiReg = MI->getOperand(CurOp++).getReg(); - - const TargetRegisterClass *RC = &X86::GR32RegClass; - const TargetRegisterClass *RC8 = &X86::GR8RegClass; - - unsigned t1L = MRI.createVirtualRegister(RC); - unsigned t1H = MRI.createVirtualRegister(RC); - unsigned t2L = MRI.createVirtualRegister(RC); - unsigned t2H = MRI.createVirtualRegister(RC); - unsigned t3L = MRI.createVirtualRegister(RC); - unsigned t3H = MRI.createVirtualRegister(RC); - unsigned t4L = MRI.createVirtualRegister(RC); - unsigned t4H = MRI.createVirtualRegister(RC); - - unsigned LCMPXCHGOpc = X86::LCMPXCHG8B; - unsigned LOADOpc = X86::MOV32rm; - - // For the atomic load-arith operator, we generate + // For the v = xbegin(), we generate // - // thisMBB: - // t1L = LOAD [MI.addr + 0] - // t1H = LOAD [MI.addr + 4] - // mainMBB: - // t4L = phi(t1L / thisMBB, t3L / mainMBB) - // t4H = phi(t1H / thisMBB, t3H / mainMBB) - // t2L = OP MI.val.lo, t4L - // t2H = OP MI.val.hi, t4H - // EBX = t2L - // ECX = t2H - // LCMPXCHG8B [MI.addr], [ECX:EBX & EDX:EAX are implicitly used and EDX:EAX is implicitly defined] - // t3L = EAX - // t3H = EDX - // JNE loop - // sinkMBB: - // dstL = t3L - // dstH = t3H + // thisMBB: + // xbegin sinkMBB + // + // mainMBB: + // eax = -1 + // + // sinkMBB: + // v = eax MachineBasicBlock *thisMBB = MBB; + MachineFunction *MF = MBB->getParent(); MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB); MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB); MF->insert(I, mainMBB); MF->insert(I, sinkMBB); - MachineInstrBuilder MIB; - // Transfer the remainder of BB and its successor edges to sinkMBB. 
sinkMBB->splice(sinkMBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)), MBB->end()); sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); // thisMBB: - // Lo - MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), t1L); - for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { - MachineOperand NewMO = MI->getOperand(MemOpndSlot + i); - if (NewMO.isReg()) - NewMO.setIsKill(false); - MIB.addOperand(NewMO); - } - for (MachineInstr::mmo_iterator MMOI = MMOBegin; MMOI != MMOEnd; ++MMOI) { - unsigned flags = (*MMOI)->getFlags(); - flags = (flags & ~MachineMemOperand::MOStore) | MachineMemOperand::MOLoad; - MachineMemOperand *MMO = - MF->getMachineMemOperand((*MMOI)->getPointerInfo(), flags, - (*MMOI)->getSize(), - (*MMOI)->getBaseAlignment(), - (*MMOI)->getTBAAInfo(), - (*MMOI)->getRanges()); - MIB.addMemOperand(MMO); - }; - MachineInstr *LowMI = MIB; - - // Hi - MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), t1H); - for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { - if (i == X86::AddrDisp) { - MIB.addDisp(MI->getOperand(MemOpndSlot + i), 4); // 4 == sizeof(i32) - } else { - MachineOperand NewMO = MI->getOperand(MemOpndSlot + i); - if (NewMO.isReg()) - NewMO.setIsKill(false); - MIB.addOperand(NewMO); - } - } - MIB.setMemRefs(LowMI->memoperands_begin(), LowMI->memoperands_end()); - + // xbegin sinkMBB + // # fallthrough to mainMBB + // # abortion to sinkMBB + BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(sinkMBB); thisMBB->addSuccessor(mainMBB); + thisMBB->addSuccessor(sinkMBB); // mainMBB: - MachineBasicBlock *origMainMBB = mainMBB; - - // Add PHIs. - MachineInstr *PhiL = BuildMI(mainMBB, DL, TII->get(X86::PHI), t4L) - .addReg(t1L).addMBB(thisMBB).addReg(t3L).addMBB(mainMBB); - MachineInstr *PhiH = BuildMI(mainMBB, DL, TII->get(X86::PHI), t4H) - .addReg(t1H).addMBB(thisMBB).addReg(t3H).addMBB(mainMBB); - - unsigned Opc = MI->getOpcode(); - switch (Opc) { - default: - llvm_unreachable("Unhandled atomic-load-op6432 opcode!"); - case X86::ATOMAND6432: - case X86::ATOMOR6432: - case X86::ATOMXOR6432: - case X86::ATOMADD6432: - case X86::ATOMSUB6432: { - unsigned HiOpc; - unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc); - BuildMI(mainMBB, DL, TII->get(LoOpc), t2L).addReg(t4L) - .addReg(SrcLoReg); - BuildMI(mainMBB, DL, TII->get(HiOpc), t2H).addReg(t4H) - .addReg(SrcHiReg); - break; - } - case X86::ATOMNAND6432: { - unsigned HiOpc, NOTOpc; - unsigned LoOpc = getNonAtomic6432OpcodeWithExtraOpc(Opc, HiOpc, NOTOpc); - unsigned TmpL = MRI.createVirtualRegister(RC); - unsigned TmpH = MRI.createVirtualRegister(RC); - BuildMI(mainMBB, DL, TII->get(LoOpc), TmpL).addReg(SrcLoReg) - .addReg(t4L); - BuildMI(mainMBB, DL, TII->get(HiOpc), TmpH).addReg(SrcHiReg) - .addReg(t4H); - BuildMI(mainMBB, DL, TII->get(NOTOpc), t2L).addReg(TmpL); - BuildMI(mainMBB, DL, TII->get(NOTOpc), t2H).addReg(TmpH); - break; - } - case X86::ATOMMAX6432: - case X86::ATOMMIN6432: - case X86::ATOMUMAX6432: - case X86::ATOMUMIN6432: { - unsigned HiOpc; - unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc); - unsigned cL = MRI.createVirtualRegister(RC8); - unsigned cH = MRI.createVirtualRegister(RC8); - unsigned cL32 = MRI.createVirtualRegister(RC); - unsigned cH32 = MRI.createVirtualRegister(RC); - unsigned cc = MRI.createVirtualRegister(RC); - // cl := cmp src_lo, lo - BuildMI(mainMBB, DL, TII->get(X86::CMP32rr)) - .addReg(SrcLoReg).addReg(t4L); - BuildMI(mainMBB, DL, TII->get(LoOpc), cL); - BuildMI(mainMBB, DL, TII->get(X86::MOVZX32rr8), cL32).addReg(cL); - // ch := cmp src_hi, hi - BuildMI(mainMBB, DL, TII->get(X86::CMP32rr)) 
- .addReg(SrcHiReg).addReg(t4H); - BuildMI(mainMBB, DL, TII->get(HiOpc), cH); - BuildMI(mainMBB, DL, TII->get(X86::MOVZX32rr8), cH32).addReg(cH); - // cc := if (src_hi == hi) ? cl : ch; - if (Subtarget->hasCMov()) { - BuildMI(mainMBB, DL, TII->get(X86::CMOVE32rr), cc) - .addReg(cH32).addReg(cL32); - } else { - MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), cc) - .addReg(cH32).addReg(cL32) - .addImm(X86::COND_E); - mainMBB = EmitLoweredSelect(MIB, mainMBB); - } - BuildMI(mainMBB, DL, TII->get(X86::TEST32rr)).addReg(cc).addReg(cc); - if (Subtarget->hasCMov()) { - BuildMI(mainMBB, DL, TII->get(X86::CMOVNE32rr), t2L) - .addReg(SrcLoReg).addReg(t4L); - BuildMI(mainMBB, DL, TII->get(X86::CMOVNE32rr), t2H) - .addReg(SrcHiReg).addReg(t4H); - } else { - MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), t2L) - .addReg(SrcLoReg).addReg(t4L) - .addImm(X86::COND_NE); - mainMBB = EmitLoweredSelect(MIB, mainMBB); - // As the lowered CMOV won't clobber EFLAGS, we could reuse it for the - // 2nd CMOV lowering. - mainMBB->addLiveIn(X86::EFLAGS); - MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), t2H) - .addReg(SrcHiReg).addReg(t4H) - .addImm(X86::COND_NE); - mainMBB = EmitLoweredSelect(MIB, mainMBB); - // Replace the original PHI node as mainMBB is changed after CMOV - // lowering. - BuildMI(*origMainMBB, PhiL, DL, TII->get(X86::PHI), t4L) - .addReg(t1L).addMBB(thisMBB).addReg(t3L).addMBB(mainMBB); - BuildMI(*origMainMBB, PhiH, DL, TII->get(X86::PHI), t4H) - .addReg(t1H).addMBB(thisMBB).addReg(t3H).addMBB(mainMBB); - PhiL->eraseFromParent(); - PhiH->eraseFromParent(); - } - break; - } - case X86::ATOMSWAP6432: { - unsigned HiOpc; - unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc); - BuildMI(mainMBB, DL, TII->get(LoOpc), t2L).addReg(SrcLoReg); - BuildMI(mainMBB, DL, TII->get(HiOpc), t2H).addReg(SrcHiReg); - break; - } - } - - // Copy EDX:EAX back from HiReg:LoReg - BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EAX).addReg(t4L); - BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EDX).addReg(t4H); - // Copy ECX:EBX from t1H:t1L - BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EBX).addReg(t2L); - BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::ECX).addReg(t2H); - - MIB = BuildMI(mainMBB, DL, TII->get(LCMPXCHGOpc)); - for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { - MachineOperand NewMO = MI->getOperand(MemOpndSlot + i); - if (NewMO.isReg()) - NewMO.setIsKill(false); - MIB.addOperand(NewMO); - } - MIB.setMemRefs(MMOBegin, MMOEnd); - - // Copy EDX:EAX back to t3H:t3L - BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t3L).addReg(X86::EAX); - BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t3H).addReg(X86::EDX); - - BuildMI(mainMBB, DL, TII->get(X86::JNE_4)).addMBB(origMainMBB); - - mainMBB->addSuccessor(origMainMBB); + // EAX = -1 + BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), X86::EAX).addImm(-1); mainMBB->addSuccessor(sinkMBB); // sinkMBB: + // EAX is live into the sinkMBB + sinkMBB->addLiveIn(X86::EAX); BuildMI(*sinkMBB, sinkMBB->begin(), DL, - TII->get(TargetOpcode::COPY), DstLoReg) - .addReg(t3L); - BuildMI(*sinkMBB, sinkMBB->begin(), DL, - TII->get(TargetOpcode::COPY), DstHiReg) - .addReg(t3H); + TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg()) + .addReg(X86::EAX); MI->eraseFromParent(); return sinkMBB; @@ -17447,62 +18176,6 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case X86::XBEGIN: return EmitXBegin(MI, BB, BB->getParent()->getTarget().getInstrInfo()); - // Atomic Lowering. 
-  case X86::ATOMAND8:
-  case X86::ATOMAND16:
-  case X86::ATOMAND32:
-  case X86::ATOMAND64:
-    // Fall through
-  case X86::ATOMOR8:
-  case X86::ATOMOR16:
-  case X86::ATOMOR32:
-  case X86::ATOMOR64:
-    // Fall through
-  case X86::ATOMXOR16:
-  case X86::ATOMXOR8:
-  case X86::ATOMXOR32:
-  case X86::ATOMXOR64:
-    // Fall through
-  case X86::ATOMNAND8:
-  case X86::ATOMNAND16:
-  case X86::ATOMNAND32:
-  case X86::ATOMNAND64:
-    // Fall through
-  case X86::ATOMMAX8:
-  case X86::ATOMMAX16:
-  case X86::ATOMMAX32:
-  case X86::ATOMMAX64:
-    // Fall through
-  case X86::ATOMMIN8:
-  case X86::ATOMMIN16:
-  case X86::ATOMMIN32:
-  case X86::ATOMMIN64:
-    // Fall through
-  case X86::ATOMUMAX8:
-  case X86::ATOMUMAX16:
-  case X86::ATOMUMAX32:
-  case X86::ATOMUMAX64:
-    // Fall through
-  case X86::ATOMUMIN8:
-  case X86::ATOMUMIN16:
-  case X86::ATOMUMIN32:
-  case X86::ATOMUMIN64:
-    return EmitAtomicLoadArith(MI, BB);
-
-  // This group does 64-bit operations on a 32-bit host.
-  case X86::ATOMAND6432:
-  case X86::ATOMOR6432:
-  case X86::ATOMXOR6432:
-  case X86::ATOMNAND6432:
-  case X86::ATOMADD6432:
-  case X86::ATOMSUB6432:
-  case X86::ATOMMAX6432:
-  case X86::ATOMMIN6432:
-  case X86::ATOMUMAX6432:
-  case X86::ATOMUMIN6432:
-  case X86::ATOMSWAP6432:
-    return EmitAtomicLoadArith6432(MI, BB);
-
   case X86::VASTART_SAVE_XMM_REGS:
     return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
@@ -17774,6 +18447,333 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+/// \brief Get the PSHUF-style mask from a PSHUF node.
+///
+/// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
+/// PSHUF-style masks that can be reused with such instructions.
+static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
+  SmallVector<int, 4> Mask;
+  bool IsUnary;
+  bool HaveMask =
+      getTargetShuffleMask(N.getNode(), N.getSimpleValueType(), Mask, IsUnary);
+  (void)HaveMask;
+  assert(HaveMask);
+
+  switch (N.getOpcode()) {
+  case X86ISD::PSHUFD:
+    return Mask;
+  case X86ISD::PSHUFLW:
+    Mask.resize(4);
+    return Mask;
+  case X86ISD::PSHUFHW:
+    Mask.erase(Mask.begin(), Mask.begin() + 4);
+    for (int &M : Mask)
+      M -= 4;
+    return Mask;
+  default:
+    llvm_unreachable("No valid shuffle instruction found!");
+  }
+}
+
+/// \brief Search for a combinable shuffle across a chain ending in pshufd.
+///
+/// We walk up the chain and look for a combinable shuffle, skipping over
+/// shuffles that we could hoist this shuffle's transformation past without
+/// altering anything.
+static bool combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
+                                         SelectionDAG &DAG,
+                                         TargetLowering::DAGCombinerInfo &DCI) {
+  assert(N.getOpcode() == X86ISD::PSHUFD &&
+         "Called with something other than an x86 128-bit dword shuffle!");
+  SDLoc DL(N);
+
+  // Walk up a single-use chain looking for a combinable shuffle.
+  SDValue V = N.getOperand(0);
+  for (; V.hasOneUse(); V = V.getOperand(0)) {
+    switch (V.getOpcode()) {
+    default:
+      return false; // Nothing combined!
+
+    case ISD::BITCAST:
+      // Skip bitcasts as we always know the type for the target specific
+      // instructions.
+      continue;
+
+    case X86ISD::PSHUFD:
+      // Found another dword shuffle.
+      break;
+
+    case X86ISD::PSHUFLW:
+      // Check that the low words (being shuffled) are the identity in the
+      // dword shuffle, and the high words are self-contained.
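+      // For example (editor's sketch): a pshufd mask of <0,1,3,2> is the
+      // identity on the low dwords and keeps the high dwords within the high
+      // half, so it can be hoisted past this PSHUFLW; <2,3,0,1> cannot.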
+ if (Mask[0] != 0 || Mask[1] != 1 || + !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4)) + return false; + + continue; + + case X86ISD::PSHUFHW: + // Check that the high words (being shuffled) are the identity in the + // dword shuffle, and the low words are self-contained. + if (Mask[2] != 2 || Mask[3] != 3 || + !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2)) + return false; + + continue; + + case X86ISD::UNPCKL: + case X86ISD::UNPCKH: + // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword + // shuffle into a preceding word shuffle. + if (V.getValueType() != MVT::v16i8 && V.getValueType() != MVT::v8i16) + return false; + + // Search for a half-shuffle which we can combine with. + unsigned CombineOp = + V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW; + if (V.getOperand(0) != V.getOperand(1) || + !V->isOnlyUserOf(V.getOperand(0).getNode())) + return false; + V = V.getOperand(0); + do { + switch (V.getOpcode()) { + default: + return false; // Nothing to combine. + + case X86ISD::PSHUFLW: + case X86ISD::PSHUFHW: + if (V.getOpcode() == CombineOp) + break; + + // Fallthrough! + case ISD::BITCAST: + V = V.getOperand(0); + continue; + } + break; + } while (V.hasOneUse()); + break; + } + // Break out of the loop if we break out of the switch. + break; + } + + if (!V.hasOneUse()) + // We fell out of the loop without finding a viable combining instruction. + return false; + + // Record the old value to use in RAUW-ing. + SDValue Old = V; + + // Merge this node's mask and our incoming mask. + SmallVector VMask = getPSHUFShuffleMask(V); + for (int &M : Mask) + M = VMask[M]; + V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0), + getV4X86ShuffleImm8ForMask(Mask, DAG)); + + // It is possible that one of the combinable shuffles was completely absorbed + // by the other, just replace it and revisit all users in that case. + if (Old.getNode() == V.getNode()) { + DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo=*/true); + return true; + } + + // Replace N with its operand as we're going to combine that shuffle away. + DAG.ReplaceAllUsesWith(N, N.getOperand(0)); + + // Replace the combinable shuffle with the combined one, updating all users + // so that we re-evaluate the chain here. + DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true); + return true; +} + +/// \brief Search for a combinable shuffle across a chain ending in pshuflw or pshufhw. +/// +/// We walk up the chain, skipping shuffles of the other half and looking +/// through shuffles which switch halves trying to find a shuffle of the same +/// pair of dwords. +static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef Mask, + SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI) { + assert( + (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) && + "Called with something other than an x86 128-bit half shuffle!"); + SDLoc DL(N); + unsigned CombineOpcode = N.getOpcode(); + + // Walk up a single-use chain looking for a combinable shuffle. + SDValue V = N.getOperand(0); + for (; V.hasOneUse(); V = V.getOperand(0)) { + switch (V.getOpcode()) { + default: + return false; // Nothing combined! + + case ISD::BITCAST: + // Skip bitcasts as we always know the type for the target specific + // instructions. + continue; + + case X86ISD::PSHUFLW: + case X86ISD::PSHUFHW: + if (V.getOpcode() == CombineOpcode) + break; + + // Other-half shuffles are no-ops. 
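+      // E.g. (editor's sketch): while combining a PSHUFLW chain, an
+      // intervening PSHUFHW only permutes words 4-7 and never touches the low
+      // words being tracked, so the walk can simply skip it.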
+ continue; + + case X86ISD::PSHUFD: { + // We can only handle pshufd if the half we are combining either stays in + // its half, or switches to the other half. Bail if one of these isn't + // true. + SmallVector VMask = getPSHUFShuffleMask(V); + int DOffset = CombineOpcode == X86ISD::PSHUFLW ? 0 : 2; + if (!((VMask[DOffset + 0] < 2 && VMask[DOffset + 1] < 2) || + (VMask[DOffset + 0] >= 2 && VMask[DOffset + 1] >= 2))) + return false; + + // Map the mask through the pshufd and keep walking up the chain. + for (int i = 0; i < 4; ++i) + Mask[i] = 2 * (VMask[DOffset + Mask[i] / 2] % 2) + Mask[i] % 2; + + // Switch halves if the pshufd does. + CombineOpcode = + VMask[DOffset + Mask[0] / 2] < 2 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW; + continue; + } + } + // Break out of the loop if we break out of the switch. + break; + } + + if (!V.hasOneUse()) + // We fell out of the loop without finding a viable combining instruction. + return false; + + // Record the old value to use in RAUW-ing. + SDValue Old = V; + + // Merge this node's mask and our incoming mask (adjusted to account for all + // the pshufd instructions encountered). + SmallVector VMask = getPSHUFShuffleMask(V); + for (int &M : Mask) + M = VMask[M]; + V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0), + getV4X86ShuffleImm8ForMask(Mask, DAG)); + + // Replace N with its operand as we're going to combine that shuffle away. + DAG.ReplaceAllUsesWith(N, N.getOperand(0)); + + // Replace the combinable shuffle with the combined one, updating all users + // so that we re-evaluate the chain here. + DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true); + return true; +} + +/// \brief Try to combine x86 target specific shuffles. +static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + SDLoc DL(N); + MVT VT = N.getSimpleValueType(); + SmallVector Mask; + + switch (N.getOpcode()) { + case X86ISD::PSHUFD: + case X86ISD::PSHUFLW: + case X86ISD::PSHUFHW: + Mask = getPSHUFShuffleMask(N); + assert(Mask.size() == 4); + break; + default: + return SDValue(); + } + + // Nuke no-op shuffles that show up after combining. + if (isNoopShuffleMask(Mask)) + return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true); + + // Look for simplifications involving one or two shuffle instructions. + SDValue V = N.getOperand(0); + switch (N.getOpcode()) { + default: + break; + case X86ISD::PSHUFLW: + case X86ISD::PSHUFHW: + assert(VT == MVT::v8i16); + (void)VT; + + if (combineRedundantHalfShuffle(N, Mask, DAG, DCI)) + return SDValue(); // We combined away this shuffle, so we're done. + + // See if this reduces to a PSHUFD which is no more expensive and can + // combine with more operations. + if (Mask[0] % 2 == 0 && Mask[2] % 2 == 0 && + areAdjacentMasksSequential(Mask)) { + int DMask[] = {-1, -1, -1, -1}; + int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2; + DMask[DOffset + 0] = DOffset + Mask[0] / 2; + DMask[DOffset + 1] = DOffset + Mask[2] / 2; + V = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V); + DCI.AddToWorklist(V.getNode()); + V = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V, + getV4X86ShuffleImm8ForMask(DMask, DAG)); + DCI.AddToWorklist(V.getNode()); + return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V); + } + + // Look for shuffle patterns which can be implemented as a single unpack. + // FIXME: This doesn't handle the location of the PSHUFD generically, and + // only works when we have a PSHUFD followed by two half-shuffles. 
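+    // Concretely (editor's sketch): a PSHUFD with dword mask <0,0,1,1>
+    // followed by PSHUFLW <0,0,1,1> and PSHUFHW <0,0,1,1> yields the word
+    // pattern <0,0,1,1,2,2,3,3>, which is exactly UNPCKL of the input with
+    // itself; the check below detects such patterns.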
+
+/// \brief Try to combine x86 target specific shuffles.
+static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
+                                           TargetLowering::DAGCombinerInfo &DCI,
+                                           const X86Subtarget *Subtarget) {
+  SDLoc DL(N);
+  MVT VT = N.getSimpleValueType();
+  SmallVector<int, 4> Mask;
+
+  switch (N.getOpcode()) {
+  case X86ISD::PSHUFD:
+  case X86ISD::PSHUFLW:
+  case X86ISD::PSHUFHW:
+    Mask = getPSHUFShuffleMask(N);
+    assert(Mask.size() == 4);
+    break;
+  default:
+    return SDValue();
+  }
+
+  // Nuke no-op shuffles that show up after combining.
+  if (isNoopShuffleMask(Mask))
+    return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
+
+  // Look for simplifications involving one or two shuffle instructions.
+  SDValue V = N.getOperand(0);
+  switch (N.getOpcode()) {
+  default:
+    break;
+  case X86ISD::PSHUFLW:
+  case X86ISD::PSHUFHW:
+    assert(VT == MVT::v8i16);
+    (void)VT;
+
+    if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
+      return SDValue(); // We combined away this shuffle, so we're done.
+
+    // See if this reduces to a PSHUFD which is no more expensive and can
+    // combine with more operations.
+    if (Mask[0] % 2 == 0 && Mask[2] % 2 == 0 &&
+        areAdjacentMasksSequential(Mask)) {
+      int DMask[] = {-1, -1, -1, -1};
+      int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
+      DMask[DOffset + 0] = DOffset + Mask[0] / 2;
+      DMask[DOffset + 1] = DOffset + Mask[2] / 2;
+      V = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V);
+      DCI.AddToWorklist(V.getNode());
+      V = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V,
+                      getV4X86ShuffleImm8ForMask(DMask, DAG));
+      DCI.AddToWorklist(V.getNode());
+      return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V);
+    }
+
+    // Look for shuffle patterns which can be implemented as a single unpack.
+    // FIXME: This doesn't handle the location of the PSHUFD generically, and
+    // only works when we have a PSHUFD followed by two half-shuffles.
+    if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
+        (V.getOpcode() == X86ISD::PSHUFLW ||
+         V.getOpcode() == X86ISD::PSHUFHW) &&
+        V.getOpcode() != N.getOpcode() &&
+        V.hasOneUse()) {
+      SDValue D = V.getOperand(0);
+      while (D.getOpcode() == ISD::BITCAST && D.hasOneUse())
+        D = D.getOperand(0);
+      if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
+        SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
+        SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
+        int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
+        int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
+        int WordMask[8];
+        for (int i = 0; i < 4; ++i) {
+          WordMask[i + NOffset] = Mask[i] + NOffset;
+          WordMask[i + VOffset] = VMask[i] + VOffset;
+        }
+        // Map the word mask through the DWord mask.
+        int MappedMask[8];
+        for (int i = 0; i < 8; ++i)
+          MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
+        const int UnpackLoMask[] = {0, 0, 1, 1, 2, 2, 3, 3};
+        const int UnpackHiMask[] = {4, 4, 5, 5, 6, 6, 7, 7};
+        if (std::equal(std::begin(MappedMask), std::end(MappedMask),
+                       std::begin(UnpackLoMask)) ||
+            std::equal(std::begin(MappedMask), std::end(MappedMask),
+                       std::begin(UnpackHiMask))) {
+          // We can replace all three shuffles with an unpack.
+          V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, D.getOperand(0));
+          DCI.AddToWorklist(V.getNode());
+          return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
+                                                : X86ISD::UNPCKH,
+                             DL, MVT::v8i16, V, V);
+        }
+      }
+    }
+
+    break;
+
+  case X86ISD::PSHUFD:
+    if (combineRedundantDWordShuffle(N, Mask, DAG, DCI))
+      return SDValue(); // We combined away this shuffle.
+
+    break;
+  }
+
+  return SDValue();
+}
+
 /// PerformShuffleCombine - Performs several different shuffle combines.
 static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
                                      TargetLowering::DAGCombinerInfo &DCI,
@@ -17783,6 +18783,49 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
   SDValue N1 = N->getOperand(1);
   EVT VT = N->getValueType(0);
 
+  // Canonicalize shuffles that perform 'addsub' on packed float vectors
+  // according to the rule:
+  //  (shuffle (FADD A, B), (FSUB A, B), Mask) ->
+  //  (shuffle (FSUB A, -B), (FADD A, -B), Mask)
+  //
+  // Where 'Mask' is:
+  //  <0,5,2,7>             -- for v4f32 and v4f64 shuffles;
+  //  <0,3>                 -- for v2f64 shuffles;
+  //  <0,9,2,11,4,13,6,15>  -- for v8f32 shuffles.
+  //
+  // This helps pattern-matching more SSE3/AVX ADDSUB instructions
+  // during ISel stage.
+  if (N->getOpcode() == ISD::VECTOR_SHUFFLE &&
+      ((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
+       (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
+      N0->getOpcode() == ISD::FADD && N1->getOpcode() == ISD::FSUB &&
+      // Operands to the FADD and FSUB must be the same.
+      ((N0->getOperand(0) == N1->getOperand(0) &&
+        N0->getOperand(1) == N1->getOperand(1)) ||
+       // FADD is commutable. See if by commuting the operands of the FADD
+       // we would still be able to match the operands of the FSUB dag node.
+       (N0->getOperand(1) == N1->getOperand(0) &&
+        N0->getOperand(0) == N1->getOperand(1))) &&
+      N0->getOperand(0)->getOpcode() != ISD::UNDEF &&
+      N0->getOperand(1)->getOpcode() != ISD::UNDEF) {
+
+    ShuffleVectorSDNode *SV = cast<ShuffleVectorSDNode>(N);
+    unsigned NumElts = VT.getVectorNumElements();
+    ArrayRef<int> Mask = SV->getMask();
+    bool CanFold = true;
+
+    for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i)
+      CanFold = Mask[i] == (int)((i & 1) ? i + NumElts : i);
+
+    if (CanFold) {
+      SDValue Op0 = N1->getOperand(0);
+      SDValue Op1 = DAG.getNode(ISD::FNEG, dl, VT, N1->getOperand(1));
+      SDValue Sub = DAG.getNode(ISD::FSUB, dl, VT, Op0, Op1);
+      SDValue Add = DAG.getNode(ISD::FADD, dl, VT, Op0, Op1);
+      return DAG.getVectorShuffle(VT, dl, Sub, Add, Mask);
+    }
+  }
+
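
The equivalence behind this canonicalization is easy to check lane by lane: with Bn = -B, both shuffle forms produce A+B in even lanes and A-B in odd lanes, which is exactly the ADDSUB lane pattern that the (fsub, fadd) blend patterns later in this patch match. A scalar model of the v4f32 case (a sketch, not code from the patch):

#include <cstdio>

// With Bn = -B, shuffle(FADD(A,B), FSUB(A,B), <0,5,2,7>) and
// shuffle(FSUB(A,Bn), FADD(A,Bn), <0,5,2,7>) agree in every lane.
int main() {
  float A[4] = {1, 2, 3, 4}, B[4] = {10, 20, 30, 40};
  float Add[4], Sub[4], Sub2[4], Add2[4];
  for (int i = 0; i < 4; ++i) {
    float Bn = -B[i];
    Add[i] = A[i] + B[i];
    Sub[i] = A[i] - B[i];
    Sub2[i] = A[i] - Bn; // == A + B
    Add2[i] = A[i] + Bn; // == A - B
  }
  const int Mask[4] = {0, 5, 2, 7}; // index < 4 reads op0, >= 4 reads op1
  for (int i = 0; i < 4; ++i) {
    float Before = Mask[i] < 4 ? Add[Mask[i]] : Sub[Mask[i] - 4];
    float After = Mask[i] < 4 ? Sub2[Mask[i]] : Add2[Mask[i] - 4];
    printf("lane %d: %g == %g\n", i, Before, After); // identical lanes
  }
  return 0;
}
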
   // Don't create instructions with illegal types after legalize types has run.
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType()))
@@ -17855,7 +18898,18 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
   for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
     Elts.push_back(getShuffleScalarElt(N, i, DAG, 0));
 
-  return EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true);
+  SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true);
+  if (LD.getNode())
+    return LD;
+
+  if (isTargetShuffle(N->getOpcode())) {
+    SDValue Shuffle =
+        PerformTargetShuffleCombine(SDValue(N, 0), DAG, DCI, Subtarget);
+    if (Shuffle.getNode())
+      return Shuffle;
+  }
+
+  return SDValue();
 }
 
 /// PerformTruncateCombine - Converts truncate operation to
@@ -18509,28 +19563,34 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
           Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
         return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
 
-      // If the RHS is a constant we have to reverse the const canonicalization.
-      // x > C-1 ? x+-C : 0 --> subus x, C
-      if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
-          isSplatVector(CondRHS.getNode()) && isSplatVector(OpRHS.getNode())) {
-        APInt A = cast<ConstantSDNode>(OpRHS.getOperand(0))->getAPIntValue();
-        if (CondRHS.getConstantOperandVal(0) == -A-1)
-          return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS,
-                             DAG.getConstant(-A, VT));
-      }
-
-      // Another special case: If C was a sign bit, the sub has been
-      // canonicalized into a xor.
-      // FIXME: Would it be better to use computeKnownBits to determine whether
-      // it's safe to decanonicalize the xor?
-      // x s< 0 ? x^C : 0 --> subus x, C
-      if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
-          ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
-          isSplatVector(OpRHS.getNode())) {
-        APInt A = cast<ConstantSDNode>(OpRHS.getOperand(0))->getAPIntValue();
-        if (A.isSignBit())
-          return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
-      }
+      if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS))
+        if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) {
+          if (auto *CondRHSBV = dyn_cast<BuildVectorSDNode>(CondRHS))
+            if (auto *CondRHSConst = CondRHSBV->getConstantSplatNode())
+              // If the RHS is a constant we have to reverse the const
+              // canonicalization.
+              // x > C-1 ? x+-C : 0 --> subus x, C
+              if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
+                  CondRHSConst->getAPIntValue() ==
+                      (-OpRHSConst->getAPIntValue() - 1))
+                return DAG.getNode(
+                    X86ISD::SUBUS, DL, VT, OpLHS,
+                    DAG.getConstant(-OpRHSConst->getAPIntValue(), VT));
+
+          // Another special case: If C was a sign bit, the sub has been
+          // canonicalized into a xor.
+          // FIXME: Would it be better to use computeKnownBits to determine
+          // whether it's safe to decanonicalize the xor?
+          // x s< 0 ? x^C : 0 --> subus x, C
+          if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
+              ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
+              OpRHSConst->getAPIntValue().isSignBit())
+            // Note that we have to rebuild the RHS constant here to ensure we
+            // don't rely on particular values of undef lanes.
+            return DAG.getNode(
+                X86ISD::SUBUS, DL, VT, OpLHS,
+                DAG.getConstant(OpRHSConst->getAPIntValue(), VT));
+        }
     }
   }
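
The reversed canonicalization reads oddly until it is expanded: x + (-C) equals x - C in modular arithmetic, and the guard x > C-1 (that is, x >= C) is exactly the no-underflow condition, so the select is unsigned saturating subtraction per lane. A scalar model over 8-bit lanes (C = 0 is excluded; the add would be a no-op and never takes this canonical form):

#include <cstdint>
#include <cstdio>

// Scalar model of "x > C-1 ? x+-C : 0 --> subus x, C".
static uint8_t selectForm(uint8_t X, uint8_t C) {
  uint8_t NegC = static_cast<uint8_t>(-C); // the splat added to x
  return X > static_cast<uint8_t>(C - 1) ? static_cast<uint8_t>(X + NegC) : 0;
}

static uint8_t subusRef(uint8_t X, uint8_t C) {
  return X >= C ? X - C : 0; // what PSUBUSB computes per lane
}

int main() {
  for (int X = 0; X < 256; ++X)
    for (int C = 1; C < 256; ++C)
      if (selectForm(X, C) != subusRef(X, C))
        printf("mismatch at X=%d C=%d\n", X, C);
  printf("all byte lanes agree\n");
  return 0;
}
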
@@ -19097,6 +20157,8 @@ static SDValue PerformINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG,
       if (C->isAllOnesValue())
         return Op1;
     }
+
+    return SDValue();
   }
 
   // Packed SSE2/AVX2 arithmetic shift immediate intrinsics.
@@ -19236,16 +20298,15 @@ static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
   // vector operations in many cases. Also, on sandybridge ADD is faster than
   // shl.
   // (shl V, 1) -> add V,V
-  if (isSplatVector(N1.getNode())) {
-    assert(N0.getValueType().isVector() && "Invalid vector shift type");
-    ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1->getOperand(0));
-    // We shift all of the values by one. In many cases we do not have
-    // hardware support for this operation. This is better expressed as an ADD
-    // of two values.
-    if (N1C && (1 == N1C->getZExtValue())) {
-      return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
+  if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
+    if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
+      assert(N0.getValueType().isVector() && "Invalid vector shift type");
+      // We shift all of the values by one. In many cases we do not have
+      // hardware support for this operation. This is better expressed as an ADD
+      // of two values.
+      if (N1SplatC->getZExtValue() == 1)
+        return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
     }
-  }
 
   return SDValue();
 }
@@ -19264,10 +20325,9 @@ static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
   SDValue Amt = N->getOperand(1);
   SDLoc DL(N);
 
-  if (isSplatVector(Amt.getNode())) {
-    SDValue SclrAmt = Amt->getOperand(0);
-    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt)) {
-      APInt ShiftAmt = C->getAPIntValue();
+  if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt))
+    if (auto *AmtSplat = AmtBV->getConstantSplatNode()) {
+      APInt ShiftAmt = AmtSplat->getAPIntValue();
       unsigned MaxAmount = VT.getVectorElementType().getSizeInBits();
 
       // SSE2/AVX2 logical shifts always return a vector of 0s
@@ -19277,7 +20337,6 @@ static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
       if (ShiftAmt.trunc(8).uge(MaxAmount))
        return getZeroVector(VT, Subtarget, DAG, DL);
     }
-  }
 
   return SDValue();
 }
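
The "(shl V, 1) -> add V,V" rewrite above rests on an identity that holds with wraparound, which is why it is safe for every lane of a vector shift by a splat of one (and profitable where vector shifts are slow or missing):

#include <cstdint>
#include <cstdio>

int main() {
  // Shifting left by one and adding a value to itself agree on every input,
  // including the cases that wrap, so the per-lane rewrite is always safe.
  for (uint32_t V : {0u, 1u, 0x7fffffffu, 0x80000000u, 0xdeadbeefu})
    printf("0x%08x: shl=0x%08x add=0x%08x\n", V, V << 1, V + V);
  return 0;
}
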
@@ -19471,9 +20530,10 @@ static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
 
   // The right side has to be a 'trunc' or a constant vector.
   bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE;
-  bool RHSConst = (isSplatVector(N1.getNode()) &&
-                   isa<ConstantSDNode>(N1->getOperand(0)));
-  if (!RHSTrunc && !RHSConst)
+  ConstantSDNode *RHSConstSplat = nullptr;
+  if (auto *RHSBV = dyn_cast<BuildVectorSDNode>(N1))
+    RHSConstSplat = RHSBV->getConstantSplatNode();
+  if (!RHSTrunc && !RHSConstSplat)
     return SDValue();
 
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@@ -19483,9 +20543,9 @@ static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
 
   // Set N0 and N1 to hold the inputs to the new wide operation.
   N0 = N0->getOperand(0);
-  if (RHSConst) {
+  if (RHSConstSplat) {
     N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getScalarType(),
-                     N1->getOperand(0));
+                     SDValue(RHSConstSplat, 0));
     SmallVector<SDValue, 8> C(WideVT.getVectorNumElements(), N1);
     N1 = DAG.getNode(ISD::BUILD_VECTOR, DL, WideVT, C);
   } else if (RHSTrunc) {
@@ -19631,12 +20691,9 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
     unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits();
     unsigned SraAmt = ~0;
     if (Mask.getOpcode() == ISD::SRA) {
-      SDValue Amt = Mask.getOperand(1);
-      if (isSplatVector(Amt.getNode())) {
-        SDValue SclrAmt = Amt->getOperand(0);
-        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt))
-          SraAmt = C->getZExtValue();
-      }
+      if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Mask.getOperand(1)))
+        if (auto *AmtConst = AmtBV->getConstantSplatNode())
+          SraAmt = AmtConst->getZExtValue();
     } else if (Mask.getOpcode() == X86ISD::VSRAI) {
       SDValue SraC = Mask.getOperand(1);
       SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue();
@@ -20773,8 +21830,59 @@ static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
+                                                         SelectionDAG &DAG) {
+  // Take advantage of vector comparisons producing 0 or -1 in each lane to
+  // optimize away operation when it's from a constant.
+  //
+  // The general transformation is:
+  //    UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
+  //    AND(VECTOR_CMP(x,y), constant2)
+  //    constant2 = UNARYOP(constant)
+
+  // Early exit if this isn't a vector operation or if the operand of the
+  // unary operation isn't a bitwise AND.
+  EVT VT = N->getValueType(0);
+  if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
+      N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC)
+    return SDValue();
+
+  // Now check that the other operand of the AND is a constant splat. We could
+  // make the transformation for non-constant splats as well, but it's unclear
+  // that would be a benefit as it would not eliminate any operations, just
+  // perform one more step in scalar code before moving to the vector unit.
+  if (BuildVectorSDNode *BV =
+          dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
+    // Bail out if the vector isn't a constant splat.
+    if (!BV->getConstantSplatNode())
+      return SDValue();
+
+    // Everything checks out. Build up the new and improved node.
+    SDLoc DL(N);
+    EVT IntVT = BV->getValueType(0);
+    // Create a new constant of the appropriate type for the transformed
+    // DAG.
+    SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
+    // The AND node needs bitcasts to/from an integer vector type around it.
+    SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
+    SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
+                                 N->getOperand(0)->getOperand(0), MaskConst);
+    SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
+    return Res;
+  }
+
+  return SDValue();
+}
+
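
The new combine leans on setcc lanes being all-zeros or all-ones: AND with a splat C yields only 0 or C per lane, so applying the unary op to the constant up front and AND-ing with the bitcast mask gives the same lanes. A scalar model for the sint_to_fp case (a sketch only, under the assumption of 32-bit lanes):

#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  const int32_t C = 42;                          // the splat constant
  const float FC = static_cast<float>(C);        // UNARYOP(constant)
  uint32_t FCBits;
  std::memcpy(&FCBits, &FC, sizeof(FCBits));     // bitcast to integer lanes
  for (int32_t Lane : {0, -1}) {                 // the only setcc lane values
    float Before = static_cast<float>(Lane & C); // sint_to_fp(and(cmp, C))
    uint32_t AfterBits = static_cast<uint32_t>(Lane) & FCBits;
    float After;
    std::memcpy(&After, &AfterBits, sizeof(After));
    printf("lane=%d: %g == %g\n", Lane, Before, After); // identical
  }
  return 0;
}
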
 static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
                                         const X86TargetLowering *XTLI) {
+  // First try to optimize away the conversion entirely when it's
+  // conditionally from a constant. Vectors only.
+  SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG);
+  if (Res != SDValue())
+    return Res;
+
+  // Now move on to more general possibilities.
   SDValue Op0 = N->getOperand(0);
   EVT InVT = Op0->getValueType(0);
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index af2e4344e5b6..c8cdce7c7664 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -86,6 +86,9 @@ namespace llvm {
       /// X86 Read Time-Stamp Counter and Processor ID.
       RDTSCP_DAG,
 
+      /// X86 Read Performance Monitoring Counters.
+      RDPMC_DAG,
+
       /// X86 compare and logical compare instructions.
       CMP, COMI, UCOMI,
 
@@ -315,6 +318,8 @@ namespace llvm {
       KORTEST,
 
       // Several flavors of instructions with vector shuffle behaviors.
+      PACKSS,
+      PACKUS,
       PALIGNR,
       PSHUFD,
       PSHUFHW,
@@ -400,23 +405,8 @@ namespace llvm {
       // XTEST - Test if in transactional execution.
       XTEST,
 
-      // ATOMADD64_DAG, ATOMSUB64_DAG, ATOMOR64_DAG, ATOMAND64_DAG,
-      // ATOMXOR64_DAG, ATOMNAND64_DAG, ATOMSWAP64_DAG
-
-      // Atomic 64-bit binary operations.
-      ATOMADD64_DAG = ISD::FIRST_TARGET_MEMORY_OPCODE,
-      ATOMSUB64_DAG,
-      ATOMOR64_DAG,
-      ATOMXOR64_DAG,
-      ATOMAND64_DAG,
-      ATOMNAND64_DAG,
-      ATOMMAX64_DAG,
-      ATOMMIN64_DAG,
-      ATOMUMAX64_DAG,
-      ATOMUMIN64_DAG,
-      ATOMSWAP64_DAG,
       // LCMPXCHG_DAG, LCMPXCHG8_DAG, LCMPXCHG16_DAG - Compare and swap.
-      LCMPXCHG_DAG,
+      LCMPXCHG_DAG = ISD::FIRST_TARGET_MEMORY_OPCODE,
       LCMPXCHG8_DAG,
       LCMPXCHG16_DAG,
 
@@ -806,6 +796,9 @@ namespace llvm {
     /// \brief Reset the operation actions based on target options.
     void resetOperationActions() override;
 
+    /// \brief Customize the preferred legalization strategy for certain types.
+    LegalizeTypeAction getPreferredVectorAction(EVT VT) const override;
+
   protected:
     std::pair<const TargetRegisterClass *, uint8_t>
     findRepresentativeClass(MVT VT) const override;
 
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index 7cac5ebbece7..92d536356d4c 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -1,19 +1,36 @@
 // Bitcasts between 512-bit vector types.
Return the original type since // no instruction is needed for the conversion let Predicates = [HasAVX512] in { - def : Pat<(v8f64 (bitconvert (v16f32 VR512:$src))), (v8f64 VR512:$src)>; - def : Pat<(v8f64 (bitconvert (v16i32 VR512:$src))), (v8f64 VR512:$src)>; def : Pat<(v8f64 (bitconvert (v8i64 VR512:$src))), (v8f64 VR512:$src)>; - def : Pat<(v16f32 (bitconvert (v16i32 VR512:$src))), (v16f32 VR512:$src)>; + def : Pat<(v8f64 (bitconvert (v16i32 VR512:$src))), (v8f64 VR512:$src)>; + def : Pat<(v8f64 (bitconvert (v32i16 VR512:$src))), (v8f64 VR512:$src)>; + def : Pat<(v8f64 (bitconvert (v64i8 VR512:$src))), (v8f64 VR512:$src)>; + def : Pat<(v8f64 (bitconvert (v16f32 VR512:$src))), (v8f64 VR512:$src)>; def : Pat<(v16f32 (bitconvert (v8i64 VR512:$src))), (v16f32 VR512:$src)>; + def : Pat<(v16f32 (bitconvert (v16i32 VR512:$src))), (v16f32 VR512:$src)>; + def : Pat<(v16f32 (bitconvert (v32i16 VR512:$src))), (v16f32 VR512:$src)>; + def : Pat<(v16f32 (bitconvert (v64i8 VR512:$src))), (v16f32 VR512:$src)>; def : Pat<(v16f32 (bitconvert (v8f64 VR512:$src))), (v16f32 VR512:$src)>; - def : Pat<(v8i64 (bitconvert (v16f32 VR512:$src))), (v8i64 VR512:$src)>; def : Pat<(v8i64 (bitconvert (v16i32 VR512:$src))), (v8i64 VR512:$src)>; + def : Pat<(v8i64 (bitconvert (v32i16 VR512:$src))), (v8i64 VR512:$src)>; + def : Pat<(v8i64 (bitconvert (v64i8 VR512:$src))), (v8i64 VR512:$src)>; def : Pat<(v8i64 (bitconvert (v8f64 VR512:$src))), (v8i64 VR512:$src)>; - def : Pat<(v16i32 (bitconvert (v16f32 VR512:$src))), (v16i32 VR512:$src)>; - def : Pat<(v16i32 (bitconvert (v8i64 VR512:$src))), (v16i32 VR512:$src)>; + def : Pat<(v8i64 (bitconvert (v16f32 VR512:$src))), (v8i64 VR512:$src)>; + def : Pat<(v16i32 (bitconvert (v8i64 VR512:$src))), (v16i32 VR512:$src)>; + def : Pat<(v16i32 (bitconvert (v32i16 VR512:$src))), (v16i32 VR512:$src)>; + def : Pat<(v16i32 (bitconvert (v64i8 VR512:$src))), (v16i32 VR512:$src)>; def : Pat<(v16i32 (bitconvert (v8f64 VR512:$src))), (v16i32 VR512:$src)>; - def : Pat<(v8f64 (bitconvert (v8i64 VR512:$src))), (v8f64 VR512:$src)>; + def : Pat<(v32i16 (bitconvert (v8i64 VR512:$src))), (v32i16 VR512:$src)>; + def : Pat<(v32i16 (bitconvert (v16i32 VR512:$src))), (v32i16 VR512:$src)>; + def : Pat<(v32i16 (bitconvert (v64i8 VR512:$src))), (v32i16 VR512:$src)>; + def : Pat<(v32i16 (bitconvert (v8f64 VR512:$src))), (v32i16 VR512:$src)>; + def : Pat<(v32i16 (bitconvert (v16f32 VR512:$src))), (v32i16 VR512:$src)>; + def : Pat<(v32i16 (bitconvert (v16f32 VR512:$src))), (v32i16 VR512:$src)>; + def : Pat<(v64i8 (bitconvert (v8i64 VR512:$src))), (v64i8 VR512:$src)>; + def : Pat<(v64i8 (bitconvert (v16i32 VR512:$src))), (v64i8 VR512:$src)>; + def : Pat<(v64i8 (bitconvert (v32i16 VR512:$src))), (v64i8 VR512:$src)>; + def : Pat<(v64i8 (bitconvert (v8f64 VR512:$src))), (v64i8 VR512:$src)>; + def : Pat<(v64i8 (bitconvert (v16f32 VR512:$src))), (v64i8 VR512:$src)>; def : Pat<(v2i64 (bitconvert (v4i32 VR128X:$src))), (v2i64 VR128X:$src)>; def : Pat<(v2i64 (bitconvert (v8i16 VR128X:$src))), (v2i64 VR128X:$src)>; @@ -135,7 +152,6 @@ def VINSERTI32x4rm : AVX512AIi8<0x38, MRMSrcMem, (outs VR512:$dst), (ins VR512:$src1, i128mem:$src2, i8imm:$src3), "vinserti32x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>, EVEX_4V, EVEX_V512, EVEX_CD8<32, CD8VT4>; - } let hasSideEffects = 0 in { @@ -476,6 +492,28 @@ defm VPBROADCASTQZ : avx512_int_broadcast_rm<0x59, "vpbroadcastq", i64mem, loadi64, VR512, v8i64, v2i64, VK8WM>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>; +multiclass avx512_int_subvec_broadcast_rm opc, 
string OpcodeStr, + X86MemOperand x86memop, PatFrag ld_frag, + RegisterClass KRC> { + let mayLoad = 1 in { + def rm : AVX5128I, EVEX; + def krm : AVX5128I, EVEX, EVEX_KZ; + } +} + +defm VBROADCASTI32X4 : avx512_int_subvec_broadcast_rm<0x5a, "vbroadcasti32x4", + i128mem, loadv2i64, VK16WM>, + EVEX_V512, EVEX_CD8<32, CD8VT4>; +defm VBROADCASTI64X4 : avx512_int_subvec_broadcast_rm<0x5b, "vbroadcasti64x4", + i256mem, loadv4i64, VK16WM>, VEX_W, + EVEX_V512, EVEX_CD8<64, CD8VT4>; + def : Pat<(v16i32 (int_x86_avx512_pbroadcastd_512 (v4i32 VR128X:$src))), (VPBROADCASTDZrr VR128X:$src)>; def : Pat<(v8i64 (int_x86_avx512_pbroadcastq_512 (v2i64 VR128X:$src))), @@ -587,7 +625,7 @@ defm VPERMPDZ : avx512_perm<0x16, "vpermpd", VR512, memopv8f64, f512mem, // -- VPERM2I - 3 source operands form -- multiclass avx512_perm_3src opc, string OpcodeStr, RegisterClass RC, PatFrag mem_frag, X86MemOperand x86memop, - SDNode OpNode, ValueType OpVT> { + SDNode OpNode, ValueType OpVT, RegisterClass KRC> { let Constraints = "$src1 = $dst" in { def rr : AVX5128I, EVEX_4V; + def rrk : AVX5128I, + EVEX_4V, EVEX_K; + + let AddedComplexity = 30 in // Prefer over VMOV*rrkz Pat<> + def rrkz : AVX5128I, + EVEX_4V, EVEX_KZ; + def rm : AVX5128I, EVEX_4V; + + def rmk : AVX5128I, + EVEX_4V, EVEX_K; + + let AddedComplexity = 10 in // Prefer over the rrkz variant + def rmkz : AVX5128I, + EVEX_4V, EVEX_KZ; } } -defm VPERMI2D : avx512_perm_3src<0x76, "vpermi2d", VR512, memopv16i32, i512mem, - X86VPermiv3, v16i32>, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPERMI2Q : avx512_perm_3src<0x76, "vpermi2q", VR512, memopv8i64, i512mem, - X86VPermiv3, v8i64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; -defm VPERMI2PS : avx512_perm_3src<0x77, "vpermi2ps", VR512, memopv16f32, i512mem, - X86VPermiv3, v16f32>, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPERMI2PD : avx512_perm_3src<0x77, "vpermi2pd", VR512, memopv8f64, i512mem, - X86VPermiv3, v8f64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; - -defm VPERMT2D : avx512_perm_3src<0x7E, "vpermt2d", VR512, memopv16i32, i512mem, - X86VPermv3, v16i32>, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPERMT2Q : avx512_perm_3src<0x7E, "vpermt2q", VR512, memopv8i64, i512mem, - X86VPermv3, v8i64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; -defm VPERMT2PS : avx512_perm_3src<0x7F, "vpermt2ps", VR512, memopv16f32, i512mem, - X86VPermv3, v16f32>, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPERMT2PD : avx512_perm_3src<0x7F, "vpermt2pd", VR512, memopv8f64, i512mem, - X86VPermv3, v8f64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; - -def : Pat<(v16f32 (int_x86_avx512_mask_vpermt_ps_512 (v16i32 VR512:$idx), - (v16f32 VR512:$src1), (v16f32 VR512:$src2), (i16 -1))), - (VPERMT2PSrr VR512:$src1, VR512:$idx, VR512:$src2)>; - -def : Pat<(v16i32 (int_x86_avx512_mask_vpermt_d_512 (v16i32 VR512:$idx), - (v16i32 VR512:$src1), (v16i32 VR512:$src2), (i16 -1))), - (VPERMT2Drr VR512:$src1, VR512:$idx, VR512:$src2)>; - -def : Pat<(v8f64 (int_x86_avx512_mask_vpermt_pd_512 (v8i64 VR512:$idx), - (v8f64 VR512:$src1), (v8f64 VR512:$src2), (i8 -1))), - (VPERMT2PDrr VR512:$src1, VR512:$idx, VR512:$src2)>; - -def : Pat<(v8i64 (int_x86_avx512_mask_vpermt_q_512 (v8i64 VR512:$idx), - (v8i64 VR512:$src1), (v8i64 VR512:$src2), (i8 -1))), - (VPERMT2Qrr VR512:$src1, VR512:$idx, VR512:$src2)>; +defm VPERMI2D : avx512_perm_3src<0x76, "vpermi2d", VR512, memopv16i32, + i512mem, X86VPermiv3, v16i32, VK16WM>, + EVEX_V512, EVEX_CD8<32, CD8VF>; +defm VPERMI2Q : avx512_perm_3src<0x76, "vpermi2q", VR512, memopv8i64, + i512mem, X86VPermiv3, v8i64, VK8WM>, + EVEX_V512, VEX_W, EVEX_CD8<64, 
CD8VF>; +defm VPERMI2PS : avx512_perm_3src<0x77, "vpermi2ps", VR512, memopv16f32, + i512mem, X86VPermiv3, v16f32, VK16WM>, + EVEX_V512, EVEX_CD8<32, CD8VF>; +defm VPERMI2PD : avx512_perm_3src<0x77, "vpermi2pd", VR512, memopv8f64, + i512mem, X86VPermiv3, v8f64, VK8WM>, + EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; + +multiclass avx512_perm_table_3src opc, string Suffix, RegisterClass RC, + PatFrag mem_frag, X86MemOperand x86memop, + SDNode OpNode, ValueType OpVT, RegisterClass KRC, + ValueType MaskVT, RegisterClass MRC> : + avx512_perm_3src { + def : Pat<(OpVT (!cast("int_x86_avx512_mask_vpermt_"##Suffix##"_512") + VR512:$idx, VR512:$src1, VR512:$src2, -1)), + (!cast(NAME#rr) VR512:$src1, VR512:$idx, VR512:$src2)>; + + def : Pat<(OpVT (!cast("int_x86_avx512_mask_vpermt_"##Suffix##"_512") + VR512:$idx, VR512:$src1, VR512:$src2, MRC:$mask)), + (!cast(NAME#rrk) VR512:$src1, + (MaskVT (COPY_TO_REGCLASS MRC:$mask, KRC)), VR512:$idx, VR512:$src2)>; +} + +defm VPERMT2D : avx512_perm_table_3src<0x7E, "d", VR512, memopv16i32, i512mem, + X86VPermv3, v16i32, VK16WM, v16i1, GR16>, + EVEX_V512, EVEX_CD8<32, CD8VF>; +defm VPERMT2Q : avx512_perm_table_3src<0x7E, "q", VR512, memopv8i64, i512mem, + X86VPermv3, v8i64, VK8WM, v8i1, GR8>, + EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; +defm VPERMT2PS : avx512_perm_table_3src<0x7F, "ps", VR512, memopv16f32, i512mem, + X86VPermv3, v16f32, VK16WM, v16i1, GR16>, + EVEX_V512, EVEX_CD8<32, CD8VF>; +defm VPERMT2PD : avx512_perm_table_3src<0x7F, "pd", VR512, memopv8f64, i512mem, + X86VPermv3, v8f64, VK8WM, v8i1, GR8>, + EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; + //===----------------------------------------------------------------------===// // AVX-512 - BLEND using mask // @@ -792,52 +889,61 @@ def : Pat<(v8i1 (X86pcmpeqm (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))), (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)), (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm))), VK8)>; -multiclass avx512_icmp_cc opc, RegisterClass KRC, +multiclass avx512_icmp_cc opc, RegisterClass WMRC, RegisterClass KRC, RegisterClass RC, X86MemOperand x86memop, PatFrag memop_frag, - SDNode OpNode, ValueType vt, Operand CC, string asm, - string asm_alt> { + SDNode OpNode, ValueType vt, Operand CC, string Suffix> { def rri : AVX512AIi8, EVEX_4V; def rmi : AVX512AIi8, EVEX_4V; // Accept explicit immediate argument form instead of comparison code. 
let isAsmParserOnly = 1, hasSideEffects = 0 in { def rri_alt : AVX512AIi8, EVEX_4V; + !strconcat("vpcmp", Suffix, + "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"), + [], IIC_SSE_ALU_F32P_RR>, EVEX_4V; + def rrik_alt : AVX512AIi8, EVEX_4V, EVEX_K; def rmi_alt : AVX512AIi8, EVEX_4V; + !strconcat("vpcmp", Suffix, + "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"), + [], IIC_SSE_ALU_F32P_RM>, EVEX_4V; + def rmik_alt : AVX512AIi8, EVEX_4V, EVEX_K; } } -defm VPCMPDZ : avx512_icmp_cc<0x1F, VK16, VR512, i512mem, memopv16i32, - X86cmpm, v16i32, AVXCC, - "vpcmp${cc}d\t{$src2, $src1, $dst|$dst, $src1, $src2}", - "vpcmpd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}">, - EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPCMPUDZ : avx512_icmp_cc<0x1E, VK16, VR512, i512mem, memopv16i32, - X86cmpmu, v16i32, AVXCC, - "vpcmp${cc}ud\t{$src2, $src1, $dst|$dst, $src1, $src2}", - "vpcmpud\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}">, - EVEX_V512, EVEX_CD8<32, CD8VF>; - -defm VPCMPQZ : avx512_icmp_cc<0x1F, VK8, VR512, i512mem, memopv8i64, - X86cmpm, v8i64, AVXCC, - "vpcmp${cc}q\t{$src2, $src1, $dst|$dst, $src1, $src2}", - "vpcmpq\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}">, - VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>; -defm VPCMPUQZ : avx512_icmp_cc<0x1E, VK8, VR512, i512mem, memopv8i64, - X86cmpmu, v8i64, AVXCC, - "vpcmp${cc}uq\t{$src2, $src1, $dst|$dst, $src1, $src2}", - "vpcmpuq\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}">, - VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>; - -// avx512_cmp_packed - sse 1 & 2 compare packed instructions +defm VPCMPDZ : avx512_icmp_cc<0x1F, VK16WM, VK16, VR512, i512mem, memopv16i32, + X86cmpm, v16i32, AVXCC, "d">, + EVEX_V512, EVEX_CD8<32, CD8VF>; +defm VPCMPUDZ : avx512_icmp_cc<0x1E, VK16WM, VK16, VR512, i512mem, memopv16i32, + X86cmpmu, v16i32, AVXCC, "ud">, + EVEX_V512, EVEX_CD8<32, CD8VF>; + +defm VPCMPQZ : avx512_icmp_cc<0x1F, VK8WM, VK8, VR512, i512mem, memopv8i64, + X86cmpm, v8i64, AVXCC, "q">, + VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>; +defm VPCMPUQZ : avx512_icmp_cc<0x1E, VK8WM, VK8, VR512, i512mem, memopv8i64, + X86cmpmu, v8i64, AVXCC, "uq">, + VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>; + +// avx512_cmp_packed - compare packed instructions multiclass avx512_cmp_packed { @@ -861,11 +967,11 @@ multiclass avx512_cmp_packed; def rmi_alt : AVX512PIi8<0xC2, MRMSrcMem, - (outs RC:$dst), (ins RC:$src1, x86memop:$src2, i8imm:$cc), + (outs KRC:$dst), (ins RC:$src1, x86memop:$src2, i8imm:$cc), !strconcat("vcmp", suffix, " \t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"), [], d>; } @@ -3523,7 +3629,6 @@ def : Pat <(v8f64 (int_x86_avx512_rcp28_pd (v8f64 VR512:$src), (VRCP28PDZrb VR512:$src)>; multiclass avx512_sqrt_packed opc, string OpcodeStr, SDNode OpNode, - Intrinsic V16F32Int, Intrinsic V8F64Int, OpndItins itins_s, OpndItins itins_d> { def PSZrr :AVX512PSI opc, string OpcodeStr, SDNode OpNode, (v8f64 (bitconvert (memopv16f32 addr:$src)))))], itins_d.rm>, EVEX, EVEX_V512, EVEX_CD8<64, CD8VF>; -let isCodeGenOnly = 1 in { - def PSZr_Int : AVX512PSI, - EVEX, EVEX_V512; - def PSZm_Int : AVX512PSI, EVEX, - EVEX_V512, EVEX_CD8<32, CD8VF>; - def PDZr_Int : AVX512PDI, - EVEX, EVEX_V512, VEX_W; - def PDZm_Int : AVX512PDI, - EVEX, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; -} // isCodeGenOnly = 1 } multiclass avx512_sqrt_scalar opc, string OpcodeStr, @@ -3638,10 +3722,16 @@ defm VSQRT : avx512_sqrt_scalar<0x51, "sqrt", int_x86_avx512_sqrt_ss, int_x86_avx512_sqrt_sd, SSE_SQRTSS, SSE_SQRTSD>, avx512_sqrt_packed<0x51, "vsqrt", fsqrt, - int_x86_avx512_sqrt_ps_512, 
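
For the explicit-immediate forms above, the $cc operand is the AVX-512 integer-compare predicate. The 3-bit encodings below are taken from the ISA reference rather than from this patch, so treat the table as an assumption:

#include <cstdio>

// Assumed AVX-512 VPCMP predicate encodings (per the Intel SDM), used by the
// *_alt forms: "vpcmpd $1, zmm1, zmm0, k1" == "vpcmpltd zmm1, zmm0, k1".
static const char *VPCmpPredName(unsigned Imm) {
  static const char *const Names[8] = {"eq",  "lt",  "le",  "false",
                                       "neq", "nlt", "nle", "true"};
  return Imm < 8 ? Names[Imm] : "invalid";
}

int main() {
  for (unsigned Imm = 0; Imm != 8; ++Imm)
    printf("vpcmp imm %u -> vpcmp%sd\n", Imm, VPCmpPredName(Imm));
  return 0;
}
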
int_x86_avx512_sqrt_pd_512, SSE_SQRTPS, SSE_SQRTPD>; let Predicates = [HasAVX512] in { + def : Pat<(v16f32 (int_x86_avx512_sqrt_ps_512 (v16f32 VR512:$src1), + (bc_v16f32 (v16i32 immAllZerosV)), (i16 -1), FROUND_CURRENT)), + (VSQRTPSZrr VR512:$src1)>; + def : Pat<(v8f64 (int_x86_avx512_sqrt_pd_512 (v8f64 VR512:$src1), + (bc_v8f64 (v16i32 immAllZerosV)), (i8 -1), FROUND_CURRENT)), + (VSQRTPDZrr VR512:$src1)>; + def : Pat<(f32 (fsqrt FR32X:$src)), (VSQRTSSZr (f32 (IMPLICIT_DEF)), FR32X:$src)>; def : Pat<(f32 (fsqrt (load addr:$src))), diff --git a/lib/Target/X86/X86InstrArithmetic.td b/lib/Target/X86/X86InstrArithmetic.td index 368e14b91f40..f2574cc3700e 100644 --- a/lib/Target/X86/X86InstrArithmetic.td +++ b/lib/Target/X86/X86InstrArithmetic.td @@ -1278,8 +1278,10 @@ let isCompare = 1 in { def TEST64mi32 : BinOpMI_F<"test", Xi64, X86testpat, MRM0m, 0xF6>; // When testing the result of EXTRACT_SUBREG sub_8bit_hi, make sure the - // register class is constrained to GR8_NOREX. - let isPseudo = 1 in + // register class is constrained to GR8_NOREX. This pseudo is explicitly + // marked side-effect free, since it doesn't have an isel pattern like + // other test instructions. + let isPseudo = 1, hasSideEffects = 0 in def TEST8ri_NOREX : I<0, Pseudo, (outs), (ins GR8_NOREX:$src, i8imm:$mask), "", [], IIC_BIN_NONMEM>, Sched<[WriteALU]>; } // Defs = [EFLAGS] diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td index 9b3dce52f72e..ca4f608a6b80 100644 --- a/lib/Target/X86/X86InstrCompiler.td +++ b/lib/Target/X86/X86InstrCompiler.td @@ -110,7 +110,7 @@ let Defs = [EAX, ESP, EFLAGS], Uses = [ESP] in // When using segmented stacks these are lowered into instructions which first // check if the current stacklet has enough free memory. If it does, memory is -// allocated by bumping the stack pointer. Otherwise memory is allocated from +// allocated by bumping the stack pointer. Otherwise memory is allocated from // the heap. let Defs = [EAX, ESP, EFLAGS], Uses = [ESP] in @@ -196,6 +196,26 @@ let isBranch = 1, isTerminator = 1, isCodeGenOnly = 1 in { "#EH_SjLj_Setup\t$dst", []>; } +//===----------------------------------------------------------------------===// +// Pseudo instructions used by unwind info. +// +let isPseudo = 1 in { + def SEH_PushReg : I<0, Pseudo, (outs), (ins i32imm:$reg), + "#SEH_PushReg $reg", []>; + def SEH_SaveReg : I<0, Pseudo, (outs), (ins i32imm:$reg, i32imm:$dst), + "#SEH_SaveReg $reg, $dst", []>; + def SEH_SaveXMM : I<0, Pseudo, (outs), (ins i32imm:$reg, i32imm:$dst), + "#SEH_SaveXMM $reg, $dst", []>; + def SEH_StackAlloc : I<0, Pseudo, (outs), (ins i32imm:$size), + "#SEH_StackAlloc $size", []>; + def SEH_SetFrame : I<0, Pseudo, (outs), (ins i32imm:$reg, i32imm:$offset), + "#SEH_SetFrame $reg, $offset", []>; + def SEH_PushFrame : I<0, Pseudo, (outs), (ins i1imm:$mode), + "#SEH_PushFrame $mode", []>; + def SEH_EndPrologue : I<0, Pseudo, (outs), (ins), + "#SEH_EndPrologue", []>; +} + //===----------------------------------------------------------------------===// // Pseudo instructions used by segmented stacks. 
// @@ -371,7 +391,7 @@ let Defs = [RCX,RDI], isCodeGenOnly = 1 in { def REP_STOSD_64 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosl|rep stosd}", [(X86rep_stos i32)], IIC_REP_STOS>, REP, OpSize32, Requires<[In64BitMode]>; - + let Uses = [RAX,RCX,RDI] in def REP_STOSQ_64 : RI<0xAB, RawFrm, (outs), (ins), "{rep;stosq|rep stosq}", [(X86rep_stos i64)], IIC_REP_STOS>, REP, @@ -501,83 +521,6 @@ def CMOV_RFP80 : I<0, Pseudo, } // UsesCustomInserter = 1, Uses = [EFLAGS] -//===----------------------------------------------------------------------===// -// Atomic Instruction Pseudo Instructions -//===----------------------------------------------------------------------===// - -// Pseudo atomic instructions - -multiclass PSEUDO_ATOMIC_LOAD_BINOP { - let usesCustomInserter = 1, mayLoad = 1, mayStore = 1 in { - let Defs = [EFLAGS, AL] in - def NAME#8 : I<0, Pseudo, (outs GR8:$dst), - (ins i8mem:$ptr, GR8:$val), - !strconcat(mnemonic, "8 PSEUDO!"), []>; - let Defs = [EFLAGS, AX] in - def NAME#16 : I<0, Pseudo,(outs GR16:$dst), - (ins i16mem:$ptr, GR16:$val), - !strconcat(mnemonic, "16 PSEUDO!"), []>; - let Defs = [EFLAGS, EAX] in - def NAME#32 : I<0, Pseudo, (outs GR32:$dst), - (ins i32mem:$ptr, GR32:$val), - !strconcat(mnemonic, "32 PSEUDO!"), []>; - let Defs = [EFLAGS, RAX] in - def NAME#64 : I<0, Pseudo, (outs GR64:$dst), - (ins i64mem:$ptr, GR64:$val), - !strconcat(mnemonic, "64 PSEUDO!"), []>; - } -} - -multiclass PSEUDO_ATOMIC_LOAD_BINOP_PATS { - def : Pat<(!cast(frag # "_8") addr:$ptr, GR8:$val), - (!cast(name # "8") addr:$ptr, GR8:$val)>; - def : Pat<(!cast(frag # "_16") addr:$ptr, GR16:$val), - (!cast(name # "16") addr:$ptr, GR16:$val)>; - def : Pat<(!cast(frag # "_32") addr:$ptr, GR32:$val), - (!cast(name # "32") addr:$ptr, GR32:$val)>; - def : Pat<(!cast(frag # "_64") addr:$ptr, GR64:$val), - (!cast(name # "64") addr:$ptr, GR64:$val)>; -} - -// Atomic exchange, and, or, xor -defm ATOMAND : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMAND">; -defm ATOMOR : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMOR">; -defm ATOMXOR : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMXOR">; -defm ATOMNAND : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMNAND">; -defm ATOMMAX : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMMAX">; -defm ATOMMIN : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMMIN">; -defm ATOMUMAX : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMUMAX">; -defm ATOMUMIN : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMUMIN">; - -defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMAND", "atomic_load_and">; -defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMOR", "atomic_load_or">; -defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMXOR", "atomic_load_xor">; -defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMNAND", "atomic_load_nand">; -defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMMAX", "atomic_load_max">; -defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMMIN", "atomic_load_min">; -defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMUMAX", "atomic_load_umax">; -defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMUMIN", "atomic_load_umin">; - -multiclass PSEUDO_ATOMIC_LOAD_BINOP6432 { - let usesCustomInserter = 1, Defs = [EFLAGS, EAX, EDX], - mayLoad = 1, mayStore = 1, hasSideEffects = 0 in - def NAME#6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2), - (ins i64mem:$ptr, GR32:$val1, GR32:$val2), - !strconcat(mnemonic, "6432 PSEUDO!"), []>; -} - -defm ATOMAND : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMAND">; -defm ATOMOR : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMOR">; -defm ATOMXOR : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMXOR">; -defm ATOMNAND : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMNAND">; -defm ATOMADD : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMADD">; -defm ATOMSUB : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMSUB">; 
-defm ATOMMAX : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMMAX">; -defm ATOMMIN : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMMIN">; -defm ATOMUMAX : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMUMAX">; -defm ATOMUMIN : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMUMIN">; -defm ATOMSWAP : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMSWAP">; - //===----------------------------------------------------------------------===// // Normal-Instructions-With-Lock-Prefix Pseudo Instructions //===----------------------------------------------------------------------===// diff --git a/lib/Target/X86/X86InstrFormats.td b/lib/Target/X86/X86InstrFormats.td index cc302663d589..8ef5f901c185 100644 --- a/lib/Target/X86/X86InstrFormats.td +++ b/lib/Target/X86/X86InstrFormats.td @@ -184,13 +184,16 @@ class EVEX_KZ : EVEX_K { bit hasEVEX_Z = 1; } class EVEX_B { bit hasEVEX_B = 1; } class EVEX_RC { bit hasEVEX_RC = 1; } class EVEX_V512 { bit hasEVEX_L2 = 1; bit hasVEX_L = 0; } +class EVEX_V256 { bit hasEVEX_L2 = 0; bit hasVEX_L = 1; } +class EVEX_V128 { bit hasEVEX_L2 = 0; bit hasVEX_L = 0; } + +// Specify AVX512 8-bit compressed displacement encoding based on the vector +// element size in bits (8, 16, 32, 64) and the CDisp8 form. class EVEX_CD8 { - bits<2> EVEX_CD8E = !if(!eq(esize, 8), 0b00, - !if(!eq(esize, 16), 0b01, - !if(!eq(esize, 32), 0b10, - !if(!eq(esize, 64), 0b11, ?)))); - bits<3> EVEX_CD8V = form.Value; + int CD8_EltSize = !srl(esize, 3); + bits<3> CD8_Form = form.Value; } + class Has3DNow0F0FOpcode { bit has3DNow0F0FOpcode = 1; } class MemOp4 { bit hasMemOp4Prefix = 1; } class XOP { Encoding OpEnc = EncXOP; } @@ -253,12 +256,32 @@ class X86Inst opcod, Format f, ImmType i, dag outs, dag ins, bit hasEVEX_Z = 0; // Does this inst set the EVEX_Z field? bit hasEVEX_L2 = 0; // Does this inst set the EVEX_L2 field? bit hasEVEX_B = 0; // Does this inst set the EVEX_B field? - bits<2> EVEX_CD8E = 0; // Compressed disp8 form - element-size. - bits<3> EVEX_CD8V = 0; // Compressed disp8 form - vector-width. + bits<3> CD8_Form = 0; // Compressed disp8 form - vector-width. + // Declare it int rather than bits<4> so that all bits are defined when + // assigning to bits<7>. + int CD8_EltSize = 0; // Compressed disp8 form - element-size in bytes. bit has3DNow0F0FOpcode =0;// Wacky 3dNow! encoding? bit hasMemOp4Prefix = 0; // Same bit as VEX_W, but used for swapping operands bit hasEVEX_RC = 0; // Explicitly specified rounding control in FP instruction. + bits<2> EVEX_LL; + let EVEX_LL{0} = hasVEX_L; + let EVEX_LL{1} = hasEVEX_L2; + // Vector size in bytes. + bits<7> VectSize = !shl(16, EVEX_LL); + + // The scaling factor for AVX512's compressed displacement is either + // - the size of a power-of-two number of elements or + // - the size of a single element for broadcasts or + // - the total vector size divided by a power-of-two number. + // Possible values are: 0 (non-AVX512 inst), 1, 2, 4, 8, 16, 32 and 64. + bits<7> CD8_Scale = !if (!eq (OpEnc.Value, EncEVEX.Value), + !if (CD8_Form{2}, + !shl(CD8_EltSize, CD8_Form{1-0}), + !if (hasEVEX_B, + CD8_EltSize, + !srl(VectSize, CD8_Form{1-0}))), 0); + // TSFlags layout should be kept in sync with X86InstrInfo.h. 
let TSFlags{6-0} = FormBits; let TSFlags{8-7} = OpSizeBits; @@ -283,11 +306,11 @@ class X86Inst opcod, Format f, ImmType i, dag outs, dag ins, let TSFlags{45} = hasEVEX_Z; let TSFlags{46} = hasEVEX_L2; let TSFlags{47} = hasEVEX_B; - let TSFlags{49-48} = EVEX_CD8E; - let TSFlags{52-50} = EVEX_CD8V; - let TSFlags{53} = has3DNow0F0FOpcode; - let TSFlags{54} = hasMemOp4Prefix; - let TSFlags{55} = hasEVEX_RC; + // If we run out of TSFlags bits, it's possible to encode this in 3 bits. + let TSFlags{54-48} = CD8_Scale; + let TSFlags{55} = has3DNow0F0FOpcode; + let TSFlags{56} = hasMemOp4Prefix; + let TSFlags{57} = hasEVEX_RC; } class PseudoI pattern> diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td index 1582f4388192..6f0fa9462770 100644 --- a/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -224,6 +224,10 @@ def X86Movhlps : SDNode<"X86ISD::MOVHLPS", SDTShuff2Op>; def X86Movlps : SDNode<"X86ISD::MOVLPS", SDTShuff2Op>; def X86Movlpd : SDNode<"X86ISD::MOVLPD", SDTShuff2Op>; +def SDTPack : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, SDTCisSameAs<2, 1>]>; +def X86Packss : SDNode<"X86ISD::PACKSS", SDTPack>; +def X86Packus : SDNode<"X86ISD::PACKUS", SDTPack>; + def X86Unpckl : SDNode<"X86ISD::UNPCKL", SDTShuff2Op>; def X86Unpckh : SDNode<"X86ISD::UNPCKH", SDTShuff2Op>; diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index bfc8e2759dcb..0d3afc43c2bb 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -2696,8 +2696,8 @@ unsigned X86::getSETFromCond(CondCode CC, bool HasMemoryOperand) { /// getCMovFromCond - Return a cmov opcode for the given condition, /// register size in bytes, and operand type. -static unsigned getCMovFromCond(X86::CondCode CC, unsigned RegBytes, - bool HasMemoryOperand) { +unsigned X86::getCMovFromCond(CondCode CC, unsigned RegBytes, + bool HasMemoryOperand) { static const uint16_t Opc[32][3] = { { X86::CMOVA16rr, X86::CMOVA32rr, X86::CMOVA64rr }, { X86::CMOVAE16rr, X86::CMOVAE32rr, X86::CMOVAE64rr }, @@ -5037,6 +5037,7 @@ bool X86InstrInfo::shouldScheduleAdjacent(MachineInstr* First, case X86::TEST16rm: case X86::TEST32rm: case X86::TEST64rm: + case X86::TEST8ri_NOREX: case X86::AND16i16: case X86::AND16ri: case X86::AND16ri8: diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h index d76c52ce47df..c177e3a5c7c7 100644 --- a/lib/Target/X86/X86InstrInfo.h +++ b/lib/Target/X86/X86InstrInfo.h @@ -66,6 +66,11 @@ namespace X86 { /// a memory operand. unsigned getSETFromCond(CondCode CC, bool HasMemoryOperand = false); + /// \brief Return a cmov opcode for the given condition, register size in + /// bytes, and operand type. + unsigned getCMovFromCond(CondCode CC, unsigned RegBytes, + bool HasMemoryOperand = false); + // Turn CMov opcode into condition code. 
CondCode getCondFromCMovOpc(unsigned Opc); diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index 5d34c326ace6..0f872a676c25 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -155,27 +155,6 @@ def X86cas16 : SDNode<"X86ISD::LCMPXCHG16_DAG", SDTX86caspair, [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>; -def X86AtomAdd64 : SDNode<"X86ISD::ATOMADD64_DAG", SDTX86atomicBinary, - [SDNPHasChain, SDNPMayStore, - SDNPMayLoad, SDNPMemOperand]>; -def X86AtomSub64 : SDNode<"X86ISD::ATOMSUB64_DAG", SDTX86atomicBinary, - [SDNPHasChain, SDNPMayStore, - SDNPMayLoad, SDNPMemOperand]>; -def X86AtomOr64 : SDNode<"X86ISD::ATOMOR64_DAG", SDTX86atomicBinary, - [SDNPHasChain, SDNPMayStore, - SDNPMayLoad, SDNPMemOperand]>; -def X86AtomXor64 : SDNode<"X86ISD::ATOMXOR64_DAG", SDTX86atomicBinary, - [SDNPHasChain, SDNPMayStore, - SDNPMayLoad, SDNPMemOperand]>; -def X86AtomAnd64 : SDNode<"X86ISD::ATOMAND64_DAG", SDTX86atomicBinary, - [SDNPHasChain, SDNPMayStore, - SDNPMayLoad, SDNPMemOperand]>; -def X86AtomNand64 : SDNode<"X86ISD::ATOMNAND64_DAG", SDTX86atomicBinary, - [SDNPHasChain, SDNPMayStore, - SDNPMayLoad, SDNPMemOperand]>; -def X86AtomSwap64 : SDNode<"X86ISD::ATOMSWAP64_DAG", SDTX86atomicBinary, - [SDNPHasChain, SDNPMayStore, - SDNPMayLoad, SDNPMemOperand]>; def X86retflag : SDNode<"X86ISD::RET_FLAG", SDTX86Ret, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; @@ -208,6 +187,8 @@ def X86rdtsc : SDNode<"X86ISD::RDTSC_DAG", SDTX86Void, [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>; def X86rdtscp : SDNode<"X86ISD::RDTSCP_DAG", SDTX86Void, [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>; +def X86rdpmc : SDNode<"X86ISD::RDPMC_DAG", SDTX86Void, + [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>; def X86Wrapper : SDNode<"X86ISD::Wrapper", SDTX86Wrapper>; def X86WrapperRIP : SDNode<"X86ISD::WrapperRIP", SDTX86Wrapper>; @@ -738,10 +719,14 @@ def HasAVX512 : Predicate<"Subtarget->hasAVX512()">, AssemblerPredicate<"FeatureAVX512", "AVX-512 ISA">; def UseAVX : Predicate<"Subtarget->hasAVX() && !Subtarget->hasAVX512()">; def UseAVX2 : Predicate<"Subtarget->hasAVX2() && !Subtarget->hasAVX512()">; -def NoAVX512 : Predicate<"!Subtarget->hasAVX512()">; +def NoAVX512 : Predicate<"!Subtarget->hasAVX512()">; def HasCDI : Predicate<"Subtarget->hasCDI()">; def HasPFI : Predicate<"Subtarget->hasPFI()">; def HasERI : Predicate<"Subtarget->hasERI()">; +def HasDQI : Predicate<"Subtarget->hasDQI()">; +def HasBWI : Predicate<"Subtarget->hasBWI()">; +def HasVLX : Predicate<"Subtarget->hasVLX()">, + AssemblerPredicate<"FeatureVLX", "AVX-512 VLX ISA">; def HasPOPCNT : Predicate<"Subtarget->hasPOPCNT()">; def HasAES : Predicate<"Subtarget->hasAES()">; diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 043b2f32c6ff..2bb898e7465b 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -4336,20 +4336,6 @@ defm PCMPGTW : PDI_binop_all<0x65, "pcmpgtw", X86pcmpgt, v8i16, v16i16, defm PCMPGTD : PDI_binop_all<0x66, "pcmpgtd", X86pcmpgt, v4i32, v8i32, SSE_INTALU_ITINS_P, 0>; -//===---------------------------------------------------------------------===// -// SSE2 - Packed Integer Pack Instructions -//===---------------------------------------------------------------------===// - -defm PACKSSWB : PDI_binop_all_int<0x63, "packsswb", int_x86_sse2_packsswb_128, - int_x86_avx2_packsswb, - SSE_INTALU_ITINS_SHUFF_P, 0>; -defm PACKSSDW : PDI_binop_all_int<0x6B, "packssdw", int_x86_sse2_packssdw_128, - 
int_x86_avx2_packssdw, - SSE_INTALU_ITINS_SHUFF_P, 0>; -defm PACKUSWB : PDI_binop_all_int<0x67, "packuswb", int_x86_sse2_packuswb_128, - int_x86_avx2_packuswb, - SSE_INTALU_ITINS_SHUFF_P, 0>; - //===---------------------------------------------------------------------===// // SSE2 - Packed Integer Shuffle Instructions //===---------------------------------------------------------------------===// @@ -4431,6 +4417,136 @@ let Predicates = [UseSSE2] in { (PSHUFDri VR128:$src1, imm:$imm)>; } +//===---------------------------------------------------------------------===// +// Packed Integer Pack Instructions (SSE & AVX) +//===---------------------------------------------------------------------===// + +let ExeDomain = SSEPackedInt in { +multiclass sse2_pack opc, string OpcodeStr, ValueType OutVT, + ValueType ArgVT, SDNode OpNode, PatFrag bc_frag, + bit Is2Addr = 1> { + def rr : PDI, + Sched<[WriteShuffle]>; + def rm : PDI, + Sched<[WriteShuffleLd, ReadAfterLd]>; +} + +multiclass sse2_pack_y opc, string OpcodeStr, ValueType OutVT, + ValueType ArgVT, SDNode OpNode, PatFrag bc_frag> { + def Yrr : PDI, + Sched<[WriteShuffle]>; + def Yrm : PDI, + Sched<[WriteShuffleLd, ReadAfterLd]>; +} + +multiclass sse4_pack opc, string OpcodeStr, ValueType OutVT, + ValueType ArgVT, SDNode OpNode, PatFrag bc_frag, + bit Is2Addr = 1> { + def rr : SS48I, + Sched<[WriteShuffle]>; + def rm : SS48I, + Sched<[WriteShuffleLd, ReadAfterLd]>; +} + +multiclass sse4_pack_y opc, string OpcodeStr, ValueType OutVT, + ValueType ArgVT, SDNode OpNode, PatFrag bc_frag> { + def Yrr : SS48I, + Sched<[WriteShuffle]>; + def Yrm : SS48I, + Sched<[WriteShuffleLd, ReadAfterLd]>; +} + +let Predicates = [HasAVX] in { + defm VPACKSSWB : sse2_pack<0x63, "vpacksswb", v16i8, v8i16, X86Packss, + bc_v8i16, 0>, VEX_4V; + defm VPACKSSDW : sse2_pack<0x6B, "vpackssdw", v8i16, v4i32, X86Packss, + bc_v4i32, 0>, VEX_4V; + + defm VPACKUSWB : sse2_pack<0x67, "vpackuswb", v16i8, v8i16, X86Packus, + bc_v8i16, 0>, VEX_4V; + defm VPACKUSDW : sse4_pack<0x2B, "vpackusdw", v8i16, v4i32, X86Packus, + bc_v4i32, 0>, VEX_4V; +} + +let Predicates = [HasAVX2] in { + defm VPACKSSWB : sse2_pack_y<0x63, "vpacksswb", v32i8, v16i16, X86Packss, + bc_v16i16>, VEX_4V, VEX_L; + defm VPACKSSDW : sse2_pack_y<0x6B, "vpackssdw", v16i16, v8i32, X86Packss, + bc_v8i32>, VEX_4V, VEX_L; + + defm VPACKUSWB : sse2_pack_y<0x67, "vpackuswb", v32i8, v16i16, X86Packus, + bc_v16i16>, VEX_4V, VEX_L; + defm VPACKUSDW : sse4_pack_y<0x2B, "vpackusdw", v16i16, v8i32, X86Packus, + bc_v8i32>, VEX_4V, VEX_L; +} + +let Constraints = "$src1 = $dst" in { + defm PACKSSWB : sse2_pack<0x63, "packsswb", v16i8, v8i16, X86Packss, + bc_v8i16>; + defm PACKSSDW : sse2_pack<0x6B, "packssdw", v8i16, v4i32, X86Packss, + bc_v4i32>; + + defm PACKUSWB : sse2_pack<0x67, "packuswb", v16i8, v8i16, X86Packus, + bc_v8i16>; + + let Predicates = [HasSSE41] in + defm PACKUSDW : sse4_pack<0x2B, "packusdw", v8i16, v4i32, X86Packus, + bc_v4i32>; +} +} // ExeDomain = SSEPackedInt + //===---------------------------------------------------------------------===// // SSE2 - Packed Integer Unpack Instructions //===---------------------------------------------------------------------===// @@ -5239,6 +5355,60 @@ let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in { f128mem, SSE_ALU_F64P>, PD; } +// Patterns used to select 'addsub' instructions. +let Predicates = [HasAVX] in { + // Constant 170 corresponds to the binary mask '10101010'. 
+  // When used as a blend mask, it allows selecting eight elements from two
+  // input vectors as follows:
+  // - Even-numbered values in the destination are copied from
+  //   the corresponding elements in the first input vector;
+  // - Odd-numbered values in the destination are copied from
+  //   the corresponding elements in the second input vector.
+
+  def : Pat<(v8f32 (X86Blendi (v8f32 (fsub VR256:$lhs, VR256:$rhs)),
+                    (v8f32 (fadd VR256:$lhs, VR256:$rhs)), (i32 170))),
+            (VADDSUBPSYrr VR256:$lhs, VR256:$rhs)>;
+
+  // Constant 10 corresponds to the binary mask '1010'.
+  // In the two patterns below, constant 10 is used as a blend mask to select
+  // - the 1st and 3rd element from the first input vector (the 'fsub' node);
+  // - the 2nd and 4th element from the second input vector (the 'fadd' node).
+
+  def : Pat<(v4f64 (X86Blendi (v4f64 (fsub VR256:$lhs, VR256:$rhs)),
+                    (v4f64 (fadd VR256:$lhs, VR256:$rhs)), (i32 10))),
+            (VADDSUBPDYrr VR256:$lhs, VR256:$rhs)>;
+  def : Pat<(v4f32 (X86Blendi (v4f32 (fsub VR128:$lhs, VR128:$rhs)),
+                    (v4f32 (fadd VR128:$lhs, VR128:$rhs)), (i32 10))),
+            (VADDSUBPSrr VR128:$lhs, VR128:$rhs)>;
+  def : Pat<(v2f64 (X86Blendi (v2f64 (fsub VR128:$lhs, VR128:$rhs)),
+                    (v2f64 (fadd VR128:$lhs, VR128:$rhs)), (i32 2))),
+            (VADDSUBPDrr VR128:$lhs, VR128:$rhs)>;
+  def : Pat<(v2f64 (X86Movsd (v2f64 (fadd VR128:$lhs, VR128:$rhs)),
+                   (v2f64 (fsub VR128:$lhs, VR128:$rhs)))),
+            (VADDSUBPDrr VR128:$lhs, VR128:$rhs)>;
+}
+
+let Predicates = [UseSSE3] in {
+  // Constant 10 corresponds to the binary mask '1010'.
+  // In the pattern below, it is used as a blend mask to select:
+  // - the 1st and 3rd element from the first input vector (the fsub node);
+  // - the 2nd and 4th element from the second input vector (the fadd node).
+ + def : Pat<(v4f32 (X86Blendi (v4f32 (fsub VR128:$lhs, VR128:$rhs)), + (v4f32 (fadd VR128:$lhs, VR128:$rhs)), (i32 10))), + (ADDSUBPSrr VR128:$lhs, VR128:$rhs)>; + + def : Pat<(v2f64 (X86Blendi (v2f64 (fsub VR128:$lhs, VR128:$rhs)), + (v2f64 (fadd VR128:$lhs, VR128:$rhs)), (i32 2))), + (ADDSUBPDrr VR128:$lhs, VR128:$rhs)>; + def : Pat<(v2f64 (X86Movsd (v2f64 (fadd VR128:$lhs, VR128:$rhs)), + (v2f64 (fsub VR128:$lhs, VR128:$rhs)))), + (ADDSUBPDrr VR128:$lhs, VR128:$rhs)>; +} + //===---------------------------------------------------------------------===// // SSE3 Instructions //===---------------------------------------------------------------------===// @@ -7053,8 +7223,6 @@ multiclass SS48I_binop_rm2 opc, string OpcodeStr, SDNode OpNode, let Predicates = [HasAVX] in { let isCommutable = 0 in - defm VPACKUSDW : SS41I_binop_rm_int<0x2B, "vpackusdw", int_x86_sse41_packusdw, - 0, DEFAULT_ITINS_SHUFFLESCHED>, VEX_4V; defm VPMINSB : SS48I_binop_rm<0x38, "vpminsb", X86smin, v16i8, VR128, loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>, VEX_4V; @@ -7086,9 +7254,6 @@ let Predicates = [HasAVX] in { let Predicates = [HasAVX2] in { let isCommutable = 0 in - defm VPACKUSDW : SS41I_binop_rm_int_y<0x2B, "vpackusdw", - int_x86_avx2_packusdw, WriteShuffle>, - VEX_4V, VEX_L; defm VPMINSBY : SS48I_binop_rm<0x38, "vpminsb", X86smin, v32i8, VR256, loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>, VEX_4V, VEX_L; @@ -7120,8 +7285,6 @@ let Predicates = [HasAVX2] in { let Constraints = "$src1 = $dst" in { let isCommutable = 0 in - defm PACKUSDW : SS41I_binop_rm_int<0x2B, "packusdw", int_x86_sse41_packusdw, - 1, DEFAULT_ITINS_SHUFFLESCHED>; defm PMINSB : SS48I_binop_rm<0x38, "pminsb", X86smin, v16i8, VR128, memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>; defm PMINSD : SS48I_binop_rm<0x39, "pminsd", X86smin, v4i32, VR128, @@ -8375,6 +8538,21 @@ let Predicates = [HasF16C] in { (VCVTPH2PSrm addr:$src)>; } +// Patterns for matching conversions from float to half-float and vice versa. 
+let Predicates = [HasF16C] in { + def : Pat<(fp_to_f16 FR32:$src), + (i16 (EXTRACT_SUBREG (VMOVPDI2DIrr (VCVTPS2PHrr + (COPY_TO_REGCLASS FR32:$src, VR128), 0)), sub_16bit))>; + + def : Pat<(f16_to_fp GR16:$src), + (f32 (COPY_TO_REGCLASS (VCVTPH2PSrr + (COPY_TO_REGCLASS (MOVSX32rr16 GR16:$src), VR128)), FR32)) >; + + def : Pat<(f16_to_fp (i16 (fp_to_f16 FR32:$src))), + (f32 (COPY_TO_REGCLASS (VCVTPH2PSrr + (VCVTPS2PHrr (COPY_TO_REGCLASS FR32:$src, VR128), 0)), FR32)) >; +} + //===----------------------------------------------------------------------===// // AVX2 Instructions //===----------------------------------------------------------------------===// diff --git a/lib/Target/X86/X86InstrSystem.td b/lib/Target/X86/X86InstrSystem.td index b5595cbd3bbe..540278021f02 100644 --- a/lib/Target/X86/X86InstrSystem.td +++ b/lib/Target/X86/X86InstrSystem.td @@ -439,7 +439,10 @@ def LLDT16m : I<0x00, MRM2m, (outs), (ins i16mem:$src), let SchedRW = [WriteSystem] in { def WRMSR : I<0x30, RawFrm, (outs), (ins), "wrmsr", [], IIC_WRMSR>, TB; def RDMSR : I<0x32, RawFrm, (outs), (ins), "rdmsr", [], IIC_RDMSR>, TB; -def RDPMC : I<0x33, RawFrm, (outs), (ins), "rdpmc", [], IIC_RDPMC>, TB; + +let Defs = [RAX, RDX], Uses = [ECX] in + def RDPMC : I<0x33, RawFrm, (outs), (ins), "rdpmc", [(X86rdpmc)], IIC_RDPMC>, + TB; def SMSW16r : I<0x01, MRM4r, (outs GR16:$dst), (ins), "smsw{w}\t$dst", [], IIC_SMSW>, OpSize16, TB; diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp index 0190080b935b..2bd70a96c432 100644 --- a/lib/Target/X86/X86MCInstLower.cpp +++ b/lib/Target/X86/X86MCInstLower.cpp @@ -13,6 +13,7 @@ //===----------------------------------------------------------------------===// #include "X86AsmPrinter.h" +#include "X86RegisterInfo.h" #include "InstPrinter/X86ATTInstPrinter.h" #include "MCTargetDesc/X86BaseInfo.h" #include "llvm/ADT/SmallString.h" @@ -779,6 +780,9 @@ static void LowerPATCHPOINT(MCStreamer &OS, StackMaps &SM, void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { X86MCInstLower MCInstLowering(*MF, *this); + const X86RegisterInfo *RI = + static_cast(TM.getRegisterInfo()); + switch (MI->getOpcode()) { case TargetOpcode::DBG_VALUE: llvm_unreachable("Should be handled target independently"); @@ -883,6 +887,37 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { .addReg(X86::R10) .addReg(X86::RAX)); return; + + case X86::SEH_PushReg: + OutStreamer.EmitWinCFIPushReg(RI->getSEHRegNum(MI->getOperand(0).getImm())); + return; + + case X86::SEH_SaveReg: + OutStreamer.EmitWinCFISaveReg(RI->getSEHRegNum(MI->getOperand(0).getImm()), + MI->getOperand(1).getImm()); + return; + + case X86::SEH_SaveXMM: + OutStreamer.EmitWinCFISaveXMM(RI->getSEHRegNum(MI->getOperand(0).getImm()), + MI->getOperand(1).getImm()); + return; + + case X86::SEH_StackAlloc: + OutStreamer.EmitWinCFIAllocStack(MI->getOperand(0).getImm()); + return; + + case X86::SEH_SetFrame: + OutStreamer.EmitWinCFISetFrame(RI->getSEHRegNum(MI->getOperand(0).getImm()), + MI->getOperand(1).getImm()); + return; + + case X86::SEH_PushFrame: + OutStreamer.EmitWinCFIPushFrame(MI->getOperand(0).getImm()); + return; + + case X86::SEH_EndPrologue: + OutStreamer.EmitWinCFIEndProlog(); + return; } MCInst TmpInst; diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp index 236e1a4d949a..e8a7e84bb7e3 100644 --- a/lib/Target/X86/X86RegisterInfo.cpp +++ b/lib/Target/X86/X86RegisterInfo.cpp @@ -81,21 +81,6 @@ X86RegisterInfo::X86RegisterInfo(const X86Subtarget &STI) BasePtr = Is64Bit ? 
X86::RBX : X86::ESI; } -/// getCompactUnwindRegNum - This function maps the register to the number for -/// compact unwind encoding. Return -1 if the register isn't valid. -int X86RegisterInfo::getCompactUnwindRegNum(unsigned RegNum, bool isEH) const { - switch (getLLVMRegNum(RegNum, isEH)) { - case X86::EBX: case X86::RBX: return 1; - case X86::ECX: case X86::R12: return 2; - case X86::EDX: case X86::R13: return 3; - case X86::EDI: case X86::R14: return 4; - case X86::ESI: case X86::R15: return 5; - case X86::EBP: case X86::RBP: return 6; - } - - return -1; -} - bool X86RegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const { // ExeDepsFixer and PostRAScheduler require liveness. diff --git a/lib/Target/X86/X86RegisterInfo.h b/lib/Target/X86/X86RegisterInfo.h index ba346c85a7f9..74efd1fe32d7 100644 --- a/lib/Target/X86/X86RegisterInfo.h +++ b/lib/Target/X86/X86RegisterInfo.h @@ -60,10 +60,6 @@ class X86RegisterInfo final : public X86GenRegisterInfo { // FIXME: This should be tablegen'd like getDwarfRegNum is int getSEHRegNum(unsigned i) const; - /// getCompactUnwindRegNum - This function maps the register to the number for - /// compact unwind encoding. Return -1 if the register isn't valid. - int getCompactUnwindRegNum(unsigned RegNum, bool isEH) const override; - /// Code Generation virtual methods... /// bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override; diff --git a/lib/Target/X86/X86RegisterInfo.td b/lib/Target/X86/X86RegisterInfo.td index 33c402b69a4a..0da98637496e 100644 --- a/lib/Target/X86/X86RegisterInfo.td +++ b/lib/Target/X86/X86RegisterInfo.td @@ -449,7 +449,7 @@ def FPCCR : RegisterClass<"X86", [i16], 16, (add FPSW)> { } // AVX-512 vector/mask registers. -def VR512 : RegisterClass<"X86", [v16f32, v8f64, v16i32, v8i64], 512, +def VR512 : RegisterClass<"X86", [v16f32, v8f64, v64i8, v32i16, v16i32, v8i64], 512, (sequence "ZMM%u", 0, 31)>; // Scalar AVX-512 floating point registers. 
@@ -463,13 +463,19 @@ def VR128X : RegisterClass<"X86", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], def VR256X : RegisterClass<"X86", [v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], 256, (sequence "YMM%u", 0, 31)>; -// The size of the all masked registers is 16 bit because we have only one -// KMOVW istruction that can store this register in memory, and it writes 2 bytes -def VK1 : RegisterClass<"X86", [i1], 16, (sequence "K%u", 0, 7)>; -def VK8 : RegisterClass<"X86", [v8i1], 16, (add VK1)> {let Size = 16;} +// Mask registers +def VK1 : RegisterClass<"X86", [i1], 16, (sequence "K%u", 0, 7)> {let Size = 16;} +def VK2 : RegisterClass<"X86", [v2i1], 16, (add VK1)> {let Size = 16;} +def VK4 : RegisterClass<"X86", [v4i1], 16, (add VK2)> {let Size = 16;} +def VK8 : RegisterClass<"X86", [v8i1], 16, (add VK4)> {let Size = 16;} def VK16 : RegisterClass<"X86", [v16i1], 16, (add VK8)> {let Size = 16;} +def VK32 : RegisterClass<"X86", [v32i1], 32, (add VK16)> {let Size = 32;} +def VK64 : RegisterClass<"X86", [v64i1], 64, (add VK32)> {let Size = 64;} def VK1WM : RegisterClass<"X86", [i1], 16, (sub VK1, K0)> {let Size = 16;} +def VK2WM : RegisterClass<"X86", [v2i1], 16, (sub VK2, K0)> {let Size = 16;} +def VK4WM : RegisterClass<"X86", [v4i1], 16, (sub VK4, K0)> {let Size = 16;} def VK8WM : RegisterClass<"X86", [v8i1], 16, (sub VK8, K0)> {let Size = 16;} -def VK16WM : RegisterClass<"X86", [v16i1], 16, (add VK8WM)>; - +def VK16WM : RegisterClass<"X86", [v16i1], 16, (add VK8WM)> {let Size = 16;} +def VK32WM : RegisterClass<"X86", [v32i1], 32, (add VK16WM)> {let Size = 32;} +def VK64WM : RegisterClass<"X86", [v64i1], 64, (add VK32WM)> {let Size = 64;} diff --git a/lib/Target/X86/X86Schedule.td b/lib/Target/X86/X86Schedule.td index 25c5a6bfa1f7..b76850aa1c8b 100644 --- a/lib/Target/X86/X86Schedule.td +++ b/lib/Target/X86/X86Schedule.td @@ -633,6 +633,7 @@ def GenericModel : SchedMachineModel { let MicroOpBufferSize = 32; let LoadLatency = 4; let HighLatency = 10; + let PostRAScheduler = 0; } include "X86ScheduleAtom.td" diff --git a/lib/Target/X86/X86ScheduleAtom.td b/lib/Target/X86/X86ScheduleAtom.td index 3256ee7c6e49..c8820aa2d8df 100644 --- a/lib/Target/X86/X86ScheduleAtom.td +++ b/lib/Target/X86/X86ScheduleAtom.td @@ -538,6 +538,7 @@ def AtomModel : SchedMachineModel { // On the Atom, the throughput for taken branches is 2 cycles. For small // simple loops, expand by a small factor to hide the backedge cost. let LoopMicroOpBufferSize = 10; + let PostRAScheduler = 1; let Itineraries = AtomItineraries; } diff --git a/lib/Target/X86/X86ScheduleSLM.td b/lib/Target/X86/X86ScheduleSLM.td index 823d10140e3c..90d858788124 100644 --- a/lib/Target/X86/X86ScheduleSLM.td +++ b/lib/Target/X86/X86ScheduleSLM.td @@ -19,6 +19,7 @@ def SLMModel : SchedMachineModel { let MicroOpBufferSize = 32; // Based on the reorder buffer. let LoadLatency = 3; let MispredictPenalty = 10; + let PostRAScheduler = 1; // For small loops, expand by a small factor to hide the backedge cost. 
let LoopMicroOpBufferSize = 10; diff --git a/lib/Target/X86/X86SelectionDAGInfo.cpp b/lib/Target/X86/X86SelectionDAGInfo.cpp index b6ecdf8ecf63..a83dd9b2eea7 100644 --- a/lib/Target/X86/X86SelectionDAGInfo.cpp +++ b/lib/Target/X86/X86SelectionDAGInfo.cpp @@ -66,7 +66,8 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl).setChain(Chain) .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()), - DAG.getExternalSymbol(bzeroEntry, IntPtr), &Args, 0) + DAG.getExternalSymbol(bzeroEntry, IntPtr), std::move(Args), + 0) .setDiscardResult(); std::pair CallResult = DAG.getTargetLoweringInfo().LowerCallTo(CLI); diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp index 79b7e68c320b..41551a1d677f 100644 --- a/lib/Target/X86/X86Subtarget.cpp +++ b/lib/Target/X86/X86Subtarget.cpp @@ -219,9 +219,6 @@ void X86Subtarget::resetSubtargetFeatures(StringRef CPU, StringRef FS) { // Make sure the right MCSchedModel is used. InitCPUSchedModel(CPUName); - if (X86ProcFamily == IntelAtom || X86ProcFamily == IntelSLM) - PostRAScheduler = true; - InstrItins = getInstrItineraryForCPU(CPUName); // It's important to keep the MCSubtargetInfo feature bits in sync with @@ -275,6 +272,9 @@ void X86Subtarget::initializeEnvironment() { HasERI = false; HasCDI = false; HasPFI = false; + HasDQI = false; + HasBWI = false; + HasVLX = false; HasADX = false; HasSHA = false; HasPRFCHW = false; @@ -286,7 +286,6 @@ void X86Subtarget::initializeEnvironment() { HasCmpxchg16b = false; UseLeaForSP = false; HasSlowDivide = false; - PostRAScheduler = false; PadShortFunctions = false; CallRegIndirect = false; LEAUsesAG = false; @@ -359,16 +358,7 @@ X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU, is64Bit() ? -8 : -4), JITInfo(hasSSE1()) {} -bool -X86Subtarget::enablePostRAScheduler(CodeGenOpt::Level OptLevel, - TargetSubtargetInfo::AntiDepBreakMode &Mode, - RegClassVector &CriticalPathRCs) const { - Mode = TargetSubtargetInfo::ANTIDEP_CRITICAL; - CriticalPathRCs.clear(); - return PostRAScheduler && OptLevel >= CodeGenOpt::Default; -} - -bool -X86Subtarget::enableEarlyIfConversion() const { +bool X86Subtarget::enableEarlyIfConversion() const { return hasCMov() && X86EarlyIfConv; } + diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h index 09db0ebc5a9b..5f5df5e0818c 100644 --- a/lib/Target/X86/X86Subtarget.h +++ b/lib/Target/X86/X86Subtarget.h @@ -170,9 +170,6 @@ class X86Subtarget final : public X86GenSubtargetInfo { /// full divides and should be used when possible. bool HasSlowDivide; - /// PostRAScheduler - True if using post-register-allocation scheduler. - bool PostRAScheduler; - /// PadShortFunctions - True if the short functions should be padded to prevent /// a stall when returning too early. 
bool PadShortFunctions; @@ -192,13 +189,22 @@ class X86Subtarget final : public X86GenSubtargetInfo { /// Processor has AVX-512 PreFetch Instructions bool HasPFI; - + /// Processor has AVX-512 Exponential and Reciprocal Instructions bool HasERI; - + /// Processor has AVX-512 Conflict Detection Instructions bool HasCDI; - + + /// Processor has AVX-512 Doubleword and Quadword instructions + bool HasDQI; + + /// Processor has AVX-512 Byte and Word instructions + bool HasBWI; + + /// Processor has AVX-512 Vector Length eXtensions + bool HasVLX; + /// stackAlignment - The minimum alignment known to hold of the stack frame on /// entry to the function and which must be maintained by every function. unsigned stackAlignment; @@ -352,6 +358,9 @@ class X86Subtarget final : public X86GenSubtargetInfo { bool hasCDI() const { return HasCDI; } bool hasPFI() const { return HasPFI; } bool hasERI() const { return HasERI; } + bool hasDQI() const { return HasDQI; } + bool hasBWI() const { return HasBWI; } + bool hasVLX() const { return HasVLX; } bool isAtom() const { return X86ProcFamily == IntelAtom; } bool isSLM() const { return X86ProcFamily == IntelSLM; } @@ -453,18 +462,15 @@ class X86Subtarget final : public X86GenSubtargetInfo { /// Enable the MachineScheduler pass for all X86 subtargets. bool enableMachineScheduler() const override { return true; } - /// enablePostRAScheduler - run for Atom optimization. - bool enablePostRAScheduler(CodeGenOpt::Level OptLevel, - TargetSubtargetInfo::AntiDepBreakMode& Mode, - RegClassVector& CriticalPathRCs) const override; - - bool postRAScheduler() const { return PostRAScheduler; } - bool enableEarlyIfConversion() const override; /// getInstrItins - Return the instruction itineraries based on the /// subtarget selection. const InstrItineraryData &getInstrItineraryData() const { return InstrItins; } + + AntiDepBreakMode getAntiDepBreakMode() const override { + return TargetSubtargetInfo::ANTIDEP_CRITICAL; + } }; } // End llvm namespace diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp index 74ce31a15d5c..f12140f1f161 100644 --- a/lib/Target/X86/X86TargetMachine.cpp +++ b/lib/Target/X86/X86TargetMachine.cpp @@ -111,6 +111,7 @@ class X86PassConfig : public TargetPassConfig { return *getX86TargetMachine().getSubtargetImpl(); } + void addIRPasses() override; bool addInstSelector() override; bool addILPOpts() override; bool addPreRegAlloc() override; @@ -123,6 +124,12 @@ TargetPassConfig *X86TargetMachine::createPassConfig(PassManagerBase &PM) { return new X86PassConfig(this, PM); } +void X86PassConfig::addIRPasses() { + addPass(createX86AtomicExpandPass(&getX86TargetMachine())); + + TargetPassConfig::addIRPasses(); +} + bool X86PassConfig::addInstSelector() { // Install an instruction selector.
addPass(createX86ISelDag(getX86TargetMachine(), getOptLevel())); diff --git a/lib/Target/X86/X86TargetMachine.h b/lib/Target/X86/X86TargetMachine.h index 87ddaf42a71e..41d51570b9ab 100644 --- a/lib/Target/X86/X86TargetMachine.h +++ b/lib/Target/X86/X86TargetMachine.h @@ -13,11 +13,7 @@ #ifndef X86TARGETMACHINE_H #define X86TARGETMACHINE_H - -#include "X86FrameLowering.h" -#include "X86ISelLowering.h" #include "X86InstrInfo.h" -#include "X86JITInfo.h" #include "X86Subtarget.h" #include "llvm/IR/DataLayout.h" #include "llvm/Target/TargetMachine.h" diff --git a/lib/Target/X86/X86TargetObjectFile.cpp b/lib/Target/X86/X86TargetObjectFile.cpp index 8157085feaec..f8bcd616e111 100644 --- a/lib/Target/X86/X86TargetObjectFile.cpp +++ b/lib/Target/X86/X86TargetObjectFile.cpp @@ -8,10 +8,12 @@ //===----------------------------------------------------------------------===// #include "X86TargetObjectFile.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/IR/Mangler.h" #include "llvm/IR/Operator.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCSectionCOFF.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/Support/Dwarf.h" #include "llvm/Target/TargetLowering.h" @@ -106,3 +108,64 @@ const MCExpr *X86WindowsTargetObjectFile::getExecutableRelativeSymbol( MCSymbolRefExpr::VK_COFF_IMGREL32, getContext()); } + +static std::string APIntToHexString(const APInt &AI) { + unsigned Width = (AI.getBitWidth() / 8) * 2; + std::string HexString = utohexstr(AI.getLimitedValue(), /*LowerCase=*/true); + unsigned Size = HexString.size(); + assert(Width >= Size && "hex string is too large!"); + HexString.insert(HexString.begin(), Width - Size, '0'); + + return HexString; +} + + +static std::string scalarConstantToHexString(const Constant *C) { + Type *Ty = C->getType(); + APInt AI; + if (isa(C)) { + AI = APInt(Ty->getPrimitiveSizeInBits(), /*val=*/0); + } else if (Ty->isFloatTy() || Ty->isDoubleTy()) { + const auto *CFP = cast(C); + AI = CFP->getValueAPF().bitcastToAPInt(); + } else if (Ty->isIntegerTy()) { + const auto *CI = cast(C); + AI = CI->getValue(); + } else { + llvm_unreachable("unexpected constant pool element type!"); + } + return APIntToHexString(AI); +} + +const MCSection * +X86WindowsTargetObjectFile::getSectionForConstant(SectionKind Kind, + const Constant *C) const { + if (Kind.isReadOnly()) { + if (C) { + Type *Ty = C->getType(); + SmallString<32> COMDATSymName; + if (Ty->isFloatTy() || Ty->isDoubleTy()) { + COMDATSymName = "__real@"; + COMDATSymName += scalarConstantToHexString(C); + } else if (const auto *VTy = dyn_cast(Ty)) { + uint64_t NumBits = VTy->getBitWidth(); + if (NumBits == 128 || NumBits == 256) { + COMDATSymName = NumBits == 128 ? 
"__xmm@" : "__ymm@"; + for (int I = VTy->getNumElements() - 1, E = -1; I != E; --I) + COMDATSymName += + scalarConstantToHexString(C->getAggregateElement(I)); + } + } + if (!COMDATSymName.empty()) { + unsigned Characteristics = COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | + COFF::IMAGE_SCN_MEM_READ | + COFF::IMAGE_SCN_LNK_COMDAT; + return getContext().getCOFFSection(".rdata", Characteristics, Kind, + COMDATSymName, + COFF::IMAGE_COMDAT_SELECT_ANY); + } + } + } + + return TargetLoweringObjectFile::getSectionForConstant(Kind, C); +} diff --git a/lib/Target/X86/X86TargetObjectFile.h b/lib/Target/X86/X86TargetObjectFile.h index a08ed09ffb9c..4a10b7ea6b42 100644 --- a/lib/Target/X86/X86TargetObjectFile.h +++ b/lib/Target/X86/X86TargetObjectFile.h @@ -46,6 +46,11 @@ namespace llvm { const MCExpr * getExecutableRelativeSymbol(const ConstantExpr *CE, Mangler &Mang, const TargetMachine &TM) const override; + + /// \brief Given a mergeable constant with the specified size and relocation + /// information, return a section that it should be placed in. + const MCSection *getSectionForConstant(SectionKind Kind, + const Constant *C) const override; }; } // end namespace llvm diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp index 299f9a581b8d..c961e2f5b2c8 100644 --- a/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/lib/Target/X86/X86TargetTransformInfo.cpp @@ -144,13 +144,17 @@ unsigned X86TTI::getNumberOfRegisters(bool Vector) const { if (Vector && !ST->hasSSE1()) return 0; - if (ST->is64Bit()) + if (ST->is64Bit()) { + if (Vector && ST->hasAVX512()) + return 32; return 16; + } return 8; } unsigned X86TTI::getRegisterBitWidth(bool Vector) const { if (Vector) { + if (ST->hasAVX512()) return 512; if (ST->hasAVX()) return 256; if (ST->hasSSE1()) return 128; return 0; @@ -402,17 +406,117 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty, unsigned X86TTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index, Type *SubTp) const { - // We only estimate the cost of reverse shuffles. - if (Kind != SK_Reverse) + // We only estimate the cost of reverse and alternate shuffles. + if (Kind != SK_Reverse && Kind != SK_Alternate) return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp); - std::pair LT = TLI->getTypeLegalizationCost(Tp); - unsigned Cost = 1; - if (LT.second.getSizeInBits() > 128) - Cost = 3; // Extract + insert + copy. + if (Kind == SK_Reverse) { + std::pair LT = TLI->getTypeLegalizationCost(Tp); + unsigned Cost = 1; + if (LT.second.getSizeInBits() > 128) + Cost = 3; // Extract + insert + copy. + + // Multiple by the number of parts. + return Cost * LT.first; + } + + if (Kind == SK_Alternate) { + // 64-bit packed float vectors (v2f32) are widened to type v4f32. + // 64-bit packed integer vectors (v2i32) are promoted to type v2i64. + std::pair LT = TLI->getTypeLegalizationCost(Tp); + + // The backend knows how to generate a single VEX.256 version of + // instruction VPBLENDW if the target supports AVX2. 
+ if (ST->hasAVX2() && LT.second == MVT::v16i16) + return LT.first; + + static const CostTblEntry AVXAltShuffleTbl[] = { + {ISD::VECTOR_SHUFFLE, MVT::v4i64, 1}, // vblendpd + {ISD::VECTOR_SHUFFLE, MVT::v4f64, 1}, // vblendpd + + {ISD::VECTOR_SHUFFLE, MVT::v8i32, 1}, // vblendps + {ISD::VECTOR_SHUFFLE, MVT::v8f32, 1}, // vblendps + + // This shuffle is custom lowered into a sequence of: + // 2x vextractf128 , 2x vpblendw , 1x vinsertf128 + {ISD::VECTOR_SHUFFLE, MVT::v16i16, 5}, + + // This shuffle is custom lowered into a long sequence of: + // 2x vextractf128 , 4x vpshufb , 2x vpor , 1x vinsertf128 + {ISD::VECTOR_SHUFFLE, MVT::v32i8, 9} + }; + + if (ST->hasAVX()) { + int Idx = CostTableLookup(AVXAltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second); + if (Idx != -1) + return LT.first * AVXAltShuffleTbl[Idx].Cost; + } + + static const CostTblEntry SSE41AltShuffleTbl[] = { + // These are lowered into movsd. + {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, + + // packed float vectors with four elements are lowered into BLENDI dag + // nodes. A v4i32/v4f32 BLENDI generates a single 'blendps'/'blendpd'. + {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1}, + + // This shuffle generates a single pshufw. + {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1}, + + // There is no instruction that matches a v16i8 alternate shuffle. + // The backend will expand it into the sequence 'pshufb + pshufb + or'. + {ISD::VECTOR_SHUFFLE, MVT::v16i8, 3} + }; + + if (ST->hasSSE41()) { + int Idx = CostTableLookup(SSE41AltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second); + if (Idx != -1) + return LT.first * SSE41AltShuffleTbl[Idx].Cost; + } + + static const CostTblEntry SSSE3AltShuffleTbl[] = { + {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, // movsd + {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, // movsd + + // SSE3 doesn't have 'blendps'. The following shuffles are expanded into + // the sequence 'shufps + pshufd' + {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2}, + {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2}, + + {ISD::VECTOR_SHUFFLE, MVT::v8i16, 3}, // pshufb + pshufb + or + {ISD::VECTOR_SHUFFLE, MVT::v16i8, 3} // pshufb + pshufb + or + }; + + if (ST->hasSSSE3()) { + int Idx = CostTableLookup(SSSE3AltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second); + if (Idx != -1) + return LT.first * SSSE3AltShuffleTbl[Idx].Cost; + } + + static const CostTblEntry SSEAltShuffleTbl[] = { + {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, // movsd + {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, // movsd + + {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2}, // shufps + pshufd + {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2}, // shufps + pshufd + + // This is expanded into a long sequence of four extract + four insert. + {ISD::VECTOR_SHUFFLE, MVT::v8i16, 8}, // 4 x pextrw + 4 pinsrw. + + // 8 x (pinsrw + pextrw + and + movb + movzb + or) + {ISD::VECTOR_SHUFFLE, MVT::v16i8, 48} + }; + + // Fall-back (SSE3 and SSE2). + int Idx = CostTableLookup(SSEAltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second); + if (Idx != -1) + return LT.first * SSEAltShuffleTbl[Idx].Cost; + return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp); + } - // Multiple by the number of parts. 
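// [Editor's sketch -- not part of the patch] What the SK_Alternate entries
// costed above describe: the result takes even lanes from the first source
// and odd lanes from the second (mask <0,5,2,7> for four lanes), which
// SSE4.1 and later can lower to a single blend. A plain C++ model of that
// shuffle; purely illustrative, with invented names.

#include <array>
#include <cstddef>
#include <cstdio>

template <std::size_t N>
static std::array<int, N> alternateShuffle(const std::array<int, N> &L,
                                           const std::array<int, N> &R) {
  std::array<int, N> Out{};
  for (std::size_t I = 0; I < N; ++I)
    Out[I] = (I % 2 == 0) ? L[I] : R[I]; // even lane from L, odd lane from R
  return Out;
}

int main() {
  std::array<int, 4> L{0, 1, 2, 3}, R{10, 11, 12, 13};
  auto O = alternateShuffle(L, R); // {0, 11, 2, 13}, i.e. mask <0,5,2,7>
  std::printf("%d %d %d %d\n", O[0], O[1], O[2], O[3]);
  return 0;
}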
- return Cost * LT.first; + return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp); } unsigned X86TTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const { diff --git a/lib/Target/XCore/XCoreFrameLowering.cpp b/lib/Target/XCore/XCoreFrameLowering.cpp index 5499aba351c5..e6947369c2d7 100644 --- a/lib/Target/XCore/XCoreFrameLowering.cpp +++ b/lib/Target/XCore/XCoreFrameLowering.cpp @@ -228,7 +228,9 @@ void XCoreFrameLowering::emitPrologue(MachineFunction &MF) const { const XCoreInstrInfo &TII = *static_cast(MF.getTarget().getInstrInfo()); XCoreFunctionInfo *XFI = MF.getInfo(); - DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); + // Debug location must be unknown since the first debug location is used + // to determine the end of the prologue. + DebugLoc dl; if (MFI->getMaxAlignment() > getStackAlignment()) report_fatal_error("emitPrologue unsupported alignment: " @@ -416,7 +418,7 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB, bool emitFrameMoves = XCoreRegisterInfo::needsFrameMoves(*MF); DebugLoc DL; - if (MI != MBB.end()) + if (MI != MBB.end() && !MI->isDebugValue()) DL = MI->getDebugLoc(); for (std::vector::const_iterator it = CSI.begin(); diff --git a/lib/Target/XCore/XCoreISelLowering.cpp b/lib/Target/XCore/XCoreISelLowering.cpp index 5e763aead97a..be7ef6420193 100644 --- a/lib/Target/XCore/XCoreISelLowering.cpp +++ b/lib/Target/XCore/XCoreISelLowering.cpp @@ -68,10 +68,9 @@ getTargetNodeName(unsigned Opcode) const } } -XCoreTargetLowering::XCoreTargetLowering(XCoreTargetMachine &XTM) - : TargetLowering(XTM, new XCoreTargetObjectFile()), - TM(XTM), - Subtarget(*XTM.getSubtargetImpl()) { +XCoreTargetLowering::XCoreTargetLowering(const TargetMachine &TM) + : TargetLowering(TM, new XCoreTargetObjectFile()), TM(TM), + Subtarget(TM.getSubtarget()) { // Set up the register classes. 
addRegisterClass(MVT::i32, &XCore::GRRegsRegClass); @@ -492,7 +491,7 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG) const { CLI.setDebugLoc(DL).setChain(Chain) .setCallee(CallingConv::C, IntPtrTy, DAG.getExternalSymbol("__misaligned_load", getPointerTy()), - &Args, 0); + std::move(Args), 0); std::pair CallResult = LowerCallTo(CLI); SDValue Ops[] = { CallResult.first, CallResult.second }; @@ -552,7 +551,7 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG) const CLI.setDebugLoc(dl).setChain(Chain) .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()), DAG.getExternalSymbol("__misaligned_store", getPointerTy()), - &Args, 0); + std::move(Args), 0); std::pair CallResult = LowerCallTo(CLI); return CallResult.second; diff --git a/lib/Target/XCore/XCoreISelLowering.h b/lib/Target/XCore/XCoreISelLowering.h index 81d91875100b..62b89c348dc7 100644 --- a/lib/Target/XCore/XCoreISelLowering.h +++ b/lib/Target/XCore/XCoreISelLowering.h @@ -94,7 +94,7 @@ namespace llvm { { public: - explicit XCoreTargetLowering(XCoreTargetMachine &TM); + explicit XCoreTargetLowering(const TargetMachine &TM); using TargetLowering::isZExtFree; bool isZExtFree(SDValue Val, EVT VT2) const override; @@ -123,7 +123,7 @@ namespace llvm { bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const override; private: - const XCoreTargetMachine &TM; + const TargetMachine &TM; const XCoreSubtarget &Subtarget; // Lower Operand helpers diff --git a/lib/Target/XCore/XCoreInstrInfo.cpp b/lib/Target/XCore/XCoreInstrInfo.cpp index 984f0cd9c4d3..36ea9a087da5 100644 --- a/lib/Target/XCore/XCoreInstrInfo.cpp +++ b/lib/Target/XCore/XCoreInstrInfo.cpp @@ -373,7 +373,8 @@ void XCoreInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, const TargetRegisterInfo *TRI) const { DebugLoc DL; - if (I != MBB.end()) DL = I->getDebugLoc(); + if (I != MBB.end() && !I->isDebugValue()) + DL = I->getDebugLoc(); MachineFunction *MF = MBB.getParent(); const MachineFrameInfo &MFI = *MF->getFrameInfo(); MachineMemOperand *MMO = @@ -395,7 +396,8 @@ void XCoreInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, const TargetRegisterInfo *TRI) const { DebugLoc DL; - if (I != MBB.end()) DL = I->getDebugLoc(); + if (I != MBB.end() && !I->isDebugValue()) + DL = I->getDebugLoc(); MachineFunction *MF = MBB.getParent(); const MachineFrameInfo &MFI = *MF->getFrameInfo(); MachineMemOperand *MMO = @@ -440,7 +442,8 @@ MachineBasicBlock::iterator XCoreInstrInfo::loadImmediate( MachineBasicBlock::iterator MI, unsigned Reg, uint64_t Value) const { DebugLoc dl; - if (MI != MBB.end()) dl = MI->getDebugLoc(); + if (MI != MBB.end() && !MI->isDebugValue()) + dl = MI->getDebugLoc(); if (isImmMskBitp(Value)) { int N = Log2_32(Value) + 1; return BuildMI(MBB, MI, dl, get(XCore::MKMSK_rus), Reg).addImm(N); diff --git a/lib/Target/XCore/XCoreSelectionDAGInfo.cpp b/lib/Target/XCore/XCoreSelectionDAGInfo.cpp index b72c520d84a5..91b33fd6559c 100644 --- a/lib/Target/XCore/XCoreSelectionDAGInfo.cpp +++ b/lib/Target/XCore/XCoreSelectionDAGInfo.cpp @@ -16,8 +16,8 @@ using namespace llvm; #define DEBUG_TYPE "xcore-selectiondag-info" -XCoreSelectionDAGInfo::XCoreSelectionDAGInfo(const XCoreTargetMachine &TM) - : TargetSelectionDAGInfo(TM.getDataLayout()) {} +XCoreSelectionDAGInfo::XCoreSelectionDAGInfo(const DataLayout &DL) + : TargetSelectionDAGInfo(&DL) {} XCoreSelectionDAGInfo::~XCoreSelectionDAGInfo() { } @@ -46,7 +46,7 @@ EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, SDValue Chain, .setCallee(TLI.getLibcallCallingConv(RTLIB::MEMCPY), Type::getVoidTy(*DAG.getContext()), 
DAG.getExternalSymbol("__memcpy_4", TLI.getPointerTy()), - &Args, 0) + std::move(Args), 0) .setDiscardResult(); std::pair CallResult = TLI.LowerCallTo(CLI); diff --git a/lib/Target/XCore/XCoreSelectionDAGInfo.h b/lib/Target/XCore/XCoreSelectionDAGInfo.h index ea6af980c01f..0079de1798b2 100644 --- a/lib/Target/XCore/XCoreSelectionDAGInfo.h +++ b/lib/Target/XCore/XCoreSelectionDAGInfo.h @@ -22,7 +22,7 @@ class XCoreTargetMachine; class XCoreSelectionDAGInfo : public TargetSelectionDAGInfo { public: - explicit XCoreSelectionDAGInfo(const XCoreTargetMachine &TM); + explicit XCoreSelectionDAGInfo(const DataLayout &DL); ~XCoreSelectionDAGInfo(); SDValue diff --git a/lib/Target/XCore/XCoreSubtarget.cpp b/lib/Target/XCore/XCoreSubtarget.cpp index 89ea03a88f6d..7227411ba560 100644 --- a/lib/Target/XCore/XCoreSubtarget.cpp +++ b/lib/Target/XCore/XCoreSubtarget.cpp @@ -25,8 +25,8 @@ using namespace llvm; void XCoreSubtarget::anchor() { } -XCoreSubtarget::XCoreSubtarget(const std::string &TT, - const std::string &CPU, const std::string &FS) - : XCoreGenSubtargetInfo(TT, CPU, FS) -{ -} +XCoreSubtarget::XCoreSubtarget(const std::string &TT, const std::string &CPU, + const std::string &FS, const TargetMachine &TM) + : XCoreGenSubtargetInfo(TT, CPU, FS), + DL("e-m:e-p:32:32-i1:8:32-i8:8:32-i16:16:32-i64:32-f64:32-a:0:32-n32"), + InstrInfo(), FrameLowering(*this), TLInfo(TM), TSInfo(DL) {} diff --git a/lib/Target/XCore/XCoreSubtarget.h b/lib/Target/XCore/XCoreSubtarget.h index 5ac4dbc4bc07..1e9810bb89ef 100644 --- a/lib/Target/XCore/XCoreSubtarget.h +++ b/lib/Target/XCore/XCoreSubtarget.h @@ -14,6 +14,11 @@ #ifndef XCORESUBTARGET_H #define XCORESUBTARGET_H +#include "XCoreFrameLowering.h" +#include "XCoreISelLowering.h" +#include "XCoreInstrInfo.h" +#include "XCoreSelectionDAGInfo.h" +#include "llvm/IR/DataLayout.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetSubtargetInfo.h" #include @@ -26,17 +31,31 @@ class StringRef; class XCoreSubtarget : public XCoreGenSubtargetInfo { virtual void anchor(); + const DataLayout DL; // Calculates type size & alignment + XCoreInstrInfo InstrInfo; + XCoreFrameLowering FrameLowering; + XCoreTargetLowering TLInfo; + XCoreSelectionDAGInfo TSInfo; public: /// This constructor initializes the data members to match that /// of the specified triple. /// XCoreSubtarget(const std::string &TT, const std::string &CPU, - const std::string &FS); + const std::string &FS, const TargetMachine &TM); /// ParseSubtargetFeatures - Parses features string setting specified /// subtarget options. Definition of function is auto generated by tblgen. 
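// [Editor's sketch -- not part of the patch] The XCore hunks around this
// point move ownership of InstrInfo/FrameLowering/TLInfo/TSInfo from the
// TargetMachine into the Subtarget, with the machine forwarding its getters.
// A hypothetical, heavily simplified C++ shape of that delegation pattern:

#include <cstdio>

struct InstrInfo {
  const char *name() const { return "xcore-instrinfo"; }
};

struct Subtarget {
  InstrInfo II; // owned by the subtarget after the refactor
  const InstrInfo *getInstrInfo() const { return &II; }
};

struct TargetMachine {
  Subtarget ST;
  const Subtarget *getSubtargetImpl() const { return &ST; }
  // The machine-level getter now just forwards to the subtarget.
  const InstrInfo *getInstrInfo() const {
    return getSubtargetImpl()->getInstrInfo();
  }
};

int main() {
  TargetMachine TM;
  std::printf("%s\n", TM.getInstrInfo()->name());
  return 0;
}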
void ParseSubtargetFeatures(StringRef CPU, StringRef FS); + + const XCoreInstrInfo *getInstrInfo() const { return &InstrInfo; } + const XCoreFrameLowering *getFrameLowering() const { return &FrameLowering; } + const XCoreTargetLowering *getTargetLowering() const { return &TLInfo; } + const XCoreSelectionDAGInfo *getSelectionDAGInfo() const { return &TSInfo; } + const TargetRegisterInfo *getRegisterInfo() const { + return &InstrInfo.getRegisterInfo(); + } + const DataLayout *getDataLayout() const { return &DL; } }; } // End llvm namespace diff --git a/lib/Target/XCore/XCoreTargetMachine.cpp b/lib/Target/XCore/XCoreTargetMachine.cpp index 0fb21c5d7dfb..8d8bb3800ea5 100644 --- a/lib/Target/XCore/XCoreTargetMachine.cpp +++ b/lib/Target/XCore/XCoreTargetMachine.cpp @@ -25,13 +25,8 @@ XCoreTargetMachine::XCoreTargetMachine(const Target &T, StringRef TT, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) - : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), - Subtarget(TT, CPU, FS), - DL("e-m:e-p:32:32-i1:8:32-i8:8:32-i16:16:32-i64:32-f64:32-a:0:32-n32"), - InstrInfo(), - FrameLowering(Subtarget), - TLInfo(*this), - TSInfo(*this) { + : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), + Subtarget(TT, CPU, FS, *this) { initAsmInfo(); } diff --git a/lib/Target/XCore/XCoreTargetMachine.h b/lib/Target/XCore/XCoreTargetMachine.h index a57ca55f3c10..14c43bf151f4 100644 --- a/lib/Target/XCore/XCoreTargetMachine.h +++ b/lib/Target/XCore/XCoreTargetMachine.h @@ -14,46 +14,38 @@ #ifndef XCORETARGETMACHINE_H #define XCORETARGETMACHINE_H -#include "XCoreFrameLowering.h" -#include "XCoreISelLowering.h" -#include "XCoreInstrInfo.h" -#include "XCoreSelectionDAGInfo.h" #include "XCoreSubtarget.h" -#include "llvm/IR/DataLayout.h" #include "llvm/Target/TargetMachine.h" namespace llvm { class XCoreTargetMachine : public LLVMTargetMachine { XCoreSubtarget Subtarget; - const DataLayout DL; // Calculates type size & alignment - XCoreInstrInfo InstrInfo; - XCoreFrameLowering FrameLowering; - XCoreTargetLowering TLInfo; - XCoreSelectionDAGInfo TSInfo; public: XCoreTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL); - const XCoreInstrInfo *getInstrInfo() const override { return &InstrInfo; } + const XCoreInstrInfo *getInstrInfo() const override { + return getSubtargetImpl()->getInstrInfo(); + } const XCoreFrameLowering *getFrameLowering() const override { - return &FrameLowering; + return getSubtargetImpl()->getFrameLowering(); } const XCoreSubtarget *getSubtargetImpl() const override { return &Subtarget; } const XCoreTargetLowering *getTargetLowering() const override { - return &TLInfo; + return getSubtargetImpl()->getTargetLowering(); } - const XCoreSelectionDAGInfo* getSelectionDAGInfo() const override { - return &TSInfo; + return getSubtargetImpl()->getSelectionDAGInfo(); } - const TargetRegisterInfo *getRegisterInfo() const override { - return &InstrInfo.getRegisterInfo(); + return getSubtargetImpl()->getRegisterInfo(); + } + const DataLayout *getDataLayout() const override { + return getSubtargetImpl()->getDataLayout(); } - const DataLayout *getDataLayout() const override { return &DL; } // Pass Pipeline Configuration TargetPassConfig *createPassConfig(PassManagerBase &PM) override; diff --git a/lib/Target/XCore/XCoreTargetObjectFile.cpp b/lib/Target/XCore/XCoreTargetObjectFile.cpp index ab0f7ad47dcc..cfd3302481e7 100644 --- 
a/lib/Target/XCore/XCoreTargetObjectFile.cpp +++ b/lib/Target/XCore/XCoreTargetObjectFile.cpp @@ -165,8 +165,9 @@ SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind, Mangler &Mang, report_fatal_error("Target does not support TLS or Common sections"); } -const MCSection *XCoreTargetObjectFile:: -getSectionForConstant(SectionKind Kind) const { +const MCSection * +XCoreTargetObjectFile::getSectionForConstant(SectionKind Kind, + const Constant *C) const { if (Kind.isMergeableConst4()) return MergeableConst4Section; if (Kind.isMergeableConst8()) return MergeableConst8Section; if (Kind.isMergeableConst16()) return MergeableConst16Section; diff --git a/lib/Target/XCore/XCoreTargetObjectFile.h b/lib/Target/XCore/XCoreTargetObjectFile.h index 34d756edc3f7..d389e55ae399 100644 --- a/lib/Target/XCore/XCoreTargetObjectFile.h +++ b/lib/Target/XCore/XCoreTargetObjectFile.h @@ -34,7 +34,8 @@ static const unsigned CodeModelLargeSize = 256; Mangler &Mang, const TargetMachine &TM) const override; - const MCSection *getSectionForConstant(SectionKind Kind) const override; + const MCSection *getSectionForConstant(SectionKind Kind, + const Constant *C) const override; }; } // end namespace llvm diff --git a/lib/Transforms/Hello/CMakeLists.txt b/lib/Transforms/Hello/CMakeLists.txt index e724dbc8be89..3851b35871f5 100644 --- a/lib/Transforms/Hello/CMakeLists.txt +++ b/lib/Transforms/Hello/CMakeLists.txt @@ -6,6 +6,10 @@ if( NOT LLVM_REQUIRES_RTTI ) endif() endif() +if(WIN32 OR CYGWIN) + set(LLVM_LINK_COMPONENTS Core Support) +endif() + add_llvm_loadable_module( LLVMHello Hello.cpp ) diff --git a/lib/Transforms/IPO/ArgumentPromotion.cpp b/lib/Transforms/IPO/ArgumentPromotion.cpp index 377fa153a254..f9de54a173d1 100644 --- a/lib/Transforms/IPO/ArgumentPromotion.cpp +++ b/lib/Transforms/IPO/ArgumentPromotion.cpp @@ -39,6 +39,8 @@ #include "llvm/IR/CFG.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DebugInfo.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/LLVMContext.h" @@ -67,21 +69,24 @@ namespace { bool runOnSCC(CallGraphSCC &SCC) override; static char ID; // Pass identification, replacement for typeid explicit ArgPromotion(unsigned maxElements = 3) - : CallGraphSCCPass(ID), maxElements(maxElements) { + : CallGraphSCCPass(ID), DL(nullptr), maxElements(maxElements) { initializeArgPromotionPass(*PassRegistry::getPassRegistry()); } /// A vector used to hold the indices of a single GEP instruction typedef std::vector IndicesVector; + const DataLayout *DL; private: CallGraphNode *PromoteArguments(CallGraphNode *CGN); bool isSafeToPromoteArgument(Argument *Arg, bool isByVal) const; CallGraphNode *DoPromotion(Function *F, SmallPtrSet &ArgsToPromote, SmallPtrSet &ByValArgsToTransform); + bool doInitialization(CallGraph &CG) override; /// The maximum number of elements to expand, or 0 for unlimited. unsigned maxElements; + DenseMap FunctionDIs; }; } @@ -100,6 +105,9 @@ Pass *llvm::createArgumentPromotionPass(unsigned maxElements) { bool ArgPromotion::runOnSCC(CallGraphSCC &SCC) { bool Changed = false, LocalChange; + DataLayoutPass *DLP = getAnalysisIfAvailable(); + DL = DLP ? &DLP->getDataLayout() : nullptr; + do { // Iterate until we stop promoting from this SCC. LocalChange = false; // Attempt to promote arguments from all functions in this SCC. 
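// [Editor's sketch -- not part of the patch] The effect of argument
// promotion, shown on plain C++ rather than IR: a pointer argument whose only
// use is a load is replaced by the loaded value, moving the load into the
// callers. The DataLayout now threaded through the pass is what lets it prove
// that every call site passes a dereferenceable pointer. Names are invented.

#include <cstdio>

static int calleeBefore(const int *P) { return *P + 1; } // loads through P
static int calleeAfter(int V) { return V + 1; }          // promoted argument

int main() {
  int X = 41;
  // After promotion, each caller performs the load itself.
  std::printf("%d %d\n", calleeBefore(&X), calleeAfter(X));
  return 0;
}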
@@ -215,7 +223,8 @@ CallGraphNode *ArgPromotion::PromoteArguments(CallGraphNode *CGN) { /// AllCallersPassInValidPointerForArgument - Return true if we can prove that /// all callees pass in a valid pointer for the specified function argument. -static bool AllCallersPassInValidPointerForArgument(Argument *Arg) { +static bool AllCallersPassInValidPointerForArgument(Argument *Arg, + const DataLayout *DL) { Function *Callee = Arg->getParent(); unsigned ArgNo = Arg->getArgNo(); @@ -226,7 +235,7 @@ static bool AllCallersPassInValidPointerForArgument(Argument *Arg) { CallSite CS(U); assert(CS && "Should only have direct calls!"); - if (!CS.getArgument(ArgNo)->isDereferenceablePointer()) + if (!CS.getArgument(ArgNo)->isDereferenceablePointer(DL)) return false; } return true; @@ -334,7 +343,7 @@ bool ArgPromotion::isSafeToPromoteArgument(Argument *Arg, GEPIndicesSet ToPromote; // If the pointer is always valid, any load with first index 0 is valid. - if (isByValOrInAlloca || AllCallersPassInValidPointerForArgument(Arg)) + if (isByValOrInAlloca || AllCallersPassInValidPointerForArgument(Arg, DL)) SafeToUnconditionallyLoad.insert(IndicesVector(1, 0)); // First, iterate the entry block and mark loads of (geps of) arguments as @@ -604,6 +613,10 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F, Function *NF = Function::Create(NFTy, F->getLinkage(), F->getName()); NF->copyAttributesFrom(F); + // Patch the pointer to LLVM function in debug info descriptor. + auto DI = FunctionDIs.find(F); + if (DI != FunctionDIs.end()) + DI->second.replaceFunction(NF); DEBUG(dbgs() << "ARG PROMOTION: Promoting to:" << *NF << "\n" << "From: " << *F); @@ -741,6 +754,7 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F, if (cast(Call)->isTailCall()) cast(New)->setTailCall(); } + New->setDebugLoc(Call->getDebugLoc()); Args.clear(); AttributesVec.clear(); @@ -902,3 +916,8 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F, return NF_CGN; } + +bool ArgPromotion::doInitialization(CallGraph &CG) { + FunctionDIs = makeSubprogramMap(CG.getModule()); + return CallGraphSCCPass::doInitialization(CG); +} diff --git a/lib/Transforms/IPO/DeadArgumentElimination.cpp b/lib/Transforms/IPO/DeadArgumentElimination.cpp index 284b896407d9..ac3853dbd679 100644 --- a/lib/Transforms/IPO/DeadArgumentElimination.cpp +++ b/lib/Transforms/IPO/DeadArgumentElimination.cpp @@ -127,8 +127,7 @@ namespace { // As the code generation for module is finished (and DIBuilder is // finalized) we assume that subprogram descriptors won't be changed, and // they are stored in map for short duration anyway. - typedef DenseMap FunctionDIMap; - FunctionDIMap FunctionDIs; + DenseMap FunctionDIs; protected: // DAH uses this to specify a different ID. @@ -150,7 +149,6 @@ namespace { unsigned RetValNum = 0); Liveness SurveyUses(const Value *V, UseVector &MaybeLiveUses); - void CollectFunctionDIs(Module &M); void SurveyFunction(const Function &F); void MarkValue(const RetOrArg &RA, Liveness L, const UseVector &MaybeLiveUses); @@ -190,35 +188,6 @@ INITIALIZE_PASS(DAH, "deadarghaX0r", ModulePass *llvm::createDeadArgEliminationPass() { return new DAE(); } ModulePass *llvm::createDeadArgHackingPass() { return new DAH(); } -/// CollectFunctionDIs - Map each function in the module to its debug info -/// descriptor. 
-void DAE::CollectFunctionDIs(Module &M) { - FunctionDIs.clear(); - - for (Module::named_metadata_iterator I = M.named_metadata_begin(), - E = M.named_metadata_end(); I != E; ++I) { - NamedMDNode &NMD = *I; - for (unsigned MDIndex = 0, MDNum = NMD.getNumOperands(); - MDIndex < MDNum; ++MDIndex) { - MDNode *Node = NMD.getOperand(MDIndex); - if (!DIDescriptor(Node).isCompileUnit()) - continue; - DICompileUnit CU(Node); - const DIArray &SPs = CU.getSubprograms(); - for (unsigned SPIndex = 0, SPNum = SPs.getNumElements(); - SPIndex < SPNum; ++SPIndex) { - DISubprogram SP(SPs.getElement(SPIndex)); - assert((!SP || SP.isSubprogram()) && - "A MDNode in subprograms of a CU should be null or a DISubprogram."); - if (!SP) - continue; - if (Function *F = SP.getFunction()) - FunctionDIs[F] = SP; - } - } - } -} - /// DeleteDeadVarargs - If this is an function that takes a ... list, and if /// llvm.vastart is never called, the varargs list is dead for the function. bool DAE::DeleteDeadVarargs(Function &Fn) { @@ -327,7 +296,7 @@ bool DAE::DeleteDeadVarargs(Function &Fn) { } // Patch the pointer to LLVM function in debug info descriptor. - FunctionDIMap::iterator DI = FunctionDIs.find(&Fn); + auto DI = FunctionDIs.find(&Fn); if (DI != FunctionDIs.end()) DI->second.replaceFunction(NF); @@ -1087,7 +1056,7 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) { } // Patch the pointer to LLVM function in debug info descriptor. - FunctionDIMap::iterator DI = FunctionDIs.find(F); + auto DI = FunctionDIs.find(F); if (DI != FunctionDIs.end()) DI->second.replaceFunction(NF); @@ -1101,7 +1070,7 @@ bool DAE::runOnModule(Module &M) { bool Changed = false; // Collect debug info descriptors for functions. - CollectFunctionDIs(M); + FunctionDIs = makeSubprogramMap(M); // First pass: Do a simple check to see if any functions can have their "..." // removed. We can do this if they never call va_start. This loop cannot be diff --git a/lib/Transforms/IPO/GlobalDCE.cpp b/lib/Transforms/IPO/GlobalDCE.cpp index 57481e1e429a..7e7a4c0ae835 100644 --- a/lib/Transforms/IPO/GlobalDCE.cpp +++ b/lib/Transforms/IPO/GlobalDCE.cpp @@ -77,13 +77,19 @@ bool GlobalDCE::runOnModule(Module &M) { // Remove empty functions from the global ctors list. Changed |= optimizeGlobalCtorsList(M, isEmptyFunction); + typedef std::multimap ComdatGVPairsTy; + ComdatGVPairsTy ComdatGVPairs; + // Loop over the module, adding globals which are obviously necessary. for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) { Changed |= RemoveUnusedGlobalValue(*I); // Functions with external linkage are needed if they have a body - if (!I->isDiscardableIfUnused() && - !I->isDeclaration() && !I->hasAvailableExternallyLinkage()) - GlobalIsNeeded(I); + if (!I->isDeclaration() && !I->hasAvailableExternallyLinkage()) { + if (!I->isDiscardableIfUnused()) + GlobalIsNeeded(I); + else if (const Comdat *C = I->getComdat()) + ComdatGVPairs.insert(std::make_pair(C, I)); + } } for (Module::global_iterator I = M.global_begin(), E = M.global_end(); @@ -91,17 +97,38 @@ bool GlobalDCE::runOnModule(Module &M) { Changed |= RemoveUnusedGlobalValue(*I); // Externally visible & appending globals are needed, if they have an // initializer. 
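// [Editor's sketch -- not part of the patch] The comdat logic GlobalDCE gains
// in these hunks: group discardable globals by comdat and, if any member of a
// comdat must be kept, keep every member. A standalone emulation with strings
// standing in for GlobalValues; all names are hypothetical.

#include <algorithm>
#include <cstdio>
#include <map>
#include <string>
#include <utility>

int main() {
  // (comdat, {global, discardable-if-unused}) pairs.
  std::multimap<std::string, std::pair<std::string, bool>> ComdatGVPairs = {
      {"C1", {"f", true}}, {"C1", {"g", false}}, {"C2", {"h", true}}};

  for (auto I = ComdatGVPairs.begin(); I != ComdatGVPairs.end();) {
    auto UB = ComdatGVPairs.upper_bound(I->first);
    bool CanDiscard =
        std::all_of(I, UB, [](const auto &P) { return P.second.second; });
    if (!CanDiscard) // one live member keeps the whole comdat alive
      std::for_each(I, UB, [](const auto &P) {
        std::printf("keep %s\n", P.second.first.c_str());
      });
    I = UB;
  }
  return 0;
}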
- if (!I->isDiscardableIfUnused() && - !I->isDeclaration() && !I->hasAvailableExternallyLinkage()) - GlobalIsNeeded(I); + if (!I->isDeclaration() && !I->hasAvailableExternallyLinkage()) { + if (!I->isDiscardableIfUnused()) + GlobalIsNeeded(I); + else if (const Comdat *C = I->getComdat()) + ComdatGVPairs.insert(std::make_pair(C, I)); + } } for (Module::alias_iterator I = M.alias_begin(), E = M.alias_end(); I != E; ++I) { Changed |= RemoveUnusedGlobalValue(*I); // Externally visible aliases are needed. - if (!I->isDiscardableIfUnused()) + if (!I->isDiscardableIfUnused()) { GlobalIsNeeded(I); + } else if (const Comdat *C = I->getComdat()) { + ComdatGVPairs.insert(std::make_pair(C, I)); + } + } + + for (ComdatGVPairsTy::iterator I = ComdatGVPairs.begin(), + E = ComdatGVPairs.end(); + I != E;) { + ComdatGVPairsTy::iterator UB = ComdatGVPairs.upper_bound(I->first); + bool CanDiscard = std::all_of(I, UB, [](ComdatGVPairsTy::value_type Pair) { + return Pair.second->isDiscardableIfUnused(); + }); + if (!CanDiscard) { + std::for_each(I, UB, [this](ComdatGVPairsTy::value_type Pair) { + GlobalIsNeeded(Pair.second); + }); + } + I = UB; } // Now that all globals which are needed are in the AliveGlobals set, we loop diff --git a/lib/Transforms/IPO/GlobalOpt.cpp b/lib/Transforms/IPO/GlobalOpt.cpp index ae80c437643a..c1d0d3bcdb17 100644 --- a/lib/Transforms/IPO/GlobalOpt.cpp +++ b/lib/Transforms/IPO/GlobalOpt.cpp @@ -17,6 +17,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/ConstantFolding.h" @@ -1699,9 +1700,6 @@ static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) { /// possible. If we make a change, return true. bool GlobalOpt::ProcessGlobal(GlobalVariable *GV, Module::global_iterator &GVI) { - if (!GV->isDiscardableIfUnused()) - return false; - // Do more involved optimizations if the global is internal. GV->removeDeadConstantUsers(); @@ -1910,7 +1908,7 @@ bool GlobalOpt::OptimizeFunctions(Module &M) { for (Module::iterator FI = M.begin(), E = M.end(); FI != E; ) { Function *F = FI++; // Functions without names cannot be referenced outside this module. - if (!F->hasName() && !F->isDeclaration()) + if (!F->hasName() && !F->isDeclaration() && !F->hasLocalLinkage()) F->setLinkage(GlobalValue::InternalLinkage); F->removeDeadConstantUsers(); if (F->isDefTriviallyDead()) { @@ -1944,11 +1942,18 @@ bool GlobalOpt::OptimizeFunctions(Module &M) { bool GlobalOpt::OptimizeGlobalVars(Module &M) { bool Changed = false; + + SmallSet NotDiscardableComdats; + for (const GlobalVariable &GV : M.globals()) + if (const Comdat *C = GV.getComdat()) + if (!GV.isDiscardableIfUnused()) + NotDiscardableComdats.insert(C); + for (Module::global_iterator GVI = M.global_begin(), E = M.global_end(); GVI != E; ) { GlobalVariable *GV = GVI++; // Global variables without names cannot be referenced outside this module. - if (!GV->hasName() && !GV->isDeclaration()) + if (!GV->hasName() && !GV->isDeclaration() && !GV->hasLocalLinkage()) GV->setLinkage(GlobalValue::InternalLinkage); // Simplify the initializer. 
if (GV->hasInitializer()) @@ -1958,7 +1963,12 @@ bool GlobalOpt::OptimizeGlobalVars(Module &M) { GV->setInitializer(New); } - Changed |= ProcessGlobal(GV, GVI); + if (GV->isDiscardableIfUnused()) { + if (const Comdat *C = GV->getComdat()) + if (NotDiscardableComdats.count(C)) + continue; + Changed |= ProcessGlobal(GV, GVI); + } } return Changed; } @@ -1980,10 +1990,13 @@ isSimpleEnoughValueToCommit(Constant *C, static bool isSimpleEnoughValueToCommitHelper(Constant *C, SmallPtrSet &SimpleConstants, const DataLayout *DL) { - // Simple integer, undef, constant aggregate zero, global addresses, etc are - // all supported. - if (C->getNumOperands() == 0 || isa<BlockAddress>(C) || - isa<GlobalValue>(C)) + // Simple global addresses are supported, do not allow dllimport or + // thread-local globals. + if (auto *GV = dyn_cast<GlobalValue>(C)) + return !GV->hasDLLImportStorageClass() && !GV->isThreadLocal(); + + // Simple integer, undef, constant aggregate zero, etc are all supported. + if (C->getNumOperands() == 0 || isa<BlockAddress>(C)) return true; // Aggregate values are safe if all their elements are. @@ -2054,8 +2067,7 @@ static bool isSimpleEnoughPointerToCommit(Constant *C) { return false; if (GlobalVariable *GV = dyn_cast<GlobalVariable>(C)) - // Do not allow weak/*_odr/linkonce/dllimport/dllexport linkage or - // external globals. + // Do not allow weak/*_odr/linkonce linkage or external globals. return GV->hasUniqueInitializer(); if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) { @@ -2846,14 +2858,19 @@ bool GlobalOpt::OptimizeGlobalAliases(Module &M) { I != E;) { Module::alias_iterator J = I++; // Aliases without names cannot be referenced outside this module. - if (!J->hasName() && !J->isDeclaration()) + if (!J->hasName() && !J->isDeclaration() && !J->hasLocalLinkage()) J->setLinkage(GlobalValue::InternalLinkage); // If the aliasee may change at link time, nothing can be done - bail out. if (J->mayBeOverridden()) continue; Constant *Aliasee = J->getAliasee(); - GlobalValue *Target = cast<GlobalValue>(Aliasee->stripPointerCasts()); + GlobalValue *Target = dyn_cast<GlobalValue>(Aliasee->stripPointerCasts()); + // We can't trivially replace the alias with the aliasee if the aliasee is + // non-trivial in some way. + // TODO: Try to handle non-zero GEPs of local aliasees. + if (!Target) + continue; Target->removeDeadConstantUsers(); // Make all users of the alias use the aliasee instead. diff --git a/lib/Transforms/IPO/MergeFunctions.cpp b/lib/Transforms/IPO/MergeFunctions.cpp index c9b4af769136..2fb0ddb174a0 100644 --- a/lib/Transforms/IPO/MergeFunctions.cpp +++ b/lib/Transforms/IPO/MergeFunctions.cpp @@ -9,13 +9,24 @@ // // This pass looks for equivalent functions that are mergable and folds them. // -// A hash is computed from the function, based on its type and number of -// basic blocks. +// An order relation is defined on the set of functions. It is built on a +// function comparison procedure that returns +// 0 when the functions are equal, +// -1 when the left function is less than the right function, and +// 1 in the opposite case. We need a total ordering, so the relation must +// satisfy four properties on the function set: +// a <= a (reflexivity) +// if a <= b and b <= a then a = b (antisymmetry) +// if a <= b and b <= c then a <= c (transitivity) +// for all a and b: a <= b or b <= a (totality). // -// Once all hashes are computed, we perform an expensive equality comparison -// on each function pair. This takes n^2/2 comparisons per bucket, so it's -// important that the hash function be high quality. The equality comparison -// iterates through each instruction in each basic block.
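// [Editor's sketch -- not part of the patch] Why the total order described in
// the new header comment buys O(log N) lookups: wrap the three-way comparator
// in operator< and let std::set keep the functions sorted; a failed insert
// reveals an equivalent function to merge with. Toy stand-in where
// "functions" are strings and the comparison is lexicographic.

#include <cstdio>
#include <set>
#include <string>

struct FnHandle {
  std::string Body; // stands in for a Function*
  // Three-way compare: -1 less, 0 equal, 1 greater. Must be a total order
  // (reflexive, antisymmetric, transitive, total), as the comment requires.
  static int cmp(const FnHandle &L, const FnHandle &R) {
    if (L.Body < R.Body)
      return -1;
    return L.Body == R.Body ? 0 : 1;
  }
  bool operator<(const FnHandle &RHS) const { return cmp(*this, RHS) == -1; }
};

int main() {
  std::set<FnHandle> FnTree;
  FnTree.insert({"ret 0"});
  auto Result = FnTree.insert({"ret 0"}); // O(log N) lookup
  std::printf("merge candidate found: %s\n", Result.second ? "no" : "yes");
  return 0;
}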
+// The comparison iterates through each instruction in each basic block. +// Functions are kept in a binary tree, and for each new function F we perform +// a lookup in that tree. +// In practice it works the following way: +// -- We define a Function* container class with a custom "operator<" (FunctionPtr). +// -- "FunctionPtr" instances are stored in a std::set collection, so every +// std::set::insert operation gives a result in log(N) time. // // When a match is found the functions are folded. If both functions are // overridable, we move the functionality into a new internal function and @@ -31,9 +42,6 @@ // the object they belong to. However, as long as it's only used for a lookup // and call, this is irrelevant, and we'd like to fold such functions. // -// * switch from n^2 pair-wise comparisons to an n-way comparison for each -// bucket. -// // * be smarter about bitcasts. // // In order to fold functions, we will sometimes add either bitcast instructions // @@ -41,6 +49,36 @@ // analysis since the two functions differ where one has a bitcast and the // other doesn't. We should learn to look through bitcasts. // +// * Compare complex types with pointer types inside. +// * Compare cross-reference cases. +// * Compare complex expressions. +// +// All three issues above could be described as the ability to prove that +// fA == fB == fC == fE == fF == fG in the example below: +// +// void fA() { +// fB(); +// } +// void fB() { +// fA(); +// } +// +// void fE() { +// fF(); +// } +// void fF() { +// fG(); +// } +// void fG() { +// fE(); +// } +// +// The simplest cross-reference case (fA <--> fB) was implemented in previous +// versions of MergeFunctions, though it appeared in only two function pairs +// in the test-suite (which counts >50k functions). +// The ability to detect complex cross-referencing (e.g.: A->B->C->D->A) +// could cover many more cases. +// //===----------------------------------------------------------------------===// #include "llvm/Transforms/IPO.h" @@ -60,6 +98,7 @@ #include "llvm/IR/Operator.h" #include "llvm/IR/ValueHandle.h" #include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" @@ -73,89 +112,12 @@ STATISTIC(NumThunksWritten, "Number of thunks generated"); STATISTIC(NumAliasesWritten, "Number of aliases generated"); STATISTIC(NumDoubleWeak, "Number of new functions created"); -/// Returns the type id for a type to be hashed. We turn pointer types into -/// integers here because the actual compare logic below considers pointers and -/// integers of the same size as equal. -static Type::TypeID getTypeIDForHash(Type *Ty) { - if (Ty->isPointerTy()) - return Type::IntegerTyID; - return Ty->getTypeID(); -} - -/// Creates a hash-code for the function which is the same for any two -/// functions that will compare equal, without looking at the instructions -/// inside the function.
-static unsigned profileFunction(const Function *F) { - FunctionType *FTy = F->getFunctionType(); - - FoldingSetNodeID ID; - ID.AddInteger(F->size()); - ID.AddInteger(F->getCallingConv()); - ID.AddBoolean(F->hasGC()); - ID.AddBoolean(FTy->isVarArg()); - ID.AddInteger(getTypeIDForHash(FTy->getReturnType())); - for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i) - ID.AddInteger(getTypeIDForHash(FTy->getParamType(i))); - return ID.ComputeHash(); -} - -namespace { - -/// ComparableFunction - A struct that pairs together functions with a -/// DataLayout so that we can keep them together as elements in the DenseSet. -class ComparableFunction { -public: - static const ComparableFunction EmptyKey; - static const ComparableFunction TombstoneKey; - static DataLayout * const LookupOnly; - - ComparableFunction(Function *Func, const DataLayout *DL) - : Func(Func), Hash(profileFunction(Func)), DL(DL) {} - - Function *getFunc() const { return Func; } - unsigned getHash() const { return Hash; } - const DataLayout *getDataLayout() const { return DL; } - - // Drops AssertingVH reference to the function. Outside of debug mode, this - // does nothing. - void release() { - assert(Func && - "Attempted to release function twice, or release empty/tombstone!"); - Func = nullptr; - } - -private: - explicit ComparableFunction(unsigned Hash) - : Func(nullptr), Hash(Hash), DL(nullptr) {} - - AssertingVH Func; - unsigned Hash; - const DataLayout *DL; -}; - -const ComparableFunction ComparableFunction::EmptyKey = ComparableFunction(0); -const ComparableFunction ComparableFunction::TombstoneKey = - ComparableFunction(1); -DataLayout *const ComparableFunction::LookupOnly = (DataLayout*)(-1); - -} - -namespace llvm { - template <> - struct DenseMapInfo { - static ComparableFunction getEmptyKey() { - return ComparableFunction::EmptyKey; - } - static ComparableFunction getTombstoneKey() { - return ComparableFunction::TombstoneKey; - } - static unsigned getHashValue(const ComparableFunction &CF) { - return CF.getHash(); - } - static bool isEqual(const ComparableFunction &LHS, - const ComparableFunction &RHS); - }; -} +static cl::opt NumFunctionsForSanityCheck( + "mergefunc-sanity", + cl::desc("How many functions in module could be used for " + "MergeFunctions pass sanity check. " + "'0' disables this check. Works only with '-debug' key."), + cl::init(0), cl::Hidden); namespace { @@ -167,14 +129,14 @@ class FunctionComparator { public: FunctionComparator(const DataLayout *DL, const Function *F1, const Function *F2) - : F1(F1), F2(F2), DL(DL) {} + : FnL(F1), FnR(F2), DL(DL) {} /// Test whether the two functions have equivalent behaviour. - bool compare(); + int compare(); private: /// Test whether two basic blocks have equivalent behaviour. - bool compare(const BasicBlock *BB1, const BasicBlock *BB2); + int compare(const BasicBlock *BBL, const BasicBlock *BBR); /// Constants comparison. /// Its analog to lexicographical comparison between hypothetical numbers @@ -300,10 +262,6 @@ class FunctionComparator { /// see comments for sn_mapL and sn_mapR. int cmpValues(const Value *L, const Value *R); - bool enumerate(const Value *V1, const Value *V2) { - return cmpValues(V1, V2) == 0; - } - /// Compare two Instructions for equivalence, similar to /// Instruction::isSameOperationAs but with modifications to the type /// comparison. 
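// [Editor's sketch -- not part of the patch] The "if (int Res = ...) return
// Res;" chains used throughout FunctionComparator implement a staged
// lexicographic comparison: return at the first stage that differs, fall
// through when the stages are equal. Minimal standalone version with
// invented field names.

#include <cstdio>

static int cmpNumbers(unsigned long L, unsigned long R) {
  if (L < R)
    return -1;
  return L == R ? 0 : 1;
}

struct Op {
  unsigned Opcode, NumOperands, Flags;
};

static int cmpOperation(const Op &L, const Op &R) {
  if (int Res = cmpNumbers(L.Opcode, R.Opcode))           // stage 1
    return Res;
  if (int Res = cmpNumbers(L.NumOperands, R.NumOperands)) // stage 2
    return Res;
  return cmpNumbers(L.Flags, R.Flags);                    // stage 3
}

int main() {
  Op A{12, 2, 0}, B{12, 3, 0};
  std::printf("%d\n", cmpOperation(A, B)); // -1: first difference at stage 2
  return 0;
}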
@@ -325,15 +283,11 @@ class FunctionComparator { /// 6.1.Load: volatile (as boolean flag) /// 6.2.Load: alignment (as integer numbers) /// 6.3.Load: synch-scope (as integer numbers) + /// 6.4.Load: range metadata (as integer numbers) /// On this stage its better to see the code, since its not more than 10-15 /// strings for particular instruction, and could change sometimes. int cmpOperation(const Instruction *L, const Instruction *R) const; - bool isEquivalentOperation(const Instruction *I1, - const Instruction *I2) const { - return cmpOperation(I1, I2) == 0; - } - /// Compare two GEPs for equivalent pointer arithmetic. /// Parts to be compared for each comparison stage, /// most significant stage first: @@ -348,14 +302,6 @@ class FunctionComparator { return cmpGEP(cast(GEPL), cast(GEPR)); } - bool isEquivalentGEP(const GEPOperator *GEP1, const GEPOperator *GEP2) { - return cmpGEP(GEP1, GEP2) == 0; - } - bool isEquivalentGEP(const GetElementPtrInst *GEP1, - const GetElementPtrInst *GEP2) { - return isEquivalentGEP(cast(GEP1), cast(GEP2)); - } - /// cmpType - compares two types, /// defines total ordering among the types set. /// @@ -398,10 +344,6 @@ class FunctionComparator { /// 6. For all other cases put llvm_unreachable. int cmpType(Type *TyL, Type *TyR) const; - bool isEquivalentType(Type *Ty1, Type *Ty2) const { - return cmpType(Ty1, Ty2) == 0; - } - int cmpNumbers(uint64_t L, uint64_t R) const; int cmpAPInt(const APInt &L, const APInt &R) const; @@ -410,7 +352,7 @@ class FunctionComparator { int cmpAttrs(const AttributeSet L, const AttributeSet R) const; // The two functions undergoing comparison. - const Function *F1, *F2; + const Function *FnL, *FnR; const DataLayout *DL; @@ -450,6 +392,18 @@ class FunctionComparator { DenseMap sn_mapL, sn_mapR; }; +class FunctionPtr { + AssertingVH F; + const DataLayout *DL; + +public: + FunctionPtr(Function *F, const DataLayout *DL) : F(F), DL(DL) {} + Function *getFunc() const { return F; } + void release() { F = 0; } + bool operator<(const FunctionPtr &RHS) const { + return (FunctionComparator(DL, F, RHS.getFunc()).compare()) == -1; + } +}; } int FunctionComparator::cmpNumbers(uint64_t L, uint64_t R) const { @@ -788,7 +742,11 @@ int FunctionComparator::cmpOperation(const Instruction *L, if (int Res = cmpNumbers(LI->getOrdering(), cast(R)->getOrdering())) return Res; - return cmpNumbers(LI->getSynchScope(), cast(R)->getSynchScope()); + if (int Res = + cmpNumbers(LI->getSynchScope(), cast(R)->getSynchScope())) + return Res; + return cmpNumbers((uint64_t)LI->getMetadata(LLVMContext::MD_range), + (uint64_t)cast(R)->getMetadata(LLVMContext::MD_range)); } if (const StoreInst *SI = dyn_cast(L)) { if (int Res = @@ -808,13 +766,23 @@ int FunctionComparator::cmpOperation(const Instruction *L, if (int Res = cmpNumbers(CI->getCallingConv(), cast(R)->getCallingConv())) return Res; - return cmpAttrs(CI->getAttributes(), cast(R)->getAttributes()); + if (int Res = + cmpAttrs(CI->getAttributes(), cast(R)->getAttributes())) + return Res; + return cmpNumbers( + (uint64_t)CI->getMetadata(LLVMContext::MD_range), + (uint64_t)cast(R)->getMetadata(LLVMContext::MD_range)); } if (const InvokeInst *CI = dyn_cast(L)) { if (int Res = cmpNumbers(CI->getCallingConv(), cast(R)->getCallingConv())) return Res; - return cmpAttrs(CI->getAttributes(), cast(R)->getAttributes()); + if (int Res = + cmpAttrs(CI->getAttributes(), cast(R)->getAttributes())) + return Res; + return cmpNumbers( + (uint64_t)CI->getMetadata(LLVMContext::MD_range), + 
(uint64_t)cast(R)->getMetadata(LLVMContext::MD_range)); } if (const InsertValueInst *IVI = dyn_cast(L)) { ArrayRef LIndices = IVI->getIndices(); @@ -917,13 +885,13 @@ int FunctionComparator::cmpGEP(const GEPOperator *GEPL, /// See comments in declaration for more details. int FunctionComparator::cmpValues(const Value *L, const Value *R) { // Catch self-reference case. - if (L == F1) { - if (R == F2) + if (L == FnL) { + if (R == FnR) return 0; return -1; } - if (R == F2) { - if (L == F1) + if (R == FnR) { + if (L == FnL) return 0; return 1; } @@ -957,90 +925,102 @@ int FunctionComparator::cmpValues(const Value *L, const Value *R) { return cmpNumbers(LeftSN.first->second, RightSN.first->second); } // Test whether two basic blocks have equivalent behaviour. -bool FunctionComparator::compare(const BasicBlock *BB1, const BasicBlock *BB2) { - BasicBlock::const_iterator F1I = BB1->begin(), F1E = BB1->end(); - BasicBlock::const_iterator F2I = BB2->begin(), F2E = BB2->end(); +int FunctionComparator::compare(const BasicBlock *BBL, const BasicBlock *BBR) { + BasicBlock::const_iterator InstL = BBL->begin(), InstLE = BBL->end(); + BasicBlock::const_iterator InstR = BBR->begin(), InstRE = BBR->end(); do { - if (!enumerate(F1I, F2I)) - return false; + if (int Res = cmpValues(InstL, InstR)) + return Res; - if (const GetElementPtrInst *GEP1 = dyn_cast(F1I)) { - const GetElementPtrInst *GEP2 = dyn_cast(F2I); - if (!GEP2) - return false; + const GetElementPtrInst *GEPL = dyn_cast(InstL); + const GetElementPtrInst *GEPR = dyn_cast(InstR); - if (!enumerate(GEP1->getPointerOperand(), GEP2->getPointerOperand())) - return false; + if (GEPL && !GEPR) + return 1; + if (GEPR && !GEPL) + return -1; - if (!isEquivalentGEP(GEP1, GEP2)) - return false; + if (GEPL && GEPR) { + if (int Res = + cmpValues(GEPL->getPointerOperand(), GEPR->getPointerOperand())) + return Res; + if (int Res = cmpGEP(GEPL, GEPR)) + return Res; } else { - if (!isEquivalentOperation(F1I, F2I)) - return false; - - assert(F1I->getNumOperands() == F2I->getNumOperands()); - for (unsigned i = 0, e = F1I->getNumOperands(); i != e; ++i) { - Value *OpF1 = F1I->getOperand(i); - Value *OpF2 = F2I->getOperand(i); - - if (!enumerate(OpF1, OpF2)) - return false; + if (int Res = cmpOperation(InstL, InstR)) + return Res; + assert(InstL->getNumOperands() == InstR->getNumOperands()); - if (OpF1->getValueID() != OpF2->getValueID() || - !isEquivalentType(OpF1->getType(), OpF2->getType())) - return false; + for (unsigned i = 0, e = InstL->getNumOperands(); i != e; ++i) { + Value *OpL = InstL->getOperand(i); + Value *OpR = InstR->getOperand(i); + if (int Res = cmpValues(OpL, OpR)) + return Res; + if (int Res = cmpNumbers(OpL->getValueID(), OpR->getValueID())) + return Res; + // TODO: Already checked in cmpOperation + if (int Res = cmpType(OpL->getType(), OpR->getType())) + return Res; } } - ++F1I, ++F2I; - } while (F1I != F1E && F2I != F2E); + ++InstL, ++InstR; + } while (InstL != InstLE && InstR != InstRE); - return F1I == F1E && F2I == F2E; + if (InstL != InstLE && InstR == InstRE) + return 1; + if (InstL == InstLE && InstR != InstRE) + return -1; + return 0; } // Test whether the two functions have equivalent behaviour. -bool FunctionComparator::compare() { - // We need to recheck everything, but check the things that weren't included - // in the hash first. 
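// [Editor's sketch -- not part of the patch] The block-pairing walk the
// rewritten compare() below performs: start from the two entry blocks, then
// push successor pairs in lockstep, visiting each left-hand block only once.
// Toy version over adjacency lists; all names are hypothetical.

#include <cstddef>
#include <cstdio>
#include <set>
#include <utility>
#include <vector>

using CFG = std::vector<std::vector<int>>; // block index -> successors

int main() {
  CFG FnL = {{1, 2}, {2}, {}};
  CFG FnR = {{1, 2}, {2}, {}};
  std::vector<std::pair<int, int>> Work{{0, 0}}; // entry block pair
  std::set<int> VisitedL{0};

  while (!Work.empty()) {
    std::pair<int, int> P = Work.back();
    Work.pop_back();
    std::printf("compare blocks %d <-> %d\n", P.first, P.second);
    // Successor counts match once the terminators have compared equal.
    for (std::size_t I = 0; I < FnL[P.first].size(); ++I) {
      if (!VisitedL.insert(FnL[P.first][I]).second)
        continue; // pair each left block only once
      Work.push_back({FnL[P.first][I], FnR[P.second][I]});
    }
  }
  return 0;
}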
+int FunctionComparator::compare() { sn_mapL.clear(); sn_mapR.clear(); - if (F1->getAttributes() != F2->getAttributes()) - return false; + if (int Res = cmpAttrs(FnL->getAttributes(), FnR->getAttributes())) + return Res; - if (F1->hasGC() != F2->hasGC()) - return false; + if (int Res = cmpNumbers(FnL->hasGC(), FnR->hasGC())) + return Res; - if (F1->hasGC() && F1->getGC() != F2->getGC()) - return false; + if (FnL->hasGC()) { + if (int Res = cmpNumbers((uint64_t)FnL->getGC(), (uint64_t)FnR->getGC())) + return Res; + } - if (F1->hasSection() != F2->hasSection()) - return false; + if (int Res = cmpNumbers(FnL->hasSection(), FnR->hasSection())) + return Res; - if (F1->hasSection() && F1->getSection() != F2->getSection()) - return false; + if (FnL->hasSection()) { + if (int Res = cmpStrings(FnL->getSection(), FnR->getSection())) + return Res; + } - if (F1->isVarArg() != F2->isVarArg()) - return false; + if (int Res = cmpNumbers(FnL->isVarArg(), FnR->isVarArg())) + return Res; // TODO: if it's internal and only used in direct calls, we could handle this // case too. - if (F1->getCallingConv() != F2->getCallingConv()) - return false; + if (int Res = cmpNumbers(FnL->getCallingConv(), FnR->getCallingConv())) + return Res; - if (!isEquivalentType(F1->getFunctionType(), F2->getFunctionType())) - return false; + if (int Res = cmpType(FnL->getFunctionType(), FnR->getFunctionType())) + return Res; - assert(F1->arg_size() == F2->arg_size() && + assert(FnL->arg_size() == FnR->arg_size() && "Identically typed functions have different numbers of args!"); // Visit the arguments so that they get enumerated in the order they're // passed in. - for (Function::const_arg_iterator f1i = F1->arg_begin(), - f2i = F2->arg_begin(), f1e = F1->arg_end(); f1i != f1e; ++f1i, ++f2i) { - if (!enumerate(f1i, f2i)) + for (Function::const_arg_iterator ArgLI = FnL->arg_begin(), + ArgRI = FnR->arg_begin(), + ArgLE = FnL->arg_end(); + ArgLI != ArgLE; ++ArgLI, ++ArgRI) { + if (cmpValues(ArgLI, ArgRI) != 0) llvm_unreachable("Arguments repeat!"); } @@ -1048,33 +1028,36 @@ bool FunctionComparator::compare() { // linked list is immaterial. Our walk starts at the entry block for both // functions, then takes each block from each terminator in order. As an // artifact, this also means that unreachable blocks are ignored. - SmallVector F1BBs, F2BBs; + SmallVector FnLBBs, FnRBBs; SmallSet VisitedBBs; // in terms of F1. 
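The rewritten compare() above is a chain of three-way stages: the first stage that yields a non-zero result decides the total order, and only fully equal functions fall through to the end. A minimal sketch of the chaining idiom in plain C++ (cmpPair is an illustrative name, not part of the pass):

#include <cstdint>

static int cmpNumbers(uint64_t L, uint64_t R) {
  if (L < R) return -1;
  if (L > R) return 1;
  return 0; // equal: defer to the next comparison stage
}

// The first stage that reports a difference decides the order; only fully
// equal inputs reach the final return.
static int cmpPair(uint64_t A1, uint64_t B1, uint64_t A2, uint64_t B2) {
  if (int Res = cmpNumbers(A1, A2))
    return Res;
  return cmpNumbers(B1, B2);
}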
- F1BBs.push_back(&F1->getEntryBlock()); - F2BBs.push_back(&F2->getEntryBlock()); + FnLBBs.push_back(&FnL->getEntryBlock()); + FnRBBs.push_back(&FnR->getEntryBlock()); - VisitedBBs.insert(F1BBs[0]); - while (!F1BBs.empty()) { - const BasicBlock *F1BB = F1BBs.pop_back_val(); - const BasicBlock *F2BB = F2BBs.pop_back_val(); + VisitedBBs.insert(FnLBBs[0]); + while (!FnLBBs.empty()) { + const BasicBlock *BBL = FnLBBs.pop_back_val(); + const BasicBlock *BBR = FnRBBs.pop_back_val(); - if (!enumerate(F1BB, F2BB) || !compare(F1BB, F2BB)) - return false; + if (int Res = cmpValues(BBL, BBR)) + return Res; - const TerminatorInst *F1TI = F1BB->getTerminator(); - const TerminatorInst *F2TI = F2BB->getTerminator(); + if (int Res = compare(BBL, BBR)) + return Res; - assert(F1TI->getNumSuccessors() == F2TI->getNumSuccessors()); - for (unsigned i = 0, e = F1TI->getNumSuccessors(); i != e; ++i) { - if (!VisitedBBs.insert(F1TI->getSuccessor(i))) + const TerminatorInst *TermL = BBL->getTerminator(); + const TerminatorInst *TermR = BBR->getTerminator(); + + assert(TermL->getNumSuccessors() == TermR->getNumSuccessors()); + for (unsigned i = 0, e = TermL->getNumSuccessors(); i != e; ++i) { + if (!VisitedBBs.insert(TermL->getSuccessor(i))) continue; - F1BBs.push_back(F1TI->getSuccessor(i)); - F2BBs.push_back(F2TI->getSuccessor(i)); + FnLBBs.push_back(TermL->getSuccessor(i)); + FnRBBs.push_back(TermR->getSuccessor(i)); } } - return true; + return 0; } namespace { @@ -1095,21 +1078,25 @@ class MergeFunctions : public ModulePass { bool runOnModule(Module &M) override; private: - typedef DenseSet FnSetType; + typedef std::set FnTreeType; /// A work queue of functions that may have been modified and should be /// analyzed again. std::vector Deferred; - /// Insert a ComparableFunction into the FnSet, or merge it away if it's + /// Checks the rules of order relation introduced among functions set. + /// Returns true, if sanity check has been passed, and false if failed. + bool doSanityCheck(std::vector &Worklist); + + /// Insert a ComparableFunction into the FnTree, or merge it away if it's /// equal to one that's already present. - bool insert(ComparableFunction &NewF); + bool insert(Function *NewFunction); - /// Remove a Function from the FnSet and queue it up for a second sweep of + /// Remove a Function from the FnTree and queue it up for a second sweep of /// analysis. void remove(Function *F); - /// Find the functions that use this Value and remove them from FnSet and + /// Find the functions that use this Value and remove them from FnTree and /// queue the functions. void removeUsers(Value *V); @@ -1134,7 +1121,7 @@ class MergeFunctions : public ModulePass { /// The set of all distinct functions. Use the insert() and remove() methods /// to modify it. - FnSetType FnSet; + FnTreeType FnTree; /// DataLayout for more accurate GEP comparisons. May be NULL. 
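FnTreeType above replaces the old hash set with a std::set ordered by the comparator, so a single O(log n) insert() both looks for an equivalent function and records a new one. A hedged sketch of that usage pattern over a toy key type (Key and insertOrMerge are illustrative, not the pass's API):

#include <set>

struct Key {
  int Id; // stand-in for a Function* compared via FunctionComparator
  bool operator<(const Key &RHS) const { return Id < RHS.Id; }
};

bool insertOrMerge(std::set<Key> &Tree, Key K) {
  std::pair<std::set<Key>::iterator, bool> Res = Tree.insert(K);
  if (Res.second)
    return false;          // unique: nothing to merge
  // *Res.first is the previously inserted equivalent; merge K into it.
  return true;
}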
const DataLayout *DL; @@ -1152,6 +1139,78 @@ ModulePass *llvm::createMergeFunctionsPass() { return new MergeFunctions(); } +bool MergeFunctions::doSanityCheck(std::vector &Worklist) { + if (const unsigned Max = NumFunctionsForSanityCheck) { + unsigned TripleNumber = 0; + bool Valid = true; + + dbgs() << "MERGEFUNC-SANITY: Started for first " << Max << " functions.\n"; + + unsigned i = 0; + for (std::vector::iterator I = Worklist.begin(), E = Worklist.end(); + I != E && i < Max; ++I, ++i) { + unsigned j = i; + for (std::vector::iterator J = I; J != E && j < Max; ++J, ++j) { + Function *F1 = cast(*I); + Function *F2 = cast(*J); + int Res1 = FunctionComparator(DL, F1, F2).compare(); + int Res2 = FunctionComparator(DL, F2, F1).compare(); + + // If F1 <= F2, then F2 >= F1, otherwise report failure. + if (Res1 != -Res2) { + dbgs() << "MERGEFUNC-SANITY: Non-symmetric; triple: " << TripleNumber + << "\n"; + F1->dump(); + F2->dump(); + Valid = false; + } + + if (Res1 == 0) + continue; + + unsigned k = j; + for (std::vector::iterator K = J; K != E && k < Max; + ++k, ++K, ++TripleNumber) { + if (K == J) + continue; + + Function *F3 = cast(*K); + int Res3 = FunctionComparator(DL, F1, F3).compare(); + int Res4 = FunctionComparator(DL, F2, F3).compare(); + + bool Transitive = true; + + if (Res1 != 0 && Res1 == Res4) { + // F1 > F2, F2 > F3 => F1 > F3 + Transitive = Res3 == Res1; + } else if (Res3 != 0 && Res3 == -Res4) { + // F1 > F3, F3 > F2 => F1 > F2 + Transitive = Res3 == Res1; + } else if (Res4 != 0 && -Res3 == Res4) { + // F2 > F3, F3 > F1 => F2 > F1 + Transitive = Res4 == -Res1; + } + + if (!Transitive) { + dbgs() << "MERGEFUNC-SANITY: Non-transitive; triple: " + << TripleNumber << "\n"; + dbgs() << "Res1, Res3, Res4: " << Res1 << ", " << Res3 << ", " + << Res4 << "\n"; + F1->dump(); + F2->dump(); + F3->dump(); + Valid = false; + } + } + } + } + + dbgs() << "MERGEFUNC-SANITY: " << (Valid ? "Passed." 
: "Failed.") << "\n"; + return Valid; + } + return true; +} + bool MergeFunctions::runOnModule(Module &M) { bool Changed = false; DataLayoutPass *DLP = getAnalysisIfAvailable(); @@ -1161,12 +1220,13 @@ bool MergeFunctions::runOnModule(Module &M) { if (!I->isDeclaration() && !I->hasAvailableExternallyLinkage()) Deferred.push_back(WeakVH(I)); } - FnSet.resize(Deferred.size()); do { std::vector Worklist; Deferred.swap(Worklist); + DEBUG(doSanityCheck(Worklist)); + DEBUG(dbgs() << "size of module: " << M.size() << '\n'); DEBUG(dbgs() << "size of worklist: " << Worklist.size() << '\n'); @@ -1178,8 +1238,7 @@ bool MergeFunctions::runOnModule(Module &M) { Function *F = cast(*I); if (!F->isDeclaration() && !F->hasAvailableExternallyLinkage() && !F->mayBeOverridden()) { - ComparableFunction CF = ComparableFunction(F, DL); - Changed |= insert(CF); + Changed |= insert(F); } } @@ -1193,38 +1252,17 @@ bool MergeFunctions::runOnModule(Module &M) { Function *F = cast(*I); if (!F->isDeclaration() && !F->hasAvailableExternallyLinkage() && F->mayBeOverridden()) { - ComparableFunction CF = ComparableFunction(F, DL); - Changed |= insert(CF); + Changed |= insert(F); } } - DEBUG(dbgs() << "size of FnSet: " << FnSet.size() << '\n'); + DEBUG(dbgs() << "size of FnTree: " << FnTree.size() << '\n'); } while (!Deferred.empty()); - FnSet.clear(); + FnTree.clear(); return Changed; } -bool DenseMapInfo::isEqual(const ComparableFunction &LHS, - const ComparableFunction &RHS) { - if (LHS.getFunc() == RHS.getFunc() && - LHS.getHash() == RHS.getHash()) - return true; - if (!LHS.getFunc() || !RHS.getFunc()) - return false; - - // One of these is a special "underlying pointer comparison only" object. - if (LHS.getDataLayout() == ComparableFunction::LookupOnly || - RHS.getDataLayout() == ComparableFunction::LookupOnly) - return false; - - assert(LHS.getDataLayout() == RHS.getDataLayout() && - "Comparing functions for different targets"); - - return FunctionComparator(LHS.getDataLayout(), LHS.getFunc(), - RHS.getFunc()).compare(); -} - // Replace direct callers of Old with New. void MergeFunctions::replaceDirectCallers(Function *Old, Function *New) { Constant *BitcastNew = ConstantExpr::getBitCast(New, Old->getType()); @@ -1379,54 +1417,57 @@ void MergeFunctions::mergeTwoFunctions(Function *F, Function *G) { ++NumFunctionsMerged; } -// Insert a ComparableFunction into the FnSet, or merge it away if equal to one +// Insert a ComparableFunction into the FnTree, or merge it away if equal to one // that was already inserted. -bool MergeFunctions::insert(ComparableFunction &NewF) { - std::pair Result = FnSet.insert(NewF); +bool MergeFunctions::insert(Function *NewFunction) { + std::pair Result = + FnTree.insert(FunctionPtr(NewFunction, DL)); + if (Result.second) { - DEBUG(dbgs() << "Inserting as unique: " << NewF.getFunc()->getName() << '\n'); + DEBUG(dbgs() << "Inserting as unique: " << NewFunction->getName() << '\n'); return false; } - const ComparableFunction &OldF = *Result.first; + const FunctionPtr &OldF = *Result.first; // Don't merge tiny functions, since it can just end up making the function // larger. // FIXME: Should still merge them if they are unnamed_addr and produce an // alias. 
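doSanityCheck above probes exactly the contract std::set demands of FunctionPtr::operator<: the comparison must behave like a strict total order, otherwise FnTree's behaviour is undefined. The same two properties can be stated over a toy comparator (isStrictOrder is an illustrative helper, not the pass's code):

#include <vector>

// Toy three-way comparator standing in for FunctionComparator::compare().
static int cmp(int A, int B) { return A < B ? -1 : (A == B ? 0 : 1); }

// Checks the two properties the sanity check reports on: antisymmetry
// (cmp(a,b) == -cmp(b,a)) and transitivity over every triple.
static bool isStrictOrder(const std::vector<int> &Xs) {
  for (int A : Xs)
    for (int B : Xs) {
      if (cmp(A, B) != -cmp(B, A))
        return false;
      for (int C : Xs)
        if (cmp(A, B) == -1 && cmp(B, C) == -1 && cmp(A, C) != -1)
          return false;
    }
  return true;
}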
-  if (NewF.getFunc()->size() == 1) {
-    if (NewF.getFunc()->front().size() <= 2) {
-      DEBUG(dbgs() << NewF.getFunc()->getName()
-                   << " is to small to bother merging\n");
+  if (NewFunction->size() == 1) {
+    if (NewFunction->front().size() <= 2) {
+      DEBUG(dbgs() << NewFunction->getName()
+                   << " is too small to bother merging\n");
       return false;
     }
   }

   // Never thunk a strong function to a weak function.
-  assert(!OldF.getFunc()->mayBeOverridden() ||
-         NewF.getFunc()->mayBeOverridden());
+  assert(!OldF.getFunc()->mayBeOverridden() || NewFunction->mayBeOverridden());

-  DEBUG(dbgs() << "  " << OldF.getFunc()->getName() << " == "
-               << NewF.getFunc()->getName() << '\n');
+  DEBUG(dbgs() << "  " << OldF.getFunc()->getName()
+               << " == " << NewFunction->getName() << '\n');

-  Function *DeleteF = NewF.getFunc();
-  NewF.release();
+  Function *DeleteF = NewFunction;
   mergeTwoFunctions(OldF.getFunc(), DeleteF);
   return true;
 }

-// Remove a function from FnSet. If it was already in FnSet, add it to Deferred
-// so that we'll look at it in the next round.
+// Remove a function from FnTree. If it was already in FnTree, add
+// it to Deferred so that we'll look at it in the next round.
 void MergeFunctions::remove(Function *F) {
   // We need to make sure we remove F, not a function "equal" to F per the
   // function equality comparator.
-  //
-  // The special "lookup only" ComparableFunction bypasses the expensive
-  // function comparison in favour of a pointer comparison on the underlying
-  // Function*'s.
-  ComparableFunction CF = ComparableFunction(F, ComparableFunction::LookupOnly);
-  if (FnSet.erase(CF)) {
-    DEBUG(dbgs() << "Removed " << F->getName() << " from set and deferred it.\n");
+  FnTreeType::iterator found = FnTree.find(FunctionPtr(F, DL));
+  size_t Erased = 0;
+  if (found != FnTree.end() && found->getFunc() == F) {
+    Erased = 1;
+    FnTree.erase(found);
+  }
+
+  if (Erased) {
+    DEBUG(dbgs() << "Removed " << F->getName()
+                 << " from set and deferred it.\n");
     Deferred.push_back(F);
   }
 }
diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp
index c20c717de5e7..8a715bbe30e0 100644
--- a/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -156,9 +156,9 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) {
   if (!DisableUnitAtATime) {
     addExtensionsToPM(EP_ModuleOptimizerEarly, MPM);

+    MPM.add(createIPSCCPPass());              // IP SCCP
     MPM.add(createGlobalOptimizerPass());     // Optimize out global vars
-    MPM.add(createIPSCCPPass());              // IP SCCP
     MPM.add(createDeadArgEliminationPass());  // Dead argument elimination

     MPM.add(createInstructionCombiningPass());// Clean up after IPCP & DAE
@@ -187,6 +187,8 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) {
   MPM.add(createEarlyCSEPass());              // Catch trivial redundancies
   MPM.add(createJumpThreadingPass());         // Thread jumps.
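For context, a frontend typically consumes populateModulePassManager as sketched below, so the IPSCCP-before-GlobalOpt reordering and the passes added in these hunks reach every -O2 build automatically. This is a hedged sketch assuming the legacy pass-manager headers of this LLVM era, not code from the patch:

#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"

void runDefaultPipeline(llvm::Module &M) {
  llvm::PassManagerBuilder PMB;
  PMB.OptLevel = 2; // OptLevel > 1 also enables the MLSM + GVN block below
  llvm::legacy::PassManager MPM;
  PMB.populateModulePassManager(MPM);
  MPM.run(M);
}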
MPM.add(createCorrelatedValuePropagationPass()); // Propagate conditionals + // Specific to the rust-lang llvm branch: + MPM.add(createNullCheckEliminationPass()); // Eliminate null checks MPM.add(createCFGSimplificationPass()); // Merge & remove BBs MPM.add(createInstructionCombiningPass()); // Combine silly seq's addExtensionsToPM(EP_Peephole, MPM); @@ -207,8 +209,10 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) { MPM.add(createSimpleLoopUnrollPass()); // Unroll small loops addExtensionsToPM(EP_LoopOptimizerEnd, MPM); - if (OptLevel > 1) + if (OptLevel > 1) { + MPM.add(createMergedLoadStoreMotionPass()); // Merge load/stores in diamond MPM.add(createGVNPass()); // Remove redundancies + } MPM.add(createMemCpyOptPass()); // Remove memcpy / form memset MPM.add(createSCCPPass()); // Constant prop with SCCP @@ -218,6 +222,8 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) { addExtensionsToPM(EP_Peephole, MPM); MPM.add(createJumpThreadingPass()); // Thread jumps MPM.add(createCorrelatedValuePropagationPass()); + // Specific to the rust-lang llvm branch: + MPM.add(createNullCheckEliminationPass()); // Eliminate null checks MPM.add(createDeadStoreEliminationPass()); // Delete dead stores addExtensionsToPM(EP_ScalarOptimizerLate, MPM); @@ -346,6 +352,7 @@ void PassManagerBuilder::populateLTOPassManager(PassManagerBase &PM, PM.add(createGlobalsModRefPass()); // IP alias analysis. PM.add(createLICMPass()); // Hoist loop invariants. + PM.add(createMergedLoadStoreMotionPass()); // Merge load/stores in diamonds PM.add(createGVNPass(DisableGVNLoadPRE)); // Remove redundancies. PM.add(createMemCpyOptPass()); // Remove dead memcpys. diff --git a/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/lib/Transforms/InstCombine/InstCombineAddSub.cpp index f6658964ae85..e80d6a9ee39b 100644 --- a/lib/Transforms/InstCombine/InstCombineAddSub.cpp +++ b/lib/Transforms/InstCombine/InstCombineAddSub.cpp @@ -32,7 +32,7 @@ namespace { /// class FAddendCoef { public: - // The constructor has to initialize a APFloat, which is uncessary for + // The constructor has to initialize a APFloat, which is unnecessary for // most addends which have coefficient either 1 or -1. So, the constructor // is expensive. In order to avoid the cost of the constructor, we should // reuse some instances whenever possible. The pre-created instances @@ -865,30 +865,6 @@ Value *FAddCombine::createAddendVal return createFMul(OpndVal, Coeff.getValue(Instr->getType())); } -// dyn_castFoldableMul - If this value is a multiply that can be folded into -// other computations (because it has a constant operand), return the -// non-constant operand of the multiply, and set CST to point to the multiplier. -// Otherwise, return null. -// -static inline Value *dyn_castFoldableMul(Value *V, Constant *&CST) { - if (!V->hasOneUse() || !V->getType()->isIntOrIntVectorTy()) - return nullptr; - - Instruction *I = dyn_cast(V); - if (!I) return nullptr; - - if (I->getOpcode() == Instruction::Mul) - if ((CST = dyn_cast(I->getOperand(1)))) - return I->getOperand(0); - if (I->getOpcode() == Instruction::Shl) - if ((CST = dyn_cast(I->getOperand(1)))) { - // The multiplier is really 1 << CST. 
- CST = ConstantExpr::getShl(ConstantInt::get(V->getType(), 1), CST); - return I->getOperand(0); - } - return nullptr; -} - // If one of the operands only has one non-zero bit, and if the other // operand has a known-zero bit in a more significant place than it (not // including the sign bit) the ripple may go up to and fill the zero, but @@ -980,18 +956,79 @@ bool InstCombiner::WillNotOverflowUnsignedAdd(Value *LHS, Value *RHS) { return false; } -Instruction *InstCombiner::visitAdd(BinaryOperator &I) { - bool Changed = SimplifyAssociativeOrCommutative(I); +// Checks if any operand is negative and we can convert add to sub. +// This function checks for following negative patterns +// ADD(XOR(OR(Z, NOT(C)), C)), 1) == NEG(AND(Z, C)) +// ADD(XOR(AND(Z, C), C), 1) == NEG(OR(Z, ~C)) +// XOR(AND(Z, C), (C + 1)) == NEG(OR(Z, ~C)) if C is even +static Value *checkForNegativeOperand(BinaryOperator &I, + InstCombiner::BuilderTy *Builder) { Value *LHS = I.getOperand(0), *RHS = I.getOperand(1); - if (Value *V = SimplifyVectorOp(I)) - return ReplaceInstUsesWith(I, V); + // This function creates 2 instructions to replace ADD, we need at least one + // of LHS or RHS to have one use to ensure benefit in transform. + if (!LHS->hasOneUse() && !RHS->hasOneUse()) + return nullptr; - if (Value *V = SimplifyAddInst(LHS, RHS, I.hasNoSignedWrap(), - I.hasNoUnsignedWrap(), DL)) - return ReplaceInstUsesWith(I, V); + Value *X = nullptr, *Y = nullptr, *Z = nullptr; + const APInt *C1 = nullptr, *C2 = nullptr; + + // if ONE is on other side, swap + if (match(RHS, m_Add(m_Value(X), m_One()))) + std::swap(LHS, RHS); + + if (match(LHS, m_Add(m_Value(X), m_One()))) { + // if XOR on other side, swap + if (match(RHS, m_Xor(m_Value(Y), m_APInt(C1)))) + std::swap(X, RHS); + + if (match(X, m_Xor(m_Value(Y), m_APInt(C1)))) { + // X = XOR(Y, C1), Y = OR(Z, C2), C2 = NOT(C1) ==> X == NOT(AND(Z, C1)) + // ADD(ADD(X, 1), RHS) == ADD(X, ADD(RHS, 1)) == SUB(RHS, AND(Z, C1)) + if (match(Y, m_Or(m_Value(Z), m_APInt(C2))) && (*C2 == ~(*C1))) { + Value *NewAnd = Builder->CreateAnd(Z, *C1); + return Builder->CreateSub(RHS, NewAnd, "sub"); + } else if (match(Y, m_And(m_Value(Z), m_APInt(C2))) && (*C1 == *C2)) { + // X = XOR(Y, C1), Y = AND(Z, C2), C2 == C1 ==> X == NOT(OR(Z, ~C1)) + // ADD(ADD(X, 1), RHS) == ADD(X, ADD(RHS, 1)) == SUB(RHS, OR(Z, ~C1)) + Value *NewOr = Builder->CreateOr(Z, ~(*C1)); + return Builder->CreateSub(RHS, NewOr, "sub"); + } + } + } + + // Restore LHS and RHS + LHS = I.getOperand(0); + RHS = I.getOperand(1); + + // if XOR is on other side, swap + if (match(RHS, m_Xor(m_Value(Y), m_APInt(C1)))) + std::swap(LHS, RHS); + + // C2 is ODD + // LHS = XOR(Y, C1), Y = AND(Z, C2), C1 == (C2 + 1) => LHS == NEG(OR(Z, ~C2)) + // ADD(LHS, RHS) == SUB(RHS, OR(Z, ~C2)) + if (match(LHS, m_Xor(m_Value(Y), m_APInt(C1)))) + if (C1->countTrailingZeros() == 0) + if (match(Y, m_And(m_Value(Z), m_APInt(C2))) && *C1 == (*C2 + 1)) { + Value *NewOr = Builder->CreateOr(Z, ~(*C2)); + return Builder->CreateSub(RHS, NewOr, "sub"); + } + return nullptr; +} + +Instruction *InstCombiner::visitAdd(BinaryOperator &I) { + bool Changed = SimplifyAssociativeOrCommutative(I); + Value *LHS = I.getOperand(0), *RHS = I.getOperand(1); + + if (Value *V = SimplifyVectorOp(I)) + return ReplaceInstUsesWith(I, V); + + if (Value *V = SimplifyAddInst(LHS, RHS, I.hasNoSignedWrap(), + I.hasNoUnsignedWrap(), DL)) + return ReplaceInstUsesWith(I, V); - // (A*B)+(A*C) -> A*(B+C) etc + // (A*B)+(A*C) -> A*(B+C) etc if (Value *V = SimplifyUsingDistributiveLaws(I)) return 
ReplaceInstUsesWith(I, V); @@ -1089,23 +1126,8 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) { if (Value *V = dyn_castNegVal(RHS)) return BinaryOperator::CreateSub(LHS, V); - - { - Constant *C2; - if (Value *X = dyn_castFoldableMul(LHS, C2)) { - if (X == RHS) // X*C + X --> X * (C+1) - return BinaryOperator::CreateMul(RHS, AddOne(C2)); - - // X*C1 + X*C2 --> X * (C1+C2) - Constant *C1; - if (X == dyn_castFoldableMul(RHS, C1)) - return BinaryOperator::CreateMul(X, ConstantExpr::getAdd(C1, C2)); - } - - // X + X*C --> X * (C+1) - if (dyn_castFoldableMul(RHS, C2) == LHS) - return BinaryOperator::CreateMul(LHS, AddOne(C2)); - } + if (Value *V = checkForNegativeOperand(I, Builder)) + return ReplaceInstUsesWith(I, V); // A+B --> A|B iff A and B have no bits set in common. if (IntegerType *IT = dyn_cast(I.getType())) { @@ -1123,29 +1145,6 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) { } } - // W*X + Y*Z --> W * (X+Z) iff W == Y - { - Value *W, *X, *Y, *Z; - if (match(LHS, m_Mul(m_Value(W), m_Value(X))) && - match(RHS, m_Mul(m_Value(Y), m_Value(Z)))) { - if (W != Y) { - if (W == Z) { - std::swap(Y, Z); - } else if (Y == X) { - std::swap(W, X); - } else if (X == Z) { - std::swap(Y, Z); - std::swap(W, X); - } - } - - if (W == Y) { - Value *NewAdd = Builder->CreateAdd(X, Z, LHS->getName()); - return BinaryOperator::CreateMul(W, NewAdd); - } - } - } - if (Constant *CRHS = dyn_cast(RHS)) { Value *X; if (match(LHS, m_Not(m_Value(X)))) // ~X + C --> (C-1) - X @@ -1554,9 +1553,9 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) { return BinaryOperator::CreateAnd(Op0, Builder->CreateNot(Y, Y->getName() + ".not")); - // 0 - (X sdiv C) -> (X sdiv -C) - if (match(Op1, m_SDiv(m_Value(X), m_Constant(C))) && - match(Op0, m_Zero())) + // 0 - (X sdiv C) -> (X sdiv -C) provided the negation doesn't overflow. + if (match(Op1, m_SDiv(m_Value(X), m_Constant(C))) && match(Op0, m_Zero()) && + !C->isMinSignedValue()) return BinaryOperator::CreateSDiv(X, ConstantExpr::getNeg(C)); // 0 - (X << Y) -> (-X << Y) when X is freely negatable. @@ -1564,19 +1563,6 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) { if (Value *XNeg = dyn_castNegVal(X)) return BinaryOperator::CreateShl(XNeg, Y); - // X - X*C --> X * (1-C) - if (match(Op1, m_Mul(m_Specific(Op0), m_Constant(CI)))) { - Constant *CP1 = ConstantExpr::getSub(ConstantInt::get(I.getType(),1), CI); - return BinaryOperator::CreateMul(Op0, CP1); - } - - // X - X< X * (1-(1< X + A*B // X - -A*B -> X + A*B Value *A, *B; @@ -1593,16 +1579,6 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) { } } - Constant *C1; - if (Value *X = dyn_castFoldableMul(Op0, C1)) { - if (X == Op1) // X*C - X --> X * (C-1) - return BinaryOperator::CreateMul(Op1, SubOne(C1)); - - Constant *C2; // X*C1 - X*C2 -> X * (C1-C2) - if (X == dyn_castFoldableMul(Op1, C2)) - return BinaryOperator::CreateMul(X, ConstantExpr::getSub(C1, C2)); - } - // Optimize pointer differences into the same array into a size. Consider: // &A[10] - &A[0]: we should compile this to "10". 
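The first pattern handled by checkForNegativeOperand above, ADD(XOR(OR(Z, NOT(C)), C), 1) == NEG(AND(Z, C)), can be confirmed exhaustively on a narrow type. This standalone check brute-forces it over all 8-bit pairs using wrap-around arithmetic, mirroring LLVM's modular integer semantics:

#include <cassert>

int main() {
  for (unsigned Z = 0; Z < 256; ++Z)
    for (unsigned C = 0; C < 256; ++C) {
      // OR forces the ~C bits to one, XOR with C flips the remaining bits,
      // yielding NOT(AND(Z, C)); adding 1 turns NOT into negation.
      unsigned LHS = (((Z | (~C & 0xFF)) ^ C) + 1) & 0xFF;
      unsigned RHS = (0u - (Z & C)) & 0xFF;
      assert(LHS == RHS);
    }
  return 0;
}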
   if (DL) {
@@ -1617,7 +1593,7 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) {
         match(Op1, m_Trunc(m_PtrToInt(m_Value(RHSOp)))))
       if (Value *Res = OptimizePointerDifference(LHSOp, RHSOp, I.getType()))
         return ReplaceInstUsesWith(I, Res);
-    }
+  }

   return nullptr;
 }
diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index 4f5d65ab785f..431f73235113 100644
--- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -1996,29 +1996,6 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
     C1 = dyn_cast<ConstantInt>(C);
     C2 = dyn_cast<ConstantInt>(D);
     if (C1 && C2) {                               // (A & C1)|(B & C2)
-      // If we have: ((V + N) & C1) | (V & C2)
-      // .. and C2 = ~C1 and C2 is 0+1+ and (N & C2) == 0
-      // replace with V+N.
-      if (C1->getValue() == ~C2->getValue()) {
-        if ((C2->getValue() & (C2->getValue()+1)) == 0 && // C2 == 0+1+
-            match(A, m_Add(m_Value(V1), m_Value(V2)))) {
-          // Add commutes, try both ways.
-          if (V1 == B && MaskedValueIsZero(V2, C2->getValue()))
-            return ReplaceInstUsesWith(I, A);
-          if (V2 == B && MaskedValueIsZero(V1, C2->getValue()))
-            return ReplaceInstUsesWith(I, A);
-        }
-        // Or commutes, try both ways.
-        if ((C1->getValue() & (C1->getValue()+1)) == 0 &&
-            match(B, m_Add(m_Value(V1), m_Value(V2)))) {
-          // Add commutes, try both ways.
-          if (V1 == A && MaskedValueIsZero(V2, C1->getValue()))
-            return ReplaceInstUsesWith(I, B);
-          if (V2 == A && MaskedValueIsZero(V1, C1->getValue()))
-            return ReplaceInstUsesWith(I, B);
-        }
-      }
-
       if ((C1->getValue() & C2->getValue()) == 0) {
         // ((V | N) & C1) | (V & C2) --> (V|N) & (C1|C2)
         // iff (C1&C2) == 0 and (N&~C1) == 0
@@ -2469,6 +2446,12 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
     }
   }

+  // (A | B)^(~A) -> (A | ~B)
+  Value *A = nullptr, *B = nullptr;
+  if (match(Op0, m_Or(m_Value(A), m_Value(B))) &&
+      match(Op1, m_Not(m_Specific(A))))
+    return BinaryOperator::CreateOr(A, Builder->CreateNot(B));
+
   // (icmp1 A, B) ^ (icmp2 A, B) --> (icmp3 A, B)
   if (ICmpInst *RHS = dyn_cast<ICmpInst>(I.getOperand(1)))
     if (ICmpInst *LHS = dyn_cast<ICmpInst>(I.getOperand(0)))
diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp
index d4bdd75fa82a..658178d5914e 100644
--- a/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -421,6 +421,21 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
         return InsertValueInst::Create(Struct, II->getArgOperand(0), 0);
       }
     }
+
+    // We can strength-reduce this signed add into a regular add if we
+    // can prove that it will never overflow.
+ if (II->getIntrinsicID() == Intrinsic::sadd_with_overflow) { + Value *LHS = II->getArgOperand(0), *RHS = II->getArgOperand(1); + if (WillNotOverflowSignedAdd(LHS, RHS)) { + Value *Add = Builder->CreateNSWAdd(LHS, RHS); + Add->takeName(&CI); + Constant *V[] = {UndefValue::get(Add->getType()), Builder->getFalse()}; + StructType *ST = cast(II->getType()); + Constant *Struct = ConstantStruct::get(ST, V); + return InsertValueInst::Create(Struct, Add, 0); + } + } + break; case Intrinsic::usub_with_overflow: case Intrinsic::ssub_with_overflow: @@ -922,6 +937,20 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { break; } + case Intrinsic::AMDGPU_rcp: { + if (const ConstantFP *C = dyn_cast(II->getArgOperand(0))) { + const APFloat &ArgVal = C->getValueAPF(); + APFloat Val(ArgVal.getSemantics(), 1.0); + APFloat::opStatus Status = Val.divide(ArgVal, + APFloat::rmNearestTiesToEven); + // Only do this if it was exact and therefore not dependent on the + // rounding mode. + if (Status == APFloat::opOK) + return ReplaceInstUsesWith(CI, ConstantFP::get(II->getContext(), Val)); + } + + break; + } case Intrinsic::stackrestore: { // If the save is right next to the restore, remove the restore. This can // happen when variable allocas are DCE'd. diff --git a/lib/Transforms/InstCombine/InstCombineCasts.cpp b/lib/Transforms/InstCombine/InstCombineCasts.cpp index ff083d7926cc..b9c3d0f64718 100644 --- a/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -1909,9 +1909,9 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { } Instruction *InstCombiner::visitAddrSpaceCast(AddrSpaceCastInst &CI) { - // If the destination pointer element type is not the the same as the source's - // do the addrspacecast to the same type, and then the bitcast in the new - // address space. This allows the cast to be exposed to other transforms. + // If the destination pointer element type is not the same as the source's + // first do a bitcast to the destination type, and then the addrspacecast. + // This allows the cast to be exposed to other transforms. Value *Src = CI.getOperand(0); PointerType *SrcTy = cast(Src->getType()->getScalarType()); PointerType *DestTy = cast(CI.getType()->getScalarType()); diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp index 639d8319e0de..5e71c5c4b7cb 100644 --- a/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -683,26 +683,12 @@ Instruction *InstCombiner::FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS, } // If one of the GEPs has all zero indices, recurse. - bool AllZeros = true; - for (unsigned i = 1, e = GEPLHS->getNumOperands(); i != e; ++i) - if (!isa(GEPLHS->getOperand(i)) || - !cast(GEPLHS->getOperand(i))->isNullValue()) { - AllZeros = false; - break; - } - if (AllZeros) + if (GEPLHS->hasAllZeroIndices()) return FoldGEPICmp(GEPRHS, GEPLHS->getOperand(0), ICmpInst::getSwappedPredicate(Cond), I); // If the other GEP has all zero indices, recurse. 
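The AMDGPU_rcp fold above only fires when APFloat reports the division as exact, so the folded constant cannot depend on the rounding mode passed in. A minimal sketch of that check in isolation (tryFoldRcp is an illustrative name, not the pass's API):

#include "llvm/ADT/APFloat.h"

bool tryFoldRcp(const llvm::APFloat &Arg, llvm::APFloat &Result) {
  llvm::APFloat Val(Arg.getSemantics(), 1); // 1.0 in Arg's semantics
  llvm::APFloat::opStatus Status =
      Val.divide(Arg, llvm::APFloat::rmNearestTiesToEven);
  if (Status != llvm::APFloat::opOK)
    return false; // inexact or invalid: leave the intrinsic in place
  Result = Val;
  return true;
}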
- AllZeros = true; - for (unsigned i = 1, e = GEPRHS->getNumOperands(); i != e; ++i) - if (!isa(GEPRHS->getOperand(i)) || - !cast(GEPRHS->getOperand(i))->isNullValue()) { - AllZeros = false; - break; - } - if (AllZeros) + if (GEPRHS->hasAllZeroIndices()) return FoldGEPICmp(GEPLHS, GEPRHS->getOperand(0), Cond, I); bool GEPsInBounds = GEPLHS->isInBounds() && GEPRHS->isInBounds(); @@ -2042,9 +2028,13 @@ static Instruction *ProcessUAddIdiom(Instruction &I, Value *OrigAddV, /// replacement required. static Instruction *ProcessUMulZExtIdiom(ICmpInst &I, Value *MulVal, Value *OtherVal, InstCombiner &IC) { + // Don't bother doing this transformation for pointers, don't do it for + // vectors. + if (!isa(MulVal->getType())) + return nullptr; + assert(I.getOperand(0) == MulVal || I.getOperand(1) == MulVal); assert(I.getOperand(0) == OtherVal || I.getOperand(1) == OtherVal); - assert(isa(MulVal->getType())); Instruction *MulInstr = cast(MulVal); assert(MulInstr->getOpcode() == Instruction::Mul); diff --git a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index 66d09388f460..e9c25d32c281 100644 --- a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -50,99 +50,102 @@ static bool pointsToConstantGlobal(Value *V) { /// can optimize this. static bool isOnlyCopiedFromConstantGlobal(Value *V, MemTransferInst *&TheCopy, - SmallVectorImpl &ToDelete, - bool IsOffset = false) { + SmallVectorImpl &ToDelete) { // We track lifetime intrinsics as we encounter them. If we decide to go // ahead and replace the value with the global, this lets the caller quickly // eliminate the markers. - for (Use &U : V->uses()) { - Instruction *I = cast(U.getUser()); - - if (LoadInst *LI = dyn_cast(I)) { - // Ignore non-volatile loads, they are always ok. - if (!LI->isSimple()) return false; - continue; - } - - if (isa(I) || isa(I)) { - // If uses of the bitcast are ok, we are ok. - if (!isOnlyCopiedFromConstantGlobal(I, TheCopy, ToDelete, IsOffset)) - return false; - continue; - } - if (GetElementPtrInst *GEP = dyn_cast(I)) { - // If the GEP has all zero indices, it doesn't offset the pointer. If it - // doesn't, it does. - if (!isOnlyCopiedFromConstantGlobal( - GEP, TheCopy, ToDelete, IsOffset || !GEP->hasAllZeroIndices())) - return false; - continue; - } + SmallVector, 35> ValuesToInspect; + ValuesToInspect.push_back(std::make_pair(V, false)); + while (!ValuesToInspect.empty()) { + auto ValuePair = ValuesToInspect.pop_back_val(); + const bool IsOffset = ValuePair.second; + for (auto &U : ValuePair.first->uses()) { + Instruction *I = cast(U.getUser()); + + if (LoadInst *LI = dyn_cast(I)) { + // Ignore non-volatile loads, they are always ok. + if (!LI->isSimple()) return false; + continue; + } - if (CallSite CS = I) { - // If this is the function being called then we treat it like a load and - // ignore it. - if (CS.isCallee(&U)) + if (isa(I) || isa(I)) { + // If uses of the bitcast are ok, we are ok. + ValuesToInspect.push_back(std::make_pair(I, IsOffset)); continue; + } + if (GetElementPtrInst *GEP = dyn_cast(I)) { + // If the GEP has all zero indices, it doesn't offset the pointer. If it + // doesn't, it does. + ValuesToInspect.push_back( + std::make_pair(I, IsOffset || !GEP->hasAllZeroIndices())); + continue; + } - // Inalloca arguments are clobbered by the call. 
- unsigned ArgNo = CS.getArgumentNo(&U); - if (CS.isInAllocaArgument(ArgNo)) - return false; + if (CallSite CS = I) { + // If this is the function being called then we treat it like a load and + // ignore it. + if (CS.isCallee(&U)) + continue; - // If this is a readonly/readnone call site, then we know it is just a - // load (but one that potentially returns the value itself), so we can - // ignore it if we know that the value isn't captured. - if (CS.onlyReadsMemory() && - (CS.getInstruction()->use_empty() || CS.doesNotCapture(ArgNo))) - continue; + // Inalloca arguments are clobbered by the call. + unsigned ArgNo = CS.getArgumentNo(&U); + if (CS.isInAllocaArgument(ArgNo)) + return false; - // If this is being passed as a byval argument, the caller is making a - // copy, so it is only a read of the alloca. - if (CS.isByValArgument(ArgNo)) - continue; - } + // If this is a readonly/readnone call site, then we know it is just a + // load (but one that potentially returns the value itself), so we can + // ignore it if we know that the value isn't captured. + if (CS.onlyReadsMemory() && + (CS.getInstruction()->use_empty() || CS.doesNotCapture(ArgNo))) + continue; + + // If this is being passed as a byval argument, the caller is making a + // copy, so it is only a read of the alloca. + if (CS.isByValArgument(ArgNo)) + continue; + } - // Lifetime intrinsics can be handled by the caller. - if (IntrinsicInst *II = dyn_cast(I)) { - if (II->getIntrinsicID() == Intrinsic::lifetime_start || - II->getIntrinsicID() == Intrinsic::lifetime_end) { - assert(II->use_empty() && "Lifetime markers have no result to use!"); - ToDelete.push_back(II); - continue; + // Lifetime intrinsics can be handled by the caller. + if (IntrinsicInst *II = dyn_cast(I)) { + if (II->getIntrinsicID() == Intrinsic::lifetime_start || + II->getIntrinsicID() == Intrinsic::lifetime_end) { + assert(II->use_empty() && "Lifetime markers have no result to use!"); + ToDelete.push_back(II); + continue; + } } - } - // If this is isn't our memcpy/memmove, reject it as something we can't - // handle. - MemTransferInst *MI = dyn_cast(I); - if (!MI) - return false; + // If this is isn't our memcpy/memmove, reject it as something we can't + // handle. + MemTransferInst *MI = dyn_cast(I); + if (!MI) + return false; - // If the transfer is using the alloca as a source of the transfer, then - // ignore it since it is a load (unless the transfer is volatile). - if (U.getOperandNo() == 1) { - if (MI->isVolatile()) return false; - continue; - } + // If the transfer is using the alloca as a source of the transfer, then + // ignore it since it is a load (unless the transfer is volatile). + if (U.getOperandNo() == 1) { + if (MI->isVolatile()) return false; + continue; + } - // If we already have seen a copy, reject the second one. - if (TheCopy) return false; + // If we already have seen a copy, reject the second one. + if (TheCopy) return false; - // If the pointer has been offset from the start of the alloca, we can't - // safely handle this. - if (IsOffset) return false; + // If the pointer has been offset from the start of the alloca, we can't + // safely handle this. + if (IsOffset) return false; - // If the memintrinsic isn't using the alloca as the dest, reject it. - if (U.getOperandNo() != 0) return false; + // If the memintrinsic isn't using the alloca as the dest, reject it. + if (U.getOperandNo() != 0) return false; - // If the source of the memcpy/move is not a constant global, reject it. 
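The isOnlyCopiedFromConstantGlobal rewrite above converts recursion through bitcasts and GEPs into an explicit worklist of (value, IsOffset) pairs, so arbitrarily deep user chains cannot overflow the call stack. A hedged, generic sketch of that shape (walkUsers and its parameters are illustrative, not the pass's code):

#include <utility>
#include <vector>

template <typename ValueT, typename VisitFn>
bool walkUsers(ValueT *Root, VisitFn Visit) {
  std::vector<std::pair<ValueT *, bool> > Worklist;
  Worklist.push_back(std::make_pair(Root, false)); // IsOffset starts false
  while (!Worklist.empty()) {
    std::pair<ValueT *, bool> Item = Worklist.back();
    Worklist.pop_back();
    // Visit may push transparent users (casts, all-zero GEPs) back onto
    // Worklist, propagating or setting the IsOffset flag; returning false
    // rejects the whole walk, mirroring the early "return false" paths.
    if (!Visit(Item.first, Item.second, Worklist))
      return false;
  }
  return true;
}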
- if (!pointsToConstantGlobal(MI->getSource())) - return false; + // If the source of the memcpy/move is not a constant global, reject it. + if (!pointsToConstantGlobal(MI->getSource())) + return false; - // Otherwise, the transform is safe. Remember the copy instruction. - TheCopy = MI; + // Otherwise, the transform is safe. Remember the copy instruction. + TheCopy = MI; + } } return true; } @@ -470,7 +473,7 @@ static Instruction *InstCombineStoreToCast(InstCombiner &IC, StoreInst &SI) { User *CI = cast(SI.getOperand(1)); Value *CastOp = CI->getOperand(0); - Type *DestPTy = cast(CI->getType())->getElementType(); + Type *DestPTy = CI->getType()->getPointerElementType(); PointerType *SrcTy = dyn_cast(CastOp->getType()); if (!SrcTy) return nullptr; @@ -515,8 +518,7 @@ static Instruction *InstCombineStoreToCast(InstCombiner &IC, StoreInst &SI) { // If the pointers point into different address spaces don't do the // transformation. - if (SrcTy->getAddressSpace() != - cast(CI->getType())->getAddressSpace()) + if (SrcTy->getAddressSpace() != CI->getType()->getPointerAddressSpace()) return nullptr; // If the pointers point to values of different sizes don't do the diff --git a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index 9996ebc2e744..6c6e7d815163 100644 --- a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -203,8 +203,11 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) { Value *X; Constant *C1; if (match(Op0, m_OneUse(m_Add(m_Value(X), m_Constant(C1))))) { - Value *Add = Builder->CreateMul(X, Op1); - return BinaryOperator::CreateAdd(Add, Builder->CreateMul(C1, Op1)); + Value *Mul = Builder->CreateMul(C1, Op1); + // Only go forward with the transform if C1*CI simplifies to a tidier + // constant. + if (!match(Mul, m_Mul(m_Value(), m_Value()))) + return BinaryOperator::CreateAdd(Builder->CreateMul(X, Op1), Mul); } } } @@ -990,6 +993,10 @@ Instruction *InstCombiner::visitSDiv(BinaryOperator &I) { } if (Constant *RHS = dyn_cast(Op1)) { + // X/INT_MIN -> X == INT_MIN + if (RHS->isMinSignedValue()) + return new ZExtInst(Builder->CreateICmpEQ(Op0, Op1), I.getType()); + // -X/C --> X/-C provided the negation doesn't overflow. if (SubOperator *Sub = dyn_cast(Op0)) if (match(Sub->getOperand(0), m_Zero()) && Sub->hasNoSignedWrap()) diff --git a/lib/Transforms/InstCombine/InstCombineShifts.cpp b/lib/Transforms/InstCombine/InstCombineShifts.cpp index cc6665c947d7..3d0cc05f30c6 100644 --- a/lib/Transforms/InstCombine/InstCombineShifts.cpp +++ b/lib/Transforms/InstCombine/InstCombineShifts.cpp @@ -488,7 +488,7 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, Constant *Op1, } - // If the operand is an bitwise operator with a constant RHS, and the + // If the operand is a bitwise operator with a constant RHS, and the // shift is the only use, we can pull it out of the shift. if (ConstantInt *Op0C = dyn_cast(Op0BO->getOperand(1))) { bool isValid = true; // Valid only for And, Or, Xor @@ -789,11 +789,6 @@ Instruction *InstCombiner::visitAShr(BinaryOperator &I) { // have a sign-extend idiom. Value *X; if (match(Op0, m_Shl(m_Value(X), m_Specific(Op1)))) { - // If the left shift is just shifting out partial signbits, delete the - // extension. - if (cast(Op0)->hasNoSignedWrap()) - return ReplaceInstUsesWith(I, X); - // If the input is an extension from the shifted amount value, e.g. 
// %x = zext i8 %A to i32 // %y = shl i32 %x, 24 @@ -820,10 +815,5 @@ Instruction *InstCombiner::visitAShr(BinaryOperator &I) { APInt::getSignBit(I.getType()->getScalarSizeInBits()))) return BinaryOperator::CreateLShr(Op0, Op1); - // Arithmetic shifting an all-sign-bit value is a no-op. - unsigned NumSignBits = ComputeNumSignBits(Op0); - if (NumSignBits == Op0->getType()->getScalarSizeInBits()) - return ReplaceInstUsesWith(I, Op0); - return nullptr; } diff --git a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp index 8c5e202b5c51..cb165844bdce 100644 --- a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp +++ b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp @@ -144,7 +144,7 @@ Instruction *InstCombiner::scalarizePHI(ExtractElementInst &EI, PHINode *PN) { // If the operand is the PHI induction variable: if (PHIInVal == PHIUser) { // Scalarize the binary operation. Its first operand is the - // scalar PHI and the second operand is extracted from the other + // scalar PHI, and the second operand is extracted from the other // vector operand. BinaryOperator *B0 = cast(PHIUser); unsigned opId = (B0->getOperand(0) == PN) ? 1 : 0; @@ -361,7 +361,7 @@ static bool CollectSingleShuffleElements(Value *V, Value *LHS, Value *RHS, unsigned InsertedIdx = cast(IdxOp)->getZExtValue(); if (isa(ScalarOp)) { // inserting undef into vector. - // Okay, we can handle this if the vector we are insertinting into is + // We can handle this if the vector we are inserting into is // transitively ok. if (CollectSingleShuffleElements(VecOp, LHS, RHS, Mask)) { // If so, update the mask to reflect the inserted undef. @@ -376,7 +376,7 @@ static bool CollectSingleShuffleElements(Value *V, Value *LHS, Value *RHS, // This must be extracting from either LHS or RHS. if (EI->getOperand(0) == LHS || EI->getOperand(0) == RHS) { - // Okay, we can handle this if the vector we are insertinting into is + // We can handle this if the vector we are inserting into is // transitively ok. if (CollectSingleShuffleElements(VecOp, LHS, RHS, Mask)) { // If so, update the mask to reflect the inserted value. @@ -403,7 +403,7 @@ static bool CollectSingleShuffleElements(Value *V, Value *LHS, Value *RHS, /// We are building a shuffle to create V, which is a sequence of insertelement, /// extractelement pairs. If PermittedRHS is set, then we must either use it or -/// not rely on the second vector source. Return an std::pair containing the +/// not rely on the second vector source. Return a std::pair containing the /// left and right vectors of the proposed shuffle (or 0), and set the Mask /// parameter as required. /// diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp index 991ad796a7e3..d3648e2d0505 100644 --- a/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -42,6 +42,7 @@ #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/MemoryBuiltins.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/CFG.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/GetElementPtrTypeIterator.h" @@ -395,6 +396,127 @@ static bool RightDistributesOverLeft(Instruction::BinaryOps LOp, return false; } +/// This function returns identity value for given opcode, which can be used to +/// factor patterns like (X * 2) + X ==> (X * 2) + (X * 1) ==> X * (2 + 1). 
+static Value *getIdentityValue(Instruction::BinaryOps OpCode, Value *V) { + if (isa(V)) + return nullptr; + + if (OpCode == Instruction::Mul) + return ConstantInt::get(V->getType(), 1); + + // TODO: We can handle other cases e.g. Instruction::And, Instruction::Or etc. + + return nullptr; +} + +/// This function factors binary ops which can be combined using distributive +/// laws. This also factor SHL as MUL e.g. SHL(X, 2) ==> MUL(X, 4). +static Instruction::BinaryOps +getBinOpsForFactorization(BinaryOperator *Op, Value *&LHS, Value *&RHS) { + if (!Op) + return Instruction::BinaryOpsEnd; + + if (Op->getOpcode() == Instruction::Shl) { + if (Constant *CST = dyn_cast(Op->getOperand(1))) { + // The multiplier is really 1 << CST. + RHS = ConstantExpr::getShl(ConstantInt::get(Op->getType(), 1), CST); + LHS = Op->getOperand(0); + return Instruction::Mul; + } + } + + // TODO: We can add other conversions e.g. shr => div etc. + + LHS = Op->getOperand(0); + RHS = Op->getOperand(1); + return Op->getOpcode(); +} + +/// This tries to simplify binary operations by factorizing out common terms +/// (e. g. "(A*B)+(A*C)" -> "A*(B+C)"). +static Value *tryFactorization(InstCombiner::BuilderTy *Builder, + const DataLayout *DL, BinaryOperator &I, + Instruction::BinaryOps InnerOpcode, Value *A, + Value *B, Value *C, Value *D) { + + // If any of A, B, C, D are null, we can not factor I, return early. + // Checking A and C should be enough. + if (!A || !C || !B || !D) + return nullptr; + + Value *SimplifiedInst = nullptr; + Value *LHS = I.getOperand(0), *RHS = I.getOperand(1); + Instruction::BinaryOps TopLevelOpcode = I.getOpcode(); + + // Does "X op' Y" always equal "Y op' X"? + bool InnerCommutative = Instruction::isCommutative(InnerOpcode); + + // Does "X op' (Y op Z)" always equal "(X op' Y) op (X op' Z)"? + if (LeftDistributesOverRight(InnerOpcode, TopLevelOpcode)) + // Does the instruction have the form "(A op' B) op (A op' D)" or, in the + // commutative case, "(A op' B) op (C op' A)"? + if (A == C || (InnerCommutative && A == D)) { + if (A != C) + std::swap(C, D); + // Consider forming "A op' (B op D)". + // If "B op D" simplifies then it can be formed with no cost. + Value *V = SimplifyBinOp(TopLevelOpcode, B, D, DL); + // If "B op D" doesn't simplify then only go on if both of the existing + // operations "A op' B" and "C op' D" will be zapped as no longer used. + if (!V && LHS->hasOneUse() && RHS->hasOneUse()) + V = Builder->CreateBinOp(TopLevelOpcode, B, D, RHS->getName()); + if (V) { + SimplifiedInst = Builder->CreateBinOp(InnerOpcode, A, V); + } + } + + // Does "(X op Y) op' Z" always equal "(X op' Z) op (Y op' Z)"? + if (!SimplifiedInst && RightDistributesOverLeft(TopLevelOpcode, InnerOpcode)) + // Does the instruction have the form "(A op' B) op (C op' B)" or, in the + // commutative case, "(A op' B) op (B op' D)"? + if (B == D || (InnerCommutative && B == C)) { + if (B != D) + std::swap(C, D); + // Consider forming "(A op C) op' B". + // If "A op C" simplifies then it can be formed with no cost. + Value *V = SimplifyBinOp(TopLevelOpcode, A, C, DL); + + // If "A op C" doesn't simplify then only go on if both of the existing + // operations "A op' B" and "C op' D" will be zapped as no longer used. 
+ if (!V && LHS->hasOneUse() && RHS->hasOneUse()) + V = Builder->CreateBinOp(TopLevelOpcode, A, C, LHS->getName()); + if (V) { + SimplifiedInst = Builder->CreateBinOp(InnerOpcode, V, B); + } + } + + if (SimplifiedInst) { + ++NumFactor; + SimplifiedInst->takeName(&I); + + // Check if we can add NSW flag to SimplifiedInst. If so, set NSW flag. + // TODO: Check for NUW. + if (BinaryOperator *BO = dyn_cast(SimplifiedInst)) { + if (isa(SimplifiedInst)) { + bool HasNSW = false; + if (isa(&I)) + HasNSW = I.hasNoSignedWrap(); + + if (BinaryOperator *Op0 = dyn_cast(LHS)) + if (isa(Op0)) + HasNSW &= Op0->hasNoSignedWrap(); + + if (BinaryOperator *Op1 = dyn_cast(RHS)) + if (isa(Op1)) + HasNSW &= Op1->hasNoSignedWrap(); + BO->setHasNoSignedWrap(HasNSW); + } + } + } + return SimplifiedInst; +} + /// SimplifyUsingDistributiveLaws - This tries to simplify binary operations /// which some other binary operation distributes over either by factorizing /// out common terms (eg "(A*B)+(A*C)" -> "A*(B+C)") or expanding out if this @@ -404,65 +526,33 @@ Value *InstCombiner::SimplifyUsingDistributiveLaws(BinaryOperator &I) { Value *LHS = I.getOperand(0), *RHS = I.getOperand(1); BinaryOperator *Op0 = dyn_cast(LHS); BinaryOperator *Op1 = dyn_cast(RHS); - Instruction::BinaryOps TopLevelOpcode = I.getOpcode(); // op // Factorization. - if (Op0 && Op1 && Op0->getOpcode() == Op1->getOpcode()) { - // The instruction has the form "(A op' B) op (C op' D)". Try to factorize - // a common term. - Value *A = Op0->getOperand(0), *B = Op0->getOperand(1); - Value *C = Op1->getOperand(0), *D = Op1->getOperand(1); - Instruction::BinaryOps InnerOpcode = Op0->getOpcode(); // op' + Value *A = nullptr, *B = nullptr, *C = nullptr, *D = nullptr; + Instruction::BinaryOps LHSOpcode = getBinOpsForFactorization(Op0, A, B); + Instruction::BinaryOps RHSOpcode = getBinOpsForFactorization(Op1, C, D); + + // The instruction has the form "(A op' B) op (C op' D)". Try to factorize + // a common term. + if (LHSOpcode == RHSOpcode) { + if (Value *V = tryFactorization(Builder, DL, I, LHSOpcode, A, B, C, D)) + return V; + } - // Does "X op' Y" always equal "Y op' X"? - bool InnerCommutative = Instruction::isCommutative(InnerOpcode); - - // Does "X op' (Y op Z)" always equal "(X op' Y) op (X op' Z)"? - if (LeftDistributesOverRight(InnerOpcode, TopLevelOpcode)) - // Does the instruction have the form "(A op' B) op (A op' D)" or, in the - // commutative case, "(A op' B) op (C op' A)"? - if (A == C || (InnerCommutative && A == D)) { - if (A != C) - std::swap(C, D); - // Consider forming "A op' (B op D)". - // If "B op D" simplifies then it can be formed with no cost. - Value *V = SimplifyBinOp(TopLevelOpcode, B, D, DL); - // If "B op D" doesn't simplify then only go on if both of the existing - // operations "A op' B" and "C op' D" will be zapped as no longer used. - if (!V && Op0->hasOneUse() && Op1->hasOneUse()) - V = Builder->CreateBinOp(TopLevelOpcode, B, D, Op1->getName()); - if (V) { - ++NumFactor; - V = Builder->CreateBinOp(InnerOpcode, A, V); - V->takeName(&I); - return V; - } - } + // The instruction has the form "(A op' B) op (C)". Try to factorize common + // term. + if (Value *V = tryFactorization(Builder, DL, I, LHSOpcode, A, B, RHS, + getIdentityValue(LHSOpcode, RHS))) + return V; - // Does "(X op Y) op' Z" always equal "(X op' Z) op (Y op' Z)"? - if (RightDistributesOverLeft(TopLevelOpcode, InnerOpcode)) - // Does the instruction have the form "(A op' B) op (C op' B)" or, in the - // commutative case, "(A op' B) op (B op' D)"? 
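getBinOpsForFactorization above treats a shift-left by a constant as a multiplication, so for example (X << 2) + X factorizes as X*4 + X*1, which the distributive-law machinery then folds to X*5. The identity is easy to brute-force on a narrow type, again with wrap-around semantics:

#include <cassert>

int main() {
  for (unsigned X = 0; X < 256; ++X) {
    // (X << 2) + X and X * 5, both reduced modulo 2^8.
    unsigned Lhs = (((X << 2) & 0xFF) + X) & 0xFF;
    unsigned Rhs = (X * 5) & 0xFF;
    assert(Lhs == Rhs);
  }
  return 0;
}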
- if (B == D || (InnerCommutative && B == C)) { - if (B != D) - std::swap(C, D); - // Consider forming "(A op C) op' B". - // If "A op C" simplifies then it can be formed with no cost. - Value *V = SimplifyBinOp(TopLevelOpcode, A, C, DL); - // If "A op C" doesn't simplify then only go on if both of the existing - // operations "A op' B" and "C op' D" will be zapped as no longer used. - if (!V && Op0->hasOneUse() && Op1->hasOneUse()) - V = Builder->CreateBinOp(TopLevelOpcode, A, C, Op0->getName()); - if (V) { - ++NumFactor; - V = Builder->CreateBinOp(InnerOpcode, V, B); - V->takeName(&I); - return V; - } - } - } + // The instruction has the form "(B) op (C op' D)". Try to factorize common + // term. + if (Value *V = tryFactorization(Builder, DL, I, RHSOpcode, LHS, + getIdentityValue(RHSOpcode, LHS), C, D)) + return V; // Expansion. + Instruction::BinaryOps TopLevelOpcode = I.getOpcode(); if (Op0 && RightDistributesOverLeft(Op0->getOpcode(), TopLevelOpcode)) { // The instruction has the form "(A op' B) op C". See if expanding it out // to "(A op C) op' (B op C)" results in simplifications. @@ -1030,6 +1120,12 @@ Value *InstCombiner::Descale(Value *Val, APInt Scale, bool &NoSignedWrap) { return nullptr; } + // If Op is zero then Val = Op * Scale. + if (match(Op, m_Zero())) { + NoSignedWrap = true; + return Op; + } + // We know that we can successfully descale, so from here on we can safely // modify the IR. Op holds the descaled version of the deepest term in the // expression. NoSignedWrap is 'true' if multiplying Op by Scale is known @@ -1106,6 +1202,11 @@ static Value *CreateBinOpAsGiven(BinaryOperator &Inst, Value *LHS, Value *RHS, Value *InstCombiner::SimplifyVectorOp(BinaryOperator &Inst) { if (!Inst.getType()->isVectorTy()) return nullptr; + // It may not be safe to reorder shuffles and things like div, urem, etc. + // because we may trap when executing those ops on unknown vector elements. + // See PR20059. 
+ if (!isSafeToSpeculativelyExecute(&Inst, DL)) return nullptr; + unsigned VWidth = cast(Inst.getType())->getNumElements(); Value *LHS = Inst.getOperand(0), *RHS = Inst.getOperand(1); assert(cast(LHS->getType())->getNumElements() == VWidth); @@ -1138,7 +1239,9 @@ Value *InstCombiner::SimplifyVectorOp(BinaryOperator &Inst) { if (isa(RHS)) Shuffle = cast(RHS); if (isa(LHS)) C1 = cast(LHS); if (isa(RHS)) C1 = cast(RHS); - if (Shuffle && C1 && isa(Shuffle->getOperand(1)) && + if (Shuffle && C1 && + (isa(C1) || isa(C1)) && + isa(Shuffle->getOperand(1)) && Shuffle->getType() == Shuffle->getOperand(0)->getType()) { SmallVector ShMask = Shuffle->getShuffleMask(); // Find constant C2 that has property: @@ -1479,9 +1582,8 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { Builder->CreateGEP(StrippedPtr, Idx, GEP.getName()); // V and GEP are both pointer types --> BitCast - if (StrippedPtrTy->getAddressSpace() == GEP.getPointerAddressSpace()) - return new BitCastInst(NewGEP, GEP.getType()); - return new AddrSpaceCastInst(NewGEP, GEP.getType()); + return CastInst::CreatePointerBitCastOrAddrSpaceCast(NewGEP, + GEP.getType()); } // Transform things like: @@ -1513,9 +1615,8 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { Builder->CreateGEP(StrippedPtr, NewIdx, GEP.getName()); // The NewGEP must be pointer typed, so must the old one -> BitCast - if (StrippedPtrTy->getAddressSpace() == GEP.getPointerAddressSpace()) - return new BitCastInst(NewGEP, GEP.getType()); - return new AddrSpaceCastInst(NewGEP, GEP.getType()); + return CastInst::CreatePointerBitCastOrAddrSpaceCast(NewGEP, + GEP.getType()); } } } @@ -1555,9 +1656,8 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { Builder->CreateInBoundsGEP(StrippedPtr, Off, GEP.getName()) : Builder->CreateGEP(StrippedPtr, Off, GEP.getName()); // The NewGEP must be pointer typed, so must the old one -> BitCast - if (StrippedPtrTy->getAddressSpace() == GEP.getPointerAddressSpace()) - return new BitCastInst(NewGEP, GEP.getType()); - return new AddrSpaceCastInst(NewGEP, GEP.getType()); + return CastInst::CreatePointerBitCastOrAddrSpaceCast(NewGEP, + GEP.getType()); } } } @@ -2627,9 +2727,18 @@ bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) { // If the user is one of our immediate successors, and if that successor // only has us as a predecessors (we'd have to split the critical edge // otherwise), we can keep going. - if (UserIsSuccessor && UserParent->getSinglePredecessor()) + if (UserIsSuccessor && UserParent->getSinglePredecessor()) { // Okay, the CFG is simple enough, try to sink this instruction. 
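The three GEP hunks above replace duplicated BitCast-or-AddrSpaceCast branches with a single call to CastInst::CreatePointerBitCastOrAddrSpaceCast; a hedged sketch of the decision that helper encapsulates (pointerCastSketch is an illustrative name):

#include "llvm/IR/Instructions.h"

llvm::CastInst *pointerCastSketch(llvm::Value *V, llvm::Type *DestTy) {
  // A bitcast cannot change address spaces, so fall back to an
  // addrspacecast whenever source and destination spaces differ.
  if (V->getType()->getPointerAddressSpace() ==
      DestTy->getPointerAddressSpace())
    return new llvm::BitCastInst(V, DestTy);
  return new llvm::AddrSpaceCastInst(V, DestTy);
}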
- MadeIRChange |= TryToSinkInstruction(I, UserParent); + if (TryToSinkInstruction(I, UserParent)) { + MadeIRChange = true; + // We'll add uses of the sunk instruction below, but since sinking + // can expose opportunities for it's *operands* add them to the + // worklist + for (Use &U : I->operands()) + if (Instruction *OpI = dyn_cast(U.get())) + Worklist.Add(OpI); + } + } } } diff --git a/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/lib/Transforms/Instrumentation/AddressSanitizer.cpp index 6dbcde03cf2d..124ffe2f8f87 100644 --- a/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -16,6 +16,7 @@ #include "llvm/Transforms/Instrumentation.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallString.h" @@ -44,7 +45,6 @@ #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/ModuleUtils.h" -#include "llvm/Transforms/Utils/SpecialCaseList.h" #include #include #include @@ -79,7 +79,7 @@ static const char *const kAsanUnregisterGlobalsName = "__asan_unregister_globals"; static const char *const kAsanPoisonGlobalsName = "__asan_before_dynamic_init"; static const char *const kAsanUnpoisonGlobalsName = "__asan_after_dynamic_init"; -static const char *const kAsanInitName = "__asan_init_v3"; +static const char *const kAsanInitName = "__asan_init_v4"; static const char *const kAsanCovModuleInitName = "__sanitizer_cov_module_init"; static const char *const kAsanCovName = "__sanitizer_cov"; static const char *const kAsanPtrCmp = "__sanitizer_ptr_cmp"; @@ -148,9 +148,6 @@ static cl::opt ClInvalidPointerPairs("asan-detect-invalid-pointer-pair", static cl::opt ClRealignStack("asan-realign-stack", cl::desc("Realign stack to the value of this flag (power of two)"), cl::Hidden, cl::init(32)); -static cl::opt ClBlacklistFile("asan-blacklist", - cl::desc("File containing the list of objects to ignore " - "during instrumentation"), cl::Hidden); static cl::opt ClInstrumentationWithCallsThreshold( "asan-instrumentation-with-call-threshold", cl::desc("If the function being instrumented contains more than " @@ -215,28 +212,86 @@ STATISTIC(NumOptimizedAccessesToGlobalVar, "Number of optimized accesses to global vars"); namespace { -/// A set of dynamically initialized globals extracted from metadata. -class SetOfDynamicallyInitializedGlobals { +/// Frontend-provided metadata for global variables. +class GlobalsMetadata { public: - void Init(Module& M) { - // Clang generates metadata identifying all dynamically initialized globals. - NamedMDNode *DynamicGlobals = - M.getNamedMetadata("llvm.asan.dynamically_initialized_globals"); - if (!DynamicGlobals) + struct Entry { + Entry() + : SourceLoc(nullptr), Name(nullptr), IsDynInit(false), + IsBlacklisted(false) {} + GlobalVariable *SourceLoc; + GlobalVariable *Name; + bool IsDynInit; + bool IsBlacklisted; + }; + + GlobalsMetadata() : inited_(false) {} + + void init(Module& M) { + assert(!inited_); + inited_ = true; + NamedMDNode *Globals = M.getNamedMetadata("llvm.asan.globals"); + if (!Globals) return; - for (const auto MDN : DynamicGlobals->operands()) { - assert(MDN->getNumOperands() == 1); - Value *VG = MDN->getOperand(0); - // The optimizer may optimize away a global entirely, in which case we - // cannot instrument access to it. 
- if (!VG) + for (auto MDN : Globals->operands()) { + // Metadata node contains the global and the fields of "Entry". + assert(MDN->getNumOperands() == 5); + Value *V = MDN->getOperand(0); + // The optimizer may optimize away a global entirely. + if (!V) continue; - DynInitGlobals.insert(cast(VG)); + GlobalVariable *GV = cast(V); + // We can already have an entry for GV if it was merged with another + // global. + Entry &E = Entries[GV]; + if (Value *Loc = MDN->getOperand(1)) { + GlobalVariable *GVLoc = cast(Loc); + E.SourceLoc = GVLoc; + addSourceLocationGlobal(GVLoc); + } + if (Value *Name = MDN->getOperand(2)) { + GlobalVariable *GVName = cast(Name); + E.Name = GVName; + InstrumentationGlobals.insert(GVName); + } + ConstantInt *IsDynInit = cast(MDN->getOperand(3)); + E.IsDynInit |= IsDynInit->isOne(); + ConstantInt *IsBlacklisted = cast(MDN->getOperand(4)); + E.IsBlacklisted |= IsBlacklisted->isOne(); } } - bool Contains(GlobalVariable *G) { return DynInitGlobals.count(G) != 0; } + + /// Returns metadata entry for a given global. + Entry get(GlobalVariable *G) const { + auto Pos = Entries.find(G); + return (Pos != Entries.end()) ? Pos->second : Entry(); + } + + /// Check if the global was generated by the instrumentation + /// (we don't want to instrument it again in this case). + bool isInstrumentationGlobal(GlobalVariable *G) const { + return InstrumentationGlobals.count(G); + } + private: - SmallSet DynInitGlobals; + bool inited_; + DenseMap Entries; + // Globals generated by the frontend instrumentation. + DenseSet InstrumentationGlobals; + + void addSourceLocationGlobal(GlobalVariable *SourceLocGV) { + // Source location global is a struct with layout: + // { + // filename, + // i32 line_number, + // i32 column_number, + // } + InstrumentationGlobals.insert(SourceLocGV); + ConstantStruct *Contents = + cast(SourceLocGV->getInitializer()); + GlobalVariable *FilenameGV = cast(Contents->getOperand(0)); + InstrumentationGlobals.insert(FilenameGV); + } }; /// This struct defines the shadow mapping using the rule: @@ -351,16 +406,14 @@ struct AddressSanitizer : public FunctionPass { *AsanMemoryAccessCallbackSized[2]; Function *AsanMemmove, *AsanMemcpy, *AsanMemset; InlineAsm *EmptyAsm; - SetOfDynamicallyInitializedGlobals DynamicallyInitializedGlobals; + GlobalsMetadata GlobalsMD; friend struct FunctionStackPoisoner; }; class AddressSanitizerModule : public ModulePass { public: - AddressSanitizerModule(StringRef BlacklistFile = StringRef()) - : ModulePass(ID), BlacklistFile(BlacklistFile.empty() ? ClBlacklistFile - : BlacklistFile) {} + AddressSanitizerModule() : ModulePass(ID) {} bool runOnModule(Module &M) override; static char ID; // Pass identification, replacement for typeid const char *getPassName() const override { @@ -378,10 +431,7 @@ class AddressSanitizerModule : public ModulePass { return RedzoneSizeForScale(Mapping.Scale); } - SmallString<64> BlacklistFile; - - std::unique_ptr BL; - SetOfDynamicallyInitializedGlobals DynamicallyInitializedGlobals; + GlobalsMetadata GlobalsMD; Type *IntptrTy; LLVMContext *C; const DataLayout *DL; @@ -541,8 +591,8 @@ char AddressSanitizerModule::ID = 0; INITIALIZE_PASS(AddressSanitizerModule, "asan-module", "AddressSanitizer: detects use-after-free and out-of-bounds bugs." 
"ModulePass", false, false) -ModulePass *llvm::createAddressSanitizerModulePass(StringRef BlacklistFile) { - return new AddressSanitizerModule(BlacklistFile); +ModulePass *llvm::createAddressSanitizerModulePass() { + return new AddressSanitizerModule(); } static size_t TypeSizeToSizeIndex(uint32_t TypeSize) { @@ -605,6 +655,9 @@ void AddressSanitizer::instrumentMemIntrinsic(MemIntrinsic *MI) { // and set IsWrite/Alignment. Otherwise return NULL. static Value *isInterestingMemoryAccess(Instruction *I, bool *IsWrite, unsigned *Alignment) { + // Skip memory accesses inserted by another instrumentation. + if (I->getMetadata("nosanitize")) + return nullptr; if (LoadInst *LI = dyn_cast(I)) { if (!ClInstrumentReads) return nullptr; *IsWrite = false; @@ -659,7 +712,7 @@ bool AddressSanitizer::GlobalIsLinkerInitialized(GlobalVariable *G) { // If a global variable does not have dynamic initialization we don't // have to instrument it. However, if a global does not have initializer // at all, we assume it has dynamic initializer (in other TU). - return G->hasInitializer() && !DynamicallyInitializedGlobals.Contains(G); + return G->hasInitializer() && !GlobalsMD.get(G).IsDynInit; } void @@ -866,16 +919,20 @@ bool AddressSanitizerModule::ShouldInstrumentGlobal(GlobalVariable *G) { Type *Ty = cast(G->getType())->getElementType(); DEBUG(dbgs() << "GLOBAL: " << *G << "\n"); - if (BL->isIn(*G)) return false; + if (GlobalsMD.get(G).IsBlacklisted) return false; + if (GlobalsMD.isInstrumentationGlobal(G)) return false; if (!Ty->isSized()) return false; if (!G->hasInitializer()) return false; if (GlobalWasGeneratedByAsan(G)) return false; // Our own global. // Touch only those globals that will not be defined in other modules. - // Don't handle ODR type linkages since other modules may be built w/o asan. + // Don't handle ODR linkage types and COMDATs since other modules may be built + // without ASan. if (G->getLinkage() != GlobalVariable::ExternalLinkage && G->getLinkage() != GlobalVariable::PrivateLinkage && G->getLinkage() != GlobalVariable::InternalLinkage) return false; + if (G->hasComdat()) + return false; // Two problems with thread-locals: // - The address of the main thread's copy can't be computed at link-time. // - Need to poison all copies, not just the main thread's one. @@ -967,7 +1024,7 @@ void AddressSanitizerModule::initializeCallbacks(Module &M) { // trailing redzones. It also creates a function that poisons // redzones and inserts this function into llvm.global_ctors. bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M) { - DynamicallyInitializedGlobals.Init(M); + GlobalsMD.init(M); SmallVector GlobalsToChange; @@ -986,10 +1043,11 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M) { // const char *name; // const char *module_name; // size_t has_dynamic_init; + // void *source_location; // We initialize an array of such structures and pass it to a run-time call. 
- StructType *GlobalStructTy = StructType::get(IntptrTy, IntptrTy, - IntptrTy, IntptrTy, - IntptrTy, IntptrTy, NULL); + StructType *GlobalStructTy = + StructType::get(IntptrTy, IntptrTy, IntptrTy, IntptrTy, IntptrTy, + IntptrTy, IntptrTy, NULL); SmallVector Initializers(n); bool HasDynamicallyInitializedGlobals = false; @@ -1002,6 +1060,14 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M) { for (size_t i = 0; i < n; i++) { static const uint64_t kMaxGlobalRedzone = 1 << 18; GlobalVariable *G = GlobalsToChange[i]; + + auto MD = GlobalsMD.get(G); + // Create string holding the global name unless it was provided by + // the metadata. + GlobalVariable *Name = + MD.Name ? MD.Name : createPrivateGlobalForString(M, G->getName(), + /*AllowMerging*/ true); + PointerType *PtrTy = cast(G->getType()); Type *Ty = PtrTy->getElementType(); uint64_t SizeInBytes = DL->getTypeAllocSize(Ty); @@ -1017,18 +1083,12 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M) { RightRedzoneSize += MinRZ - (SizeInBytes % MinRZ); assert(((RightRedzoneSize + SizeInBytes) % MinRZ) == 0); Type *RightRedZoneTy = ArrayType::get(IRB.getInt8Ty(), RightRedzoneSize); - // Determine whether this global should be poisoned in initialization. - bool GlobalHasDynamicInitializer = - DynamicallyInitializedGlobals.Contains(G); StructType *NewTy = StructType::get(Ty, RightRedZoneTy, NULL); Constant *NewInitializer = ConstantStruct::get( NewTy, G->getInitializer(), Constant::getNullValue(RightRedZoneTy), NULL); - GlobalVariable *Name = - createPrivateGlobalForString(M, G->getName(), /*AllowMerging*/true); - // Create a new global variable with enough space for a redzone. GlobalValue::LinkageTypes Linkage = G->getLinkage(); if (G->isConstant() && Linkage == GlobalValue::PrivateLinkage) @@ -1049,17 +1109,17 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M) { G->eraseFromParent(); Initializers[i] = ConstantStruct::get( - GlobalStructTy, - ConstantExpr::getPointerCast(NewGlobal, IntptrTy), + GlobalStructTy, ConstantExpr::getPointerCast(NewGlobal, IntptrTy), ConstantInt::get(IntptrTy, SizeInBytes), ConstantInt::get(IntptrTy, SizeInBytes + RightRedzoneSize), ConstantExpr::getPointerCast(Name, IntptrTy), ConstantExpr::getPointerCast(ModuleName, IntptrTy), - ConstantInt::get(IntptrTy, GlobalHasDynamicInitializer), + ConstantInt::get(IntptrTy, MD.IsDynInit), + MD.SourceLoc ? ConstantExpr::getPointerCast(MD.SourceLoc, IntptrTy) + : ConstantInt::get(IntptrTy, 0), NULL); - // Populate the first and last globals declared in this TU. 
- if (ClInitializers && GlobalHasDynamicInitializer) + if (ClInitializers && MD.IsDynInit) HasDynamicallyInitializedGlobals = true; DEBUG(dbgs() << "NEW GLOBAL: " << *NewGlobal << "\n"); @@ -1098,7 +1158,6 @@ bool AddressSanitizerModule::runOnModule(Module &M) { if (!DLP) return false; DL = &DLP->getDataLayout(); - BL.reset(SpecialCaseList::createOrDie(BlacklistFile)); C = &(M.getContext()); int LongSize = DL->getPointerSizeInBits(); IntptrTy = Type::getIntNTy(*C, LongSize); @@ -1118,7 +1177,8 @@ bool AddressSanitizerModule::runOnModule(Module &M) { Changed = true; } - if (ClGlobals && !BL->isIn(M)) Changed |= InstrumentGlobals(IRB, M); + if (ClGlobals) + Changed |= InstrumentGlobals(IRB, M); return Changed; } @@ -1186,7 +1246,7 @@ bool AddressSanitizer::doInitialization(Module &M) { report_fatal_error("data layout missing"); DL = &DLP->getDataLayout(); - DynamicallyInitializedGlobals.Init(M); + GlobalsMD.init(M); C = &(M.getContext()); LongSize = DL->getPointerSizeInBits(); diff --git a/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp index 7f468f79e22d..35057cdd47e9 100644 --- a/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp +++ b/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp @@ -50,6 +50,7 @@ #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InlineAsm.h" #include "llvm/IR/InstVisitor.h" @@ -59,10 +60,13 @@ #include "llvm/IR/Value.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/SpecialCaseList.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" -#include "llvm/Transforms/Utils/SpecialCaseList.h" +#include #include +#include +#include using namespace llvm; @@ -120,6 +124,51 @@ static cl::opt ClDebugNonzeroLabels( namespace { +StringRef GetGlobalTypeString(const GlobalValue &G) { + // Types of GlobalVariables are always pointer types. + Type *GType = G.getType()->getElementType(); + // For now we support blacklisting struct types only. + if (StructType *SGType = dyn_cast(GType)) { + if (!SGType->isLiteral()) + return SGType->getName(); + } + return ""; +} + +class DFSanABIList { + std::unique_ptr SCL; + + public: + DFSanABIList(SpecialCaseList *SCL) : SCL(SCL) {} + + /// Returns whether either this function or its source file are listed in the + /// given category. + bool isIn(const Function &F, const StringRef Category) const { + return isIn(*F.getParent(), Category) || + SCL->inSection("fun", F.getName(), Category); + } + + /// Returns whether this global alias is listed in the given category. + /// + /// If GA aliases a function, the alias's name is matched as a function name + /// would be. Similarly, aliases of globals are matched like globals. + bool isIn(const GlobalAlias &GA, const StringRef Category) const { + if (isIn(*GA.getParent(), Category)) + return true; + + if (isa(GA.getType()->getElementType())) + return SCL->inSection("fun", GA.getName(), Category); + + return SCL->inSection("global", GA.getName(), Category) || + SCL->inSection("type", GetGlobalTypeString(GA), Category); + } + + /// Returns whether this module is listed in the given category. 
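+  /// For reference, a sketch of the SpecialCaseList entries these helpers
+  /// match (the names are illustrative; the categories are the ones queried
+  /// elsewhere in this pass):
+  ///   fun:getenv=functional
+  ///   global:g_ignored=uninstrumented
+  ///   type:struct.Foo=uninstrumented
+  ///   src:untrusted.c=skip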
+ bool isIn(const Module &M, const StringRef Category) const { + return SCL->inSection("src", M.getModuleIdentifier(), Category); + } +}; + class DataFlowSanitizer : public ModulePass { friend struct DFSanFunction; friend class DFSanVisitor; @@ -190,12 +239,11 @@ class DataFlowSanitizer : public ModulePass { Constant *DFSanSetLabelFn; Constant *DFSanNonzeroLabelFn; MDNode *ColdCallWeights; - std::unique_ptr ABIList; + DFSanABIList ABIList; DenseMap UnwrappedFnMap; AttributeSet ReadOnlyNoneAttrs; Value *getShadowAddress(Value *Addr, Instruction *Pos); - Value *combineShadows(Value *V1, Value *V2, Instruction *Pos); bool isInstrumented(const Function *F); bool isInstrumented(const GlobalAlias *GA); FunctionType *getArgsFunctionType(FunctionType *T); @@ -221,6 +269,7 @@ class DataFlowSanitizer : public ModulePass { struct DFSanFunction { DataFlowSanitizer &DFS; Function *F; + DominatorTree DT; DataFlowSanitizer::InstrumentedABI IA; bool IsNativeABI; Value *ArgTLSPtr; @@ -232,15 +281,26 @@ struct DFSanFunction { DenseSet SkipInsts; DenseSet NonZeroChecks; + struct CachedCombinedShadow { + BasicBlock *Block; + Value *Shadow; + }; + DenseMap, CachedCombinedShadow> + CachedCombinedShadows; + DenseMap> ShadowElements; + DFSanFunction(DataFlowSanitizer &DFS, Function *F, bool IsNativeABI) : DFS(DFS), F(F), IA(DFS.getInstrumentedABI()), IsNativeABI(IsNativeABI), ArgTLSPtr(nullptr), RetvalTLSPtr(nullptr), - LabelReturnAlloca(nullptr) {} + LabelReturnAlloca(nullptr) { + DT.recalculate(*F); + } Value *getArgTLSPtr(); Value *getArgTLS(unsigned Index, Instruction *Pos); Value *getRetvalTLS(); Value *getShadow(Value *V); void setShadow(Instruction *I, Value *Shadow); + Value *combineShadows(Value *V1, Value *V2, Instruction *Pos); Value *combineOperandShadows(Instruction *Inst); Value *loadShadow(Value *ShadowAddr, uint64_t Size, uint64_t Align, Instruction *Pos); @@ -395,11 +455,11 @@ bool DataFlowSanitizer::doInitialization(Module &M) { } bool DataFlowSanitizer::isInstrumented(const Function *F) { - return !ABIList->isIn(*F, "uninstrumented"); + return !ABIList.isIn(*F, "uninstrumented"); } bool DataFlowSanitizer::isInstrumented(const GlobalAlias *GA) { - return !ABIList->isIn(*GA, "uninstrumented"); + return !ABIList.isIn(*GA, "uninstrumented"); } DataFlowSanitizer::InstrumentedABI DataFlowSanitizer::getInstrumentedABI() { @@ -407,11 +467,11 @@ DataFlowSanitizer::InstrumentedABI DataFlowSanitizer::getInstrumentedABI() { } DataFlowSanitizer::WrapperKind DataFlowSanitizer::getWrapperKind(Function *F) { - if (ABIList->isIn(*F, "functional")) + if (ABIList.isIn(*F, "functional")) return WK_Functional; - if (ABIList->isIn(*F, "discard")) + if (ABIList.isIn(*F, "discard")) return WK_Discard; - if (ABIList->isIn(*F, "custom")) + if (ABIList.isIn(*F, "custom")) return WK_Custom; return WK_Warning; @@ -500,7 +560,7 @@ bool DataFlowSanitizer::runOnModule(Module &M) { if (!DL) return false; - if (ABIList->isIn(M, "skip")) + if (ABIList.isIn(M, "skip")) return false; if (!GetArgTLSPtr) { @@ -557,7 +617,7 @@ bool DataFlowSanitizer::runOnModule(Module &M) { ++i; // Don't stop on weak. We assume people aren't playing games with the // instrumentedness of overridden weak aliases. - if (Function *F = dyn_cast(GA->getAliasee())) { + if (auto F = dyn_cast(GA->getBaseObject())) { bool GAInst = isInstrumented(GA), FInst = isInstrumented(F); if (GAInst && FInst) { addGlobalNamePrefix(GA); @@ -567,7 +627,7 @@ bool DataFlowSanitizer::runOnModule(Module &M) { // below will take care of instrumenting it. 
Function *NewF = buildWrapperFunction(F, "", GA->getLinkage(), F->getFunctionType()); - GA->replaceAllUsesWith(NewF); + GA->replaceAllUsesWith(ConstantExpr::getBitCast(NewF, GA->getType())); NewF->takeName(GA); GA->eraseFromParent(); FnsToInstrument.push_back(NewF); @@ -828,29 +888,71 @@ Value *DataFlowSanitizer::getShadowAddress(Value *Addr, Instruction *Pos) { // Generates IR to compute the union of the two given shadows, inserting it // before Pos. Returns the computed union Value. -Value *DataFlowSanitizer::combineShadows(Value *V1, Value *V2, - Instruction *Pos) { - if (V1 == ZeroShadow) +Value *DFSanFunction::combineShadows(Value *V1, Value *V2, Instruction *Pos) { + if (V1 == DFS.ZeroShadow) return V2; - if (V2 == ZeroShadow) + if (V2 == DFS.ZeroShadow) return V1; if (V1 == V2) return V1; + + auto V1Elems = ShadowElements.find(V1); + auto V2Elems = ShadowElements.find(V2); + if (V1Elems != ShadowElements.end() && V2Elems != ShadowElements.end()) { + if (std::includes(V1Elems->second.begin(), V1Elems->second.end(), + V2Elems->second.begin(), V2Elems->second.end())) { + return V1; + } else if (std::includes(V2Elems->second.begin(), V2Elems->second.end(), + V1Elems->second.begin(), V1Elems->second.end())) { + return V2; + } + } else if (V1Elems != ShadowElements.end()) { + if (V1Elems->second.count(V2)) + return V1; + } else if (V2Elems != ShadowElements.end()) { + if (V2Elems->second.count(V1)) + return V2; + } + + auto Key = std::make_pair(V1, V2); + if (V1 > V2) + std::swap(Key.first, Key.second); + CachedCombinedShadow &CCS = CachedCombinedShadows[Key]; + if (CCS.Block && DT.dominates(CCS.Block, Pos->getParent())) + return CCS.Shadow; + IRBuilder<> IRB(Pos); BasicBlock *Head = Pos->getParent(); Value *Ne = IRB.CreateICmpNE(V1, V2); BranchInst *BI = cast(SplitBlockAndInsertIfThen( - Ne, Pos, /*Unreachable=*/false, ColdCallWeights)); + Ne, Pos, /*Unreachable=*/false, DFS.ColdCallWeights, &DT)); IRBuilder<> ThenIRB(BI); - CallInst *Call = ThenIRB.CreateCall2(DFSanUnionFn, V1, V2); + CallInst *Call = ThenIRB.CreateCall2(DFS.DFSanUnionFn, V1, V2); Call->addAttribute(AttributeSet::ReturnIndex, Attribute::ZExt); Call->addAttribute(1, Attribute::ZExt); Call->addAttribute(2, Attribute::ZExt); BasicBlock *Tail = BI->getSuccessor(0); - PHINode *Phi = PHINode::Create(ShadowTy, 2, "", Tail->begin()); + PHINode *Phi = PHINode::Create(DFS.ShadowTy, 2, "", Tail->begin()); Phi->addIncoming(Call, Call->getParent()); Phi->addIncoming(V1, Head); + + CCS.Block = Tail; + CCS.Shadow = Phi; + + std::set UnionElems; + if (V1Elems != ShadowElements.end()) { + UnionElems = V1Elems->second; + } else { + UnionElems.insert(V1); + } + if (V2Elems != ShadowElements.end()) { + UnionElems.insert(V2Elems->second.begin(), V2Elems->second.end()); + } else { + UnionElems.insert(V2); + } + ShadowElements[Phi] = std::move(UnionElems); + return Phi; } @@ -863,7 +965,7 @@ Value *DFSanFunction::combineOperandShadows(Instruction *Inst) { Value *Shadow = getShadow(Inst->getOperand(0)); for (unsigned i = 1, n = Inst->getNumOperands(); i != n; ++i) { - Shadow = DFS.combineShadows(Shadow, getShadow(Inst->getOperand(i)), Inst); + Shadow = combineShadows(Shadow, getShadow(Inst->getOperand(i)), Inst); } return Shadow; } @@ -916,9 +1018,8 @@ Value *DFSanFunction::loadShadow(Value *Addr, uint64_t Size, uint64_t Align, IRBuilder<> IRB(Pos); Value *ShadowAddr1 = IRB.CreateGEP(ShadowAddr, ConstantInt::get(DFS.IntptrTy, 1)); - return DFS.combineShadows(IRB.CreateAlignedLoad(ShadowAddr, ShadowAlign), - IRB.CreateAlignedLoad(ShadowAddr1, 
ShadowAlign), - Pos); + return combineShadows(IRB.CreateAlignedLoad(ShadowAddr, ShadowAlign), + IRB.CreateAlignedLoad(ShadowAddr1, ShadowAlign), Pos); } } if (Size % (64 / DFS.ShadowWidth) == 0) { @@ -945,16 +1046,27 @@ Value *DFSanFunction::loadShadow(Value *Addr, uint64_t Size, uint64_t Align, BasicBlock *Head = Pos->getParent(); BasicBlock *Tail = Head->splitBasicBlock(Pos); + + if (DomTreeNode *OldNode = DT.getNode(Head)) { + std::vector Children(OldNode->begin(), OldNode->end()); + + DomTreeNode *NewNode = DT.addNewBlock(Tail, Head); + for (auto Child : Children) + DT.changeImmediateDominator(Child, NewNode); + } + // In the following code LastBr will refer to the previous basic block's // conditional branch instruction, whose true successor is fixed up to point // to the next block during the loop below or to the tail after the final // iteration. BranchInst *LastBr = BranchInst::Create(FallbackBB, FallbackBB, ShadowsEq); ReplaceInstWithInst(Head->getTerminator(), LastBr); + DT.addNewBlock(FallbackBB, Head); for (uint64_t Ofs = 64 / DFS.ShadowWidth; Ofs != Size; Ofs += 64 / DFS.ShadowWidth) { BasicBlock *NextBB = BasicBlock::Create(*DFS.Ctx, "", F); + DT.addNewBlock(NextBB, LastBr->getParent()); IRBuilder<> NextIRB(NextBB); WideAddr = NextIRB.CreateGEP(WideAddr, ConstantInt::get(DFS.IntptrTy, 1)); Value *NextWideShadow = NextIRB.CreateAlignedLoad(WideAddr, ShadowAlign); @@ -992,7 +1104,7 @@ void DFSanVisitor::visitLoadInst(LoadInst &LI) { Value *Shadow = DFSF.loadShadow(LI.getPointerOperand(), Size, Align, &LI); if (ClCombinePointerLabelsOnLoad) { Value *PtrShadow = DFSF.getShadow(LI.getPointerOperand()); - Shadow = DFSF.DFS.combineShadows(Shadow, PtrShadow, &LI); + Shadow = DFSF.combineShadows(Shadow, PtrShadow, &LI); } if (Shadow != DFSF.DFS.ZeroShadow) DFSF.NonZeroChecks.insert(Shadow); @@ -1066,7 +1178,7 @@ void DFSanVisitor::visitStoreInst(StoreInst &SI) { Value* Shadow = DFSF.getShadow(SI.getValueOperand()); if (ClCombinePointerLabelsOnStore) { Value *PtrShadow = DFSF.getShadow(SI.getPointerOperand()); - Shadow = DFSF.DFS.combineShadows(Shadow, PtrShadow, &SI); + Shadow = DFSF.combineShadows(Shadow, PtrShadow, &SI); } DFSF.storeShadow(SI.getPointerOperand(), Size, Align, Shadow, &SI); } @@ -1131,9 +1243,9 @@ void DFSanVisitor::visitSelectInst(SelectInst &I) { if (isa(I.getCondition()->getType())) { DFSF.setShadow( - &I, DFSF.DFS.combineShadows( - CondShadow, - DFSF.DFS.combineShadows(TrueShadow, FalseShadow, &I), &I)); + &I, + DFSF.combineShadows( + CondShadow, DFSF.combineShadows(TrueShadow, FalseShadow, &I), &I)); } else { Value *ShadowSel; if (TrueShadow == FalseShadow) { @@ -1142,7 +1254,7 @@ void DFSanVisitor::visitSelectInst(SelectInst &I) { ShadowSel = SelectInst::Create(I.getCondition(), TrueShadow, FalseShadow, "", &I); } - DFSF.setShadow(&I, DFSF.DFS.combineShadows(CondShadow, ShadowSel, &I)); + DFSF.setShadow(&I, DFSF.combineShadows(CondShadow, ShadowSel, &I)); } } diff --git a/lib/Transforms/Instrumentation/DebugIR.cpp b/lib/Transforms/Instrumentation/DebugIR.cpp index 1bfef574ddb9..f2f1738808be 100644 --- a/lib/Transforms/Instrumentation/DebugIR.cpp +++ b/lib/Transforms/Instrumentation/DebugIR.cpp @@ -352,10 +352,12 @@ class DIUpdater : public InstVisitor { } std::string getTypeName(Type *T) { - assert(T != nullptr && "Expecting non-null Type"); std::string TypeName; raw_string_ostream TypeStream(TypeName); - T->print(TypeStream); + if (T) + T->print(TypeStream); + else + TypeStream << "Printing Type"; TypeStream.flush(); return TypeName; } diff --git 
a/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 75c56c2d4300..57e308c20dba 100644 --- a/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -511,7 +511,7 @@ struct MemorySanitizerVisitor : public InstVisitor { // The following flags disable parts of MSan instrumentation based on // blacklist contents and command-line options. bool InsertChecks; - bool LoadShadow; + bool PropagateShadow; bool PoisonStack; bool PoisonUndef; bool CheckReturnValue; @@ -532,7 +532,7 @@ struct MemorySanitizerVisitor : public InstVisitor { bool SanitizeFunction = F.getAttributes().hasAttribute( AttributeSet::FunctionIndex, Attribute::SanitizeMemory); InsertChecks = SanitizeFunction; - LoadShadow = SanitizeFunction; + PropagateShadow = SanitizeFunction; PoisonStack = SanitizeFunction && ClPoisonStack; PoisonUndef = SanitizeFunction && ClPoisonUndef; // FIXME: Consider using SpecialCaseList to specify a list of functions that @@ -569,7 +569,7 @@ struct MemorySanitizerVisitor : public InstVisitor { ConvertedShadow, IRB.getIntNTy(8 * (1 << SizeIndex))); IRB.CreateCall3(Fn, ConvertedShadow2, IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy()), - updateOrigin(Origin, IRB)); + Origin); } else { Value *Cmp = IRB.CreateICmpNE( ConvertedShadow, getCleanShadow(ConvertedShadow), "_mscmp"); @@ -721,8 +721,7 @@ struct MemorySanitizerVisitor : public InstVisitor { size_t NumValues = PN->getNumIncomingValues(); for (size_t v = 0; v < NumValues; v++) { PNS->addIncoming(getShadow(PN, v), PN->getIncomingBlock(v)); - if (PNO) - PNO->addIncoming(getOrigin(PN, v), PN->getIncomingBlock(v)); + if (PNO) PNO->addIncoming(getOrigin(PN, v), PN->getIncomingBlock(v)); } } @@ -856,7 +855,7 @@ struct MemorySanitizerVisitor : public InstVisitor { /// \brief Set SV to be the shadow value for V. void setShadow(Value *V, Value *SV) { assert(!ShadowMap.count(V) && "Values may only have one shadow"); - ShadowMap[V] = SV; + ShadowMap[V] = PropagateShadow ? SV : getCleanShadow(V); } /// \brief Set Origin to be the origin value for V. @@ -908,6 +907,7 @@ struct MemorySanitizerVisitor : public InstVisitor { /// This function either returns the value set earlier with setShadow, /// or extracts if from ParamTLS (for function arguments). Value *getShadow(Value *V) { + if (!PropagateShadow) return getCleanShadow(V); if (Instruction *I = dyn_cast(V)) { // For instructions the shadow is already stored in the map. Value *Shadow = ShadowMap[V]; @@ -1075,7 +1075,7 @@ struct MemorySanitizerVisitor : public InstVisitor { IRBuilder<> IRB(I.getNextNode()); Type *ShadowTy = getShadowTy(&I); Value *Addr = I.getPointerOperand(); - if (LoadShadow) { + if (PropagateShadow) { Value *ShadowPtr = getShadowPtr(Addr, ShadowTy, IRB); setShadow(&I, IRB.CreateAlignedLoad(ShadowPtr, I.getAlignment(), "_msld")); @@ -1090,7 +1090,7 @@ struct MemorySanitizerVisitor : public InstVisitor { I.setOrdering(addAcquireOrdering(I.getOrdering())); if (MS.TrackOrigins) { - if (LoadShadow) { + if (PropagateShadow) { unsigned Alignment = std::max(kMinOriginAlignment, I.getAlignment()); setOrigin(&I, IRB.CreateAlignedLoad(getOriginPtr(Addr, IRB), Alignment)); @@ -1757,7 +1757,7 @@ struct MemorySanitizerVisitor : public InstVisitor { Value *Addr = I.getArgOperand(0); Type *ShadowTy = getShadowTy(&I); - if (LoadShadow) { + if (PropagateShadow) { Value *ShadowPtr = getShadowPtr(Addr, ShadowTy, IRB); // We don't know the pointer alignment (could be unaligned SSE load!). 
 // Have to assume the worst case.
@@ -1770,7 +1770,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
       insertShadowCheck(Addr, &I);

     if (MS.TrackOrigins) {
-      if (LoadShadow)
+      if (PropagateShadow)
         setOrigin(&I, IRB.CreateLoad(getOriginPtr(Addr, IRB)));
       else
         setOrigin(&I, getCleanOrigin());
@@ -2272,12 +2272,6 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
       return;
     }

-    // Allow only tail calls with the same types, otherwise
-    // we may have a false positive: shadow for a non-void RetVal
-    // will get propagated to a void RetVal.
-    if (Call->isTailCall() && Call->getType() != Call->getParent()->getType())
-      Call->setTailCall(false);
-
     assert(!isa<IntrinsicInst>(&I) && "intrinsics are handled elsewhere");

     // We are going to insert code that relies on the fact that the callee
@@ -2319,6 +2313,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
       Value *ArgShadowBase = getShadowPtrForArgument(A, IRB, ArgOffset);
       DEBUG(dbgs() << "  Arg#" << i << ": " << *A <<
             " Shadow: " << *ArgShadow << "\n");
+      bool ArgIsInitialized = false;
       if (CS.paramHasAttr(i + 1, Attribute::ByVal)) {
         assert(A->getType()->isPointerTy() &&
                "ByVal argument is not a pointer!");
@@ -2331,8 +2326,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
         Size = MS.DL->getTypeAllocSize(A->getType());
         Store = IRB.CreateAlignedStore(ArgShadow, ArgShadowBase,
                                        kShadowTLSAlignment);
+        Constant *Cst = dyn_cast<Constant>(ArgShadow);
+        if (Cst && Cst->isNullValue()) ArgIsInitialized = true;
       }
-      if (MS.TrackOrigins)
+      if (MS.TrackOrigins && !ArgIsInitialized)
         IRB.CreateStore(getOrigin(A),
                         getOriginPtrForArgument(A, IRB, ArgOffset));
       (void)Store;
@@ -2400,6 +2397,11 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {

   void visitPHINode(PHINode &I) {
     IRBuilder<> IRB(&I);
+    if (!PropagateShadow) {
+      setShadow(&I, getCleanShadow(&I));
+      return;
+    }
+
     ShadowPHINodes.push_back(&I);
     setShadow(&I, IRB.CreatePHI(getShadowTy(&I), I.getNumIncomingValues(),
                                 "_msphi_s"));
diff --git a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
index f3bc36f04cf0..89386a6a86de 100644
--- a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
+++ b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
@@ -40,7 +40,6 @@
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/ModuleUtils.h"
-#include "llvm/Transforms/Utils/SpecialCaseList.h"

 using namespace llvm;
diff --git a/lib/Transforms/Scalar/CMakeLists.txt b/lib/Transforms/Scalar/CMakeLists.txt
index 2dcfa237ca33..8e8a9de1ff08 100644
--- a/lib/Transforms/Scalar/CMakeLists.txt
+++ b/lib/Transforms/Scalar/CMakeLists.txt
@@ -22,6 +22,8 @@ add_llvm_library(LLVMScalarOpts
   LoopUnswitch.cpp
   LowerAtomic.cpp
   MemCpyOptimizer.cpp
+  MergedLoadStoreMotion.cpp
+  NullCheckElimination.cpp
   PartiallyInlineLibCalls.cpp
   Reassociate.cpp
   Reg2Mem.cpp
diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp
index 56781d44aaa0..106eba099ca0 100644
--- a/lib/Transforms/Scalar/GVN.cpp
+++ b/lib/Transforms/Scalar/GVN.cpp
@@ -1798,6 +1798,10 @@ static void patchReplacementInstruction(Instruction *I, Value *Repl) {
     case LLVMContext::MD_fpmath:
       ReplInst->setMetadata(Kind, MDNode::getMostGenericFPMath(IMD, ReplMD));
       break;
+    case LLVMContext::MD_invariant_load:
+      // Only set the !invariant.load if it is present in both instructions.
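+      // For instance (an IR sketch in this release's syntax): only when both
+      // loads carry the tag is it kept on the replacement.
+      //   %a = load i32* %p, align 4, !invariant.load !0
+      //   %b = load i32* %p, align 4, !invariant.load !0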
+      ReplInst->setMetadata(Kind, IMD);
+      break;
     }
   }
 }
diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp
index e501ff29d038..21f80385cf46 100644
--- a/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/lib/Transforms/Scalar/JumpThreading.cpp
@@ -158,7 +158,13 @@ bool JumpThreading::runOnFunction(Function &F) {
   TLI = &getAnalysis<TargetLibraryInfo>();
   LVI = &getAnalysis<LazyValueInfo>();

-  // Remove unreachable blocks from function as they may result in infinite loop.
+  // Remove unreachable blocks from the function, as they may result in an
+  // infinite loop. We only thread when we find something profitable, but jump
+  // threading a branch can create other opportunities. If these opportunities
+  // form a cycle, i.e. if any jump threading is undoing previous threading in
+  // the path, then we will loop forever. We take care of this issue by not
+  // jump threading across back edges. This works for normal cases but not for
+  // unreachable blocks, as they may have a cycle with no back edge.
   removeUnreachableBlocks(F);

   FindLoopHeaders(F);
@@ -663,14 +669,9 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) {
       if (LoopHeaders.erase(SinglePred))
         LoopHeaders.insert(BB);

-      // Remember if SinglePred was the entry block of the function.  If so, we
-      // will need to move BB back to the entry position.
-      bool isEntry = SinglePred == &SinglePred->getParent()->getEntryBlock();
       LVI->eraseBlock(SinglePred);
       MergeBasicBlockIntoOnlyPred(BB);

-      if (isEntry && BB != &BB->getParent()->getEntryBlock())
-        BB->moveBefore(&BB->getParent()->getEntryBlock());
       return true;
     }
   }
diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp
index 0a8d16f49e03..abcceb20050a 100644
--- a/lib/Transforms/Scalar/LICM.cpp
+++ b/lib/Transforms/Scalar/LICM.cpp
@@ -192,6 +192,14 @@ namespace {
                                SmallVectorImpl<BasicBlock*> &ExitBlocks,
                                SmallVectorImpl<Instruction*> &InsertPts,
                                PredIteratorCache &PIC);
+
+    /// \brief Create a copy of the instruction in the exit block and patch up
+    /// SSA.
+    /// PN is a user of I in ExitBlock that can be used to get the number and
+    /// list of predecessors fast.
+    Instruction *CloneInstructionInExitBlock(Instruction &I,
+                                             BasicBlock &ExitBlock,
+                                             PHINode &PN);
   };
 }
@@ -531,6 +539,35 @@ bool LICM::isNotUsedInLoop(Instruction &I) {
   return true;
 }

+Instruction *LICM::CloneInstructionInExitBlock(Instruction &I,
+                                               BasicBlock &ExitBlock,
+                                               PHINode &PN) {
+  Instruction *New = I.clone();
+  ExitBlock.getInstList().insert(ExitBlock.getFirstInsertionPt(), New);
+  if (!I.getName().empty()) New->setName(I.getName() + ".le");
+
+  // Build LCSSA PHI nodes for any in-loop operands. Note that this is
+  // particularly cheap because we can rip off the PHI node that we're
+  // replacing for the number and blocks of the predecessors.
+  // OPT: If this shows up in a profile, we can instead finish sinking all
+  // invariant instructions, and then walk their operands to re-establish
+  // LCSSA. That will eliminate creating PHI nodes just to nuke them when
+  // sinking bottom-up.
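+  // For instance (a sketch): sinking %inv = add i32 %x, %c, where %x is
+  // defined inside the loop, produces in the exit block:
+  //   %x.lcssa = phi i32 [ %x, %body ]
+  //   %inv.le  = add i32 %x.lcssa, %c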
+ for (User::op_iterator OI = New->op_begin(), OE = New->op_end(); OI != OE; + ++OI) + if (Instruction *OInst = dyn_cast(*OI)) + if (Loop *OLoop = LI->getLoopFor(OInst->getParent())) + if (!OLoop->contains(&PN)) { + PHINode *OpPN = + PHINode::Create(OInst->getType(), PN.getNumIncomingValues(), + OInst->getName() + ".lcssa", ExitBlock.begin()); + for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) + OpPN->addIncoming(OInst, PN.getIncomingBlock(i)); + *OI = OpPN; + } + return New; +} + /// sink - When an instruction is found to only be used outside of the loop, /// this function moves it to the exit blocks and patches up SSA form as needed. /// This method is guaranteed to remove the original instruction from its @@ -550,6 +587,9 @@ void LICM::sink(Instruction &I) { SmallPtrSet ExitBlockSet(ExitBlocks.begin(), ExitBlocks.end()); #endif + // Clones of this instruction. Don't create more than one per exit block! + SmallDenseMap SunkCopies; + // If this instruction is only used outside of the loop, then all users are // PHI nodes in exit blocks due to LCSSA form. Just RAUW them with clones of // the instruction. @@ -561,30 +601,13 @@ void LICM::sink(Instruction &I) { assert(ExitBlockSet.count(ExitBlock) && "The LCSSA PHI is not in an exit block!"); - Instruction *New = I.clone(); - ExitBlock->getInstList().insert(ExitBlock->getFirstInsertionPt(), New); - if (!I.getName().empty()) - New->setName(I.getName() + ".le"); - - // Build LCSSA PHI nodes for any in-loop operands. Note that this is - // particularly cheap because we can rip off the PHI node that we're - // replacing for the number and blocks of the predecessors. - // OPT: If this shows up in a profile, we can instead finish sinking all - // invariant instructions, and then walk their operands to re-establish - // LCSSA. That will eliminate creating PHI nodes just to nuke them when - // sinking bottom-up. - for (User::op_iterator OI = New->op_begin(), OE = New->op_end(); OI != OE; - ++OI) - if (Instruction *OInst = dyn_cast(*OI)) - if (Loop *OLoop = LI->getLoopFor(OInst->getParent())) - if (!OLoop->contains(PN)) { - PHINode *OpPN = PHINode::Create( - OInst->getType(), PN->getNumIncomingValues(), - OInst->getName() + ".lcssa", ExitBlock->begin()); - for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) - OpPN->addIncoming(OInst, PN->getIncomingBlock(i)); - *OI = OpPN; - } + Instruction *New; + auto It = SunkCopies.find(ExitBlock); + if (It != SunkCopies.end()) + New = It->second; + else + New = SunkCopies[ExitBlock] = + CloneInstructionInExitBlock(I, *ExitBlock, *PN); PN->replaceAllUsesWith(New); PN->eraseFromParent(); @@ -616,7 +639,7 @@ void LICM::hoist(Instruction &I) { /// bool LICM::isSafeToExecuteUnconditionally(Instruction &Inst) { // If it is not a trapping instruction, it is always safe to hoist. - if (isSafeToSpeculativelyExecute(&Inst)) + if (isSafeToSpeculativelyExecute(&Inst, DL)) return true; return isGuaranteedToExecute(Inst); diff --git a/lib/Transforms/Scalar/LoopRerollPass.cpp b/lib/Transforms/Scalar/LoopRerollPass.cpp index 8b5e036dbec2..b6fbb16166dd 100644 --- a/lib/Transforms/Scalar/LoopRerollPass.cpp +++ b/lib/Transforms/Scalar/LoopRerollPass.cpp @@ -924,8 +924,10 @@ bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header, // them, and this matching fails. As an exception, we allow the alias // set tracker to handle regular (simple) load/store dependencies. 
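    // (Passing DL below is assumed to let the speculation check also reason
    // about, e.g., whether a speculated load stays within an object of known
    // size; this is a sketch of the intent, not an exhaustive description.)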
if (FutureSideEffects && - ((!isSimpleLoadStore(J1) && !isSafeToSpeculativelyExecute(J1)) || - (!isSimpleLoadStore(J2) && !isSafeToSpeculativelyExecute(J2)))) { + ((!isSimpleLoadStore(J1) && + !isSafeToSpeculativelyExecute(J1, DL)) || + (!isSimpleLoadStore(J2) && + !isSafeToSpeculativelyExecute(J2, DL)))) { DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 << " vs. " << *J2 << " (side effects prevent reordering)\n"); diff --git a/lib/Transforms/Scalar/LoopUnrollPass.cpp b/lib/Transforms/Scalar/LoopUnrollPass.cpp index 0af5a71c126d..935f289f040f 100644 --- a/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -220,7 +220,7 @@ static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls, } // Returns the value associated with the given metadata node name (for -// example, "llvm.loopunroll.count"). If no such named metadata node +// example, "llvm.loop.unroll.count"). If no such named metadata node // exists, then nullptr is returned. static const ConstantInt *GetUnrollMetadataValue(const Loop *L, StringRef Name) { @@ -250,24 +250,22 @@ static const ConstantInt *GetUnrollMetadataValue(const Loop *L, // Returns true if the loop has an unroll(enable) pragma. static bool HasUnrollEnablePragma(const Loop *L) { const ConstantInt *EnableValue = - GetUnrollMetadataValue(L, "llvm.loopunroll.enable"); + GetUnrollMetadataValue(L, "llvm.loop.unroll.enable"); return (EnableValue && EnableValue->getZExtValue()); - return false; } // Returns true if the loop has an unroll(disable) pragma. static bool HasUnrollDisablePragma(const Loop *L) { const ConstantInt *EnableValue = - GetUnrollMetadataValue(L, "llvm.loopunroll.enable"); + GetUnrollMetadataValue(L, "llvm.loop.unroll.enable"); return (EnableValue && !EnableValue->getZExtValue()); - return false; } // If loop has an unroll_count pragma return the (necessarily // positive) value from the pragma. Otherwise return 0. static unsigned UnrollCountPragmaValue(const Loop *L) { const ConstantInt *CountValue = - GetUnrollMetadataValue(L, "llvm.loopunroll.count"); + GetUnrollMetadataValue(L, "llvm.loop.unroll.count"); if (CountValue) { unsigned Count = CountValue->getZExtValue(); assert(Count >= 1 && "Unroll count must be positive."); @@ -276,6 +274,43 @@ static unsigned UnrollCountPragmaValue(const Loop *L) { return 0; } +// Remove existing unroll metadata and add unroll disable metadata to +// indicate the loop has already been unrolled. This prevents a loop +// from being unrolled more than is directed by a pragma if the loop +// unrolling pass is run more than once (which it generally is). +static void SetLoopAlreadyUnrolled(Loop *L) { + MDNode *LoopID = L->getLoopID(); + if (!LoopID) return; + + // First remove any existing loop unrolling metadata. + SmallVector Vals; + // Reserve first location for self reference to the LoopID metadata node. + Vals.push_back(nullptr); + for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { + bool IsUnrollMetadata = false; + MDNode *MD = dyn_cast(LoopID->getOperand(i)); + if (MD) { + const MDString *S = dyn_cast(MD->getOperand(0)); + IsUnrollMetadata = S && S->getString().startswith("llvm.loop.unroll."); + } + if (!IsUnrollMetadata) Vals.push_back(LoopID->getOperand(i)); + } + + // Add unroll(disable) metadata to disable future unrolling. 
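+  // The loop metadata then looks roughly like this (an IR sketch; the exact
+  // metadata syntax varies between releases):
+  //   !0 = metadata !{metadata !0, metadata !1}
+  //   !1 = metadata !{metadata !"llvm.loop.unroll.enable", i1 false}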
+  LLVMContext &Context = L->getHeader()->getContext();
+  SmallVector<Value *, 2> DisableOperands;
+  DisableOperands.push_back(MDString::get(Context, "llvm.loop.unroll.enable"));
+  DisableOperands.push_back(ConstantInt::get(Type::getInt1Ty(Context), 0));
+  MDNode *DisableNode = MDNode::get(Context, DisableOperands);
+  Vals.push_back(DisableNode);
+
+  MDNode *NewLoopID = MDNode::get(Context, Vals);
+  // Set operand 0 to refer to the loop id itself.
+  NewLoopID->replaceOperandWith(0, NewLoopID);
+  L->setLoopID(NewLoopID);
+  LoopID->replaceAllUsesWith(NewLoopID);
+}
+
 unsigned LoopUnroll::selectUnrollCount(
     const Loop *L, unsigned TripCount, bool HasEnablePragma,
     unsigned PragmaCount, const TargetTransformInfo::UnrollingPreferences &UP,
@@ -430,6 +465,10 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {
   }

   if (HasPragma) {
+    // Mark the loop as unrolled to prevent unrolling beyond what was
+    // requested by the pragma.
+    SetLoopAlreadyUnrolled(L);
+
     // Emit optimization remarks if we are unable to unroll the loop
     // as directed by a pragma.
     DebugLoc LoopLoc = L->getStartLoc();
diff --git a/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index b6bc79228824..7c184a4ad2c3 100644
--- a/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -684,6 +684,12 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy,
     }
   }

+  // Check that src isn't captured by the called function since the
+  // transformation can cause aliasing issues in that case.
+  for (unsigned i = 0, e = CS.arg_size(); i != e; ++i)
+    if (CS.getArgument(i) == cpySrc && !CS.doesNotCapture(i))
+      return false;
+
   // Since we're changing the parameter to the callsite, we need to make sure
   // that what would be the new parameter dominates the callsite.
   DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
diff --git a/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp b/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
new file mode 100644
index 000000000000..a7e80240d9e7
--- /dev/null
+++ b/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
@@ -0,0 +1,632 @@
+//===- MergedLoadStoreMotion.cpp - merge and hoist/sink load/stores -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//! \file
+//! \brief This pass performs merges of loads and stores on both sides of a
+//  diamond (hammock). It hoists the loads and sinks the stores.
+//
+// The algorithm iteratively hoists two loads to the same address out of a
+// diamond (hammock) and merges them into a single load in the header.
+// Similarly, it sinks and merges two stores to the tail block (footer). The
+// algorithm iterates over the instructions of one side of the diamond and
+// attempts to find a matching load/store on the other side. It hoists / sinks
+// when it thinks it is safe to do so. This optimization helps with, e.g.,
+// hiding load latencies, triggering if-conversion, and reducing static code
+// size.
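+//
+// (Terminology sketch: a "hammock" is assumed here to mean a single-entry,
+// single-exit conditional region; the symmetric, two-sided case drawn below
+// is the diamond.)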
+// +//===----------------------------------------------------------------------===// +// +// +// Example: +// Diamond shaped code before merge: +// +// header: +// br %cond, label %if.then, label %if.else +// / \ +// / \ +// / \ +// if.then: if.else: +// %lt = load %addr_l %le = load %addr_l +// +// <...> <...> +// store %st, %addr_s store %se, %addr_s +// br label %if.end br label %if.end +// \ / +// \ / +// \ / +// if.end ("footer"): +// <...> +// +// Diamond shaped code after merge: +// +// header: +// %l = load %addr_l +// br %cond, label %if.then, label %if.else +// / \ +// / \ +// / \ +// if.then: if.else: +// +// <...> <...> +// br label %if.end br label %if.end +// \ / +// \ / +// \ / +// if.end ("footer"): +// %s.sink = phi [%st, if.then], [%se, if.else] +// <...> +// store %s.sink, %addr_s +// <...> +// +// +//===----------------------- TODO -----------------------------------------===// +// +// 1) Generalize to regions other than diamonds +// 2) Be more aggressive merging memory operations +// Note that both changes require register pressure control +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Scalar.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/CFG.h" +#include "llvm/Analysis/Loads.h" +#include "llvm/Analysis/MemoryBuiltins.h" +#include "llvm/Analysis/MemoryDependenceAnalysis.h" +#include "llvm/IR/Metadata.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/Support/Allocator.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/SSAUpdater.h" +#include +using namespace llvm; + +#define DEBUG_TYPE "mldst-motion" + +//===----------------------------------------------------------------------===// +// MergedLoadStoreMotion Pass +//===----------------------------------------------------------------------===// +static cl::opt +EnableMLSM("mlsm", cl::desc("Enable motion of merged load and store"), + cl::init(true)); + +namespace { +class MergedLoadStoreMotion : public FunctionPass { + AliasAnalysis *AA; + MemoryDependenceAnalysis *MD; + +public: + static char ID; // Pass identification, replacement for typeid + explicit MergedLoadStoreMotion(void) + : FunctionPass(ID), MD(nullptr), MagicCompileTimeControl(250) { + initializeMergedLoadStoreMotionPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override; + +private: + // This transformation requires dominator postdominator info + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addPreserved(); + } + + // Helper routines + + /// + /// \brief Remove instruction from parent and update memory dependence + /// analysis. 
+ /// + void removeInstruction(Instruction *Inst); + BasicBlock *getDiamondTail(BasicBlock *BB); + bool isDiamondHead(BasicBlock *BB); + // Routines for hoisting loads + bool isLoadHoistBarrier(Instruction *Inst); + LoadInst *canHoistFromBlock(BasicBlock *BB, LoadInst *LI); + void hoistInstruction(BasicBlock *BB, Instruction *HoistCand, + Instruction *ElseInst); + bool isSafeToHoist(Instruction *I) const; + bool hoistLoad(BasicBlock *BB, LoadInst *HoistCand, LoadInst *ElseInst); + bool mergeLoads(BasicBlock *BB); + // Routines for sinking stores + StoreInst *canSinkFromBlock(BasicBlock *BB, StoreInst *SI); + PHINode *getPHIOperand(BasicBlock *BB, StoreInst *S0, StoreInst *S1); + bool isStoreSinkBarrier(Instruction *Inst); + bool sinkStore(BasicBlock *BB, StoreInst *SinkCand, StoreInst *ElseInst); + bool mergeStores(BasicBlock *BB); + // The mergeLoad/Store algorithms could have Size0 * Size1 complexity, + // where Size0 and Size1 are the #instructions on the two sides of + // the diamond. The constant chosen here is arbitrary. Compiler Time + // Control is enforced by the check Size0 * Size1 < MagicCompileTimeControl. + const int MagicCompileTimeControl; +}; + +char MergedLoadStoreMotion::ID = 0; +} + +/// +/// \brief createMergedLoadStoreMotionPass - The public interface to this file. +/// +FunctionPass *llvm::createMergedLoadStoreMotionPass() { + return new MergedLoadStoreMotion(); +} + +INITIALIZE_PASS_BEGIN(MergedLoadStoreMotion, "mldst-motion", + "MergedLoadStoreMotion", false, false) +INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) +INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_END(MergedLoadStoreMotion, "mldst-motion", + "MergedLoadStoreMotion", false, false) + +/// +/// \brief Remove instruction from parent and update memory dependence analysis. +/// +void MergedLoadStoreMotion::removeInstruction(Instruction *Inst) { + // Notify the memory dependence analysis. + if (MD) { + MD->removeInstruction(Inst); + if (LoadInst *LI = dyn_cast(Inst)) + MD->invalidateCachedPointerInfo(LI->getPointerOperand()); + if (Inst->getType()->getScalarType()->isPointerTy()) { + MD->invalidateCachedPointerInfo(Inst); + } + } + Inst->eraseFromParent(); +} + +/// +/// \brief Return tail block of a diamond. +/// +BasicBlock *MergedLoadStoreMotion::getDiamondTail(BasicBlock *BB) { + assert(isDiamondHead(BB) && "Basic block is not head of a diamond"); + BranchInst *BI = (BranchInst *)(BB->getTerminator()); + BasicBlock *Succ0 = BI->getSuccessor(0); + BasicBlock *Tail = Succ0->getTerminator()->getSuccessor(0); + return Tail; +} + +/// +/// \brief True when BB is the head of a diamond (hammock) +/// +bool MergedLoadStoreMotion::isDiamondHead(BasicBlock *BB) { + if (!BB) + return false; + if (!isa(BB->getTerminator())) + return false; + if (BB->getTerminator()->getNumSuccessors() != 2) + return false; + + BranchInst *BI = (BranchInst *)(BB->getTerminator()); + BasicBlock *Succ0 = BI->getSuccessor(0); + BasicBlock *Succ1 = BI->getSuccessor(1); + + if (!Succ0->getSinglePredecessor() || + Succ0->getTerminator()->getNumSuccessors() != 1) + return false; + if (!Succ1->getSinglePredecessor() || + Succ1->getTerminator()->getNumSuccessors() != 1) + return false; + + BasicBlock *Tail = Succ0->getTerminator()->getSuccessor(0); + // Ignore triangles. 
+  if (Succ1->getTerminator()->getSuccessor(0) != Tail)
+    return false;
+  return true;
+}
+
+///
+/// \brief True when instruction is a hoist barrier for a load
+///
+/// Whenever an instruction could possibly modify the value
+/// being loaded or prevent the load from happening,
+/// it is considered a hoist barrier.
+///
+bool MergedLoadStoreMotion::isLoadHoistBarrier(Instruction *Inst) {
+  // FIXME: A call with no side effects should not be a barrier.
+  // Aren't all such calls covered by mayHaveSideEffects() below?
+  // Then this check can be removed.
+  if (isa<CallInst>(Inst))
+    return true;
+  if (isa<TerminatorInst>(Inst))
+    return true;
+  // FIXME: Conservatively let a store instruction block the load.
+  // Use alias analysis instead.
+  if (isa<StoreInst>(Inst))
+    return true;
+  // Note: mayHaveSideEffects covers all instructions that could
+  // trigger a change to state. E.g. in-flight stores have to be executed
+  // before ordered loads or fences, calls could invoke functions that store
+  // data to memory etc.
+  if (Inst->mayHaveSideEffects()) {
+    return true;
+  }
+  DEBUG(dbgs() << "No Hoist Barrier\n");
+  return false;
+}
+
+///
+/// \brief Decide if a load can be hoisted
+///
+/// When there is a load in \p BB to the same address as \p LI
+/// and it can be hoisted from \p BB, return that load.
+/// Otherwise return Null.
+///
+LoadInst *MergedLoadStoreMotion::canHoistFromBlock(BasicBlock *BB,
+                                                   LoadInst *LI) {
+  LoadInst *I = nullptr;
+  assert(isa<LoadInst>(LI));
+  if (LI->isUsedOutsideOfBlock(LI->getParent()))
+    return nullptr;
+
+  for (BasicBlock::iterator BBI = BB->begin(), BBE = BB->end(); BBI != BBE;
+       ++BBI) {
+    Instruction *Inst = BBI;
+
+    // Only merge and hoist loads whose result is used only in BB.
+    if (isLoadHoistBarrier(Inst))
+      break;
+    if (!isa<LoadInst>(Inst))
+      continue;
+    if (Inst->isUsedOutsideOfBlock(Inst->getParent()))
+      continue;
+
+    AliasAnalysis::Location LocLI = AA->getLocation(LI);
+    AliasAnalysis::Location LocInst = AA->getLocation((LoadInst *)Inst);
+    if (AA->isMustAlias(LocLI, LocInst) && LI->getType() == Inst->getType()) {
+      I = (LoadInst *)Inst;
+      break;
+    }
+  }
+  return I;
+}
+
+///
+/// \brief Merge two equivalent instructions \p HoistCand and \p ElseInst into
+/// \p BB
+///
+/// BB is the head of a diamond
+///
+void MergedLoadStoreMotion::hoistInstruction(BasicBlock *BB,
+                                             Instruction *HoistCand,
+                                             Instruction *ElseInst) {
+  DEBUG(dbgs() << " Hoist Instruction into BB \n"; BB->dump();
+        dbgs() << "Instruction Left\n"; HoistCand->dump(); dbgs() << "\n";
+        dbgs() << "Instruction Right\n"; ElseInst->dump(); dbgs() << "\n");
+  // Hoist the instruction.
+  assert(HoistCand->getParent() != BB);
+
+  // Intersect optional metadata.
+  HoistCand->intersectOptionalDataWith(ElseInst);
+  HoistCand->dropUnknownMetadata();
+
+  // Prepend point for instruction insert
+  Instruction *HoistPt = BB->getTerminator();
+
+  // Merged instruction
+  Instruction *HoistedInst = HoistCand->clone();
+
+  // Notify AA of the new value.
+  if (isa<LoadInst>(HoistCand))
+    AA->copyValue(HoistCand, HoistedInst);
+
+  // Hoist instruction.
+  HoistedInst->insertBefore(HoistPt);
+
+  HoistCand->replaceAllUsesWith(HoistedInst);
+  removeInstruction(HoistCand);
+  // Replace the else block instruction.
+  ElseInst->replaceAllUsesWith(HoistedInst);
+  removeInstruction(ElseInst);
+}
+
+///
+/// \brief Return true if no operand of \p I is defined in I's parent block
+///
+bool MergedLoadStoreMotion::isSafeToHoist(Instruction *I) const {
+  BasicBlock *Parent = I->getParent();
+  for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
+    Instruction *Instr = dyn_cast<Instruction>(I->getOperand(i));
+    if (Instr && Instr->getParent() == Parent)
+      return false;
+  }
+  return true;
+}
+
+///
+/// \brief Merge two equivalent loads and GEPs and hoist into diamond head
+///
+bool MergedLoadStoreMotion::hoistLoad(BasicBlock *BB, LoadInst *L0,
+                                      LoadInst *L1) {
+  // Only one definition?
+  Instruction *A0 = dyn_cast<Instruction>(L0->getPointerOperand());
+  Instruction *A1 = dyn_cast<Instruction>(L1->getPointerOperand());
+  if (A0 && A1 && A0->isIdenticalTo(A1) && isSafeToHoist(A0) &&
+      A0->hasOneUse() && (A0->getParent() == L0->getParent()) &&
+      A1->hasOneUse() && (A1->getParent() == L1->getParent()) &&
+      isa<GetElementPtrInst>(A0)) {
+    DEBUG(dbgs() << "Hoist Instruction into BB \n"; BB->dump();
+          dbgs() << "Instruction Left\n"; L0->dump(); dbgs() << "\n";
+          dbgs() << "Instruction Right\n"; L1->dump(); dbgs() << "\n");
+    hoistInstruction(BB, A0, A1);
+    hoistInstruction(BB, L0, L1);
+    return true;
+  } else
+    return false;
+}
+
+///
+/// \brief Try to hoist two loads to same address into diamond header
+///
+/// Starting from a diamond head block, iterate over the instructions in one
+/// successor block and try to match a load in the second successor.
+///
+bool MergedLoadStoreMotion::mergeLoads(BasicBlock *BB) {
+  bool MergedLoads = false;
+  assert(isDiamondHead(BB));
+  BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator());
+  BasicBlock *Succ0 = BI->getSuccessor(0);
+  BasicBlock *Succ1 = BI->getSuccessor(1);
+  // #Instructions in Succ1 for compile-time control
+  int Size1 = Succ1->size();
+  int NLoads = 0;
+  for (BasicBlock::iterator BBI = Succ0->begin(), BBE = Succ0->end();
+       BBI != BBE;) {
+
+    Instruction *I = BBI;
+    ++BBI;
+    if (isLoadHoistBarrier(I))
+      break;
+
+    // Don't move non-simple (atomic, volatile) loads.
+    if (!isa<LoadInst>(I))
+      continue;
+
+    LoadInst *L0 = (LoadInst *)I;
+    if (!L0->isSimple())
+      continue;
+
+    ++NLoads;
+    if (NLoads * Size1 >= MagicCompileTimeControl)
+      break;
+    if (LoadInst *L1 = canHoistFromBlock(Succ1, L0)) {
+      bool Res = hoistLoad(BB, L0, L1);
+      MergedLoads |= Res;
+      // Don't attempt to hoist above loads that had not been hoisted.
+      if (!Res)
+        break;
+    }
+  }
+  return MergedLoads;
+}
+
+///
+/// \brief True when instruction is sink barrier for a store
+///
+bool MergedLoadStoreMotion::isStoreSinkBarrier(Instruction *Inst) {
+  // FIXME: Conservatively let a load instruction block the store.
+  // Use alias analysis instead.
+  if (isa<LoadInst>(Inst))
+    return true;
+  if (isa<CallInst>(Inst))
+    return true;
+  if (isa<TerminatorInst>(Inst) && !isa<BranchInst>(Inst))
+    return true;
+  // Note: mayHaveSideEffects covers all instructions that could
+  // trigger a change to state. E.g. in-flight stores have to be executed
+  // before ordered loads or fences, calls could invoke functions that store
+  // data to memory etc.
+  if (!isa<StoreInst>(Inst) && Inst->mayHaveSideEffects()) {
+    return true;
+  }
+  DEBUG(dbgs() << "No Sink Barrier\n");
+  return false;
+}
+
+///
+/// \brief Check if \p BB contains a store to the same address as \p SI
+///
+/// \return The store in \p BB when it is safe to sink. Otherwise return Null.
+///
+StoreInst *MergedLoadStoreMotion::canSinkFromBlock(BasicBlock *BB,
+                                                   StoreInst *SI) {
+  StoreInst *I = nullptr;
+  DEBUG(dbgs() << "can Sink?
: "; SI->dump(); dbgs() << "\n"); + for (BasicBlock::reverse_iterator RBI = BB->rbegin(), RBE = BB->rend(); + RBI != RBE; ++RBI) { + Instruction *Inst = &*RBI; + + // Only move loads if they are used in the block. + if (isStoreSinkBarrier(Inst)) + break; + if (isa(Inst)) { + AliasAnalysis::Location LocSI = AA->getLocation(SI); + AliasAnalysis::Location LocInst = AA->getLocation((StoreInst *)Inst); + if (AA->isMustAlias(LocSI, LocInst)) { + I = (StoreInst *)Inst; + break; + } + } + } + return I; +} + +/// +/// \brief Create a PHI node in BB for the operands of S0 and S1 +/// +PHINode *MergedLoadStoreMotion::getPHIOperand(BasicBlock *BB, StoreInst *S0, + StoreInst *S1) { + // Create a phi if the values mismatch. + PHINode *NewPN = 0; + Value *Opd1 = S0->getValueOperand(); + Value *Opd2 = S1->getValueOperand(); + if (Opd1 != Opd2) { + NewPN = PHINode::Create(Opd1->getType(), 2, Opd2->getName() + ".sink", + BB->begin()); + NewPN->addIncoming(Opd1, S0->getParent()); + NewPN->addIncoming(Opd2, S1->getParent()); + if (NewPN->getType()->getScalarType()->isPointerTy()) { + // Notify AA of the new value. + AA->copyValue(Opd1, NewPN); + AA->copyValue(Opd2, NewPN); + // AA needs to be informed when a PHI-use of the pointer value is added + for (unsigned I = 0, E = NewPN->getNumIncomingValues(); I != E; ++I) { + unsigned J = PHINode::getOperandNumForIncomingValue(I); + AA->addEscapingUse(NewPN->getOperandUse(J)); + } + if (MD) + MD->invalidateCachedPointerInfo(NewPN); + } + } + return NewPN; +} + +/// +/// \brief Merge two stores to same address and sink into \p BB +/// +/// Also sinks GEP instruction computing the store address +/// +bool MergedLoadStoreMotion::sinkStore(BasicBlock *BB, StoreInst *S0, + StoreInst *S1) { + // Only one definition? + Instruction *A0 = dyn_cast(S0->getPointerOperand()); + Instruction *A1 = dyn_cast(S1->getPointerOperand()); + if (A0 && A1 && A0->isIdenticalTo(A1) && A0->hasOneUse() && + (A0->getParent() == S0->getParent()) && A1->hasOneUse() && + (A1->getParent() == S1->getParent()) && isa(A0)) { + DEBUG(dbgs() << "Sink Instruction into BB \n"; BB->dump(); + dbgs() << "Instruction Left\n"; S0->dump(); dbgs() << "\n"; + dbgs() << "Instruction Right\n"; S1->dump(); dbgs() << "\n"); + // Hoist the instruction. + BasicBlock::iterator InsertPt = BB->getFirstInsertionPt(); + // Intersect optional metadata. + S0->intersectOptionalDataWith(S1); + S0->dropUnknownMetadata(); + + // Create the new store to be inserted at the join point. + StoreInst *SNew = (StoreInst *)(S0->clone()); + Instruction *ANew = A0->clone(); + AA->copyValue(S0, SNew); + SNew->insertBefore(InsertPt); + ANew->insertBefore(SNew); + + assert(S0->getParent() == A0->getParent()); + assert(S1->getParent() == A1->getParent()); + + PHINode *NewPN = getPHIOperand(BB, S0, S1); + // New PHI operand? Use it. + if (NewPN) + SNew->setOperand(0, NewPN); + removeInstruction(S0); + removeInstruction(S1); + A0->replaceAllUsesWith(ANew); + removeInstruction(A0); + A1->replaceAllUsesWith(ANew); + removeInstruction(A1); + return true; + } + return false; +} + +/// +/// \brief True when two stores are equivalent and can sink into the footer +/// +/// Starting from a diamond tail block, iterate over the instructions in one +/// predecessor block and try to match a store in the second predecessor. 
+/// +bool MergedLoadStoreMotion::mergeStores(BasicBlock *T) { + + bool MergedStores = false; + assert(T && "Footer of a diamond cannot be empty"); + + pred_iterator PI = pred_begin(T), E = pred_end(T); + assert(PI != E); + BasicBlock *Pred0 = *PI; + ++PI; + BasicBlock *Pred1 = *PI; + ++PI; + // tail block of a diamond/hammock? + if (Pred0 == Pred1) + return false; // No. + if (PI != E) + return false; // No. More than 2 predecessors. + + // #Instructions in Succ1 for Compile Time Control + int Size1 = Pred1->size(); + int NStores = 0; + + for (BasicBlock::reverse_iterator RBI = Pred0->rbegin(), RBE = Pred0->rend(); + RBI != RBE;) { + + Instruction *I = &*RBI; + ++RBI; + if (isStoreSinkBarrier(I)) + break; + // Sink move non-simple (atomic, volatile) stores + if (!isa(I)) + continue; + StoreInst *S0 = (StoreInst *)I; + if (!S0->isSimple()) + continue; + + ++NStores; + if (NStores * Size1 >= MagicCompileTimeControl) + break; + if (StoreInst *S1 = canSinkFromBlock(Pred1, S0)) { + bool Res = sinkStore(T, S0, S1); + MergedStores |= Res; + // Don't attempt to sink below stores that had to stick around + // But after removal of a store and some of its feeding + // instruction search again from the beginning since the iterator + // is likely stale at this point. + if (!Res) + break; + else { + RBI = Pred0->rbegin(); + RBE = Pred0->rend(); + DEBUG(dbgs() << "Search again\n"; Instruction *I = &*RBI; I->dump()); + } + } + } + return MergedStores; +} +/// +/// \brief Run the transformation for each function +/// +bool MergedLoadStoreMotion::runOnFunction(Function &F) { + MD = &getAnalysis(); + AA = &getAnalysis(); + + bool Changed = false; + if (!EnableMLSM) + return false; + DEBUG(dbgs() << "Instruction Merger\n"); + + // Merge unconditional branches, allowing PRE to catch more + // optimization opportunities. + for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE;) { + BasicBlock *BB = FI++; + + // Hoist equivalent loads and sink stores + // outside diamonds when possible + // Run outside core GVN + if (isDiamondHead(BB)) { + Changed |= mergeLoads(BB); + Changed |= mergeStores(getDiamondTail(BB)); + } + } + return Changed; +} diff --git a/lib/Transforms/Scalar/NullCheckElimination.cpp b/lib/Transforms/Scalar/NullCheckElimination.cpp new file mode 100644 index 000000000000..7295db7adc47 --- /dev/null +++ b/lib/Transforms/Scalar/NullCheckElimination.cpp @@ -0,0 +1,404 @@ +//===-- NullCheckElimination.cpp - Null Check Elimination Pass ------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
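[Editorial sketch, not part of the patch: the store-sinking transformation above is easiest to see at the source level. The function names below are invented for illustration; two stores to the same address in both arms of a diamond become a single store of a selected value in the footer, which is exactly the PHI-plus-cloned-store shape that getPHIOperand and sinkStore build.]

    int sinkExampleBefore(bool c, int *p, int a, int b) {
      if (c)
        *p = a;            // store in the 'then' arm
      else
        *p = b;            // store in the 'else' arm
      return *p;
    }

    int sinkExampleAfter(bool c, int *p, int a, int b) {
      int v = c ? a : b;   // getPHIOperand(): PHI of the two stored values
      *p = v;              // single store sunk into the diamond footer
      return v;
    }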
diff --git a/lib/Transforms/Scalar/NullCheckElimination.cpp b/lib/Transforms/Scalar/NullCheckElimination.cpp
new file mode 100644
index 000000000000..7295db7adc47
--- /dev/null
+++ b/lib/Transforms/Scalar/NullCheckElimination.cpp
@@ -0,0 +1,404 @@
+//===-- NullCheckElimination.cpp - Null Check Elimination Pass ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/Pass.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "null-check-elimination"
+
+namespace {
+  struct NullCheckElimination : public FunctionPass {
+    static char ID;
+    NullCheckElimination() : FunctionPass(ID) {
+      initializeNullCheckEliminationPass(*PassRegistry::getPassRegistry());
+    }
+    bool runOnFunction(Function &F) override;
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.setPreservesCFG();
+    }
+
+  private:
+    static const unsigned kPhiLimit = 16;
+    typedef SmallPtrSet<PHINode *, kPhiLimit> SmallPhiSet;
+
+    enum CmpKind {
+      /// A null check of an unconditionally nonnull-or-poison value.
+      NullCheckDefiniteCmp,
+
+      /// A null check of the phi representing a nontrivial inbounds
+      /// recurrence, which is not known to be unconditionally
+      /// nonnull-or-poison.
+      NullCheckRecurrenceCmp,
+
+      /// A comparison of the phi representing a nontrivial inbounds recurrence
+      /// with an inbounds GEP derived from the base of the recurrence, which
+      /// will typically represent a bound on the recurrence.
+      RecurrencePhiBoundCmp,
+    };
+
+    enum CmpPred {
+      CmpEq,
+      CmpNe,
+    };
+
+    struct CmpDesc {
+      CmpDesc(CmpKind k, CmpPred p, Use *u, Value *v)
+          : kind(k), pred(p), use(u), ptrValue(v) { }
+
+      CmpKind kind;
+      CmpPred pred;
+      Use *use;
+      Value *ptrValue;
+    };
+
+    typedef SmallVector<CmpDesc, 4> CmpDescVec;
+
+    bool isNonNullOrPoisonPhi(SmallPhiSet *VisitedPhis, PHINode*);
+    Value *isNontrivialInBoundsRecurrence(PHINode*);
+
+    bool classifyCmp(CmpDescVec*, Use*);
+    bool findRelevantCmps(CmpDescVec*, Use*);
+
+    bool blockContainsLoadDerivedFrom(BasicBlock*, Value*);
+
+    /// Tracks values that are unconditionally nonnull-or-poison by definition,
+    /// but not values that are known nonnull-or-poison in a given context by
+    /// their uses, e.g. in recurrences.
+    DenseSet<Value *> NonNullOrPoisonValues;
+
+    /// Tracks values that are bases of nontrivial inbounds recurrences.
+    DenseSet<Value *> InBoundsRecurrenceBases;
+
+    /// Maps phis that correspond to nontrivial inbounds recurrences to their
+    /// base values.
+    DenseMap<Value *, Value *> InBoundsRecurrenceBaseMap;
+  };
+}
+
+char NullCheckElimination::ID = 0;
+INITIALIZE_PASS_BEGIN(NullCheckElimination,
+                      "null-check-elimination",
+                      "Null Check Elimination",
+                      false, false)
+INITIALIZE_PASS_END(NullCheckElimination,
+                    "null-check-elimination",
+                    "Null Check Elimination",
+                    false, false)
+
+FunctionPass *llvm::createNullCheckEliminationPass() {
+  return new NullCheckElimination();
+}
+
+static GetElementPtrInst *castToInBoundsGEP(Value *V) {
+  auto *GEP = dyn_cast<GetElementPtrInst>(V);
+  if (!GEP || !GEP->isInBounds())
+    return nullptr;
+  return GEP;
+}
+
+static bool isZeroConstant(Value *V) {
+  auto *C = dyn_cast<Constant>(V);
+  return C && C->isZeroValue();
+}
+
+bool NullCheckElimination::runOnFunction(Function &F) {
+  if (skipOptnoneFunction(F))
+    return false;
+
+  bool Changed = false;
+
+  // Collect arguments with the `nonnull` attribute.
+  for (auto &Arg : F.args()) {
+    if (Arg.hasNonNullAttr())
+      NonNullOrPoisonValues.insert(&Arg);
+  }
+
+  // Collect instructions that definitely produce nonnull-or-poison values. At
+  // the moment, this is restricted to inbounds GEPs, and phis that are derived
+  // entirely from nonnull-or-poison values (including other phis that are
+  // themselves derived from the same).
+  for (auto &BB : F) {
+    for (auto &I : BB) {
+      if (auto *GEP = castToInBoundsGEP(&I)) {
+        NonNullOrPoisonValues.insert(GEP);
+      } else if (auto *PN = dyn_cast<PHINode>(&I)) {
+        SmallPhiSet VisitedPHIs;
+        if (isNonNullOrPoisonPhi(&VisitedPHIs, PN))
+          NonNullOrPoisonValues.insert(PN);
+
+        if (auto *BaseV = isNontrivialInBoundsRecurrence(PN)) {
+          InBoundsRecurrenceBases.insert(BaseV);
+          InBoundsRecurrenceBaseMap[PN] = BaseV;
+        }
+      }
+    }
+  }
+
+  for (auto &BB : F) {
+    // This could also be extended to handle SwitchInst, but using a SwitchInst
+    // for a null check seems unlikely.
+    auto *BI = dyn_cast<BranchInst>(BB.getTerminator());
+    if (!BI || BI->isUnconditional())
+      continue;
+
+    // The first operand of a conditional branch is the condition.
+    CmpDescVec Cmps;
+    if (!findRelevantCmps(&Cmps, &BI->getOperandUse(0)))
+      continue;
+
+    for (auto &Cmp : Cmps) {
+      // We are only tracking comparisons of inbounds recurrence phis with
+      // their bounds so that we can eliminate null checks based on them,
+      // which are of kind NullCheckRecurrenceCmp. We can't use a lone
+      // RecurrencePhiBoundCmp to perform any optimizations.
+      if (Cmp.kind == RecurrencePhiBoundCmp)
+        continue;
+
+      if (Cmp.kind == NullCheckRecurrenceCmp) {
+        // Look for a matching RecurrencePhiBoundCmp. If one exists, then we
+        // can be sure that this branch condition depends on the recurrence.
+        // Since both the bounds and the recurrence successor value are
+        // inbounds, and they are both derived from the base, the base being
+        // null would imply that the bounds and recurrence successor values
+        // are poison.
+        bool FoundMatchingCmp = false;
+        for (auto &OtherCmp : Cmps) {
+          if (OtherCmp.kind == RecurrencePhiBoundCmp &&
+              OtherCmp.ptrValue == Cmp.ptrValue) {
+            FoundMatchingCmp = true;
+            break;
+          }
+        }
+        if (!FoundMatchingCmp)
+          continue;
+      }
+
+      BasicBlock *NonNullBB;
+      if (Cmp.pred == CmpEq) {
+        // If the comparison instruction is checking for equality with null
+        // then the pointer is nonnull on the `false` branch.
+        NonNullBB = BI->getSuccessor(1);
+      } else {
+        // Otherwise, if the comparison instruction is checking for inequality
+        // with null, the pointer is nonnull on the `true` branch.
+        NonNullBB = BI->getSuccessor(0);
+      }
+
+      // This is a crude approximation of control dependence: if the branch
+      // target has a single predecessor edge, then it must be control-
+      // dependent on the branch.
+      if (!NonNullBB->getSinglePredecessor())
+        continue;
+
+      // Due to the semantics of poison values in LLVM, we have to check that
+      // there is actually some externally visible side effect that is
+      // dependent on the poison value. Since poison values are otherwise
+      // treated as undef, and a load of undef is undefined behavior (which is
+      // externally visible), it suffices to look for a load of the
+      // nonnull-or-poison value.
+      //
+      // This could be extended to any block control-dependent on this branch
+      // of the null check; it's unclear if that would actually catch more
+      // cases in real code.
+      if (blockContainsLoadDerivedFrom(NonNullBB, Cmp.ptrValue)) {
+        Type *BoolTy = Type::getInt1Ty(F.getContext());
+        Value *NewV = ConstantInt::get(BoolTy, Cmp.pred == CmpNe);
+        Cmp.use->set(NewV);
+        Changed = true;
+      }
+    }
+  }
+
+  NonNullOrPoisonValues.clear();
+  InBoundsRecurrenceBases.clear();
+  InBoundsRecurrenceBaseMap.clear();
+
+  return Changed;
+}
+
+/// Checks whether a phi is derived from known nonnull-or-poison values,
+/// including other phis that are derived from the same. May return `false`
+/// conservatively in some cases, e.g. if exploring a large cycle of phis.
+///
+/// This function may also insert any inbounds GEPs that it finds into
+/// NonNullOrPoisonValues.
+bool
+NullCheckElimination::isNonNullOrPoisonPhi(SmallPhiSet *VisitedPhis,
+                                           PHINode *PN) {
+  // If we've already seen this phi, return `true`, even though it may not be
+  // nonnull, since some other operand in a cycle of phis may invalidate the
+  // optimistic assumption that the entire cycle is nonnull, including this
+  // phi.
+  if (!VisitedPhis->insert(PN))
+    return true;
+
+  // Use a sensible limit to avoid iterating over long chains of phis that are
+  // unlikely to be nonnull.
+  if (VisitedPhis->size() >= kPhiLimit)
+    return false;
+
+  unsigned numOperands = PN->getNumOperands();
+  for (unsigned i = 0; i < numOperands; ++i) {
+    Value *SrcValue = PN->getOperand(i);
+    if (NonNullOrPoisonValues.count(SrcValue)) {
+      continue;
+    } else if (auto *GEP = castToInBoundsGEP(SrcValue)) {
+      NonNullOrPoisonValues.insert(GEP);
+    } else if (auto *SrcPN = dyn_cast<PHINode>(SrcValue)) {
+      if (!isNonNullOrPoisonPhi(VisitedPhis, SrcPN))
+        return false;
+    } else {
+      return false;
+    }
+  }
+
+  return true;
+}
+
+/// Determines whether a phi corresponds to an inbounds recurrence where the
+/// base is not a known nonnull-or-poison value. Returns the base value, or
+/// null if the phi doesn't correspond to such a recurrence.
+Value *NullCheckElimination::isNontrivialInBoundsRecurrence(PHINode *PN) {
+  if (PN->getNumOperands() != 2)
+    return nullptr;
+
+  Value *BaseV;
+  GetElementPtrInst *SuccessorI;
+  if (auto *GEP = castToInBoundsGEP(PN->getOperand(0))) {
+    BaseV = PN->getOperand(1);
+    SuccessorI = GEP;
+  } else if (auto *GEP = castToInBoundsGEP(PN->getOperand(1))) {
+    BaseV = PN->getOperand(0);
+    SuccessorI = GEP;
+  } else {
+    return nullptr;
+  }
+
+  if (NonNullOrPoisonValues.count(BaseV) || SuccessorI->getOperand(0) != PN)
+    return nullptr;
+
+  return BaseV;
+}
+
+/// Determines whether an ICmpInst is one of the forms that is relevant to
+/// null check elimination, and then adds a CmpDesc to Cmps when applicable.
+/// The ICmpInst is passed as a Use so this Use can be placed into the
+/// CmpDesc, but the Use parameter must be a Use of an ICmpInst.
+bool NullCheckElimination::classifyCmp(CmpDescVec *Cmps, Use *U) {
+  auto *CI = cast<ICmpInst>(U->get());
+  if (!CI->isEquality())
+    return false;
+
+  CmpPred Pred = (CI->getPredicate() == llvm::CmpInst::ICMP_EQ) ? CmpEq : CmpNe;
+  Value *Op0 = CI->getOperand(0);
+  Value *Op1 = CI->getOperand(1);
+
+  if (NonNullOrPoisonValues.count(Op0)) {
+    if (isZeroConstant(Op1)) {
+      Cmps->push_back(CmpDesc(NullCheckDefiniteCmp, Pred, U, Op0));
+      return true;
+    }
+
+    auto it = InBoundsRecurrenceBaseMap.find(Op1);
+    if (it == InBoundsRecurrenceBaseMap.end())
+      return false;
+
+    auto *GEP = castToInBoundsGEP(Op0);
+    if (!GEP)
+      return false;
+
+    auto *BaseV = it->second;
+    if (GEP->getOperand(0) != BaseV)
+      return false;
+
+    Cmps->push_back(CmpDesc(RecurrencePhiBoundCmp, Pred, U, Op1));
+    return true;
+  }
+
+  // Since InstCombine or InstSimplify should have canonicalized a comparison
+  // with `null` to have the `null` in the second operand, we don't need to
+  // handle the case where Op0 is `null` like we did with Op1 above.
+  if (NonNullOrPoisonValues.count(Op1)) {
+    auto it = InBoundsRecurrenceBaseMap.find(Op0);
+    if (it == InBoundsRecurrenceBaseMap.end())
+      return false;
+
+    auto *GEP = castToInBoundsGEP(Op1);
+    if (!GEP)
+      return false;
+
+    auto *BaseV = it->second;
+    if (GEP->getOperand(0) != BaseV)
+      return false;
+
+    Cmps->push_back(CmpDesc(RecurrencePhiBoundCmp, Pred, U, Op0));
+    return true;
+  }
+
+  if (InBoundsRecurrenceBaseMap.count(Op0)) {
+    if (isZeroConstant(Op1)) {
+      Cmps->push_back(CmpDesc(NullCheckRecurrenceCmp, Pred, U, Op0));
+      return true;
+    }
+  }
+
+  return false;
+}
+
+/// Classifies the comparisons that are relevant to null check elimination,
+/// starting from a Use. The CmpDescs of the comparisons are collected in
+/// Cmps.
+bool NullCheckElimination::findRelevantCmps(CmpDescVec *Cmps, Use *U) {
+  auto *I = dyn_cast<Instruction>(U->get());
+  if (!I)
+    return false;
+
+  if (isa<ICmpInst>(I))
+    return classifyCmp(Cmps, U);
+
+  unsigned Opcode = I->getOpcode();
+  if (Opcode == Instruction::Or || Opcode == Instruction::And) {
+    bool FoundCmps = findRelevantCmps(Cmps, &I->getOperandUse(0));
+    FoundCmps |= findRelevantCmps(Cmps, &I->getOperandUse(1));
+    return FoundCmps;
+  }
+
+  return false;
+}
+
+/// Determines whether `BB` contains a load from `PtrV`, or any inbounds GEP
+/// derived from `PtrV`.
+bool
+NullCheckElimination::blockContainsLoadDerivedFrom(BasicBlock *BB,
+                                                   Value *PtrV) {
+  for (auto &I : *BB) {
+    auto *LI = dyn_cast<LoadInst>(&I);
+    if (!LI)
+      continue;
+
+    Value *V = LI->getPointerOperand();
+    while (1) {
+      if (V == PtrV)
+        return true;
+
+      auto *GEP = castToInBoundsGEP(V);
+      if (!GEP)
+        break;
+
+      V = GEP->getOperand(0);
+    }
+  }
+
+  return false;
+}
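[Editorial sketch, not part of the patch: a source-level analogue of the pattern the pass targets, with invented names. With `p` known nonnull and `q` advanced by inbounds GEPs, `q == nullptr` can only hold if `q` is poison, and the load through `q` makes that poison externally observable, so a front-end-emitted null check inside the loop is foldable.]

    int sumUntilBound(int *p /* nonnull */, int n) {
      int s = 0;
      for (int *q = p; q != p + n; ++q) { // nontrivial inbounds recurrence on p;
                                          // q != p + n is the RecurrencePhiBoundCmp
        if (q == nullptr)                 // NullCheckRecurrenceCmp; folded to false
          break;
        s += *q;                          // load makes the poison case observable
      }
      return s;
    }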
diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp
index 6532b7a09b99..8c7f253290ba 100644
--- a/lib/Transforms/Scalar/SROA.cpp
+++ b/lib/Transforms/Scalar/SROA.cpp
@@ -1130,7 +1130,7 @@ static bool isSafePHIToSpeculate(PHINode &PN,
     // If this pointer is always safe to load, or if we can prove that there
     // is already a load in the block, then we can move the load to the pred
     // block.
-    if (InVal->isDereferenceablePointer() ||
+    if (InVal->isDereferenceablePointer(DL) ||
         isSafeToLoadUnconditionally(InVal, TI, MaxAlign, DL))
       continue;
 
@@ -1198,8 +1198,8 @@ static bool isSafeSelectToSpeculate(SelectInst &SI,
                                     const DataLayout *DL = nullptr) {
   Value *TValue = SI.getTrueValue();
   Value *FValue = SI.getFalseValue();
-  bool TDerefable = TValue->isDereferenceablePointer();
-  bool FDerefable = FValue->isDereferenceablePointer();
+  bool TDerefable = TValue->isDereferenceablePointer(DL);
+  bool FDerefable = FValue->isDereferenceablePointer(DL);
 
   for (User *U : SI.users()) {
     LoadInst *LI = dyn_cast<LoadInst>(U);
diff --git a/lib/Transforms/Scalar/SampleProfile.cpp b/lib/Transforms/Scalar/SampleProfile.cpp
index 888e25273839..73c97ffeef4f 100644
--- a/lib/Transforms/Scalar/SampleProfile.cpp
+++ b/lib/Transforms/Scalar/SampleProfile.cpp
@@ -450,13 +450,14 @@ void SampleModuleProfile::dump() {
 ///
 /// \returns true if the file was loaded successfully, false otherwise.
 bool SampleModuleProfile::loadText() {
-  std::unique_ptr<MemoryBuffer> Buffer;
-  std::error_code EC = MemoryBuffer::getFile(Filename, Buffer);
-  if (EC) {
+  ErrorOr<std::unique_ptr<MemoryBuffer>> BufferOrErr =
+      MemoryBuffer::getFile(Filename);
+  if (std::error_code EC = BufferOrErr.getError()) {
     std::string Msg(EC.message());
     M.getContext().diagnose(DiagnosticInfoSampleProfile(Filename.data(), Msg));
     return false;
   }
+  std::unique_ptr<MemoryBuffer> Buffer = std::move(BufferOrErr.get());
   line_iterator LineIt(*Buffer, '#');
 
   // Read the profile of each function. Since each function may be
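[Editorial sketch, not part of the patch: the SampleProfile hunk adapts to MemoryBuffer::getFile returning ErrorOr instead of filling an out-parameter. A minimal, self-contained usage sketch of the new style; `openOrNull` and `Path` are invented for illustration.]

    #include "llvm/Support/ErrorOr.h"
    #include "llvm/Support/MemoryBuffer.h"
    #include <memory>
    using namespace llvm;

    static std::unique_ptr<MemoryBuffer> openOrNull(const char *Path) {
      ErrorOr<std::unique_ptr<MemoryBuffer>> BufOrErr =
          MemoryBuffer::getFile(Path);
      if (std::error_code EC = BufOrErr.getError())
        return nullptr; // a real caller would diagnose EC.message()
      return std::move(BufOrErr.get());
    }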
diff --git a/lib/Transforms/Scalar/Scalar.cpp b/lib/Transforms/Scalar/Scalar.cpp
index edf012d81171..efa3af71bf2e 100644
--- a/lib/Transforms/Scalar/Scalar.cpp
+++ b/lib/Transforms/Scalar/Scalar.cpp
@@ -52,6 +52,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
   initializeLowerAtomicPass(Registry);
   initializeLowerExpectIntrinsicPass(Registry);
   initializeMemCpyOptPass(Registry);
+  initializeMergedLoadStoreMotionPass(Registry);
   initializePartiallyInlineLibCallsPass(Registry);
   initializeReassociatePass(Registry);
   initializeRegToMemPass(Registry);
@@ -66,6 +67,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
   initializeTailCallElimPass(Registry);
   initializeSeparateConstOffsetFromGEPPass(Registry);
   initializeLoadCombinePass(Registry);
+  initializeNullCheckEliminationPass(Registry);
 }
 
 void LLVMInitializeScalarOpts(LLVMPassRegistryRef R) {
@@ -92,6 +94,10 @@ void LLVMAddGVNPass(LLVMPassManagerRef PM) {
   unwrap(PM)->add(createGVNPass());
 }
 
+void LLVMAddMergedLoadStoreMotionPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createMergedLoadStoreMotionPass());
+}
+
 void LLVMAddIndVarSimplifyPass(LLVMPassManagerRef PM) {
   unwrap(PM)->add(createIndVarSimplifyPass());
 }
diff --git a/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/lib/Transforms/Scalar/ScalarReplAggregates.cpp
index 58192fc02be4..e2a24a7fd4a7 100644
--- a/lib/Transforms/Scalar/ScalarReplAggregates.cpp
+++ b/lib/Transforms/Scalar/ScalarReplAggregates.cpp
@@ -1142,8 +1142,8 @@ class AllocaPromoter : public LoadAndStorePromoter {
 /// We can do this to a select if its only uses are loads and if the operand to
 /// the select can be loaded unconditionally.
 static bool isSafeSelectToSpeculate(SelectInst *SI, const DataLayout *DL) {
-  bool TDerefable = SI->getTrueValue()->isDereferenceablePointer();
-  bool FDerefable = SI->getFalseValue()->isDereferenceablePointer();
+  bool TDerefable = SI->getTrueValue()->isDereferenceablePointer(DL);
+  bool FDerefable = SI->getFalseValue()->isDereferenceablePointer(DL);
 
   for (User *U : SI->users()) {
     LoadInst *LI = dyn_cast<LoadInst>(U);
@@ -1226,7 +1226,7 @@ static bool isSafePHIToSpeculate(PHINode *PN, const DataLayout *DL) {
 
     // If this pointer is always safe to load, or if we can prove that there is
    // already a load in the block, then we can move the load to the pred block.
-    if (InVal->isDereferenceablePointer() ||
+    if (InVal->isDereferenceablePointer(DL) ||
         isSafeToLoadUnconditionally(InVal, Pred->getTerminator(), MaxAlign, DL))
       continue;
diff --git a/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index 62f2026b8d9f..6557ce4575dd 100644
--- a/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -272,23 +272,6 @@ class SeparateConstOffsetFromGEP : public FunctionPass {
   ///
   /// Verified in @i32_add in split-gep.ll
   bool canonicalizeArrayIndicesToPointerSize(GetElementPtrInst *GEP);
-  /// For each array index that is in the form of zext(a), convert it to sext(a)
-  /// if we can prove zext(a) <= max signed value of typeof(a). We prefer
-  /// sext(a) to zext(a), because in the special case where x + y >= 0 and
-  /// (x >= 0 or y >= 0), function CanTraceInto can split sext(x + y),
-  /// while no such case exists for zext(x + y).
-  ///
-  /// Note that
-  ///   zext(x + y) = zext(x) + zext(y)
-  /// is wrong, e.g.,
-  ///   zext i32(UINT_MAX + 1) to i64 !=
-  ///   (zext i32 UINT_MAX to i64) + (zext i32 1 to i64)
-  ///
-  /// Returns true if the module changes.
-  ///
-  /// Verified in @inbounds_zext_add in split-gep.ll and @sum_of_array3 in
-  /// split-gep-and-gvn.ll
-  bool convertInBoundsZExtToSExt(GetElementPtrInst *GEP);
 
   const DataLayout *DL;
 };
@@ -613,43 +596,6 @@ bool SeparateConstOffsetFromGEP::canonicalizeArrayIndicesToPointerSize(
   return Changed;
 }
 
-bool
-SeparateConstOffsetFromGEP::convertInBoundsZExtToSExt(GetElementPtrInst *GEP) {
-  if (!GEP->isInBounds())
-    return false;
-
-  // TODO: consider alloca
-  GlobalVariable *UnderlyingObject =
-      dyn_cast<GlobalVariable>(GEP->getPointerOperand());
-  if (UnderlyingObject == nullptr)
-    return false;
-
-  uint64_t ObjectSize =
-      DL->getTypeAllocSize(UnderlyingObject->getType()->getElementType());
-  gep_type_iterator GTI = gep_type_begin(*GEP);
-  bool Changed = false;
-  for (User::op_iterator I = GEP->op_begin() + 1, E = GEP->op_end(); I != E;
-       ++I, ++GTI) {
-    if (isa<SequentialType>(*GTI)) {
-      if (ZExtInst *Extended = dyn_cast<ZExtInst>(*I)) {
-        unsigned SrcBitWidth =
-            cast<IntegerType>(Extended->getSrcTy())->getBitWidth();
-        // For GEP operand zext(a), if a <= max signed value of typeof(a), then
-        // the sign bit of a is zero and sext(a) = zext(a). Because the GEP is
-        // in bounds, we know a <= ObjectSize, so the condition can be reduced
-        // to ObjectSize <= max signed value of typeof(a).
-        if (ObjectSize <=
-            APInt::getSignedMaxValue(SrcBitWidth).getZExtValue()) {
-          *I = new SExtInst(Extended->getOperand(0), Extended->getType(),
-                            Extended->getName(), GEP);
-          Changed = true;
-        }
-      }
-    }
-  }
-  return Changed;
-}
-
 int64_t
 SeparateConstOffsetFromGEP::accumulateByteOffset(GetElementPtrInst *GEP,
                                                  bool &NeedsExtraction) {
@@ -684,9 +630,7 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
   if (GEP->hasAllConstantIndices())
     return false;
 
-  bool Changed = false;
-  Changed |= canonicalizeArrayIndicesToPointerSize(GEP);
-  Changed |= convertInBoundsZExtToSExt(GEP);
+  bool Changed = canonicalizeArrayIndicesToPointerSize(GEP);
 
   bool NeedsExtraction;
   int64_t AccumulativeByteOffset = accumulateByteOffset(GEP, NeedsExtraction);
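[Editorial sketch, not part of the patch: the removed convertInBoundsZExtToSExt doc comment rests on the fact that zext does not distribute over addition. A self-contained demonstration of the UINT_MAX + 1 case the comment cites:]

    #include <cassert>
    #include <cstdint>

    int main() {
      uint32_t x = UINT32_MAX, y = 1;
      uint64_t zextOfSum = (uint64_t)(x + y);         // zext i32 (x+y) -> 0
      uint64_t sumOfZext = (uint64_t)x + (uint64_t)y; // 2^32
      assert(zextOfSum != sumOfZext);                 // zext(x+y) != zext(x)+zext(y)
      return 0;
    }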
diff --git a/lib/Transforms/Scalar/Sink.cpp b/lib/Transforms/Scalar/Sink.cpp
index 482c33aa6e0b..7348c45c5d37 100644
--- a/lib/Transforms/Scalar/Sink.cpp
+++ b/lib/Transforms/Scalar/Sink.cpp
@@ -18,6 +18,7 @@
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/CFG.h"
+#include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/Support/Debug.h"
@@ -34,6 +35,7 @@ namespace {
     DominatorTree *DT;
     LoopInfo *LI;
     AliasAnalysis *AA;
+    const DataLayout *DL;
 
   public:
     static char ID; // Pass identification
@@ -98,6 +100,8 @@ bool Sinking::runOnFunction(Function &F) {
   DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
   LI = &getAnalysis<LoopInfo>();
   AA = &getAnalysis<AliasAnalysis>();
+  DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
+  DL = DLP ? &DLP->getDataLayout() : nullptr;
 
   bool MadeChange, EverMadeChange = false;
 
@@ -193,7 +197,7 @@ bool Sinking::IsAcceptableTarget(Instruction *Inst,
   if (SuccToSinkTo->getUniquePredecessor() != Inst->getParent()) {
     // We cannot sink a load across a critical edge - there may be stores in
     // other code paths.
-    if (!isSafeToSpeculativelyExecute(Inst))
+    if (!isSafeToSpeculativelyExecute(Inst, DL))
       return false;
 
     // We don't want to sink across a critical edge if we don't dominate the
diff --git a/lib/Transforms/Scalar/StructurizeCFG.cpp b/lib/Transforms/Scalar/StructurizeCFG.cpp
index 7b77ae1de1b4..b9673ed655e0 100644
--- a/lib/Transforms/Scalar/StructurizeCFG.cpp
+++ b/lib/Transforms/Scalar/StructurizeCFG.cpp
@@ -260,7 +260,7 @@ INITIALIZE_PASS_BEGIN(StructurizeCFG, "structurizecfg", "Structurize the CFG",
                       false, false)
 INITIALIZE_PASS_DEPENDENCY(LowerSwitch)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(RegionInfo)
+INITIALIZE_PASS_DEPENDENCY(RegionInfoPass)
 INITIALIZE_PASS_END(StructurizeCFG, "structurizecfg", "Structurize the CFG",
                     false, false)
 
@@ -406,11 +406,11 @@ void StructurizeCFG::gatherPredicates(RegionNode *N) {
 
       } else {
         // It's an exit from a sub region
-        while(R->getParent() != ParentRegion)
+        while (R->getParent() != ParentRegion)
           R = R->getParent();
 
         // Edge from inside a subregion to its entry, ignore it
-        if (R == N)
+        if (*R == *N)
           continue;
 
         BasicBlock *Entry = R->getEntry();
diff --git a/lib/Transforms/Utils/BasicBlockUtils.cpp b/lib/Transforms/Utils/BasicBlockUtils.cpp
index 80b7e22bacad..602e8ba55107 100644
--- a/lib/Transforms/Utils/BasicBlockUtils.cpp
+++ b/lib/Transforms/Utils/BasicBlockUtils.cpp
@@ -673,7 +673,8 @@ ReturnInst *llvm::FoldReturnIntoUncondBranch(ReturnInst *RI, BasicBlock *BB,
 TerminatorInst *llvm::SplitBlockAndInsertIfThen(Value *Cond,
                                                 Instruction *SplitBefore,
                                                 bool Unreachable,
-                                                MDNode *BranchWeights) {
+                                                MDNode *BranchWeights,
+                                                DominatorTree *DT) {
   BasicBlock *Head = SplitBefore->getParent();
   BasicBlock *Tail = Head->splitBasicBlock(SplitBefore);
   TerminatorInst *HeadOldTerm = Head->getTerminator();
@@ -690,6 +691,20 @@ TerminatorInst *llvm::SplitBlockAndInsertIfThen(Value *Cond,
   HeadNewTerm->setDebugLoc(SplitBefore->getDebugLoc());
   HeadNewTerm->setMetadata(LLVMContext::MD_prof, BranchWeights);
   ReplaceInstWithInst(HeadOldTerm, HeadNewTerm);
+
+  if (DT) {
+    if (DomTreeNode *OldNode = DT->getNode(Head)) {
+      std::vector<DomTreeNode *> Children(OldNode->begin(), OldNode->end());
+
+      DomTreeNode *NewNode = DT->addNewBlock(Tail, Head);
+      for (auto Child : Children)
+        DT->changeImmediateDominator(Child, NewNode);
+
+      // Head dominates ThenBlock.
+      DT->addNewBlock(ThenBlock, Head);
+    }
+  }
+
   return CheckTerm;
 }
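[Editorial sketch, not part of the patch: how a caller might use the optional DominatorTree parameter just added to SplitBlockAndInsertIfThen. The wrapper name is invented; the call matches the new signature shown above.]

    #include "llvm/IR/Dominators.h"
    #include "llvm/IR/Instruction.h"
    #include "llvm/Transforms/Utils/BasicBlockUtils.h"
    using namespace llvm;

    // Passing the tree makes the utility wire Tail and ThenBlock into the
    // dominator tree instead of leaving it stale for the caller to repair.
    static TerminatorInst *splitKeepingDomTree(Value *Cond,
                                               Instruction *SplitBefore,
                                               DominatorTree &DT) {
      return SplitBlockAndInsertIfThen(Cond, SplitBefore, /*Unreachable=*/false,
                                       /*BranchWeights=*/nullptr, &DT);
    }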
diff --git a/lib/Transforms/Utils/CMakeLists.txt b/lib/Transforms/Utils/CMakeLists.txt
index e10ca90749c5..fcf548f97c5d 100644
--- a/lib/Transforms/Utils/CMakeLists.txt
+++ b/lib/Transforms/Utils/CMakeLists.txt
@@ -33,7 +33,6 @@ add_llvm_library(LLVMTransformUtils
   SimplifyIndVar.cpp
   SimplifyInstructions.cpp
   SimplifyLibCalls.cpp
-  SpecialCaseList.cpp
   UnifyFunctionExitNodes.cpp
   Utils.cpp
   ValueMapper.cpp
diff --git a/lib/Transforms/Utils/InlineFunction.cpp b/lib/Transforms/Utils/InlineFunction.cpp
index d93db5dc1ef9..f0a9f2b1fcb3 100644
--- a/lib/Transforms/Utils/InlineFunction.cpp
+++ b/lib/Transforms/Utils/InlineFunction.cpp
@@ -189,6 +189,7 @@ static void HandleCallsInBlockInlinedThroughInvoke(BasicBlock *BB,
       InvokeInst *II = InvokeInst::Create(CI->getCalledValue(), Split,
                                           Invoke.getOuterResumeDest(),
                                           InvokeArgs, CI->getName(), BB);
+      II->setDebugLoc(CI->getDebugLoc());
       II->setCallingConv(CI->getCallingConv());
       II->setAttributes(CI->getAttributes());
diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp
index aedd787ecf8d..a5e443fcf46b 100644
--- a/lib/Transforms/Utils/Local.cpp
+++ b/lib/Transforms/Utils/Local.cpp
@@ -509,6 +509,11 @@ void llvm::MergeBasicBlockIntoOnlyPred(BasicBlock *DestBB, Pass *P) {
   PredBB->getTerminator()->eraseFromParent();
   DestBB->getInstList().splice(DestBB->begin(), PredBB->getInstList());
 
+  // If the PredBB is the entry block of the function, move DestBB up to
+  // become the entry block after we erase PredBB.
+  if (PredBB == &DestBB->getParent()->getEntryBlock())
+    DestBB->moveAfter(PredBB);
+
   if (P) {
     if (DominatorTreeWrapperPass *DTWP =
             P->getAnalysisIfAvailable<DominatorTreeWrapperPass>()) {
diff --git a/lib/Transforms/Utils/LoopSimplify.cpp b/lib/Transforms/Utils/LoopSimplify.cpp
index f7787dafd5bf..ef422914b6b2 100644
--- a/lib/Transforms/Utils/LoopSimplify.cpp
+++ b/lib/Transforms/Utils/LoopSimplify.cpp
@@ -50,6 +50,7 @@
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/IR/CFG.h"
 #include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Instructions.h"
@@ -473,7 +474,8 @@ static BasicBlock *insertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader,
 /// explicit if they accepted the analysis directly and then updated it.
 static bool simplifyOneLoop(Loop *L, SmallVectorImpl<Loop *> &Worklist,
                             AliasAnalysis *AA, DominatorTree *DT, LoopInfo *LI,
-                            ScalarEvolution *SE, Pass *PP) {
+                            ScalarEvolution *SE, Pass *PP,
+                            const DataLayout *DL) {
   bool Changed = false;
 ReprocessLoop:
 
@@ -672,7 +674,7 @@ static bool simplifyOneLoop(Loop *L, SmallVectorImpl<Loop *> &Worklist,
       // The block has now been cleared of all instructions except for
       // a comparison and a conditional branch. SimplifyCFG may be able
       // to fold it now.
-      if (!FoldBranchToCommonDest(BI)) continue;
+      if (!FoldBranchToCommonDest(BI, DL)) continue;
 
       // Success. The block is now dead, so remove it from the loop,
       // update the dominator tree and delete it.
@@ -709,7 +711,8 @@ static bool simplifyOneLoop(Loop *L, SmallVectorImpl<Loop *> &Worklist,
 }
 
 bool llvm::simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, Pass *PP,
-                        AliasAnalysis *AA, ScalarEvolution *SE) {
+                        AliasAnalysis *AA, ScalarEvolution *SE,
+                        const DataLayout *DL) {
   bool Changed = false;
 
   // Worklist maintains our depth-first queue of loops in this nest to process.
@@ -726,7 +729,8 @@ bool llvm::simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, Pass *PP,
   }
 
   while (!Worklist.empty())
-    Changed |= simplifyOneLoop(Worklist.pop_back_val(), Worklist, AA, DT, LI, SE, PP);
+    Changed |= simplifyOneLoop(Worklist.pop_back_val(), Worklist, AA, DT, LI,
+                               SE, PP, DL);
 
   return Changed;
 }
@@ -744,6 +748,7 @@ namespace {
     DominatorTree *DT;
     LoopInfo *LI;
     ScalarEvolution *SE;
+    const DataLayout *DL;
 
     bool runOnFunction(Function &F) override;
 
@@ -787,10 +792,12 @@ bool LoopSimplify::runOnFunction(Function &F) {
   LI = &getAnalysis<LoopInfo>();
   DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
   SE = getAnalysisIfAvailable<ScalarEvolution>();
+  DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
+  DL = DLP ? &DLP->getDataLayout() : nullptr;
 
   // Simplify each loop nest in the function.
   for (LoopInfo::iterator I = LI->begin(), E = LI->end(); I != E; ++I)
-    Changed |= simplifyLoop(*I, DT, LI, this, AA, SE);
+    Changed |= simplifyLoop(*I, DT, LI, this, AA, SE, DL);
 
   return Changed;
 }
diff --git a/lib/Transforms/Utils/LoopUnroll.cpp b/lib/Transforms/Utils/LoopUnroll.cpp
index 16975b9e6374..ab1c25a75e26 100644
--- a/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/lib/Transforms/Utils/LoopUnroll.cpp
@@ -17,12 +17,14 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/Utils/UnrollLoop.h"
+#include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/LoopIterator.h"
 #include "llvm/Analysis/LoopPass.h"
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/LLVMContext.h"
@@ -63,10 +65,15 @@ static inline void RemapInstruction(Instruction *I,
 
 /// FoldBlockIntoPredecessor - Folds a basic block into its predecessor if it
 /// only has one predecessor, and that predecessor only has one successor.
-/// The LoopInfo Analysis that is passed will be kept consistent.
-/// Returns the new combined block.
-static BasicBlock *FoldBlockIntoPredecessor(BasicBlock *BB, LoopInfo* LI,
-                                            LPPassManager *LPM) {
+/// The LoopInfo Analysis that is passed will be kept consistent. If folding is
+/// successful references to the containing loop must be removed from
+/// ScalarEvolution by calling ScalarEvolution::forgetLoop because SE may have
+/// references to the eliminated BB. The argument ForgottenLoops contains a set
+/// of loops that have already been forgotten to prevent redundant, expensive
+/// calls to ScalarEvolution::forgetLoop. Returns the new combined block.
+static BasicBlock *
+FoldBlockIntoPredecessor(BasicBlock *BB, LoopInfo* LI, LPPassManager *LPM,
+                         SmallPtrSetImpl<Loop *> &ForgottenLoops) {
   // Merge basic blocks into their predecessor if there is only one distinct
   // pred, and if there is only one distinct successor of the predecessor, and
   // if there are no PHI nodes.
@@ -103,8 +110,10 @@ static BasicBlock *FoldBlockIntoPredecessor(BasicBlock *BB, LoopInfo* LI,
   // ScalarEvolution holds references to loop exit blocks.
   if (LPM) {
    if (ScalarEvolution *SE = LPM->getAnalysisIfAvailable<ScalarEvolution>()) {
-      if (Loop *L = LI->getLoopFor(BB))
-        SE->forgetLoop(L);
+      if (Loop *L = LI->getLoopFor(BB)) {
+        if (ForgottenLoops.insert(L))
+          SE->forgetLoop(L);
+      }
     }
   }
   LI->removeBlock(BB);
@@ -242,21 +251,25 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount,
                             Twine("completely unrolled loop with ") +
                                 Twine(TripCount) + " iterations");
   } else {
+    auto EmitDiag = [&](const Twine &T) {
+      emitOptimizationRemark(Ctx, DEBUG_TYPE, *F, LoopLoc,
+                             "unrolled loop by a factor of " + Twine(Count) +
+                                 T);
+    };
+
     DEBUG(dbgs() << "UNROLLING loop %" << Header->getName()
                  << " by " << Count);
-    Twine DiagMsg("unrolled loop by a factor of " + Twine(Count));
     if (TripMultiple == 0 || BreakoutTrip != TripMultiple) {
       DEBUG(dbgs() << " with a breakout at trip " << BreakoutTrip);
-      DiagMsg.concat(" with a breakout at trip " + Twine(BreakoutTrip));
+      EmitDiag(" with a breakout at trip " + Twine(BreakoutTrip));
     } else if (TripMultiple != 1) {
       DEBUG(dbgs() << " with " << TripMultiple << " trips per branch");
-      DiagMsg.concat(" with " + Twine(TripMultiple) + " trips per branch");
+      EmitDiag(" with " + Twine(TripMultiple) + " trips per branch");
     } else if (RuntimeTripCount) {
       DEBUG(dbgs() << " with run-time trip count");
-      DiagMsg.concat(" with run-time trip count");
+      EmitDiag(" with run-time trip count");
     }
     DEBUG(dbgs() << "!\n");
-    emitOptimizationRemark(Ctx, DEBUG_TYPE, *F, LoopLoc, DiagMsg);
   }
 
   bool ContinueOnTrue = L->contains(BI->getSuccessor(0));
@@ -418,11 +431,13 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount,
   }
 
   // Merge adjacent basic blocks, if possible.
+  SmallPtrSet<Loop *, 4> ForgottenLoops;
   for (unsigned i = 0, e = Latches.size(); i != e; ++i) {
     BranchInst *Term = cast<BranchInst>(Latches[i]->getTerminator());
     if (Term->isUnconditional()) {
       BasicBlock *Dest = Term->getSuccessor(0);
-      if (BasicBlock *Fold = FoldBlockIntoPredecessor(Dest, LI, LPM))
+      if (BasicBlock *Fold = FoldBlockIntoPredecessor(Dest, LI, LPM,
+                                                      ForgottenLoops))
         std::replace(Latches.begin(), Latches.end(), Dest, Fold);
     }
   }
@@ -485,8 +500,10 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount,
   if (!OuterL && !CompletelyUnroll)
     OuterL = L;
   if (OuterL) {
+    DataLayoutPass *DLP = PP->getAnalysisIfAvailable<DataLayoutPass>();
+    const DataLayout *DL = DLP ? &DLP->getDataLayout() : nullptr;
     ScalarEvolution *SE = PP->getAnalysisIfAvailable<ScalarEvolution>();
-    simplifyLoop(OuterL, DT, LI, PP, /*AliasAnalysis*/ nullptr, SE);
+    simplifyLoop(OuterL, DT, LI, PP, /*AliasAnalysis*/ nullptr, SE, DL);
 
     // LCSSA must be performed on the outermost affected loop. The unrolled
     // loop's last loop latch is guaranteed to be in the outermost loop after
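[Editorial sketch, not part of the patch: the EmitDiag lambda above replaces an accumulation idiom that silently did nothing. Twine::concat is const and returns a new Twine rather than modifying the receiver, so the result of DiagMsg.concat(...) was discarded; a demonstration of the gotcha:]

    #include "llvm/ADT/Twine.h"
    using namespace llvm;

    void twineGotcha() {
      Twine Msg("unrolled loop by a factor of 2");
      // concat() returns a new Twine and leaves Msg untouched, so this
      // statement has no effect -- the bug the lambda-based rewrite avoids.
      Msg.concat(" with run-time trip count");
    }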
diff --git a/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/lib/Transforms/Utils/LoopUnrollRuntime.cpp
index 5bef091a499b..a96c46ad63e0 100644
--- a/lib/Transforms/Utils/LoopUnrollRuntime.cpp
+++ b/lib/Transforms/Utils/LoopUnrollRuntime.cpp
@@ -280,17 +280,17 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, LoopInfo *LI,
   SCEVExpander Expander(*SE, "loop-unroll");
   Value *TripCount = Expander.expandCodeFor(TripCountSC, TripCountSC->getType(),
                                             PreHeaderBR);
-  Type *CountTy = TripCount->getType();
-  BinaryOperator *ModVal =
-    BinaryOperator::CreateURem(TripCount,
-                               ConstantInt::get(CountTy, Count),
-                               "xtraiter");
-  ModVal->insertBefore(PreHeaderBR);
-
-  // Check if for no extra iterations, then jump to unrolled loop
-  Value *BranchVal = new ICmpInst(PreHeaderBR,
-                                  ICmpInst::ICMP_NE, ModVal,
-                                  ConstantInt::get(CountTy, 0), "lcmp");
+
+  IRBuilder<> B(PreHeaderBR);
+  Value *ModVal = B.CreateAnd(TripCount, Count - 1, "xtraiter");
+
+  // Check if for no extra iterations, then jump to unrolled loop. We have to
+  // check that the trip count computation didn't overflow when adding one to
+  // the backedge taken count.
+  Value *LCmp = B.CreateIsNotNull(ModVal, "lcmp.mod");
+  Value *OverflowCheck = B.CreateIsNull(TripCount, "lcmp.overflow");
+  Value *BranchVal = B.CreateOr(OverflowCheck, LCmp, "lcmp.or");
+
   // Branch to either the extra iterations or the unrolled loop
   // We will fix up the true branch label when adding loop body copies
   BranchInst::Create(PEnd, PEnd, BranchVal, PreHeaderBR);
@@ -344,6 +344,7 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, LoopInfo *LI,
   }
 
   // The comparison w/ the extra iteration value and branch
+  Type *CountTy = TripCount->getType();
   Value *BranchVal = new ICmpInst(*NewBB, ICmpInst::ICMP_EQ, ModVal,
                                   ConstantInt::get(CountTy, leftOverIters),
                                   "un.tmp");
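[Editorial sketch, not part of the patch: the prolog hunk swaps CreateURem for CreateAnd. This is the usual power-of-two strength reduction -- n % Count == n & (Count - 1) when Count is a power of two, which runtime unrolling appears to assume for its count here. A quick standalone check:]

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint32_t Count = 8;                 // power-of-two unroll factor
      for (uint32_t n = 0; n < 64; ++n)
        assert(n % Count == (n & (Count - 1))); // the "xtraiter" mask
      return 0;
    }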
diff --git a/lib/Transforms/Utils/LowerSwitch.cpp b/lib/Transforms/Utils/LowerSwitch.cpp
index eac693bdf8a4..d6e5bb626805 100644
--- a/lib/Transforms/Utils/LowerSwitch.cpp
+++ b/lib/Transforms/Utils/LowerSwitch.cpp
@@ -67,8 +67,8 @@ namespace {
 
     BasicBlock *switchConvert(CaseItr Begin, CaseItr End,
                               ConstantInt *LowerBound, ConstantInt *UpperBound,
-                              Value *Val, BasicBlock *OrigBlock,
-                              BasicBlock *Default);
+                              Value *Val, BasicBlock *Predecessor,
+                              BasicBlock *OrigBlock, BasicBlock *Default);
     BasicBlock *newLeafBlock(CaseRange &Leaf, Value *Val, BasicBlock *OrigBlock,
                              BasicBlock *Default);
     unsigned Clusterify(CaseVector &Cases, SwitchInst *SI);
@@ -131,6 +131,21 @@ static raw_ostream& operator<<(raw_ostream &O,
   return O << "]";
 }
 
+static void fixPhis(BasicBlock *Succ,
+                    BasicBlock *OrigBlock,
+                    BasicBlock *NewNode) {
+  for (BasicBlock::iterator I = Succ->begin(),
+                            E = Succ->getFirstNonPHI();
+       I != E; ++I) {
+    PHINode *PN = cast<PHINode>(I);
+
+    for (unsigned I = 0, E = PN->getNumIncomingValues(); I != E; ++I) {
+      if (PN->getIncomingBlock(I) == OrigBlock)
+        PN->setIncomingBlock(I, NewNode);
+    }
+  }
+}
+
 // switchConvert - Convert the switch statement into a binary lookup of
 // the case values. The function recursively builds this tree.
 // LowerBound and UpperBound are used to keep track of the bounds for Val
@@ -139,6 +154,7 @@ static raw_ostream& operator<<(raw_ostream &O,
 BasicBlock *LowerSwitch::switchConvert(CaseItr Begin, CaseItr End,
                                        ConstantInt *LowerBound,
                                        ConstantInt *UpperBound, Value *Val,
+                                       BasicBlock *Predecessor,
                                        BasicBlock *OrigBlock,
                                        BasicBlock *Default) {
   unsigned Size = End - Begin;
@@ -149,6 +165,7 @@ BasicBlock *LowerSwitch::switchConvert(CaseItr Begin, CaseItr End,
     // emitting the code that checks if the value actually falls in the range
     // because the bounds already tell us so.
     if (Begin->Low == LowerBound && Begin->High == UpperBound) {
+      fixPhis(Begin->BB, OrigBlock, Predecessor);
       return Begin->BB;
     }
     return newLeafBlock(*Begin, Val, OrigBlock, Default);
@@ -200,21 +217,25 @@ BasicBlock *LowerSwitch::switchConvert(CaseItr Begin, CaseItr End,
       dbgs() << "NONE\n";
     });
 
-  BasicBlock *LBranch = switchConvert(LHS.begin(), LHS.end(), LowerBound,
-                                      NewUpperBound, Val, OrigBlock, Default);
-  BasicBlock *RBranch = switchConvert(RHS.begin(), RHS.end(), NewLowerBound,
-                                      UpperBound, Val, OrigBlock, Default);
-
   // Create a new node that checks if the value is < pivot. Go to the
   // left branch if it is and right branch if not.
   Function* F = OrigBlock->getParent();
   BasicBlock* NewNode = BasicBlock::Create(Val->getContext(), "NodeBlock");
-  Function::iterator FI = OrigBlock;
-  F->getBasicBlockList().insert(++FI, NewNode);
 
   ICmpInst* Comp = new ICmpInst(ICmpInst::ICMP_SLT,
                                 Val, Pivot.Low, "Pivot");
+
+  BasicBlock *LBranch = switchConvert(LHS.begin(), LHS.end(), LowerBound,
+                                      NewUpperBound, Val, NewNode, OrigBlock,
+                                      Default);
+  BasicBlock *RBranch = switchConvert(RHS.begin(), RHS.end(), NewLowerBound,
+                                      UpperBound, Val, NewNode, OrigBlock,
+                                      Default);
+
+  Function::iterator FI = OrigBlock;
+  F->getBasicBlockList().insert(++FI, NewNode);
   NewNode->getInstList().push_back(Comp);
+
   BranchInst::Create(LBranch, RBranch, Comp, NewNode);
   return NewNode;
 }
@@ -386,7 +407,7 @@ void LowerSwitch::processSwitchInst(SwitchInst *SI) {
   }
 
   BasicBlock *SwitchBlock = switchConvert(Cases.begin(), Cases.end(),
                                           LowerBound, UpperBound, Val,
-                                          OrigBlock, NewDefault);
+                                          OrigBlock, OrigBlock, NewDefault);
 
   // Branch to our shiny new if-then stuff...
   BranchInst::Create(SwitchBlock, OrigBlock);
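[Editorial sketch, not part of the patch: a source-level picture of what switchConvert builds, with invented case values. The case list is split at a pivot and lowered to a binary tree of comparisons; fixPhis retargets PHI incoming blocks when a leaf is now reached through the new interior NodeBlocks rather than OrigBlock.]

    int loweredSwitch(int v) {       // stands in for: switch on cases 10, 20, 30
      if (v < 20) {                  // NodeBlock comparing against the pivot
        return v == 10 ? 1 : 0;      // LeafBlock for the low half
      }
      if (v < 30)                    // NodeBlock for the high half
        return v == 20 ? 2 : 0;
      return v == 30 ? 3 : 0;
    }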
diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp
index e155daf6fcce..65b85f7114bf 100644
--- a/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -201,8 +201,8 @@ static void AddPredecessorToBlock(BasicBlock *Succ, BasicBlock *NewPred,
 /// ComputeSpeculationCost - Compute an abstract "cost" of speculating the
 /// given instruction, which is assumed to be safe to speculate. 1 means
 /// cheap, 2 means less cheap, and UINT_MAX means prohibitively expensive.
-static unsigned ComputeSpeculationCost(const User *I) {
-  assert(isSafeToSpeculativelyExecute(I) &&
+static unsigned ComputeSpeculationCost(const User *I, const DataLayout *DL) {
+  assert(isSafeToSpeculativelyExecute(I, DL) &&
          "Instruction is not safe to speculatively execute!");
   switch (Operator::getOpcode(I)) {
   default:
@@ -257,7 +257,8 @@ static unsigned ComputeSpeculationCost(const User *I) {
 /// CostRemaining, false is returned and CostRemaining is undefined.
 static bool DominatesMergePoint(Value *V, BasicBlock *BB,
                                 SmallPtrSet<Instruction*, 4> *AggressiveInsts,
-                                unsigned &CostRemaining) {
+                                unsigned &CostRemaining,
+                                const DataLayout *DL) {
   Instruction *I = dyn_cast<Instruction>(V);
   if (!I) {
     // Non-instructions all dominate instructions, but not all constantexprs
@@ -290,10 +291,10 @@ static bool DominatesMergePoint(Value *V, BasicBlock *BB,
   // Okay, it looks like the instruction IS in the "condition".  Check to
   // see if it's a cheap instruction to unconditionally compute, and if it
   // only uses stuff defined outside of the condition.  If so, hoist it out.
-  if (!isSafeToSpeculativelyExecute(I))
+  if (!isSafeToSpeculativelyExecute(I, DL))
     return false;
 
-  unsigned Cost = ComputeSpeculationCost(I);
+  unsigned Cost = ComputeSpeculationCost(I, DL);
 
   if (Cost > CostRemaining)
     return false;
@@ -303,7 +304,7 @@ static bool DominatesMergePoint(Value *V, BasicBlock *BB,
   // Okay, we can only really hoist these out if their operands do
   // not take us over the cost threshold.
   for (User::op_iterator i = I->op_begin(), e = I->op_end(); i != e; ++i)
-    if (!DominatesMergePoint(*i, BB, AggressiveInsts, CostRemaining))
+    if (!DominatesMergePoint(*i, BB, AggressiveInsts, CostRemaining, DL))
       return false;
   // Okay, it's safe to do this!  Remember this instruction.
   AggressiveInsts->insert(I);
@@ -997,7 +998,7 @@ static bool isSafeToHoistInvoke(BasicBlock *BB1, BasicBlock *BB2,
 /// HoistThenElseCodeToIf - Given a conditional branch that goes to BB1 and
 /// BB2, hoist any common code in the two blocks up into the branch block.  The
 /// caller of this function guarantees that BI's block dominates BB1 and BB2.
-static bool HoistThenElseCodeToIf(BranchInst *BI) {
+static bool HoistThenElseCodeToIf(BranchInst *BI, const DataLayout *DL) {
   // This does very trivial matching, with limited scanning, to find identical
   // instructions in the two blocks.  In particular, we don't want to get into
   // O(M*N) situations here where M and N are the sizes of BB1 and BB2.  As
@@ -1039,6 +1040,10 @@ static bool HoistThenElseCodeToIf(BranchInst *BI) {
     if (!I2->use_empty())
       I2->replaceAllUsesWith(I1);
     I1->intersectOptionalDataWith(I2);
+    I1->setMetadata(LLVMContext::MD_range,
+                    MDNode::getMostGenericRange(
+                        I1->getMetadata(LLVMContext::MD_range),
+                        I2->getMetadata(LLVMContext::MD_range)));
     I2->eraseFromParent();
     Changed = true;
 
@@ -1071,9 +1076,9 @@ static bool HoistThenElseCodeToIf(BranchInst *BI) {
       if (BB1V == BB2V)
         continue;
 
-      if (isa<ConstantExpr>(BB1V) && !isSafeToSpeculativelyExecute(BB1V))
+      if (isa<ConstantExpr>(BB1V) && !isSafeToSpeculativelyExecute(BB1V, DL))
         return Changed;
-      if (isa<ConstantExpr>(BB2V) && !isSafeToSpeculativelyExecute(BB2V))
+      if (isa<ConstantExpr>(BB2V) && !isSafeToSpeculativelyExecute(BB2V, DL))
         return Changed;
     }
   }
@@ -1390,7 +1395,8 @@ static Value *isSafeToSpeculateStore(Instruction *I, BasicBlock *BrBB,
 /// \endcode
 ///
 /// \returns true if the conditional block is removed.
-static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB) {
+static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB,
+                                   const DataLayout *DL) {
   // Be conservative for now. FP select instruction can often be expensive.
   Value *BrCond = BI->getCondition();
   if (isa<FCmpInst>(BrCond))
@@ -1433,13 +1439,13 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB) {
       return false;
 
     // Don't hoist the instruction if it's unsafe or expensive.
-    if (!isSafeToSpeculativelyExecute(I) &&
+    if (!isSafeToSpeculativelyExecute(I, DL) &&
         !(HoistCondStores &&
           (SpeculatedStoreValue = isSafeToSpeculateStore(I, BB, ThenBB,
                                                          EndBB))))
       return false;
     if (!SpeculatedStoreValue &&
-        ComputeSpeculationCost(I) > PHINodeFoldingThreshold)
+        ComputeSpeculationCost(I, DL) > PHINodeFoldingThreshold)
       return false;
 
     // Store the store speculation candidate.
@@ -1490,11 +1496,11 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB) {
     if (!OrigCE && !ThenCE)
       continue; // Known safe and cheap.
 
-    if ((ThenCE && !isSafeToSpeculativelyExecute(ThenCE)) ||
-        (OrigCE && !isSafeToSpeculativelyExecute(OrigCE)))
+    if ((ThenCE && !isSafeToSpeculativelyExecute(ThenCE, DL)) ||
+        (OrigCE && !isSafeToSpeculativelyExecute(OrigCE, DL)))
      return false;
-    unsigned OrigCost = OrigCE ? ComputeSpeculationCost(OrigCE) : 0;
-    unsigned ThenCost = ThenCE ? ComputeSpeculationCost(ThenCE) : 0;
+    unsigned OrigCost = OrigCE ? ComputeSpeculationCost(OrigCE, DL) : 0;
+    unsigned ThenCost = ThenCE ? ComputeSpeculationCost(ThenCE, DL) : 0;
     if (OrigCost + ThenCost > 2 * PHINodeFoldingThreshold)
       return false;
@@ -1741,9 +1747,9 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const DataLayout *DL) {
     }
 
     if (!DominatesMergePoint(PN->getIncomingValue(0), BB, &AggressiveInsts,
-                             MaxCostVal0) ||
+                             MaxCostVal0, DL) ||
         !DominatesMergePoint(PN->getIncomingValue(1), BB, &AggressiveInsts,
-                             MaxCostVal1))
+                             MaxCostVal1, DL))
       return false;
   }
@@ -1961,7 +1967,7 @@ static bool checkCSEInPredecessor(Instruction *Inst, BasicBlock *PB) {
 /// FoldBranchToCommonDest - If this basic block is simple enough, and if a
 /// predecessor branches to us and one of our successors, fold the block into
 /// the predecessor and use logical operations to pick the right destination.
-bool llvm::FoldBranchToCommonDest(BranchInst *BI) {
+bool llvm::FoldBranchToCommonDest(BranchInst *BI, const DataLayout *DL) {
   BasicBlock *BB = BI->getParent();
 
   Instruction *Cond = nullptr;
@@ -2013,7 +2019,7 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) {
   Instruction *BonusInst = nullptr;
   if (&*FrontIt != Cond &&
       FrontIt->hasOneUse() && FrontIt->user_back() == Cond &&
-      isSafeToSpeculativelyExecute(FrontIt)) {
+      isSafeToSpeculativelyExecute(FrontIt, DL)) {
     BonusInst = &*FrontIt;
     ++FrontIt;
 
@@ -2028,7 +2034,7 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) {
   // Make sure the instruction after the condition is the cond branch.
   BasicBlock::iterator CondIt = Cond; ++CondIt;
 
-  // Ingore dbg intrinsics.
+  // Ignore dbg intrinsics.
   while (isa<DbgInfoIntrinsic>(CondIt)) ++CondIt;
 
   if (&*CondIt != BI)
@@ -2343,7 +2349,7 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI) {
   }
 
   // If this is a conditional branch in an empty block, and if any
-  // predecessors is a conditional branch to one of our destinations,
+  // predecessors are a conditional branch to one of our destinations,
   // fold the conditions into logical ops and one cond br.
   BasicBlock::iterator BBI = BB->begin();
   // Ignore dbg intrinsics.
@@ -2378,16 +2384,33 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI) {
   // Do not perform this transformation if it would require
   // insertion of a large number of select instructions. For targets
   // without predication/cmovs, this is a big pessimization.
-  BasicBlock *CommonDest = PBI->getSuccessor(PBIOp);
+  // Also do not perform this transformation if any phi node in the common
+  // destination block can trap when reached by BB or PBB (PR17073). In that
+  // case, it would be unsafe to hoist the operation into a select instruction.
+
+  BasicBlock *CommonDest = PBI->getSuccessor(PBIOp);
   unsigned NumPhis = 0;
   for (BasicBlock::iterator II = CommonDest->begin();
-       isa<PHINode>(II); ++II, ++NumPhis)
+       isa<PHINode>(II); ++II, ++NumPhis) {
     if (NumPhis > 2) // Disable this xform.
       return false;
 
+    PHINode *PN = cast<PHINode>(II);
+    Value *BIV = PN->getIncomingValueForBlock(BB);
+    if (ConstantExpr *CE = dyn_cast<ConstantExpr>(BIV))
+      if (CE->canTrap())
+        return false;
+
+    unsigned PBBIdx = PN->getBasicBlockIndex(PBI->getParent());
+    Value *PBIV = PN->getIncomingValue(PBBIdx);
+    if (ConstantExpr *CE = dyn_cast<ConstantExpr>(PBIV))
+      if (CE->canTrap())
+        return false;
+  }
+
   // Finally, if everything is ok, fold the branches to logical ops.
-  BasicBlock *OtherDest  = BI->getSuccessor(BIOp ^ 1);
+  BasicBlock *OtherDest = BI->getSuccessor(BIOp ^ 1);
 
   DEBUG(dbgs() << "FOLDING BRs:" << *PBI->getParent()
                << "AND: " << *BI->getParent());
@@ -3311,6 +3334,11 @@ static bool ForwardSwitchConditionToPHI(SwitchInst *SI) {
 /// ValidLookupTableConstant - Return true if the backend will be able to handle
 /// initializing an array of constants like C.
 static bool ValidLookupTableConstant(Constant *C) {
+  if (C->isThreadDependent())
+    return false;
+  if (C->isDLLImportDependent())
+    return false;
+
   if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C))
     return CE->isGEPWithNoNotionalOverIndexing();
 
@@ -3524,7 +3552,8 @@ SwitchLookupTable::SwitchLookupTable(Module &M,
 
   // Fill in any holes in the table with the default result.
   if (Values.size() < TableSize) {
-    assert(DefaultValue && "Need a default value to fill the lookup table holes.");
+    assert(DefaultValue &&
+           "Need a default value to fill the lookup table holes.");
     assert(DefaultValue->getType() == ValueType);
     for (uint64_t I = 0; I < TableSize; ++I) {
       if (!TableContents[I])
@@ -3993,7 +4022,7 @@ bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI, IRBuilder<> &Builder){
     // branches to us and our successor, fold the comparison into the
     // predecessor and use logical operations to update the incoming value
    // for PHI nodes in common successor.
-    if (FoldBranchToCommonDest(BI))
+    if (FoldBranchToCommonDest(BI, DL))
       return SimplifyCFG(BB, TTI, DL) | true;
     return false;
   }
@@ -4037,7 +4066,7 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
   // If this basic block is ONLY a compare and a branch, and if a predecessor
   // branches to us and one of our successors, fold the comparison into the
   // predecessor and use logical operations to pick the right destination.
-  if (FoldBranchToCommonDest(BI))
+  if (FoldBranchToCommonDest(BI, DL))
     return SimplifyCFG(BB, TTI, DL) | true;
 
   // We have a conditional branch to two blocks that are only reachable
@@ -4046,24 +4075,24 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
   // can hoist it up to the branching block.
   if (BI->getSuccessor(0)->getSinglePredecessor()) {
    if (BI->getSuccessor(1)->getSinglePredecessor()) {
-      if (HoistThenElseCodeToIf(BI))
+      if (HoistThenElseCodeToIf(BI, DL))
        return SimplifyCFG(BB, TTI, DL) | true;
     } else {
       // If Successor #1 has multiple preds, we may be able to conditionally
-      // execute Successor #0 if it branches to successor #1.
+      // execute Successor #0 if it branches to Successor #1.
      TerminatorInst *Succ0TI = BI->getSuccessor(0)->getTerminator();
      if (Succ0TI->getNumSuccessors() == 1 &&
          Succ0TI->getSuccessor(0) == BI->getSuccessor(1))
-        if (SpeculativelyExecuteBB(BI, BI->getSuccessor(0)))
+        if (SpeculativelyExecuteBB(BI, BI->getSuccessor(0), DL))
          return SimplifyCFG(BB, TTI, DL) | true;
     }
   } else if (BI->getSuccessor(1)->getSinglePredecessor()) {
     // If Successor #0 has multiple preds, we may be able to conditionally
-    // execute Successor #1 if it branches to successor #0.
+    // execute Successor #1 if it branches to Successor #0.
     TerminatorInst *Succ1TI = BI->getSuccessor(1)->getTerminator();
     if (Succ1TI->getNumSuccessors() == 1 &&
         Succ1TI->getSuccessor(0) == BI->getSuccessor(0))
-      if (SpeculativelyExecuteBB(BI, BI->getSuccessor(1)))
+      if (SpeculativelyExecuteBB(BI, BI->getSuccessor(1), DL))
        return SimplifyCFG(BB, TTI, DL) | true;
   }
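[Editorial sketch, not part of the patch: SpeculativelyExecuteBB, with the DataLayout now threaded through its safety and cost checks, performs the classic if-to-select conversion. A source-level analogue with invented names:]

    int speculateBefore(bool c, int x) {
      int r = x;
      if (c)
        r = x + 1;        // side-effect free and cheap: safe to speculate
      return r;           // PHI of the two values of r
    }

    int speculateAfter(bool c, int x) {
      int t = x + 1;      // hoisted unconditionally into the predecessor
      return c ? t : x;   // branch + PHI folded to a select
    }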
+static void propagateMetadata(SmallVectorImpl &To, const Instruction *From) { + for (Value *V : To) + if (Instruction *I = dyn_cast(V)) + propagateMetadata(I, From); +} + /// LoopVectorizationLegality checks if it is legal to vectorize a loop, and /// to what vectorization factor. /// This class does not look at the profitability of vectorization, only the @@ -515,10 +569,12 @@ class LoopVectorizationLegality { unsigned NumPredStores; LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, const DataLayout *DL, - DominatorTree *DT, TargetLibraryInfo *TLI) + DominatorTree *DT, TargetLibraryInfo *TLI, + AliasAnalysis *AA, Function *F) : NumLoads(0), NumStores(0), NumPredStores(0), TheLoop(L), SE(SE), DL(DL), - DT(DT), TLI(TLI), Induction(nullptr), WidestIndTy(nullptr), - HasFunNoNaNAttr(false), MaxSafeDepDistBytes(-1U) {} + DT(DT), TLI(TLI), AA(AA), TheFunction(F), Induction(nullptr), + WidestIndTy(nullptr), HasFunNoNaNAttr(false), MaxSafeDepDistBytes(-1U) { + } /// This enum represents the kinds of reductions that we support. enum ReductionKind { @@ -604,11 +660,12 @@ class LoopVectorizationLegality { Ends.clear(); IsWritePtr.clear(); DependencySetId.clear(); + AliasSetId.clear(); } /// Insert a pointer and calculate the start and end SCEVs. void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr, bool WritePtr, - unsigned DepSetId, ValueToValueMap &Strides); + unsigned DepSetId, unsigned ASId, ValueToValueMap &Strides); /// This flag indicates if we need to add the runtime check. bool Need; @@ -623,6 +680,8 @@ class LoopVectorizationLegality { /// Holds the id of the set of pointers that could be dependent because of a /// shared underlying object. SmallVector DependencySetId; + /// Holds the id of the disjoint alias set to which this pointer belongs. + SmallVector AliasSetId; }; /// A struct for saving information about induction variables. @@ -747,6 +806,16 @@ class LoopVectorizationLegality { /// invariant. void collectStridedAcccess(Value *LoadOrStoreInst); + /// Report an analysis message to assist the user in diagnosing loops that are + /// not vectorized. + void emitAnalysis(Report &Message) { + DebugLoc DL = TheLoop->getStartLoc(); + if (Instruction *I = Message.getInstr()) + DL = I->getDebugLoc(); + emitOptimizationRemarkAnalysis(TheFunction->getContext(), DEBUG_TYPE, + *TheFunction, DL, Message.str()); + } + /// The loop that we evaluate. Loop *TheLoop; /// Scev analysis. @@ -757,6 +826,10 @@ class LoopVectorizationLegality { DominatorTree *DT; /// Target Library Info. TargetLibraryInfo *TLI; + /// Alias analysis. + AliasAnalysis *AA; + /// Parent function + Function *TheFunction; // --- vectorization state --- // @@ -905,8 +978,8 @@ class LoopVectorizeHints { << "LV: Unrolling disabled by the pass manager\n"); } - /// Return the loop vectorizer metadata prefix. - static StringRef Prefix() { return "llvm.vectorizer."; } + /// Return the loop metadata prefix. 
+ static StringRef Prefix() { return "llvm.loop."; } MDNode *createHint(LLVMContext &Context, StringRef Name, unsigned V) const { SmallVector Vals; @@ -928,8 +1001,10 @@ class LoopVectorizeHints { for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) Vals.push_back(LoopID->getOperand(i)); - Vals.push_back(createHint(Context, Twine(Prefix(), "width").str(), Width)); - Vals.push_back(createHint(Context, Twine(Prefix(), "unroll").str(), 1)); + Vals.push_back( + createHint(Context, Twine(Prefix(), "vectorize.width").str(), Width)); + Vals.push_back( + createHint(Context, Twine(Prefix(), "interleave.count").str(), 1)); MDNode *NewLoopID = MDNode::get(Context, Vals); // Set operand 0 to refer to the loop id itself. @@ -942,6 +1017,29 @@ class LoopVectorizeHints { LoopID = NewLoopID; } + std::string emitRemark() const { + Report R; + R << "vectorization "; + switch (Force) { + case LoopVectorizeHints::FK_Disabled: + R << "is explicitly disabled"; + break; + case LoopVectorizeHints::FK_Enabled: + R << "is explicitly enabled"; + if (Width != 0 && Unroll != 0) + R << " with width " << Width << " and interleave count " << Unroll; + else if (Width != 0) + R << " with width " << Width; + else if (Unroll != 0) + R << " with interleave count " << Unroll; + break; + case LoopVectorizeHints::FK_Undefined: + R << "was not specified"; + break; + } + return R.str(); + } + unsigned getWidth() const { return Width; } unsigned getUnroll() const { return Unroll; } enum ForceKind getForce() const { return Force; } @@ -977,7 +1075,7 @@ class LoopVectorizeHints { if (!S) continue; - // Check if the hint starts with the vectorizer prefix. + // Check if the hint starts with the loop metadata prefix. StringRef Hint = S->getString(); if (!Hint.startswith(Prefix())) continue; @@ -995,22 +1093,22 @@ class LoopVectorizeHints { if (!C) return; unsigned Val = C->getZExtValue(); - if (Hint == "width") { + if (Hint == "vectorize.width") { if (isPowerOf2_32(Val) && Val <= MaxVectorWidth) Width = Val; else DEBUG(dbgs() << "LV: ignoring invalid width hint metadata\n"); - } else if (Hint == "unroll") { - if (isPowerOf2_32(Val) && Val <= MaxUnrollFactor) - Unroll = Val; - else - DEBUG(dbgs() << "LV: ignoring invalid unroll hint metadata\n"); - } else if (Hint == "enable") { + } else if (Hint == "vectorize.enable") { if (C->getBitWidth() == 1) Force = Val == 1 ? 
LoopVectorizeHints::FK_Enabled : LoopVectorizeHints::FK_Disabled; else DEBUG(dbgs() << "LV: ignoring invalid enable hint metadata\n"); + } else if (Hint == "interleave.count") { + if (isPowerOf2_32(Val) && Val <= MaxUnrollFactor) + Unroll = Val; + else + DEBUG(dbgs() << "LV: ignoring invalid unroll hint metadata\n"); } else { DEBUG(dbgs() << "LV: ignoring unknown hint " << Hint << '\n'); } @@ -1026,6 +1124,23 @@ class LoopVectorizeHints { MDNode *LoopID; }; +static void emitMissedWarning(Function *F, Loop *L, + const LoopVectorizeHints &LH) { + emitOptimizationRemarkMissed(F->getContext(), DEBUG_TYPE, *F, + L->getStartLoc(), LH.emitRemark()); + + if (LH.getForce() == LoopVectorizeHints::FK_Enabled) { + if (LH.getWidth() != 1) + emitLoopVectorizeWarning( + F->getContext(), *F, L->getStartLoc(), + "failed explicitly specified loop vectorization"); + else if (LH.getUnroll() != 1) + emitLoopInterleaveWarning( + F->getContext(), *F, L->getStartLoc(), + "failed explicitly specified loop interleaving"); + } +} + static void addInnerLoop(Loop &L, SmallVectorImpl &V) { if (L.empty()) return V.push_back(&L); @@ -1053,6 +1168,7 @@ struct LoopVectorize : public FunctionPass { DominatorTree *DT; BlockFrequencyInfo *BFI; TargetLibraryInfo *TLI; + AliasAnalysis *AA; bool DisableUnrolling; bool AlwaysVectorize; @@ -1067,6 +1183,7 @@ struct LoopVectorize : public FunctionPass { DT = &getAnalysis().getDomTree(); BFI = &getAnalysis(); TLI = getAnalysisIfAvailable(); + AA = &getAnalysis(); // Compute some weights outside of the loop over the loops. Compute this // using a BranchProbability to re-use its scaling math. @@ -1125,18 +1242,37 @@ struct LoopVectorize : public FunctionPass { : "?")) << " width=" << Hints.getWidth() << " unroll=" << Hints.getUnroll() << "\n"); + // Function containing loop + Function *F = L->getHeader()->getParent(); + + // Looking at the diagnostic output is the only way to determine if a loop + // was vectorized (other than looking at the IR or machine code), so it + // is important to generate an optimization remark for each loop. Most of + // these messages are generated by emitOptimizationRemarkAnalysis. Remarks + // generated by emitOptimizationRemark and emitOptimizationRemarkMissed are + // less verbose reporting vectorized loops and unvectorized loops that may + // benefit from vectorization, respectively. 
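// As a sketch of how these remarks reach users (assuming the generic
// -pass-remarks family of flags; exact spellings are version-dependent):
//
//   opt -loop-vectorize -pass-remarks-analysis=loop-vectorize -S in.ll
//
// With debug info present, this prints messages such as
// "loop not vectorized: ..." keyed to each loop's source location.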
+ if (Hints.getForce() == LoopVectorizeHints::FK_Disabled) { DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n"); + emitOptimizationRemarkAnalysis(F->getContext(), DEBUG_TYPE, *F, + L->getStartLoc(), Hints.emitRemark()); return false; } if (!AlwaysVectorize && Hints.getForce() != LoopVectorizeHints::FK_Enabled) { DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n"); + emitOptimizationRemarkAnalysis(F->getContext(), DEBUG_TYPE, *F, + L->getStartLoc(), Hints.emitRemark()); return false; } if (Hints.getWidth() == 1 && Hints.getUnroll() == 1) { DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n"); + emitOptimizationRemarkAnalysis( + F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(), + "loop not vectorized: vector width and interleave count are " + "explicitly set to 1"); return false; } @@ -1151,14 +1287,18 @@ struct LoopVectorize : public FunctionPass { DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); else { DEBUG(dbgs() << "\n"); + emitOptimizationRemarkAnalysis( + F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(), + "vectorization is not beneficial and is not explicitly forced"); return false; } } // Check if it is legal to vectorize the loop. - LoopVectorizationLegality LVL(L, SE, DL, DT, TLI); + LoopVectorizationLegality LVL(L, SE, DL, DT, TLI, AA, F); if (!LVL.canVectorize()) { DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); + emitMissedWarning(F, L, Hints); return false; } @@ -1167,7 +1307,6 @@ struct LoopVectorize : public FunctionPass { // Check the function attributes to find out if this function should be // optimized for size. - Function *F = L->getHeader()->getParent(); bool OptForSize = Hints.getForce() != LoopVectorizeHints::FK_Enabled && F->hasFnAttribute(Attribute::OptimizeForSize); @@ -1190,6 +1329,10 @@ struct LoopVectorize : public FunctionPass { if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat" "attribute is used.\n"); + emitOptimizationRemarkAnalysis( + F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(), + "loop not vectorized due to NoImplicitFloat attribute"); + emitMissedWarning(F, L, Hints); return false; } @@ -1208,9 +1351,14 @@ struct LoopVectorize : public FunctionPass { DEBUG(dbgs() << "LV: Unroll Factor is " << UF << '\n'); if (VF.Width == 1) { - DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); - if (UF == 1) + DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial\n"); + + if (UF == 1) { + emitOptimizationRemarkAnalysis( + F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(), + "not beneficial to vectorize and user disabled interleaving"); return false; + } DEBUG(dbgs() << "LV: Trying to at least unroll the loops.\n"); // Report the unrolling decision. @@ -1220,6 +1368,7 @@ struct LoopVectorize : public FunctionPass { " (vectorization not beneficial)")); // We decided not to vectorize, but we may want to unroll. 
+ InnerLoopUnroller Unroller(L, SE, LI, DT, DL, TLI, UF); Unroller.vectorize(&LVL); } else { @@ -1250,8 +1399,10 @@ struct LoopVectorize : public FunctionPass { AU.addRequired(); AU.addRequired(); AU.addRequired(); + AU.addRequired(); AU.addPreserved(); AU.addPreserved(); + AU.addPreserved(); } }; @@ -1307,7 +1458,7 @@ static const SCEV *replaceSymbolicStrideSCEV(ScalarEvolution *SE, void LoopVectorizationLegality::RuntimePointerCheck::insert( ScalarEvolution *SE, Loop *Lp, Value *Ptr, bool WritePtr, unsigned DepSetId, - ValueToValueMap &Strides) { + unsigned ASId, ValueToValueMap &Strides) { // Get the stride replaced scev. const SCEV *Sc = replaceSymbolicStrideSCEV(SE, Strides, Ptr); const SCEVAddRecExpr *AR = dyn_cast(Sc); @@ -1319,6 +1470,7 @@ void LoopVectorizationLegality::RuntimePointerCheck::insert( Ends.push_back(ScEnd); IsWritePtr.push_back(WritePtr); DependencySetId.push_back(DepSetId); + AliasSetId.push_back(ASId); } Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { @@ -1625,7 +1777,9 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { Value *VecPtr = Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); - Builder.CreateStore(StoredVal[Part], VecPtr)->setAlignment(Alignment); + StoreInst *NewSI = + Builder.CreateAlignedStore(StoredVal[Part], VecPtr, Alignment); + propagateMetadata(NewSI, SI); } return; } @@ -1646,9 +1800,9 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { Value *VecPtr = Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); - Value *LI = Builder.CreateLoad(VecPtr, "wide.load"); - cast(LI)->setAlignment(Alignment); - Entry[Part] = Reverse ? reverseVector(LI) : LI; + LoadInst *NewLI = Builder.CreateAlignedLoad(VecPtr, Alignment, "wide.load"); + propagateMetadata(NewLI, LI); + Entry[Part] = Reverse ? reverseVector(NewLI) : NewLI; } } @@ -1862,6 +2016,9 @@ InnerLoopVectorizer::addRuntimeCheck(Instruction *Loc) { // Only need to check pointers between two different dependency sets. if (PtrRtCheck->DependencySetId[i] == PtrRtCheck->DependencySetId[j]) continue; + // Only need to check pointers in the same alias set. + if (PtrRtCheck->AliasSetId[i] != PtrRtCheck->AliasSetId[j]) + continue; unsigned AS0 = Starts[i]->getType()->getPointerAddressSpace(); unsigned AS1 = Starts[j]->getType()->getPointerAddressSpace(); @@ -1987,10 +2144,6 @@ void InnerLoopVectorizer::createEmptyLoop() { Constant::getAllOnesValue(BackedgeCount->getType()), "backedge.overflow", BypassBlock->getTerminator()); - // Count holds the overall loop count (N). - Value *Count = Exp.expandCodeFor(ExitCount, ExitCount->getType(), - BypassBlock->getTerminator()); - // The loop index does not have to start at Zero. Find the original start // value from the induction PHI node. If we don't have an induction variable // then we know that it starts at zero. @@ -2000,6 +2153,18 @@ void InnerLoopVectorizer::createEmptyLoop() { IdxTy): ConstantInt::get(IdxTy, 0); + // We need an instruction to anchor the overflow check on. StartIdx needs to + // be defined before the overflow check branch; because the scalar preheader + // is going to merge the start index, the overflow branch block needs to + // contain a definition of the start index. + Instruction *OverflowCheckAnchor = BinaryOperator::CreateAdd( + StartIdx, ConstantInt::get(IdxTy, 0), "overflow.check.anchor", + BypassBlock->getTerminator()); + + // Count holds the overall loop count (N). + Value *Count = Exp.expandCodeFor(ExitCount, ExitCount->getType(), + BypassBlock->getTerminator()); + LoopBypassBlocks.push_back(BypassBlock); // Split the single block loop into the two loop structure described above. @@ -2068,17 +2233,18 @@ void InnerLoopVectorizer::createEmptyLoop() { // Now, compare the new count to zero. If it is zero, skip the vector loop and // jump to the scalar loop. - Value *Cmp = BypassBuilder.CreateICmpEQ(IdxEndRoundDown, StartIdx, - "cmp.zero"); + Value *Cmp = + BypassBuilder.CreateICmpEQ(IdxEndRoundDown, StartIdx, "cmp.zero"); BasicBlock *LastBypassBlock = BypassBlock; // Generate code to check that the loop's trip count that we computed by adding // one to the backedge-taken count will not overflow. { - auto PastOverflowCheck = std::next(BasicBlock::iterator(CheckBCOverflow)); + auto PastOverflowCheck = + std::next(BasicBlock::iterator(OverflowCheckAnchor)); BasicBlock *CheckBlock = - LastBypassBlock->splitBasicBlock(PastOverflowCheck, "overflow.checked"); + LastBypassBlock->splitBasicBlock(PastOverflowCheck, "overflow.checked"); if (ParentLoop) ParentLoop->addBasicBlockToLoop(CheckBlock, LI->getBase()); LoopBypassBlocks.push_back(CheckBlock); @@ -3017,6 +3183,8 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { Entry[Part] = V; } + + propagateMetadata(Entry, it); break; } case Instruction::Select: { @@ -3044,6 +3212,8 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { Op0[Part], Op1[Part]); } + + propagateMetadata(Entry, it); break; } @@ -3063,6 +3233,8 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { C = Builder.CreateICmp(Cmp->getPredicate(), A[Part], B[Part]); Entry[Part] = C; } + + propagateMetadata(Entry, it); break; } @@ -3095,6 +3267,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { Value *Broadcasted = getBroadcastInstrs(ScalarCast); for (unsigned Part = 0; Part < UF; ++Part) Entry[Part] = getConsecutiveVector(Broadcasted, VF * Part, false); + propagateMetadata(Entry, it); break; } /// Vectorize casts. @@ -3104,6 +3277,1 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { VectorParts &A = getVectorValue(it->getOperand(0)); for (unsigned Part = 0; Part < UF; ++Part) Entry[Part] = Builder.CreateCast(CI->getOpcode(), A[Part], DestTy); + propagateMetadata(Entry, it); break; } @@ -3141,6 +3315,8 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { Function *F = Intrinsic::getDeclaration(M, ID, Tys); Entry[Part] = Builder.CreateCall(F, Args); } + + propagateMetadata(Entry, it); break; } break; @@ -3204,8 +3380,10 @@ static bool canIfConvertPHINodes(BasicBlock *BB) { } bool LoopVectorizationLegality::canVectorizeWithIfConvert() { - if (!EnableIfConversion) + if (!EnableIfConversion) { + emitAnalysis(Report() << "if-conversion is disabled"); return false; + } assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable"); @@ -3235,16 +3413,24 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() { BasicBlock *BB = *BI; // We don't support switch statements inside loops. - if (!isa(BB->getTerminator())) + if (!isa(BB->getTerminator())) { + emitAnalysis(Report(BB->getTerminator()) + << "loop contains a switch statement"); return false; + } // We must be able to predicate all blocks that need to be predicated.
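// For example (an illustrative source-level view), a guarded update such as
//
//   if (cond[i])
//     a[i] += 1;
//
// can only be if-converted when this block is predicable: the branch is then
// replaced by an unconditional load, the add, and a select (or a predicated
// store), leaving no control flow inside the loop body.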
if (blockNeedsPredication(BB)) { - if (!blockCanBePredicated(BB, SafePointes)) + if (!blockCanBePredicated(BB, SafePointes)) { + emitAnalysis(Report(BB->getTerminator()) + << "control flow cannot be substituted for a select"); return false; - } else if (BB != Header && !canIfConvertPHINodes(BB)) + } + } else if (BB != Header && !canIfConvertPHINodes(BB)) { + emitAnalysis(Report(BB->getTerminator()) + << "control flow cannot be substituted for a select"); return false; - + } } // We can if-convert this loop. @@ -3254,20 +3440,31 @@ bool LoopVectorizationLegality::canVectorize() { // We must have a loop in canonical form. Loops with indirectbr in them cannot // be canonicalized. - if (!TheLoop->getLoopPreheader()) + if (!TheLoop->getLoopPreheader()) { + emitAnalysis( + Report() << "loop control flow is not understood by vectorizer"); return false; + } // We can only vectorize innermost loops. - if (TheLoop->getSubLoopsVector().size()) + if (TheLoop->getSubLoopsVector().size()) { + emitAnalysis(Report() << "loop is not the innermost loop"); return false; + } // We must have a single backedge. - if (TheLoop->getNumBackEdges() != 1) + if (TheLoop->getNumBackEdges() != 1) { + emitAnalysis( + Report() << "loop control flow is not understood by vectorizer"); return false; + } // We must have a single exiting block. - if (!TheLoop->getExitingBlock()) + if (!TheLoop->getExitingBlock()) { + emitAnalysis( + Report() << "loop control flow is not understood by vectorizer"); return false; + } // We need to have a loop header. DEBUG(dbgs() << "LV: Found a loop: " << @@ -3283,6 +3480,7 @@ bool LoopVectorizationLegality::canVectorize() { // ScalarEvolution needs to be able to find the exit count. const SCEV *ExitCount = SE->getBackedgeTakenCount(TheLoop); if (ExitCount == SE->getCouldNotCompute()) { + emitAnalysis(Report() << "could not determine number of loop iterations"); DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n"); return false; } @@ -3376,6 +3574,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() && !PhiTy->isPointerTy()) { + emitAnalysis(Report(it) + << "loop control flow is not understood by vectorizer"); DEBUG(dbgs() << "LV: Found a non-int non-pointer PHI.\n"); return false; } @@ -3386,13 +3586,17 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (*bb != Header) { // Check that this instruction has no outside users or is an // identified reduction value with an outside user. - if(!hasOutsideLoopUser(TheLoop, it, AllowedExit)) + if (!hasOutsideLoopUser(TheLoop, it, AllowedExit)) continue; + emitAnalysis(Report(it) << "value that could not be identified as " "reduction is used outside the loop"); return false; } // We only allow if-converted PHIs with exactly two incoming values. if (Phi->getNumIncomingValues() != 2) { + emitAnalysis(Report(it) + << "control flow not understood by vectorizer"); DEBUG(dbgs() << "LV: Found an invalid PHI.\n"); return false; } @@ -3423,8 +3627,11 @@ // Until we explicitly handle the case of an induction variable with // an outside loop user we have to give up vectorizing this loop.
- if (hasOutsideLoopUser(TheLoop, it, AllowedExit)) + if (hasOutsideLoopUser(TheLoop, it, AllowedExit)) { + emitAnalysis(Report(it) << "use of induction value outside of the " + "loop is not handled by vectorizer"); return false; + } continue; } @@ -3467,6 +3674,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { continue; } + emitAnalysis(Report(it) << "unvectorizable operation"); DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n"); return false; }// end of PHI handling @@ -3475,6 +3683,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // calls and we do handle certain intrinsic and libm functions. CallInst *CI = dyn_cast(it); if (CI && !getIntrinsicIDForCall(CI, TLI) && !isa(CI)) { + emitAnalysis(Report(it) << "call instruction cannot be vectorized"); DEBUG(dbgs() << "LV: Found a call site.\n"); return false; } @@ -3484,6 +3693,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (CI && hasVectorInstrinsicScalarOpd(getIntrinsicIDForCall(CI, TLI), 1)) { if (!SE->isLoopInvariant(SE->getSCEV(CI->getOperand(1)), TheLoop)) { + emitAnalysis(Report(it) + << "intrinsic instruction cannot be vectorized"); DEBUG(dbgs() << "LV: Found unvectorizable intrinsic " << *CI << "\n"); return false; } @@ -3493,6 +3704,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // Also, we can't vectorize extractelement instructions. if ((!VectorType::isValidElementType(it->getType()) && !it->getType()->isVoidTy()) || isa(it)) { + emitAnalysis(Report(it) + << "instruction return type cannot be vectorized"); DEBUG(dbgs() << "LV: Found unvectorizable type.\n"); return false; } @@ -3500,8 +3713,10 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // Check that the stored type is vectorizable. if (StoreInst *ST = dyn_cast(it)) { Type *T = ST->getValueOperand()->getType(); - if (!VectorType::isValidElementType(T)) + if (!VectorType::isValidElementType(T)) { + emitAnalysis(Report(ST) << "store instruction cannot be vectorized"); return false; + } if (EnableMemAccessVersioning) collectStridedAcccess(ST); } @@ -3512,8 +3727,10 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // Reduction instructions are allowed to have exit users. // All other instructions must not have external users. - if (hasOutsideLoopUser(TheLoop, it, AllowedExit)) + if (hasOutsideLoopUser(TheLoop, it, AllowedExit)) { + emitAnalysis(Report(it) << "value cannot be used outside the loop"); return false; + } } // next instr. @@ -3521,8 +3738,11 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (!Induction) { DEBUG(dbgs() << "LV: Did not find one integer induction var.\n"); - if (Inductions.empty()) + if (Inductions.empty()) { + emitAnalysis(Report() + << "loop induction variable could not be identified"); return false; + } } return true; @@ -3711,19 +3931,22 @@ class AccessAnalysis { /// \brief Set of potential dependent memory accesses. typedef EquivalenceClasses DepCandidates; - AccessAnalysis(const DataLayout *Dl, DepCandidates &DA) : - DL(Dl), DepCands(DA), AreAllWritesIdentified(true), - AreAllReadsIdentified(true), IsRTCheckNeeded(false) {} + AccessAnalysis(const DataLayout *Dl, AliasAnalysis *AA, DepCandidates &DA) : + DL(Dl), AST(*AA), DepCands(DA), IsRTCheckNeeded(false) {} /// \brief Register a load and whether it is only read from. 
- void addLoad(Value *Ptr, bool IsReadOnly) { + void addLoad(AliasAnalysis::Location &Loc, bool IsReadOnly) { + Value *Ptr = const_cast(Loc.Ptr); + AST.add(Ptr, AliasAnalysis::UnknownSize, Loc.TBAATag); Accesses.insert(MemAccessInfo(Ptr, false)); if (IsReadOnly) ReadOnlyPtr.insert(Ptr); } /// \brief Register a store. - void addStore(Value *Ptr) { + void addStore(AliasAnalysis::Location &Loc) { + Value *Ptr = const_cast(Loc.Ptr); + AST.add(Ptr, AliasAnalysis::UnknownSize, Loc.TBAATag); Accesses.insert(MemAccessInfo(Ptr, true)); } @@ -3737,10 +3960,7 @@ class AccessAnalysis { /// \brief Goes over all memory accesses, checks whether a RT check is needed /// and builds sets of dependent accesses. void buildDependenceSets() { - // Process read-write pointers first. - processMemAccesses(false); - // Next, process read pointers. - processMemAccesses(true); + processMemAccesses(); } bool isRTCheckNeeded() { return IsRTCheckNeeded; } @@ -3752,40 +3972,31 @@ private: typedef SetVector PtrAccessSet; - typedef DenseMap UnderlyingObjToAccessMap; - /// \brief Go over all memory access or only the deferred ones if - /// \p UseDeferred is true and check whether runtime pointer checks are needed - /// and build sets of dependency check candidates. - void processMemAccesses(bool UseDeferred); + /// \brief Go over all memory accesses and check whether runtime pointer + /// checks are needed and build sets of dependency check candidates. + void processMemAccesses(); /// Set of all accesses. PtrAccessSet Accesses; - /// Set of access to check after all writes have been processed. - PtrAccessSet DeferredAccesses; - - /// Map of pointers to last access encountered. - UnderlyingObjToAccessMap ObjToLastAccess; - /// Set of accesses that need a further dependence check. MemAccessInfoSet CheckDeps; /// Set of pointers that are read only. SmallPtrSet ReadOnlyPtr; - /// Set of underlying objects already written to. - SmallPtrSet WriteObjects; - const DataLayout *DL; + /// An alias set tracker to partition the access set by underlying object and + /// intrinsic property (such as TBAA metadata). + AliasSetTracker AST; + /// Sets of potentially dependent accesses - members of one set share an /// underlying pointer. The set "CheckDeps" identifies which sets really need a /// dependence check. DepCandidates &DepCands; - bool AreAllWritesIdentified; - bool AreAllReadsIdentified; bool IsRTCheckNeeded; }; @@ -3813,62 +4024,67 @@ bool AccessAnalysis::canCheckPtrAtRT( ValueToValueMap &StridesMap, bool ShouldCheckStride) { // Find pointers with computable bounds. We are going to use this information // to place a runtime bound check. - unsigned NumReadPtrChecks = 0; - unsigned NumWritePtrChecks = 0; bool CanDoRT = true; bool IsDepCheckNeeded = isDependencyCheckNeeded(); - // We assign consecutive id to access from different dependence sets. - // Accesses within the same set don't need a runtime check. - unsigned RunningDepId = 1; - DenseMap DepSetId; - - for (PtrAccessSet::iterator AI = Accesses.begin(), AE = Accesses.end(); - AI != AE; ++AI) { - const MemAccessInfo &Access = *AI; - Value *Ptr = Access.getPointer(); - bool IsWrite = Access.getInt(); - - // Just add write checks if we have both.
- if (!IsWrite && Accesses.count(MemAccessInfo(Ptr, true))) - continue; + NumComparisons = 0; - if (IsWrite) - ++NumWritePtrChecks; - else - ++NumReadPtrChecks; - - if (hasComputableBounds(SE, StridesMap, Ptr) && - // When we run after a failing dependency check we have to make sure we - // don't have wrapping pointers. - (!ShouldCheckStride || - isStridedPtr(SE, DL, Ptr, TheLoop, StridesMap) == 1)) { - // The id of the dependence set. - unsigned DepId; - - if (IsDepCheckNeeded) { - Value *Leader = DepCands.getLeaderValue(Access).getPointer(); - unsigned &LeaderId = DepSetId[Leader]; - if (!LeaderId) - LeaderId = RunningDepId++; - DepId = LeaderId; - } else - // Each access has its own dependence set. - DepId = RunningDepId++; - - RtCheck.insert(SE, TheLoop, Ptr, IsWrite, DepId, StridesMap); - - DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *Ptr << '\n'); - } else { - CanDoRT = false; + // We assign a consecutive id to accesses from different alias sets. + // Accesses between different groups don't need to be checked. + unsigned ASId = 1; + for (auto &AS : AST) { + unsigned NumReadPtrChecks = 0; + unsigned NumWritePtrChecks = 0; + + // We assign consecutive ids to accesses from different dependence sets. + // Accesses within the same set don't need a runtime check. + unsigned RunningDepId = 1; + DenseMap DepSetId; + + for (auto A : AS) { + Value *Ptr = A.getValue(); + bool IsWrite = Accesses.count(MemAccessInfo(Ptr, true)); + MemAccessInfo Access(Ptr, IsWrite); + + if (IsWrite) + ++NumWritePtrChecks; + else + ++NumReadPtrChecks; + + if (hasComputableBounds(SE, StridesMap, Ptr) && + // When we run after a failing dependency check we have to make sure we + // don't have wrapping pointers. + (!ShouldCheckStride || + isStridedPtr(SE, DL, Ptr, TheLoop, StridesMap) == 1)) { + // The id of the dependence set. + unsigned DepId; + + if (IsDepCheckNeeded) { + Value *Leader = DepCands.getLeaderValue(Access).getPointer(); + unsigned &LeaderId = DepSetId[Leader]; + if (!LeaderId) + LeaderId = RunningDepId++; + DepId = LeaderId; + } else + // Each access has its own dependence set. + DepId = RunningDepId++; + + RtCheck.insert(SE, TheLoop, Ptr, IsWrite, DepId, ASId, StridesMap); + + DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *Ptr << '\n'); + } else { + CanDoRT = false; + } } - } - if (IsDepCheckNeeded && CanDoRT && RunningDepId == 2) - NumComparisons = 0; // Only one dependence set. - else { - NumComparisons = (NumWritePtrChecks * (NumReadPtrChecks + - NumWritePtrChecks - 1)); + if (IsDepCheckNeeded && CanDoRT && RunningDepId == 2) + NumComparisons += 0; // Only one dependence set. + else { + NumComparisons += (NumWritePtrChecks * (NumReadPtrChecks + + NumWritePtrChecks - 1)); + } + + ++ASId; } // If the pointers that we would use for the bounds comparison have different @@ -3882,6 +4098,9 @@ bool AccessAnalysis::canCheckPtrAtRT( // Only need to check pointers between two different dependency sets. if (RtCheck.DependencySetId[i] == RtCheck.DependencySetId[j]) continue; + // Only need to check pointers in the same alias set.
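// As a sketch with hypothetical pointers %p and %q: if the AliasSetTracker
// placed them in different sets (for example because their TBAA tags cannot
// alias), no bound comparison is ever emitted for the pair (%p, %q); only
// pairs drawn from the same alias set become runtime checks.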
+ if (RtCheck.AliasSetId[i] != RtCheck.AliasSetId[j]) + continue; Value *PtrI = RtCheck.Pointers[i]; Value *PtrJ = RtCheck.Pointers[j]; @@ -3899,90 +4118,99 @@ bool AccessAnalysis::canCheckPtrAtRT( return CanDoRT; } -static bool isFunctionScopeIdentifiedObject(Value *Ptr) { - return isNoAliasArgument(Ptr) || isNoAliasCall(Ptr) || isa(Ptr); -} - -void AccessAnalysis::processMemAccesses(bool UseDeferred) { +void AccessAnalysis::processMemAccesses() { // We process the set twice: first we process read-write pointers, last we // process read-only pointers. This allows us to skip dependence tests for // read-only pointers. - PtrAccessSet &S = UseDeferred ? DeferredAccesses : Accesses; - for (PtrAccessSet::iterator AI = S.begin(), AE = S.end(); AI != AE; ++AI) { - const MemAccessInfo &Access = *AI; - Value *Ptr = Access.getPointer(); - bool IsWrite = Access.getInt(); - - DepCands.insert(Access); - - // Memorize read-only pointers for later processing and skip them in the - // first round (they need to be checked after we have seen all write - // pointers). Note: we also mark pointer that are not consecutive as - // "read-only" pointers (so that we check "a[b[i]] +="). Hence, we need the - // second check for "!IsWrite". - bool IsReadOnlyPtr = ReadOnlyPtr.count(Ptr) && !IsWrite; - if (!UseDeferred && IsReadOnlyPtr) { - DeferredAccesses.insert(Access); - continue; - } + DEBUG(dbgs() << "LV: Processing memory accesses...\n"); + DEBUG(dbgs() << " AST: "; AST.dump()); + DEBUG(dbgs() << "LV: Accesses:\n"); + DEBUG({ + for (auto A : Accesses) + dbgs() << "\t" << *A.getPointer() << " (" << + (A.getInt() ? "write" : (ReadOnlyPtr.count(A.getPointer()) ? + "read-only" : "read")) << ")\n"; + }); + + // The AliasSetTracker has nicely partitioned our pointers by metadata + // compatibility and potential for underlying-object overlap. As a result, we + // only need to check for potential pointer dependencies within each alias + // set. + for (auto &AS : AST) { + // Note that both the alias-set tracker and the alias sets themselves use + // linked lists internally and so the iteration order here is deterministic + // (matching the original instruction order within each set). + + bool SetHasWrite = false; + + // Map of pointers to last access encountered. + typedef DenseMap UnderlyingObjToAccessMap; + UnderlyingObjToAccessMap ObjToLastAccess; + + // Set of accesses to check after all writes have been processed. + PtrAccessSet DeferredAccesses; + + // Iterate over each alias set twice, once to process read/write pointers, + // and then to process read-only pointers. + for (int SetIteration = 0; SetIteration < 2; ++SetIteration) { + bool UseDeferred = SetIteration > 0; + PtrAccessSet &S = UseDeferred ? DeferredAccesses : Accesses; + + for (auto A : AS) { + Value *Ptr = A.getValue(); + bool IsWrite = S.count(MemAccessInfo(Ptr, true)); + + // If we're using the deferred access set, then it contains only reads. + bool IsReadOnlyPtr = ReadOnlyPtr.count(Ptr) && !IsWrite; + if (UseDeferred && !IsReadOnlyPtr) + continue; + // Otherwise, the pointer must be in the PtrAccessSet, either as a read + // or a write. + assert(((IsReadOnlyPtr && UseDeferred) || IsWrite || + S.count(MemAccessInfo(Ptr, false))) && + "Alias-set pointer not in the access set?"); + + MemAccessInfo Access(Ptr, IsWrite); + DepCands.insert(Access); + + // Memorize read-only pointers for later processing and skip them in the + // first round (they need to be checked after we have seen all write + // pointers).
Note: we also mark pointers that are not consecutive as + // "read-only" pointers (so that we check "a[b[i]] +="). Hence, we need + // the second check for "!IsWrite". + if (!UseDeferred && IsReadOnlyPtr) { + DeferredAccesses.insert(Access); + continue; + } - bool NeedDepCheck = false; - // Check whether there is the possibility of dependency because of - // underlying objects being the same. - typedef SmallVector ValueVector; - ValueVector TempObjects; - GetUnderlyingObjects(Ptr, TempObjects, DL); - for (ValueVector::iterator UI = TempObjects.begin(), UE = TempObjects.end(); - UI != UE; ++UI) { - Value *UnderlyingObj = *UI; - - // If this is a write then it needs to be an identified object. If this a - // read and all writes (so far) are identified function scope objects we - // don't need an identified underlying object but only an Argument (the - // next write is going to invalidate this assumption if it is - // unidentified). - // This is a micro-optimization for the case where all writes are - // identified and we have one argument pointer. - // Otherwise, we do need a runtime check. - if ((IsWrite && !isFunctionScopeIdentifiedObject(UnderlyingObj)) || - (!IsWrite && (!AreAllWritesIdentified || - !isa(UnderlyingObj)) && - !isIdentifiedObject(UnderlyingObj))) { - DEBUG(dbgs() << "LV: Found an unidentified " << - (IsWrite ? "write" : "read" ) << " ptr: " << *UnderlyingObj << - "\n"); - IsRTCheckNeeded = (IsRTCheckNeeded || - !isIdentifiedObject(UnderlyingObj) || - !AreAllReadsIdentified); + // If this is a write - check other reads and writes for conflicts. If + // this is a read, only check other writes for conflicts (but only if + // there is no other write to the ptr - this is an optimization to + // catch "a[i] = a[i] + " without having to do a dependence check). + if ((IsWrite || IsReadOnlyPtr) && SetHasWrite) { + CheckDeps.insert(Access); + IsRTCheckNeeded = true; + } if (IsWrite) - AreAllWritesIdentified = false; - if (!IsWrite) - AreAllReadsIdentified = false; + SetHasWrite = true; + + // Create sets of pointers connected by a shared alias set and + // underlying object. + typedef SmallVector ValueVector; + ValueVector TempObjects; + GetUnderlyingObjects(Ptr, TempObjects, DL); + for (Value *UnderlyingObj : TempObjects) { + UnderlyingObjToAccessMap::iterator Prev = + ObjToLastAccess.find(UnderlyingObj); + if (Prev != ObjToLastAccess.end()) + DepCands.unionSets(Access, Prev->second); + + ObjToLastAccess[UnderlyingObj] = Access; + } } - - // If this is a write - check other reads and writes for conflicts. If - // this is a read only check other writes for conflicts (but only if there - // is no other write to the ptr - this is an optimization to catch "a[i] = - // a[i] + " without having to do a dependence check). - if ((IsWrite || IsReadOnlyPtr) && WriteObjects.count(UnderlyingObj)) - NeedDepCheck = true; - - if (IsWrite) - WriteObjects.insert(UnderlyingObj); - - // Create sets of pointers connected by shared underlying objects. - UnderlyingObjToAccessMap::iterator Prev = - ObjToLastAccess.find(UnderlyingObj); - if (Prev != ObjToLastAccess.end()) - DepCands.unionSets(Access, Prev->second); - - ObjToLastAccess[UnderlyingObj] = Access; } - - if (NeedDepCheck) - CheckDeps.insert(Access); } } @@ -4242,6 +4470,11 @@ bool MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx, if (!AIsWrite && !BIsWrite) return false; + // We cannot check pointers in different address spaces.
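// Pointers into different address spaces may have different widths and
// cannot be meaningfully compared or subtracted by the distance checks
// below, so in that case we conservatively report a dependence (return
// true) instead of analyzing further.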
+ if (APtr->getType()->getPointerAddressSpace() != + BPtr->getType()->getPointerAddressSpace()) + return true; + const SCEV *AScev = replaceSymbolicStrideSCEV(SE, Strides, APtr); const SCEV *BScev = replaceSymbolicStrideSCEV(SE, Strides, BPtr); @@ -4429,8 +4662,9 @@ bool LoopVectorizationLegality::canVectorizeMemory() { continue; LoadInst *Ld = dyn_cast(it); - if (!Ld) return false; - if (!Ld->isSimple() && !IsAnnotatedParallel) { + if (!Ld || (!Ld->isSimple() && !IsAnnotatedParallel)) { + emitAnalysis(Report(Ld) + << "read with atomic ordering or volatile read"); DEBUG(dbgs() << "LV: Found a non-simple load.\n"); return false; } @@ -4443,8 +4677,13 @@ bool LoopVectorizationLegality::canVectorizeMemory() { // Save 'store' instructions. Abort if other instructions write to memory. if (it->mayWriteToMemory()) { StoreInst *St = dyn_cast(it); - if (!St) return false; + if (!St) { + emitAnalysis(Report(it) << "instruction cannot be vectorized"); + return false; + } if (!St->isSimple() && !IsAnnotatedParallel) { + emitAnalysis(Report(St) + << "write with atomic ordering or volatile write"); DEBUG(dbgs() << "LV: Found a non-simple store.\n"); return false; } @@ -4466,7 +4705,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() { } AccessAnalysis::DepCandidates DependentAccesses; - AccessAnalysis Accesses(DL, DependentAccesses); + AccessAnalysis Accesses(DL, AA, DependentAccesses); // Holds the analyzed pointers. We don't want to call GetUnderlyingObjects // multiple times on the same object. If the ptr is accessed twice, once @@ -4481,6 +4720,9 @@ bool LoopVectorizationLegality::canVectorizeMemory() { Value* Ptr = ST->getPointerOperand(); if (isUniform(Ptr)) { + emitAnalysis( + Report(ST) + << "write to a loop invariant address could not be vectorized"); DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n"); return false; } @@ -4489,7 +4731,15 @@ bool LoopVectorizationLegality::canVectorizeMemory() { // list. At this phase it is only a 'write' list. if (Seen.insert(Ptr)) { ++NumReadWrites; - Accesses.addStore(Ptr); + + AliasAnalysis::Location Loc = AA->getLocation(ST); + // The TBAA metadata could have a control dependency on the predication + // condition, so we cannot rely on it when determining whether or not we + // need runtime pointer checks. + if (blockNeedsPredication(ST->getParent())) + Loc.TBAATag = nullptr; + + Accesses.addStore(Loc); } } @@ -4516,7 +4766,15 @@ bool LoopVectorizationLegality::canVectorizeMemory() { ++NumReads; IsReadOnlyPtr = true; } - Accesses.addLoad(Ptr, IsReadOnlyPtr); + + AliasAnalysis::Location Loc = AA->getLocation(LD); + // The TBAA metadata could have a control dependency on the predication + // condition, so we cannot rely on it when determining whether or not we + // need runtime pointer checks. + if (blockNeedsPredication(LD->getParent())) + Loc.TBAATag = nullptr; + + Accesses.addLoad(Loc, IsReadOnlyPtr); } // If we write (or read-write) to a single destination and there are no @@ -4559,6 +4817,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() { } if (NeedRTCheck && !CanDoRT) { + emitAnalysis(Report() << "cannot identify array bounds"); DEBUG(dbgs() << "LV: We can't vectorize because we can't find " << "the array bounds.\n"); PtrRtCheck.reset(); @@ -4589,6 +4848,14 @@ bool LoopVectorizationLegality::canVectorizeMemory() { // Check that we did not collect too many pointers or found an unsizeable // pointer. 
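// To make the comparison count concrete: within a single alias set it grows
// as NumWritePtrChecks * (NumReadPtrChecks + NumWritePtrChecks - 1), so,
// for example, 2 writes and 3 reads contribute 2 * (3 + 2 - 1) = 8 runtime
// checks; vectorization is abandoned once the accumulated total exceeds
// RuntimeMemoryCheckThreshold.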
if (!CanDoRT || NumComparisons > RuntimeMemoryCheckThreshold) { + if (!CanDoRT && NumComparisons > 0) + emitAnalysis(Report() + << "cannot check memory dependencies at runtime"); + else + emitAnalysis(Report() + << NumComparisons << " exceeds limit of " + << RuntimeMemoryCheckThreshold + << " dependent memory operations checked at runtime"); DEBUG(dbgs() << "LV: Can't vectorize with memory checks\n"); PtrRtCheck.reset(); return false; @@ -4598,6 +4865,9 @@ } } + if (!CanVecMem) + emitAnalysis(Report() << "unsafe dependent memory operations in loop"); + DEBUG(dbgs() << "LV: We" << (NeedRTCheck ? "" : " don't") << " need a runtime memory check.\n"); @@ -5689,6 +5959,7 @@ char LoopVectorize::ID = 0; static const char lv_name[] = "Loop Vectorization"; INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) +INITIALIZE_AG_DEPENDENCY(AliasAnalysis) INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfo) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) @@ -5850,4 +6121,3 @@ Value *InnerLoopUnroller::getConsecutiveVector(Value* Val, int StartIdx, Constant *C = ConstantInt::get(ITy, StartIdx, Negate); return Builder.CreateAdd(Val, C, "induction"); } - diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp index f18202be167e..53a43d9851e9 100644 --- a/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -149,6 +149,48 @@ static bool isSplat(ArrayRef VL) { return true; } +///\returns the opcode that can be combined with \p Op to create an alternate +/// sequence which can later be merged as a ShuffleVector instruction. +static unsigned getAltOpcode(unsigned Op) { + switch (Op) { + case Instruction::FAdd: + return Instruction::FSub; + case Instruction::FSub: + return Instruction::FAdd; + case Instruction::Add: + return Instruction::Sub; + case Instruction::Sub: + return Instruction::Add; + default: + return 0; + } +} + +///\returns true if opcode \p Op can be part +/// of an alternate sequence which can later be merged as +/// a ShuffleVector instruction. +static bool canCombineAsAltInst(unsigned Op) { + if (Op == Instruction::FAdd || Op == Instruction::FSub || + Op == Instruction::Sub || Op == Instruction::Add) + return true; + return false; +} + +/// \returns ShuffleVector instruction if instructions in \p VL form an +/// alternating fadd/fsub, fsub/fadd, add/sub or sub/add sequence +/// (e.g. opcodes fadd, fsub, fadd, fsub, ...). +static unsigned isAltInst(ArrayRef VL) { + Instruction *I0 = dyn_cast(VL[0]); + unsigned Opcode = I0->getOpcode(); + unsigned AltOpcode = getAltOpcode(Opcode); + for (int i = 1, e = VL.size(); i < e; i++) { + Instruction *I = dyn_cast(VL[i]); + if (!I || I->getOpcode() != ((i & 1) ? AltOpcode : Opcode)) + return 0; + } + return Instruction::ShuffleVector; +} + /// \returns The opcode if all of the Instructions in \p VL have the same /// opcode, or zero.
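/// For example, with the alternate-opcode support above, a bundle whose
/// opcodes are (fadd, fsub, fadd, fsub) is not rejected here: isAltInst
/// recognizes the alternating pattern and Instruction::ShuffleVector is
/// returned as the bundle's pseudo-opcode.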
static unsigned getSameOpcode(ArrayRef VL) { @@ -158,8 +200,11 @@ static unsigned getSameOpcode(ArrayRef VL) { unsigned Opcode = I0->getOpcode(); for (int i = 1, e = VL.size(); i < e; i++) { Instruction *I = dyn_cast(VL[i]); - if (!I || Opcode != I->getOpcode()) + if (!I || Opcode != I->getOpcode()) { + if (canCombineAsAltInst(Opcode) && i == 1) + return isAltInst(VL); return 0; + } } return Opcode; } @@ -377,6 +422,7 @@ class BoUpSLP { /// \brief Perform LICM and CSE on the newly generated gather sequences. void optimizeGatherSequence(); + private: struct TreeEntry; @@ -594,6 +640,7 @@ void BoUpSLP::buildTree(ArrayRef Roots, void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { bool SameTy = getSameType(VL); (void)SameTy; + bool isAltShuffle = false; assert(SameTy && "Invalid types!"); if (Depth == RecursionMaxDepth) { @@ -615,10 +662,19 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { newTreeEntry(VL, false); return; } + unsigned Opcode = getSameOpcode(VL); + + // Check that this shuffle vector refers to the alternate + // sequence of opcodes. + if (Opcode == Instruction::ShuffleVector) { + Instruction *I0 = dyn_cast(VL[0]); + unsigned Op = I0->getOpcode(); + if (Op != Instruction::ShuffleVector) + isAltShuffle = true; + } // If all of the operands are identical or constant we have a simple solution. - if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL) || - !getSameOpcode(VL)) { + if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL) || !Opcode) { DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n"); newTreeEntry(VL, false); return; @@ -754,8 +810,6 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n"); - unsigned Opcode = getSameOpcode(VL); - // Check if it is safe to sink the loads or the stores. if (Opcode == Instruction::Load || Opcode == Instruction::Store) { Instruction *Last = getLastInstruction(VL); @@ -1057,6 +1111,26 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { } return; } + case Instruction::ShuffleVector: { + // If this is not an alternating sequence of opcodes like add/sub, + // then do not vectorize this instruction. + if (!isAltShuffle) { + newTreeEntry(VL, false); + DEBUG(dbgs() << "SLP: ShuffleVectors are not vectorized.\n"); + return; + } + newTreeEntry(VL, true); + DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n"); + for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { + ValueList Operands; + // Prepare the operand vector.
+ for (unsigned j = 0; j < VL.size(); ++j) + Operands.push_back(cast(VL[j])->getOperand(i)); + + buildTree_rec(Operands, Depth + 1); + } + return; + } default: newTreeEntry(VL, false); DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n"); @@ -1080,11 +1154,9 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { } return getGatherCost(E->Scalars); } - - assert(getSameOpcode(VL) && getSameType(VL) && getSameBlock(VL) && - "Invalid VL"); + unsigned Opcode = getSameOpcode(VL); + assert(Opcode && getSameType(VL) && getSameBlock(VL) && "Invalid VL"); Instruction *VL0 = cast(VL[0]); - unsigned Opcode = VL0->getOpcode(); switch (Opcode) { case Instruction::PHI: { return 0; @@ -1242,6 +1314,32 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { return VecCallCost - ScalarCallCost; } + case Instruction::ShuffleVector: { + TargetTransformInfo::OperandValueKind Op1VK = + TargetTransformInfo::OK_AnyValue; + TargetTransformInfo::OperandValueKind Op2VK = + TargetTransformInfo::OK_AnyValue; + int ScalarCost = 0; + int VecCost = 0; + for (unsigned i = 0; i < VL.size(); ++i) { + Instruction *I = cast(VL[i]); + if (!I) + break; + ScalarCost += + TTI->getArithmeticInstrCost(I->getOpcode(), ScalarTy, Op1VK, Op2VK); + } + // VecCost is equal to sum of the cost of creating 2 vectors + // and the cost of creating shuffle. + Instruction *I0 = cast(VL[0]); + VecCost = + TTI->getArithmeticInstrCost(I0->getOpcode(), VecTy, Op1VK, Op2VK); + Instruction *I1 = cast(VL[1]); + VecCost += + TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK, Op2VK); + VecCost += + TTI->getShuffleCost(TargetTransformInfo::SK_Alternate, VecTy, 0); + return VecCost - ScalarCost; + } default: llvm_unreachable("Unknown instruction"); } @@ -1522,9 +1620,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { setInsertPointAfterBundle(E->Scalars); return Gather(E->Scalars, VecTy); } - - unsigned Opcode = VL0->getOpcode(); - assert(Opcode == getSameOpcode(E->Scalars) && "Invalid opcode"); + unsigned Opcode = getSameOpcode(E->Scalars); switch (Opcode) { case Instruction::PHI: { @@ -1797,6 +1893,49 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { E->VectorizedValue = V; return V; } + case Instruction::ShuffleVector: { + ValueList LHSVL, RHSVL; + for (int i = 0, e = E->Scalars.size(); i < e; ++i) { + LHSVL.push_back(cast(E->Scalars[i])->getOperand(0)); + RHSVL.push_back(cast(E->Scalars[i])->getOperand(1)); + } + setInsertPointAfterBundle(E->Scalars); + + Value *LHS = vectorizeTree(LHSVL); + Value *RHS = vectorizeTree(RHSVL); + + if (Value *V = alreadyVectorized(E->Scalars)) + return V; + + // Create a vector of LHS op1 RHS + BinaryOperator *BinOp0 = cast(VL0); + Value *V0 = Builder.CreateBinOp(BinOp0->getOpcode(), LHS, RHS); + + // Create a vector of LHS op2 RHS + Instruction *VL1 = cast(E->Scalars[1]); + BinaryOperator *BinOp1 = cast(VL1); + Value *V1 = Builder.CreateBinOp(BinOp1->getOpcode(), LHS, RHS); + + // Create appropriate shuffle to take alternative operations from + // the vector. 
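// With Mask[i] = (i & 1) ? e + i : i (built below), even lanes select from
// V0 and odd lanes from V1; e.g. for e = 4 the mask is <0, 5, 2, 7>, which
// yields <V0[0], V1[1], V0[2], V1[3]>, i.e. an (op1, op2, op1, op2) result.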
+ std::vector Mask(E->Scalars.size()); + unsigned e = E->Scalars.size(); + for (unsigned i = 0; i < e; ++i) { + if (i & 1) + Mask[i] = Builder.getInt32(e + i); + else + Mask[i] = Builder.getInt32(i); + } + + Value *ShuffleMask = ConstantVector::get(Mask); + + Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask); + E->VectorizedValue = V; + if (Instruction *I = dyn_cast(V)) + return propagateMetadata(I, E->Scalars); + + return V; + } default: llvm_unreachable("unknown inst"); } @@ -1865,7 +2004,6 @@ Value *BoUpSLP::vectorizeTree() { // For each lane: for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { Value *Scalar = Entry->Scalars[Lane]; - // No need to handle users of gathered values. if (Entry->NeedToGather) continue; @@ -2049,7 +2187,6 @@ struct SLPVectorizer : public FunctionPass { for (po_iterator it = po_begin(&F.getEntryBlock()), e = po_end(&F.getEntryBlock()); it != e; ++it) { BasicBlock *BB = *it; - // Vectorize trees that end at stores. if (unsigned count = collectStores(BB, R)) { (void)count; diff --git a/test/Analysis/BasicAA/cs-cs.ll b/test/Analysis/BasicAA/cs-cs.ll new file mode 100644 index 000000000000..693634c0414d --- /dev/null +++ b/test/Analysis/BasicAA/cs-cs.ll @@ -0,0 +1,236 @@ +; RUN: opt < %s -basicaa -aa-eval -print-all-alias-modref-info -disable-output 2>&1 | FileCheck %s +target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:32:64-v128:32:128-a0:0:32-n32" +target triple = "arm-apple-ios" + +declare <8 x i16> @llvm.arm.neon.vld1.v8i16(i8*, i32) nounwind readonly +declare void @llvm.arm.neon.vst1.v8i16(i8*, <8 x i16>, i32) nounwind + +declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind + +declare void @a_readonly_func(i8 *) noinline nounwind readonly + +define <8 x i16> @test1(i8* %p, <8 x i16> %y) { +entry: + %q = getelementptr i8* %p, i64 16 + %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) nounwind + call void @llvm.arm.neon.vst1.v8i16(i8* %q, <8 x i16> %y, i32 16) + %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) nounwind + %c = add <8 x i16> %a, %b + ret <8 x i16> %c + +; CHECK-LABEL: Function: test1: + +; CHECK: NoAlias: i8* %p, i8* %q +; CHECK: Just Ref: Ptr: i8* %p <-> %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) #1 +; CHECK: NoModRef: Ptr: i8* %q <-> %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) #1 +; CHECK: NoModRef: Ptr: i8* %p <-> call void @llvm.arm.neon.vst1.v8i16(i8* %q, <8 x i16> %y, i32 16) +; CHECK: Both ModRef: Ptr: i8* %q <-> call void @llvm.arm.neon.vst1.v8i16(i8* %q, <8 x i16> %y, i32 16) +; CHECK: Just Ref: Ptr: i8* %p <-> %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) #1 +; CHECK: NoModRef: Ptr: i8* %q <-> %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) #1 +; CHECK: NoModRef: %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) #1 <-> call void @llvm.arm.neon.vst1.v8i16(i8* %q, <8 x i16> %y, i32 16) +; CHECK: NoModRef: %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) #1 <-> %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) #1 +; CHECK: NoModRef: call void @llvm.arm.neon.vst1.v8i16(i8* %q, <8 x i16> %y, i32 16) <-> %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) #1 +; CHECK: NoModRef: call void @llvm.arm.neon.vst1.v8i16(i8* %q, <8 x i16> %y, i32 16) <-> %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) #1 
+; CHECK: NoModRef: %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) #1 <-> %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) #1 +; CHECK: NoModRef: %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) #1 <-> call void @llvm.arm.neon.vst1.v8i16(i8* %q, <8 x i16> %y, i32 16) +} + +define void @test2(i8* %P, i8* %Q) nounwind ssp { + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) + ret void + +; CHECK-LABEL: Function: test2: + +; CHECK: MayAlias: i8* %P, i8* %Q +; CHECK: Both ModRef: Ptr: i8* %P <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) +; CHECK: Both ModRef: Ptr: i8* %Q <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) +; CHECK: Both ModRef: Ptr: i8* %P <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) +; CHECK: Both ModRef: Ptr: i8* %Q <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) +; CHECK: Both ModRef: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) +; CHECK: Both ModRef: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) +} + +define void @test2a(i8* noalias %P, i8* noalias %Q) nounwind ssp { + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) + ret void + +; CHECK-LABEL: Function: test2a: + +; CHECK: NoAlias: i8* %P, i8* %Q +; CHECK: Just Mod: Ptr: i8* %P <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) +; CHECK: Just Ref: Ptr: i8* %Q <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) +; CHECK: Just Mod: Ptr: i8* %P <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) +; CHECK: Just Ref: Ptr: i8* %Q <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) +; CHECK: Just Mod: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) +; CHECK: Just Mod: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) +} + +define void @test2b(i8* noalias %P, i8* noalias %Q) nounwind ssp { + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) + %R = getelementptr i8* %P, i64 12 + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %R, i8* %Q, i64 12, i32 1, i1 false) + ret void + +; CHECK-LABEL: Function: test2b: + +; CHECK: NoAlias: i8* %P, i8* %Q +; CHECK: NoAlias: i8* %P, i8* %R +; CHECK: NoAlias: i8* %Q, i8* %R +; CHECK: Just Mod: Ptr: i8* %P <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) +; CHECK: Just Ref: Ptr: i8* %Q <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) +; CHECK: NoModRef: Ptr: i8* %R <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) +; CHECK: NoModRef: Ptr: i8* %P <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %R, i8* %Q, 
i64 12, i32 1, i1 false) +; CHECK: Just Ref: Ptr: i8* %Q <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %R, i8* %Q, i64 12, i32 1, i1 false) +; CHECK: Just Mod: Ptr: i8* %R <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %R, i8* %Q, i64 12, i32 1, i1 false) +; CHECK: NoModRef: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %R, i8* %Q, i64 12, i32 1, i1 false) +; CHECK: NoModRef: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %R, i8* %Q, i64 12, i32 1, i1 false) <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) +} + +define void @test2c(i8* noalias %P, i8* noalias %Q) nounwind ssp { + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) + %R = getelementptr i8* %P, i64 11 + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %R, i8* %Q, i64 12, i32 1, i1 false) + ret void + +; CHECK-LABEL: Function: test2c: + +; CHECK: NoAlias: i8* %P, i8* %Q +; CHECK: NoAlias: i8* %P, i8* %R +; CHECK: NoAlias: i8* %Q, i8* %R +; CHECK: Just Mod: Ptr: i8* %P <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) +; CHECK: Just Ref: Ptr: i8* %Q <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) +; CHECK: Just Mod: Ptr: i8* %R <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) +; CHECK: NoModRef: Ptr: i8* %P <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %R, i8* %Q, i64 12, i32 1, i1 false) +; CHECK: Just Ref: Ptr: i8* %Q <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %R, i8* %Q, i64 12, i32 1, i1 false) +; CHECK: Just Mod: Ptr: i8* %R <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %R, i8* %Q, i64 12, i32 1, i1 false) +; CHECK: Just Mod: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %R, i8* %Q, i64 12, i32 1, i1 false) +; CHECK: Just Mod: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %R, i8* %Q, i64 12, i32 1, i1 false) <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) +} + +define void @test2d(i8* noalias %P, i8* noalias %Q) nounwind ssp { + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) + %R = getelementptr i8* %P, i64 -12 + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %R, i8* %Q, i64 12, i32 1, i1 false) + ret void + +; CHECK-LABEL: Function: test2d: + +; CHECK: NoAlias: i8* %P, i8* %Q +; CHECK: NoAlias: i8* %P, i8* %R +; CHECK: NoAlias: i8* %Q, i8* %R +; CHECK: Just Mod: Ptr: i8* %P <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) +; CHECK: Just Ref: Ptr: i8* %Q <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) +; CHECK: NoModRef: Ptr: i8* %R <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) +; CHECK: NoModRef: Ptr: i8* %P <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %R, i8* %Q, i64 12, i32 1, i1 false) +; CHECK: Just Ref: Ptr: i8* %Q <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %R, i8* %Q, i64 12, i32 1, i1 false) +; CHECK: Just Mod: Ptr: i8* %R <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %R, i8* %Q, i64 12, i32 1, i1 false) +; CHECK: NoModRef: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %R, i8* %Q, i64 12, i32 1, i1 false) +; CHECK: NoModRef: tail call void 
@llvm.memcpy.p0i8.p0i8.i64(i8* %R, i8* %Q, i64 12, i32 1, i1 false) <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) +} + +define void @test2e(i8* noalias %P, i8* noalias %Q) nounwind ssp { + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) + %R = getelementptr i8* %P, i64 -11 + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %R, i8* %Q, i64 12, i32 1, i1 false) + ret void + +; CHECK-LABEL: Function: test2e: + +; CHECK: NoAlias: i8* %P, i8* %Q +; CHECK: NoAlias: i8* %P, i8* %R +; CHECK: NoAlias: i8* %Q, i8* %R +; CHECK: Just Mod: Ptr: i8* %P <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) +; CHECK: Just Ref: Ptr: i8* %Q <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) +; CHECK: NoModRef: Ptr: i8* %R <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) +; CHECK: Just Mod: Ptr: i8* %P <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %R, i8* %Q, i64 12, i32 1, i1 false) +; CHECK: Just Ref: Ptr: i8* %Q <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %R, i8* %Q, i64 12, i32 1, i1 false) +; CHECK: Just Mod: Ptr: i8* %R <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %R, i8* %Q, i64 12, i32 1, i1 false) +; CHECK: Just Mod: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %R, i8* %Q, i64 12, i32 1, i1 false) +; CHECK: Just Mod: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %R, i8* %Q, i64 12, i32 1, i1 false) <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) +} + +define void @test3(i8* %P, i8* %Q) nounwind ssp { + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 8, i32 1, i1 false) + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) + ret void + +; CHECK-LABEL: Function: test3: + +; CHECK: MayAlias: i8* %P, i8* %Q +; CHECK: Both ModRef: Ptr: i8* %P <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 8, i32 1, i1 false) +; CHECK: Both ModRef: Ptr: i8* %Q <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 8, i32 1, i1 false) +; CHECK: Both ModRef: Ptr: i8* %P <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) +; CHECK: Both ModRef: Ptr: i8* %Q <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) +; CHECK: Both ModRef: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 8, i32 1, i1 false) <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) +; CHECK: Both ModRef: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 8, i32 1, i1 false) +} + +define void @test3a(i8* noalias %P, i8* noalias %Q) nounwind ssp { + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 8, i32 1, i1 false) + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) + ret void + +; CHECK-LABEL: Function: test3a: + +; CHECK: NoAlias: i8* %P, i8* %Q +; CHECK: Just Mod: Ptr: i8* %P <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 8, i32 1, i1 false) +; CHECK: Just Ref: Ptr: i8* %Q <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 8, i32 1, i1 false) +; CHECK: Just Mod: Ptr: i8* %P <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) +; CHECK: Just 
Ref: Ptr: i8* %Q <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) +; CHECK: Just Mod: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 8, i32 1, i1 false) <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) +; CHECK: Just Mod: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 8, i32 1, i1 false) +} + +define void @test4(i8* %P, i8* noalias %Q) nounwind ssp { + tail call void @llvm.memset.p0i8.i64(i8* %P, i8 42, i64 8, i32 1, i1 false) + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) + ret void + +; CHECK-LABEL: Function: test4: + +; CHECK: NoAlias: i8* %P, i8* %Q +; CHECK: Just Mod: Ptr: i8* %P <-> tail call void @llvm.memset.p0i8.i64(i8* %P, i8 42, i64 8, i32 1, i1 false) +; CHECK: NoModRef: Ptr: i8* %Q <-> tail call void @llvm.memset.p0i8.i64(i8* %P, i8 42, i64 8, i32 1, i1 false) +; CHECK: Just Mod: Ptr: i8* %P <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) +; CHECK: Just Ref: Ptr: i8* %Q <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) +; CHECK: Just Mod: tail call void @llvm.memset.p0i8.i64(i8* %P, i8 42, i64 8, i32 1, i1 false) <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) +; CHECK: Just Mod: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) <-> tail call void @llvm.memset.p0i8.i64(i8* %P, i8 42, i64 8, i32 1, i1 false) +} + +define void @test5(i8* %P, i8* %Q, i8* %R) nounwind ssp { + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %R, i64 12, i32 1, i1 false) + ret void + +; CHECK-LABEL: Function: test5: + +; CHECK: MayAlias: i8* %P, i8* %Q +; CHECK: MayAlias: i8* %P, i8* %R +; CHECK: MayAlias: i8* %Q, i8* %R +; CHECK: Both ModRef: Ptr: i8* %P <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) +; CHECK: Both ModRef: Ptr: i8* %Q <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) +; CHECK: Both ModRef: Ptr: i8* %R <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) +; CHECK: Both ModRef: Ptr: i8* %P <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %R, i64 12, i32 1, i1 false) +; CHECK: Both ModRef: Ptr: i8* %Q <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %R, i64 12, i32 1, i1 false) +; CHECK: Both ModRef: Ptr: i8* %R <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %R, i64 12, i32 1, i1 false) +; CHECK: Both ModRef: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %R, i64 12, i32 1, i1 false) +; CHECK: Both ModRef: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %R, i64 12, i32 1, i1 false) <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) +} + +define void @test6(i8* %P) nounwind ssp { + call void @llvm.memset.p0i8.i64(i8* %P, i8 -51, i64 32, i32 8, i1 false) + call void @a_readonly_func(i8* %P) + ret void + +; CHECK-LABEL: Function: test6: + +; CHECK: Just Mod: Ptr: i8* %P <-> call void @llvm.memset.p0i8.i64(i8* %P, i8 -51, i64 32, i32 8, i1 false) +; CHECK: Just Ref: Ptr: i8* %P <-> call void @a_readonly_func(i8* %P) +; CHECK: Just Mod: call void 
@llvm.memset.p0i8.i64(i8* %P, i8 -51, i64 32, i32 8, i1 false) <-> call void @a_readonly_func(i8* %P) +; CHECK: Just Ref: call void @a_readonly_func(i8* %P) <-> call void @llvm.memset.p0i8.i64(i8* %P, i8 -51, i64 32, i32 8, i1 false) +} + +attributes #0 = { nounwind } diff --git a/test/Analysis/CostModel/X86/alternate-shuffle-cost.ll b/test/Analysis/CostModel/X86/alternate-shuffle-cost.ll new file mode 100644 index 000000000000..2e162f0f0005 --- /dev/null +++ b/test/Analysis/CostModel/X86/alternate-shuffle-cost.ll @@ -0,0 +1,347 @@ +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,-ssse3 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSE2 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,+sse3,+ssse3 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSSE3 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSE41 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core-avx2 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX2 + + +; Verify the cost model for alternate shuffles. + +; shufflevector instructions with illegal 64-bit vector types. +; 64-bit packed integer vectors (v2i32) are promoted to type v2i64. +; 64-bit packed float vectors (v2f32) are widened to type v4f32. + +define <2 x i32> @test_v2i32(<2 x i32> %a, <2 x i32> %b) { + %1 = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 3> + ret <2 x i32> %1 +} +; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v2i32': +; SSE2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector +; SSSE3: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector +; SSE41: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector +; AVX: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector +; AVX2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector + +define <2 x float> @test_v2f32(<2 x float> %a, <2 x float> %b) { + %1 = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 3> + ret <2 x float> %1 +} +; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v2f32': +; SSE2: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector +; SSSE3: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector +; SSE41: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector +; AVX: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector +; AVX2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector + +define <2 x i32> @test_v2i32_2(<2 x i32> %a, <2 x i32> %b) { + %1 = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 2, i32 1> + ret <2 x i32> %1 +} +; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v2i32_2': +; SSE2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector +; SSSE3: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector +; SSE41: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector +; AVX: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector +; AVX2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector + +define <2 x float> @test_v2f32_2(<2 x float> %a, <2 x float> %b) { + %1 = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 2, i32 1> + ret <2 x float> %1 +} +; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v2f32_2': +; SSE2: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector +; SSSE3: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector
+; SSE41: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector +; AVX: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector +; AVX2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector + + +; Test shuffles on packed vectors of two elements. + +define <2 x i64> @test_v2i64(<2 x i64> %a, <2 x i64> %b) { + %1 = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 3> + ret <2 x i64> %1 +} +; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v2i64': +; SSE2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector +; SSSE3: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector +; SSE41: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector +; AVX: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector +; AVX2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector + +define <2 x double> @test_v2f64(<2 x double> %a, <2 x double> %b) { + %1 = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 3> + ret <2 x double> %1 +} +; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v2f64': +; SSE2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector +; SSSE3: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector +; SSE41: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector +; AVX: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector +; AVX2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector + + +define <2 x i64> @test_v2i64_2(<2 x i64> %a, <2 x i64> %b) { + %1 = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 2, i32 1> + ret <2 x i64> %1 +} +; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v2i64_2': +; SSE2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector +; SSSE3: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector +; SSE41: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector +; AVX: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector +; AVX2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector + + +define <2 x double> @test_v2f64_2(<2 x double> %a, <2 x double> %b) { + %1 = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 2, i32 1> + ret <2 x double> %1 +} +; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v2f64_2': +; SSE2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector +; SSSE3: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector +; SSE41: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector +; AVX: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector +; AVX2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector + +; Test shuffles on packed vectors of four elements.
+ +define <4 x i32> @test_v4i32(<4 x i32> %a, <4 x i32> %b) { + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7> + ret <4 x i32> %1 +} +; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v4i32': +; SSE2: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector +; SSSE3: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector +; SSE41: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector +; AVX: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector +; AVX2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector + + +define <4 x i32> @test_v4i32_2(<4 x i32> %a, <4 x i32> %b) { + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> + ret <4 x i32> %1 +} +; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v4i32_2': +; SSE2: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector +; SSSE3: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector +; SSE41: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector +; AVX: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector +; AVX2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector + + +define <4 x float> @test_v4f32(<4 x float> %a, <4 x float> %b) { + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7> + ret <4 x float> %1 +} +; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v4f32': +; SSE2: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector +; SSSE3: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector +; SSE41: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector +; AVX: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector +; AVX2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector + + +define <4 x float> @test_v4f32_2(<4 x float> %a, <4 x float> %b) { + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> + ret <4 x float> %1 +} +; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v4f32_2': +; SSE2: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector +; SSSE3: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector +; SSE41: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector +; AVX: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector +; AVX2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector + +define <4 x i64> @test_v4i64(<4 x i64> %a, <4 x i64> %b) { + %1 = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7> + ret <4 x i64> %1 +} +; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v4i64': +; SSE2: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector +; SSSE3: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector +; SSE41: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector +; AVX: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector +; AVX2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector + + +define <4 x i64> @test_v4i64_2(<4 x i64> %a, <4 x i64> %b) { + %1 = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> + ret <4 x i64> %1 +} +; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v4i64_2': +; SSE2: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector +; SSSE3: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector +; SSE41: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector +; AVX: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector +; AVX2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector + + +define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) { + %1 = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7> + ret <4 x double> %1 +} +; CHECK: Printing analysis 'Cost Model
Analysis' for function 'test_v4f64': +; SSE2: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector +; SSSE3: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector +; SSE41: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector +; AVX: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector +; AVX2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector + + +define <4 x double> @test_v4f64_2(<4 x double> %a, <4 x double> %b) { + %1 = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> + ret <4 x double> %1 +} +; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v4f64_2': +; SSE2: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector +; SSSE3: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector +; SSE41: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector +; AVX: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector +; AVX2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector + + +; Test shuffles on packed vectors of eight elements. +define <8 x i16> @test_v8i16(<8 x i16> %a, <8 x i16> %b) { + %1 = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15> + ret <8 x i16> %1 +} +; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v8i16': +; SSE2: Cost Model: {{.*}} 8 for instruction: %1 = shufflevector +; SSSE3: Cost Model: {{.*}} 3 for instruction: %1 = shufflevector +; SSE41: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector +; AVX: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector +; AVX2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector + + +define <8 x i16> @test_v8i16_2(<8 x i16> %a, <8 x i16> %b) { + %1 = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 5, i32 14, i32 7> + ret <8 x i16> %1 +} +; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v8i16_2': +; SSE2: Cost Model: {{.*}} 8 for instruction: %1 = shufflevector +; SSSE3: Cost Model: {{.*}} 3 for instruction: %1 = shufflevector +; SSE41: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector +; AVX: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector +; AVX2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector + + +define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) { + %1 = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15> + ret <8 x i32> %1 +} +; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v8i32': +; SSE2: Cost Model: {{.*}} 4 for instruction: %1 = shufflevector +; SSSE3: Cost Model: {{.*}} 4 for instruction: %1 = shufflevector +; SSE41: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector +; AVX: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector +; AVX2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector + + +define <8 x i32> @test_v8i32_2(<8 x i32> %a, <8 x i32> %b) { + %1 = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 5, i32 14, i32 7> + ret <8 x i32> %1 +} +; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v8i32_2': +; SSE2: Cost Model: {{.*}} 4 for instruction: %1 = shufflevector +; SSSE3: Cost Model: {{.*}} 4 for instruction: %1 = shufflevector +; SSE41: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector +; AVX: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector +; AVX2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector + + +define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) { + %1 = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15> + ret <8 x float> %1 +} +; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v8f32': +; SSE2: Cost Model: {{.*}} 4 for instruction: %1 = shufflevector +; SSSE3: Cost Model: {{.*}} 4 for
instruction: %1 = shufflevector +; SSE41: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector +; AVX: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector +; AVX2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector + + +define <8 x float> @test_v8f32_2(<8 x float> %a, <8 x float> %b) { + %1 = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 5, i32 14, i32 7> + ret <8 x float> %1 +} +; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v8f32_2': +; SSE2: Cost Model: {{.*}} 4 for instruction: %1 = shufflevector +; SSSE3: Cost Model: {{.*}} 4 for instruction: %1 = shufflevector +; SSE41: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector +; AVX: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector +; AVX2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector + + +; Test shuffles on packed vectors of sixteen elements. +define <16 x i8> @test_v16i8(<16 x i8> %a, <16 x i8> %b) { + %1 = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31> + ret <16 x i8> %1 +} +; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v16i8': +; SSE2: Cost Model: {{.*}} 48 for instruction: %1 = shufflevector +; SSSE3: Cost Model: {{.*}} 3 for instruction: %1 = shufflevector +; SSE41: Cost Model: {{.*}} 3 for instruction: %1 = shufflevector +; AVX: Cost Model: {{.*}} 3 for instruction: %1 = shufflevector +; AVX2: Cost Model: {{.*}} 3 for instruction: %1 = shufflevector + + +define <16 x i8> @test_v16i8_2(<16 x i8> %a, <16 x i8> %b) { + %1 = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 1, i32 18, i32 3, i32 20, i32 5, i32 22, i32 7, i32 24, i32 9, i32 26, i32 11, i32 28, i32 13, i32 30, i32 15> + ret <16 x i8> %1 +} +; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v16i8_2': +; SSE2: Cost Model: {{.*}} 48 for instruction: %1 = shufflevector +; SSSE3: Cost Model: {{.*}} 3 for instruction: %1 = shufflevector +; SSE41: Cost Model: {{.*}} 3 for instruction: %1 = shufflevector +; AVX: Cost Model: {{.*}} 3 for instruction: %1 = shufflevector +; AVX2: Cost Model: {{.*}} 3 for instruction: %1 = shufflevector + + +define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) { + %1 = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31> + ret <16 x i16> %1 +} +; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v16i16': +; SSE2: Cost Model: {{.*}} 16 for instruction: %1 = shufflevector +; SSSE3: Cost Model: {{.*}} 6 for instruction: %1 = shufflevector +; SSE41: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector +; AVX: Cost Model: {{.*}} 5 for instruction: %1 = shufflevector +; AVX2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector + + +define <16 x i16> @test_v16i16_2(<16 x i16> %a, <16 x i16> %b) { + %1 = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 16, i32 1, i32 18, i32 3, i32 20, i32 5, i32 22, i32 7, i32 24, i32 9, i32 26, i32 11, i32 28, i32 13, i32 30, i32 15> + ret <16 x i16> %1 +} +; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v16i16_2': +; SSE2: Cost Model: {{.*}} 16 for instruction: %1 = shufflevector +; SSSE3: Cost Model: {{.*}} 6 for instruction: %1 = shufflevector +; SSE41: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector +; AVX: Cost Model: {{.*}} 5 for instruction: %1 = shufflevector +; AVX2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector + +define <32 x i8> @test_v32i8(<32 x i8> %a, <32 x i8> %b) { + %1 = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 33, i32 2, i32 35, i32 4, i32 37, i32 6, i32 39, i32 8, i32 41, i32 10, i32 43, i32 12, i32 45, i32 14, i32 47, i32 16, i32 49, i32 18, i32 51, i32 20, i32 53, i32 22, i32 55, i32 24, i32 57, i32 26, i32 59, i32 28, i32 61, i32 30, i32 63> + ret <32 x i8> %1 +} +; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v32i8': +; SSE2: Cost Model: {{.*}} 96 for instruction: %1 = shufflevector +; SSSE3: Cost Model: {{.*}} 6 for instruction: %1 = shufflevector +; SSE41: Cost Model: {{.*}} 6 for instruction: %1 = shufflevector +; AVX: Cost Model: {{.*}} 9 for
instruction: %1 = shufflevector +; AVX2: Cost Model: {{.*}} 9 for instruction: %1 = shufflevector + + +define <32 x i8> @test_v32i8_2(<32 x i8> %a, <32 x i8> %b) { + %1 = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 32, i32 1, i32 34, i32 3, i32 36, i32 5, i32 38, i32 7, i32 40, i32 9, i32 42, i32 11, i32 44, i32 13, i32 46, i32 15, i32 48, i32 17, i32 50, i32 19, i32 52, i32 21, i32 54, i32 23, i32 56, i32 25, i32 58, i32 27, i32 60, i32 29, i32 62, i32 31> + ret <32 x i8> %1 +} +; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v32i8_2': +; SSE2: Cost Model: {{.*}} 96 for instruction: %1 = shufflevector +; SSSE3: Cost Model: {{.*}} 6 for instruction: %1 = shufflevector +; SSE41: Cost Model: {{.*}} 6 for instruction: %1 = shufflevector +; AVX: Cost Model: {{.*}} 9 for instruction: %1 = shufflevector +; AVX2: Cost Model: {{.*}} 9 for instruction: %1 = shufflevector + diff --git a/test/Assembler/invalid-comdat.ll b/test/Assembler/invalid-comdat.ll new file mode 100644 index 000000000000..987e1e1e7d92 --- /dev/null +++ b/test/Assembler/invalid-comdat.ll @@ -0,0 +1,4 @@ +; RUN: not llvm-as < %s 2>&1 | FileCheck %s + +@v = global i32 0, comdat $v +; CHECK: use of undefined comdat '$v' diff --git a/test/Assembler/invalid-comdat2.ll b/test/Assembler/invalid-comdat2.ll new file mode 100644 index 000000000000..ed656ef2b112 --- /dev/null +++ b/test/Assembler/invalid-comdat2.ll @@ -0,0 +1,5 @@ +; RUN: not llvm-as < %s 2>&1 | FileCheck %s + +$v = comdat any +$v = comdat any +; CHECK: redefinition of comdat '$v' diff --git a/test/Assembler/upgrade-loop-metadata.ll b/test/Assembler/upgrade-loop-metadata.ll new file mode 100644 index 000000000000..1c0311dd09e9 --- /dev/null +++ b/test/Assembler/upgrade-loop-metadata.ll @@ -0,0 +1,41 @@ +; Test to make sure loop vectorizer metadata is automatically upgraded. +; +; Run using opt as well to ensure that the metadata is upgraded when parsing +; assembly. +; +; RUN: llvm-as < %s | llvm-dis | FileCheck %s +; RUN: opt -S < %s | FileCheck %s + +define void @_Z28loop_with_vectorize_metadatav() { +entry: + %i = alloca i32, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32* %i, align 4 + %cmp = icmp slt i32 %0, 16 + br i1 %cmp, label %for.body, label %for.end, !llvm.loop !1 + +for.body: ; preds = %for.cond + br label %for.inc + +for.inc: ; preds = %for.body + %1 = load i32* %i, align 4 + %inc = add nsw i32 %1, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +; CHECK: !{metadata !"llvm.loop.interleave.count", i32 4} +; CHECK: !{metadata !"llvm.loop.vectorize.width", i32 8} +; CHECK: !{metadata !"llvm.loop.vectorize.enable", i1 true} + +!0 = metadata !{metadata !"clang version 3.5.0 (trunk 211528)"} +!1 = metadata !{metadata !1, metadata !2, metadata !3, metadata !4, metadata !4} +!2 = metadata !{metadata !"llvm.vectorizer.unroll", i32 4} +!3 = metadata !{metadata !"llvm.vectorizer.width", i32 8} +!4 = metadata !{metadata !"llvm.vectorizer.enable", i1 true} diff --git a/test/Bindings/Ocaml/bitwriter.ml b/test/Bindings/Ocaml/bitwriter.ml index ae456cf785c8..d4d0417a80e3 100644 --- a/test/Bindings/Ocaml/bitwriter.ml +++ b/test/Bindings/Ocaml/bitwriter.ml @@ -1,7 +1,7 @@ (* RUN: rm -rf %t.builddir * RUN: mkdir -p %t.builddir * RUN: cp %s %t.builddir - * RUN: %ocamlopt -warn-error A unix.cmxa llvm.cmxa llvm_bitwriter.cmxa %t.builddir/bitwriter.ml -o %t + * RUN: %ocamlopt -warn-error A-3 unix.cmxa llvm.cmxa llvm_bitwriter.cmxa %t.builddir/bitwriter.ml -o %t * RUN: %t %t.bc * RUN: llvm-dis < %t.bc * XFAIL: vg_leak diff --git a/test/Bindings/Ocaml/vmcore.ml b/test/Bindings/Ocaml/vmcore.ml index f014116ffe8e..53e0553b0d59 100644 --- a/test/Bindings/Ocaml/vmcore.ml +++
b/test/Bindings/Ocaml/vmcore.ml @@ -126,6 +126,12 @@ let test_constants () = ignore (define_global "const_int_string" c m); insist (i32_type = type_of c); + if Sys.word_size = 64; then begin + group "long int"; + let c = const_int i64_type (1 lsl 61) in + insist (c = const_of_int64 i64_type (Int64.of_int (1 lsl 61)) false) + end; + (* CHECK: @const_string = global {{.*}}c"cruel\00world" *) group "string"; diff --git a/test/Bitcode/attributes.ll b/test/Bitcode/attributes.ll index 49366de9836d..2490e5920726 100644 --- a/test/Bitcode/attributes.ll +++ b/test/Bitcode/attributes.ll @@ -229,6 +229,16 @@ define void @f38() unnamed_addr jumptable { unreachable } +define dereferenceable(2) i8* @f39(i8* dereferenceable(1) %a) { +; CHECK: define dereferenceable(2) i8* @f39(i8* dereferenceable(1) %a) { + ret i8* %a +} + +define dereferenceable(18446744073709551606) i8* @f40(i8* dereferenceable(18446744073709551615) %a) { +; CHECK: define dereferenceable(18446744073709551606) i8* @f40(i8* dereferenceable(18446744073709551615) %a) { + ret i8* %a +} + ; CHECK: attributes #0 = { noreturn } ; CHECK: attributes #1 = { nounwind } ; CHECK: attributes #2 = { readnone } diff --git a/test/Bitcode/inalloca.ll b/test/Bitcode/inalloca.ll new file mode 100644 index 000000000000..bad87a9b03f0 --- /dev/null +++ b/test/Bitcode/inalloca.ll @@ -0,0 +1,18 @@ +; RUN: llvm-as < %s | llvm-dis | FileCheck %s + +; inalloca should roundtrip. + +define void @foo(i32* inalloca %args) { + ret void +} +; CHECK-LABEL: define void @foo(i32* inalloca %args) + +define void @bar() { + ; Use the maximum alignment, since we stuff our bit with alignment. + %args = alloca inalloca i32, align 536870912 + call void @foo(i32* inalloca %args) + ret void +} +; CHECK-LABEL: define void @bar() { +; CHECK: %args = alloca inalloca i32, align 536870912 +; CHECK: call void @foo(i32* inalloca %args) diff --git a/test/Bitcode/upgrade-loop-metadata.ll b/test/Bitcode/upgrade-loop-metadata.ll new file mode 100644 index 000000000000..67a8d3935926 --- /dev/null +++ b/test/Bitcode/upgrade-loop-metadata.ll @@ -0,0 +1,37 @@ +; Test to make sure loop vectorizer metadata is automatically upgraded. 
+; +; RUN: llvm-dis < %s.bc | FileCheck %s + +define void @_Z28loop_with_vectorize_metadatav() { +entry: + %i = alloca i32, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32* %i, align 4 + %cmp = icmp slt i32 %0, 16 + br i1 %cmp, label %for.body, label %for.end, !llvm.loop !1 + +for.body: ; preds = %for.cond + br label %for.inc + +for.inc: ; preds = %for.body + %1 = load i32* %i, align 4 + %inc = add nsw i32 %1, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +; CHECK: !{metadata !"llvm.loop.interleave.count", i32 4} +; CHECK: !{metadata !"llvm.loop.vectorize.width", i32 8} +; CHECK: !{metadata !"llvm.loop.vectorize.enable", i1 true} + +!0 = metadata !{metadata !"clang version 3.5.0 (trunk 211528)"} +!1 = metadata !{metadata !1, metadata !2, metadata !3, metadata !4, metadata !4} +!2 = metadata !{metadata !"llvm.vectorizer.unroll", i32 4} +!3 = metadata !{metadata !"llvm.vectorizer.width", i32 8} +!4 = metadata !{metadata !"llvm.vectorizer.enable", i1 true} diff --git a/test/Bitcode/upgrade-loop-metadata.ll.bc b/test/Bitcode/upgrade-loop-metadata.ll.bc new file mode 100644 index 000000000000..3f218cb7feb4 Binary files /dev/null and b/test/Bitcode/upgrade-loop-metadata.ll.bc differ diff --git a/test/BugPoint/compile-custom.ll b/test/BugPoint/compile-custom.ll index e9016ffb8700..d152f08626f8 100755 --- a/test/BugPoint/compile-custom.ll +++ b/test/BugPoint/compile-custom.ll @@ -1,4 +1,4 @@ -; RUN: bugpoint -load %llvmshlibdir/BugpointPasses%shlibext --compile-custom --compile-command="%s.py arg1 arg2" --output-prefix %t %s | FileCheck %s +; RUN: bugpoint -load %llvmshlibdir/BugpointPasses%shlibext --compile-custom --compile-command="%python %s.py arg1 arg2" --output-prefix %t %s | FileCheck %s ; REQUIRES: loadable_module ; Test that arguments are correctly passed in --compile-command. The output diff --git a/test/CodeGen/AArch64/aarch64-address-type-promotion-assertion.ll b/test/CodeGen/AArch64/aarch64-address-type-promotion-assertion.ll new file mode 100644 index 000000000000..2df9c375bdce --- /dev/null +++ b/test/CodeGen/AArch64/aarch64-address-type-promotion-assertion.ll @@ -0,0 +1,55 @@ +; RUN: llc -O3 -mcpu=cortex-a53 -mtriple=aarch64--linux-gnu %s -o - | FileCheck %s +; PR20188: don't crash when merging sexts. 
+ +; CHECK: foo: +define void @foo() unnamed_addr align 2 { +entry: + br label %invoke.cont145 + +invoke.cont145: + %or.cond = and i1 undef, false + br i1 %or.cond, label %if.then274, label %invoke.cont145 + +if.then274: + %0 = load i32* null, align 4 + br i1 undef, label %invoke.cont291, label %if.else313 + +invoke.cont291: + %idxprom.i.i.i605 = sext i32 %0 to i64 + %arrayidx.i.i.i607 = getelementptr inbounds double* undef, i64 %idxprom.i.i.i605 + %idxprom.i.i.i596 = sext i32 %0 to i64 + %arrayidx.i.i.i598 = getelementptr inbounds double* undef, i64 %idxprom.i.i.i596 + br label %if.end356 + +if.else313: + %cmp314 = fcmp olt double undef, 0.000000e+00 + br i1 %cmp314, label %invoke.cont317, label %invoke.cont353 + +invoke.cont317: + br i1 undef, label %invoke.cont326, label %invoke.cont334 + +invoke.cont326: + %idxprom.i.i.i587 = sext i32 %0 to i64 + %arrayidx.i.i.i589 = getelementptr inbounds double* undef, i64 %idxprom.i.i.i587 + %sub329 = fsub fast double undef, undef + br label %invoke.cont334 + +invoke.cont334: + %lo.1 = phi double [ %sub329, %invoke.cont326 ], [ undef, %invoke.cont317 ] + br i1 undef, label %invoke.cont342, label %if.end356 + +invoke.cont342: + %idxprom.i.i.i578 = sext i32 %0 to i64 + %arrayidx.i.i.i580 = getelementptr inbounds double* undef, i64 %idxprom.i.i.i578 + br label %if.end356 + +invoke.cont353: + %idxprom.i.i.i572 = sext i32 %0 to i64 + %arrayidx.i.i.i574 = getelementptr inbounds double* undef, i64 %idxprom.i.i.i572 + br label %if.end356 + +if.end356: + %lo.2 = phi double [ 0.000000e+00, %invoke.cont291 ], [ %lo.1, %invoke.cont342 ], [ undef, %invoke.cont353 ], [ %lo.1, %invoke.cont334 ] + call void null(i32 %0, double %lo.2) + unreachable +} diff --git a/test/CodeGen/AArch64/aarch64-address-type-promotion.ll b/test/CodeGen/AArch64/aarch64-address-type-promotion.ll new file mode 100644 index 000000000000..ee90d199b458 --- /dev/null +++ b/test/CodeGen/AArch64/aarch64-address-type-promotion.ll @@ -0,0 +1,28 @@ +; RUN: llc < %s -o - | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64" +target triple = "arm64-apple-macosx10.9" + +; Check that sexts get promoted above adds. 
+define void @foo(i32* nocapture %a, i32 %i) { +entry: +; CHECK-LABEL: _foo: +; CHECK: add +; CHECK-NEXT: ldp +; CHECK-NEXT: add +; CHECK-NEXT: str +; CHECK-NEXT: ret + %add = add nsw i32 %i, 1 + %idxprom = sext i32 %add to i64 + %arrayidx = getelementptr inbounds i32* %a, i64 %idxprom + %0 = load i32* %arrayidx, align 4 + %add1 = add nsw i32 %i, 2 + %idxprom2 = sext i32 %add1 to i64 + %arrayidx3 = getelementptr inbounds i32* %a, i64 %idxprom2 + %1 = load i32* %arrayidx3, align 4 + %add4 = add nsw i32 %1, %0 + %idxprom5 = sext i32 %i to i64 + %arrayidx6 = getelementptr inbounds i32* %a, i64 %idxprom5 + store i32 %add4, i32* %arrayidx6, align 4 + ret void +} diff --git a/test/CodeGen/AArch64/arm64-aapcs.ll b/test/CodeGen/AArch64/arm64-aapcs.ll index ccf1371bb5ff..127a7cc0a155 100644 --- a/test/CodeGen/AArch64/arm64-aapcs.ll +++ b/test/CodeGen/AArch64/arm64-aapcs.ll @@ -109,3 +109,17 @@ entry: ; CHECK: ldr {{q[0-9]+}}, [sp] ret <2 x double> %varg_stack; } + +; Check that f16 can be passed and returned (ACLE 2.0 extension) +define half @test_half(float, half %arg) { +; CHECK-LABEL: test_half: +; CHECK: mov v0.16b, v{{[0-9]+}}.16b + ret half %arg; +} + +; Check that f16 constants are materialized correctly +define half @test_half_const() { +; CHECK-LABEL: test_half_const: +; CHECK: ldr h0, [x{{[0-9]+}}, :lo12:{{.*}}] + ret half 0xH4248 +} diff --git a/test/CodeGen/AArch64/arm64-atomic-128.ll b/test/CodeGen/AArch64/arm64-atomic-128.ll index 0f5b23998ee8..3377849f6698 100644 --- a/test/CodeGen/AArch64/arm64-atomic-128.ll +++ b/test/CodeGen/AArch64/arm64-atomic-128.ll @@ -22,8 +22,10 @@ define void @fetch_and_nand(i128* %p, i128 %bits) { ; CHECK-LABEL: fetch_and_nand: ; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: ; CHECK: ldxp [[DEST_REGLO:x[0-9]+]], [[DEST_REGHI:x[0-9]+]], [x0] -; CHECK-DAG: bic [[SCRATCH_REGLO:x[0-9]+]], [[DEST_REGLO]], x2 -; CHECK-DAG: bic [[SCRATCH_REGHI:x[0-9]+]], [[DEST_REGHI]], x3 +; CHECK-DAG: and [[TMP_REGLO:x[0-9]+]], [[DEST_REGLO]], x2 +; CHECK-DAG: and [[TMP_REGHI:x[0-9]+]], [[DEST_REGHI]], x3 +; CHECK-DAG: mvn [[SCRATCH_REGLO:x[0-9]+]], [[TMP_REGLO]] +; CHECK-DAG: mvn [[SCRATCH_REGHI:x[0-9]+]], [[TMP_REGHI]] ; CHECK: stlxp [[SCRATCH_RES:w[0-9]+]], [[SCRATCH_REGLO]], [[SCRATCH_REGHI]], [x0] ; CHECK: cbnz [[SCRATCH_RES]], [[LABEL]] diff --git a/test/CodeGen/AArch64/arm64-atomic.ll b/test/CodeGen/AArch64/arm64-atomic.ll index aef79cb386b3..b56f91ddd111 100644 --- a/test/CodeGen/AArch64/arm64-atomic.ll +++ b/test/CodeGen/AArch64/arm64-atomic.ll @@ -35,7 +35,8 @@ define i32 @fetch_and_nand(i32* %p) { ; CHECK-LABEL: fetch_and_nand: ; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: ; CHECK: ldxr w[[DEST_REG:[0-9]+]], [x0] -; CHECK: and [[SCRATCH2_REG:w[0-9]+]], w[[DEST_REG]], #0xfffffff8 +; CHECK: mvn [[TMP_REG:w[0-9]+]], w[[DEST_REG]] +; CHECK: orr [[SCRATCH2_REG:w[0-9]+]], [[TMP_REG]], #0xfffffff8 ; CHECK-NOT: stlxr [[SCRATCH2_REG]], [[SCRATCH2_REG]] ; CHECK: stlxr [[SCRATCH_REG:w[0-9]+]], [[SCRATCH2_REG]], [x0] ; CHECK: cbnz [[SCRATCH_REG]], [[LABEL]] @@ -48,8 +49,9 @@ define i64 @fetch_and_nand_64(i64* %p) { ; CHECK-LABEL: fetch_and_nand_64: ; CHECK: mov x[[ADDR:[0-9]+]], x0 ; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: -; CHECK: ldaxr [[DEST_REG:x[0-9]+]], [x[[ADDR]]] -; CHECK: and [[SCRATCH2_REG:x[0-9]+]], [[DEST_REG]], #0xfffffffffffffff8 +; CHECK: ldaxr x[[DEST_REG:[0-9]+]], [x[[ADDR]]] +; CHECK: mvn w[[TMP_REG:[0-9]+]], w[[DEST_REG]] +; CHECK: orr [[SCRATCH2_REG:x[0-9]+]], x[[TMP_REG]], #0xfffffffffffffff8 ; CHECK: stlxr [[SCRATCH_REG:w[0-9]+]], [[SCRATCH2_REG]], [x[[ADDR]]] ; CHECK: cbnz 
[[SCRATCH_REG]], [[LABEL]] diff --git a/test/CodeGen/AArch64/arm64-build-vector.ll b/test/CodeGen/AArch64/arm64-build-vector.ll index c109263cedb4..d0f6db080551 100644 --- a/test/CodeGen/AArch64/arm64-build-vector.ll +++ b/test/CodeGen/AArch64/arm64-build-vector.ll @@ -33,3 +33,27 @@ define <4 x float> @foo(float %a, float %b, float %c, float %d) nounwind { %4 = insertelement <4 x float> %3, float %d, i32 3 ret <4 x float> %4 } + +define <8 x i16> @build_all_zero(<8 x i16> %a) #1 { +; CHECK-LABEL: build_all_zero: +; CHECK: movz w[[GREG:[0-9]+]], #0xae80 +; CHECK-NEXT: fmov s[[FREG:[0-9]+]], w[[GREG]] +; CHECK-NEXT: mul.8h v0, v0, v[[FREG]] + %b = add <8 x i16> %a, <i16 -32768, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef> + %c = mul <8 x i16> %b, <i16 -20864, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef> + ret <8 x i16> %c +} + +; There is an optimization in the DAG Combiner as follows: +; fold (concat_vectors (BUILD_VECTOR A, B, ...), (BUILD_VECTOR C, D, ...)) +; -> (BUILD_VECTOR A, B, ..., C, D, ...) +; This case checks that when A,B and C,D are different types, there is no +; assertion failure. +define <8 x i16> @concat_2_build_vector(<4 x i16> %in0) { +; CHECK-LABEL: concat_2_build_vector: +; CHECK: movi + %vshl_n = shl <4 x i16> %in0, <i16 9, i16 9, i16 9, i16 9> + %vshl_n2 = shl <4 x i16> %vshl_n, <i16 12, i16 12, i16 12, i16 12> + %shuffle.i = shufflevector <4 x i16> %vshl_n2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x i16> %shuffle.i +} \ No newline at end of file diff --git a/test/CodeGen/AArch64/arm64-misched-basic-A53.ll b/test/CodeGen/AArch64/arm64-misched-basic-A53.ll index f88bd6a4fe32..bc7ed7fbdf83 100644 --- a/test/CodeGen/AArch64/arm64-misched-basic-A53.ll +++ b/test/CodeGen/AArch64/arm64-misched-basic-A53.ll @@ -122,3 +122,82 @@ define { <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld2(i8* %A, i8** %ptr) { } declare { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0i8(i8*) + +; Regression Test for PR20057. +; +; Cortex-A53 machine model stalls on A53UnitFPMDS contention. Instructions that +; are otherwise ready are jammed in the pending queue.
+; CHECK: ********** MI Scheduling ********** +; CHECK: testResourceConflict +; CHECK: *** Final schedule for BB#0 *** +; CHECK: BRK +; CHECK: ********** INTERVALS ********** +define void @testResourceConflict(float* %ptr) { +entry: + %add1 = fadd float undef, undef + %mul2 = fmul float undef, undef + %add3 = fadd float %mul2, undef + %mul4 = fmul float undef, %add3 + %add5 = fadd float %mul4, undef + %sub6 = fsub float 0.000000e+00, undef + %sub7 = fsub float %add5, undef + %div8 = fdiv float 1.000000e+00, undef + %mul9 = fmul float %div8, %sub7 + %mul14 = fmul float %sub6, %div8 + %mul10 = fsub float -0.000000e+00, %mul14 + %mul15 = fmul float undef, %div8 + %mul11 = fsub float -0.000000e+00, %mul15 + %mul12 = fmul float 0.000000e+00, %div8 + %mul13 = fmul float %add1, %mul9 + %mul21 = fmul float %add5, %mul11 + %add22 = fadd float %mul13, %mul21 + store float %add22, float* %ptr, align 4 + %mul28 = fmul float %add1, %mul10 + %mul33 = fmul float %add5, %mul12 + %add34 = fadd float %mul33, %mul28 + store float %add34, float* %ptr, align 4 + %mul240 = fmul float undef, %mul9 + %add246 = fadd float %mul240, undef + store float %add246, float* %ptr, align 4 + %mul52 = fmul float undef, %mul10 + %mul57 = fmul float undef, %mul12 + %add58 = fadd float %mul57, %mul52 + store float %add58, float* %ptr, align 4 + %mul27 = fmul float 0.000000e+00, %mul9 + %mul81 = fmul float undef, %mul10 + %add82 = fadd float %mul27, %mul81 + store float %add82, float* %ptr, align 4 + call void @llvm.trap() + unreachable +} + +declare void @llvm.trap() + +; Regression test for PR20057: "permanent hazard"' +; Resource contention on LDST. +; CHECK: ********** MI Scheduling ********** +; CHECK: testLdStConflict +; CHECK: *** Final schedule for BB#1 *** +; CHECK: LD4Fourv2d +; CHECK: STRQui +; CHECK: ********** INTERVALS ********** +define void @testLdStConflict() { +entry: + br label %loop + +loop: + %0 = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4.v2i64.p0i8(i8* null) + %ptr = bitcast i8* undef to <2 x i64>* + store <2 x i64> zeroinitializer, <2 x i64>* %ptr, align 4 + %ptr1 = bitcast i8* undef to <2 x i64>* + store <2 x i64> zeroinitializer, <2 x i64>* %ptr1, align 4 + %ptr2 = bitcast i8* undef to <2 x i64>* + store <2 x i64> zeroinitializer, <2 x i64>* %ptr2, align 4 + %ptr3 = bitcast i8* undef to <2 x i64>* + store <2 x i64> zeroinitializer, <2 x i64>* %ptr3, align 4 + %ptr4 = bitcast i8* undef to <2 x i64>* + store <2 x i64> zeroinitializer, <2 x i64>* %ptr4, align 4 + br label %loop +} + +declare { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4.v2i64.p0i8(i8*) diff --git a/test/CodeGen/AArch64/arm64-neon-copy.ll b/test/CodeGen/AArch64/arm64-neon-copy.ll index cfc2ebf0a2e9..1cfba826d510 100644 --- a/test/CodeGen/AArch64/arm64-neon-copy.ll +++ b/test/CodeGen/AArch64/arm64-neon-copy.ll @@ -842,7 +842,7 @@ define <2 x i64> @scalar_to_vector.v2i64(i64 %a) { define <8 x i8> @testDUP.v1i8(<1 x i8> %a) { ; CHECK-LABEL: testDUP.v1i8: -; CHECK: dup {{v[0-9]+}}.8b, {{w[0-9]+}} +; CHECK: dup v0.8b, v0.b[0] %b = extractelement <1 x i8> %a, i32 0 %c = insertelement <8 x i8> undef, i8 %b, i32 0 %d = insertelement <8 x i8> %c, i8 %b, i32 1 @@ -857,7 +857,7 @@ define <8 x i8> @testDUP.v1i8(<1 x i8> %a) { define <8 x i16> @testDUP.v1i16(<1 x i16> %a) { ; CHECK-LABEL: testDUP.v1i16: -; CHECK: dup {{v[0-9]+}}.8h, {{w[0-9]+}} +; CHECK: dup v0.8h, v0.h[0] %b = extractelement <1 x i16> %a, i32 0 %c = insertelement <8 x i16> undef, i16 %b, i32 0 %d = insertelement <8 x i16> %c, i16 
%b, i32 1 @@ -872,7 +872,7 @@ define <8 x i16> @testDUP.v1i16(<1 x i16> %a) { define <4 x i32> @testDUP.v1i32(<1 x i32> %a) { ; CHECK-LABEL: testDUP.v1i32: -; CHECK: dup {{v[0-9]+}}.4s, {{w[0-9]+}} +; CHECK: dup v0.4s, v0.s[0] %b = extractelement <1 x i32> %a, i32 0 %c = insertelement <4 x i32> undef, i32 %b, i32 0 %d = insertelement <4 x i32> %c, i32 %b, i32 1 @@ -1411,35 +1411,35 @@ define <16 x i8> @concat_vector_v16i8_const() { define <4 x i16> @concat_vector_v4i16(<1 x i16> %a) { ; CHECK-LABEL: concat_vector_v4i16: -; CHECK: dup {{v[0-9]+}}.4h, {{w[0-9]+}} +; CHECK: dup v0.4h, v0.h[0] %r = shufflevector <1 x i16> %a, <1 x i16> undef, <4 x i32> zeroinitializer ret <4 x i16> %r } define <4 x i32> @concat_vector_v4i32(<1 x i32> %a) { ; CHECK-LABEL: concat_vector_v4i32: -; CHECK: dup {{v[0-9]+}}.4s, {{w[0-9]+}} +; CHECK: dup v0.4s, v0.s[0] %r = shufflevector <1 x i32> %a, <1 x i32> undef, <4 x i32> zeroinitializer ret <4 x i32> %r } define <8 x i8> @concat_vector_v8i8(<1 x i8> %a) { ; CHECK-LABEL: concat_vector_v8i8: -; CHECK: dup {{v[0-9]+}}.8b, {{w[0-9]+}} +; CHECK: dup v0.8b, v0.b[0] %r = shufflevector <1 x i8> %a, <1 x i8> undef, <8 x i32> zeroinitializer ret <8 x i8> %r } define <8 x i16> @concat_vector_v8i16(<1 x i16> %a) { ; CHECK-LABEL: concat_vector_v8i16: -; CHECK: dup {{v[0-9]+}}.8h, {{w[0-9]+}} +; CHECK: dup v0.8h, v0.h[0] %r = shufflevector <1 x i16> %a, <1 x i16> undef, <8 x i32> zeroinitializer ret <8 x i16> %r } define <16 x i8> @concat_vector_v16i8(<1 x i8> %a) { ; CHECK-LABEL: concat_vector_v16i8: -; CHECK: dup {{v[0-9]+}}.16b, {{w[0-9]+}} +; CHECK: dup v0.16b, v0.b[0] %r = shufflevector <1 x i8> %a, <1 x i8> undef, <16 x i32> zeroinitializer ret <16 x i8> %r } diff --git a/test/CodeGen/AArch64/arm64-neon-select_cc.ll b/test/CodeGen/AArch64/arm64-neon-select_cc.ll index 255b90dfa64b..95c582a5348c 100644 --- a/test/CodeGen/AArch64/arm64-neon-select_cc.ll +++ b/test/CodeGen/AArch64/arm64-neon-select_cc.ll @@ -136,8 +136,8 @@ define <2x i64> @test_select_cc_v2i64(i64 %a, i64 %b, <2x i64> %c, <2x i64> %d ) define <1 x float> @test_select_cc_v1f32(float %a, float %b, <1 x float> %c, <1 x float> %d ) { ; CHECK-LABEL: test_select_cc_v1f32: -; CHECK: fcmp s0, s1 -; CHECK-NEXT: fcsel s0, s2, s3, eq +; CHECK: fcmeq [[MASK:v[0-9]+]].2s, v0.2s, v1.2s +; CHECK-NEXT: bsl [[MASK]].8b, v2.8b, v3.8b %cmp31 = fcmp oeq float %a, %b %e = select i1 %cmp31, <1 x float> %c, <1 x float> %d ret <1 x float> %e diff --git a/test/CodeGen/AArch64/arm64-setcc-int-to-fp-combine.ll b/test/CodeGen/AArch64/arm64-setcc-int-to-fp-combine.ll new file mode 100644 index 000000000000..b10fe758d959 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-setcc-int-to-fp-combine.ll @@ -0,0 +1,13 @@ +; RUN: llc < %s -asm-verbose=false -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s + +define <4 x float> @foo(<4 x float> %val, <4 x float> %test) nounwind { +; CHECK-LABEL: foo: +; CHECK-NEXT: fcmeq.4s v0, v0, v1 +; CHECK-NEXT: fmov.4s v1, #1.00000000 +; CHECK-NEXT: and.16b v0, v0, v1 +; CHECK-NEXT: ret + %cmp = fcmp oeq <4 x float> %val, %test + %ext = zext <4 x i1> %cmp to <4 x i32> + %result = sitofp <4 x i32> %ext to <4 x float> + ret <4 x float> %result +} diff --git a/test/CodeGen/AArch64/arm64-vcvt_f.ll b/test/CodeGen/AArch64/arm64-vcvt_f.ll index d24495844b45..1f393c21a1a1 100644 --- a/test/CodeGen/AArch64/arm64-vcvt_f.ll +++ b/test/CodeGen/AArch64/arm64-vcvt_f.ll @@ -66,17 +66,17 @@ define i16 @to_half(float %in) { ; CHECK-LABEL: to_half: ; CHECK: fcvt h[[HALFVAL:[0-9]+]], s0 ; CHECK: fmov {{w[0-9]+}}, 
{{s[0-9]+}} - %res = call i16 @llvm.convert.to.fp16(float %in) + %res = call i16 @llvm.convert.to.fp16.f32(float %in) ret i16 %res } define float @from_half(i16 %in) { ; CHECK-LABEL: from_half: -; CHECK: fmov s[[HALFVAL:[0-9]+]], {{w[0-9]+}} -; CHECK: fcvt s0, h[[HALFVAL]] - %res = call float @llvm.convert.from.fp16(i16 %in) +; CHECK: fmov {{s[0-9]+}}, {{w[0-9]+}} +; CHECK: fcvt s0, {{h[0-9]+}} + %res = call float @llvm.convert.from.fp16.f32(i16 %in) ret float %res } -declare float @llvm.convert.from.fp16(i16) #1 -declare i16 @llvm.convert.to.fp16(float) #1 +declare float @llvm.convert.from.fp16.f32(i16) #1 +declare i16 @llvm.convert.to.fp16.f32(float) #1 diff --git a/test/CodeGen/AArch64/f16-convert.ll b/test/CodeGen/AArch64/f16-convert.ll new file mode 100644 index 000000000000..12412d45aa6e --- /dev/null +++ b/test/CodeGen/AArch64/f16-convert.ll @@ -0,0 +1,251 @@ +; RUN: llc < %s -mtriple=arm64-apple-ios -asm-verbose=false | FileCheck %s + +define float @load0(i16* nocapture readonly %a) nounwind { +; CHECK-LABEL: load0: +; CHECK-NEXT: ldr [[HREG:h[0-9]+]], [x0] +; CHECK-NEXT: fcvt s0, [[HREG]] +; CHECK-NEXT: ret + + %tmp = load i16* %a, align 2 + %tmp1 = tail call float @llvm.convert.from.fp16.f32(i16 %tmp) + ret float %tmp1 +} + +define double @load1(i16* nocapture readonly %a) nounwind { +; CHECK-LABEL: load1: +; CHECK-NEXT: ldr [[HREG:h[0-9]+]], [x0] +; CHECK-NEXT: fcvt d0, [[HREG]] +; CHECK-NEXT: ret + + %tmp = load i16* %a, align 2 + %conv = tail call double @llvm.convert.from.fp16.f64(i16 %tmp) + ret double %conv +} + +define float @load2(i16* nocapture readonly %a, i32 %i) nounwind { +; CHECK-LABEL: load2: +; CHECK-NEXT: ldr [[HREG:h[0-9]+]], [x0, w1, sxtw #1] +; CHECK-NEXT: fcvt s0, [[HREG]] +; CHECK-NEXT: ret + + %idxprom = sext i32 %i to i64 + %arrayidx = getelementptr inbounds i16* %a, i64 %idxprom + %tmp = load i16* %arrayidx, align 2 + %tmp1 = tail call float @llvm.convert.from.fp16.f32(i16 %tmp) + ret float %tmp1 +} + +define double @load3(i16* nocapture readonly %a, i32 %i) nounwind { +; CHECK-LABEL: load3: +; CHECK-NEXT: ldr [[HREG:h[0-9]+]], [x0, w1, sxtw #1] +; CHECK-NEXT: fcvt d0, [[HREG]] +; CHECK-NEXT: ret + + %idxprom = sext i32 %i to i64 + %arrayidx = getelementptr inbounds i16* %a, i64 %idxprom + %tmp = load i16* %arrayidx, align 2 + %conv = tail call double @llvm.convert.from.fp16.f64(i16 %tmp) + ret double %conv +} + +define float @load4(i16* nocapture readonly %a, i64 %i) nounwind { +; CHECK-LABEL: load4: +; CHECK-NEXT: ldr [[HREG:h[0-9]+]], [x0, x1, lsl #1] +; CHECK-NEXT: fcvt s0, [[HREG]] +; CHECK-NEXT: ret + + %arrayidx = getelementptr inbounds i16* %a, i64 %i + %tmp = load i16* %arrayidx, align 2 + %tmp1 = tail call float @llvm.convert.from.fp16.f32(i16 %tmp) + ret float %tmp1 +} + +define double @load5(i16* nocapture readonly %a, i64 %i) nounwind { +; CHECK-LABEL: load5: +; CHECK-NEXT: ldr [[HREG:h[0-9]+]], [x0, x1, lsl #1] +; CHECK-NEXT: fcvt d0, [[HREG]] +; CHECK-NEXT: ret + + %arrayidx = getelementptr inbounds i16* %a, i64 %i + %tmp = load i16* %arrayidx, align 2 + %conv = tail call double @llvm.convert.from.fp16.f64(i16 %tmp) + ret double %conv +} + +define float @load6(i16* nocapture readonly %a) nounwind { +; CHECK-LABEL: load6: +; CHECK-NEXT: ldr [[HREG:h[0-9]+]], [x0, #20] +; CHECK-NEXT: fcvt s0, [[HREG]] +; CHECK-NEXT: ret + + %arrayidx = getelementptr inbounds i16* %a, i64 10 + %tmp = load i16* %arrayidx, align 2 + %tmp1 = tail call float @llvm.convert.from.fp16.f32(i16 %tmp) + ret float %tmp1 +} + +define double @load7(i16* nocapture readonly 
%a) nounwind { +; CHECK-LABEL: load7: +; CHECK-NEXT: ldr [[HREG:h[0-9]+]], [x0, #20] +; CHECK-NEXT: fcvt d0, [[HREG]] +; CHECK-NEXT: ret + + %arrayidx = getelementptr inbounds i16* %a, i64 10 + %tmp = load i16* %arrayidx, align 2 + %conv = tail call double @llvm.convert.from.fp16.f64(i16 %tmp) + ret double %conv +} + +define float @load8(i16* nocapture readonly %a) nounwind { +; CHECK-LABEL: load8: +; CHECK-NEXT: ldur [[HREG:h[0-9]+]], [x0, #-20] +; CHECK-NEXT: fcvt s0, [[HREG]] +; CHECK-NEXT: ret + + %arrayidx = getelementptr inbounds i16* %a, i64 -10 + %tmp = load i16* %arrayidx, align 2 + %tmp1 = tail call float @llvm.convert.from.fp16.f32(i16 %tmp) + ret float %tmp1 +} + +define double @load9(i16* nocapture readonly %a) nounwind { +; CHECK-LABEL: load9: +; CHECK-NEXT: ldur [[HREG:h[0-9]+]], [x0, #-20] +; CHECK-NEXT: fcvt d0, [[HREG]] +; CHECK-NEXT: ret + + %arrayidx = getelementptr inbounds i16* %a, i64 -10 + %tmp = load i16* %arrayidx, align 2 + %conv = tail call double @llvm.convert.from.fp16.f64(i16 %tmp) + ret double %conv +} + +define void @store0(i16* nocapture %a, float %val) nounwind { +; CHECK-LABEL: store0: +; CHECK-NEXT: fcvt h0, s0 +; CHECK-NEXT: str h0, [x0] +; CHECK-NEXT: ret + + %tmp = tail call i16 @llvm.convert.to.fp16.f32(float %val) + store i16 %tmp, i16* %a, align 2 + ret void +} + +define void @store1(i16* nocapture %a, double %val) nounwind { +; CHECK-LABEL: store1: +; CHECK-NEXT: fcvt h0, d0 +; CHECK-NEXT: str h0, [x0] +; CHECK-NEXT: ret + + %conv = fptrunc double %val to float + %tmp = tail call i16 @llvm.convert.to.fp16.f32(float %conv) + store i16 %tmp, i16* %a, align 2 + ret void +} + +define void @store2(i16* nocapture %a, i32 %i, float %val) nounwind { +; CHECK-LABEL: store2: +; CHECK-NEXT: fcvt h0, s0 +; CHECK-NEXT: str h0, [x0, w1, sxtw #1] +; CHECK-NEXT: ret + + %tmp = tail call i16 @llvm.convert.to.fp16.f32(float %val) + %idxprom = sext i32 %i to i64 + %arrayidx = getelementptr inbounds i16* %a, i64 %idxprom + store i16 %tmp, i16* %arrayidx, align 2 + ret void +} + +define void @store3(i16* nocapture %a, i32 %i, double %val) nounwind { +; CHECK-LABEL: store3: +; CHECK-NEXT: fcvt h0, d0 +; CHECK-NEXT: str h0, [x0, w1, sxtw #1] +; CHECK-NEXT: ret + + %conv = fptrunc double %val to float + %tmp = tail call i16 @llvm.convert.to.fp16.f32(float %conv) + %idxprom = sext i32 %i to i64 + %arrayidx = getelementptr inbounds i16* %a, i64 %idxprom + store i16 %tmp, i16* %arrayidx, align 2 + ret void +} + +define void @store4(i16* nocapture %a, i64 %i, float %val) nounwind { +; CHECK-LABEL: store4: +; CHECK-NEXT: fcvt h0, s0 +; CHECK-NEXT: str h0, [x0, x1, lsl #1] +; CHECK-NEXT: ret + + %tmp = tail call i16 @llvm.convert.to.fp16.f32(float %val) + %arrayidx = getelementptr inbounds i16* %a, i64 %i + store i16 %tmp, i16* %arrayidx, align 2 + ret void +} + +define void @store5(i16* nocapture %a, i64 %i, double %val) nounwind { +; CHECK-LABEL: store5: +; CHECK-NEXT: fcvt h0, d0 +; CHECK-NEXT: str h0, [x0, x1, lsl #1] +; CHECK-NEXT: ret + + %conv = fptrunc double %val to float + %tmp = tail call i16 @llvm.convert.to.fp16.f32(float %conv) + %arrayidx = getelementptr inbounds i16* %a, i64 %i + store i16 %tmp, i16* %arrayidx, align 2 + ret void +} + +define void @store6(i16* nocapture %a, float %val) nounwind { +; CHECK-LABEL: store6: +; CHECK-NEXT: fcvt h0, s0 +; CHECK-NEXT: str h0, [x0, #20] +; CHECK-NEXT: ret + + %tmp = tail call i16 @llvm.convert.to.fp16.f32(float %val) + %arrayidx = getelementptr inbounds i16* %a, i64 10 + store i16 %tmp, i16* %arrayidx, align 2 + ret 
void +} + +define void @store7(i16* nocapture %a, double %val) nounwind { +; CHECK-LABEL: store7: +; CHECK-NEXT: fcvt h0, d0 +; CHECK-NEXT: str h0, [x0, #20] +; CHECK-NEXT: ret + + %conv = fptrunc double %val to float + %tmp = tail call i16 @llvm.convert.to.fp16.f32(float %conv) + %arrayidx = getelementptr inbounds i16* %a, i64 10 + store i16 %tmp, i16* %arrayidx, align 2 + ret void +} + +define void @store8(i16* nocapture %a, float %val) nounwind { +; CHECK-LABEL: store8: +; CHECK-NEXT: fcvt h0, s0 +; CHECK-NEXT: stur h0, [x0, #-20] +; CHECK-NEXT: ret + + %tmp = tail call i16 @llvm.convert.to.fp16.f32(float %val) + %arrayidx = getelementptr inbounds i16* %a, i64 -10 + store i16 %tmp, i16* %arrayidx, align 2 + ret void +} + +define void @store9(i16* nocapture %a, double %val) nounwind { +; CHECK-LABEL: store9: +; CHECK-NEXT: fcvt h0, d0 +; CHECK-NEXT: stur h0, [x0, #-20] +; CHECK-NEXT: ret + + %conv = fptrunc double %val to float + %tmp = tail call i16 @llvm.convert.to.fp16.f32(float %conv) + %arrayidx = getelementptr inbounds i16* %a, i64 -10 + store i16 %tmp, i16* %arrayidx, align 2 + ret void +} + +declare i16 @llvm.convert.to.fp16.f32(float) nounwind readnone +declare float @llvm.convert.from.fp16.f32(i16) nounwind readnone +declare i16 @llvm.convert.to.fp16.f64(double) nounwind readnone +declare double @llvm.convert.from.fp16.f64(i16) nounwind readnone diff --git a/test/CodeGen/AArch64/fast-isel-mul.ll b/test/CodeGen/AArch64/fast-isel-mul.ll new file mode 100644 index 000000000000..d02c67f52f8d --- /dev/null +++ b/test/CodeGen/AArch64/fast-isel-mul.ll @@ -0,0 +1,40 @@ +; RUN: llc -fast-isel -fast-isel-abort -mtriple=aarch64 -o - %s | FileCheck %s + +@var8 = global i8 0 +@var16 = global i16 0 +@var32 = global i32 0 +@var64 = global i64 0 + +define void @test_mul8(i8 %lhs, i8 %rhs) { +; CHECK-LABEL: test_mul8: +; CHECK: mul w0, w0, w1 +; %lhs = load i8* @var8 +; %rhs = load i8* @var8 + %prod = mul i8 %lhs, %rhs + store i8 %prod, i8* @var8 + ret void +} + +define void @test_mul16(i16 %lhs, i16 %rhs) { +; CHECK-LABEL: test_mul16: +; CHECK: mul w0, w0, w1 + %prod = mul i16 %lhs, %rhs + store i16 %prod, i16* @var16 + ret void +} + +define void @test_mul32(i32 %lhs, i32 %rhs) { +; CHECK-LABEL: test_mul32: +; CHECK: mul w0, w0, w1 + %prod = mul i32 %lhs, %rhs + store i32 %prod, i32* @var32 + ret void +} + +define void @test_mul64(i64 %lhs, i64 %rhs) { +; CHECK-LABEL: test_mul64: +; CHECK: mul x0, x0, x1 + %prod = mul i64 %lhs, %rhs + store i64 %prod, i64* @var64 + ret void +} diff --git a/test/CodeGen/AArch64/funcptr_cast.ll b/test/CodeGen/AArch64/funcptr_cast.ll new file mode 100644 index 000000000000..a00b7bcaf6a2 --- /dev/null +++ b/test/CodeGen/AArch64/funcptr_cast.ll @@ -0,0 +1,13 @@ +; RUN: llc < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s + +define i8 @test() { +; CHECK-LABEL: @test +; CHECK: adrp {{x[0-9]+}}, foo +; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, :lo12:foo +; CHECK: ldrb w0, [{{x[0-9]+}}] +entry: + %0 = load i8* bitcast (void (...)* @foo to i8*), align 1 + ret i8 %0 +} + +declare void @foo(...) 
diff --git a/test/CodeGen/AArch64/half.ll b/test/CodeGen/AArch64/half.ll new file mode 100644 index 000000000000..a46094b9fb85 --- /dev/null +++ b/test/CodeGen/AArch64/half.ll @@ -0,0 +1,83 @@ +; RUN: llc < %s -mtriple=arm64-apple-ios7.0 | FileCheck %s + +define void @test_load_store(half* %in, half* %out) { +; CHECK-LABEL: test_load_store: +; CHECK: ldr [[TMP:h[0-9]+]], [x0] +; CHECK: str [[TMP]], [x1] + %val = load half* %in + store half %val, half* %out + ret void +} + +define i16 @test_bitcast_from_half(half* %addr) { +; CHECK-LABEL: test_bitcast_from_half: +; CHECK: ldrh w0, [x0] + %val = load half* %addr + %val_int = bitcast half %val to i16 + ret i16 %val_int +} + +define i16 @test_reg_bitcast_from_half(half %in) { +; CHECK-LABEL: test_reg_bitcast_from_half: +; CHECK-NOT: str +; CHECK-NOT: ldr +; CHECK-DAG: fmov w0, s0 +; CHECK: ret + %val = bitcast half %in to i16 + ret i16 %val +} + +define void @test_bitcast_to_half(half* %addr, i16 %in) { +; CHECK-LABEL: test_bitcast_to_half: +; CHECK: strh w1, [x0] + %val_fp = bitcast i16 %in to half + store half %val_fp, half* %addr + ret void +} + +define half @test_reg_bitcast_to_half(i16 %in) { +; CHECK-LABEL: test_reg_bitcast_to_half: +; CHECK-NOT: str +; CHECK-NOT: ldr +; CHECK-DAG: fmov s0, w0 +; CHECK: ret + + %val = bitcast i16 %in to half + ret half %val +} + +define float @test_extend32(half* %addr) { +; CHECK-LABEL: test_extend32: +; CHECK: fcvt {{s[0-9]+}}, {{h[0-9]+}} + + %val16 = load half* %addr + %val32 = fpext half %val16 to float + ret float %val32 +} + +define double @test_extend64(half* %addr) { +; CHECK-LABEL: test_extend64: +; CHECK: fcvt {{d[0-9]+}}, {{h[0-9]+}} + + %val16 = load half* %addr + %val32 = fpext half %val16 to double + ret double %val32 +} + +define void @test_trunc32(float %in, half* %addr) { +; CHECK-LABEL: test_trunc32: +; CHECK: fcvt {{h[0-9]+}}, {{s[0-9]+}} + + %val16 = fptrunc float %in to half + store half %val16, half* %addr + ret void +} + +define void @test_trunc64(double %in, half* %addr) { +; CHECK-LABEL: test_trunc64: +; CHECK: fcvt {{h[0-9]+}}, {{d[0-9]+}} + + %val16 = fptrunc double %in to half + store half %val16, half* %addr + ret void +} diff --git a/test/CodeGen/AArch64/hints.ll b/test/CodeGen/AArch64/hints.ll new file mode 100644 index 000000000000..d7d9e23af1f1 --- /dev/null +++ b/test/CodeGen/AArch64/hints.ll @@ -0,0 +1,67 @@ +; RUN: llc -mtriple aarch64-eabi -o - %s | FileCheck %s + +declare void @llvm.aarch64.hint(i32) nounwind + +define void @hint_nop() { +entry: + tail call void @llvm.aarch64.hint(i32 0) nounwind + ret void +} + +; CHECK-LABEL: hint_nop +; CHECK: nop + +define void @hint_yield() { +entry: + tail call void @llvm.aarch64.hint(i32 1) nounwind + ret void +} + +; CHECK-LABEL: hint_yield +; CHECK: yield + +define void @hint_wfe() { +entry: + tail call void @llvm.aarch64.hint(i32 2) nounwind + ret void +} + +; CHECK-LABEL: hint_wfe +; CHECK: wfe + +define void @hint_wfi() { +entry: + tail call void @llvm.aarch64.hint(i32 3) nounwind + ret void +} + +; CHECK-LABEL: hint_wfi +; CHECK: wfi + +define void @hint_sev() { +entry: + tail call void @llvm.aarch64.hint(i32 4) nounwind + ret void +} + +; CHECK-LABEL: hint_sev +; CHECK: sev + +define void @hint_sevl() { +entry: + tail call void @llvm.aarch64.hint(i32 5) nounwind + ret void +} + +; CHECK-LABEL: hint_sevl +; CHECK: sevl + +define void @hint_undefined() { +entry: + tail call void @llvm.aarch64.hint(i32 8) nounwind + ret void +} + +; CHECK-LABEL: hint_undefined +; CHECK: hint #0x8 + diff --git 
a/test/CodeGen/AArch64/i128-fast-isel-fallback.ll b/test/CodeGen/AArch64/i128-fast-isel-fallback.ll new file mode 100644 index 000000000000..1cffbf3de052 --- /dev/null +++ b/test/CodeGen/AArch64/i128-fast-isel-fallback.ll @@ -0,0 +1,18 @@ +; RUN: llc -O0 -mtriple=arm64-apple-ios7.0 -mcpu=generic < %s | FileCheck %s + +; Function Attrs: nounwind ssp +define void @test1() { + %1 = sext i32 0 to i128 + call void @test2(i128 %1) + ret void + +; The i128 is 0, so we can test to make sure it is propagated into the x +; registers that make up the i128 pair + +; CHECK: mov x0, xzr +; CHECK: mov x1, x0 +; CHECK: bl _test2 + +} + +declare void @test2(i128) diff --git a/test/CodeGen/AArch64/inlineasm-ldr-pseudo.ll b/test/CodeGen/AArch64/inlineasm-ldr-pseudo.ll new file mode 100644 index 000000000000..645214ac8ec7 --- /dev/null +++ b/test/CodeGen/AArch64/inlineasm-ldr-pseudo.ll @@ -0,0 +1,26 @@ +; We actually need to use -filetype=obj in this test because if we output +; assembly, the current code path will bypass the parser and just write the +; raw text out to the Streamer. We need to actually parse the inlineasm to +; demonstrate the bug. Going the asm->obj route does not show the issue. +; RUN: llc -mtriple=aarch64 < %s -filetype=obj | llvm-objdump -arch=aarch64 -d - | FileCheck %s + +; CHECK-LABEL: foo: +; CHECK: a0 79 95 d2 movz x0, #0xabcd +; CHECK: c0 03 5f d6 ret +define i32 @foo() nounwind { +entry: + %0 = tail call i32 asm sideeffect "ldr $0,=0xabcd", "=r"() nounwind + ret i32 %0 +} +; CHECK-LABEL: bar: +; CHECK: 40 00 00 58 ldr x0, #8 +; CHECK: c0 03 5f d6 ret +; Make sure the constant pool entry comes after the return +; CHECK-LABEL: $d.1: +define i32 @bar() nounwind { +entry: + %0 = tail call i32 asm sideeffect "ldr $0,=0x10001", "=r"() nounwind + ret i32 %0 +} + + diff --git a/test/CodeGen/AArch64/intrinsics-memory-barrier.ll b/test/CodeGen/AArch64/intrinsics-memory-barrier.ll new file mode 100644 index 000000000000..09e34ae2d2ed --- /dev/null +++ b/test/CodeGen/AArch64/intrinsics-memory-barrier.ll @@ -0,0 +1,57 @@ +; RUN: llc < %s -mtriple=aarch64-eabi -O=3 | FileCheck %s + +define void @test() { + ; CHECK: dmb sy + call void @llvm.aarch64.dmb(i32 15) + ; CHECK: dmb osh + call void @llvm.aarch64.dmb(i32 3) + ; CHECK: dsb sy + call void @llvm.aarch64.dsb(i32 15) + ; CHECK: dsb ishld + call void @llvm.aarch64.dsb(i32 9) + ; CHECK: isb + call void @llvm.aarch64.isb(i32 15) + ret void +} + +; The important point is that the compiler should not reorder memory access +; instructions around the DMB. +; If it did, the two STRs below would collapse into one STP. +define void @test_dmb_reordering(i32 %a, i32 %b, i32* %d) { + store i32 %a, i32* %d ; CHECK: str {{w[0-9]+}}, [{{x[0-9]+}}] + + call void @llvm.aarch64.dmb(i32 15); CHECK: dmb sy + + %d1 = getelementptr i32* %d, i64 1 + store i32 %b, i32* %d1 ; CHECK: str {{w[0-9]+}}, [{{x[0-9]+}}, #4] + + ret void +} + +; Similarly for DSB. +define void @test_dsb_reordering(i32 %a, i32 %b, i32* %d) { + store i32 %a, i32* %d ; CHECK: str {{w[0-9]+}}, [{{x[0-9]+}}] + + call void @llvm.aarch64.dsb(i32 15); CHECK: dsb sy + + %d1 = getelementptr i32* %d, i64 1 + store i32 %b, i32* %d1 ; CHECK: str {{w[0-9]+}}, [{{x[0-9]+}}, #4] + + ret void +} + +; And ISB. 
+define void @test_isb_reordering(i32 %a, i32 %b, i32* %d) { + store i32 %a, i32* %d ; CHECK: str {{w[0-9]+}}, [{{x[0-9]+}}] + + call void @llvm.aarch64.isb(i32 15); CHECK: isb + + %d1 = getelementptr i32* %d, i64 1 + store i32 %b, i32* %d1 ; CHECK: str {{w[0-9]+}}, [{{x[0-9]+}}, #4] + + ret void +} + +declare void @llvm.aarch64.dmb(i32) +declare void @llvm.aarch64.dsb(i32) +declare void @llvm.aarch64.isb(i32) diff --git a/test/CodeGen/AArch64/memcpy-f128.ll b/test/CodeGen/AArch64/memcpy-f128.ll new file mode 100644 index 000000000000..76db2974ab4d --- /dev/null +++ b/test/CodeGen/AArch64/memcpy-f128.ll @@ -0,0 +1,19 @@ +; RUN: llc < %s -march=aarch64 -mtriple=aarch64-linux-gnu | FileCheck %s + +%structA = type { i128 } +@stubA = internal unnamed_addr constant %structA zeroinitializer, align 8 + +; Make sure we don't hit llvm_unreachable. + +define void @test1() { +; CHECK-LABEL: @test1 +; CHECK: adrp +; CHECK: ldr q0 +; CHECK: str q0 +; CHECK: ret +entry: + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* undef, i8* bitcast (%structA* @stubA to i8*), i64 48, i32 8, i1 false) + ret void +} + +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) diff --git a/test/CodeGen/AArch64/mul_pow2.ll b/test/CodeGen/AArch64/mul_pow2.ll new file mode 100644 index 000000000000..efc0ec8c40e3 --- /dev/null +++ b/test/CodeGen/AArch64/mul_pow2.ll @@ -0,0 +1,123 @@ +; RUN: llc < %s -march=aarch64 | FileCheck %s + +; Convert mul x, pow2 to shift. +; Convert mul x, pow2 +/- 1 to shift + add/sub. + +define i32 @test2(i32 %x) { +; CHECK-LABEL: test2 +; CHECK: lsl w0, w0, #1 + + %mul = shl nsw i32 %x, 1 + ret i32 %mul +} + +define i32 @test3(i32 %x) { +; CHECK-LABEL: test3 +; CHECK: add w0, w0, w0, lsl #1 + + %mul = mul nsw i32 %x, 3 + ret i32 %mul +} + +define i32 @test4(i32 %x) { +; CHECK-LABEL: test4 +; CHECK: lsl w0, w0, #2 + + %mul = shl nsw i32 %x, 2 + ret i32 %mul +} + +define i32 @test5(i32 %x) { +; CHECK-LABEL: test5 +; CHECK: add w0, w0, w0, lsl #2 + + + %mul = mul nsw i32 %x, 5 + ret i32 %mul +} + +define i32 @test7(i32 %x) { +; CHECK-LABEL: test7 +; CHECK: lsl {{w[0-9]+}}, w0, #3 +; CHECK: sub w0, {{w[0-9]+}}, w0 + + %mul = mul nsw i32 %x, 7 + ret i32 %mul +} + +define i32 @test8(i32 %x) { +; CHECK-LABEL: test8 +; CHECK: lsl w0, w0, #3 + + %mul = shl nsw i32 %x, 3 + ret i32 %mul +} + +define i32 @test9(i32 %x) { +; CHECK-LABEL: test9 +; CHECK: add w0, w0, w0, lsl #3 + + %mul = mul nsw i32 %x, 9 + ret i32 %mul +} + +; Convert mul x, -pow2 to shift. +; Convert mul x, -(pow2 +/- 1) to shift + add/sub. 
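+; For example, with the illustrative value x = 6 (not part of the test):
+;   6 * -9 = -(6 * 9) = -((6 << 3) + 6) = -(48 + 6) = -54,
+; so the expected lowering for a multiply by -9 below is an add of w0 and
+; w0 << 3 followed by a neg, rather than a mul.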
+ +define i32 @ntest2(i32 %x) { +; CHECK-LABEL: ntest2 +; CHECK: neg w0, w0, lsl #1 + + %mul = mul nsw i32 %x, -2 + ret i32 %mul +} + +define i32 @ntest3(i32 %x) { +; CHECK-LABEL: ntest3 +; CHECK: add {{w[0-9]+}}, w0, w0, lsl #1 +; CHECK: neg w0, {{w[0-9]+}} + + %mul = mul nsw i32 %x, -3 + ret i32 %mul +} + +define i32 @ntest4(i32 %x) { +; CHECK-LABEL: ntest4 +; CHECK: neg w0, w0, lsl #2 + + %mul = mul nsw i32 %x, -4 + ret i32 %mul +} + +define i32 @ntest5(i32 %x) { +; CHECK-LABEL: ntest5 +; CHECK: add {{w[0-9]+}}, w0, w0, lsl #2 +; CHECK: neg w0, {{w[0-9]+}} + %mul = mul nsw i32 %x, -5 + ret i32 %mul +} + +define i32 @ntest7(i32 %x) { +; CHECK-LABEL: ntest7 +; CHECK: sub w0, w0, w0, lsl #3 + + %mul = mul nsw i32 %x, -7 + ret i32 %mul +} + +define i32 @ntest8(i32 %x) { +; CHECK-LABEL: ntest8 +; CHECK: neg w0, w0, lsl #3 + + %mul = mul nsw i32 %x, -8 + ret i32 %mul +} + +define i32 @ntest9(i32 %x) { +; CHECK-LABEL: ntest9 +; CHECK: add {{w[0-9]+}}, w0, w0, lsl #3 +; CHECK: neg w0, {{w[0-9]+}} + + %mul = mul nsw i32 %x, -9 + ret i32 %mul +} diff --git a/test/CodeGen/AArch64/neon-scalar-copy.ll b/test/CodeGen/AArch64/neon-scalar-copy.ll index a01df3275a99..6afac315a961 100644 --- a/test/CodeGen/AArch64/neon-scalar-copy.ll +++ b/test/CodeGen/AArch64/neon-scalar-copy.ll @@ -101,3 +101,20 @@ define <1 x i64> @test_vector_copy_dup_dv2D(<1 x i64> %a, <2 x i64> %c) { ret <1 x i64> %vset_lane } +; Undefined behaviour, so we really don't care what actually gets emitted, just +; as long as we don't crash (since it could be dynamically unreachable). +define i32 @test_out_of_range_extract(<4 x i32> %vec) { +; CHECK-LABEL: test_out_of_range_extract: +; CHECK: ret + %elt = extractelement <4 x i32> %vec, i32 4 + ret i32 %elt +} + +; Undefined behaviour, so we really don't care what actually gets emitted, just +; as long as we don't crash (since it could be dynamically unreachable). +define void @test_out_of_range_insert(<4 x i32> %vec, i32 %elt) { +; CHECK-LABEL: test_out_of_range_insert: +; CHECK: ret + insertelement <4 x i32> %vec, i32 %elt, i32 4 + ret void +} diff --git a/test/CodeGen/AArch64/trunc-v1i64.ll b/test/CodeGen/AArch64/trunc-v1i64.ll new file mode 100644 index 000000000000..159b8e0cff33 --- /dev/null +++ b/test/CodeGen/AArch64/trunc-v1i64.ll @@ -0,0 +1,63 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon -verify-machineinstrs < %s | FileCheck %s + +; An optimization in the DAG Combiner to fold +; (trunc (concat ... x ...)) -> (concat ..., (trunc x), ...) +; will generate nodes like: +; v1i32 trunc v1i64, v1i16 trunc v1i64, v1i8 trunc v1i64. +; Such nodes are scalarized by default during type legalization, but that +; scalarization causes an assertion failure, as v1i64 is a legal type in +; AArch64. We change the default behaviour from scalarizing to widening. + +; FIXME: Currently an XTN is generated for v1i32, but it could be avoided; +; just as for v1i16 and v1i8, no XTN should be generated. 
+ +define <2 x i32> @test_v1i32_0(<1 x i64> %in0) { +; CHECK-LABEL: test_v1i32_0: +; CHECK: xtn v0.2s, v0.2d + %1 = shufflevector <1 x i64> %in0, <1 x i64> undef, <2 x i32> <i32 0, i32 undef> + %2 = trunc <2 x i64> %1 to <2 x i32> + ret <2 x i32> %2 +} + +define <2 x i32> @test_v1i32_1(<1 x i64> %in0) { +; CHECK-LABEL: test_v1i32_1: +; CHECK: xtn v0.2s, v0.2d +; CHECK-NEXT: dup v0.2s, v0.s[0] + %1 = shufflevector <1 x i64> %in0, <1 x i64> undef, <2 x i32> zeroinitializer + %2 = trunc <2 x i64> %1 to <2 x i32> + ret <2 x i32> %2 +} + +define <4 x i16> @test_v1i16_0(<1 x i64> %in0) { +; CHECK-LABEL: test_v1i16_0: +; CHECK-NOT: xtn + %1 = shufflevector <1 x i64> %in0, <1 x i64> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef> + %2 = trunc <4 x i64> %1 to <4 x i16> + ret <4 x i16> %2 +} + +define <4 x i16> @test_v1i16_1(<1 x i64> %in0) { +; CHECK-LABEL: test_v1i16_1: +; CHECK-NOT: xtn +; CHECK: dup v0.4h, v0.h[0] + %1 = shufflevector <1 x i64> %in0, <1 x i64> undef, <4 x i32> zeroinitializer + %2 = trunc <4 x i64> %1 to <4 x i16> + ret <4 x i16> %2 +} + +define <8 x i8> @test_v1i8_0(<1 x i64> %in0) { +; CHECK-LABEL: test_v1i8_0: +; CHECK-NOT: xtn + %1 = shufflevector <1 x i64> %in0, <1 x i64> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %2 = trunc <8 x i64> %1 to <8 x i8> + ret <8 x i8> %2 +} + +define <8 x i8> @test_v1i8_1(<1 x i64> %in0) { +; CHECK-LABEL: test_v1i8_1: +; CHECK-NOT: xtn +; CHECK: dup v0.8b, v0.b[0] + %1 = shufflevector <1 x i64> %in0, <1 x i64> undef, <8 x i32> zeroinitializer + %2 = trunc <8 x i64> %1 to <8 x i8> + ret <8 x i8> %2 +} \ No newline at end of file diff --git a/test/CodeGen/ARM/2014-07-18-earlyclobber-str-post.ll b/test/CodeGen/ARM/2014-07-18-earlyclobber-str-post.ll new file mode 100644 index 000000000000..9ea762ae9bff --- /dev/null +++ b/test/CodeGen/ARM/2014-07-18-earlyclobber-str-post.ll @@ -0,0 +1,13 @@ +; RUN: llc -mtriple=armv7-linux-gnueabihf %s -o - | FileCheck %s + +; Check that we don't create an unpredictable STR instruction, +; e.g. 
str r0, [r0], #4 + +define i32* @earlyclobber-str-post(i32* %addr) nounwind { +; CHECK: earlyclobber-str-post +; CHECK-NOT: str r[[REG:[0-9]+]], [r[[REG]]], #4 + %val = ptrtoint i32* %addr to i32 + store i32 %val, i32* %addr + %new = getelementptr i32* %addr, i32 1 + ret i32* %new +} diff --git a/test/CodeGen/ARM/Windows/alloca.ll b/test/CodeGen/ARM/Windows/alloca.ll new file mode 100644 index 000000000000..6a3d002ab3b3 --- /dev/null +++ b/test/CodeGen/ARM/Windows/alloca.ll @@ -0,0 +1,22 @@ +; RUN: llc -O0 -mtriple thumbv7-windows-itanium -filetype asm -o - %s | FileCheck %s + +declare arm_aapcs_vfpcc i32 @num_entries() + +define arm_aapcs_vfpcc void @test___builtin_alloca() { +entry: + %array = alloca i8*, align 4 + %call = call arm_aapcs_vfpcc i32 @num_entries() + %mul = mul i32 4, %call + %0 = alloca i8, i32 %mul + store i8* %0, i8** %array, align 4 + ret void +} + +; CHECK: bl num_entries +; CHECK: movs [[R1:r[0-9]+]], #7 +; CHECK: add.w [[R0:r[0-9]+]], [[R1]], [[R0]], lsl #2 +; CHECK: bic [[R0]], [[R0]], #7 +; CHECK: lsrs r4, [[R0]], #2 +; CHECK: bl __chkstk +; CHECK: sub.w sp, sp, r4 + diff --git a/test/CodeGen/ARM/Windows/dllimport.ll b/test/CodeGen/ARM/Windows/dllimport.ll new file mode 100644 index 000000000000..bc737bd41827 --- /dev/null +++ b/test/CodeGen/ARM/Windows/dllimport.ll @@ -0,0 +1,61 @@ +; RUN: llc -mtriple thumbv7-windows -filetype asm -o - %s | FileCheck %s + +; ModuleID = 'dllimport.c' + +@var = external dllimport global i32 +@ext = external global i32 +declare dllimport arm_aapcs_vfpcc i32 @external() +declare arm_aapcs_vfpcc i32 @internal() + +define arm_aapcs_vfpcc i32 @get_var() { + %1 = load i32* @var, align 4 + ret i32 %1 +} + +; CHECK-LABEL: get_var +; CHECK: movw r0, :lower16:__imp_var +; CHECK: movt r0, :upper16:__imp_var +; CHECK: ldr r0, [r0] +; CHECK: ldr r0, [r0] +; CHECK: bx lr + +define arm_aapcs_vfpcc i32 @get_ext() { + %1 = load i32* @ext, align 4 + ret i32 %1 +} + +; CHECK-LABEL: get_ext +; CHECK: movw r0, :lower16:ext +; CHECK: movt r0, :upper16:ext +; CHECK: ldr r0, [r0] +; CHECK: bx lr + +define arm_aapcs_vfpcc i32* @get_var_pointer() { + ret i32* @var +} + +; CHECK-LABEL: get_var_pointer +; CHECK: movw r0, :lower16:__imp_var +; CHECK: movt r0, :upper16:__imp_var +; CHECK: ldr r0, [r0] +; CHECK: bx lr + +define arm_aapcs_vfpcc i32 @call_external() { + %call = tail call arm_aapcs_vfpcc i32 @external() + ret i32 %call +} + +; CHECK-LABEL: call_external +; CHECK: movw r0, :lower16:__imp_external +; CHECK: movt r0, :upper16:__imp_external +; CHECK: ldr r0, [r0] +; CHECK: bx r0 + +define arm_aapcs_vfpcc i32 @call_internal() { + %call = tail call arm_aapcs_vfpcc i32 @internal() + ret i32 %call +} + +; CHECK-LABEL: call_internal +; CHECK: b internal + diff --git a/test/CodeGen/ARM/atomic-cmpxchg.ll b/test/CodeGen/ARM/atomic-cmpxchg.ll new file mode 100644 index 000000000000..4b79fa25145b --- /dev/null +++ b/test/CodeGen/ARM/atomic-cmpxchg.ll @@ -0,0 +1,50 @@ +; RUN: llc < %s -mtriple=arm-linux-gnueabi -verify-machineinstrs | FileCheck %s -check-prefix=CHECK-ARM +; RUN: llc < %s -mtriple=thumb-linux-gnueabi -verify-machineinstrs | FileCheck %s -check-prefix=CHECK-THUMB + +; RUN: llc < %s -mtriple=armv7-linux-gnueabi -verify-machineinstrs | FileCheck %s -check-prefix=CHECK-ARMV7 +; RUN: llc < %s -mtriple=thumbv7-linux-gnueabi -verify-machineinstrs | FileCheck %s -check-prefix=CHECK-THUMBV7 + +define zeroext i1 @test_cmpxchg_res_i8(i8* %addr, i8 %desired, i8 zeroext %new) { +entry: + %0 = cmpxchg i8* %addr, i8 %desired, i8 %new monotonic monotonic + %1 = 
extractvalue { i8, i1 } %0, 1 + ret i1 %1 +} + +; CHECK-ARM-LABEL: test_cmpxchg_res_i8 +; CHECK-ARM: bl __sync_val_compare_and_swap_1 +; CHECK-ARM: mov [[REG:r[0-9]+]], #0 +; CHECK-ARM: cmp r0, {{r[0-9]+}} +; CHECK-ARM: moveq [[REG]], #1 +; CHECK-ARM: mov r0, [[REG]] + +; CHECK-THUMB-LABEL: test_cmpxchg_res_i8 +; CHECK-THUMB: bl __sync_val_compare_and_swap_1 +; CHECK-THUMB: mov [[R1:r[0-9]+]], r0 +; CHECK-THUMB: movs r0, #1 +; CHECK-THUMB: movs [[R2:r[0-9]+]], #0 +; CHECK-THUMB: cmp [[R1]], {{r[0-9]+}} +; CHECK-THU* %loadaddr, <2 x i64>* %storeaddr ) { +; CHECK-LABEL: vector_ext_2i8_to_2i64: +; CHECK: vld1.16 {[[REG:d[0-9]+]] +; CHECK: vmov.i64 {{q[0-9]+}}, #0xff +; CHECK: vrev16.8 [[REG]], [[REG]] +; CHECK: vmovl.u8 {{q[0-9]+}}, [[REG]] + %1 = load <2 x i8>* %loadaddr + %2 = zext <2 x i8> %1 to <2 x i64> + store <2 x i64> %2, <2 x i64>* %storeaddr + ret void +} + +define void @vector_ext_2i16_to_2i64( <2 x i16>* %loadaddr, <2 x i64>* %storeaddr ) { +; CHECK-LABEL: vector_ext_2i16_to_2i64: +; CHECK: vld1.32 {[[REG:d[0-9]+]] +; CHECK: vmov.i64 {{q[0-9]+}}, #0xffff +; CHECK: vrev32.16 [[REG]], [[REG]] +; CHECK: vmovl.u16 {{q[0-9]+}}, [[REG]] + %1 = load <2 x i16>* %loadaddr + %2 = zext <2 x i16> %1 to <2 x i64> + store <2 x i64> %2, <2 x i64>* %storeaddr + ret void +} + + +define void @vector_ext_2i8_to_2i32( <2 x i8>* %loadaddr, <2 x i32>* %storeaddr ) { +; CHECK-LABEL: vector_ext_2i8_to_2i32: +; CHECK: vld1.16 {[[REG:d[0-9]+]] +; CHECK: vrev16.8 [[REG]], [[REG]] + %1 = load <2 x i8>* %loadaddr + %2 = zext <2 x i8> %1 to <2 x i32> + store <2 x i32> %2, <2 x i32>* %storeaddr + ret void +} + +define void @vector_ext_2i16_to_2i32( <2 x i16>* %loadaddr, <2 x i32>* %storeaddr ) { +; CHECK-LABEL: vector_ext_2i16_to_2i32: +; CHECK: vld1.32 {[[REG:d[0-9]+]] +; CHECK: vrev32.16 [[REG]], [[REG]] +; CHECK: vmovl.u16 {{q[0-9]+}}, [[REG]] + %1 = load <2 x i16>* %loadaddr + %2 = zext <2 x i16> %1 to <2 x i32> + store <2 x i32> %2, <2 x i32>* %storeaddr + ret void +} + +define void @vector_ext_2i8_to_2i16( <2 x i8>* %loadaddr, <2 x i16>* %storeaddr ) { +; CHECK-LABEL: vector_ext_2i8_to_2i16: +; CHECK: vld1.16 {[[REG:d[0-9]+]] +; CHECK: vrev16.8 [[REG]], [[REG]] +; CHECK: vmovl.u8 {{q[0-9]+}}, [[REG]] + %1 = load <2 x i8>* %loadaddr + %2 = zext <2 x i8> %1 to <2 x i16> + store <2 x i16> %2, <2 x i16>* %storeaddr + ret void +} + +define void @vector_ext_4i8_to_4i32( <4 x i8>* %loadaddr, <4 x i32>* %storeaddr ) { +; CHECK-LABEL: vector_ext_4i8_to_4i32: +; CHECK: vld1.32 {[[REG:d[0-9]+]] +; CHECK: vrev32.8 [[REG]], [[REG]] +; CHECK: vmovl.u8 {{q[0-9]+}}, [[REG]] + %1 = load <4 x i8>* %loadaddr + %2 = zext <4 x i8> %1 to <4 x i32> + store <4 x i32> %2, <4 x i32>* %storeaddr + ret void +} + +define void @vector_ext_4i8_to_4i16( <4 x i8>* %loadaddr, <4 x i16>* %storeaddr ) { +; CHECK-LABEL: vector_ext_4i8_to_4i16: +; CHECK: vld1.32 {[[REG:d[0-9]+]] +; CHECK: vrev32.8 [[REG]], [[REG]] +; CHECK: vmovl.u8 {{q[0-9]+}}, [[REG]] + %1 = load <4 x i8>* %loadaddr + %2 = zext <4 x i8> %1 to <4 x i16> + store <4 x i16> %2, <4 x i16>* %storeaddr + ret void +} + diff --git a/test/CodeGen/ARM/debug-info-arg.ll b/test/CodeGen/ARM/debug-info-arg.ll index e8bf3ba9d61f..31d0324de689 100644 --- a/test/CodeGen/ARM/debug-info-arg.ll +++ b/test/CodeGen/ARM/debug-info-arg.ll @@ -59,7 +59,7 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone !24 = metadata !{i32 11, i32 81, metadata !1, null} !25 = metadata !{i32 11, i32 101, metadata !1, null} !26 = metadata !{i32 12, i32 3, metadata !27, null} -!27 = metadata 
!{i32 786443, metadata !1, i32 11, i32 107, metadata !2, i32 0} ; [ DW_TAG_lexical_block ] +!27 = metadata !{i32 786443, metadata !2, metadata !1, i32 11, i32 107, i32 0} ; [ DW_TAG_lexical_block ] !28 = metadata !{i32 13, i32 5, metadata !27, null} !29 = metadata !{i32 14, i32 1, metadata !27, null} !30 = metadata !{metadata !1} diff --git a/test/CodeGen/ARM/debug-info-blocks.ll b/test/CodeGen/ARM/debug-info-blocks.ll index 6cbe4b4727cd..5ad5e59b880e 100644 --- a/test/CodeGen/ARM/debug-info-blocks.ll +++ b/test/CodeGen/ARM/debug-info-blocks.ll @@ -231,10 +231,10 @@ define hidden void @foobar_func_block_invoke_0(i8* %.block_descriptor, %0* %load !133 = metadata !{i32 609, i32 175, metadata !23, null} !134 = metadata !{i32 786689, metadata !23, metadata !"data", metadata !24, i32 67109473, metadata !108, i32 0, null} ; [ DW_TAG_arg_variable ] !135 = metadata !{i32 609, i32 190, metadata !23, null} -!136 = metadata !{i32 786688, metadata !23, metadata !"mydata", metadata !24, i32 604, metadata !50, i32 0, null, i64 1, i64 20, i64 2, i64 1, i64 4, i64 2, i64 1, i64 24} ; [ DW_TAG_auto_variable ] +!136 = metadata !{i32 786688, metadata !23, metadata !"mydata", metadata !24, i32 604, metadata !50, i32 0, null, metadata !163} ; [ DW_TAG_auto_variable ] !137 = metadata !{i32 604, i32 49, metadata !23, null} -!138 = metadata !{i32 786688, metadata !23, metadata !"self", metadata !40, i32 604, metadata !90, i32 0, null, i64 1, i64 24} ; [ DW_TAG_auto_variable ] -!139 = metadata !{i32 786688, metadata !23, metadata !"semi", metadata !24, i32 607, metadata !125, i32 0, null, i64 1, i64 28} ; [ DW_TAG_auto_variable ] +!138 = metadata !{i32 786688, metadata !23, metadata !"self", metadata !40, i32 604, metadata !90, i32 0, null, metadata !164} ; [ DW_TAG_auto_variable ] +!139 = metadata !{i32 786688, metadata !23, metadata !"semi", metadata !24, i32 607, metadata !125, i32 0, null, metadata !165} ; [ DW_TAG_auto_variable ] !140 = metadata !{i32 607, i32 30, metadata !23, null} !141 = metadata !{i32 610, i32 17, metadata !142, null} !142 = metadata !{i32 786443, metadata !152, metadata !23, i32 609, i32 200, i32 94} ; [ DW_TAG_lexical_block ] @@ -258,3 +258,6 @@ define hidden void @foobar_func_block_invoke_0(i8* %.block_descriptor, %0* %load !160 = metadata !{metadata !"header.h", metadata !"/Volumes/Sandbox/llvm"} !161 = metadata !{metadata !"header2.h", metadata !"/Volumes/Sandbox/llvm"} !162 = metadata !{i32 1, metadata !"Debug Info Version", i32 1} +!163 = metadata !{i64 1, i64 20, i64 2, i64 1, i64 4, i64 2, i64 1, i64 24} +!164 = metadata !{i64 1, i64 24} +!165 = metadata !{i64 1, i64 28} diff --git a/test/CodeGen/ARM/fast-isel-inline-asm.ll b/test/CodeGen/ARM/fast-isel-inline-asm.ll new file mode 100644 index 000000000000..2eb25ec7738b --- /dev/null +++ b/test/CodeGen/ARM/fast-isel-inline-asm.ll @@ -0,0 +1,18 @@ +; RUN: llc -fast-isel < %s | FileCheck %s +target datalayout = "e-m:o-p:32:32-i1:8:32-i8:8:32-i16:16:32-f64:32:64-v64:32:64-v128:32:128-a:0:32-n32-S32" +target triple = "thumbv7-apple-ios5.0.0" + +%0 = type opaque + +; Make sure that the inline asm starts right after the call to bar. 
+define void @test_inline_asm_sideeffect(%0* %call) { +; CHECK: bl _bar +; CHECK-NEXT: InlineAsm Start + call void @bar() + call void asm sideeffect "mov\09r7, r7\09\09@ marker", ""() + %1 = call %0* bitcast (i8* (i8*)* @foo to %0* (%0*)*)(%0* %call) + ret void +} + +declare i8* @foo(i8*) +declare void @bar() diff --git a/test/CodeGen/ARM/fp16.ll b/test/CodeGen/ARM/fp16.ll index fba794676d49..d3f32556a093 100644 --- a/test/CodeGen/ARM/fp16.ll +++ b/test/CodeGen/ARM/fp16.ll @@ -1,5 +1,6 @@ ; RUN: llc < %s | FileCheck %s ; RUN: llc -mattr=+vfp3,+fp16 < %s | FileCheck --check-prefix=CHECK-FP16 %s +; RUN: llc -mtriple=armv8-eabi < %s | FileCheck --check-prefix=CHECK-ARMV8 %s target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32" target triple = "armv7-eabi" @@ -10,23 +11,61 @@ target triple = "armv7-eabi" define arm_aapcs_vfpcc void @foo() nounwind { ; CHECK-LABEL: foo: ; CHECK-FP16-LABEL: foo: +; CHECK-ARMV8-LABEL: foo: entry: %0 = load i16* @x, align 2 %1 = load i16* @y, align 2 - %2 = tail call float @llvm.convert.from.fp16(i16 %0) + %2 = tail call float @llvm.convert.from.fp16.f32(i16 %0) ; CHECK: __gnu_h2f_ieee ; CHECK-FP16: vcvtb.f32.f16 - %3 = tail call float @llvm.convert.from.fp16(i16 %1) +; CHECK-ARMV8: vcvtb.f32.f16 + %3 = tail call float @llvm.convert.from.fp16.f32(i16 %1) ; CHECK: __gnu_h2f_ieee ; CHECK-FP16: vcvtb.f32.f16 +; CHECK-ARMV8: vcvtb.f32.f16 %4 = fadd float %2, %3 - %5 = tail call i16 @llvm.convert.to.fp16(float %4) + %5 = tail call i16 @llvm.convert.to.fp16.f32(float %4) ; CHECK: __gnu_f2h_ieee ; CHECK-FP16: vcvtb.f16.f32 +; CHECK-ARMV8: vcvtb.f16.f32 store i16 %5, i16* @x, align 2 ret void } -declare float @llvm.convert.from.fp16(i16) nounwind readnone +define arm_aapcs_vfpcc double @test_from_fp16(i16 %in) { +; CHECK-LABEL: test_from_fp16: +; CHECK-FP16-LABEL: test_from_fp16: +; CHECK-ARMV8-LABEL: test_from_fp16: + %val = call double @llvm.convert.from.fp16.f64(i16 %in) +; CHECK: bl __gnu_h2f_ieee +; CHECK: vmov [[TMP:s[0-9]+]], r0 +; CHECK: vcvt.f64.f32 d0, [[TMP]] -declare i16 @llvm.convert.to.fp16(float) nounwind readnone +; CHECK-FP16: vmov [[TMP16:s[0-9]+]], r0 +; CHECK-FP16: vcvtb.f32.f16 [[TMP32:s[0-9]+]], [[TMP16]] +; CHECK-FP16: vcvt.f64.f32 d0, [[TMP32]] + +; CHECK-ARMV8: vmov [[TMP:s[0-9]+]], r0 +; CHECK-ARMV8: vcvtb.f64.f16 d0, [[TMP]] + ret double %val +} + +define arm_aapcs_vfpcc i16 @test_to_fp16(double %in) { +; CHECK-LABEL: test_to_fp16: +; CHECK-FP16-LABEL: test_to_fp16: +; CHECK-ARMV8-LABEL: test_to_fp16: + %val = call i16 @llvm.convert.to.fp16.f64(double %in) +; CHECK: bl __truncdfhf2 + +; CHECK-FP16: bl __truncdfhf2 + +; CHECK-ARMV8: vcvtb.f16.f64 [[TMP:s[0-9]+]], d0 +; CHECK-ARMV8: vmov r0, [[TMP]] + ret i16 %val +} + +declare float @llvm.convert.from.fp16.f32(i16) nounwind readnone +declare double @llvm.convert.from.fp16.f64(i16) nounwind readnone + +declare i16 @llvm.convert.to.fp16.f32(float) nounwind readnone +declare i16 @llvm.convert.to.fp16.f64(double) nounwind readnone diff --git a/test/CodeGen/ARM/half.ll b/test/CodeGen/ARM/half.ll new file mode 100644 index 000000000000..10cebb38c565 --- /dev/null +++ b/test/CodeGen/ARM/half.ll @@ -0,0 +1,74 @@ +; RUN: llc < %s -mtriple=thumbv7-apple-ios7.0 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-OLD +; RUN: llc < %s -mtriple=thumbv7s-apple-ios7.0 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-F16 +; RUN: llc < %s -mtriple=thumbv8-apple-ios7.0 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-V8 
+ +define void @test_load_store(half* %in, half* %out) { +; CHECK-LABEL: test_load_store: +; CHECK: ldrh [[TMP:r[0-9]+]], [r0] +; CHECK: strh [[TMP]], [r1] + %val = load half* %in + store half %val, half* %out + ret void +} + +define i16 @test_bitcast_from_half(half* %addr) { +; CHECK-LABEL: test_bitcast_from_half: +; CHECK: ldrh r0, [r0] + %val = load half* %addr + %val_int = bitcast half %val to i16 + ret i16 %val_int +} + +define void @test_bitcast_to_half(half* %addr, i16 %in) { +; CHECK-LABEL: test_bitcast_to_half: +; CHECK: strh r1, [r0] + %val_fp = bitcast i16 %in to half + store half %val_fp, half* %addr + ret void +} + +define float @test_extend32(half* %addr) { +; CHECK-LABEL: test_extend32: + +; CHECK-OLD: b.w ___gnu_h2f_ieee +; CHECK-F16: vcvtb.f32.f16 +; CHECK-V8: vcvtb.f32.f16 + %val16 = load half* %addr + %val32 = fpext half %val16 to float + ret float %val32 +} + +define double @test_extend64(half* %addr) { +; CHECK-LABEL: test_extend64: + +; CHECK-OLD: blx ___gnu_h2f_ieee +; CHECK-OLD: vcvt.f64.f32 +; CHECK-F16: vcvtb.f32.f16 +; CHECK-F16: vcvt.f64.f32 +; CHECK-V8: vcvtb.f64.f16 + %val16 = load half* %addr + %val32 = fpext half %val16 to double + ret double %val32 +} + +define void @test_trunc32(float %in, half* %addr) { +; CHECK-LABEL: test_trunc32: + +; CHECK-OLD: blx ___gnu_f2h_ieee +; CHECK-F16: vcvtb.f16.f32 +; CHECK-V8: vcvtb.f16.f32 + %val16 = fptrunc float %in to half + store half %val16, half* %addr + ret void +} + +define void @test_trunc64(double %in, half* %addr) { +; CHECK-LABEL: test_trunc64: + +; CHECK-OLD: blx ___truncdfhf2 +; CHECK-F16: blx ___truncdfhf2 +; CHECK-V8: vcvtb.f16.f64 + %val16 = fptrunc double %in to half + store half %val16, half* %addr + ret void +} diff --git a/test/CodeGen/ARM/intrinsics-memory-barrier.ll b/test/CodeGen/ARM/intrinsics-memory-barrier.ll new file mode 100644 index 000000000000..5ee0b3e59902 --- /dev/null +++ b/test/CodeGen/ARM/intrinsics-memory-barrier.ll @@ -0,0 +1,55 @@ +; RUN: llc < %s -mtriple=armv7 -mattr=+db | FileCheck %s +; RUN: llc < %s -mtriple=thumbv7 -mattr=+db | FileCheck %s + +; CHECK-LABEL: test +define void @test() { + call void @llvm.arm.dmb(i32 3) ; CHECK: dmb osh + call void @llvm.arm.dsb(i32 7) ; CHECK: dsb nsh + call void @llvm.arm.isb(i32 15) ; CHECK: isb sy + ret void +} + +; The important point is that the compiler should not reorder memory access +; instructions around the DMB. +; If it did, the two STRs below would collapse into one STRD. +; CHECK-LABEL: test_dmb_reordering +define void @test_dmb_reordering(i32 %a, i32 %b, i32* %d) { + store i32 %a, i32* %d ; CHECK: str {{r[0-9]+}}, [{{r[0-9]+}}] + + call void @llvm.arm.dmb(i32 15) ; CHECK: dmb sy + + %d1 = getelementptr i32* %d, i32 1 + store i32 %b, i32* %d1 ; CHECK: str {{r[0-9]+}}, [{{r[0-9]+}}, #4] + + ret void +} + +; Similarly for DSB. +; CHECK-LABEL: test_dsb_reordering +define void @test_dsb_reordering(i32 %a, i32 %b, i32* %d) { + store i32 %a, i32* %d ; CHECK: str {{r[0-9]+}}, [{{r[0-9]+}}] + + call void @llvm.arm.dsb(i32 15) ; CHECK: dsb sy + + %d1 = getelementptr i32* %d, i32 1 + store i32 %b, i32* %d1 ; CHECK: str {{r[0-9]+}}, [{{r[0-9]+}}, #4] + + ret void +} + +; And ISB. 
+; CHECK-LABEL: test_isb_reordering +define void @test_isb_reordering(i32 %a, i32 %b, i32* %d) { + store i32 %a, i32* %d ; CHECK: str {{r[0-9]+}}, [{{r[0-9]+}}] + + call void @llvm.arm.isb(i32 15) ; CHECK: isb sy + + %d1 = getelementptr i32* %d, i32 1 + store i32 %b, i32* %d1 ; CHECK: str {{r[0-9]+}}, [{{r[0-9]+}}, #4] + + ret void +} + +declare void @llvm.arm.dmb(i32) +declare void @llvm.arm.dsb(i32) +declare void @llvm.arm.isb(i32) diff --git a/test/CodeGen/ARM/metadata-default.ll b/test/CodeGen/ARM/metadata-default.ll new file mode 100644 index 000000000000..f6a3fe289cc1 --- /dev/null +++ b/test/CodeGen/ARM/metadata-default.ll @@ -0,0 +1,16 @@ +; RUN: llc < %s -mtriple=armv7-linux-gnueabi | FileCheck %s + +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64" +target triple = "armv7--none-eabi" + +define i32 @f(i64 %z) { + ret i32 0 +} + +!llvm.module.flags = !{!0, !1} + +!0 = metadata !{i32 1, metadata !"wchar_size", i32 4} +!1 = metadata !{i32 1, metadata !"min_enum_size", i32 4} + +; CHECK: .eabi_attribute 18, 4 @ Tag_ABI_PCS_wchar_t +; CHECK: .eabi_attribute 26, 2 @ Tag_ABI_enum_size diff --git a/test/CodeGen/ARM/metadata-short-enums.ll b/test/CodeGen/ARM/metadata-short-enums.ll new file mode 100644 index 000000000000..bccd3327e5b5 --- /dev/null +++ b/test/CodeGen/ARM/metadata-short-enums.ll @@ -0,0 +1,16 @@ +; RUN: llc < %s | FileCheck %s + +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64" +target triple = "armv7--none-eabi" + +define i32 @f(i64 %z) { + ret i32 0 +} + +!llvm.module.flags = !{!0, !1} + +!0 = metadata !{i32 1, metadata !"wchar_size", i32 4} +!1 = metadata !{i32 1, metadata !"min_enum_size", i32 1} + +; CHECK: .eabi_attribute 18, 4 @ Tag_ABI_PCS_wchar_t +; CHECK: .eabi_attribute 26, 1 @ Tag_ABI_enum_size diff --git a/test/CodeGen/ARM/metadata-short-wchar.ll b/test/CodeGen/ARM/metadata-short-wchar.ll new file mode 100644 index 000000000000..6de9bf174317 --- /dev/null +++ b/test/CodeGen/ARM/metadata-short-wchar.ll @@ -0,0 +1,16 @@ +; RUN: llc < %s | FileCheck %s + +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64" +target triple = "armv7--none-eabi" + +define i32 @f(i64 %z) { + ret i32 0 +} + +!llvm.module.flags = !{!0, !1} + +!0 = metadata !{i32 1, metadata !"wchar_size", i32 2} +!1 = metadata !{i32 1, metadata !"min_enum_size", i32 4} + +; CHECK: .eabi_attribute 18, 2 @ Tag_ABI_PCS_wchar_t +; CHECK: .eabi_attribute 26, 2 @ Tag_ABI_enum_size diff --git a/test/CodeGen/ARM/null-streamer.ll b/test/CodeGen/ARM/null-streamer.ll new file mode 100644 index 000000000000..350c45e5bebe --- /dev/null +++ b/test/CodeGen/ARM/null-streamer.ll @@ -0,0 +1,7 @@ +; Test the null streamer with a target streamer. 
+; RUN: llc -O0 -filetype=null -mtriple=arm-linux < %s + +define i32 @main() { +entry: + ret i32 0 +} diff --git a/test/CodeGen/ARM/out-of-registers.ll b/test/CodeGen/ARM/out-of-registers.ll new file mode 100644 index 000000000000..790e4165d4c6 --- /dev/null +++ b/test/CodeGen/ARM/out-of-registers.ll @@ -0,0 +1,42 @@ +; RUN: llc -O3 %s -o - | FileCheck %s +; ModuleID = 'fo.c' +target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:64:128-a0:0:32-n8:16:32-S64" +target triple = "thumbv7-none-linux-gnueabi" + +; CHECK: vpush +; CHECK: vpop + +define void @foo(float* nocapture %A) #0 { + %1 = bitcast float* %A to i8* + %2 = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4.v4f32(i8* %1, i32 4) + %3 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %2, 0 + %divp_vec = fdiv <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %3 + %4 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %2, 1 + %div3p_vec = fdiv <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %4 + %5 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %2, 2 + %div8p_vec = fdiv <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %5 + %6 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %2, 3 + %div13p_vec = fdiv <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %6 + tail call void @llvm.arm.neon.vst4.v4f32(i8* %1, <4 x float> %divp_vec, <4 x float> %div3p_vec, <4 x float> %div8p_vec, <4 x float> %div13p_vec, i32 4) + ret void +} + +; Function Attrs: nounwind +declare i32 @llvm.annotation.i32(i32, i8*, i8*, i32) #1 + +; Function Attrs: nounwind readonly + +; Function Attrs: nounwind +declare void @llvm.arm.neon.vst4.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32) #1 +declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4.v4f32(i8*, i32) #2 + +; Function Attrs: nounwind + +attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" } +attributes #1 = { nounwind } +attributes #2 = { nounwind readonly } + +!llvm.ident = !{!0} + +!0 = metadata !{metadata !"Snapdragon LLVM ARM Compiler 3.4"} +!1 = metadata !{metadata !1} diff --git a/test/CodeGen/ARM/sjljehprepare-lower-empty-struct.ll b/test/CodeGen/ARM/sjljehprepare-lower-empty-struct.ll new file mode 100644 index 000000000000..3cf2a08fe35d --- /dev/null +++ b/test/CodeGen/ARM/sjljehprepare-lower-empty-struct.ll @@ -0,0 +1,31 @@ +; RUN: llc -mtriple=armv7-apple-ios -O0 < %s | FileCheck %s +; RUN: llc -mtriple=armv7-apple-ios -O1 < %s | FileCheck %s +; RUN: llc -mtriple=armv7-apple-ios -O2 < %s | FileCheck %s +; RUN: llc -mtriple=armv7-apple-ios -O3 < %s | FileCheck %s + +; SjLjEHPrepare shouldn't crash when lowering empty structs. +; +; Checks that, in the case of empty structs used as arguments, +; nothing happens, i.e. there are no instructions between +; __Unwind_SjLj_Register and the actual @bar invocation + + +define i8* @foo(i8 %a, {} %c) { +entry: +; CHECK: bl __Unwind_SjLj_Register +; CHECK-NEXT: {{[A-Z][a-zA-Z0-9]*}}: +; CHECK-NEXT: bl _bar + invoke void @bar () + to label %unreachable unwind label %handler + +unreachable: + unreachable + +handler: + %tmp = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @baz to i8*) + cleanup + resume { i8*, i32 } undef +} + +declare void @bar() +declare i32 @baz(...) 
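As a rough source-level picture of the SjLjEHPrepare test just above, consider the C sketch below. It is hypothetical rather than the test's actual origin: zero-sized structs are a GNU C extension, the call to bar in the real test is an invoke with a landing pad, and the IR type {} is how the empty aggregate appears there.

    struct empty {}; /* GNU C extension: zero-sized, lowered to the IR type {} */

    extern void bar(void);

    char *foo(char a, struct empty c) {
        /* The zero-sized argument must not produce any instructions between
           the __Unwind_SjLj_Register call that SjLjEHPrepare inserts and the
           call to bar, which is what the CHECK-NEXT lines verify. */
        bar();
        return 0;
    }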
diff --git a/test/CodeGen/ARM/vector-spilling.ll b/test/CodeGen/ARM/vector-spilling.ll new file mode 100644 index 000000000000..746c6dfcd114 --- /dev/null +++ b/test/CodeGen/ARM/vector-spilling.ll @@ -0,0 +1,34 @@ +; RUN: llc < %s -march=arm -mtriple=armv7-linux-gnueabihf -arm-atomic-cfg-tidy=0 -float-abi=hard -mcpu=cortex-a9 -O3 | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32-S64" + +; This test will generate spills/fills using vldmia instructions that access 24 bytes of memory. +; Check that we don't crash when we generate these instructions on Cortex-A9. + +; CHECK: test: +; CHECK: vstmia +; CHECK: vldmia +define void @test(<8 x i64>* %src) #0 { +entry: + %0 = getelementptr inbounds <8 x i64>* %src, i32 0 + %1 = load <8 x i64>* %0, align 8 + + %2 = getelementptr inbounds <8 x i64>* %src, i32 1 + %3 = load <8 x i64>* %2, align 8 + + %4 = getelementptr inbounds <8 x i64>* %src, i32 2 + %5 = load <8 x i64>* %4, align 8 + + %6 = getelementptr inbounds <8 x i64>* %src, i32 3 + %7 = load <8 x i64>* %6, align 8 + + %8 = shufflevector <8 x i64> %1, <8 x i64> %3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %9 = shufflevector <8 x i64> %1, <8 x i64> %3, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + + tail call void(<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>)* @foo(<8 x i64> %1, <8 x i64> %3, <8 x i64> %5, <8 x i64> %7, <8 x i64> %8, <8 x i64> %9) + ret void +} + +declare void @foo(<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>) + +attributes #0 = { noredzone "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } diff --git a/test/CodeGen/ARM/vldm-sched-a9.ll b/test/CodeGen/ARM/vldm-sched-a9.ll index f2e5eb9b7e03..64f3770e3d21 100644 --- a/test/CodeGen/ARM/vldm-sched-a9.ll +++ b/test/CodeGen/ARM/vldm-sched-a9.ll @@ -2,12 +2,12 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32-S64" -; This test will generate spills/fills using vldmia instructions that access 64 bytes of memory. -; Check that we don't crash when we generate these instructions on Cortex-A9. +; This test used to test vector spilling using vstmia/vldmia instructions, but +; the changes for PR:18825 prevent that spilling. 
; CHECK: test: -; CHECK: vstmia -; CHECK: vldmia +; CHECK-NOT: vstmia +; CHECK-NOT: vldmia define void @test(i64* %src) #0 { entry: %arrayidx39 = getelementptr inbounds i64* %src, i32 13 diff --git a/test/CodeGen/Mips/abiflags-xx.ll b/test/CodeGen/Mips/abiflags-xx.ll new file mode 100644 index 000000000000..c4610120fdd5 --- /dev/null +++ b/test/CodeGen/Mips/abiflags-xx.ll @@ -0,0 +1,5 @@ +; RUN: llc -filetype=asm -mtriple mipsel-unknown-linux -mcpu=mips32 -mattr=fpxx %s -o - | FileCheck %s + +; CHECK: .nan legacy +; CHECK: .module fp=xx + diff --git a/test/CodeGen/Mips/abiflags32.ll b/test/CodeGen/Mips/abiflags32.ll new file mode 100644 index 000000000000..e32d4a586ee3 --- /dev/null +++ b/test/CodeGen/Mips/abiflags32.ll @@ -0,0 +1,17 @@ +; RUN: llc -filetype=asm -mtriple mipsel-unknown-linux -mcpu=mips32 %s -o - | FileCheck %s +; RUN: llc -filetype=asm -mtriple mipsel-unknown-linux -mcpu=mips32 -mattr=fp64 %s -o - | FileCheck -check-prefix=CHECK-64 %s +; RUN: llc -filetype=asm -mtriple mipsel-unknown-linux -mcpu=mips64 -mattr=-n64,n32 %s -o - | FileCheck -check-prefix=CHECK-64n %s + +; CHECK: .nan legacy +; We don't emit '.module fp=32' for compatibility with binutils 2.24 which +; doesn't accept .module. +; CHECK-NOT: .module fp=32 + +; CHECK-64: .nan legacy +; We do emit '.module fp=64' though since it contradicts the default value. +; CHECK-64: .module fp=64 + +; CHECK-64n: .nan legacy +; We don't emit '.module fp=64' for compatibility with binutils 2.24 which +; doesn't accept .module. +; CHECK-64n-NOT: .module fp=64 diff --git a/test/CodeGen/Mips/analyzebranch.ll b/test/CodeGen/Mips/analyzebranch.ll index d9ad0f8ad86a..4b5d09778d79 100644 --- a/test/CodeGen/Mips/analyzebranch.ll +++ b/test/CodeGen/Mips/analyzebranch.ll @@ -16,7 +16,7 @@ entry: ; 32-GPR: mtc1 $zero, $[[Z:f[0-9]]] ; 32-GPR: mthc1 $zero, $[[Z:f[0-9]]] ; 64-GPR: dmtc1 $zero, $[[Z:f[0-9]]] -; GPR: cmp.olt.d $[[FGRCC:f[0-9]+]], $[[Z]], $f12 +; GPR: cmp.lt.d $[[FGRCC:f[0-9]+]], $[[Z]], $f12 ; GPR: mfc1 $[[GPRCC:[0-9]+]], $[[FGRCC]] ; GPR-NOT: not $[[GPRCC]], $[[GPRCC]] ; GPR: bnez $[[GPRCC]], $BB diff --git a/test/CodeGen/Mips/atomic.ll b/test/CodeGen/Mips/atomic.ll index 066d42cc302d..f4118ecec79d 100644 --- a/test/CodeGen/Mips/atomic.ll +++ b/test/CodeGen/Mips/atomic.ll @@ -8,7 +8,7 @@ ; Keep one big-endian check so that we don't reduce testing, but don't add more ; since endianness doesn't affect the body of the atomic operations. 
-; RUN: llc -march=mips --disable-machine-licm -mcpu=mips32 < %s | FileCheck %s -check-prefix=ALL -check-prefix=MIPS32-ANY -check-prefix=CHECK-EB +; RUN: llc -march=mips --disable-machine-licm -mcpu=mips32 < %s | FileCheck %s -check-prefix=ALL -check-prefix=MIPS32-ANY -check-prefix=NO-SEB-SEH -check-prefix=CHECK-EB @x = common global i32 0, align 4 @@ -246,6 +246,7 @@ entry: ; NO-SEB-SEH: sra $2, $[[R17]], 24 ; HAS-SEB-SEH: seb $2, $[[R16]] + } define signext i8 @AtomicCmpSwap8(i8 signext %oldval, i8 signext %newval) nounwind { @@ -292,6 +293,49 @@ entry: ; HAS-SEB-SEH: seb $2, $[[R17]] } +define i1 @AtomicCmpSwapRes8(i8* %ptr, i8 %oldval, i8 signext %newval) nounwind { +entry: + %0 = cmpxchg i8* %ptr, i8 %oldval, i8 %newval monotonic monotonic + %1 = extractvalue { i8, i1 } %0, 1 + ret i1 %1 +; ALL-LABEL: AtomicCmpSwapRes8 + +; ALL: addiu $[[R1:[0-9]+]], $zero, -4 +; ALL: and $[[R2:[0-9]+]], $4, $[[R1]] +; ALL: andi $[[R3:[0-9]+]], $4, 3 +; CHECK-EL: sll $[[R5:[0-9]+]], $[[R3]], 3 +; CHECK-EB: xori $[[R4:[0-9]+]], $[[R3]], 3 +; CHECK-EB: sll $[[R5:[0-9]+]], $[[R4]], 3 +; ALL: ori $[[R6:[0-9]+]], $zero, 255 +; ALL: sllv $[[R7:[0-9]+]], $[[R6]], $[[R5]] +; ALL: nor $[[R8:[0-9]+]], $zero, $[[R7]] +; ALL: andi $[[R9:[0-9]+]], $5, 255 +; ALL: sllv $[[R10:[0-9]+]], $[[R9]], $[[R5]] +; ALL: andi $[[R11:[0-9]+]], $6, 255 +; ALL: sllv $[[R12:[0-9]+]], $[[R11]], $[[R5]] + +; ALL: $[[BB0:[A-Z_0-9]+]]: +; ALL: ll $[[R13:[0-9]+]], 0($[[R2]]) +; ALL: and $[[R14:[0-9]+]], $[[R13]], $[[R7]] +; ALL: bne $[[R14]], $[[R10]], $[[BB1:[A-Z_0-9]+]] + +; ALL: and $[[R15:[0-9]+]], $[[R13]], $[[R8]] +; ALL: or $[[R16:[0-9]+]], $[[R15]], $[[R12]] +; ALL: sc $[[R16]], 0($[[R2]]) +; ALL: beqz $[[R16]], $[[BB0]] + +; ALL: $[[BB1]]: +; ALL: srlv $[[R17:[0-9]+]], $[[R14]], $[[R5]] + +; NO-SEB-SEH: sll $[[R18:[0-9]+]], $[[R17]], 24 +; NO-SEB-SEH: sra $[[R19:[0-9]+]], $[[R18]], 24 + +; HAS-SEB-SEH: seb $[[R19:[0-9]+]], $[[R17]] + +; ALL: xor $[[R20:[0-9]+]], $[[R19]], $5 +; ALL: sltiu $2, $[[R20]], 1 +} + ; Check one i16 so that we cover the seh sign extend @z = common global i16 0, align 1 diff --git a/test/CodeGen/Mips/buildpairextractelementf64.ll b/test/CodeGen/Mips/buildpairextractelementf64.ll index 88d1d07e29ad..7682a98ace99 100644 --- a/test/CodeGen/Mips/buildpairextractelementf64.ll +++ b/test/CodeGen/Mips/buildpairextractelementf64.ll @@ -1,15 +1,19 @@ -; RUN: llc -march=mipsel < %s | FileCheck %s -check-prefix=FP32 -check-prefix=CHECK -; RUN: llc -march=mips < %s | FileCheck %s -check-prefix=FP32 -check-prefix=CHECK -; RUN: llc -march=mipsel -mcpu=mips32r2 -mattr=+fp64 < %s | FileCheck %s -check-prefix=FP64 -check-prefix=CHECK -; RUN: llc -march=mips -mcpu=mips32r2 -mattr=+fp64 < %s | FileCheck %s -check-prefix=FP64 -check-prefix=CHECK +; RUN: llc -march=mipsel < %s | FileCheck %s -check-prefix=NO-MFHC1 -check-prefix=ALL +; RUN: llc -march=mips < %s | FileCheck %s -check-prefix=NO-MFHC1 -check-prefix=ALL +; RUN: llc -march=mipsel -mcpu=mips32r2 < %s | FileCheck %s -check-prefix=HAS-MFHC1 -check-prefix=ALL +; RUN: llc -march=mips -mcpu=mips32r2 < %s | FileCheck %s -check-prefix=HAS-MFHC1 -check-prefix=ALL +; RUN: llc -march=mipsel -mcpu=mips32r2 -mattr=+fp64 < %s | FileCheck %s -check-prefix=HAS-MFHC1 -check-prefix=ALL +; RUN: llc -march=mips -mcpu=mips32r2 -mattr=+fp64 < %s | FileCheck %s -check-prefix=HAS-MFHC1 -check-prefix=ALL @a = external global i32 -; CHECK-LABEL: f: -; FP32: mtc1 -; FP32: mtc1 -; FP64-DAG: mtc1 -; FP64-DAG: mthc1 +; ALL-LABEL: f: + +; NO-MFHC1: mtc1 +; NO-MFHC1: mtc1 + +; HAS-MFHC1-DAG: 
mtc1 +; HAS-MFHC1-DAG: mthc1 define double @f(i32 %a1, double %d) nounwind { entry: @@ -18,11 +22,13 @@ entry: ret double %add } -; CHECK-LABEL: f3: -; FP32: mfc1 -; FP32: mfc1 -; FP64-DAG: mfc1 -; FP64-DAG: mfhc1 +; ALL-LABEL: f3: + +; NO-MFHC1: mfc1 +; NO-MFHC1: mfc1 + +; HAS-MFHC1-DAG: mfc1 +; HAS-MFHC1-DAG: mfhc1 define void @f3(double %d, i32 %a1) nounwind { entry: diff --git a/test/CodeGen/Mips/cconv/callee-saved-fpxx.ll b/test/CodeGen/Mips/cconv/callee-saved-fpxx.ll new file mode 100644 index 000000000000..4b28b9962075 --- /dev/null +++ b/test/CodeGen/Mips/cconv/callee-saved-fpxx.ll @@ -0,0 +1,58 @@ +; RUN: llc -march=mips -mattr=+o32,+fpxx < %s | FileCheck --check-prefix=ALL --check-prefix=O32-FPXX %s +; RUN: llc -march=mipsel -mattr=+o32,+fpxx < %s | FileCheck --check-prefix=ALL --check-prefix=O32-FPXX %s +; RUN: llc -march=mips -mattr=+o32,+fpxx < %s | FileCheck --check-prefix=ALL --check-prefix=O32-FPXX-INV %s +; RUN: llc -march=mipsel -mattr=+o32,+fpxx < %s | FileCheck --check-prefix=ALL --check-prefix=O32-FPXX-INV %s + +; RUN-TODO: llc -march=mips64 -mattr=+o32,+fpxx < %s | FileCheck --check-prefix=ALL --check-prefix=O32-FPXX %s +; RUN-TODO: llc -march=mips64el -mattr=+o32,+fpxx < %s | FileCheck --check-prefix=ALL --check-prefix=O32-FPXX %s +; RUN-TODO: llc -march=mips64 -mattr=+o32,+fpxx < %s | FileCheck --check-prefix=ALL --check-prefix=O32-FPXX-INV --check-prefix=O32-FPXX-INV %s +; RUN-TODO: llc -march=mips64el -mattr=+o32,+fpxx < %s | FileCheck --check-prefix=ALL --check-prefix=O32-FPXX-INV --check-prefix=O32-FPXX-INV %s + +define void @fpu_clobber() nounwind { +entry: + call void asm "# Clobber", "~{$f0},~{$f1},~{$f2},~{$f3},~{$f4},~{$f5},~{$f6},~{$f7},~{$f8},~{$f9},~{$f10},~{$f11},~{$f12},~{$f13},~{$f14},~{$f15},~{$f16},~{$f17},~{$f18},~{$f19},~{$f20},~{$f21},~{$f22},~{$f23},~{$f24},~{$f25},~{$f26},~{$f27},~{$f28},~{$f29},~{$f30},~{$f31}"() + ret void +} + +; O32-FPXX-LABEL: fpu_clobber: +; O32-FPXX-INV-NOT: sdc1 $f0, +; O32-FPXX-INV-NOT: sdc1 $f1, +; O32-FPXX-INV-NOT: sdc1 $f2, +; O32-FPXX-INV-NOT: sdc1 $f3, +; O32-FPXX-INV-NOT: sdc1 $f4, +; O32-FPXX-INV-NOT: sdc1 $f5, +; O32-FPXX-INV-NOT: sdc1 $f6, +; O32-FPXX-INV-NOT: sdc1 $f7, +; O32-FPXX-INV-NOT: sdc1 $f8, +; O32-FPXX-INV-NOT: sdc1 $f9, +; O32-FPXX-INV-NOT: sdc1 $f10, +; O32-FPXX-INV-NOT: sdc1 $f11, +; O32-FPXX-INV-NOT: sdc1 $f12, +; O32-FPXX-INV-NOT: sdc1 $f13, +; O32-FPXX-INV-NOT: sdc1 $f14, +; O32-FPXX-INV-NOT: sdc1 $f15, +; O32-FPXX-INV-NOT: sdc1 $f16, +; O32-FPXX-INV-NOT: sdc1 $f17, +; O32-FPXX-INV-NOT: sdc1 $f18, +; O32-FPXX-INV-NOT: sdc1 $f19, +; O32-FPXX-INV-NOT: sdc1 $f21, +; O32-FPXX-INV-NOT: sdc1 $f23, +; O32-FPXX-INV-NOT: sdc1 $f25, +; O32-FPXX-INV-NOT: sdc1 $f27, +; O32-FPXX-INV-NOT: sdc1 $f29, +; O32-FPXX-INV-NOT: sdc1 $f31, + +; O32-FPXX: addiu $sp, $sp, -48 +; O32-FPXX-DAG: sdc1 [[F20:\$f20]], [[OFF20:[0-9]+]]($sp) +; O32-FPXX-DAG: sdc1 [[F22:\$f22]], [[OFF22:[0-9]+]]($sp) +; O32-FPXX-DAG: sdc1 [[F24:\$f24]], [[OFF24:[0-9]+]]($sp) +; O32-FPXX-DAG: sdc1 [[F26:\$f26]], [[OFF26:[0-9]+]]($sp) +; O32-FPXX-DAG: sdc1 [[F28:\$f28]], [[OFF28:[0-9]+]]($sp) +; O32-FPXX-DAG: sdc1 [[F30:\$f30]], [[OFF30:[0-9]+]]($sp) +; O32-FPXX-DAG: ldc1 [[F20]], [[OFF20]]($sp) +; O32-FPXX-DAG: ldc1 [[F22]], [[OFF22]]($sp) +; O32-FPXX-DAG: ldc1 [[F24]], [[OFF24]]($sp) +; O32-FPXX-DAG: ldc1 [[F26]], [[OFF26]]($sp) +; O32-FPXX-DAG: ldc1 [[F28]], [[OFF28]]($sp) +; O32-FPXX-DAG: ldc1 [[F30]], [[OFF30]]($sp) +; O32-FPXX: addiu $sp, $sp, 48 diff --git a/test/CodeGen/Mips/cconv/callee-saved-fpxx1.ll 
b/test/CodeGen/Mips/cconv/callee-saved-fpxx1.ll new file mode 100644 index 000000000000..489879e98ad3 --- /dev/null +++ b/test/CodeGen/Mips/cconv/callee-saved-fpxx1.ll @@ -0,0 +1,24 @@ +; RUN: llc -march=mips -mattr=+o32,+fp64 < %s | FileCheck --check-prefix=O32-FP64-INV %s +; RUN: llc -march=mipsel -mattr=+o32,+fp64 < %s | FileCheck --check-prefix=O32-FP64-INV %s + +; RUN: llc -march=mips -mattr=+o32,+fpxx < %s | FileCheck --check-prefix=O32-FPXX %s +; RUN: llc -march=mipsel -mattr=+o32,+fpxx < %s | FileCheck --check-prefix=O32-FPXX %s + +; RUN-TODO: llc -march=mips64 -mattr=+o32,+fpxx < %s | FileCheck --check-prefix=O32-FPXX %s +; RUN-TODO: llc -march=mips64el -mattr=+o32,+fpxx < %s | FileCheck --check-prefix=O32-FPXX %s + +define void @fpu_clobber() nounwind { +entry: + call void asm "# Clobber", "~{$f21}"() + ret void +} + +; O32-FPXX-LABEL: fpu_clobber: + +; O32-FPXX: addiu $sp, $sp, -8 + +; O32-FP64-INV-NOT: sdc1 $f20, +; O32-FPXX-DAG: sdc1 [[F20:\$f20]], [[OFF20:[0-9]+]]($sp) +; O32-FPXX-DAG: ldc1 [[F20]], [[OFF20]]($sp) + +; O32-FPXX: addiu $sp, $sp, 8 diff --git a/test/CodeGen/Mips/cconv/return-hard-float.ll b/test/CodeGen/Mips/cconv/return-hard-float.ll index 371b3a54598b..3eb26fa9d24f 100644 --- a/test/CodeGen/Mips/cconv/return-hard-float.ll +++ b/test/CodeGen/Mips/cconv/return-hard-float.ll @@ -10,6 +10,9 @@ ; RUN: llc -mtriple=mips64-linux-gnu -relocation-model=static -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s ; RUN: llc -mtriple=mips64el-linux-gnu -relocation-model=static -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s +; RUN: llc -mtriple=mips-linux-gnu -relocation-model=static -mattr=+o32,+fp64 < %s | FileCheck --check-prefix=ALL --check-prefix=O32FP64 %s +; RUN: llc -mtriple=mipsel-linux-gnu -relocation-model=static -mattr=+o32,+fp64 < %s | FileCheck --check-prefix=ALL --check-prefix=O32FP64 %s + ; Test the float returns for all ABI's and byte orders as specified by ; section 5 of MD00305 (MIPS ABIs Described). @@ -44,3 +47,13 @@ entry: ; N32-DAG: ldc1 $f0, %lo(double)([[R1:\$[0-9]+]]) ; N64-DAG: ld [[R1:\$[0-9]+]], %got_disp(double)($1) ; N64-DAG: ldc1 $f0, 0([[R1]]) + +define { double, double } @retComplexDouble() #0 { + %retval = alloca { double, double }, align 8 + %1 = load { double, double }* %retval + ret { double, double } %1 +} + +; ALL-LABEL: retComplexDouble: +; O32FP64-DAG: ldc1 $f0, 0($sp) +; O32FP64-DAG: ldc1 $f2, 8($sp) diff --git a/test/CodeGen/Mips/cfi_offset.ll b/test/CodeGen/Mips/cfi_offset.ll new file mode 100644 index 000000000000..e23855bd65d2 --- /dev/null +++ b/test/CodeGen/Mips/cfi_offset.ll @@ -0,0 +1,41 @@ +; RUN: llc -march=mips -mattr=+o32 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-EB +; RUN: llc -march=mipsel -mattr=+o32 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-EL +; RUN: llc -march=mips -mattr=+o32,+fpxx < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-EB +; RUN: llc -march=mipsel -mattr=+o32,+fpxx < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-EL +; RUN: llc -march=mips -mattr=+o32,+fp64 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-EB +; RUN: llc -march=mipsel -mattr=+o32,+fp64 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-EL + +@var = global double 0.0 + +declare void @foo(...) 
+ +define void @bar() { + +; CHECK-LABEL: bar: + +; CHECK: .cfi_def_cfa_offset 40 +; CHECK: sdc1 $f22, 32($sp) +; CHECK: sdc1 $f20, 24($sp) +; CHECK: sw $ra, 20($sp) +; CHECK: sw $16, 16($sp) + +; CHECK-EB: .cfi_offset 55, -8 +; CHECK-EB: .cfi_offset 54, -4 +; CHECK-EB: .cfi_offset 53, -16 +; CHECK-EB: .cfi_offset 52, -12 + +; CHECK-EL: .cfi_offset 54, -8 +; CHECK-EL: .cfi_offset 55, -4 +; CHECK-EL: .cfi_offset 52, -16 +; CHECK-EL: .cfi_offset 53, -12 + +; CHECK: .cfi_offset 31, -20 +; CHECK: .cfi_offset 16, -24 + + %val1 = load volatile double* @var + %val2 = load volatile double* @var + call void (...)* @foo() nounwind + store volatile double %val1, double* @var + store volatile double %val2, double* @var + ret void +} diff --git a/test/CodeGen/Mips/cmov.ll b/test/CodeGen/Mips/cmov.ll index 61f398314de1..0c13fb1adfbe 100644 --- a/test/CodeGen/Mips/cmov.ll +++ b/test/CodeGen/Mips/cmov.ll @@ -18,9 +18,9 @@ ; 32-CMP-DAG: lw $[[R0:[0-9]+]], %got(i3) ; 32-CMP-DAG: addiu $[[R1:[0-9]+]], ${{[0-9]+}}, %got(i1) -; 32-CMP-DAG: seleqz $[[T0:[0-9]+]], $[[R1]], $4 -; 32-CMP-DAG: selnez $[[T1:[0-9]+]], $[[R0]], $4 -; 32-CMP-DAG: or $[[T2:[0-9]+]], $[[T1]], $[[T0]] +; 32-CMP-DAG: selnez $[[T0:[0-9]+]], $[[R1]], $4 +; 32-CMP-DAG: seleqz $[[T1:[0-9]+]], $[[R0]], $4 +; 32-CMP-DAG: or $[[T2:[0-9]+]], $[[T0]], $[[T1]] ; 32-CMP-DAG: lw $2, 0($[[T2]]) ; 64-CMOV-DAG: ldr $[[R0:[0-9]+]] @@ -33,9 +33,9 @@ ; (setcc's result is i32 so bits 32-63 are undefined). It's not really ; needed. ; 64-CMP-DAG: sll $[[CC:[0-9]+]], $4, 0 -; 64-CMP-DAG: seleqz $[[T0:[0-9]+]], $[[R1]], $[[CC]] -; 64-CMP-DAG: selnez $[[T1:[0-9]+]], $[[R0]], $[[CC]] -; 64-CMP-DAG: or $[[T2:[0-9]+]], $[[T1]], $[[T0]] +; 64-CMP-DAG: selnez $[[T0:[0-9]+]], $[[R1]], $[[CC]] +; 64-CMP-DAG: seleqz $[[T1:[0-9]+]], $[[R0]], $[[CC]] +; 64-CMP-DAG: or $[[T2:[0-9]+]], $[[T0]], $[[T1]] ; 64-CMP-DAG: ld $2, 0($[[T2]]) define i32* @cmov1(i32 %s) nounwind readonly { @@ -58,9 +58,9 @@ entry: ; 32-CMP-DAG: addiu $[[R1:[0-9]+]], ${{[0-9]+}}, %got(d) ; 32-CMP-DAG: addiu $[[R0:[0-9]+]], ${{[0-9]+}}, %got(c) -; 32-CMP-DAG: seleqz $[[T0:[0-9]+]], $[[R0]], $4 -; 32-CMP-DAG: selnez $[[T1:[0-9]+]], $[[R1]], $4 -; 32-CMP-DAG: or $[[T2:[0-9]+]], $[[T1]], $[[T0]] +; 32-CMP-DAG: selnez $[[T0:[0-9]+]], $[[R0]], $4 +; 32-CMP-DAG: seleqz $[[T1:[0-9]+]], $[[R1]], $4 +; 32-CMP-DAG: or $[[T2:[0-9]+]], $[[T0]], $[[T1]] ; 32-CMP-DAG: lw $2, 0($[[T2]]) ; 64-CMOV: daddiu $[[R1:[0-9]+]], ${{[0-9]+}}, %got_disp(d) @@ -73,9 +73,9 @@ entry: ; (setcc's result is i32 so bits 32-63 are undefined). It's not really ; needed. 
; 64-CMP-DAG: sll $[[CC:[0-9]+]], $4, 0 -; 64-CMP-DAG: seleqz $[[T0:[0-9]+]], $[[R0]], $[[CC]] -; 64-CMP-DAG: selnez $[[T1:[0-9]+]], $[[R1]], $[[CC]] -; 64-CMP-DAG: or $[[T2:[0-9]+]], $[[T1]], $[[T0]] +; 64-CMP-DAG: selnez $[[T0:[0-9]+]], $[[R0]], $[[CC]] +; 64-CMP-DAG: seleqz $[[T1:[0-9]+]], $[[R1]], $[[CC]] +; 64-CMP-DAG: or $[[T2:[0-9]+]], $[[T0]], $[[T1]] ; 64-CMP-DAG: lw $2, 0($[[T2]]) define i32 @cmov2(i32 %s) nounwind readonly { @@ -97,16 +97,16 @@ entry: ; 32-CMOV: movz ${{[26]}}, $5, $[[R0]] ; 32-CMP-DAG: xori $[[CC:[0-9]+]], $4, 234 -; 32-CMP-DAG: selnez $[[T0:[0-9]+]], $5, $[[CC]] -; 32-CMP-DAG: seleqz $[[T1:[0-9]+]], $6, $[[CC]] +; 32-CMP-DAG: seleqz $[[T0:[0-9]+]], $5, $[[CC]] +; 32-CMP-DAG: selnez $[[T1:[0-9]+]], $6, $[[CC]] ; 32-CMP-DAG: or $2, $[[T0]], $[[T1]] ; 64-CMOV: xori $[[R0:[0-9]+]], $4, 234 ; 64-CMOV: movz ${{[26]}}, $5, $[[R0]] ; 64-CMP-DAG: xori $[[CC:[0-9]+]], $4, 234 -; 64-CMP-DAG: selnez $[[T0:[0-9]+]], $5, $[[CC]] -; 64-CMP-DAG: seleqz $[[T1:[0-9]+]], $6, $[[CC]] +; 64-CMP-DAG: seleqz $[[T0:[0-9]+]], $5, $[[CC]] +; 64-CMP-DAG: selnez $[[T1:[0-9]+]], $6, $[[CC]] ; 64-CMP-DAG: or $2, $[[T0]], $[[T1]] define i32 @cmov3(i32 %a, i32 %b, i32 %c) nounwind readnone { @@ -116,6 +116,39 @@ entry: ret i32 %cond } +; ALL-LABEL: cmov3_ne: + +; We won't check the result register since we can't know if the move is first +; or last. We do know it will be either one of two registers so we can at least +; check that. + +; FIXME: Use xori instead of addiu+xor. +; 32-CMOV: addiu $[[R0:[0-9]+]], $zero, 234 +; 32-CMOV: xor $[[R1:[0-9]+]], $4, $[[R0]] +; 32-CMOV: movn ${{[26]}}, $5, $[[R1]] + +; 32-CMP-DAG: xori $[[CC:[0-9]+]], $4, 234 +; 32-CMP-DAG: selnez $[[T0:[0-9]+]], $5, $[[CC]] +; 32-CMP-DAG: seleqz $[[T1:[0-9]+]], $6, $[[CC]] +; 32-CMP-DAG: or $2, $[[T0]], $[[T1]] + +; FIXME: Use xori instead of addiu+xor. +; 64-CMOV: addiu $[[R0:[0-9]+]], $zero, 234 +; 64-CMOV: xor $[[R1:[0-9]+]], $4, $[[R0]] +; 64-CMOV: movn ${{[26]}}, $5, $[[R1]] + +; 64-CMP-DAG: xori $[[CC:[0-9]+]], $4, 234 +; 64-CMP-DAG: selnez $[[T0:[0-9]+]], $5, $[[CC]] +; 64-CMP-DAG: seleqz $[[T1:[0-9]+]], $6, $[[CC]] +; 64-CMP-DAG: or $2, $[[T0]], $[[T1]] + +define i32 @cmov3_ne(i32 %a, i32 %b, i32 %c) nounwind readnone { +entry: + %cmp = icmp ne i32 %a, 234 + %cond = select i1 %cmp, i32 %b, i32 %c + ret i32 %cond +} + ; ALL-LABEL: cmov4: ; We won't check the result register since we can't know if the move is first @@ -128,6 +161,45 @@ entry: ; 32-CMOV-DAG: movz $[[R1]], $6, $[[R0]] ; 32-CMOV-DAG: movz $[[R2]], $7, $[[R0]] +; 32-CMP-DAG: xori $[[R0:[0-9]+]], $4, 234 +; 32-CMP-DAG: lw $[[R1:[0-9]+]], 16($sp) +; 32-CMP-DAG: lw $[[R2:[0-9]+]], 20($sp) +; 32-CMP-DAG: seleqz $[[T0:[0-9]+]], $6, $[[R0]] +; 32-CMP-DAG: seleqz $[[T1:[0-9]+]], $7, $[[R0]] +; 32-CMP-DAG: selnez $[[T2:[0-9]+]], $[[R1]], $[[R0]] +; 32-CMP-DAG: selnez $[[T3:[0-9]+]], $[[R2]], $[[R0]] +; 32-CMP-DAG: or $2, $[[T0]], $[[T2]] +; 32-CMP-DAG: or $3, $[[T1]], $[[T3]] + +; 64-CMOV: xori $[[R0:[0-9]+]], $4, 234 +; 64-CMOV: movz ${{[26]}}, $5, $[[R0]] + +; 64-CMP-DAG: xori $[[R0:[0-9]+]], $4, 234 +; 64-CMP-DAG: seleqz $[[T0:[0-9]+]], $5, $[[R0]] +; 64-CMP-DAG: selnez $[[T1:[0-9]+]], $6, $[[R0]] +; 64-CMP-DAG: or $2, $[[T0]], $[[T1]] + +define i64 @cmov4(i32 %a, i64 %b, i64 %c) nounwind readnone { +entry: + %cmp = icmp eq i32 %a, 234 + %cond = select i1 %cmp, i64 %b, i64 %c + ret i64 %cond +} + +; ALL-LABEL: cmov4_ne: + +; We won't check the result register since we can't know if the move is first +; or last. 
We do know it will be one of two registers so we can at least check +; that. + +; FIXME: Use xori instead of addiu+xor. +; 32-CMOV-DAG: addiu $[[R0:[0-9]+]], $zero, 234 +; 32-CMOV-DAG: xor $[[R1:[0-9]+]], $4, $[[R0]] +; 32-CMOV-DAG: lw $[[R2:2]], 16($sp) +; 32-CMOV-DAG: lw $[[R3:3]], 20($sp) +; 32-CMOV-DAG: movn $[[R2]], $6, $[[R1]] +; 32-CMOV-DAG: movn $[[R3]], $7, $[[R1]] + ; 32-CMP-DAG: xori $[[R0:[0-9]+]], $4, 234 ; 32-CMP-DAG: lw $[[R1:[0-9]+]], 16($sp) ; 32-CMP-DAG: lw $[[R2:[0-9]+]], 20($sp) @@ -138,17 +210,19 @@ entry: ; 32-CMP-DAG: or $2, $[[T0]], $[[T2]] ; 32-CMP-DAG: or $3, $[[T1]], $[[T3]] -; 64-CMOV: xori $[[R0:[0-9]+]], $4, 234 -; 64-CMOV: movz ${{[26]}}, $5, $[[R0]] +; FIXME: Use xori instead of addiu+xor. +; 64-CMOV: addiu $[[R0:[0-9]+]], $zero, 234 +; 64-CMOV: xor $[[R1:[0-9]+]], $4, $[[R0]] +; 64-CMOV: movn ${{[26]}}, $5, $[[R1]] ; 64-CMP-DAG: xori $[[R0:[0-9]+]], $4, 234 ; 64-CMP-DAG: selnez $[[T0:[0-9]+]], $5, $[[R0]] ; 64-CMP-DAG: seleqz $[[T1:[0-9]+]], $6, $[[R0]] ; 64-CMP-DAG: or $2, $[[T0]], $[[T1]] -define i64 @cmov4(i32 %a, i64 %b, i64 %c) nounwind readnone { +define i64 @cmov4_ne(i32 %a, i64 %b, i64 %c) nounwind readnone { entry: - %cmp = icmp eq i32 %a, 234 + %cmp = icmp ne i32 %a, 234 %cond = select i1 %cmp, i64 %b, i64 %c ret i64 %cond } @@ -172,8 +246,8 @@ entry: ; 32-CMP-DAG: addiu $[[I5:[0-9]+]], $zero, 5 ; 32-CMP-DAG: slti $[[R0:[0-9]+]], $4, 32767 ; FIXME: We can do better than this by using selccz to choose between +0 and +2 -; 32-CMP-DAG: selnez $[[T0:[0-9]+]], $[[I3]], $[[R0]] -; 32-CMP-DAG: seleqz $[[T1:[0-9]+]], $[[I5]], $[[R0]] +; 32-CMP-DAG: seleqz $[[T0:[0-9]+]], $[[I3]], $[[R0]] +; 32-CMP-DAG: selnez $[[T1:[0-9]+]], $[[I5]], $[[R0]] ; 32-CMP-DAG: or $2, $[[T0]], $[[T1]] ; 64-CMOV-DAG: addiu $[[I3:[0-9]+]], $zero, 3 @@ -185,8 +259,8 @@ entry: ; 64-CMP-DAG: addiu $[[I5:[0-9]+]], $zero, 5 ; 64-CMP-DAG: slti $[[R0:[0-9]+]], $4, 32767 ; FIXME: We can do better than this by using selccz to choose between +0 and +2 -; 64-CMP-DAG: selnez $[[T0:[0-9]+]], $[[I3]], $[[R0]] -; 64-CMP-DAG: seleqz $[[T1:[0-9]+]], $[[I5]], $[[R0]] +; 64-CMP-DAG: seleqz $[[T0:[0-9]+]], $[[I3]], $[[R0]] +; 64-CMP-DAG: selnez $[[T1:[0-9]+]], $[[I5]], $[[R0]] ; 64-CMP-DAG: or $2, $[[T0]], $[[T1]] define i32 @slti0(i32 %a) { @@ -246,8 +320,8 @@ entry: ; 32-CMP-DAG: addiu $[[I5:[0-9]+]], $zero, 5 ; 32-CMP-DAG: slti $[[R0:[0-9]+]], $4, -32768 ; FIXME: We can do better than this by using selccz to choose between +0 and +2 -; 32-CMP-DAG: selnez $[[T0:[0-9]+]], $[[I3]], $[[R0]] -; 32-CMP-DAG: seleqz $[[T1:[0-9]+]], $[[I5]], $[[R0]] +; 32-CMP-DAG: seleqz $[[T0:[0-9]+]], $[[I3]], $[[R0]] +; 32-CMP-DAG: selnez $[[T1:[0-9]+]], $[[I5]], $[[R0]] ; 32-CMP-DAG: or $2, $[[T0]], $[[T1]] ; 64-CMOV-DAG: addiu $[[I3:[0-9]+]], $zero, 3 @@ -259,8 +333,8 @@ entry: ; 64-CMP-DAG: addiu $[[I5:[0-9]+]], $zero, 5 ; 64-CMP-DAG: slti $[[R0:[0-9]+]], $4, -32768 ; FIXME: We can do better than this by using selccz to choose between +0 and +2 -; 64-CMP-DAG: selnez $[[T0:[0-9]+]], $[[I3]], $[[R0]] -; 64-CMP-DAG: seleqz $[[T1:[0-9]+]], $[[I5]], $[[R0]] +; 64-CMP-DAG: seleqz $[[T0:[0-9]+]], $[[I3]], $[[R0]] +; 64-CMP-DAG: selnez $[[T1:[0-9]+]], $[[I5]], $[[R0]] ; 64-CMP-DAG: or $2, $[[T0]], $[[T1]] define i32 @slti2(i32 %a) { @@ -329,8 +403,8 @@ entry: ; 32-CMP-DAG: slt $[[CC0:[0-9]+]], $zero, $4 ; 32-CMP-DAG: addiu $[[I32766:[0-9]+]], $zero, 32766 ; 32-CMP-DAG: sltu $[[CC1:[0-9]+]], $[[I32766]], $5 -; 32-CMP-DAG: seleqz $[[CC2:[0-9]+]], $[[CC0]], $4 -; 32-CMP-DAG: selnez $[[CC3:[0-9]+]], $[[CC1]], $4 +; 32-CMP-DAG: 
selnez $[[CC2:[0-9]+]], $[[CC0]], $4 +; 32-CMP-DAG: seleqz $[[CC3:[0-9]+]], $[[CC1]], $4 ; 32-CMP: or $[[CC:[0-9]+]], $[[CC3]], $[[CC2]] ; 32-CMP-DAG: addiu $[[I5:[0-9]+]], $zero, 5 ; 32-CMP-DAG: addiu $[[I4:[0-9]+]], $zero, 4 @@ -349,8 +423,8 @@ entry: ; 64-CMP-DAG: slti $[[R0:[0-9]+]], $4, 32767 ; FIXME: We can do better than this by adding/subtracting the result of slti ; to/from one of the constants. -; 64-CMP-DAG: selnez $[[T0:[0-9]+]], $[[I5]], $[[R0]] -; 64-CMP-DAG: seleqz $[[T1:[0-9]+]], $[[I4]], $[[R0]] +; 64-CMP-DAG: seleqz $[[T0:[0-9]+]], $[[I5]], $[[R0]] +; 64-CMP-DAG: selnez $[[T1:[0-9]+]], $[[I4]], $[[R0]] ; 64-CMP-DAG: or $2, $[[T0]], $[[T1]] define i64 @slti64_0(i64 %a) { @@ -374,8 +448,8 @@ entry: ; 32-CMP-DAG: slt $[[CC0:[0-9]+]], $zero, $4 ; 32-CMP-DAG: addiu $[[I32766:[0-9]+]], $zero, 32767 ; 32-CMP-DAG: sltu $[[CC1:[0-9]+]], $[[I32766]], $5 -; 32-CMP-DAG: seleqz $[[CC2:[0-9]+]], $[[CC0]], $4 -; 32-CMP-DAG: selnez $[[CC3:[0-9]+]], $[[CC1]], $4 +; 32-CMP-DAG: selnez $[[CC2:[0-9]+]], $[[CC0]], $4 +; 32-CMP-DAG: seleqz $[[CC3:[0-9]+]], $[[CC1]], $4 ; 32-CMP: or $[[CC:[0-9]+]], $[[CC3]], $[[CC2]] ; 32-CMP-DAG: addiu $[[I5:[0-9]+]], $zero, 5 ; 32-CMP-DAG: addiu $[[I4:[0-9]+]], $zero, 4 @@ -423,8 +497,8 @@ entry: ; 64-CMP-DAG: slti $[[R0:[0-9]+]], $4, -32768 ; FIXME: We can do better than this by adding/subtracting the result of slti ; to/from one of the constants. -; 64-CMP-DAG: selnez $[[T0:[0-9]+]], $[[I3]], $[[R0]] -; 64-CMP-DAG: seleqz $[[T1:[0-9]+]], $[[I4]], $[[R0]] +; 64-CMP-DAG: seleqz $[[T0:[0-9]+]], $[[I3]], $[[R0]] +; 64-CMP-DAG: selnez $[[T1:[0-9]+]], $[[I4]], $[[R0]] ; 64-CMP-DAG: or $2, $[[T0]], $[[T1]] define i64 @slti64_2(i64 %a) { @@ -476,8 +550,8 @@ entry: ; 32-CMP-DAG: addiu $[[I5:[0-9]+]], $zero, 5 ; 32-CMP-DAG: sltiu $[[R0:[0-9]+]], $4, 32767 ; FIXME: We can do better than this by using selccz to choose between +0 and +2 -; 32-CMP-DAG: selnez $[[T0:[0-9]+]], $[[I3]], $[[R0]] -; 32-CMP-DAG: seleqz $[[T1:[0-9]+]], $[[I5]], $[[R0]] +; 32-CMP-DAG: seleqz $[[T0:[0-9]+]], $[[I3]], $[[R0]] +; 32-CMP-DAG: selnez $[[T1:[0-9]+]], $[[I5]], $[[R0]] ; 32-CMP-DAG: or $2, $[[T0]], $[[T1]] ; 64-CMOV-DAG: addiu $[[I3:[0-9]+]], $zero, 3 @@ -489,8 +563,8 @@ entry: ; 64-CMP-DAG: addiu $[[I5:[0-9]+]], $zero, 5 ; 64-CMP-DAG: sltiu $[[R0:[0-9]+]], $4, 32767 ; FIXME: We can do better than this by using selccz to choose between +0 and +2 -; 64-CMP-DAG: selnez $[[T0:[0-9]+]], $[[I3]], $[[R0]] -; 64-CMP-DAG: seleqz $[[T1:[0-9]+]], $[[I5]], $[[R0]] +; 64-CMP-DAG: seleqz $[[T0:[0-9]+]], $[[I3]], $[[R0]] +; 64-CMP-DAG: selnez $[[T1:[0-9]+]], $[[I5]], $[[R0]] ; 64-CMP-DAG: or $2, $[[T0]], $[[T1]] define i32 @sltiu0(i32 %a) { @@ -550,8 +624,8 @@ entry: ; 32-CMP-DAG: addiu $[[I5:[0-9]+]], $zero, 5 ; 32-CMP-DAG: sltiu $[[R0:[0-9]+]], $4, -32768 ; FIXME: We can do better than this by using selccz to choose between +0 and +2 -; 32-CMP-DAG: selnez $[[T0:[0-9]+]], $[[I3]], $[[R0]] -; 32-CMP-DAG: seleqz $[[T1:[0-9]+]], $[[I5]], $[[R0]] +; 32-CMP-DAG: seleqz $[[T0:[0-9]+]], $[[I3]], $[[R0]] +; 32-CMP-DAG: selnez $[[T1:[0-9]+]], $[[I5]], $[[R0]] ; 32-CMP-DAG: or $2, $[[T0]], $[[T1]] ; 64-CMOV-DAG: addiu $[[I3:[0-9]+]], $zero, 3 @@ -563,8 +637,8 @@ entry: ; 64-CMP-DAG: addiu $[[I5:[0-9]+]], $zero, 5 ; 64-CMP-DAG: sltiu $[[R0:[0-9]+]], $4, -32768 ; FIXME: We can do better than this by using selccz to choose between +0 and +2 -; 64-CMP-DAG: selnez $[[T0:[0-9]+]], $[[I3]], $[[R0]] -; 64-CMP-DAG: seleqz $[[T1:[0-9]+]], $[[I5]], $[[R0]] +; 64-CMP-DAG: seleqz $[[T0:[0-9]+]], $[[I3]], $[[R0]] +; 
64-CMP-DAG: selnez $[[T1:[0-9]+]], $[[I5]], $[[R0]] ; 64-CMP-DAG: or $2, $[[T0]], $[[T1]] define i32 @sltiu2(i32 %a) { diff --git a/test/CodeGen/Mips/countleading.ll b/test/CodeGen/Mips/countleading.ll index 81fb2b44942b..6e63cff123cf 100644 --- a/test/CodeGen/Mips/countleading.ll +++ b/test/CodeGen/Mips/countleading.ll @@ -52,9 +52,9 @@ entry: ; MIPS32-GT-R1-DAG: clz $[[R1:[0-9]+]], $5 ; MIPS32-GT-R1-DAG: addiu $[[R2:2+]], $[[R0]], 32 ; MIPS32-R1-R2-DAG: movn $[[R2]], $[[R1]], $5 -; MIPS32-R6-DAG: selnez $[[R5:[0-9]+]], $[[R2]], $5 -; MIPS32-R6-DAG: seleqz $[[R6:[0-9]+]], $[[R1]], $5 -; MIPS32-R6-DAG: or $2, $[[R5]], $[[R6]] +; MIPS32-R6-DAG: seleqz $[[R5:[0-9]+]], $[[R2]], $5 +; MIPS32-R6-DAG: selnez $[[R6:[0-9]+]], $[[R1]], $5 +; MIPS32-R6-DAG: or $2, $[[R6]], $[[R5]] ; MIPS32-GT-R1-DAG: addiu $3, $zero, 0 ; MIPS64-GT-R1: dclz $2, $4 diff --git a/test/CodeGen/Mips/eh-return32.ll b/test/CodeGen/Mips/eh-return32.ll index c3003b34b162..748050c4d34b 100644 --- a/test/CodeGen/Mips/eh-return32.ll +++ b/test/CodeGen/Mips/eh-return32.ll @@ -1,4 +1,6 @@ -; RUN: llc -march=mipsel -mcpu=mips32 < %s | FileCheck %s +; RUN: llc -march=mipsel -mcpu=mips32 -asm-show-inst < %s | FileCheck %s -check-prefix=CHECK -check-prefix=NOT-R6 +; RUN: llc -march=mipsel -mcpu=mips32r2 -asm-show-inst < %s | FileCheck %s -check-prefix=CHECK -check-prefix=NOT-R6 +; RUN: llc -march=mipsel -mcpu=mips32r6 -asm-show-inst < %s | FileCheck %s -check-prefix=CHECK -check-prefix=R6 declare void @llvm.eh.return.i32(i32, i8*) declare void @foo(...) @@ -9,7 +11,7 @@ entry: call void @llvm.eh.return.i32(i32 %offset, i8* %handler) unreachable -; CHECK: f1 +; CHECK: f1: ; CHECK: addiu $sp, $sp, -[[spoffset:[0-9]+]] ; check that $a0-$a3 are saved on stack. @@ -41,7 +43,8 @@ entry: ; CHECK: addiu $sp, $sp, [[spoffset]] ; CHECK: move $25, $2 ; CHECK: move $ra, $2 -; CHECK: jr $ra +; NOT-R6: jr $ra # &1 | FileCheck %s -check-prefix=64-FP64A +; RUN: llc -march=mips64el -mcpu=mips64 -mattr=fp64 < %s | FileCheck %s -check-prefix=ALL -check-prefix=64-NO-FP64A +; RUN: not llc -march=mips64el -mcpu=mips64 -mattr=fp64,nooddspreg < %s 2>&1 | FileCheck %s -check-prefix=64-FP64A + +; 64-FP64A: LLVM ERROR: -mattr=+nooddspreg requires the O32 ABI. + +declare double @dbl(); + +define double @call1(double %d, ...) 
{ + ret double %d + +; ALL-LABEL: call1: + +; 32R2-NO-FP64A-LE-NOT: addiu $sp, $sp +; 32R2-NO-FP64A-LE: mtc1 $4, $f0 +; 32R2-NO-FP64A-LE: mthc1 $5, $f0 + +; 32R2-NO-FP64A-BE-NOT: addiu $sp, $sp +; 32R2-NO-FP64A-BE: mtc1 $5, $f0 +; 32R2-NO-FP64A-BE: mthc1 $4, $f0 + +; 32R2-FP64A-LE: addiu $sp, $sp, -8 +; 32R2-FP64A-LE: sw $4, 0($sp) +; 32R2-FP64A-LE: sw $5, 4($sp) +; 32R2-FP64A-LE: ldc1 $f0, 0($sp) + +; 32R2-FP64A-BE: addiu $sp, $sp, -8 +; 32R2-FP64A-BE: sw $5, 0($sp) +; 32R2-FP64A-BE: sw $4, 4($sp) +; 32R2-FP64A-BE: ldc1 $f0, 0($sp) + +; 64-NO-FP64A: daddiu $sp, $sp, -64 +; 64-NO-FP64A: mov.d $f0, $f12 +} + +define double @call2(i32 %i, double %d) { + ret double %d + +; ALL-LABEL: call2: + +; 32R2-NO-FP64A-LE: mtc1 $6, $f0 +; 32R2-NO-FP64A-LE: mthc1 $7, $f0 + +; 32R2-NO-FP64A-BE: mtc1 $7, $f0 +; 32R2-NO-FP64A-BE: mthc1 $6, $f0 + +; 32R2-FP64A-LE: addiu $sp, $sp, -8 +; 32R2-FP64A-LE: sw $6, 0($sp) +; 32R2-FP64A-LE: sw $7, 4($sp) +; 32R2-FP64A-LE: ldc1 $f0, 0($sp) + +; 32R2-FP64A-BE: addiu $sp, $sp, -8 +; 32R2-FP64A-BE: sw $7, 0($sp) +; 32R2-FP64A-BE: sw $6, 4($sp) +; 32R2-FP64A-BE: ldc1 $f0, 0($sp) + +; 64-NO-FP64A-NOT: daddiu $sp, $sp +; 64-NO-FP64A: mov.d $f0, $f13 +} + +define double @call3(float %f1, float %f2, double %d) { + ret double %d + +; ALL-LABEL: call3: + +; 32R2-NO-FP64A-LE: mtc1 $6, $f0 +; 32R2-NO-FP64A-LE: mthc1 $7, $f0 + +; 32R2-NO-FP64A-BE: mtc1 $7, $f0 +; 32R2-NO-FP64A-BE: mthc1 $6, $f0 + +; 32R2-FP64A-LE: addiu $sp, $sp, -8 +; 32R2-FP64A-LE: sw $6, 0($sp) +; 32R2-FP64A-LE: sw $7, 4($sp) +; 32R2-FP64A-LE: ldc1 $f0, 0($sp) + +; 32R2-FP64A-BE: addiu $sp, $sp, -8 +; 32R2-FP64A-BE: sw $7, 0($sp) +; 32R2-FP64A-BE: sw $6, 4($sp) +; 32R2-FP64A-BE: ldc1 $f0, 0($sp) + +; 64-NO-FP64A-NOT: daddiu $sp, $sp +; 64-NO-FP64A: mov.d $f0, $f14 +} + +define double @call4(float %f, double %d, ...) { + ret double %d + +; ALL-LABEL: call4: + +; 32R2-NO-FP64A-LE: mtc1 $6, $f0 +; 32R2-NO-FP64A-LE: mthc1 $7, $f0 + +; 32R2-NO-FP64A-BE: mtc1 $7, $f0 +; 32R2-NO-FP64A-BE: mthc1 $6, $f0 + +; 32R2-FP64A-LE: addiu $sp, $sp, -8 +; 32R2-FP64A-LE: sw $6, 0($sp) +; 32R2-FP64A-LE: sw $7, 4($sp) +; 32R2-FP64A-LE: ldc1 $f0, 0($sp) + +; 32R2-FP64A-BE: addiu $sp, $sp, -8 +; 32R2-FP64A-BE: sw $7, 0($sp) +; 32R2-FP64A-BE: sw $6, 4($sp) +; 32R2-FP64A-BE: ldc1 $f0, 0($sp) + +; 64-NO-FP64A: daddiu $sp, $sp, -48 +; 64-NO-FP64A: mov.d $f0, $f13 +} + +define double @call5(double %a, double %b, ...) 
{ + %1 = fsub double %a, %b + ret double %1 + +; ALL-LABEL: call5: + +; 32R2-NO-FP64A-LE-DAG: mtc1 $4, $[[T0:f[0-9]+]] +; 32R2-NO-FP64A-LE-DAG: mthc1 $5, $[[T0:f[0-9]+]] +; 32R2-NO-FP64A-LE-DAG: mtc1 $6, $[[T1:f[0-9]+]] +; 32R2-NO-FP64A-LE-DAG: mthc1 $7, $[[T1:f[0-9]+]] +; 32R2-NO-FP64A-LE: sub.d $f0, $[[T0]], $[[T1]] + +; 32R2-NO-FP64A-BE-DAG: mtc1 $5, $[[T0:f[0-9]+]] +; 32R2-NO-FP64A-BE-DAG: mthc1 $4, $[[T0:f[0-9]+]] +; 32R2-NO-FP64A-BE-DAG: mtc1 $7, $[[T1:f[0-9]+]] +; 32R2-NO-FP64A-BE-DAG: mthc1 $6, $[[T1:f[0-9]+]] +; 32R2-NO-FP64A-BE: sub.d $f0, $[[T0]], $[[T1]] + +; 32R2-FP64A-LE: addiu $sp, $sp, -8 +; 32R2-FP64A-LE: sw $6, 0($sp) +; 32R2-FP64A-LE: sw $7, 4($sp) +; 32R2-FP64A-LE: ldc1 $[[T1:f[0-9]+]], 0($sp) +; 32R2-FP64A-LE: sw $4, 0($sp) +; 32R2-FP64A-LE: sw $5, 4($sp) +; 32R2-FP64A-LE: ldc1 $[[T0:f[0-9]+]], 0($sp) +; 32R2-FP64A-LE: sub.d $f0, $[[T0]], $[[T1]] + +; 32R2-FP64A-BE: addiu $sp, $sp, -8 +; 32R2-FP64A-BE: sw $7, 0($sp) +; 32R2-FP64A-BE: sw $6, 4($sp) +; 32R2-FP64A-BE: ldc1 $[[T1:f[0-9]+]], 0($sp) +; 32R2-FP64A-BE: sw $5, 0($sp) +; 32R2-FP64A-BE: sw $4, 4($sp) +; 32R2-FP64A-BE: ldc1 $[[T0:f[0-9]+]], 0($sp) +; 32R2-FP64A-BE: sub.d $f0, $[[T0]], $[[T1]] + +; 64-NO-FP64A: sub.d $f0, $f12, $f13 +} + +define double @move_from(double %d) { + %1 = call double @dbl() + %2 = call double @call2(i32 0, double %1) + ret double %2 + +; ALL-LABEL: move_from: + +; 32R2-NO-FP64A-LE-DAG: mfc1 $6, $f0 +; 32R2-NO-FP64A-LE-DAG: mfhc1 $7, $f0 + +; 32R2-NO-FP64A-BE-DAG: mfc1 $7, $f0 +; 32R2-NO-FP64A-BE-DAG: mfhc1 $6, $f0 + +; 32R2-FP64A-LE: addiu $sp, $sp, -32 +; 32R2-FP64A-LE: sdc1 $f0, 16($sp) +; 32R2-FP64A-LE: lw $6, 16($sp) +; FIXME: This store is redundant +; 32R2-FP64A-LE: sdc1 $f0, 16($sp) +; 32R2-FP64A-LE: lw $7, 20($sp) + +; 32R2-FP64A-BE: addiu $sp, $sp, -32 +; 32R2-FP64A-BE: sdc1 $f0, 16($sp) +; 32R2-FP64A-BE: lw $6, 20($sp) +; FIXME: This store is redundant +; 32R2-FP64A-BE: sdc1 $f0, 16($sp) +; 32R2-FP64A-BE: lw $7, 16($sp) + +; 64-NO-FP64A: mov.d $f13, $f0 +} diff --git a/test/CodeGen/Mips/fpxx.ll b/test/CodeGen/Mips/fpxx.ll new file mode 100644 index 000000000000..7e2ed22e2d80 --- /dev/null +++ b/test/CodeGen/Mips/fpxx.ll @@ -0,0 +1,221 @@ +; RUN: llc -march=mipsel -mcpu=mips32 < %s | FileCheck %s -check-prefix=ALL -check-prefix=32-NOFPXX +; RUN: llc -march=mipsel -mcpu=mips32 -mattr=fpxx < %s | FileCheck %s -check-prefix=ALL -check-prefix=32-FPXX + +; RUN: llc -march=mipsel -mcpu=mips32r2 < %s | FileCheck %s -check-prefix=ALL -check-prefix=32R2-NOFPXX +; RUN: llc -march=mipsel -mcpu=mips32r2 -mattr=fpxx < %s | FileCheck %s -check-prefix=ALL -check-prefix=32R2-FPXX + +; RUN: llc -march=mips64 -mcpu=mips4 < %s | FileCheck %s -check-prefix=ALL -check-prefix=4-NOFPXX +; RUN: not llc -march=mips64 -mcpu=mips4 -mattr=fpxx < %s 2>&1 | FileCheck %s -check-prefix=4-FPXX + +; RUN: llc -march=mips64 -mcpu=mips64 < %s | FileCheck %s -check-prefix=ALL -check-prefix=64-NOFPXX +; RUN: not llc -march=mips64 -mcpu=mips64 -mattr=fpxx < %s 2>&1 | FileCheck %s -check-prefix=64-FPXX + +; RUN-TODO: llc -march=mips64 -mcpu=mips4 -mattr=-n64,+o32 < %s | FileCheck %s -check-prefix=ALL -check-prefix=4-O32-NOFPXX +; RUN-TODO: llc -march=mips64 -mcpu=mips4 -mattr=-n64,+o32 -mattr=fpxx < %s | FileCheck %s -check-prefix=ALL -check-prefix=4-O32-FPXX + +; RUN-TODO: llc -march=mips64 -mcpu=mips64 -mattr=-n64,+o32 < %s | FileCheck %s -check-prefix=ALL -check-prefix=64-O32-NOFPXX +; RUN-TODO: llc -march=mips64 -mcpu=mips64 -mattr=-n64,+o32 -mattr=fpxx < %s | FileCheck %s -check-prefix=ALL 
-check-prefix=64-O32-FPXX + +declare double @dbl(); + +; 4-FPXX: LLVM ERROR: FPXX is not permitted for the N32/N64 ABI's. +; 64-FPXX: LLVM ERROR: FPXX is not permitted for the N32/N64 ABI's. + +define double @test1(double %d, ...) { + ret double %d + +; ALL-LABEL: test1: + +; 32-NOFPXX: mtc1 $4, $f0 +; 32-NOFPXX: mtc1 $5, $f1 + +; 32-FPXX: addiu $sp, $sp, -8 +; 32-FPXX: sw $4, 0($sp) +; 32-FPXX: sw $5, 4($sp) +; 32-FPXX: ldc1 $f0, 0($sp) + +; 32R2-NOFPXX: mtc1 $4, $f0 +; 32R2-NOFPXX: mthc1 $5, $f0 + +; 32R2-FPXX: mtc1 $4, $f0 +; 32R2-FPXX: mthc1 $5, $f0 + +; floats/doubles are not passed in integer registers for n64, so dmtc1 is not used. +; 4-NOFPXX: mov.d $f0, $f12 + +; 64-NOFPXX: mov.d $f0, $f12 +} + +define double @test2(i32 %i, double %d) { + ret double %d + +; ALL-LABEL: test2: + +; 32-NOFPXX: mtc1 $6, $f0 +; 32-NOFPXX: mtc1 $7, $f1 + +; 32-FPXX: addiu $sp, $sp, -8 +; 32-FPXX: sw $6, 0($sp) +; 32-FPXX: sw $7, 4($sp) +; 32-FPXX: ldc1 $f0, 0($sp) + +; 32R2-NOFPXX: mtc1 $6, $f0 +; 32R2-NOFPXX: mthc1 $7, $f0 + +; 32R2-FPXX: mtc1 $6, $f0 +; 32R2-FPXX: mthc1 $7, $f0 + +; 4-NOFPXX: mov.d $f0, $f13 + +; 64-NOFPXX: mov.d $f0, $f13 +} + +define double @test3(float %f1, float %f2, double %d) { + ret double %d + +; ALL-LABEL: test3: + +; 32-NOFPXX: mtc1 $6, $f0 +; 32-NOFPXX: mtc1 $7, $f1 + +; 32-FPXX: addiu $sp, $sp, -8 +; 32-FPXX: sw $6, 0($sp) +; 32-FPXX: sw $7, 4($sp) +; 32-FPXX: ldc1 $f0, 0($sp) + +; 32R2-NOFPXX: mtc1 $6, $f0 +; 32R2-NOFPXX: mthc1 $7, $f0 + +; 32R2-FPXX: mtc1 $6, $f0 +; 32R2-FPXX: mthc1 $7, $f0 + +; 4-NOFPXX: mov.d $f0, $f14 + +; 64-NOFPXX: mov.d $f0, $f14 +} + +define double @test4(float %f, double %d, ...) { + ret double %d + +; ALL-LABEL: test4: + +; 32-NOFPXX: mtc1 $6, $f0 +; 32-NOFPXX: mtc1 $7, $f1 + +; 32-FPXX: addiu $sp, $sp, -8 +; 32-FPXX: sw $6, 0($sp) +; 32-FPXX: sw $7, 4($sp) +; 32-FPXX: ldc1 $f0, 0($sp) + +; 32R2-NOFPXX: mtc1 $6, $f0 +; 32R2-NOFPXX: mthc1 $7, $f0 + +; 32R2-FPXX: mtc1 $6, $f0 +; 32R2-FPXX: mthc1 $7, $f0 + +; 4-NOFPXX: mov.d $f0, $f13 + +; 64-NOFPXX: mov.d $f0, $f13 +} + +define double @test5() { + ret double 0.000000e+00 + +; ALL-LABEL: test5: + +; 32-NOFPXX: mtc1 $zero, $f0 +; 32-NOFPXX: mtc1 $zero, $f1 + +; 32-FPXX: addiu $sp, $sp, -8 +; 32-FPXX: sw $zero, 0($sp) +; 32-FPXX: sw $zero, 4($sp) +; 32-FPXX: ldc1 $f0, 0($sp) + +; 32R2-NOFPXX: mtc1 $zero, $f0 +; 32R2-NOFPXX: mthc1 $zero, $f0 + +; 32R2-FPXX: mtc1 $zero, $f0 +; 32R2-FPXX: mthc1 $zero, $f0 + +; 4-NOFPXX: dmtc1 $zero, $f0 + +; 64-NOFPXX: dmtc1 $zero, $f0 +} + +define double @test6(double %a, double %b, ...) 
{
+  %1 = fsub double %a, %b
+  ret double %1
+
+; ALL-LABEL: test6:
+
+; 32-NOFPXX-DAG: mtc1 $4, $[[T0:f[0-9]+]]
+; 32-NOFPXX-DAG: mtc1 $5, ${{f[0-9]*[13579]}}
+; 32-NOFPXX-DAG: mtc1 $6, $[[T1:f[0-9]+]]
+; 32-NOFPXX-DAG: mtc1 $7, ${{f[0-9]*[13579]}}
+; 32-NOFPXX: sub.d $f0, $[[T0]], $[[T1]]
+
+; 32-FPXX: addiu $sp, $sp, -8
+; 32-FPXX: sw $6, 0($sp)
+; 32-FPXX: sw $7, 4($sp)
+; 32-FPXX: ldc1 $[[T1:f[0-9]+]], 0($sp)
+; 32-FPXX: sw $4, 0($sp)
+; 32-FPXX: sw $5, 4($sp)
+; 32-FPXX: ldc1 $[[T0:f[0-9]+]], 0($sp)
+; 32-FPXX: sub.d $f0, $[[T0]], $[[T1]]
+
+; 32R2-NOFPXX-DAG: mtc1 $4, $[[T0:f[0-9]+]]
+; 32R2-NOFPXX-DAG: mthc1 $5, $[[T0]]
+; 32R2-NOFPXX-DAG: mtc1 $6, $[[T1:f[0-9]+]]
+; 32R2-NOFPXX-DAG: mthc1 $7, $[[T1]]
+; 32R2-NOFPXX: sub.d $f0, $[[T0]], $[[T1]]
+
+; 32R2-FPXX-DAG: mtc1 $4, $[[T0:f[0-9]+]]
+; 32R2-FPXX-DAG: mthc1 $5, $[[T0]]
+; 32R2-FPXX-DAG: mtc1 $6, $[[T1:f[0-9]+]]
+; 32R2-FPXX-DAG: mthc1 $7, $[[T1]]
+; 32R2-FPXX: sub.d $f0, $[[T0]], $[[T1]]
+
+; floats/doubles are not passed in integer registers for n64, so dmtc1 is not used.
+; 4-NOFPXX: sub.d $f0, $f12, $f13
+
+; floats/doubles are not passed in integer registers for n64, so dmtc1 is not used.
+; 64-NOFPXX: sub.d $f0, $f12, $f13
+}
+
+define double @move_from1(double %d) {
+  %1 = call double @dbl()
+  %2 = call double @test2(i32 0, double %1)
+  ret double %2
+
+; ALL-LABEL: move_from1:
+
+; 32-NOFPXX-DAG: mfc1 $6, $f0
+; 32-NOFPXX-DAG: mfc1 $7, $f1
+
+; 32-FPXX: addiu $sp, $sp, -32
+; 32-FPXX: sdc1 $f0, 16($sp)
+; 32-FPXX: lw $6, 16($sp)
+; FIXME: This store is redundant
+; 32-FPXX: sdc1 $f0, 16($sp)
+; 32-FPXX: lw $7, 20($sp)
+
+; 32R2-NOFPXX-DAG: mfc1 $6, $f0
+; 32R2-NOFPXX-DAG: mfhc1 $7, $f0
+
+; 32R2-FPXX-DAG: mfc1 $6, $f0
+; 32R2-FPXX-DAG: mfhc1 $7, $f0
+
+; floats/doubles are not passed in integer registers for n64, so dmfc1 is not used.
+; We can't use inline assembly to force a copy either because trying to force
+; a copy to a GPR this way fails with "couldn't allocate input reg for
+; constraint 'r'". It therefore seems impossible to test the generation of dmfc1
+; in a simple test.
+; 4-NOFPXX: mov.d $f13, $f0
+
+; floats/doubles are not passed in integer registers for n64, so dmfc1 is not used.
+; We can't use inline assembly to force a copy either because trying to force
+; a copy to a GPR this way fails with "couldn't allocate input reg for
+; constraint 'r'". It therefore seems impossible to test the generation of dmfc1
+; in a simple test.
+; 64-NOFPXX: mov.d $f13, $f0
+}
diff --git a/test/CodeGen/Mips/llvm-ir/call.ll b/test/CodeGen/Mips/llvm-ir/call.ll
new file mode 100644
index 000000000000..4cbf43cae28e
--- /dev/null
+++ b/test/CodeGen/Mips/llvm-ir/call.ll
@@ -0,0 +1,166 @@
+; Test the 'call' instruction and the tailcall variant.
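+;
+; (Illustrative aside, not part of the original patch text: the MIPS PIC
+; ABIs pass the callee's address in $25 (t9) at the point of call, which is
+; why the checks below expect the address to be loaded from the GOT and
+; called through $25. A minimal sketch of the expected O32 sequence, where
+; 'callee' is a placeholder symbol:
+;
+;   lw   $25, %call16(callee)($gp)   # load the callee's address from the GOT
+;   jalr $25                         # call through $25 as the ABI expects
+;
+; The N64 ABI uses 'ld' in place of 'lw' for the 64-bit GOT entry.)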
+ +; FIXME: We should remove the need for -enable-mips-tail-calls +; RUN: llc -march=mips -mcpu=mips32 -enable-mips-tail-calls < %s | FileCheck %s -check-prefix=ALL -check-prefix=O32 +; RUN: llc -march=mips -mcpu=mips32r2 -enable-mips-tail-calls < %s | FileCheck %s -check-prefix=ALL -check-prefix=O32 +; RUN: llc -march=mips -mcpu=mips32r6 -enable-mips-tail-calls < %s | FileCheck %s -check-prefix=ALL -check-prefix=O32 +; RUN: llc -march=mips64 -mcpu=mips4 -enable-mips-tail-calls < %s | FileCheck %s -check-prefix=ALL -check-prefix=N64 +; RUN: llc -march=mips64 -mcpu=mips64 -enable-mips-tail-calls < %s | FileCheck %s -check-prefix=ALL -check-prefix=N64 +; RUN: llc -march=mips64 -mcpu=mips64r2 -enable-mips-tail-calls < %s | FileCheck %s -check-prefix=ALL -check-prefix=N64 +; RUN: llc -march=mips64 -mcpu=mips64r6 -enable-mips-tail-calls < %s | FileCheck %s -check-prefix=ALL -check-prefix=N64 + +declare void @extern_void_void() +declare i32 @extern_i32_void() +declare float @extern_float_void() + +define i32 @call_void_void() { +; ALL-LABEL: call_void_void: + +; O32: lw $[[TGT:[0-9]+]], %call16(extern_void_void)($gp) + +; N64: ld $[[TGT:[0-9]+]], %call16(extern_void_void)($gp) + +; ALL: jalr $[[TGT]] + + call void @extern_void_void() + ret i32 0 +} + +define i32 @call_i32_void() { +; ALL-LABEL: call_i32_void: + +; O32: lw $[[TGT:[0-9]+]], %call16(extern_i32_void)($gp) + +; N64: ld $[[TGT:[0-9]+]], %call16(extern_i32_void)($gp) + +; ALL: jalr $[[TGT]] + + %1 = call i32 @extern_i32_void() + %2 = add i32 %1, 1 + ret i32 %2 +} + +define float @call_float_void() { +; ALL-LABEL: call_float_void: + +; FIXME: Not sure why we don't use $gp directly on such a simple test. We should +; look into it at some point. +; O32: addu $[[GP:[0-9]+]], ${{[0-9]+}}, $25 +; O32: lw $[[TGT:[0-9]+]], %call16(extern_float_void)($[[GP]]) + +; N64: ld $[[TGT:[0-9]+]], %call16(extern_float_void)($gp) + +; ALL: jalr $[[TGT]] + +; O32: move $gp, $[[GP]] + + %1 = call float @extern_float_void() + %2 = fadd float %1, 1.0 + ret float %2 +} + +define void @musttail_call_void_void() { +; ALL-LABEL: musttail_call_void_void: + +; O32: lw $[[TGT:[0-9]+]], %call16(extern_void_void)($gp) + +; N64: ld $[[TGT:[0-9]+]], %call16(extern_void_void)($gp) + +; NOT-R6: jr $[[TGT]] +; R6: r6.jr $[[TGT]] + + musttail call void @extern_void_void() + ret void +} + +define i32 @musttail_call_i32_void() { +; ALL-LABEL: musttail_call_i32_void: + +; O32: lw $[[TGT:[0-9]+]], %call16(extern_i32_void)($gp) + +; N64: ld $[[TGT:[0-9]+]], %call16(extern_i32_void)($gp) + +; NOT-R6: jr $[[TGT]] +; R6: r6.jr $[[TGT]] + + %1 = musttail call i32 @extern_i32_void() + ret i32 %1 +} + +define float @musttail_call_float_void() { +; ALL-LABEL: musttail_call_float_void: + +; O32: lw $[[TGT:[0-9]+]], %call16(extern_float_void)($gp) + +; N64: ld $[[TGT:[0-9]+]], %call16(extern_float_void)($gp) + +; NOT-R6: jr $[[TGT]] +; R6: r6.jr $[[TGT]] + + %1 = musttail call float @extern_float_void() + ret float %1 +} + +define i32 @indirect_call_void_void(void ()* %addr) { +; ALL-LABEL: indirect_call_void_void: + +; ALL: move $25, $4 +; ALL: jalr $25 + + call void %addr() + ret i32 0 +} + +define i32 @indirect_call_i32_void(i32 ()* %addr) { +; ALL-LABEL: indirect_call_i32_void: + +; ALL: move $25, $4 +; ALL: jalr $25 + + %1 = call i32 %addr() + %2 = add i32 %1, 1 + ret i32 %2 +} + +define float @indirect_call_float_void(float ()* %addr) { +; ALL-LABEL: indirect_call_float_void: + +; ALL: move $25, $4 +; ALL: jalr $25 + + %1 = call float %addr() + %2 = fadd float %1, 1.0 + ret float 
%2 +} + +; We can't use 'musttail' here because the verifier is too conservative and +; prohibits any prototype difference. +define void @tail_indirect_call_void_void(void ()* %addr) { +; ALL-LABEL: tail_indirect_call_void_void: + +; ALL: move $25, $4 +; ALL: jr $25 + + tail call void %addr() + ret void +} + +define i32 @tail_indirect_call_i32_void(i32 ()* %addr) { +; ALL-LABEL: tail_indirect_call_i32_void: + +; ALL: move $25, $4 +; ALL: jr $25 + + %1 = tail call i32 %addr() + ret i32 %1 +} + +define float @tail_indirect_call_float_void(float ()* %addr) { +; ALL-LABEL: tail_indirect_call_float_void: + +; ALL: move $25, $4 +; ALL: jr $25 + + %1 = tail call float %addr() + ret float %1 +} diff --git a/test/CodeGen/Mips/llvm-ir/indirectbr.ll b/test/CodeGen/Mips/llvm-ir/indirectbr.ll new file mode 100644 index 000000000000..d8fd78774553 --- /dev/null +++ b/test/CodeGen/Mips/llvm-ir/indirectbr.ll @@ -0,0 +1,34 @@ +; Test all important variants of the unconditional 'br' instruction. + +; RUN: llc -march=mips -mcpu=mips32 -asm-show-inst < %s | FileCheck %s -check-prefix=ALL -check-prefix=NOT-R6 +; RUN: llc -march=mips -mcpu=mips32r2 -asm-show-inst < %s | FileCheck %s -check-prefix=ALL -check-prefix=NOT-R6 +; RUN: llc -march=mips -mcpu=mips32r6 -asm-show-inst < %s | FileCheck %s -check-prefix=ALL -check-prefix=R6 +; RUN: llc -march=mips64 -mcpu=mips4 -asm-show-inst < %s | FileCheck %s -check-prefix=ALL -check-prefix=NOT-R6 +; RUN: llc -march=mips64 -mcpu=mips64 -asm-show-inst < %s | FileCheck %s -check-prefix=ALL -check-prefix=NOT-R6 +; RUN: llc -march=mips64 -mcpu=mips64r2 -asm-show-inst < %s | FileCheck %s -check-prefix=ALL -check-prefix=NOT-R6 +; RUN: llc -march=mips64 -mcpu=mips64r6 -asm-show-inst < %s | FileCheck %s -check-prefix=ALL -check-prefix=R6 + +define i32 @br(i8 *%addr) { +; ALL-LABEL: br: +; NOT-R6: jr $4 # @foo0({float, float} %arg0) { + ret <4 x float> +} + +; CHECK: .visible .func (.param .align 8 .b8 func_retval0[8]) foo1( +; CHECK: .param .align 8 .b8 foo1_param_0[16] +define <2 x float> @foo1({float, float, i64} %arg0) { + ret <2 x float> +} diff --git a/test/CodeGen/NVPTX/arithmetic-fp-sm20.ll b/test/CodeGen/NVPTX/arithmetic-fp-sm20.ll index e474fa4df5ce..c167db4b46dc 100644 --- a/test/CodeGen/NVPTX/arithmetic-fp-sm20.ll +++ b/test/CodeGen/NVPTX/arithmetic-fp-sm20.ll @@ -1,5 +1,5 @@ -; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s -; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s +; RUN: llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast | FileCheck %s +; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -fp-contract=fast | FileCheck %s ;; These tests should run for all targets @@ -9,28 +9,28 @@ ;;; f64 define double @fadd_f64(double %a, double %b) { -; CHECK: add.f64 %fl{{[0-9]+}}, %fl{{[0-9]+}}, %fl{{[0-9]+}} +; CHECK: add.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}} ; CHECK: ret %ret = fadd double %a, %b ret double %ret } define double @fsub_f64(double %a, double %b) { -; CHECK: sub.f64 %fl{{[0-9]+}}, %fl{{[0-9]+}}, %fl{{[0-9]+}} +; CHECK: sub.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}} ; CHECK: ret %ret = fsub double %a, %b ret double %ret } define double @fmul_f64(double %a, double %b) { -; CHECK: mul.f64 %fl{{[0-9]+}}, %fl{{[0-9]+}}, %fl{{[0-9]+}} +; CHECK: mul.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}} ; CHECK: ret %ret = fmul double %a, %b ret double %ret } define double @fdiv_f64(double %a, double %b) { -; CHECK: div.rn.f64 %fl{{[0-9]+}}, %fl{{[0-9]+}}, %fl{{[0-9]+}} +; CHECK: div.rn.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}} ; CHECK: ret 
%ret = fdiv double %a, %b ret double %ret diff --git a/test/CodeGen/NVPTX/arithmetic-int.ll b/test/CodeGen/NVPTX/arithmetic-int.ll index 8d73b7e6c4c6..b5a2872299b7 100644 --- a/test/CodeGen/NVPTX/arithmetic-int.ll +++ b/test/CodeGen/NVPTX/arithmetic-int.ll @@ -9,70 +9,70 @@ ;;; i64 define i64 @add_i64(i64 %a, i64 %b) { -; CHECK: add.s64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %rl{{[0-9]+}} +; CHECK: add.s64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, %rd{{[0-9]+}} ; CHECK: ret %ret = add i64 %a, %b ret i64 %ret } define i64 @sub_i64(i64 %a, i64 %b) { -; CHECK: sub.s64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %rl{{[0-9]+}} +; CHECK: sub.s64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, %rd{{[0-9]+}} ; CHECK: ret %ret = sub i64 %a, %b ret i64 %ret } define i64 @mul_i64(i64 %a, i64 %b) { -; CHECK: mul.lo.s64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %rl{{[0-9]+}} +; CHECK: mul.lo.s64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, %rd{{[0-9]+}} ; CHECK: ret %ret = mul i64 %a, %b ret i64 %ret } define i64 @sdiv_i64(i64 %a, i64 %b) { -; CHECK: div.s64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %rl{{[0-9]+}} +; CHECK: div.s64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, %rd{{[0-9]+}} ; CHECK: ret %ret = sdiv i64 %a, %b ret i64 %ret } define i64 @udiv_i64(i64 %a, i64 %b) { -; CHECK: div.u64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %rl{{[0-9]+}} +; CHECK: div.u64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, %rd{{[0-9]+}} ; CHECK: ret %ret = udiv i64 %a, %b ret i64 %ret } define i64 @srem_i64(i64 %a, i64 %b) { -; CHECK: rem.s64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %rl{{[0-9]+}} +; CHECK: rem.s64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, %rd{{[0-9]+}} ; CHECK: ret %ret = srem i64 %a, %b ret i64 %ret } define i64 @urem_i64(i64 %a, i64 %b) { -; CHECK: rem.u64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %rl{{[0-9]+}} +; CHECK: rem.u64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, %rd{{[0-9]+}} ; CHECK: ret %ret = urem i64 %a, %b ret i64 %ret } define i64 @and_i64(i64 %a, i64 %b) { -; CHECK: and.b64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %rl{{[0-9]+}} +; CHECK: and.b64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, %rd{{[0-9]+}} ; CHECK: ret %ret = and i64 %a, %b ret i64 %ret } define i64 @or_i64(i64 %a, i64 %b) { -; CHECK: or.b64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %rl{{[0-9]+}} +; CHECK: or.b64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, %rd{{[0-9]+}} ; CHECK: ret %ret = or i64 %a, %b ret i64 %ret } define i64 @xor_i64(i64 %a, i64 %b) { -; CHECK: xor.b64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %rl{{[0-9]+}} +; CHECK: xor.b64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, %rd{{[0-9]+}} ; CHECK: ret %ret = xor i64 %a, %b ret i64 %ret @@ -80,7 +80,7 @@ define i64 @xor_i64(i64 %a, i64 %b) { define i64 @shl_i64(i64 %a, i64 %b) { ; PTX requires 32-bit shift amount -; CHECK: shl.b64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %r{{[0-9]+}} +; CHECK: shl.b64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, %r{{[0-9]+}} ; CHECK: ret %ret = shl i64 %a, %b ret i64 %ret @@ -88,7 +88,7 @@ define i64 @shl_i64(i64 %a, i64 %b) { define i64 @ashr_i64(i64 %a, i64 %b) { ; PTX requires 32-bit shift amount -; CHECK: shr.s64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %r{{[0-9]+}} +; CHECK: shr.s64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, %r{{[0-9]+}} ; CHECK: ret %ret = ashr i64 %a, %b ret i64 %ret @@ -96,7 +96,7 @@ define i64 @ashr_i64(i64 %a, i64 %b) { define i64 @lshr_i64(i64 %a, i64 %b) { ; PTX requires 32-bit shift amount -; CHECK: shr.u64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %r{{[0-9]+}} +; CHECK: shr.u64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, %r{{[0-9]+}} ; CHECK: ret %ret = lshr i64 %a, %b ret i64 %ret diff --git a/test/CodeGen/NVPTX/atomics.ll b/test/CodeGen/NVPTX/atomics.ll new file mode 100644 index 000000000000..daadb6e9c1a0 --- /dev/null +++ b/test/CodeGen/NVPTX/atomics.ll @@ -0,0 +1,182 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_20 | 
FileCheck %s + + +; CHECK-LABEL: atom0 +define i32 @atom0(i32* %addr, i32 %val) { +; CHECK: atom.add.u32 + %ret = atomicrmw add i32* %addr, i32 %val seq_cst + ret i32 %ret +} + +; CHECK-LABEL: atom1 +define i64 @atom1(i64* %addr, i64 %val) { +; CHECK: atom.add.u64 + %ret = atomicrmw add i64* %addr, i64 %val seq_cst + ret i64 %ret +} + +; CHECK-LABEL: atom2 +define i32 @atom2(i32* %subr, i32 %val) { +; CHECK: neg.s32 +; CHECK: atom.add.u32 + %ret = atomicrmw sub i32* %subr, i32 %val seq_cst + ret i32 %ret +} + +; CHECK-LABEL: atom3 +define i64 @atom3(i64* %subr, i64 %val) { +; CHECK: neg.s64 +; CHECK: atom.add.u64 + %ret = atomicrmw sub i64* %subr, i64 %val seq_cst + ret i64 %ret +} + +; CHECK-LABEL: atom4 +define i32 @atom4(i32* %subr, i32 %val) { +; CHECK: atom.and.b32 + %ret = atomicrmw and i32* %subr, i32 %val seq_cst + ret i32 %ret +} + +; CHECK-LABEL: atom5 +define i64 @atom5(i64* %subr, i64 %val) { +; CHECK: atom.and.b64 + %ret = atomicrmw and i64* %subr, i64 %val seq_cst + ret i64 %ret +} + +;; NAND not yet supported +;define i32 @atom6(i32* %subr, i32 %val) { +; %ret = atomicrmw nand i32* %subr, i32 %val seq_cst +; ret i32 %ret +;} + +;define i64 @atom7(i64* %subr, i64 %val) { +; %ret = atomicrmw nand i64* %subr, i64 %val seq_cst +; ret i64 %ret +;} + +; CHECK-LABEL: atom8 +define i32 @atom8(i32* %subr, i32 %val) { +; CHECK: atom.or.b32 + %ret = atomicrmw or i32* %subr, i32 %val seq_cst + ret i32 %ret +} + +; CHECK-LABEL: atom9 +define i64 @atom9(i64* %subr, i64 %val) { +; CHECK: atom.or.b64 + %ret = atomicrmw or i64* %subr, i64 %val seq_cst + ret i64 %ret +} + +; CHECK-LABEL: atom10 +define i32 @atom10(i32* %subr, i32 %val) { +; CHECK: atom.xor.b32 + %ret = atomicrmw xor i32* %subr, i32 %val seq_cst + ret i32 %ret +} + +; CHECK-LABEL: atom11 +define i64 @atom11(i64* %subr, i64 %val) { +; CHECK: atom.xor.b64 + %ret = atomicrmw xor i64* %subr, i64 %val seq_cst + ret i64 %ret +} + +; CHECK-LABEL: atom12 +define i32 @atom12(i32* %subr, i32 %val) { +; CHECK: atom.max.s32 + %ret = atomicrmw max i32* %subr, i32 %val seq_cst + ret i32 %ret +} + +; CHECK-LABEL: atom13 +define i64 @atom13(i64* %subr, i64 %val) { +; CHECK: atom.max.s64 + %ret = atomicrmw max i64* %subr, i64 %val seq_cst + ret i64 %ret +} + +; CHECK-LABEL: atom14 +define i32 @atom14(i32* %subr, i32 %val) { +; CHECK: atom.min.s32 + %ret = atomicrmw min i32* %subr, i32 %val seq_cst + ret i32 %ret +} + +; CHECK-LABEL: atom15 +define i64 @atom15(i64* %subr, i64 %val) { +; CHECK: atom.min.s64 + %ret = atomicrmw min i64* %subr, i64 %val seq_cst + ret i64 %ret +} + +; CHECK-LABEL: atom16 +define i32 @atom16(i32* %subr, i32 %val) { +; CHECK: atom.max.u32 + %ret = atomicrmw umax i32* %subr, i32 %val seq_cst + ret i32 %ret +} + +; CHECK-LABEL: atom17 +define i64 @atom17(i64* %subr, i64 %val) { +; CHECK: atom.max.u64 + %ret = atomicrmw umax i64* %subr, i64 %val seq_cst + ret i64 %ret +} + +; CHECK-LABEL: atom18 +define i32 @atom18(i32* %subr, i32 %val) { +; CHECK: atom.min.u32 + %ret = atomicrmw umin i32* %subr, i32 %val seq_cst + ret i32 %ret +} + +; CHECK-LABEL: atom19 +define i64 @atom19(i64* %subr, i64 %val) { +; CHECK: atom.min.u64 + %ret = atomicrmw umin i64* %subr, i64 %val seq_cst + ret i64 %ret +} + +declare float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %addr, float %val) + +; CHECK-LABEL: atomic_add_f32_generic +define float @atomic_add_f32_generic(float* %addr, float %val) { +; CHECK: atom.add.f32 + %ret = call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %addr, float %val) + ret float %ret +} + +declare float 
@llvm.nvvm.atomic.load.add.f32.p1f32(float addrspace(1)* %addr, float %val) + +; CHECK-LABEL: atomic_add_f32_addrspace1 +define float @atomic_add_f32_addrspace1(float addrspace(1)* %addr, float %val) { +; CHECK: atom.global.add.f32 + %ret = call float @llvm.nvvm.atomic.load.add.f32.p1f32(float addrspace(1)* %addr, float %val) + ret float %ret +} + +declare float @llvm.nvvm.atomic.load.add.f32.p3f32(float addrspace(3)* %addr, float %val) + +; CHECK-LABEL: atomic_add_f32_addrspace3 +define float @atomic_add_f32_addrspace3(float addrspace(3)* %addr, float %val) { +; CHECK: atom.shared.add.f32 + %ret = call float @llvm.nvvm.atomic.load.add.f32.p3f32(float addrspace(3)* %addr, float %val) + ret float %ret +} + +; CHECK-LABEL: atomic_cmpxchg_i32 +define i32 @atomic_cmpxchg_i32(i32* %addr, i32 %cmp, i32 %new) { +; CHECK: atom.cas.b32 + %pairold = cmpxchg i32* %addr, i32 %cmp, i32 %new seq_cst seq_cst + ret i32 %new +} + +; CHECK-LABEL: atomic_cmpxchg_i64 +define i64 @atomic_cmpxchg_i64(i64* %addr, i64 %cmp, i64 %new) { +; CHECK: atom.cas.b64 + %pairold = cmpxchg i64* %addr, i64 %cmp, i64 %new seq_cst seq_cst + ret i64 %new +} diff --git a/test/CodeGen/NVPTX/bfe.ll b/test/CodeGen/NVPTX/bfe.ll new file mode 100644 index 000000000000..2e816fec2c59 --- /dev/null +++ b/test/CodeGen/NVPTX/bfe.ll @@ -0,0 +1,32 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s + + +; CHECK: bfe0 +define i32 @bfe0(i32 %a) { +; CHECK: bfe.u32 %r{{[0-9]+}}, %r{{[0-9]+}}, 4, 4 +; CHECK-NOT: shr +; CHECK-NOT: and + %val0 = ashr i32 %a, 4 + %val1 = and i32 %val0, 15 + ret i32 %val1 +} + +; CHECK: bfe1 +define i32 @bfe1(i32 %a) { +; CHECK: bfe.u32 %r{{[0-9]+}}, %r{{[0-9]+}}, 3, 3 +; CHECK-NOT: shr +; CHECK-NOT: and + %val0 = ashr i32 %a, 3 + %val1 = and i32 %val0, 7 + ret i32 %val1 +} + +; CHECK: bfe2 +define i32 @bfe2(i32 %a) { +; CHECK: bfe.u32 %r{{[0-9]+}}, %r{{[0-9]+}}, 5, 3 +; CHECK-NOT: shr +; CHECK-NOT: and + %val0 = ashr i32 %a, 5 + %val1 = and i32 %val0, 7 + ret i32 %val1 +} diff --git a/test/CodeGen/NVPTX/call-with-alloca-buffer.ll b/test/CodeGen/NVPTX/call-with-alloca-buffer.ll index 28dfa460e8ea..83d491637041 100644 --- a/test/CodeGen/NVPTX/call-with-alloca-buffer.ll +++ b/test/CodeGen/NVPTX/call-with-alloca-buffer.ll @@ -20,11 +20,11 @@ entry: %buf = alloca [16 x i8], align 4 ; CHECK: .local .align 4 .b8 __local_depot0[16] -; CHECK: mov.u64 %rl[[BUF_REG:[0-9]+]] -; CHECK: cvta.local.u64 %SP, %rl[[BUF_REG]] +; CHECK: mov.u64 %rd[[BUF_REG:[0-9]+]] +; CHECK: cvta.local.u64 %SP, %rd[[BUF_REG]] -; CHECK: ld.param.u64 %rl[[A_REG:[0-9]+]], [kernel_func_param_0] -; CHECK: ld.f32 %f[[A0_REG:[0-9]+]], [%rl[[A_REG]]] +; CHECK: ld.param.u64 %rd[[A_REG:[0-9]+]], [kernel_func_param_0] +; CHECK: ld.f32 %f[[A0_REG:[0-9]+]], [%rd[[A_REG]]] ; CHECK: st.f32 [%SP+0], %f[[A0_REG]] %0 = load float* %a, align 4 @@ -46,11 +46,11 @@ entry: %7 = bitcast i8* %arrayidx7 to float* store float %6, float* %7, align 4 -; CHECK: add.u64 %rl[[SP_REG:[0-9]+]], %SP, 0 +; CHECK: add.u64 %rd[[SP_REG:[0-9]+]], %SP, 0 ; CHECK: .param .b64 param0; -; CHECK-NEXT: st.param.b64 [param0+0], %rl[[A_REG]] +; CHECK-NEXT: st.param.b64 [param0+0], %rd[[A_REG]] ; CHECK-NEXT: .param .b64 param1; -; CHECK-NEXT: st.param.b64 [param1+0], %rl[[SP_REG]] +; CHECK-NEXT: st.param.b64 [param1+0], %rd[[SP_REG]] ; CHECK-NEXT: call.uni ; CHECK-NEXT: callee, diff --git a/test/CodeGen/NVPTX/compare-int.ll b/test/CodeGen/NVPTX/compare-int.ll index c595f215f6f1..e4e0601db59f 100644 --- a/test/CodeGen/NVPTX/compare-int.ll +++ b/test/CodeGen/NVPTX/compare-int.ll @@ -9,8 
+9,8 @@ ;;; i64 define i64 @icmp_eq_i64(i64 %a, i64 %b) { -; CHECK: setp.eq.s64 %p[[P0:[0-9]+]], %rl{{[0-9]+}}, %rl{{[0-9]+}} -; CHECK: selp.u64 %rl{{[0-9]+}}, 1, 0, %p[[P0]] +; CHECK: setp.eq.s64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}} +; CHECK: selp.u64 %rd{{[0-9]+}}, 1, 0, %p[[P0]] ; CHECK: ret %cmp = icmp eq i64 %a, %b %ret = zext i1 %cmp to i64 @@ -18,8 +18,8 @@ define i64 @icmp_eq_i64(i64 %a, i64 %b) { } define i64 @icmp_ne_i64(i64 %a, i64 %b) { -; CHECK: setp.ne.s64 %p[[P0:[0-9]+]], %rl{{[0-9]+}}, %rl{{[0-9]+}} -; CHECK: selp.u64 %rl{{[0-9]+}}, 1, 0, %p[[P0]] +; CHECK: setp.ne.s64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}} +; CHECK: selp.u64 %rd{{[0-9]+}}, 1, 0, %p[[P0]] ; CHECK: ret %cmp = icmp ne i64 %a, %b %ret = zext i1 %cmp to i64 @@ -27,8 +27,8 @@ define i64 @icmp_ne_i64(i64 %a, i64 %b) { } define i64 @icmp_ugt_i64(i64 %a, i64 %b) { -; CHECK: setp.gt.u64 %p[[P0:[0-9]+]], %rl{{[0-9]+}}, %rl{{[0-9]+}} -; CHECK: selp.u64 %rl{{[0-9]+}}, 1, 0, %p[[P0]] +; CHECK: setp.gt.u64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}} +; CHECK: selp.u64 %rd{{[0-9]+}}, 1, 0, %p[[P0]] ; CHECK: ret %cmp = icmp ugt i64 %a, %b %ret = zext i1 %cmp to i64 @@ -36,8 +36,8 @@ define i64 @icmp_ugt_i64(i64 %a, i64 %b) { } define i64 @icmp_uge_i64(i64 %a, i64 %b) { -; CHECK: setp.ge.u64 %p[[P0:[0-9]+]], %rl{{[0-9]+}}, %rl{{[0-9]+}} -; CHECK: selp.u64 %rl{{[0-9]+}}, 1, 0, %p[[P0]] +; CHECK: setp.ge.u64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}} +; CHECK: selp.u64 %rd{{[0-9]+}}, 1, 0, %p[[P0]] ; CHECK: ret %cmp = icmp uge i64 %a, %b %ret = zext i1 %cmp to i64 @@ -45,8 +45,8 @@ define i64 @icmp_uge_i64(i64 %a, i64 %b) { } define i64 @icmp_ult_i64(i64 %a, i64 %b) { -; CHECK: setp.lt.u64 %p[[P0:[0-9]+]], %rl{{[0-9]+}}, %rl{{[0-9]+}} -; CHECK: selp.u64 %rl{{[0-9]+}}, 1, 0, %p[[P0]] +; CHECK: setp.lt.u64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}} +; CHECK: selp.u64 %rd{{[0-9]+}}, 1, 0, %p[[P0]] ; CHECK: ret %cmp = icmp ult i64 %a, %b %ret = zext i1 %cmp to i64 @@ -54,8 +54,8 @@ define i64 @icmp_ult_i64(i64 %a, i64 %b) { } define i64 @icmp_ule_i64(i64 %a, i64 %b) { -; CHECK: setp.le.u64 %p[[P0:[0-9]+]], %rl{{[0-9]+}}, %rl{{[0-9]+}} -; CHECK: selp.u64 %rl{{[0-9]+}}, 1, 0, %p[[P0]] +; CHECK: setp.le.u64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}} +; CHECK: selp.u64 %rd{{[0-9]+}}, 1, 0, %p[[P0]] ; CHECK: ret %cmp = icmp ule i64 %a, %b %ret = zext i1 %cmp to i64 @@ -63,8 +63,8 @@ define i64 @icmp_ule_i64(i64 %a, i64 %b) { } define i64 @icmp_sgt_i64(i64 %a, i64 %b) { -; CHECK: setp.gt.s64 %p[[P0:[0-9]+]], %rl{{[0-9]+}}, %rl{{[0-9]+}} -; CHECK: selp.u64 %rl{{[0-9]+}}, 1, 0, %p[[P0]] +; CHECK: setp.gt.s64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}} +; CHECK: selp.u64 %rd{{[0-9]+}}, 1, 0, %p[[P0]] ; CHECK: ret %cmp = icmp sgt i64 %a, %b %ret = zext i1 %cmp to i64 @@ -72,8 +72,8 @@ define i64 @icmp_sgt_i64(i64 %a, i64 %b) { } define i64 @icmp_sge_i64(i64 %a, i64 %b) { -; CHECK: setp.ge.s64 %p[[P0:[0-9]+]], %rl{{[0-9]+}}, %rl{{[0-9]+}} -; CHECK: selp.u64 %rl{{[0-9]+}}, 1, 0, %p[[P0]] +; CHECK: setp.ge.s64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}} +; CHECK: selp.u64 %rd{{[0-9]+}}, 1, 0, %p[[P0]] ; CHECK: ret %cmp = icmp sge i64 %a, %b %ret = zext i1 %cmp to i64 @@ -81,8 +81,8 @@ define i64 @icmp_sge_i64(i64 %a, i64 %b) { } define i64 @icmp_slt_i64(i64 %a, i64 %b) { -; CHECK: setp.lt.s64 %p[[P0:[0-9]+]], %rl{{[0-9]+}}, %rl{{[0-9]+}} -; CHECK: selp.u64 %rl{{[0-9]+}}, 1, 0, %p[[P0]] +; CHECK: setp.lt.s64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}} +; CHECK: selp.u64 %rd{{[0-9]+}}, 1, 0, %p[[P0]] ; CHECK: 
ret %cmp = icmp slt i64 %a, %b %ret = zext i1 %cmp to i64 @@ -90,8 +90,8 @@ define i64 @icmp_slt_i64(i64 %a, i64 %b) { } define i64 @icmp_sle_i64(i64 %a, i64 %b) { -; CHECK: setp.le.s64 %p[[P0:[0-9]+]], %rl{{[0-9]+}}, %rl{{[0-9]+}} -; CHECK: selp.u64 %rl{{[0-9]+}}, 1, 0, %p[[P0]] +; CHECK: setp.le.s64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}} +; CHECK: selp.u64 %rd{{[0-9]+}}, 1, 0, %p[[P0]] ; CHECK: ret %cmp = icmp sle i64 %a, %b %ret = zext i1 %cmp to i64 diff --git a/test/CodeGen/NVPTX/convert-fp.ll b/test/CodeGen/NVPTX/convert-fp.ll index 1882121fa724..4b5446e317f4 100644 --- a/test/CodeGen/NVPTX/convert-fp.ll +++ b/test/CodeGen/NVPTX/convert-fp.ll @@ -10,7 +10,7 @@ define i16 @cvt_i16_f32(float %x) { } define i16 @cvt_i16_f64(double %x) { -; CHECK: cvt.rzi.u16.f64 %rs{{[0-9]+}}, %fl{{[0-9]+}}; +; CHECK: cvt.rzi.u16.f64 %rs{{[0-9]+}}, %fd{{[0-9]+}}; ; CHECK: ret; %a = fptoui double %x to i16 ret i16 %a @@ -24,7 +24,7 @@ define i32 @cvt_i32_f32(float %x) { } define i32 @cvt_i32_f64(double %x) { -; CHECK: cvt.rzi.u32.f64 %r{{[0-9]+}}, %fl{{[0-9]+}}; +; CHECK: cvt.rzi.u32.f64 %r{{[0-9]+}}, %fd{{[0-9]+}}; ; CHECK: ret; %a = fptoui double %x to i32 ret i32 %a @@ -32,14 +32,14 @@ define i32 @cvt_i32_f64(double %x) { define i64 @cvt_i64_f32(float %x) { -; CHECK: cvt.rzi.u64.f32 %rl{{[0-9]+}}, %f{{[0-9]+}}; +; CHECK: cvt.rzi.u64.f32 %rd{{[0-9]+}}, %f{{[0-9]+}}; ; CHECK: ret; %a = fptoui float %x to i64 ret i64 %a } define i64 @cvt_i64_f64(double %x) { -; CHECK: cvt.rzi.u64.f64 %rl{{[0-9]+}}, %fl{{[0-9]+}}; +; CHECK: cvt.rzi.u64.f64 %rd{{[0-9]+}}, %fd{{[0-9]+}}; ; CHECK: ret; %a = fptoui double %x to i64 ret i64 %a @@ -60,14 +60,14 @@ define float @cvt_f32_i32(i32 %x) { } define float @cvt_f32_i64(i64 %x) { -; CHECK: cvt.rn.f32.u64 %f{{[0-9]+}}, %rl{{[0-9]+}}; +; CHECK: cvt.rn.f32.u64 %f{{[0-9]+}}, %rd{{[0-9]+}}; ; CHECK: ret; %a = uitofp i64 %x to float ret float %a } define float @cvt_f32_f64(double %x) { -; CHECK: cvt.rn.f32.f64 %f{{[0-9]+}}, %fl{{[0-9]+}}; +; CHECK: cvt.rn.f32.f64 %f{{[0-9]+}}, %fd{{[0-9]+}}; ; CHECK: ret; %a = fptrunc double %x to float ret float %a @@ -88,56 +88,56 @@ define float @cvt_f32_s32(i32 %x) { } define float @cvt_f32_s64(i64 %x) { -; CHECK: cvt.rn.f32.s64 %f{{[0-9]+}}, %rl{{[0-9]+}} +; CHECK: cvt.rn.f32.s64 %f{{[0-9]+}}, %rd{{[0-9]+}} ; CHECK: ret %a = sitofp i64 %x to float ret float %a } define double @cvt_f64_i16(i16 %x) { -; CHECK: cvt.rn.f64.u16 %fl{{[0-9]+}}, %rs{{[0-9]+}}; +; CHECK: cvt.rn.f64.u16 %fd{{[0-9]+}}, %rs{{[0-9]+}}; ; CHECK: ret; %a = uitofp i16 %x to double ret double %a } define double @cvt_f64_i32(i32 %x) { -; CHECK: cvt.rn.f64.u32 %fl{{[0-9]+}}, %r{{[0-9]+}}; +; CHECK: cvt.rn.f64.u32 %fd{{[0-9]+}}, %r{{[0-9]+}}; ; CHECK: ret; %a = uitofp i32 %x to double ret double %a } define double @cvt_f64_i64(i64 %x) { -; CHECK: cvt.rn.f64.u64 %fl{{[0-9]+}}, %rl{{[0-9]+}}; +; CHECK: cvt.rn.f64.u64 %fd{{[0-9]+}}, %rd{{[0-9]+}}; ; CHECK: ret; %a = uitofp i64 %x to double ret double %a } define double @cvt_f64_f32(float %x) { -; CHECK: cvt.f64.f32 %fl{{[0-9]+}}, %f{{[0-9]+}}; +; CHECK: cvt.f64.f32 %fd{{[0-9]+}}, %f{{[0-9]+}}; ; CHECK: ret; %a = fpext float %x to double ret double %a } define double @cvt_f64_s16(i16 %x) { -; CHECK: cvt.rn.f64.s16 %fl{{[0-9]+}}, %rs{{[0-9]+}} +; CHECK: cvt.rn.f64.s16 %fd{{[0-9]+}}, %rs{{[0-9]+}} ; CHECK: ret %a = sitofp i16 %x to double ret double %a } define double @cvt_f64_s32(i32 %x) { -; CHECK: cvt.rn.f64.s32 %fl{{[0-9]+}}, %r{{[0-9]+}} +; CHECK: cvt.rn.f64.s32 %fd{{[0-9]+}}, %r{{[0-9]+}} ; CHECK: ret %a = sitofp i32 
%x to double ret double %a } define double @cvt_f64_s64(i64 %x) { -; CHECK: cvt.rn.f64.s64 %fl{{[0-9]+}}, %rl{{[0-9]+}} +; CHECK: cvt.rn.f64.s64 %fd{{[0-9]+}}, %rd{{[0-9]+}} ; CHECK: ret %a = sitofp i64 %x to double ret double %a diff --git a/test/CodeGen/NVPTX/convert-int-sm20.ll b/test/CodeGen/NVPTX/convert-int-sm20.ll index 227cd31e11b3..57a231629e00 100644 --- a/test/CodeGen/NVPTX/convert-int-sm20.ll +++ b/test/CodeGen/NVPTX/convert-int-sm20.ll @@ -48,16 +48,16 @@ define i32 @cvt_i32_i64(i64 %x) { ; i64 define i64 @cvt_i64_i16(i16 %x) { -; CHECK: ld.param.u16 %rl[[R0:[0-9]+]], [cvt_i64_i16_param_{{[0-9]+}}] -; CHECK: st.param.b64 [func_retval{{[0-9]+}}+0], %rl[[R0]] +; CHECK: ld.param.u16 %rd[[R0:[0-9]+]], [cvt_i64_i16_param_{{[0-9]+}}] +; CHECK: st.param.b64 [func_retval{{[0-9]+}}+0], %rd[[R0]] ; CHECK: ret %a = zext i16 %x to i64 ret i64 %a } define i64 @cvt_i64_i32(i32 %x) { -; CHECK: ld.param.u32 %rl[[R0:[0-9]+]], [cvt_i64_i32_param_{{[0-9]+}}] -; CHECK: st.param.b64 [func_retval{{[0-9]+}}+0], %rl[[R0]] +; CHECK: ld.param.u32 %rd[[R0:[0-9]+]], [cvt_i64_i32_param_{{[0-9]+}}] +; CHECK: st.param.b64 [func_retval{{[0-9]+}}+0], %rd[[R0]] ; CHECK: ret %a = zext i32 %x to i64 ret i64 %a diff --git a/test/CodeGen/NVPTX/envreg.ll b/test/CodeGen/NVPTX/envreg.ll new file mode 100644 index 000000000000..a341b49ecdf3 --- /dev/null +++ b/test/CodeGen/NVPTX/envreg.ll @@ -0,0 +1,139 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s + + +declare i32 @llvm.nvvm.read.ptx.sreg.envreg0() +declare i32 @llvm.nvvm.read.ptx.sreg.envreg1() +declare i32 @llvm.nvvm.read.ptx.sreg.envreg2() +declare i32 @llvm.nvvm.read.ptx.sreg.envreg3() +declare i32 @llvm.nvvm.read.ptx.sreg.envreg4() +declare i32 @llvm.nvvm.read.ptx.sreg.envreg5() +declare i32 @llvm.nvvm.read.ptx.sreg.envreg6() +declare i32 @llvm.nvvm.read.ptx.sreg.envreg7() +declare i32 @llvm.nvvm.read.ptx.sreg.envreg8() +declare i32 @llvm.nvvm.read.ptx.sreg.envreg9() +declare i32 @llvm.nvvm.read.ptx.sreg.envreg10() +declare i32 @llvm.nvvm.read.ptx.sreg.envreg11() +declare i32 @llvm.nvvm.read.ptx.sreg.envreg12() +declare i32 @llvm.nvvm.read.ptx.sreg.envreg13() +declare i32 @llvm.nvvm.read.ptx.sreg.envreg14() +declare i32 @llvm.nvvm.read.ptx.sreg.envreg15() +declare i32 @llvm.nvvm.read.ptx.sreg.envreg16() +declare i32 @llvm.nvvm.read.ptx.sreg.envreg17() +declare i32 @llvm.nvvm.read.ptx.sreg.envreg18() +declare i32 @llvm.nvvm.read.ptx.sreg.envreg19() +declare i32 @llvm.nvvm.read.ptx.sreg.envreg20() +declare i32 @llvm.nvvm.read.ptx.sreg.envreg21() +declare i32 @llvm.nvvm.read.ptx.sreg.envreg22() +declare i32 @llvm.nvvm.read.ptx.sreg.envreg23() +declare i32 @llvm.nvvm.read.ptx.sreg.envreg24() +declare i32 @llvm.nvvm.read.ptx.sreg.envreg25() +declare i32 @llvm.nvvm.read.ptx.sreg.envreg26() +declare i32 @llvm.nvvm.read.ptx.sreg.envreg27() +declare i32 @llvm.nvvm.read.ptx.sreg.envreg28() +declare i32 @llvm.nvvm.read.ptx.sreg.envreg29() +declare i32 @llvm.nvvm.read.ptx.sreg.envreg30() +declare i32 @llvm.nvvm.read.ptx.sreg.envreg31() + + +; CHECK: foo +define i32 @foo() { +; CHECK: mov.b32 %r{{[0-9]+}}, %envreg0 + %val0 = tail call i32 @llvm.nvvm.read.ptx.sreg.envreg0() +; CHECK: mov.b32 %r{{[0-9]+}}, %envreg1 + %val1 = tail call i32 @llvm.nvvm.read.ptx.sreg.envreg1() +; CHECK: mov.b32 %r{{[0-9]+}}, %envreg2 + %val2 = tail call i32 @llvm.nvvm.read.ptx.sreg.envreg2() +; CHECK: mov.b32 %r{{[0-9]+}}, %envreg3 + %val3 = tail call i32 @llvm.nvvm.read.ptx.sreg.envreg3() +; CHECK: mov.b32 %r{{[0-9]+}}, %envreg4 + %val4 = tail call i32 
@llvm.nvvm.read.ptx.sreg.envreg4() +; CHECK: mov.b32 %r{{[0-9]+}}, %envreg5 + %val5 = tail call i32 @llvm.nvvm.read.ptx.sreg.envreg5() +; CHECK: mov.b32 %r{{[0-9]+}}, %envreg6 + %val6 = tail call i32 @llvm.nvvm.read.ptx.sreg.envreg6() +; CHECK: mov.b32 %r{{[0-9]+}}, %envreg7 + %val7 = tail call i32 @llvm.nvvm.read.ptx.sreg.envreg7() +; CHECK: mov.b32 %r{{[0-9]+}}, %envreg8 + %val8 = tail call i32 @llvm.nvvm.read.ptx.sreg.envreg8() +; CHECK: mov.b32 %r{{[0-9]+}}, %envreg9 + %val9 = tail call i32 @llvm.nvvm.read.ptx.sreg.envreg9() +; CHECK: mov.b32 %r{{[0-9]+}}, %envreg10 + %val10 = tail call i32 @llvm.nvvm.read.ptx.sreg.envreg10() +; CHECK: mov.b32 %r{{[0-9]+}}, %envreg11 + %val11 = tail call i32 @llvm.nvvm.read.ptx.sreg.envreg11() +; CHECK: mov.b32 %r{{[0-9]+}}, %envreg12 + %val12 = tail call i32 @llvm.nvvm.read.ptx.sreg.envreg12() +; CHECK: mov.b32 %r{{[0-9]+}}, %envreg13 + %val13 = tail call i32 @llvm.nvvm.read.ptx.sreg.envreg13() +; CHECK: mov.b32 %r{{[0-9]+}}, %envreg14 + %val14 = tail call i32 @llvm.nvvm.read.ptx.sreg.envreg14() +; CHECK: mov.b32 %r{{[0-9]+}}, %envreg15 + %val15 = tail call i32 @llvm.nvvm.read.ptx.sreg.envreg15() +; CHECK: mov.b32 %r{{[0-9]+}}, %envreg16 + %val16 = tail call i32 @llvm.nvvm.read.ptx.sreg.envreg16() +; CHECK: mov.b32 %r{{[0-9]+}}, %envreg17 + %val17 = tail call i32 @llvm.nvvm.read.ptx.sreg.envreg17() +; CHECK: mov.b32 %r{{[0-9]+}}, %envreg18 + %val18 = tail call i32 @llvm.nvvm.read.ptx.sreg.envreg18() +; CHECK: mov.b32 %r{{[0-9]+}}, %envreg19 + %val19 = tail call i32 @llvm.nvvm.read.ptx.sreg.envreg19() +; CHECK: mov.b32 %r{{[0-9]+}}, %envreg20 + %val20 = tail call i32 @llvm.nvvm.read.ptx.sreg.envreg20() +; CHECK: mov.b32 %r{{[0-9]+}}, %envreg21 + %val21 = tail call i32 @llvm.nvvm.read.ptx.sreg.envreg21() +; CHECK: mov.b32 %r{{[0-9]+}}, %envreg22 + %val22 = tail call i32 @llvm.nvvm.read.ptx.sreg.envreg22() +; CHECK: mov.b32 %r{{[0-9]+}}, %envreg23 + %val23 = tail call i32 @llvm.nvvm.read.ptx.sreg.envreg23() +; CHECK: mov.b32 %r{{[0-9]+}}, %envreg24 + %val24 = tail call i32 @llvm.nvvm.read.ptx.sreg.envreg24() +; CHECK: mov.b32 %r{{[0-9]+}}, %envreg25 + %val25 = tail call i32 @llvm.nvvm.read.ptx.sreg.envreg25() +; CHECK: mov.b32 %r{{[0-9]+}}, %envreg26 + %val26 = tail call i32 @llvm.nvvm.read.ptx.sreg.envreg26() +; CHECK: mov.b32 %r{{[0-9]+}}, %envreg27 + %val27 = tail call i32 @llvm.nvvm.read.ptx.sreg.envreg27() +; CHECK: mov.b32 %r{{[0-9]+}}, %envreg28 + %val28 = tail call i32 @llvm.nvvm.read.ptx.sreg.envreg28() +; CHECK: mov.b32 %r{{[0-9]+}}, %envreg29 + %val29 = tail call i32 @llvm.nvvm.read.ptx.sreg.envreg29() +; CHECK: mov.b32 %r{{[0-9]+}}, %envreg30 + %val30 = tail call i32 @llvm.nvvm.read.ptx.sreg.envreg30() +; CHECK: mov.b32 %r{{[0-9]+}}, %envreg31 + %val31 = tail call i32 @llvm.nvvm.read.ptx.sreg.envreg31() + + + %ret0 = add i32 %val0, %val1 + %ret1 = add i32 %ret0, %val2 + %ret2 = add i32 %ret1, %val3 + %ret3 = add i32 %ret2, %val4 + %ret4 = add i32 %ret3, %val5 + %ret5 = add i32 %ret4, %val6 + %ret6 = add i32 %ret5, %val7 + %ret7 = add i32 %ret6, %val8 + %ret8 = add i32 %ret7, %val9 + %ret9 = add i32 %ret8, %val10 + %ret10 = add i32 %ret9, %val11 + %ret11 = add i32 %ret10, %val12 + %ret12 = add i32 %ret11, %val13 + %ret13 = add i32 %ret12, %val14 + %ret14 = add i32 %ret13, %val15 + %ret15 = add i32 %ret14, %val16 + %ret16 = add i32 %ret15, %val17 + %ret17 = add i32 %ret16, %val18 + %ret18 = add i32 %ret17, %val19 + %ret19 = add i32 %ret18, %val20 + %ret20 = add i32 %ret19, %val21 + %ret21 = add i32 %ret20, %val22 + %ret22 = add i32 %ret21, 
%val23
+  %ret23 = add i32 %ret22, %val24
+  %ret24 = add i32 %ret23, %val25
+  %ret25 = add i32 %ret24, %val26
+  %ret26 = add i32 %ret25, %val27
+  %ret27 = add i32 %ret26, %val28
+  %ret28 = add i32 %ret27, %val29
+  %ret29 = add i32 %ret28, %val30
+  %ret30 = add i32 %ret29, %val31
+
+  ret i32 %ret30
+}
diff --git a/test/CodeGen/NVPTX/fma.ll b/test/CodeGen/NVPTX/fma.ll
index 4ef1a9a4cefb..14b5c45b87d8 100644
--- a/test/CodeGen/NVPTX/fma.ll
+++ b/test/CodeGen/NVPTX/fma.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast | FileCheck %s

 define ptx_device float @t1_f32(float %x, float %y, float %z) {
 ; CHECK: fma.rn.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}};
@@ -9,7 +9,7 @@ define ptx_device float @t1_f32(float %x, float %y, float %z) {
 }

 define ptx_device double @t1_f64(double %x, double %y, double %z) {
-; CHECK: fma.rn.f64 %fl{{[0-9]+}}, %fl{{[0-9]+}}, %fl{{[0-9]+}}, %fl{{[0-9]+}};
+; CHECK: fma.rn.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}};
 ; CHECK: ret;
   %a = fmul double %x, %y
   %b = fadd double %a, %z
diff --git a/test/CodeGen/NVPTX/fp-contract.ll b/test/CodeGen/NVPTX/fp-contract.ll
new file mode 100644
index 000000000000..3f68b188ba75
--- /dev/null
+++ b/test/CodeGen/NVPTX/fp-contract.ll
@@ -0,0 +1,33 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -fp-contract=fast | FileCheck %s --check-prefix=FAST
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 | FileCheck %s --check-prefix=DEFAULT
+
+target triple = "nvptx64-unknown-cuda"
+
+;; Make sure we are generating proper instruction sequences for fused ops.
+;; If fusion is allowed, we try to form fma.rn at the PTX level, and emit
+;; add.f32 otherwise. Without an explicit rounding mode on add.f32, ptxas
+;; is free to fuse with a multiply if it is able. If fusion is not allowed,
+;; we do not form fma.rn at the PTX level and explicitly generate add.rn
+;; for all adds to prevent ptxas from fusing the ops.
+
+;; FAST-LABEL: @t0
+;; DEFAULT-LABEL: @t0
+define float @t0(float %a, float %b, float %c) {
+;; FAST: fma.rn.f32
+;; DEFAULT: mul.rn.f32
+;; DEFAULT: add.rn.f32
+  %v0 = fmul float %a, %b
+  %v1 = fadd float %v0, %c
+  ret float %v1
+}
+
+;; FAST-LABEL: @t1
+;; DEFAULT-LABEL: @t1
+define float @t1(float %a, float %b) {
+;; We cannot form an fma here, but make sure we explicitly emit add.rn.f32
+;; to prevent ptxas from fusing this with anything else.
+;; FAST: add.f32
+;; DEFAULT: add.rn.f32
+  %v1 = fadd float %a, %b
+  ret float %v1
+}
diff --git a/test/CodeGen/NVPTX/fp-literals.ll b/test/CodeGen/NVPTX/fp-literals.ll
index 0cc2413e009f..755e0f9250a1 100644
--- a/test/CodeGen/NVPTX/fp-literals.ll
+++ b/test/CodeGen/NVPTX/fp-literals.ll
@@ -1,4 +1,7 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast | FileCheck %s
+
+target triple = "nvptx64-unknown-cuda"
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"

 ; Make sure we can properly differentiate between single-precision and
 ; double-precision FP literals.
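 ;
 ; (Illustrative aside, assuming standard PTX literal syntax: PTX spells a
 ; single-precision immediate as 0f followed by 8 hex digits and a
 ; double-precision immediate as 0d followed by 16 hex digits, so 1.0 appears
 ; roughly as
 ;
 ;   add.f32 %f1, %f1, 0f3F800000;            // 1.0f as an f32 literal
 ;   add.f64 %fd1, %fd1, 0d3FF0000000000000;  // 1.0 as an f64 literal
 ;
 ; which is exactly the distinction the checks in this file verify.)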
@@ -11,7 +14,7 @@ define float @myaddf(float %a) { } ; CHECK: myaddd -; CHECK: add.f64 %fl{{[0-9]+}}, %fl{{[0-9]+}}, 0d3FF0000000000000 +; CHECK: add.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, 0d3FF0000000000000 define double @myaddd(double %a) { %ret = fadd double %a, 1.0 ret double %ret diff --git a/test/CodeGen/NVPTX/fp16.ll b/test/CodeGen/NVPTX/fp16.ll new file mode 100644 index 000000000000..8770399f2ec9 --- /dev/null +++ b/test/CodeGen/NVPTX/fp16.ll @@ -0,0 +1,45 @@ +; RUN: llc -march=nvptx -verify-machineinstrs < %s | FileCheck %s + +declare float @llvm.convert.from.fp16.f32(i16) nounwind readnone +declare double @llvm.convert.from.fp16.f64(i16) nounwind readnone +declare i16 @llvm.convert.to.fp16.f32(float) nounwind readnone +declare i16 @llvm.convert.to.fp16.f64(double) nounwind readnone + +; CHECK-LABEL: @test_convert_fp16_to_fp32 +; CHECK: cvt.f32.f16 +define void @test_convert_fp16_to_fp32(float addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind { + %val = load i16 addrspace(1)* %in, align 2 + %cvt = call float @llvm.convert.from.fp16.f32(i16 %val) nounwind readnone + store float %cvt, float addrspace(1)* %out, align 4 + ret void +} + + +; CHECK-LABEL: @test_convert_fp16_to_fp64 +; CHECK: cvt.f64.f16 +define void @test_convert_fp16_to_fp64(double addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind { + %val = load i16 addrspace(1)* %in, align 2 + %cvt = call double @llvm.convert.from.fp16.f64(i16 %val) nounwind readnone + store double %cvt, double addrspace(1)* %out, align 4 + ret void +} + + +; CHECK-LABEL: @test_convert_fp32_to_fp16 +; CHECK: cvt.rn.f16.f32 +define void @test_convert_fp32_to_fp16(i16 addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { + %val = load float addrspace(1)* %in, align 2 + %cvt = call i16 @llvm.convert.to.fp16.f32(float %val) nounwind readnone + store i16 %cvt, i16 addrspace(1)* %out, align 4 + ret void +} + + +; CHECK-LABEL: @test_convert_fp64_to_fp16 +; CHECK: cvt.rn.f16.f64 +define void @test_convert_fp64_to_fp16(i16 addrspace(1)* noalias %out, double addrspace(1)* noalias %in) nounwind { + %val = load double addrspace(1)* %in, align 2 + %cvt = call i16 @llvm.convert.to.fp16.f64(double %val) nounwind readnone + store i16 %cvt, i16 addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/NVPTX/gvar-init.ll b/test/CodeGen/NVPTX/gvar-init.ll new file mode 100644 index 000000000000..8c959422e66a --- /dev/null +++ b/test/CodeGen/NVPTX/gvar-init.ll @@ -0,0 +1,5 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s + +; Error out if initializer is given for address spaces that do not support initializers +; XFAIL: * +@g0 = addrspace(3) global i32 42 diff --git a/test/CodeGen/NVPTX/half.ll b/test/CodeGen/NVPTX/half.ll new file mode 100644 index 000000000000..aa08cc78e91a --- /dev/null +++ b/test/CodeGen/NVPTX/half.ll @@ -0,0 +1,70 @@ +; RUN: llc < %s -march=nvptx | FileCheck %s + +define void @test_load_store(half addrspace(1)* %in, half addrspace(1)* %out) { +; CHECK-LABEL: @test_load_store +; CHECK: ld.global.u16 [[TMP:%rs[0-9]+]], [{{%r[0-9]+}}] +; CHECK: st.global.u16 [{{%r[0-9]+}}], [[TMP]] + %val = load half addrspace(1)* %in + store half %val, half addrspace(1) * %out + ret void +} + +define void @test_bitcast_from_half(half addrspace(1)* %in, i16 addrspace(1)* %out) { +; CHECK-LABEL: @test_bitcast_from_half +; CHECK: ld.global.u16 [[TMP:%rs[0-9]+]], [{{%r[0-9]+}}] +; CHECK: st.global.u16 [{{%r[0-9]+}}], [[TMP]] + %val = load half addrspace(1) * %in + %val_int = bitcast half %val to i16 + 
store i16 %val_int, i16 addrspace(1)* %out + ret void +} + +define void @test_bitcast_to_half(half addrspace(1)* %out, i16 addrspace(1)* %in) { +; CHECK-LABEL: @test_bitcast_to_half +; CHECK: ld.global.u16 [[TMP:%rs[0-9]+]], [{{%r[0-9]+}}] +; CHECK: st.global.u16 [{{%r[0-9]+}}], [[TMP]] + %val = load i16 addrspace(1)* %in + %val_fp = bitcast i16 %val to half + store half %val_fp, half addrspace(1)* %out + ret void +} + +define void @test_extend32(half addrspace(1)* %in, float addrspace(1)* %out) { +; CHECK-LABEL: @test_extend32 +; CHECK: cvt.f32.f16 + + %val16 = load half addrspace(1)* %in + %val32 = fpext half %val16 to float + store float %val32, float addrspace(1)* %out + ret void +} + +define void @test_extend64(half addrspace(1)* %in, double addrspace(1)* %out) { +; CHECK-LABEL: @test_extend64 +; CHECK: cvt.f64.f16 + + %val16 = load half addrspace(1)* %in + %val64 = fpext half %val16 to double + store double %val64, double addrspace(1)* %out + ret void +} + +define void @test_trunc32(float addrspace(1)* %in, half addrspace(1)* %out) { +; CHECK-LABEL: test_trunc32 +; CHECK: cvt.rn.f16.f32 + + %val32 = load float addrspace(1)* %in + %val16 = fptrunc float %val32 to half + store half %val16, half addrspace(1)* %out + ret void +} + +define void @test_trunc64(double addrspace(1)* %in, half addrspace(1)* %out) { +; CHECK-LABEL: @test_trunc64 +; CHECK: cvt.rn.f16.f64 + + %val32 = load double addrspace(1)* %in + %val16 = fptrunc double %val32 to half + store half %val16, half addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/NVPTX/imad.ll b/test/CodeGen/NVPTX/imad.ll new file mode 100644 index 000000000000..67421c7cac4b --- /dev/null +++ b/test/CodeGen/NVPTX/imad.ll @@ -0,0 +1,9 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s + +; CHECK: imad +define i32 @imad(i32 %a, i32 %b, i32 %c) { +; CHECK: mad.lo.s32 + %val0 = mul i32 %a, %b + %val1 = add i32 %val0, %c + ret i32 %val1 +} diff --git a/test/CodeGen/NVPTX/implicit-def.ll b/test/CodeGen/NVPTX/implicit-def.ll index 06d3d562046e..2d2c6e527f6d 100644 --- a/test/CodeGen/NVPTX/implicit-def.ll +++ b/test/CodeGen/NVPTX/implicit-def.ll @@ -1,7 +1,7 @@ ; RUN: llc < %s -O0 -march=nvptx -mcpu=sm_20 -asm-verbose=1 | FileCheck %s ; CHECK: // implicit-def: %f[[F0:[0-9]+]] -; CHECK: add.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f[[F0]]; +; CHECK: add.rn.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f[[F0]]; define float @foo(float %a) { %ret = fadd float %a, undef ret float %ret diff --git a/test/CodeGen/NVPTX/inline-asm.ll b/test/CodeGen/NVPTX/inline-asm.ll index d76eb4239ee3..6f0578d4cff4 100644 --- a/test/CodeGen/NVPTX/inline-asm.ll +++ b/test/CodeGen/NVPTX/inline-asm.ll @@ -7,3 +7,10 @@ entry: %0 = call float asm "ex2.approx.ftz.f32 $0, $1;", "=f,f"(float %x) ret float %0 } + +define i32 @foo(i1 signext %cond, i32 %a, i32 %b) #0 { +entry: +; CHECK: selp.b32 %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %p{{[0-9]+}} + %0 = tail call i32 asm "selp.b32 $0, $1, $2, $3;", "=r,r,r,b"(i32 %a, i32 %b, i1 %cond) + ret i32 %0 +} diff --git a/test/CodeGen/NVPTX/intrinsic-old.ll b/test/CodeGen/NVPTX/intrinsic-old.ll index af91bb442412..3c51776c0ec9 100644 --- a/test/CodeGen/NVPTX/intrinsic-old.ll +++ b/test/CodeGen/NVPTX/intrinsic-old.ll @@ -198,7 +198,7 @@ define ptx_device i32 @test_clock() { } define ptx_device i64 @test_clock64() { -; CHECK: mov.u64 %rl{{[0-9]+}}, %clock64; +; CHECK: mov.u64 %rd{{[0-9]+}}, %clock64; ; CHECK: ret; %x = call i64 @llvm.ptx.read.clock64() ret i64 %x diff --git a/test/CodeGen/NVPTX/intrinsics.ll b/test/CodeGen/NVPTX/intrinsics.ll index 
78e1e7789014..34b671d70e94 100644 --- a/test/CodeGen/NVPTX/intrinsics.ll +++ b/test/CodeGen/NVPTX/intrinsics.ll @@ -9,7 +9,7 @@ define ptx_device float @test_fabsf(float %f) { } define ptx_device double @test_fabs(double %d) { -; CHECK: abs.f64 %fl{{[0-9]+}}, %fl{{[0-9]+}}; +; CHECK: abs.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}; ; CHECK: ret; %x = call double @llvm.fabs.f64(double %d) ret double %x diff --git a/test/CodeGen/NVPTX/isspacep.ll b/test/CodeGen/NVPTX/isspacep.ll new file mode 100644 index 000000000000..47fa7a6714df --- /dev/null +++ b/test/CodeGen/NVPTX/isspacep.ll @@ -0,0 +1,35 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s + +declare i1 @llvm.nvvm.isspacep.const(i8*) readnone noinline +declare i1 @llvm.nvvm.isspacep.global(i8*) readnone noinline +declare i1 @llvm.nvvm.isspacep.local(i8*) readnone noinline +declare i1 @llvm.nvvm.isspacep.shared(i8*) readnone noinline + +; CHECK: is_const +define i1 @is_const(i8* %addr) { +; CHECK: isspacep.const + %v = tail call i1 @llvm.nvvm.isspacep.const(i8* %addr) + ret i1 %v +} + +; CHECK: is_global +define i1 @is_global(i8* %addr) { +; CHECK: isspacep.global + %v = tail call i1 @llvm.nvvm.isspacep.global(i8* %addr) + ret i1 %v +} + +; CHECK: is_local +define i1 @is_local(i8* %addr) { +; CHECK: isspacep.local + %v = tail call i1 @llvm.nvvm.isspacep.local(i8* %addr) + ret i1 %v +} + +; CHECK: is_shared +define i1 @is_shared(i8* %addr) { +; CHECK: isspacep.shared + %v = tail call i1 @llvm.nvvm.isspacep.shared(i8* %addr) + ret i1 %v +} + diff --git a/test/CodeGen/NVPTX/ld-addrspace.ll b/test/CodeGen/NVPTX/ld-addrspace.ll index 133ef09afdb2..f33659c92e84 100644 --- a/test/CodeGen/NVPTX/ld-addrspace.ll +++ b/test/CodeGen/NVPTX/ld-addrspace.ll @@ -6,7 +6,7 @@ define i8 @ld_global_i8(i8 addrspace(1)* %ptr) { ; PTX32: ld.global.u8 %r{{[0-9]+}}, [%r{{[0-9]+}}] ; PTX32: ret -; PTX64: ld.global.u8 %r{{[0-9]+}}, [%rl{{[0-9]+}}] +; PTX64: ld.global.u8 %r{{[0-9]+}}, [%rd{{[0-9]+}}] ; PTX64: ret %a = load i8 addrspace(1)* %ptr ret i8 %a @@ -15,7 +15,7 @@ define i8 @ld_global_i8(i8 addrspace(1)* %ptr) { define i8 @ld_shared_i8(i8 addrspace(3)* %ptr) { ; PTX32: ld.shared.u8 %r{{[0-9]+}}, [%r{{[0-9]+}}] ; PTX32: ret -; PTX64: ld.shared.u8 %r{{[0-9]+}}, [%rl{{[0-9]+}}] +; PTX64: ld.shared.u8 %r{{[0-9]+}}, [%rd{{[0-9]+}}] ; PTX64: ret %a = load i8 addrspace(3)* %ptr ret i8 %a @@ -24,7 +24,7 @@ define i8 @ld_shared_i8(i8 addrspace(3)* %ptr) { define i8 @ld_local_i8(i8 addrspace(5)* %ptr) { ; PTX32: ld.local.u8 %r{{[0-9]+}}, [%r{{[0-9]+}}] ; PTX32: ret -; PTX64: ld.local.u8 %r{{[0-9]+}}, [%rl{{[0-9]+}}] +; PTX64: ld.local.u8 %r{{[0-9]+}}, [%rd{{[0-9]+}}] ; PTX64: ret %a = load i8 addrspace(5)* %ptr ret i8 %a @@ -34,7 +34,7 @@ define i8 @ld_local_i8(i8 addrspace(5)* %ptr) { define i16 @ld_global_i16(i16 addrspace(1)* %ptr) { ; PTX32: ld.global.u16 %r{{[0-9]+}}, [%r{{[0-9]+}}] ; PTX32: ret -; PTX64: ld.global.u16 %r{{[0-9]+}}, [%rl{{[0-9]+}}] +; PTX64: ld.global.u16 %r{{[0-9]+}}, [%rd{{[0-9]+}}] ; PTX64: ret %a = load i16 addrspace(1)* %ptr ret i16 %a @@ -43,7 +43,7 @@ define i16 @ld_global_i16(i16 addrspace(1)* %ptr) { define i16 @ld_shared_i16(i16 addrspace(3)* %ptr) { ; PTX32: ld.shared.u16 %r{{[0-9]+}}, [%r{{[0-9]+}}] ; PTX32: ret -; PTX64: ld.shared.u16 %r{{[0-9]+}}, [%rl{{[0-9]+}}] +; PTX64: ld.shared.u16 %r{{[0-9]+}}, [%rd{{[0-9]+}}] ; PTX64: ret %a = load i16 addrspace(3)* %ptr ret i16 %a @@ -52,7 +52,7 @@ define i16 @ld_shared_i16(i16 addrspace(3)* %ptr) { define i16 @ld_local_i16(i16 addrspace(5)* %ptr) { ; PTX32: ld.local.u16 %r{{[0-9]+}}, 
[%r{{[0-9]+}}] ; PTX32: ret -; PTX64: ld.local.u16 %r{{[0-9]+}}, [%rl{{[0-9]+}}] +; PTX64: ld.local.u16 %r{{[0-9]+}}, [%rd{{[0-9]+}}] ; PTX64: ret %a = load i16 addrspace(5)* %ptr ret i16 %a @@ -62,7 +62,7 @@ define i16 @ld_local_i16(i16 addrspace(5)* %ptr) { define i32 @ld_global_i32(i32 addrspace(1)* %ptr) { ; PTX32: ld.global.u32 %r{{[0-9]+}}, [%r{{[0-9]+}}] ; PTX32: ret -; PTX64: ld.global.u32 %r{{[0-9]+}}, [%rl{{[0-9]+}}] +; PTX64: ld.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] ; PTX64: ret %a = load i32 addrspace(1)* %ptr ret i32 %a @@ -71,7 +71,7 @@ define i32 @ld_global_i32(i32 addrspace(1)* %ptr) { define i32 @ld_shared_i32(i32 addrspace(3)* %ptr) { ; PTX32: ld.shared.u32 %r{{[0-9]+}}, [%r{{[0-9]+}}] ; PTX32: ret -; PTX64: ld.shared.u32 %r{{[0-9]+}}, [%rl{{[0-9]+}}] +; PTX64: ld.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] ; PTX64: ret %a = load i32 addrspace(3)* %ptr ret i32 %a @@ -80,7 +80,7 @@ define i32 @ld_shared_i32(i32 addrspace(3)* %ptr) { define i32 @ld_local_i32(i32 addrspace(5)* %ptr) { ; PTX32: ld.local.u32 %r{{[0-9]+}}, [%r{{[0-9]+}}] ; PTX32: ret -; PTX64: ld.local.u32 %r{{[0-9]+}}, [%rl{{[0-9]+}}] +; PTX64: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] ; PTX64: ret %a = load i32 addrspace(5)* %ptr ret i32 %a @@ -88,27 +88,27 @@ define i32 @ld_local_i32(i32 addrspace(5)* %ptr) { ;; i64 define i64 @ld_global_i64(i64 addrspace(1)* %ptr) { -; PTX32: ld.global.u64 %rl{{[0-9]+}}, [%r{{[0-9]+}}] +; PTX32: ld.global.u64 %rd{{[0-9]+}}, [%r{{[0-9]+}}] ; PTX32: ret -; PTX64: ld.global.u64 %rl{{[0-9]+}}, [%rl{{[0-9]+}}] +; PTX64: ld.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] ; PTX64: ret %a = load i64 addrspace(1)* %ptr ret i64 %a } define i64 @ld_shared_i64(i64 addrspace(3)* %ptr) { -; PTX32: ld.shared.u64 %rl{{[0-9]+}}, [%r{{[0-9]+}}] +; PTX32: ld.shared.u64 %rd{{[0-9]+}}, [%r{{[0-9]+}}] ; PTX32: ret -; PTX64: ld.shared.u64 %rl{{[0-9]+}}, [%rl{{[0-9]+}}] +; PTX64: ld.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] ; PTX64: ret %a = load i64 addrspace(3)* %ptr ret i64 %a } define i64 @ld_local_i64(i64 addrspace(5)* %ptr) { -; PTX32: ld.local.u64 %rl{{[0-9]+}}, [%r{{[0-9]+}}] +; PTX32: ld.local.u64 %rd{{[0-9]+}}, [%r{{[0-9]+}}] ; PTX32: ret -; PTX64: ld.local.u64 %rl{{[0-9]+}}, [%rl{{[0-9]+}}] +; PTX64: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] ; PTX64: ret %a = load i64 addrspace(5)* %ptr ret i64 %a @@ -118,7 +118,7 @@ define i64 @ld_local_i64(i64 addrspace(5)* %ptr) { define float @ld_global_f32(float addrspace(1)* %ptr) { ; PTX32: ld.global.f32 %f{{[0-9]+}}, [%r{{[0-9]+}}] ; PTX32: ret -; PTX64: ld.global.f32 %f{{[0-9]+}}, [%rl{{[0-9]+}}] +; PTX64: ld.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] ; PTX64: ret %a = load float addrspace(1)* %ptr ret float %a @@ -127,7 +127,7 @@ define float @ld_global_f32(float addrspace(1)* %ptr) { define float @ld_shared_f32(float addrspace(3)* %ptr) { ; PTX32: ld.shared.f32 %f{{[0-9]+}}, [%r{{[0-9]+}}] ; PTX32: ret -; PTX64: ld.shared.f32 %f{{[0-9]+}}, [%rl{{[0-9]+}}] +; PTX64: ld.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] ; PTX64: ret %a = load float addrspace(3)* %ptr ret float %a @@ -136,7 +136,7 @@ define float @ld_shared_f32(float addrspace(3)* %ptr) { define float @ld_local_f32(float addrspace(5)* %ptr) { ; PTX32: ld.local.f32 %f{{[0-9]+}}, [%r{{[0-9]+}}] ; PTX32: ret -; PTX64: ld.local.f32 %f{{[0-9]+}}, [%rl{{[0-9]+}}] +; PTX64: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] ; PTX64: ret %a = load float addrspace(5)* %ptr ret float %a @@ -144,27 +144,27 @@ define float @ld_local_f32(float addrspace(5)* %ptr) { ;; f64 define double @ld_global_f64(double 
addrspace(1)* %ptr) { -; PTX32: ld.global.f64 %fl{{[0-9]+}}, [%r{{[0-9]+}}] +; PTX32: ld.global.f64 %fd{{[0-9]+}}, [%r{{[0-9]+}}] ; PTX32: ret -; PTX64: ld.global.f64 %fl{{[0-9]+}}, [%rl{{[0-9]+}}] +; PTX64: ld.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] ; PTX64: ret %a = load double addrspace(1)* %ptr ret double %a } define double @ld_shared_f64(double addrspace(3)* %ptr) { -; PTX32: ld.shared.f64 %fl{{[0-9]+}}, [%r{{[0-9]+}}] +; PTX32: ld.shared.f64 %fd{{[0-9]+}}, [%r{{[0-9]+}}] ; PTX32: ret -; PTX64: ld.shared.f64 %fl{{[0-9]+}}, [%rl{{[0-9]+}}] +; PTX64: ld.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] ; PTX64: ret %a = load double addrspace(3)* %ptr ret double %a } define double @ld_local_f64(double addrspace(5)* %ptr) { -; PTX32: ld.local.f64 %fl{{[0-9]+}}, [%r{{[0-9]+}}] +; PTX32: ld.local.f64 %fd{{[0-9]+}}, [%r{{[0-9]+}}] ; PTX32: ret -; PTX64: ld.local.f64 %fl{{[0-9]+}}, [%rl{{[0-9]+}}] +; PTX64: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] ; PTX64: ret %a = load double addrspace(5)* %ptr ret double %a diff --git a/test/CodeGen/NVPTX/ld-generic.ll b/test/CodeGen/NVPTX/ld-generic.ll index 3728268c24d5..d629e0ecc647 100644 --- a/test/CodeGen/NVPTX/ld-generic.ll +++ b/test/CodeGen/NVPTX/ld-generic.ll @@ -6,7 +6,7 @@ define i8 @ld_global_i8(i8 addrspace(0)* %ptr) { ; PTX32: ld.u8 %r{{[0-9]+}}, [%r{{[0-9]+}}] ; PTX32: ret -; PTX64: ld.u8 %r{{[0-9]+}}, [%rl{{[0-9]+}}] +; PTX64: ld.u8 %r{{[0-9]+}}, [%rd{{[0-9]+}}] ; PTX64: ret %a = load i8 addrspace(0)* %ptr ret i8 %a @@ -16,7 +16,7 @@ define i8 @ld_global_i8(i8 addrspace(0)* %ptr) { define i16 @ld_global_i16(i16 addrspace(0)* %ptr) { ; PTX32: ld.u16 %r{{[0-9]+}}, [%r{{[0-9]+}}] ; PTX32: ret -; PTX64: ld.u16 %r{{[0-9]+}}, [%rl{{[0-9]+}}] +; PTX64: ld.u16 %r{{[0-9]+}}, [%rd{{[0-9]+}}] ; PTX64: ret %a = load i16 addrspace(0)* %ptr ret i16 %a @@ -26,7 +26,7 @@ define i16 @ld_global_i16(i16 addrspace(0)* %ptr) { define i32 @ld_global_i32(i32 addrspace(0)* %ptr) { ; PTX32: ld.u32 %r{{[0-9]+}}, [%r{{[0-9]+}}] ; PTX32: ret -; PTX64: ld.u32 %r{{[0-9]+}}, [%rl{{[0-9]+}}] +; PTX64: ld.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] ; PTX64: ret %a = load i32 addrspace(0)* %ptr ret i32 %a @@ -34,9 +34,9 @@ define i32 @ld_global_i32(i32 addrspace(0)* %ptr) { ;; i64 define i64 @ld_global_i64(i64 addrspace(0)* %ptr) { -; PTX32: ld.u64 %rl{{[0-9]+}}, [%r{{[0-9]+}}] +; PTX32: ld.u64 %rd{{[0-9]+}}, [%r{{[0-9]+}}] ; PTX32: ret -; PTX64: ld.u64 %rl{{[0-9]+}}, [%rl{{[0-9]+}}] +; PTX64: ld.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] ; PTX64: ret %a = load i64 addrspace(0)* %ptr ret i64 %a @@ -46,7 +46,7 @@ define i64 @ld_global_i64(i64 addrspace(0)* %ptr) { define float @ld_global_f32(float addrspace(0)* %ptr) { ; PTX32: ld.f32 %f{{[0-9]+}}, [%r{{[0-9]+}}] ; PTX32: ret -; PTX64: ld.f32 %f{{[0-9]+}}, [%rl{{[0-9]+}}] +; PTX64: ld.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] ; PTX64: ret %a = load float addrspace(0)* %ptr ret float %a @@ -54,9 +54,9 @@ define float @ld_global_f32(float addrspace(0)* %ptr) { ;; f64 define double @ld_global_f64(double addrspace(0)* %ptr) { -; PTX32: ld.f64 %fl{{[0-9]+}}, [%r{{[0-9]+}}] +; PTX32: ld.f64 %fd{{[0-9]+}}, [%r{{[0-9]+}}] ; PTX32: ret -; PTX64: ld.f64 %fl{{[0-9]+}}, [%rl{{[0-9]+}}] +; PTX64: ld.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] ; PTX64: ret %a = load double addrspace(0)* %ptr ret double %a diff --git a/test/CodeGen/NVPTX/ldu-i8.ll b/test/CodeGen/NVPTX/ldu-i8.ll index 81a82b2c38b5..9cc667557906 100644 --- a/test/CodeGen/NVPTX/ldu-i8.ll +++ b/test/CodeGen/NVPTX/ldu-i8.ll @@ -2,13 +2,15 @@ target datalayout = 
"e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" -declare i8 @llvm.nvvm.ldu.global.i.i8(i8*) +declare i8 @llvm.nvvm.ldu.global.i.i8.p0i8(i8*) define i8 @foo(i8* %a) { ; Ensure we properly truncate off the high-order 24 bits ; CHECK: ldu.global.u8 ; CHECK: cvt.u32.u16 ; CHECK: and.b32 %r{{[0-9]+}}, %r{{[0-9]+}}, 255 - %val = tail call i8 @llvm.nvvm.ldu.global.i.i8(i8* %a) + %val = tail call i8 @llvm.nvvm.ldu.global.i.i8.p0i8(i8* %a), !align !0 ret i8 %val } + +!0 = metadata !{i32 4} diff --git a/test/CodeGen/NVPTX/ldu-ldg.ll b/test/CodeGen/NVPTX/ldu-ldg.ll new file mode 100644 index 000000000000..3b0619ff5175 --- /dev/null +++ b/test/CodeGen/NVPTX/ldu-ldg.ll @@ -0,0 +1,40 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s + + +declare i8 @llvm.nvvm.ldu.global.i.i8.p1i8(i8 addrspace(1)* %ptr) +declare i32 @llvm.nvvm.ldu.global.i.i32.p1i32(i32 addrspace(1)* %ptr) +declare i8 @llvm.nvvm.ldg.global.i.i8.p1i8(i8 addrspace(1)* %ptr) +declare i32 @llvm.nvvm.ldg.global.i.i32.p1i32(i32 addrspace(1)* %ptr) + + +; CHECK: func0 +define i8 @func0(i8 addrspace(1)* %ptr) { +; ldu.global.u8 + %val = tail call i8 @llvm.nvvm.ldu.global.i.i8.p1i8(i8 addrspace(1)* %ptr), !align !0 + ret i8 %val +} + +; CHECK: func1 +define i32 @func1(i32 addrspace(1)* %ptr) { +; ldu.global.u32 + %val = tail call i32 @llvm.nvvm.ldu.global.i.i32.p1i32(i32 addrspace(1)* %ptr), !align !0 + ret i32 %val +} + +; CHECK: func2 +define i8 @func2(i8 addrspace(1)* %ptr) { +; ld.global.nc.u8 + %val = tail call i8 @llvm.nvvm.ldg.global.i.i8.p1i8(i8 addrspace(1)* %ptr), !align !0 + ret i8 %val +} + +; CHECK: func3 +define i32 @func3(i32 addrspace(1)* %ptr) { +; ld.global.nc.u32 + %val = tail call i32 @llvm.nvvm.ldg.global.i.i32.p1i32(i32 addrspace(1)* %ptr), !align !0 + ret i32 %val +} + + + +!0 = metadata !{i32 4} diff --git a/test/CodeGen/NVPTX/ldu-reg-plus-offset.ll b/test/CodeGen/NVPTX/ldu-reg-plus-offset.ll index 26cadc401b79..55707ea85106 100644 --- a/test/CodeGen/NVPTX/ldu-reg-plus-offset.ll +++ b/test/CodeGen/NVPTX/ldu-reg-plus-offset.ll @@ -7,9 +7,9 @@ define void @reg_plus_offset(i32* %a) { ; CHECK: ldu.global.u32 %r{{[0-9]+}}, [%r{{[0-9]+}}+32]; ; CHECK: ldu.global.u32 %r{{[0-9]+}}, [%r{{[0-9]+}}+36]; %p2 = getelementptr i32* %a, i32 8 - %t1 = call i32 @llvm.nvvm.ldu.global.i.i32(i32* %p2), !align !1 + %t1 = call i32 @llvm.nvvm.ldu.global.i.i32.p0i32(i32* %p2), !align !1 %p3 = getelementptr i32* %a, i32 9 - %t2 = call i32 @llvm.nvvm.ldu.global.i.i32(i32* %p3), !align !1 + %t2 = call i32 @llvm.nvvm.ldu.global.i.i32.p0i32(i32* %p3), !align !1 %t3 = mul i32 %t1, %t2 store i32 %t3, i32* %a ret void @@ -17,5 +17,5 @@ define void @reg_plus_offset(i32* %a) { !1 = metadata !{ i32 4 } -declare i32 @llvm.nvvm.ldu.global.i.i32(i32*) +declare i32 @llvm.nvvm.ldu.global.i.i32.p0i32(i32*) declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() diff --git a/test/CodeGen/NVPTX/local-stack-frame.ll b/test/CodeGen/NVPTX/local-stack-frame.ll index c0d7d1c3a638..377eee9170e6 100644 --- a/test/CodeGen/NVPTX/local-stack-frame.ll +++ b/test/CodeGen/NVPTX/local-stack-frame.ll @@ -7,8 +7,8 @@ ; PTX32: cvta.local.u32 %SP, %r{{[0-9]+}}; ; PTX32: ld.param.u32 %r{{[0-9]+}}, [foo_param_0]; ; PTX32: st.volatile.u32 [%SP+0], %r{{[0-9]+}}; -; PTX64: mov.u64 %rl{{[0-9]+}}, __local_depot{{[0-9]+}}; -; PTX64: cvta.local.u64 %SP, %rl{{[0-9]+}}; +; PTX64: mov.u64 %rd{{[0-9]+}}, __local_depot{{[0-9]+}}; +; PTX64: cvta.local.u64 %SP, %rd{{[0-9]+}}; ; PTX64: ld.param.u32 %r{{[0-9]+}}, 
[foo_param_0]; ; PTX64: st.volatile.u32 [%SP+0], %r{{[0-9]+}}; define void @foo(i32 %a) { diff --git a/test/CodeGen/NVPTX/managed.ll b/test/CodeGen/NVPTX/managed.ll new file mode 100644 index 000000000000..4d7e7817f77b --- /dev/null +++ b/test/CodeGen/NVPTX/managed.ll @@ -0,0 +1,11 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s + + +; CHECK: .visible .global .align 4 .u32 device_g; +@device_g = addrspace(1) global i32 zeroinitializer +; CHECK: .visible .global .attribute(.managed) .align 4 .u32 managed_g; +@managed_g = addrspace(1) global i32 zeroinitializer + + +!nvvm.annotations = !{!0} +!0 = metadata !{i32 addrspace(1)* @managed_g, metadata !"managed", i32 1} diff --git a/test/CodeGen/NVPTX/misaligned-vector-ldst.ll b/test/CodeGen/NVPTX/misaligned-vector-ldst.ll new file mode 100644 index 000000000000..90c9c4306de7 --- /dev/null +++ b/test/CodeGen/NVPTX/misaligned-vector-ldst.ll @@ -0,0 +1,77 @@ +; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +; CHECK-LABEL: t1 +define <4 x float> @t1(i8* %p1) { +; CHECK-NOT: ld.v4 +; CHECK-NOT: ld.v2 +; CHECK-NOT: ld.f32 +; CHECK: ld.u8 + %cast = bitcast i8* %p1 to <4 x float>* + %r = load <4 x float>* %cast, align 1 + ret <4 x float> %r +} + +; CHECK-LABEL: t2 +define <4 x float> @t2(i8* %p1) { +; CHECK-NOT: ld.v4 +; CHECK-NOT: ld.v2 +; CHECK: ld.f32 + %cast = bitcast i8* %p1 to <4 x float>* + %r = load <4 x float>* %cast, align 4 + ret <4 x float> %r +} + +; CHECK-LABEL: t3 +define <4 x float> @t3(i8* %p1) { +; CHECK-NOT: ld.v4 +; CHECK: ld.v2 + %cast = bitcast i8* %p1 to <4 x float>* + %r = load <4 x float>* %cast, align 8 + ret <4 x float> %r +} + +; CHECK-LABEL: t4 +define <4 x float> @t4(i8* %p1) { +; CHECK: ld.v4 + %cast = bitcast i8* %p1 to <4 x float>* + %r = load <4 x float>* %cast, align 16 + ret <4 x float> %r +} + + +; CHECK-LABEL: s1 +define void @s1(<4 x float>* %p1, <4 x float> %v) { +; CHECK-NOT: st.v4 +; CHECK-NOT: st.v2 +; CHECK-NOT: st.f32 +; CHECK: st.u8 + store <4 x float> %v, <4 x float>* %p1, align 1 + ret void +} + +; CHECK-LABEL: s2 +define void @s2(<4 x float>* %p1, <4 x float> %v) { +; CHECK-NOT: st.v4 +; CHECK-NOT: st.v2 +; CHECK: st.f32 + store <4 x float> %v, <4 x float>* %p1, align 4 + ret void +} + +; CHECK-LABEL: s3 +define void @s3(<4 x float>* %p1, <4 x float> %v) { +; CHECK-NOT: st.v4 + store <4 x float> %v, <4 x float>* %p1, align 8 + ret void +} + +; CHECK-LABEL: s4 +define void @s4(<4 x float>* %p1, <4 x float> %v) { +; CHECK: st.v4 + store <4 x float> %v, <4 x float>* %p1, align 16 + ret void +} + diff --git a/test/CodeGen/NVPTX/mulwide.ll b/test/CodeGen/NVPTX/mulwide.ll new file mode 100644 index 000000000000..927946c8fb33 --- /dev/null +++ b/test/CodeGen/NVPTX/mulwide.ll @@ -0,0 +1,37 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s + +; CHECK: mulwide16 +define i32 @mulwide16(i16 %a, i16 %b) { +; CHECK: mul.wide.s16 + %val0 = sext i16 %a to i32 + %val1 = sext i16 %b to i32 + %val2 = mul i32 %val0, %val1 + ret i32 %val2 +} + +; CHECK: mulwideu16 +define i32 @mulwideu16(i16 %a, i16 %b) { +; CHECK: mul.wide.u16 + %val0 = zext i16 %a to i32 + %val1 = zext i16 %b to i32 + %val2 = mul i32 %val0, %val1 + ret i32 %val2 +} + +; CHECK: mulwide32 +define i64 @mulwide32(i32 %a, i32 %b) { +; CHECK: mul.wide.s32 + %val0 = sext i32 %a to i64 + %val1 = sext i32 %b to i64 + %val2 = mul i64 %val0, 
%val1 + ret i64 %val2 +} + +; CHECK: mulwideu32 +define i64 @mulwideu32(i32 %a, i32 %b) { +; CHECK: mul.wide.u32 + %val0 = zext i32 %a to i64 + %val1 = zext i32 %b to i64 + %val2 = mul i64 %val0, %val1 + ret i64 %val2 +} diff --git a/test/CodeGen/NVPTX/nvvm-reflect.ll b/test/CodeGen/NVPTX/nvvm-reflect.ll index 45438847b8de..21e9c69e657a 100644 --- a/test/CodeGen/NVPTX/nvvm-reflect.ll +++ b/test/CodeGen/NVPTX/nvvm-reflect.ll @@ -32,3 +32,17 @@ exit: %ret = phi float [%ret1, %use_mul], [%ret2, %use_add] ret float %ret } + +declare i32 @llvm.nvvm.reflect.p0i8(i8*) + +; USE_MUL_0: define i32 @intrinsic +; USE_MUL_1: define i32 @intrinsic +define i32 @intrinsic() { +; USE_MUL_0-NOT: call i32 @llvm.nvvm.reflect +; USE_MUL_0: ret i32 0 +; USE_MUL_1-NOT: call i32 @llvm.nvvm.reflect +; USE_MUL_1: ret i32 1 + %ptr = tail call i8* @llvm.nvvm.ptr.constant.to.gen.p0i8.p4i8(i8 addrspace(4)* getelementptr inbounds ([8 x i8] addrspace(4)* @str, i32 0, i32 0)) + %reflect = tail call i32 @llvm.nvvm.reflect.p0i8(i8* %ptr) + ret i32 %reflect +} diff --git a/test/CodeGen/NVPTX/pr13291-i1-store.ll b/test/CodeGen/NVPTX/pr13291-i1-store.ll index e7a81be01b14..cc67a6fff8e4 100644 --- a/test/CodeGen/NVPTX/pr13291-i1-store.ll +++ b/test/CodeGen/NVPTX/pr13291-i1-store.ll @@ -5,7 +5,7 @@ define ptx_kernel void @t1(i1* %a) { ; PTX32: mov.u16 %rs{{[0-9]+}}, 0; ; PTX32-NEXT: st.u8 [%r{{[0-9]+}}], %rs{{[0-9]+}}; ; PTX64: mov.u16 %rs{{[0-9]+}}, 0; -; PTX64-NEXT: st.u8 [%rl{{[0-9]+}}], %rs{{[0-9]+}}; +; PTX64-NEXT: st.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}; store i1 false, i1* %a ret void } @@ -15,7 +15,7 @@ define ptx_kernel void @t2(i1* %a, i8* %b) { ; PTX32: ld.u8 %rs{{[0-9]+}}, [%r{{[0-9]+}}] ; PTX32: and.b16 %rs{{[0-9]+}}, %rs{{[0-9]+}}, 1; ; PTX32: setp.eq.b16 %p{{[0-9]+}}, %rs{{[0-9]+}}, 1; -; PTX64: ld.u8 %rs{{[0-9]+}}, [%rl{{[0-9]+}}] +; PTX64: ld.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] ; PTX64: and.b16 %rs{{[0-9]+}}, %rs{{[0-9]+}}, 1; ; PTX64: setp.eq.b16 %p{{[0-9]+}}, %rs{{[0-9]+}}, 1; diff --git a/test/CodeGen/NVPTX/rotate.ll b/test/CodeGen/NVPTX/rotate.ll new file mode 100644 index 000000000000..dfc8b4fd5fcb --- /dev/null +++ b/test/CodeGen/NVPTX/rotate.ll @@ -0,0 +1,58 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck --check-prefix=SM20 %s +; RUN: llc < %s -march=nvptx -mcpu=sm_35 | FileCheck --check-prefix=SM35 %s + + +declare i32 @llvm.nvvm.rotate.b32(i32, i32) +declare i64 @llvm.nvvm.rotate.b64(i64, i32) +declare i64 @llvm.nvvm.rotate.right.b64(i64, i32) + +; SM20: rotate32 +; SM35: rotate32 +define i32 @rotate32(i32 %a, i32 %b) { +; SM20: shl.b32 +; SM20: sub.s32 +; SM20: shr.b32 +; SM20: add.u32 +; SM35: shf.l.wrap.b32 + %val = tail call i32 @llvm.nvvm.rotate.b32(i32 %a, i32 %b) + ret i32 %val +} + +; SM20: rotate64 +; SM35: rotate64 +define i64 @rotate64(i64 %a, i32 %b) { +; SM20: shl.b64 +; SM20: sub.u32 +; SM20: shr.b64 +; SM20: add.u64 +; SM35: shf.l.wrap.b32 +; SM35: shf.l.wrap.b32 + %val = tail call i64 @llvm.nvvm.rotate.b64(i64 %a, i32 %b) + ret i64 %val +} + +; SM20: rotateright64 +; SM35: rotateright64 +define i64 @rotateright64(i64 %a, i32 %b) { +; SM20: shr.b64 +; SM20: sub.u32 +; SM20: shl.b64 +; SM20: add.u64 +; SM35: shf.r.wrap.b32 +; SM35: shf.r.wrap.b32 + %val = tail call i64 @llvm.nvvm.rotate.right.b64(i64 %a, i32 %b) + ret i64 %val +} + +; SM20: rotl0 +; SM35: rotl0 +define i32 @rotl0(i32 %x) { +; SM20: shl.b32 +; SM20: shr.b32 +; SM20: add.u32 +; SM35: shf.l.wrap.b32 + %t0 = shl i32 %x, 8 + %t1 = lshr i32 %x, 24 + %t2 = or i32 %t0, %t1 + ret i32 %t2 +} diff --git 
a/test/CodeGen/NVPTX/shift-parts.ll b/test/CodeGen/NVPTX/shift-parts.ll new file mode 100644 index 000000000000..748297caf339 --- /dev/null +++ b/test/CodeGen/NVPTX/shift-parts.ll @@ -0,0 +1,38 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s + +; CHECK: shift_parts_left_128 +define void @shift_parts_left_128(i128* %val, i128* %amtptr) { +; CHECK: shl.b64 +; CHECK: mov.u32 +; CHECK: sub.s32 +; CHECK: shr.u64 +; CHECK: or.b64 +; CHECK: add.s32 +; CHECK: shl.b64 +; CHECK: setp.gt.s32 +; CHECK: selp.b64 +; CHECK: shl.b64 + %amt = load i128* %amtptr + %a = load i128* %val + %val0 = shl i128 %a, %amt + store i128 %val0, i128* %val + ret void +} + +; CHECK: shift_parts_right_128 +define void @shift_parts_right_128(i128* %val, i128* %amtptr) { +; CHECK: shr.u64 +; CHECK: sub.s32 +; CHECK: shl.b64 +; CHECK: or.b64 +; CHECK: add.s32 +; CHECK: shr.s64 +; CHECK: setp.gt.s32 +; CHECK: selp.b64 +; CHECK: shr.s64 + %amt = load i128* %amtptr + %a = load i128* %val + %val0 = ashr i128 %a, %amt + store i128 %val0, i128* %val + ret void +} diff --git a/test/CodeGen/NVPTX/st-addrspace.ll b/test/CodeGen/NVPTX/st-addrspace.ll index 68c09fe065bc..34a83f343324 100644 --- a/test/CodeGen/NVPTX/st-addrspace.ll +++ b/test/CodeGen/NVPTX/st-addrspace.ll @@ -7,7 +7,7 @@ define void @st_global_i8(i8 addrspace(1)* %ptr, i8 %a) { ; PTX32: st.global.u8 [%r{{[0-9]+}}], %rs{{[0-9]+}} ; PTX32: ret -; PTX64: st.global.u8 [%rl{{[0-9]+}}], %rs{{[0-9]+}} +; PTX64: st.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} ; PTX64: ret store i8 %a, i8 addrspace(1)* %ptr ret void @@ -16,7 +16,7 @@ define void @st_global_i8(i8 addrspace(1)* %ptr, i8 %a) { define void @st_shared_i8(i8 addrspace(3)* %ptr, i8 %a) { ; PTX32: st.shared.u8 [%r{{[0-9]+}}], %rs{{[0-9]+}} ; PTX32: ret -; PTX64: st.shared.u8 [%rl{{[0-9]+}}], %rs{{[0-9]+}} +; PTX64: st.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} ; PTX64: ret store i8 %a, i8 addrspace(3)* %ptr ret void @@ -25,7 +25,7 @@ define void @st_shared_i8(i8 addrspace(3)* %ptr, i8 %a) { define void @st_local_i8(i8 addrspace(5)* %ptr, i8 %a) { ; PTX32: st.local.u8 [%r{{[0-9]+}}], %rs{{[0-9]+}} ; PTX32: ret -; PTX64: st.local.u8 [%rl{{[0-9]+}}], %rs{{[0-9]+}} +; PTX64: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} ; PTX64: ret store i8 %a, i8 addrspace(5)* %ptr ret void @@ -36,7 +36,7 @@ define void @st_local_i8(i8 addrspace(5)* %ptr, i8 %a) { define void @st_global_i16(i16 addrspace(1)* %ptr, i16 %a) { ; PTX32: st.global.u16 [%r{{[0-9]+}}], %rs{{[0-9]+}} ; PTX32: ret -; PTX64: st.global.u16 [%rl{{[0-9]+}}], %rs{{[0-9]+}} +; PTX64: st.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} ; PTX64: ret store i16 %a, i16 addrspace(1)* %ptr ret void @@ -45,7 +45,7 @@ define void @st_global_i16(i16 addrspace(1)* %ptr, i16 %a) { define void @st_shared_i16(i16 addrspace(3)* %ptr, i16 %a) { ; PTX32: st.shared.u16 [%r{{[0-9]+}}], %rs{{[0-9]+}} ; PTX32: ret -; PTX64: st.shared.u16 [%rl{{[0-9]+}}], %rs{{[0-9]+}} +; PTX64: st.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} ; PTX64: ret store i16 %a, i16 addrspace(3)* %ptr ret void @@ -54,7 +54,7 @@ define void @st_shared_i16(i16 addrspace(3)* %ptr, i16 %a) { define void @st_local_i16(i16 addrspace(5)* %ptr, i16 %a) { ; PTX32: st.local.u16 [%r{{[0-9]+}}], %rs{{[0-9]+}} ; PTX32: ret -; PTX64: st.local.u16 [%rl{{[0-9]+}}], %rs{{[0-9]+}} +; PTX64: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} ; PTX64: ret store i16 %a, i16 addrspace(5)* %ptr ret void @@ -65,7 +65,7 @@ define void @st_local_i16(i16 addrspace(5)* %ptr, i16 %a) { define void @st_global_i32(i32 addrspace(1)* %ptr, i32 %a) { ; PTX32: 
st.global.u32 [%r{{[0-9]+}}], %r{{[0-9]+}} ; PTX32: ret -; PTX64: st.global.u32 [%rl{{[0-9]+}}], %r{{[0-9]+}} +; PTX64: st.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} ; PTX64: ret store i32 %a, i32 addrspace(1)* %ptr ret void @@ -74,7 +74,7 @@ define void @st_global_i32(i32 addrspace(1)* %ptr, i32 %a) { define void @st_shared_i32(i32 addrspace(3)* %ptr, i32 %a) { ; PTX32: st.shared.u32 [%r{{[0-9]+}}], %r{{[0-9]+}} ; PTX32: ret -; PTX64: st.shared.u32 [%rl{{[0-9]+}}], %r{{[0-9]+}} +; PTX64: st.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} ; PTX64: ret store i32 %a, i32 addrspace(3)* %ptr ret void @@ -83,7 +83,7 @@ define void @st_shared_i32(i32 addrspace(3)* %ptr, i32 %a) { define void @st_local_i32(i32 addrspace(5)* %ptr, i32 %a) { ; PTX32: st.local.u32 [%r{{[0-9]+}}], %r{{[0-9]+}} ; PTX32: ret -; PTX64: st.local.u32 [%rl{{[0-9]+}}], %r{{[0-9]+}} +; PTX64: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} ; PTX64: ret store i32 %a, i32 addrspace(5)* %ptr ret void @@ -92,27 +92,27 @@ define void @st_local_i32(i32 addrspace(5)* %ptr, i32 %a) { ;; i64 define void @st_global_i64(i64 addrspace(1)* %ptr, i64 %a) { -; PTX32: st.global.u64 [%r{{[0-9]+}}], %rl{{[0-9]+}} +; PTX32: st.global.u64 [%r{{[0-9]+}}], %rd{{[0-9]+}} ; PTX32: ret -; PTX64: st.global.u64 [%rl{{[0-9]+}}], %rl{{[0-9]+}} +; PTX64: st.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} ; PTX64: ret store i64 %a, i64 addrspace(1)* %ptr ret void } define void @st_shared_i64(i64 addrspace(3)* %ptr, i64 %a) { -; PTX32: st.shared.u64 [%r{{[0-9]+}}], %rl{{[0-9]+}} +; PTX32: st.shared.u64 [%r{{[0-9]+}}], %rd{{[0-9]+}} ; PTX32: ret -; PTX64: st.shared.u64 [%rl{{[0-9]+}}], %rl{{[0-9]+}} +; PTX64: st.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} ; PTX64: ret store i64 %a, i64 addrspace(3)* %ptr ret void } define void @st_local_i64(i64 addrspace(5)* %ptr, i64 %a) { -; PTX32: st.local.u64 [%r{{[0-9]+}}], %rl{{[0-9]+}} +; PTX32: st.local.u64 [%r{{[0-9]+}}], %rd{{[0-9]+}} ; PTX32: ret -; PTX64: st.local.u64 [%rl{{[0-9]+}}], %rl{{[0-9]+}} +; PTX64: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} ; PTX64: ret store i64 %a, i64 addrspace(5)* %ptr ret void @@ -123,7 +123,7 @@ define void @st_local_i64(i64 addrspace(5)* %ptr, i64 %a) { define void @st_global_f32(float addrspace(1)* %ptr, float %a) { ; PTX32: st.global.f32 [%r{{[0-9]+}}], %f{{[0-9]+}} ; PTX32: ret -; PTX64: st.global.f32 [%rl{{[0-9]+}}], %f{{[0-9]+}} +; PTX64: st.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} ; PTX64: ret store float %a, float addrspace(1)* %ptr ret void @@ -132,7 +132,7 @@ define void @st_global_f32(float addrspace(1)* %ptr, float %a) { define void @st_shared_f32(float addrspace(3)* %ptr, float %a) { ; PTX32: st.shared.f32 [%r{{[0-9]+}}], %f{{[0-9]+}} ; PTX32: ret -; PTX64: st.shared.f32 [%rl{{[0-9]+}}], %f{{[0-9]+}} +; PTX64: st.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} ; PTX64: ret store float %a, float addrspace(3)* %ptr ret void @@ -141,7 +141,7 @@ define void @st_shared_f32(float addrspace(3)* %ptr, float %a) { define void @st_local_f32(float addrspace(5)* %ptr, float %a) { ; PTX32: st.local.f32 [%r{{[0-9]+}}], %f{{[0-9]+}} ; PTX32: ret -; PTX64: st.local.f32 [%rl{{[0-9]+}}], %f{{[0-9]+}} +; PTX64: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} ; PTX64: ret store float %a, float addrspace(5)* %ptr ret void @@ -150,27 +150,27 @@ define void @st_local_f32(float addrspace(5)* %ptr, float %a) { ;; f64 define void @st_global_f64(double addrspace(1)* %ptr, double %a) { -; PTX32: st.global.f64 [%r{{[0-9]+}}], %fl{{[0-9]+}} +; PTX32: st.global.f64 [%r{{[0-9]+}}], %fd{{[0-9]+}} ; PTX32: ret -; PTX64: 
st.global.f64 [%rl{{[0-9]+}}], %fl{{[0-9]+}} +; PTX64: st.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} ; PTX64: ret store double %a, double addrspace(1)* %ptr ret void } define void @st_shared_f64(double addrspace(3)* %ptr, double %a) { -; PTX32: st.shared.f64 [%r{{[0-9]+}}], %fl{{[0-9]+}} +; PTX32: st.shared.f64 [%r{{[0-9]+}}], %fd{{[0-9]+}} ; PTX32: ret -; PTX64: st.shared.f64 [%rl{{[0-9]+}}], %fl{{[0-9]+}} +; PTX64: st.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} ; PTX64: ret store double %a, double addrspace(3)* %ptr ret void } define void @st_local_f64(double addrspace(5)* %ptr, double %a) { -; PTX32: st.local.f64 [%r{{[0-9]+}}], %fl{{[0-9]+}} +; PTX32: st.local.f64 [%r{{[0-9]+}}], %fd{{[0-9]+}} ; PTX32: ret -; PTX64: st.local.f64 [%rl{{[0-9]+}}], %fl{{[0-9]+}} +; PTX64: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} ; PTX64: ret store double %a, double addrspace(5)* %ptr ret void diff --git a/test/CodeGen/NVPTX/st-generic.ll b/test/CodeGen/NVPTX/st-generic.ll index b9c616fbd19e..022f7ab214ca 100644 --- a/test/CodeGen/NVPTX/st-generic.ll +++ b/test/CodeGen/NVPTX/st-generic.ll @@ -7,7 +7,7 @@ define void @st_global_i8(i8 addrspace(0)* %ptr, i8 %a) { ; PTX32: st.u8 [%r{{[0-9]+}}], %rs{{[0-9]+}} ; PTX32: ret -; PTX64: st.u8 [%rl{{[0-9]+}}], %rs{{[0-9]+}} +; PTX64: st.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} ; PTX64: ret store i8 %a, i8 addrspace(0)* %ptr ret void @@ -18,7 +18,7 @@ define void @st_global_i8(i8 addrspace(0)* %ptr, i8 %a) { define void @st_global_i16(i16 addrspace(0)* %ptr, i16 %a) { ; PTX32: st.u16 [%r{{[0-9]+}}], %rs{{[0-9]+}} ; PTX32: ret -; PTX64: st.u16 [%rl{{[0-9]+}}], %rs{{[0-9]+}} +; PTX64: st.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} ; PTX64: ret store i16 %a, i16 addrspace(0)* %ptr ret void @@ -29,7 +29,7 @@ define void @st_global_i16(i16 addrspace(0)* %ptr, i16 %a) { define void @st_global_i32(i32 addrspace(0)* %ptr, i32 %a) { ; PTX32: st.u32 [%r{{[0-9]+}}], %r{{[0-9]+}} ; PTX32: ret -; PTX64: st.u32 [%rl{{[0-9]+}}], %r{{[0-9]+}} +; PTX64: st.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} ; PTX64: ret store i32 %a, i32 addrspace(0)* %ptr ret void @@ -38,9 +38,9 @@ define void @st_global_i32(i32 addrspace(0)* %ptr, i32 %a) { ;; i64 define void @st_global_i64(i64 addrspace(0)* %ptr, i64 %a) { -; PTX32: st.u64 [%r{{[0-9]+}}], %rl{{[0-9]+}} +; PTX32: st.u64 [%r{{[0-9]+}}], %rd{{[0-9]+}} ; PTX32: ret -; PTX64: st.u64 [%rl{{[0-9]+}}], %rl{{[0-9]+}} +; PTX64: st.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} ; PTX64: ret store i64 %a, i64 addrspace(0)* %ptr ret void @@ -51,7 +51,7 @@ define void @st_global_i64(i64 addrspace(0)* %ptr, i64 %a) { define void @st_global_f32(float addrspace(0)* %ptr, float %a) { ; PTX32: st.f32 [%r{{[0-9]+}}], %f{{[0-9]+}} ; PTX32: ret -; PTX64: st.f32 [%rl{{[0-9]+}}], %f{{[0-9]+}} +; PTX64: st.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} ; PTX64: ret store float %a, float addrspace(0)* %ptr ret void @@ -60,9 +60,9 @@ define void @st_global_f32(float addrspace(0)* %ptr, float %a) { ;; f64 define void @st_global_f64(double addrspace(0)* %ptr, double %a) { -; PTX32: st.f64 [%r{{[0-9]+}}], %fl{{[0-9]+}} +; PTX32: st.f64 [%r{{[0-9]+}}], %fd{{[0-9]+}} ; PTX32: ret -; PTX64: st.f64 [%rl{{[0-9]+}}], %fl{{[0-9]+}} +; PTX64: st.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} ; PTX64: ret store double %a, double addrspace(0)* %ptr ret void diff --git a/test/CodeGen/NVPTX/surf-read-cuda.ll b/test/CodeGen/NVPTX/surf-read-cuda.ll new file mode 100644 index 000000000000..10a1ecc4c473 --- /dev/null +++ b/test/CodeGen/NVPTX/surf-read-cuda.ll @@ -0,0 +1,53 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s 
--check-prefix=SM20 +; RUN: llc < %s -march=nvptx -mcpu=sm_30 | FileCheck %s --check-prefix=SM30 + +target triple = "nvptx-unknown-cuda" + +declare i32 @llvm.nvvm.suld.1d.i32.trap(i64, i32) +declare i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)*) + + +; SM20-LABEL: .entry foo +; SM30-LABEL: .entry foo +define void @foo(i64 %img, float* %red, i32 %idx) { +; SM20: ld.param.u64 %rd[[SURFREG:[0-9]+]], [foo_param_0]; +; SM20: suld.b.1d.b32.trap {%r[[RED:[0-9]+]]}, [%rd[[SURFREG]], {%r{{[0-9]+}}}] +; SM30: ld.param.u64 %rd[[SURFREG:[0-9]+]], [foo_param_0]; +; SM30: suld.b.1d.b32.trap {%r[[RED:[0-9]+]]}, [%rd[[SURFREG]], {%r{{[0-9]+}}}] + %val = tail call i32 @llvm.nvvm.suld.1d.i32.trap(i64 %img, i32 %idx) +; SM20: cvt.rn.f32.s32 %f[[REDF:[0-9]+]], %r[[RED]] +; SM30: cvt.rn.f32.s32 %f[[REDF:[0-9]+]], %r[[RED]] + %ret = sitofp i32 %val to float +; SM20: st.f32 [%r{{[0-9]+}}], %f[[REDF]] +; SM30: st.f32 [%r{{[0-9]+}}], %f[[REDF]] + store float %ret, float* %red + ret void +} + +@surf0 = internal addrspace(1) global i64 0, align 8 + +; SM20-LABEL: .entry bar +; SM30-LABEL: .entry bar +define void @bar(float* %red, i32 %idx) { +; SM30: mov.u64 %rd[[SURFHANDLE:[0-9]+]], surf0 + %surfHandle = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)* @surf0) +; SM20: suld.b.1d.b32.trap {%r[[RED:[0-9]+]]}, [surf0, {%r{{[0-9]+}}}] +; SM30: suld.b.1d.b32.trap {%r[[RED:[0-9]+]]}, [%rd[[SURFHANDLE]], {%r{{[0-9]+}}}] + %val = tail call i32 @llvm.nvvm.suld.1d.i32.trap(i64 %surfHandle, i32 %idx) +; SM20: cvt.rn.f32.s32 %f[[REDF:[0-9]+]], %r[[RED]] +; SM30: cvt.rn.f32.s32 %f[[REDF:[0-9]+]], %r[[RED]] + %ret = sitofp i32 %val to float +; SM20: st.f32 [%r{{[0-9]+}}], %f[[REDF]] +; SM30: st.f32 [%r{{[0-9]+}}], %f[[REDF]] + store float %ret, float* %red + ret void +} + + + + +!nvvm.annotations = !{!1, !2, !3} +!1 = metadata !{void (i64, float*, i32)* @foo, metadata !"kernel", i32 1} +!2 = metadata !{void (float*, i32)* @bar, metadata !"kernel", i32 1} +!3 = metadata !{i64 addrspace(1)* @surf0, metadata !"surface", i32 1} + diff --git a/test/CodeGen/NVPTX/surf-write-cuda.ll b/test/CodeGen/NVPTX/surf-write-cuda.ll new file mode 100644 index 000000000000..654c47f46957 --- /dev/null +++ b/test/CodeGen/NVPTX/surf-write-cuda.ll @@ -0,0 +1,42 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix=SM20 +; RUN: llc < %s -march=nvptx -mcpu=sm_30 | FileCheck %s --check-prefix=SM30 + +target triple = "nvptx-unknown-cuda" + +declare void @llvm.nvvm.sust.b.1d.i32.trap(i64, i32, i32) +declare i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)*) + + +; SM20-LABEL: .entry foo +; SM30-LABEL: .entry foo +define void @foo(i64 %img, i32 %val, i32 %idx) { +; SM20: ld.param.u64 %rd[[SURFREG:[0-9]+]], [foo_param_0]; +; SM20: sust.b.1d.b32.trap [%rd[[SURFREG]], {%r{{[0-9]+}}}], {%r{{[0-9]+}}} +; SM30: ld.param.u64 %rd[[SURFREG:[0-9]+]], [foo_param_0]; +; SM30: sust.b.1d.b32.trap [%rd[[SURFREG]], {%r{{[0-9]+}}}], {%r{{[0-9]+}}} + tail call void @llvm.nvvm.sust.b.1d.i32.trap(i64 %img, i32 %idx, i32 %val) + ret void +} + + +@surf0 = internal addrspace(1) global i64 0, align 8 + + + +; SM20-LABEL: .entry bar +; SM30-LABEL: .entry bar +define void @bar(i32 %val, i32 %idx) { +; SM30: mov.u64 %rd[[SURFHANDLE:[0-9]+]], surf0 + %surfHandle = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)* @surf0) +; SM20: sust.b.1d.b32.trap [surf0, {%r{{[0-9]+}}}], {%r{{[0-9]+}}} +; SM30: sust.b.1d.b32.trap [%rd[[SURFHANDLE]], {%r{{[0-9]+}}}], {%r{{[0-9]+}}} + tail call void
@llvm.nvvm.sust.b.1d.i32.trap(i64 %surfHandle, i32 %idx, i32 %val) + ret void +} + + +!nvvm.annotations = !{!1, !2, !3} +!1 = metadata !{void (i64, i32, i32)* @foo, metadata !"kernel", i32 1} +!2 = metadata !{void (i32, i32)* @bar, metadata !"kernel", i32 1} +!3 = metadata !{i64 addrspace(1)* @surf0, metadata !"surface", i32 1} + diff --git a/test/CodeGen/NVPTX/tex-read-cuda.ll b/test/CodeGen/NVPTX/tex-read-cuda.ll new file mode 100644 index 000000000000..ee0cefa919b1 --- /dev/null +++ b/test/CodeGen/NVPTX/tex-read-cuda.ll @@ -0,0 +1,46 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix=SM20 +; RUN: llc < %s -march=nvptx -mcpu=sm_30 | FileCheck %s --check-prefix=SM30 + + +target triple = "nvptx-unknown-cuda" + +declare { float, float, float, float } @llvm.nvvm.tex.unified.1d.v4f32.s32(i64, i32) +declare i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)*) + +; SM20-LABEL: .entry foo +; SM30-LABEL: .entry foo +define void @foo(i64 %img, float* %red, i32 %idx) { +; SM20: ld.param.u64 %rd[[TEXREG:[0-9]+]], [foo_param_0]; +; SM20: tex.1d.v4.f32.s32 {%f[[RED:[0-9]+]], %f[[GREEN:[0-9]+]], %f[[BLUE:[0-9]+]], %f[[ALPHA:[0-9]+]]}, [%rd[[TEXREG]], {%r{{[0-9]+}}}] +; SM30: ld.param.u64 %rd[[TEXREG:[0-9]+]], [foo_param_0]; +; SM30: tex.1d.v4.f32.s32 {%f[[RED:[0-9]+]], %f[[GREEN:[0-9]+]], %f[[BLUE:[0-9]+]], %f[[ALPHA:[0-9]+]]}, [%rd[[TEXREG]], {%r{{[0-9]+}}}] + %val = tail call { float, float, float, float } @llvm.nvvm.tex.unified.1d.v4f32.s32(i64 %img, i32 %idx) + %ret = extractvalue { float, float, float, float } %val, 0 +; SM20: st.f32 [%r{{[0-9]+}}], %f[[RED]] +; SM30: st.f32 [%r{{[0-9]+}}], %f[[RED]] + store float %ret, float* %red + ret void +} + + +@tex0 = internal addrspace(1) global i64 0, align 8 + +; SM20-LABEL: .entry bar +; SM30-LABEL: .entry bar +define void @bar(float* %red, i32 %idx) { +; SM30: mov.u64 %rd[[TEXHANDLE:[0-9]+]], tex0 + %texHandle = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)* @tex0) +; SM20: tex.1d.v4.f32.s32 {%f[[RED:[0-9]+]], %f[[GREEN:[0-9]+]], %f[[BLUE:[0-9]+]], %f[[ALPHA:[0-9]+]]}, [tex0, {%r{{[0-9]+}}}] +; SM30: tex.1d.v4.f32.s32 {%f[[RED:[0-9]+]], %f[[GREEN:[0-9]+]], %f[[BLUE:[0-9]+]], %f[[ALPHA:[0-9]+]]}, [%rd[[TEXHANDLE]], {%r{{[0-9]+}}}] + %val = tail call { float, float, float, float } @llvm.nvvm.tex.unified.1d.v4f32.s32(i64 %texHandle, i32 %idx) + %ret = extractvalue { float, float, float, float } %val, 0 +; SM20: st.f32 [%r{{[0-9]+}}], %f[[RED]] +; SM30: st.f32 [%r{{[0-9]+}}], %f[[RED]] + store float %ret, float* %red + ret void +} + +!nvvm.annotations = !{!1, !2, !3} +!1 = metadata !{void (i64, float*, i32)* @foo, metadata !"kernel", i32 1} +!2 = metadata !{void (float*, i32)* @bar, metadata !"kernel", i32 1} +!3 = metadata !{i64 addrspace(1)* @tex0, metadata !"texture", i32 1} diff --git a/test/CodeGen/NVPTX/tex-read.ll b/test/CodeGen/NVPTX/tex-read.ll index 291060b9848d..55e4bfc9e453 100644 --- a/test/CodeGen/NVPTX/tex-read.ll +++ b/test/CodeGen/NVPTX/tex-read.ll @@ -2,12 +2,12 @@ target triple = "nvptx-unknown-nvcl" -declare { float, float, float, float } @llvm.nvvm.tex.1d.v4f32.i32(i64, i64, i32) +declare { float, float, float, float } @llvm.nvvm.tex.1d.v4f32.s32(i64, i64, i32) ; CHECK: .entry foo define void @foo(i64 %img, i64 %sampler, float* %red, i32 %idx) { ; CHECK: tex.1d.v4.f32.s32 {%f[[RED:[0-9]+]], %f[[GREEN:[0-9]+]], %f[[BLUE:[0-9]+]], %f[[ALPHA:[0-9]+]]}, [foo_param_0, foo_param_1, {%r{{[0-9]+}}}] - %val = tail call { float, float, float, float } @llvm.nvvm.tex.1d.v4f32.i32(i64 
%img, i64 %sampler, i32 %idx) + %val = tail call { float, float, float, float } @llvm.nvvm.tex.1d.v4f32.s32(i64 %img, i64 %sampler, i32 %idx) %ret = extractvalue { float, float, float, float } %val, 0 ; CHECK: st.f32 [%r{{[0-9]+}}], %f[[RED]] store float %ret, float* %red diff --git a/test/CodeGen/NVPTX/texsurf-queries.ll b/test/CodeGen/NVPTX/texsurf-queries.ll new file mode 100644 index 000000000000..c7637ccff77a --- /dev/null +++ b/test/CodeGen/NVPTX/texsurf-queries.ll @@ -0,0 +1,103 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix=SM20 +; RUN: llc < %s -march=nvptx -mcpu=sm_30 | FileCheck %s --check-prefix=SM30 + +target triple = "nvptx-unknown-cuda" + +@tex0 = internal addrspace(1) global i64 0, align 8 +@surf0 = internal addrspace(1) global i64 0, align 8 + +declare i32 @llvm.nvvm.txq.width(i64) +declare i32 @llvm.nvvm.txq.height(i64) +declare i32 @llvm.nvvm.suq.width(i64) +declare i32 @llvm.nvvm.suq.height(i64) +declare i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)*) + + +; SM20-LABEL: @t0 +; SM30-LABEL: @t0 +define i32 @t0(i64 %texHandle) { +; SM20: txq.width.b32 +; SM30: txq.width.b32 + %width = tail call i32 @llvm.nvvm.txq.width(i64 %texHandle) + ret i32 %width +} + +; SM20-LABEL: @t1 +; SM30-LABEL: @t1 +define i32 @t1() { +; SM30: mov.u64 %rd[[HANDLE:[0-9]+]], tex0 + %texHandle = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)* @tex0) +; SM20: txq.width.b32 %r{{[0-9]+}}, [tex0] +; SM30: txq.width.b32 %r{{[0-9]+}}, [%rd[[HANDLE]]] + %width = tail call i32 @llvm.nvvm.txq.width(i64 %texHandle) + ret i32 %width +} + + +; SM20-LABEL: @t2 +; SM30-LABEL: @t2 +define i32 @t2(i64 %texHandle) { +; SM20: txq.height.b32 +; SM30: txq.height.b32 + %height = tail call i32 @llvm.nvvm.txq.height(i64 %texHandle) + ret i32 %height +} + +; SM20-LABEL: @t3 +; SM30-LABEL: @t3 +define i32 @t3() { +; SM30: mov.u64 %rd[[HANDLE:[0-9]+]], tex0 + %texHandle = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)* @tex0) +; SM20: txq.height.b32 %r{{[0-9]+}}, [tex0] +; SM30: txq.height.b32 %r{{[0-9]+}}, [%rd[[HANDLE]]] + %height = tail call i32 @llvm.nvvm.txq.height(i64 %texHandle) + ret i32 %height +} + + +; SM20-LABEL: @s0 +; SM30-LABEL: @s0 +define i32 @s0(i64 %surfHandle) { +; SM20: suq.width.b32 +; SM30: suq.width.b32 + %width = tail call i32 @llvm.nvvm.suq.width(i64 %surfHandle) + ret i32 %width +} + +; SM20-LABEL: @s1 +; SM30-LABEL: @s1 +define i32 @s1() { +; SM30: mov.u64 %rd[[HANDLE:[0-9]+]], surf0 + %surfHandle = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)* @surf0) +; SM20: suq.width.b32 %r{{[0-9]+}}, [surf0] +; SM30: suq.width.b32 %r{{[0-9]+}}, [%rd[[HANDLE]]] + %width = tail call i32 @llvm.nvvm.suq.width(i64 %surfHandle) + ret i32 %width +} + + +; SM20-LABEL: @s2 +; SM30-LABEL: @s2 +define i32 @s2(i64 %surfHandle) { +; SM20: suq.height.b32 +; SM30: suq.height.b32 + %height = tail call i32 @llvm.nvvm.suq.height(i64 %surfHandle) + ret i32 %height +} + +; SM20-LABEL: @s3 +; SM30-LABEL: @s3 +define i32 @s3() { +; SM30: mov.u64 %rd[[HANDLE:[0-9]+]], surf0 + %surfHandle = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)* @surf0) +; SM20: suq.height.b32 %r{{[0-9]+}}, [surf0] +; SM30: suq.height.b32 %r{{[0-9]+}}, [%rd[[HANDLE]]] + %height = tail call i32 @llvm.nvvm.suq.height(i64 %surfHandle) + ret i32 %height +} + + + +!nvvm.annotations = !{!1, !2} +!1 = metadata !{i64 addrspace(1)* @tex0, metadata !"texture", i32 1} +!2 = metadata !{i64
addrspace(1)* @surf0, metadata !"surface", i32 1} diff --git a/test/CodeGen/NVPTX/vector-call.ll b/test/CodeGen/NVPTX/vector-call.ll new file mode 100644 index 000000000000..a03d7fd41914 --- /dev/null +++ b/test/CodeGen/NVPTX/vector-call.ll @@ -0,0 +1,12 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s + +target triple = "nvptx-unknown-cuda" + +declare void @bar(<4 x i32>) + +; CHECK-LABEL: @foo +define void @foo(<4 x i32> %a) { +; CHECK: st.param.v4.b32 + tail call void @bar(<4 x i32> %a) + ret void +} diff --git a/test/CodeGen/NVPTX/weak-global.ll b/test/CodeGen/NVPTX/weak-global.ll new file mode 100644 index 000000000000..2bef4c5228a9 --- /dev/null +++ b/test/CodeGen/NVPTX/weak-global.ll @@ -0,0 +1,9 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s + +; CHECK: .weak .global .align 4 .u32 g +@g = common addrspace(1) global i32 zeroinitializer + +define i32 @func0() { + %val = load i32 addrspace(1)* @g + ret i32 %val +} diff --git a/test/CodeGen/NVPTX/weak-linkage.ll b/test/CodeGen/NVPTX/weak-linkage.ll new file mode 100644 index 000000000000..7a1335783642 --- /dev/null +++ b/test/CodeGen/NVPTX/weak-linkage.ll @@ -0,0 +1,12 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s + + +; CHECK: .weak .func foo +define weak void @foo() { + ret void +} + +; CHECK: .visible .func bar +define void @bar() { + ret void +} diff --git a/test/CodeGen/PowerPC/Frames-alloca.ll b/test/CodeGen/PowerPC/Frames-alloca.ll index 4588bc05352b..c701fef8e629 100644 --- a/test/CodeGen/PowerPC/Frames-alloca.ll +++ b/test/CodeGen/PowerPC/Frames-alloca.ll @@ -12,15 +12,15 @@ ; CHECK-PPC32-NOFP: stw r31, -4(r1) ; CHECK-PPC32-NOFP: lwz r1, 0(r1) ; CHECK-PPC32-NOFP: lwz r31, -4(r1) -; CHECK-PPC32-RS: stwu r1, -80(r1) -; CHECK-PPC32-RS-NOFP: stwu r1, -80(r1) +; CHECK-PPC32-RS: stwu r1, -48(r1) +; CHECK-PPC32-RS-NOFP: stwu r1, -48(r1) ; CHECK-PPC64: std r31, -8(r1) -; CHECK-PPC64: stdu r1, -128(r1) +; CHECK-PPC64: stdu r1, -64(r1) ; CHECK-PPC64: ld r1, 0(r1) ; CHECK-PPC64: ld r31, -8(r1) ; CHECK-PPC64-NOFP: std r31, -8(r1) -; CHECK-PPC64-NOFP: stdu r1, -128(r1) +; CHECK-PPC64-NOFP: stdu r1, -64(r1) ; CHECK-PPC64-NOFP: ld r1, 0(r1) ; CHECK-PPC64-NOFP: ld r31, -8(r1) diff --git a/test/CodeGen/PowerPC/Frames-large.ll b/test/CodeGen/PowerPC/Frames-large.ll index d07fea726770..0ccea42619af 100644 --- a/test/CodeGen/PowerPC/Frames-large.ll +++ b/test/CodeGen/PowerPC/Frames-large.ll @@ -15,9 +15,9 @@ define i32* @f1() nounwind { ; PPC32-NOFP: _f1: ; PPC32-NOFP: lis r0, -1 -; PPC32-NOFP: ori r0, r0, 32704 +; PPC32-NOFP: ori r0, r0, 32736 ; PPC32-NOFP: stwux r1, r1, r0 -; PPC32-NOFP: addi r3, r1, 68 +; PPC32-NOFP: addi r3, r1, 36 ; PPC32-NOFP: lwz r1, 0(r1) ; PPC32-NOFP: blr @@ -25,10 +25,10 @@ define i32* @f1() nounwind { ; PPC32-FP: _f1: ; PPC32-FP: lis r0, -1 ; PPC32-FP: stw r31, -4(r1) -; PPC32-FP: ori r0, r0, 32704 +; PPC32-FP: ori r0, r0, 32736 ; PPC32-FP: stwux r1, r1, r0 ; PPC32-FP: mr r31, r1 -; PPC32-FP: addi r3, r31, 64 +; PPC32-FP: addi r3, r31, 32 ; PPC32-FP: lwz r1, 0(r1) ; PPC32-FP: lwz r31, -4(r1) ; PPC32-FP: blr @@ -36,9 +36,9 @@ define i32* @f1() nounwind { ; PPC64-NOFP: _f1: ; PPC64-NOFP: lis r0, -1 -; PPC64-NOFP: ori r0, r0, 32656 +; PPC64-NOFP: ori r0, r0, 32720 ; PPC64-NOFP: stdux r1, r1, r0 -; PPC64-NOFP: addi r3, r1, 116 +; PPC64-NOFP: addi r3, r1, 52 ; PPC64-NOFP: ld r1, 0(r1) ; PPC64-NOFP: blr @@ -46,10 +46,10 @@ define i32* @f1() nounwind { ; PPC64-FP: _f1: ; PPC64-FP: lis r0, -1 ; PPC64-FP: std r31, -8(r1) -; PPC64-FP: ori r0, r0, 32640 +; PPC64-FP: ori r0, r0, 32704 ;
PPC64-FP: stdux r1, r1, r0 ; PPC64-FP: mr r31, r1 -; PPC64-FP: addi r3, r31, 124 +; PPC64-FP: addi r3, r31, 60 ; PPC64-FP: ld r1, 0(r1) ; PPC64-FP: ld r31, -8(r1) ; PPC64-FP: blr diff --git a/test/CodeGen/PowerPC/Frames-small.ll b/test/CodeGen/PowerPC/Frames-small.ll index 0f6bd1021f80..28c1a5b54dd2 100644 --- a/test/CodeGen/PowerPC/Frames-small.ll +++ b/test/CodeGen/PowerPC/Frames-small.ll @@ -1,25 +1,25 @@ ; RUN: llc < %s -march=ppc32 -mtriple=powerpc-apple-darwin8 -o %t1 ; RUN: not grep "stw r31, -4(r1)" %t1 -; RUN: grep "stwu r1, -16448(r1)" %t1 -; RUN: grep "addi r1, r1, 16448" %t1 +; RUN: grep "stwu r1, -16416(r1)" %t1 +; RUN: grep "addi r1, r1, 16416" %t1 ; RUN: llc < %s -march=ppc32 | \ ; RUN: not grep "lwz r31, -4(r1)" ; RUN: llc < %s -march=ppc32 -mtriple=powerpc-apple-darwin8 -disable-fp-elim \ ; RUN: -o %t2 ; RUN: grep "stw r31, -4(r1)" %t2 -; RUN: grep "stwu r1, -16448(r1)" %t2 -; RUN: grep "addi r1, r1, 16448" %t2 +; RUN: grep "stwu r1, -16416(r1)" %t2 +; RUN: grep "addi r1, r1, 16416" %t2 ; RUN: grep "lwz r31, -4(r1)" %t2 ; RUN: llc < %s -march=ppc64 -mtriple=powerpc-apple-darwin8 -o %t3 ; RUN: not grep "std r31, -8(r1)" %t3 -; RUN: grep "stdu r1, -16496(r1)" %t3 -; RUN: grep "addi r1, r1, 16496" %t3 +; RUN: grep "stdu r1, -16432(r1)" %t3 +; RUN: grep "addi r1, r1, 16432" %t3 ; RUN: not grep "ld r31, -8(r1)" %t3 ; RUN: llc < %s -march=ppc64 -mtriple=powerpc-apple-darwin8 -disable-fp-elim \ ; RUN: -o %t4 ; RUN: grep "std r31, -8(r1)" %t4 -; RUN: grep "stdu r1, -16512(r1)" %t4 -; RUN: grep "addi r1, r1, 16512" %t4 +; RUN: grep "stdu r1, -16448(r1)" %t4 +; RUN: grep "addi r1, r1, 16448" %t4 ; RUN: grep "ld r31, -8(r1)" %t4 define i32* @f1() { diff --git a/test/CodeGen/PowerPC/anon_aggr.ll b/test/CodeGen/PowerPC/anon_aggr.ll index 3bae5c6516c9..6c4f140de127 100644 --- a/test/CodeGen/PowerPC/anon_aggr.ll +++ b/test/CodeGen/PowerPC/anon_aggr.ll @@ -62,8 +62,7 @@ unequal: } ; CHECK-LABEL: func2: -; CHECK: addi [[REG1:[0-9]+]], 1, 64 -; CHECK: ld [[REG2:[0-9]+]], 8([[REG1]]) +; CHECK: ld [[REG2:[0-9]+]], 72(1) ; CHECK: cmpld {{[0-9]+}}, 4, [[REG2]] ; CHECK-DAG: std [[REG2]], -[[OFFSET1:[0-9]+]] ; CHECK-DAG: std 4, -[[OFFSET2:[0-9]+]] @@ -82,8 +81,7 @@ unequal: ; DARWIN32: lwz r3, -[[OFFSET2]] ; DARWIN64: _func2: -; DARWIN64: addi r[[REG1:[0-9]+]], r1, 64 -; DARWIN64: ld r[[REG2:[0-9]+]], 8(r[[REG1]]) +; DARWIN64: ld r[[REG2:[0-9]+]], 72(r1) ; DARWIN64: mr ; DARWIN64: mr r[[REG3:[0-9]+]], r[[REGA:[0-9]+]] ; DARWIN64: cmpld cr{{[0-9]+}}, r[[REGA]], r[[REG2]] @@ -108,10 +106,8 @@ unequal: } ; CHECK-LABEL: func3: -; CHECK: addi [[REG1:[0-9]+]], 1, 64 -; CHECK: addi [[REG2:[0-9]+]], 1, 48 -; CHECK: ld [[REG3:[0-9]+]], 8([[REG1]]) -; CHECK: ld [[REG4:[0-9]+]], 8([[REG2]]) +; CHECK: ld [[REG3:[0-9]+]], 72(1) +; CHECK: ld [[REG4:[0-9]+]], 56(1) ; CHECK: cmpld {{[0-9]+}}, [[REG4]], [[REG3]] ; CHECK: std [[REG3]], -[[OFFSET1:[0-9]+]](1) ; CHECK: std [[REG4]], -[[OFFSET2:[0-9]+]](1) @@ -130,10 +126,8 @@ unequal: ; DARWIN32: lwz r3, -[[OFFSET1]] ; DARWIN64: _func3: -; DARWIN64: addi r[[REG1:[0-9]+]], r1, 64 -; DARWIN64: addi r[[REG2:[0-9]+]], r1, 48 -; DARWIN64: ld r[[REG3:[0-9]+]], 8(r[[REG1]]) -; DARWIN64: ld r[[REG4:[0-9]+]], 8(r[[REG2]]) +; DARWIN64: ld r[[REG3:[0-9]+]], 72(r1) +; DARWIN64: ld r[[REG4:[0-9]+]], 56(r1) ; DARWIN64: cmpld cr{{[0-9]+}}, r[[REG4]], r[[REG3]] ; DARWIN64: std r[[REG3]], -[[OFFSET1:[0-9]+]] ; DARWIN64: std r[[REG4]], -[[OFFSET2:[0-9]+]] @@ -157,12 +151,11 @@ unequal: } ; CHECK-LABEL: func4: -; CHECK: addi [[REG1:[0-9]+]], 1, 128 +; CHECK: ld [[REG3:[0-9]+]], 136(1) 
; CHECK: ld [[REG2:[0-9]+]], 120(1) -; CHECK: ld [[REG3:[0-9]+]], 8([[REG1]]) ; CHECK: cmpld {{[0-9]+}}, [[REG2]], [[REG3]] -; CHECK: std [[REG2]], -[[OFFSET1:[0-9]+]](1) ; CHECK: std [[REG3]], -[[OFFSET2:[0-9]+]](1) +; CHECK: std [[REG2]], -[[OFFSET1:[0-9]+]](1) ; CHECK: ld 3, -[[OFFSET1]](1) ; CHECK: ld 3, -[[OFFSET2]](1) @@ -178,9 +171,8 @@ unequal: ; DARWIN32: lwz r[[REG1]], -[[OFFSET2]] ; DARWIN64: _func4: -; DARWIN64: addi r[[REG1:[0-9]+]], r1, 128 ; DARWIN64: ld r[[REG2:[0-9]+]], 120(r1) -; DARWIN64: ld r[[REG3:[0-9]+]], 8(r[[REG1]]) +; DARWIN64: ld r[[REG3:[0-9]+]], 136(r1) ; DARWIN64: mr r[[REG4:[0-9]+]], r[[REG2]] ; DARWIN64: cmpld cr{{[0-9]+}}, r[[REG2]], r[[REG3]] ; DARWIN64: std r[[REG4]], -[[OFFSET1:[0-9]+]] diff --git a/test/CodeGen/PowerPC/available-externally.ll b/test/CodeGen/PowerPC/available-externally.ll index abed0de80b88..53c435995485 100644 --- a/test/CodeGen/PowerPC/available-externally.ll +++ b/test/CodeGen/PowerPC/available-externally.ll @@ -1,7 +1,8 @@ ; RUN: llc < %s -relocation-model=static | FileCheck %s -check-prefix=STATIC -; RUN: llc < %s -relocation-model=pic | FileCheck %s -check-prefix=PIC +; RUN: llc < %s -relocation-model=pic -mtriple=powerpc-apple-darwin8 | FileCheck %s -check-prefix=PIC +; RUN: llc < %s -relocation-model=pic -mtriple=powerpc-unknown-linux | FileCheck %s -check-prefix=PICELF ; RUN: llc < %s -relocation-model=pic -mtriple=powerpc64-apple-darwin8 | FileCheck %s -check-prefix=PIC64 -; RUN: llc < %s -relocation-model=dynamic-no-pic | FileCheck %s -check-prefix=DYNAMIC +; RUN: llc < %s -relocation-model=dynamic-no-pic -mtriple=powerpc-apple-darwin8 | FileCheck %s -check-prefix=DYNAMIC ; RUN: llc < %s -relocation-model=dynamic-no-pic -mtriple=powerpc64-apple-darwin8 | FileCheck %s -check-prefix=DYNAMIC64 ; PR4482 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" @@ -18,6 +19,10 @@ entry: ; PIC: bl L_exact_log2$stub ; PIC: blr +; PICELF: foo: +; PICELF: bl exact_log2@PLT +; PICELF: blr + ; PIC64: _foo: ; PIC64: bl L_exact_log2$stub ; PIC64: blr diff --git a/test/CodeGen/PowerPC/complex-return.ll b/test/CodeGen/PowerPC/complex-return.ll index 5ac7524ff5c7..8a6adaee5556 100644 --- a/test/CodeGen/PowerPC/complex-return.ll +++ b/test/CodeGen/PowerPC/complex-return.ll @@ -26,8 +26,8 @@ entry: ; CHECK-LABEL: foo: ; CHECK: lfd 3 ; CHECK: lfd 4 -; CHECK: lfd 2 ; CHECK: lfd 1 +; CHECK: lfd 2 define { float, float } @oof() nounwind { entry: diff --git a/test/CodeGen/PowerPC/early-ret2.ll b/test/CodeGen/PowerPC/early-ret2.ll index a8e456fea622..17847770a831 100644 --- a/test/CodeGen/PowerPC/early-ret2.ll +++ b/test/CodeGen/PowerPC/early-ret2.ll @@ -11,7 +11,7 @@ while.body.lr.ph: ; preds = %entry br i1 undef, label %while.end, label %while.body while.body: ; preds = %while.body, %while.body.lr.ph - br i1 false, label %while.end, label %while.body, !llvm.vectorizer.already_vectorized !0 + br i1 false, label %while.end, label %while.body, !llvm.loop.vectorize.already_vectorized !0 while.end: ; preds = %while.body, %while.body.lr.ph, %entry ret void diff --git a/test/CodeGen/PowerPC/fast-isel-conversion-p5.ll b/test/CodeGen/PowerPC/fast-isel-conversion-p5.ll index db0d8ed0ffa4..ac41e8c27700 100644 --- a/test/CodeGen/PowerPC/fast-isel-conversion-p5.ll +++ b/test/CodeGen/PowerPC/fast-isel-conversion-p5.ll @@ -116,18 +116,6 @@ entry: ret void } -define void @fptoui_float_i64(float %a) nounwind ssp { -entry: -; ELF64: fptoui_float_i64 - %b.addr = alloca i64, align 4 - 
%conv = fptoui float %a to i64 -; ELF64: fctiduz -; ELF64: stfd -; ELF64: ld - store i64 %conv, i64* %b.addr, align 4 - ret void -} - define void @fptoui_double_i32(double %a) nounwind ssp { entry: ; ELF64: fptoui_double_i32 @@ -140,14 +128,3 @@ entry: ret void } -define void @fptoui_double_i64(double %a) nounwind ssp { -entry: -; ELF64: fptoui_double_i64 - %b.addr = alloca i64, align 8 - %conv = fptoui double %a to i64 -; ELF64: fctiduz -; ELF64: stfd -; ELF64: ld - store i64 %conv, i64* %b.addr, align 8 - ret void -} diff --git a/test/CodeGen/PowerPC/fast-isel-conversion.ll b/test/CodeGen/PowerPC/fast-isel-conversion.ll index a31c31210c39..5e00675c0398 100644 --- a/test/CodeGen/PowerPC/fast-isel-conversion.ll +++ b/test/CodeGen/PowerPC/fast-isel-conversion.ll @@ -1,15 +1,24 @@ ; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck %s --check-prefix=ELF64 +; RUN: llc < %s -O0 -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -mcpu=970 | FileCheck %s --check-prefix=PPC970 + +;; Tests for 970 don't use -fast-isel-abort because we intentionally punt +;; to SelectionDAG in some cases. ; Test sitofp define void @sitofp_single_i64(i64 %a, float %b) nounwind ssp { entry: ; ELF64: sitofp_single_i64 +; PPC970: sitofp_single_i64 %b.addr = alloca float, align 4 %conv = sitofp i64 %a to float ; ELF64: std ; ELF64: lfd ; ELF64: fcfids +; PPC970: std +; PPC970: lfd +; PPC970: fcfid +; PPC970: frsp store float %conv, float* %b.addr, align 4 ret void } @@ -17,11 +26,16 @@ entry: define void @sitofp_single_i32(i32 %a, float %b) nounwind ssp { entry: ; ELF64: sitofp_single_i32 +; PPC970: sitofp_single_i32 %b.addr = alloca float, align 4 %conv = sitofp i32 %a to float ; ELF64: std ; ELF64: lfiwax ; ELF64: fcfids +; PPC970: std +; PPC970: lfd +; PPC970: fcfid +; PPC970: frsp store float %conv, float* %b.addr, align 4 ret void } @@ -29,12 +43,18 @@ entry: define void @sitofp_single_i16(i16 %a, float %b) nounwind ssp { entry: ; ELF64: sitofp_single_i16 +; PPC970: sitofp_single_i16 %b.addr = alloca float, align 4 %conv = sitofp i16 %a to float ; ELF64: extsh ; ELF64: std ; ELF64: lfd ; ELF64: fcfids +; PPC970: extsh +; PPC970: std +; PPC970: lfd +; PPC970: fcfid +; PPC970: frsp store float %conv, float* %b.addr, align 4 ret void } @@ -42,12 +62,18 @@ entry: define void @sitofp_single_i8(i8 %a) nounwind ssp { entry: ; ELF64: sitofp_single_i8 +; PPC970: sitofp_single_i8 %b.addr = alloca float, align 4 %conv = sitofp i8 %a to float ; ELF64: extsb ; ELF64: std ; ELF64: lfd ; ELF64: fcfids +; PPC970: extsb +; PPC970: std +; PPC970: lfd +; PPC970: fcfid +; PPC970: frsp store float %conv, float* %b.addr, align 4 ret void } @@ -55,11 +81,15 @@ entry: define void @sitofp_double_i32(i32 %a, double %b) nounwind ssp { entry: ; ELF64: sitofp_double_i32 +; PPC970: sitofp_double_i32 %b.addr = alloca double, align 8 %conv = sitofp i32 %a to double ; ELF64: std ; ELF64: lfiwax ; ELF64: fcfid +; PPC970: std +; PPC970: lfd +; PPC970: fcfid store double %conv, double* %b.addr, align 8 ret void } @@ -67,11 +97,15 @@ entry: define void @sitofp_double_i64(i64 %a, double %b) nounwind ssp { entry: ; ELF64: sitofp_double_i64 +; PPC970: sitofp_double_i64 %b.addr = alloca double, align 8 %conv = sitofp i64 %a to double ; ELF64: std ; ELF64: lfd ; ELF64: fcfid +; PPC970: std +; PPC970: lfd +; PPC970: fcfid store double %conv, double* %b.addr, align 8 ret void } @@ -79,12 +113,17 @@ entry: define void @sitofp_double_i16(i16 %a, double %b) nounwind ssp { entry: ; ELF64: 
sitofp_double_i16 +; PPC970: sitofp_double_i16 %b.addr = alloca double, align 8 %conv = sitofp i16 %a to double ; ELF64: extsh ; ELF64: std ; ELF64: lfd ; ELF64: fcfid +; PPC970: extsh +; PPC970: std +; PPC970: lfd +; PPC970: fcfid store double %conv, double* %b.addr, align 8 ret void } @@ -92,12 +131,17 @@ entry: define void @sitofp_double_i8(i8 %a, double %b) nounwind ssp { entry: ; ELF64: sitofp_double_i8 +; PPC970: sitofp_double_i8 %b.addr = alloca double, align 8 %conv = sitofp i8 %a to double ; ELF64: extsb ; ELF64: std ; ELF64: lfd ; ELF64: fcfid +; PPC970: extsb +; PPC970: std +; PPC970: lfd +; PPC970: fcfid store double %conv, double* %b.addr, align 8 ret void } @@ -107,11 +151,13 @@ entry: define void @uitofp_single_i64(i64 %a, float %b) nounwind ssp { entry: ; ELF64: uitofp_single_i64 +; PPC970: uitofp_single_i64 %b.addr = alloca float, align 4 %conv = uitofp i64 %a to float ; ELF64: std ; ELF64: lfd ; ELF64: fcfidus +; PPC970-NOT: fcfidus store float %conv, float* %b.addr, align 4 ret void } @@ -119,11 +165,14 @@ entry: define void @uitofp_single_i32(i32 %a, float %b) nounwind ssp { entry: ; ELF64: uitofp_single_i32 +; PPC970: uitofp_single_i32 %b.addr = alloca float, align 4 %conv = uitofp i32 %a to float ; ELF64: std ; ELF64: lfiwzx ; ELF64: fcfidus +; PPC970-NOT: lfiwzx +; PPC970-NOT: fcfidus store float %conv, float* %b.addr, align 4 ret void } @@ -131,12 +180,18 @@ entry: define void @uitofp_single_i16(i16 %a, float %b) nounwind ssp { entry: ; ELF64: uitofp_single_i16 +; PPC970: uitofp_single_i16 %b.addr = alloca float, align 4 %conv = uitofp i16 %a to float ; ELF64: rldicl {{[0-9]+}}, {{[0-9]+}}, 0, 48 ; ELF64: std ; ELF64: lfd ; ELF64: fcfidus +; PPC970: rlwinm {{[0-9]+}}, {{[0-9]+}}, 0, 16, 31 +; PPC970: std +; PPC970: lfd +; PPC970: fcfid +; PPC970: frsp store float %conv, float* %b.addr, align 4 ret void } @@ -144,12 +199,18 @@ entry: define void @uitofp_single_i8(i8 %a) nounwind ssp { entry: ; ELF64: uitofp_single_i8 +; PPC970: uitofp_single_i8 %b.addr = alloca float, align 4 %conv = uitofp i8 %a to float ; ELF64: rldicl {{[0-9]+}}, {{[0-9]+}}, 0, 56 ; ELF64: std ; ELF64: lfd ; ELF64: fcfidus +; PPC970: rlwinm {{[0-9]+}}, {{[0-9]+}}, 0, 24, 31 +; PPC970: std +; PPC970: lfd +; PPC970: fcfid +; PPC970: frsp store float %conv, float* %b.addr, align 4 ret void } @@ -157,11 +218,13 @@ entry: define void @uitofp_double_i64(i64 %a, double %b) nounwind ssp { entry: ; ELF64: uitofp_double_i64 +; PPC970: uitofp_double_i64 %b.addr = alloca double, align 8 %conv = uitofp i64 %a to double ; ELF64: std ; ELF64: lfd ; ELF64: fcfidu +; PPC970-NOT: fcfidu store double %conv, double* %b.addr, align 8 ret void } @@ -169,11 +232,14 @@ entry: define void @uitofp_double_i32(i32 %a, double %b) nounwind ssp { entry: ; ELF64: uitofp_double_i32 +; PPC970: uitofp_double_i32 %b.addr = alloca double, align 8 %conv = uitofp i32 %a to double ; ELF64: std ; ELF64: lfiwzx ; ELF64: fcfidu +; PPC970-NOT: lfiwzx +; PPC970-NOT: fcfidu store double %conv, double* %b.addr, align 8 ret void } @@ -181,12 +247,17 @@ entry: define void @uitofp_double_i16(i16 %a, double %b) nounwind ssp { entry: ; ELF64: uitofp_double_i16 +; PPC970: uitofp_double_i16 %b.addr = alloca double, align 8 %conv = uitofp i16 %a to double ; ELF64: rldicl {{[0-9]+}}, {{[0-9]+}}, 0, 48 ; ELF64: std ; ELF64: lfd ; ELF64: fcfidu +; PPC970: rlwinm {{[0-9]+}}, {{[0-9]+}}, 0, 16, 31 +; PPC970: std +; PPC970: lfd +; PPC970: fcfid store double %conv, double* %b.addr, align 8 ret void } @@ -194,12 +265,17 @@ entry: define void 
@uitofp_double_i8(i8 %a, double %b) nounwind ssp { entry: ; ELF64: uitofp_double_i8 +; PPC970: uitofp_double_i8 %b.addr = alloca double, align 8 %conv = uitofp i8 %a to double ; ELF64: rldicl {{[0-9]+}}, {{[0-9]+}}, 0, 56 ; ELF64: std ; ELF64: lfd ; ELF64: fcfidu +; PPC970: rlwinm {{[0-9]+}}, {{[0-9]+}}, 0, 24, 31 +; PPC970: std +; PPC970: lfd +; PPC970: fcfid store double %conv, double* %b.addr, align 8 ret void } @@ -209,11 +285,15 @@ entry: define void @fptosi_float_i32(float %a) nounwind ssp { entry: ; ELF64: fptosi_float_i32 +; PPC970: fptosi_float_i32 %b.addr = alloca i32, align 4 %conv = fptosi float %a to i32 ; ELF64: fctiwz ; ELF64: stfd ; ELF64: lwa +; PPC970: fctiwz +; PPC970: stfd +; PPC970: lwa store i32 %conv, i32* %b.addr, align 4 ret void } @@ -221,11 +301,15 @@ entry: define void @fptosi_float_i64(float %a) nounwind ssp { entry: ; ELF64: fptosi_float_i64 +; PPC970: fptosi_float_i64 %b.addr = alloca i64, align 4 %conv = fptosi float %a to i64 ; ELF64: fctidz ; ELF64: stfd ; ELF64: ld +; PPC970: fctidz +; PPC970: stfd +; PPC970: ld store i64 %conv, i64* %b.addr, align 4 ret void } @@ -233,11 +317,15 @@ entry: define void @fptosi_double_i32(double %a) nounwind ssp { entry: ; ELF64: fptosi_double_i32 +; PPC970: fptosi_double_i32 %b.addr = alloca i32, align 8 %conv = fptosi double %a to i32 ; ELF64: fctiwz ; ELF64: stfd ; ELF64: lwa +; PPC970: fctiwz +; PPC970: stfd +; PPC970: lwa store i32 %conv, i32* %b.addr, align 8 ret void } @@ -245,11 +333,15 @@ entry: define void @fptosi_double_i64(double %a) nounwind ssp { entry: ; ELF64: fptosi_double_i64 +; PPC970: fptosi_double_i64 %b.addr = alloca i64, align 8 %conv = fptosi double %a to i64 ; ELF64: fctidz ; ELF64: stfd ; ELF64: ld +; PPC970: fctidz +; PPC970: stfd +; PPC970: ld store i64 %conv, i64* %b.addr, align 8 ret void } @@ -259,11 +351,15 @@ entry: define void @fptoui_float_i32(float %a) nounwind ssp { entry: ; ELF64: fptoui_float_i32 +; PPC970: fptoui_float_i32 %b.addr = alloca i32, align 4 %conv = fptoui float %a to i32 ; ELF64: fctiwuz ; ELF64: stfd ; ELF64: lwz +; PPC970: fctidz +; PPC970: stfd +; PPC970: lwz store i32 %conv, i32* %b.addr, align 4 ret void } @@ -271,11 +367,13 @@ entry: define void @fptoui_float_i64(float %a) nounwind ssp { entry: ; ELF64: fptoui_float_i64 +; PPC970: fptoui_float_i64 %b.addr = alloca i64, align 4 %conv = fptoui float %a to i64 ; ELF64: fctiduz ; ELF64: stfd ; ELF64: ld +; PPC970-NOT: fctiduz store i64 %conv, i64* %b.addr, align 4 ret void } @@ -283,11 +381,15 @@ entry: define void @fptoui_double_i32(double %a) nounwind ssp { entry: ; ELF64: fptoui_double_i32 +; PPC970: fptoui_double_i32 %b.addr = alloca i32, align 8 %conv = fptoui double %a to i32 ; ELF64: fctiwuz ; ELF64: stfd ; ELF64: lwz +; PPC970: fctidz +; PPC970: stfd +; PPC970: lwz store i32 %conv, i32* %b.addr, align 8 ret void } @@ -295,11 +397,13 @@ entry: define void @fptoui_double_i64(double %a) nounwind ssp { entry: ; ELF64: fptoui_double_i64 +; PPC970: fptoui_double_i64 %b.addr = alloca i64, align 8 %conv = fptoui double %a to i64 ; ELF64: fctiduz ; ELF64: stfd ; ELF64: ld +; PPC970-NOT: fctiduz store i64 %conv, i64* %b.addr, align 8 ret void } diff --git a/test/CodeGen/PowerPC/hello-reloc.s b/test/CodeGen/PowerPC/hello-reloc.s index 1e3fb8fb0e71..97dfbb5362fa 100644 --- a/test/CodeGen/PowerPC/hello-reloc.s +++ b/test/CodeGen/PowerPC/hello-reloc.s @@ -62,17 +62,17 @@ L_.str: ; @.str ; DARWIN-G4-DUMP:AddressSize: 32bit ; DARWIN-G4-DUMP:Relocations [ ; DARWIN-G4-DUMP: Section __text { -; DARWIN-G4-DUMP: 0x34 1 2 0 
PPC_RELOC_BR24 0 - -; DARWIN-G4-DUMP: 0x30 0 2 n/a PPC_RELOC_LO16_SECTDIFF 1 _main -; DARWIN-G4-DUMP: 0x0 0 2 n/a PPC_RELOC_PAIR 1 _main -; DARWIN-G4-DUMP: 0x2C 0 2 n/a PPC_RELOC_HA16_SECTDIFF 1 _main -; DARWIN-G4-DUMP: 0x60 0 2 n/a PPC_RELOC_PAIR 1 _main +; DARWIN-G4-DUMP: 0x34 1 2 0 PPC_RELOC_BR24 0 0x3 +; DARWIN-G4-DUMP: 0x30 0 2 n/a PPC_RELOC_LO16_SECTDIFF 1 0x74 +; DARWIN-G4-DUMP: 0x0 0 2 n/a PPC_RELOC_PAIR 1 0x14 +; DARWIN-G4-DUMP: 0x2C 0 2 n/a PPC_RELOC_HA16_SECTDIFF 1 0x74 +; DARWIN-G4-DUMP: 0x60 0 2 n/a PPC_RELOC_PAIR 1 0x14 ; DARWIN-G4-DUMP: } ; DARWIN-G4-DUMP: Section __picsymbolstub1 { -; DARWIN-G4-DUMP: 0x14 0 2 n/a PPC_RELOC_LO16_SECTDIFF 1 _main -; DARWIN-G4-DUMP: 0x0 0 2 n/a PPC_RELOC_PAIR 1 _main -; DARWIN-G4-DUMP: 0xC 0 2 n/a PPC_RELOC_HA16_SECTDIFF 1 _main -; DARWIN-G4-DUMP: 0x18 0 2 n/a PPC_RELOC_PAIR 1 _main +; DARWIN-G4-DUMP: 0x14 0 2 n/a PPC_RELOC_LO16_SECTDIFF 1 0x70 +; DARWIN-G4-DUMP: 0x0 0 2 n/a PPC_RELOC_PAIR 1 0x58 +; DARWIN-G4-DUMP: 0xC 0 2 n/a PPC_RELOC_HA16_SECTDIFF 1 0x70 +; DARWIN-G4-DUMP: 0x18 0 2 n/a PPC_RELOC_PAIR 1 0x58 ; DARWIN-G4-DUMP: } ; DARWIN-G4-DUMP: Section __la_symbol_ptr { ; DARWIN-G4-DUMP: 0x0 0 2 1 PPC_RELOC_VANILLA 0 dyld_stub_binding_helper diff --git a/test/CodeGen/PowerPC/ppc32-pic.ll b/test/CodeGen/PowerPC/ppc32-pic.ll new file mode 100644 index 000000000000..5bb78a4655ae --- /dev/null +++ b/test/CodeGen/PowerPC/ppc32-pic.ll @@ -0,0 +1,21 @@ +; RUN: llc < %s -mtriple=powerpc-unknown-linux-gnu -relocation-model=pic | FileCheck %s +@foobar = common global i32 0, align 4 + +define i32 @foo() { +entry: + %0 = load i32* @foobar, align 4 + ret i32 %0 +} + +; CHECK: [[POFF:\.L[0-9]+\$poff]]: +; CHECK-NEXT: .long .L.TOC.-[[PB:\.L[0-9]+\$pb]] +; CHECK-NEXT: foo: +; CHECK: bl [[PB]] +; CHECK-NEXT: [[PB]]: +; CHECK: mflr 30 +; CHECK: lwz [[REG:[0-9]+]], [[POFF]]-[[PB]](30) +; CHECK-NEXT: add 30, [[REG]], 30 +; CHECK: lwz [[VREG:[0-9]+]], [[VREF:\.LC[0-9]+]]-.L.TOC.(30) +; CHECK: lwz {{[0-9]+}}, 0([[VREG]]) +; CHECK: [[VREF]]: +; CHECK-NEXT: .long foobar diff --git a/test/CodeGen/PowerPC/ppc64-altivec-abi.ll b/test/CodeGen/PowerPC/ppc64-altivec-abi.ll new file mode 100644 index 000000000000..0bed329f0e54 --- /dev/null +++ b/test/CodeGen/PowerPC/ppc64-altivec-abi.ll @@ -0,0 +1,25 @@ +; RUN: llc < %s -march=ppc64 -mattr=+altivec | FileCheck %s + +target datalayout = "E-m:e-i64:64-n32:64" +target triple = "powerpc64-unknown-linux-gnu" + +; Verify that in the 64-bit Linux ABI, vector arguments take up space +; in the parameter save area. 
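+; As a worked sketch (assuming the ELFv1 parameter save area begins at 48(r1) +; and vector arguments occupy 16-byte-aligned slots): %a sits at 48, %b at 64, +; %c at 80, and %d at 96, so %e lands at 112, which is what the "ld 3, 112(1)" +; check below verifies.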
+ +define i64 @callee(i64 %a, <4 x i32> %b, i64 %c, <4 x i32> %d, i64 %e) { +entry: + ret i64 %e +} +; CHECK-LABEL: callee: +; CHECK: ld 3, 112(1) + +define void @caller(i64 %x, <4 x i32> %y) { +entry: + tail call void @test(i64 %x, <4 x i32> %y, i64 %x, <4 x i32> %y, i64 %x) + ret void +} +; CHECK-LABEL: caller: +; CHECK: std 3, 112(1) + +declare void @test(i64, <4 x i32>, i64, <4 x i32>, i64) + diff --git a/test/CodeGen/PowerPC/ppc64-byval-align.ll b/test/CodeGen/PowerPC/ppc64-byval-align.ll new file mode 100644 index 000000000000..0e73cf2b0e05 --- /dev/null +++ b/test/CodeGen/PowerPC/ppc64-byval-align.ll @@ -0,0 +1,56 @@ +; RUN: llc -O1 < %s -march=ppc64 -mcpu=pwr7 | FileCheck %s + +target datalayout = "E-m:e-i64:64-n32:64" +target triple = "powerpc64-unknown-linux-gnu" + +%struct.test = type { i64, [8 x i8] } +%struct.pad = type { [8 x i64] } + +@gt = common global %struct.test zeroinitializer, align 16 +@gp = common global %struct.pad zeroinitializer, align 8 + +define signext i32 @callee1(i32 signext %x, %struct.test* byval align 16 nocapture readnone %y, i32 signext %z) { +entry: + ret i32 %z +} +; CHECK-LABEL: @callee1 +; CHECK: mr 3, 7 +; CHECK: blr + +declare signext i32 @test1(i32 signext, %struct.test* byval align 16, i32 signext) +define void @caller1(i32 signext %z) { +entry: + %call = tail call signext i32 @test1(i32 signext 0, %struct.test* byval align 16 @gt, i32 signext %z) + ret void +} +; CHECK-LABEL: @caller1 +; CHECK: mr [[REG:[0-9]+]], 3 +; CHECK: mr 7, [[REG]] +; CHECK: bl test1 + +define i64 @callee2(%struct.pad* byval nocapture readnone %x, i32 signext %y, %struct.test* byval align 16 nocapture readonly %z) { +entry: + %x1 = getelementptr inbounds %struct.test* %z, i64 0, i32 0 + %0 = load i64* %x1, align 16 + ret i64 %0 +} +; CHECK-LABEL: @callee2 +; CHECK: ld [[REG:[0-9]+]], 128(1) +; CHECK: mr 3, [[REG]] +; CHECK: blr + +declare i64 @test2(%struct.pad* byval, i32 signext, %struct.test* byval align 16) +define void @caller2(i64 %z) { +entry: + %tmp = alloca %struct.test, align 16 + %.compoundliteral.sroa.0.0..sroa_idx = getelementptr inbounds %struct.test* %tmp, i64 0, i32 0 + store i64 %z, i64* %.compoundliteral.sroa.0.0..sroa_idx, align 16 + %call = call i64 @test2(%struct.pad* byval @gp, i32 signext 0, %struct.test* byval align 16 %tmp) + ret void +} +; CHECK-LABEL: @caller2 +; CHECK: std 3, [[OFF:[0-9]+]](1) +; CHECK: ld [[REG:[0-9]+]], [[OFF]](1) +; CHECK: std [[REG]], 128(1) +; CHECK: bl test2 + diff --git a/test/CodeGen/PowerPC/ppc64-smallarg.ll b/test/CodeGen/PowerPC/ppc64-smallarg.ll new file mode 100644 index 000000000000..0d5b078e217a --- /dev/null +++ b/test/CodeGen/PowerPC/ppc64-smallarg.ll @@ -0,0 +1,59 @@ +; Verify that small structures and float arguments are passed in the +; least significant part of a stack slot doubleword. 
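+; As a worked sketch for @callee2 below (assuming each argument reserves a +; full doubleword of the ELFv1 save area, which begins at 48(r1)): the 13 +; padding floats fill the doublewords from 48 to 151, so %x's slot starts at +; 152; on big endian the 4-byte float occupies the high-address half of that +; slot, hence the 156(1) offsets in the checks.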
+ +; RUN: llc < %s | FileCheck %s + +target datalayout = "E-m:e-i64:64-n32:64" +target triple = "powerpc64-unknown-linux-gnu" + +%struct.large_arg = type { [8 x i64] } +%struct.small_arg = type { i16, i8 } + +@gl = common global %struct.large_arg zeroinitializer, align 8 +@gs = common global %struct.small_arg zeroinitializer, align 2 +@gf = common global float 0.000000e+00, align 4 + +define void @callee1(%struct.small_arg* noalias nocapture sret %agg.result, %struct.large_arg* byval nocapture readnone %pad, %struct.small_arg* byval nocapture readonly %x) { +entry: + %0 = bitcast %struct.small_arg* %x to i32* + %1 = bitcast %struct.small_arg* %agg.result to i32* + %2 = load i32* %0, align 2 + store i32 %2, i32* %1, align 2 + ret void +} +; CHECK: @callee1 +; CHECK: lwz {{[0-9]+}}, 124(1) +; CHECK: blr + +define void @caller1() { +entry: + %tmp = alloca %struct.small_arg, align 2 + call void @test1(%struct.small_arg* sret %tmp, %struct.large_arg* byval @gl, %struct.small_arg* byval @gs) + ret void +} +; CHECK: @caller1 +; CHECK: stw {{[0-9]+}}, 124(1) +; CHECK: bl test1 + +declare void @test1(%struct.small_arg* sret, %struct.large_arg* byval, %struct.small_arg* byval) + +define float @callee2(float %pad1, float %pad2, float %pad3, float %pad4, float %pad5, float %pad6, float %pad7, float %pad8, float %pad9, float %pad10, float %pad11, float %pad12, float %pad13, float %x) { +entry: + ret float %x +} +; CHECK: @callee2 +; CHECK: lfs {{[0-9]+}}, 156(1) +; CHECK: blr + +define void @caller2() { +entry: + %0 = load float* @gf, align 4 + %call = tail call float @test2(float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float %0) + ret void +} +; CHECK: @caller2 +; CHECK: stfs {{[0-9]+}}, 156(1) +; CHECK: bl test2 + +declare float @test2(float, float, float, float, float, float, float, float, float, float, float, float, float, float) + diff --git a/test/CodeGen/PowerPC/ppc64le-aggregates.ll b/test/CodeGen/PowerPC/ppc64le-aggregates.ll new file mode 100644 index 000000000000..9eed623bacaa --- /dev/null +++ b/test/CodeGen/PowerPC/ppc64le-aggregates.ll @@ -0,0 +1,329 @@ +; RUN: llc < %s -march=ppc64le -mcpu=pwr8 -mattr=+altivec | FileCheck %s + +target datalayout = "e-m:e-i64:64-n32:64" +target triple = "powerpc64le-unknown-linux-gnu" + +; +; Verify use of registers for float/vector aggregate return. +; + +define [8 x float] @return_float([8 x float] %x) { +entry: + ret [8 x float] %x +} +; CHECK-LABEL: @return_float +; CHECK: %entry +; CHECK-NEXT: blr + +define [8 x double] @return_double([8 x double] %x) { +entry: + ret [8 x double] %x +} +; CHECK-LABEL: @return_double +; CHECK: %entry +; CHECK-NEXT: blr + +define [4 x ppc_fp128] @return_ppcf128([4 x ppc_fp128] %x) { +entry: + ret [4 x ppc_fp128] %x +} +; CHECK-LABEL: @return_ppcf128 +; CHECK: %entry +; CHECK-NEXT: blr + +define [8 x <4 x i32>] @return_v4i32([8 x <4 x i32>] %x) { +entry: + ret [8 x <4 x i32>] %x +} +; CHECK-LABEL: @return_v4i32 +; CHECK: %entry +; CHECK-NEXT: blr + + +; +; Verify amount of space taken up by aggregates in the parameter save area. 
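+; As a worked sketch for @callee_float below (assuming the ELFv2 parameter +; save area begins at 32(r1)): each [7 x float] is padded out to 32 bytes, so +; %a covers 32-63, %b covers 64-95, and %c lands at 96, matching the +; "ld 3, 96(1)" check.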
+; + +define i64 @callee_float([7 x float] %a, [7 x float] %b, i64 %c) { +entry: + ret i64 %c +} +; CHECK-LABEL: @callee_float +; CHECK: ld 3, 96(1) +; CHECK: blr + +define void @caller_float(i64 %x, [7 x float] %y) { +entry: + tail call void @test_float([7 x float] %y, [7 x float] %y, i64 %x) + ret void +} +; CHECK-LABEL: @caller_float +; CHECK: std 3, 96(1) +; CHECK: bl test_float + +declare void @test_float([7 x float], [7 x float], i64) + +define i64 @callee_double(i64 %a, [7 x double] %b, i64 %c) { +entry: + ret i64 %c +} +; CHECK-LABEL: @callee_double +; CHECK: ld 3, 96(1) +; CHECK: blr + +define void @caller_double(i64 %x, [7 x double] %y) { +entry: + tail call void @test_double(i64 %x, [7 x double] %y, i64 %x) + ret void +} +; CHECK-LABEL: @caller_double +; CHECK: std 3, 96(1) +; CHECK: bl test_double + +declare void @test_double(i64, [7 x double], i64) + +define i64 @callee_ppcf128(i64 %a, [4 x ppc_fp128] %b, i64 %c) { +entry: + ret i64 %c +} +; CHECK-LABEL: @callee_ppcf128 +; CHECK: ld 3, 104(1) +; CHECK: blr + +define void @caller_ppcf128(i64 %x, [4 x ppc_fp128] %y) { +entry: + tail call void @test_ppcf128(i64 %x, [4 x ppc_fp128] %y, i64 %x) + ret void +} +; CHECK-LABEL: @caller_ppcf128 +; CHECK: std 3, 104(1) +; CHECK: bl test_ppcf128 + +declare void @test_ppcf128(i64, [4 x ppc_fp128], i64) + +define i64 @callee_i64(i64 %a, [7 x i64] %b, i64 %c) { +entry: + ret i64 %c +} +; CHECK-LABEL: @callee_i64 +; CHECK: ld 3, 96(1) +; CHECK: blr + +define void @caller_i64(i64 %x, [7 x i64] %y) { +entry: + tail call void @test_i64(i64 %x, [7 x i64] %y, i64 %x) + ret void +} +; CHECK-LABEL: @caller_i64 +; CHECK: std 3, 96(1) +; CHECK: bl test_i64 + +declare void @test_i64(i64, [7 x i64], i64) + +define i64 @callee_i128(i64 %a, [4 x i128] %b, i64 %c) { +entry: + ret i64 %c +} +; CHECK-LABEL: @callee_i128 +; CHECK: ld 3, 112(1) +; CHECK: blr + +define void @caller_i128(i64 %x, [4 x i128] %y) { +entry: + tail call void @test_i128(i64 %x, [4 x i128] %y, i64 %x) + ret void +} +; CHECK-LABEL: @caller_i128 +; CHECK: std 3, 112(1) +; CHECK: bl test_i128 + +declare void @test_i128(i64, [4 x i128], i64) + +define i64 @callee_v4i32(i64 %a, [4 x <4 x i32>] %b, i64 %c) { +entry: + ret i64 %c +} +; CHECK-LABEL: @callee_v4i32 +; CHECK: ld 3, 112(1) +; CHECK: blr + +define void @caller_v4i32(i64 %x, [4 x <4 x i32>] %y) { +entry: + tail call void @test_v4i32(i64 %x, [4 x <4 x i32>] %y, i64 %x) + ret void +} +; CHECK-LABEL: @caller_v4i32 +; CHECK: std 3, 112(1) +; CHECK: bl test_v4i32 + +declare void @test_v4i32(i64, [4 x <4 x i32>], i64) + + +; +; Verify handling of floating point arguments in GPRs +; + +%struct.float8 = type { [8 x float] } +%struct.float5 = type { [5 x float] } +%struct.float2 = type { [2 x float] } + +@g8 = common global %struct.float8 zeroinitializer, align 4 +@g5 = common global %struct.float5 zeroinitializer, align 4 +@g2 = common global %struct.float2 zeroinitializer, align 4 + +define float @callee0([7 x float] %a, [7 x float] %b) { +entry: + %b.extract = extractvalue [7 x float] %b, 6 + ret float %b.extract +} +; CHECK-LABEL: @callee0 +; CHECK: stw 10, [[OFF:.*]](1) +; CHECK: lfs 1, [[OFF]](1) +; CHECK: blr + +define void @caller0([7 x float] %a) { +entry: + tail call void @test0([7 x float] %a, [7 x float] %a) + ret void +} +; CHECK-LABEL: @caller0 +; CHECK-DAG: fmr 8, 1 +; CHECK-DAG: fmr 9, 2 +; CHECK-DAG: fmr 10, 3 +; CHECK-DAG: fmr 11, 4 +; CHECK-DAG: fmr 12, 5 +; CHECK-DAG: fmr 13, 6 +; CHECK-DAG: stfs 7, [[OFF:[0-9]+]](1) +; CHECK-DAG: lwz 10, [[OFF]](1) +; CHECK: bl test0 + 
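+; Only f1-f13 are available for float arguments, so the fourteenth float here +; (the last element of %b) travels in r10, the GPR mapped to its doubleword of +; the save area; the callee moves it to an FPR through memory (stw then lfs) +; and the caller builds it the same way in reverse (stfs then lwz), as the +; checks above verify.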
+declare void @test0([7 x float], [7 x float]) + +define float @callee1([8 x float] %a, [8 x float] %b) { +entry: + %b.extract = extractvalue [8 x float] %b, 7 + ret float %b.extract +} +; CHECK-LABEL: @callee1 +; CHECK: rldicl [[REG:[0-9]+]], 10, 32, 32 +; CHECK: stw [[REG]], [[OFF:.*]](1) +; CHECK: lfs 1, [[OFF]](1) +; CHECK: blr + +define void @caller1([8 x float] %a) { +entry: + tail call void @test1([8 x float] %a, [8 x float] %a) + ret void +} +; CHECK-LABEL: @caller1 +; CHECK-DAG: fmr 9, 1 +; CHECK-DAG: fmr 10, 2 +; CHECK-DAG: fmr 11, 3 +; CHECK-DAG: fmr 12, 4 +; CHECK-DAG: fmr 13, 5 +; CHECK-DAG: stfs 5, [[OFF0:[0-9]+]](1) +; CHECK-DAG: stfs 6, [[OFF1:[0-9]+]](1) +; CHECK-DAG: stfs 7, [[OFF2:[0-9]+]](1) +; CHECK-DAG: stfs 8, [[OFF3:[0-9]+]](1) +; CHECK-DAG: lwz [[REG0:[0-9]+]], [[OFF0]](1) +; CHECK-DAG: lwz [[REG1:[0-9]+]], [[OFF1]](1) +; CHECK-DAG: lwz [[REG2:[0-9]+]], [[OFF2]](1) +; CHECK-DAG: lwz [[REG3:[0-9]+]], [[OFF3]](1) +; CHECK-DAG: sldi [[REG1]], [[REG1]], 32 +; CHECK-DAG: sldi [[REG3]], [[REG3]], 32 +; CHECK-DAG: or 9, [[REG0]], [[REG1]] +; CHECK-DAG: or 10, [[REG2]], [[REG3]] +; CHECK: bl test1 + +declare void @test1([8 x float], [8 x float]) + +define float @callee2([8 x float] %a, [5 x float] %b, [2 x float] %c) { +entry: + %c.extract = extractvalue [2 x float] %c, 1 + ret float %c.extract +} +; CHECK-LABEL: @callee2 +; CHECK: rldicl [[REG:[0-9]+]], 10, 32, 32 +; CHECK: stw [[REG]], [[OFF:.*]](1) +; CHECK: lfs 1, [[OFF]](1) +; CHECK: blr + +define void @caller2() { +entry: + %0 = load [8 x float]* getelementptr inbounds (%struct.float8* @g8, i64 0, i32 0), align 4 + %1 = load [5 x float]* getelementptr inbounds (%struct.float5* @g5, i64 0, i32 0), align 4 + %2 = load [2 x float]* getelementptr inbounds (%struct.float2* @g2, i64 0, i32 0), align 4 + tail call void @test2([8 x float] %0, [5 x float] %1, [2 x float] %2) + ret void +} +; CHECK-LABEL: @caller2 +; CHECK: ld [[REG:[0-9]+]], .LC +; CHECK-DAG: lfs 1, 0([[REG]]) +; CHECK-DAG: lfs 2, 4([[REG]]) +; CHECK-DAG: lfs 3, 8([[REG]]) +; CHECK-DAG: lfs 4, 12([[REG]]) +; CHECK-DAG: lfs 5, 16([[REG]]) +; CHECK-DAG: lfs 6, 20([[REG]]) +; CHECK-DAG: lfs 7, 24([[REG]]) +; CHECK-DAG: lfs 8, 28([[REG]]) +; CHECK: ld [[REG:[0-9]+]], .LC +; CHECK-DAG: lfs 9, 0([[REG]]) +; CHECK-DAG: lfs 10, 4([[REG]]) +; CHECK-DAG: lfs 11, 8([[REG]]) +; CHECK-DAG: lfs 12, 12([[REG]]) +; CHECK-DAG: lfs 13, 16([[REG]]) +; CHECK: ld [[REG:[0-9]+]], .LC +; CHECK-DAG: lwz [[REG0:[0-9]+]], 0([[REG]]) +; CHECK-DAG: lwz [[REG1:[0-9]+]], 4([[REG]]) +; CHECK-DAG: sldi [[REG1]], [[REG1]], 32 +; CHECK-DAG: or 10, [[REG0]], [[REG1]] +; CHECK: bl test2 + +declare void @test2([8 x float], [5 x float], [2 x float]) + +define double @callee3([8 x float] %a, [5 x float] %b, double %c) { +entry: + ret double %c +} +; CHECK-LABEL: @callee3 +; CHECK: std 10, [[OFF:.*]](1) +; CHECK: lfd 1, [[OFF]](1) +; CHECK: blr + +define void @caller3(double %d) { +entry: + %0 = load [8 x float]* getelementptr inbounds (%struct.float8* @g8, i64 0, i32 0), align 4 + %1 = load [5 x float]* getelementptr inbounds (%struct.float5* @g5, i64 0, i32 0), align 4 + tail call void @test3([8 x float] %0, [5 x float] %1, double %d) + ret void +} +; CHECK-LABEL: @caller3 +; CHECK: stfd 1, [[OFF:.*]](1) +; CHECK: ld 10, [[OFF]](1) +; CHECK: bl test3 + +declare void @test3([8 x float], [5 x float], double) + +define float @callee4([8 x float] %a, [5 x float] %b, float %c) { +entry: + ret float %c +} +; CHECK-LABEL: @callee4 +; CHECK: stw 10, [[OFF:.*]](1) +; CHECK: lfs 1, [[OFF]](1) +; CHECK: blr 
+ +define void @caller4(float %f) { +entry: + %0 = load [8 x float]* getelementptr inbounds (%struct.float8* @g8, i64 0, i32 0), align 4 + %1 = load [5 x float]* getelementptr inbounds (%struct.float5* @g5, i64 0, i32 0), align 4 + tail call void @test4([8 x float] %0, [5 x float] %1, float %f) + ret void +} +; CHECK-LABEL: @caller4 +; CHECK: stfs 1, [[OFF:.*]](1) +; CHECK: lwz 10, [[OFF]](1) +; CHECK: bl test4 + +declare void @test4([8 x float], [5 x float], float) + diff --git a/test/CodeGen/PowerPC/ppc64le-calls.ll b/test/CodeGen/PowerPC/ppc64le-calls.ll new file mode 100644 index 000000000000..0d667dde96b4 --- /dev/null +++ b/test/CodeGen/PowerPC/ppc64le-calls.ll @@ -0,0 +1,17 @@ +; RUN: llc -march=ppc64le -mcpu=pwr8 < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-n32:64" +target triple = "powerpc64le-unknown-linux-gnu" + +; Indirect calls require full stub creation: the target goes in both r12 and CTR, and the TOC pointer is saved at 24(1) and restored after the call. +define void @test_indirect(void ()* nocapture %fp) { +; CHECK-LABEL: @test_indirect + tail call void %fp() +; CHECK-DAG: std 2, 24(1) +; CHECK-DAG: mr 12, 3 +; CHECK-DAG: mtctr 3 +; CHECK: bctrl +; CHECK-NEXT: ld 2, 24(1) + ret void +} + diff --git a/test/CodeGen/PowerPC/ppc64le-crsave.ll b/test/CodeGen/PowerPC/ppc64le-crsave.ll new file mode 100644 index 000000000000..17174d7ad764 --- /dev/null +++ b/test/CodeGen/PowerPC/ppc64le-crsave.ll @@ -0,0 +1,28 @@ +; RUN: llc < %s | FileCheck %s +target datalayout = "e-m:e-i64:64-n32:64" +target triple = "powerpc64le-unknown-linux-gnu" + +@_ZTIi = external constant i8* +declare i8* @__cxa_allocate_exception(i64) +declare void @__cxa_throw(i8*, i8*, i8*) + +define void @crsave() { +entry: + call void asm sideeffect "", "~{cr2}"() + call void asm sideeffect "", "~{cr3}"() + call void asm sideeffect "", "~{cr4}"() + + %exception = call i8* @__cxa_allocate_exception(i64 4) + %0 = bitcast i8* %exception to i32* + store i32 0, i32* %0 + call void @__cxa_throw(i8* %exception, i8* bitcast (i8** @_ZTIi to i8*), i8* null) + unreachable + +return:                              ; No predecessors!
+ ret void +} +; CHECK-LABEL: @crsave +; CHECK: .cfi_offset cr2, 8 +; CHECK: .cfi_offset cr3, 8 +; CHECK: .cfi_offset cr4, 8 + diff --git a/test/CodeGen/PowerPC/ppc64le-localentry.ll b/test/CodeGen/PowerPC/ppc64le-localentry.ll new file mode 100644 index 000000000000..4676ce8eadc6 --- /dev/null +++ b/test/CodeGen/PowerPC/ppc64le-localentry.ll @@ -0,0 +1,46 @@ +; RUN: llc -march=ppc64le -mcpu=pwr8 < %s | FileCheck %s +; RUN: llc -march=ppc64le -mcpu=pwr8 -O0 < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-n32:64" +target triple = "powerpc64le-unknown-linux-gnu" + +@number64 = global i64 10, align 8 + +; CHECK: .abiversion 2 + +define i64 @use_toc(i64 %a) nounwind { +entry: +; CHECK-LABEL: @use_toc +; CHECK-NEXT: .Ltmp[[TMP1:[0-9]+]]: +; CHECK-NEXT: addis 2, 12, .TOC.-.Ltmp[[TMP1]]@ha +; CHECK-NEXT: addi 2, 2, .TOC.-.Ltmp[[TMP1]]@l +; CHECK-NEXT: .Ltmp[[TMP2:[0-9]+]]: +; CHECK-NEXT: .localentry use_toc, .Ltmp[[TMP2]]-.Ltmp[[TMP1]] +; CHECK-NEXT: %entry + %0 = load i64* @number64, align 8 + %cmp = icmp eq i64 %0, %a + %conv1 = zext i1 %cmp to i64 + ret i64 %conv1 +} + +declare void @callee() +define void @use_toc_implicit() nounwind { +entry: +; CHECK-LABEL: @use_toc_implicit +; CHECK-NEXT: .Ltmp[[TMP1:[0-9]+]]: +; CHECK-NEXT: addis 2, 12, .TOC.-.Ltmp[[TMP1]]@ha +; CHECK-NEXT: addi 2, 2, .TOC.-.Ltmp[[TMP1]]@l +; CHECK-NEXT: .Ltmp[[TMP2:[0-9]+]]: +; CHECK-NEXT: .localentry use_toc_implicit, .Ltmp[[TMP2]]-.Ltmp[[TMP1]] +; CHECK-NEXT: %entry + call void @callee() + ret void +} + +define i64 @no_toc(i64 %a) nounwind { +entry: +; CHECK-LABEL: @no_toc +; CHECK-NEXT: %entry + ret i64 %a +} + diff --git a/test/CodeGen/PowerPC/ppc64le-smallarg.ll b/test/CodeGen/PowerPC/ppc64le-smallarg.ll new file mode 100644 index 000000000000..120c14039f99 --- /dev/null +++ b/test/CodeGen/PowerPC/ppc64le-smallarg.ll @@ -0,0 +1,59 @@ +; Verify that small structures and float arguments are passed in the +; least significant part of a stack slot doubleword. 
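+; As a worked sketch for @callee2 below (assuming the ELFv2 save area begins +; at 32(r1)): the 13 padding floats are passed in f1-f13 but still reserve the +; doublewords from 32 to 135, so %x's slot starts at 136; on little endian the +; least significant word sits at the low address, hence the 136(1) offsets in +; the checks.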
+ +; RUN: llc < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-n32:64" +target triple = "powerpc64le-unknown-linux-gnu" + +%struct.large_arg = type { [8 x i64] } +%struct.small_arg = type { i16, i8 } + +@gl = common global %struct.large_arg zeroinitializer, align 8 +@gs = common global %struct.small_arg zeroinitializer, align 2 +@gf = common global float 0.000000e+00, align 4 + +define void @callee1(%struct.small_arg* noalias nocapture sret %agg.result, %struct.large_arg* byval nocapture readnone %pad, %struct.small_arg* byval nocapture readonly %x) { +entry: + %0 = bitcast %struct.small_arg* %x to i32* + %1 = bitcast %struct.small_arg* %agg.result to i32* + %2 = load i32* %0, align 2 + store i32 %2, i32* %1, align 2 + ret void +} +; CHECK: @callee1 +; CHECK: lwz {{[0-9]+}}, 104(1) +; CHECK: blr + +define void @caller1() { +entry: + %tmp = alloca %struct.small_arg, align 2 + call void @test1(%struct.small_arg* sret %tmp, %struct.large_arg* byval @gl, %struct.small_arg* byval @gs) + ret void +} +; CHECK: @caller1 +; CHECK: stw {{[0-9]+}}, 104(1) +; CHECK: bl test1 + +declare void @test1(%struct.small_arg* sret, %struct.large_arg* byval, %struct.small_arg* byval) + +define float @callee2(float %pad1, float %pad2, float %pad3, float %pad4, float %pad5, float %pad6, float %pad7, float %pad8, float %pad9, float %pad10, float %pad11, float %pad12, float %pad13, float %x) { +entry: + ret float %x +} +; CHECK: @callee2 +; CHECK: lfs {{[0-9]+}}, 136(1) +; CHECK: blr + +define void @caller2() { +entry: + %0 = load float* @gf, align 4 + %call = tail call float @test2(float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float %0) + ret void +} +; CHECK: @caller2 +; CHECK: stfs {{[0-9]+}}, 136(1) +; CHECK: bl test2 + +declare float @test2(float, float, float, float, float, float, float, float, float, float, float, float, float, float) + diff --git a/test/CodeGen/PowerPC/ppcf128-endian.ll b/test/CodeGen/PowerPC/ppcf128-endian.ll new file mode 100644 index 000000000000..2a5f13a5c3da --- /dev/null +++ b/test/CodeGen/PowerPC/ppcf128-endian.ll @@ -0,0 +1,154 @@ +; RUN: llc -mcpu=pwr7 -mattr=+altivec < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-n32:64" +target triple = "powerpc64le-unknown-linux-gnu" + +@g = common global ppc_fp128 0xM00000000000000000000000000000000, align 16 + +define void @callee(ppc_fp128 %x) { +entry: + %x.addr = alloca ppc_fp128, align 16 + store ppc_fp128 %x, ppc_fp128* %x.addr, align 16 + %0 = load ppc_fp128* %x.addr, align 16 + store ppc_fp128 %0, ppc_fp128* @g, align 16 + ret void +} +; CHECK: @callee +; CHECK: ld [[REG:[0-9]+]], .LC +; CHECK: stfd 2, 8([[REG]]) +; CHECK: stfd 1, 0([[REG]]) +; CHECK: blr + +define void @caller() { +entry: + %0 = load ppc_fp128* @g, align 16 + call void @test(ppc_fp128 %0) + ret void +} +; CHECK: @caller +; CHECK: ld [[REG:[0-9]+]], .LC +; CHECK: lfd 2, 8([[REG]]) +; CHECK: lfd 1, 0([[REG]]) +; CHECK: bl test + +declare void @test(ppc_fp128) + +define void @caller_const() { +entry: + call void @test(ppc_fp128 0xM3FF00000000000000000000000000000) + ret void +} +; CHECK: .LCPI[[LC:[0-9]+]]_0: +; CHECK: .long 1065353216 +; CHECK: .LCPI[[LC]]_1: +; CHECK: .long 0 +; CHECK: @caller_const +; CHECK: addi [[REG0:[0-9]+]], {{[0-9]+}}, .LCPI[[LC]]_0 +; CHECK: addi [[REG1:[0-9]+]], {{[0-9]+}}, .LCPI[[LC]]_1 +; CHECK: lfs 1, 0([[REG0]]) 
+; CHECK: lfs 2, 0([[REG1]]) +; CHECK: bl test + +define ppc_fp128 @result() { +entry: + %0 = load ppc_fp128* @g, align 16 + ret ppc_fp128 %0 +} +; CHECK: @result +; CHECK: ld [[REG:[0-9]+]], .LC +; CHECK: lfd 1, 0([[REG]]) +; CHECK: lfd 2, 8([[REG]]) +; CHECK: blr + +define void @use_result() { +entry: + %call = tail call ppc_fp128 @test_result() + store ppc_fp128 %call, ppc_fp128* @g, align 16 + ret void +} +; CHECK: @use_result +; CHECK: bl test_result +; CHECK: ld [[REG:[0-9]+]], .LC +; CHECK: stfd 2, 8([[REG]]) +; CHECK: stfd 1, 0([[REG]]) +; CHECK: blr + +declare ppc_fp128 @test_result() + +define void @caller_result() { +entry: + %call = tail call ppc_fp128 @test_result() + tail call void @test(ppc_fp128 %call) + ret void +} +; CHECK: @caller_result +; CHECK: bl test_result +; CHECK-NEXT: nop +; CHECK-NEXT: bl test +; CHECK-NEXT: nop + +define i128 @convert_from(ppc_fp128 %x) { +entry: + %0 = bitcast ppc_fp128 %x to i128 + ret i128 %0 +} +; CHECK: @convert_from +; CHECK: stfd 1, [[OFF1:.*]](1) +; CHECK: stfd 2, [[OFF2:.*]](1) +; CHECK: ld 3, [[OFF1]](1) +; CHECK: ld 4, [[OFF2]](1) +; CHECK: blr + +define ppc_fp128 @convert_to(i128 %x) { +entry: + %0 = bitcast i128 %x to ppc_fp128 + ret ppc_fp128 %0 +} +; CHECK: @convert_to +; CHECK: std 3, [[OFF1:.*]](1) +; CHECK: std 4, [[OFF2:.*]](1) +; CHECK: lfd 1, [[OFF1]](1) +; CHECK: lfd 2, [[OFF2]](1) +; CHECK: blr + +define ppc_fp128 @convert_to2(i128 %x) { +entry: + %shl = shl i128 %x, 1 + %0 = bitcast i128 %shl to ppc_fp128 + ret ppc_fp128 %0 +} + +; CHECK: @convert_to2 +; CHECK: std 3, [[OFF1:.*]](1) +; CHECK: std 4, [[OFF2:.*]](1) +; CHECK: lfd 1, [[OFF1]](1) +; CHECK: lfd 2, [[OFF2]](1) +; CHECK: blr + +define double @convert_vector(<4 x i32> %x) { +entry: + %cast = bitcast <4 x i32> %x to ppc_fp128 + %conv = fptrunc ppc_fp128 %cast to double + ret double %conv +} +; CHECK: @convert_vector +; CHECK: addi [[REG:[0-9]+]], 1, [[OFF:.*]] +; CHECK: stvx 2, 0, [[REG]] +; CHECK: lfd 1, [[OFF]](1) +; CHECK: blr + +declare void @llvm.va_start(i8*) + +define double @vararg(i32 %a, ...) { +entry: + %va = alloca i8*, align 8 + %va1 = bitcast i8** %va to i8* + call void @llvm.va_start(i8* %va1) + %arg = va_arg i8** %va, ppc_fp128 + %conv = fptrunc ppc_fp128 %arg to double + ret double %conv +} +; CHECK: @vararg +; CHECK: lfd 1, 0({{[0-9]+}}) +; CHECK: blr + diff --git a/test/CodeGen/PowerPC/resolvefi-basereg.ll b/test/CodeGen/PowerPC/resolvefi-basereg.ll new file mode 100644 index 000000000000..62c2d139920a --- /dev/null +++ b/test/CodeGen/PowerPC/resolvefi-basereg.ll @@ -0,0 +1,362 @@ +; RUN: llc -O0 -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 < %s | FileCheck %s + +; Due to a bug in resolveFrameIndex we ended up with invalid addresses +; containing a base register 0. Verify that this no longer happens.
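+; (In PPC D-form addressing a base-register field of 0 selects the constant +; zero rather than the contents of r0, so a memory operand printed as +; "<offset>(0)" would be an invalid absolute address.)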
+; CHECK-NOT: (0) + +target datalayout = "E-m:e-i64:64-n32:64" +target triple = "powerpc64-unknown-linux-gnu" + +%struct.Info = type { i32, i32, i8*, i8*, i8*, [32 x i8*], i64, [32 x i64], i64, i64, i64, [32 x i64] } +%struct.S1998 = type { [2 x i32*], i64, i64, double, i16, i32, [29 x %struct.anon], i16, i8, i32, [8 x i8] } +%struct.anon = type { [16 x double], i32, i16, i32, [3 x i8], [6 x i8], [4 x i32], i8 } + +@info = global %struct.Info zeroinitializer, align 8 +@fails = global i32 0, align 4 +@intarray = global [256 x i32] zeroinitializer, align 4 +@s1998 = global %struct.S1998 zeroinitializer, align 16 +@a1998 = external global [5 x %struct.S1998] + +define void @test1998() { +entry: + %i = alloca i32, align 4 + %j = alloca i32, align 4 + %tmp = alloca i32, align 4 + %agg.tmp = alloca %struct.S1998, align 16 + %agg.tmp111 = alloca %struct.S1998, align 16 + %agg.tmp112 = alloca %struct.S1998, align 16 + %agg.tmp113 = alloca %struct.S1998, align 16 + %agg.tmp114 = alloca %struct.S1998, align 16 + %agg.tmp115 = alloca %struct.S1998, align 16 + %agg.tmp116 = alloca %struct.S1998, align 16 + %agg.tmp117 = alloca %struct.S1998, align 16 + %agg.tmp118 = alloca %struct.S1998, align 16 + %agg.tmp119 = alloca %struct.S1998, align 16 + call void @llvm.memset.p0i8.i64(i8* bitcast (%struct.S1998* @s1998 to i8*), i8 0, i64 5168, i32 16, i1 false) + call void @llvm.memset.p0i8.i64(i8* bitcast ([5 x %struct.S1998]* @a1998 to i8*), i8 0, i64 25840, i32 16, i1 false) + call void @llvm.memset.p0i8.i64(i8* bitcast (%struct.Info* @info to i8*), i8 0, i64 832, i32 8, i1 false) + store i8* bitcast (%struct.S1998* @s1998 to i8*), i8** getelementptr inbounds (%struct.Info* @info, i32 0, i32 2), align 8 + store i8* bitcast ([5 x %struct.S1998]* @a1998 to i8*), i8** getelementptr inbounds (%struct.Info* @info, i32 0, i32 3), align 8 + store i8* bitcast (%struct.S1998* getelementptr inbounds ([5 x %struct.S1998]* @a1998, i32 0, i64 3) to i8*), i8** getelementptr inbounds (%struct.Info* @info, i32 0, i32 4), align 8 + store i64 5168, i64* getelementptr inbounds (%struct.Info* @info, i32 0, i32 6), align 8 + store i64 16, i64* getelementptr inbounds (%struct.Info* @info, i32 0, i32 8), align 8 + store i64 16, i64* getelementptr inbounds (%struct.Info* @info, i32 0, i32 9), align 8 + store i64 16, i64* getelementptr inbounds (%struct.Info* @info, i32 0, i32 10), align 8 + %0 = load i64* getelementptr inbounds (%struct.Info* @info, i32 0, i32 8), align 8 + %sub = sub i64 %0, 1 + %and = and i64 ptrtoint (%struct.S1998* getelementptr inbounds ([5 x %struct.S1998]* @a1998, i32 0, i64 3) to i64), %sub + %tobool = icmp ne i64 %and, 0 + br i1 %tobool, label %if.then, label %if.end + +if.then: ; preds = %entry + %1 = load i32* @fails, align 4 + %inc = add nsw i32 %1, 1 + store i32 %inc, i32* @fails, align 4 + br label %if.end + +if.end: ; preds = %if.then, %entry + store i32 0, i32* %i, align 4 + store i32 0, i32* %j, align 4 + %2 = load i32* %i, align 4 + %idxprom = sext i32 %2 to i64 + %arrayidx = getelementptr inbounds [32 x i8*]* getelementptr inbounds (%struct.Info* @info, i32 0, i32 5), i32 0, i64 %idxprom + store i8* bitcast (i32** getelementptr inbounds (%struct.S1998* @s1998, i32 0, i32 0, i64 1) to i8*), i8** %arrayidx, align 8 + %3 = load i32* %i, align 4 + %idxprom1 = sext i32 %3 to i64 + %arrayidx2 = getelementptr inbounds [32 x i64]* getelementptr inbounds (%struct.Info* @info, i32 0, i32 7), i32 0, i64 %idxprom1 + store i64 8, i64* %arrayidx2, align 8 + %4 = load i32* %i, align 4 + %idxprom3 = sext i32 %4 
to i64 + %arrayidx4 = getelementptr inbounds [32 x i64]* getelementptr inbounds (%struct.Info* @info, i32 0, i32 11), i32 0, i64 %idxprom3 + store i64 8, i64* %arrayidx4, align 8 + store i32* getelementptr inbounds ([256 x i32]* @intarray, i32 0, i64 190), i32** getelementptr inbounds (%struct.S1998* @s1998, i32 0, i32 0, i64 1), align 8 + store i32* getelementptr inbounds ([256 x i32]* @intarray, i32 0, i64 241), i32** getelementptr inbounds ([5 x %struct.S1998]* @a1998, i32 0, i64 2, i32 0, i64 1), align 8 + %5 = load i32* %i, align 4 + %inc5 = add nsw i32 %5, 1 + store i32 %inc5, i32* %i, align 4 + %6 = load i32* %i, align 4 + %idxprom6 = sext i32 %6 to i64 + %arrayidx7 = getelementptr inbounds [32 x i8*]* getelementptr inbounds (%struct.Info* @info, i32 0, i32 5), i32 0, i64 %idxprom6 + store i8* bitcast (i64* getelementptr inbounds (%struct.S1998* @s1998, i32 0, i32 1) to i8*), i8** %arrayidx7, align 8 + %7 = load i32* %i, align 4 + %idxprom8 = sext i32 %7 to i64 + %arrayidx9 = getelementptr inbounds [32 x i64]* getelementptr inbounds (%struct.Info* @info, i32 0, i32 7), i32 0, i64 %idxprom8 + store i64 8, i64* %arrayidx9, align 8 + %8 = load i32* %i, align 4 + %idxprom10 = sext i32 %8 to i64 + %arrayidx11 = getelementptr inbounds [32 x i64]* getelementptr inbounds (%struct.Info* @info, i32 0, i32 11), i32 0, i64 %idxprom10 + store i64 8, i64* %arrayidx11, align 8 + store i64 -3866974208859106459, i64* getelementptr inbounds (%struct.S1998* @s1998, i32 0, i32 1), align 8 + store i64 -185376695371304091, i64* getelementptr inbounds ([5 x %struct.S1998]* @a1998, i32 0, i64 2, i32 1), align 8 + %9 = load i32* %i, align 4 + %inc12 = add nsw i32 %9, 1 + store i32 %inc12, i32* %i, align 4 + %10 = load i32* %i, align 4 + %idxprom13 = sext i32 %10 to i64 + %arrayidx14 = getelementptr inbounds [32 x i8*]* getelementptr inbounds (%struct.Info* @info, i32 0, i32 5), i32 0, i64 %idxprom13 + store i8* bitcast (i64* getelementptr inbounds (%struct.S1998* @s1998, i32 0, i32 2) to i8*), i8** %arrayidx14, align 8 + %11 = load i32* %i, align 4 + %idxprom15 = sext i32 %11 to i64 + %arrayidx16 = getelementptr inbounds [32 x i64]* getelementptr inbounds (%struct.Info* @info, i32 0, i32 7), i32 0, i64 %idxprom15 + store i64 8, i64* %arrayidx16, align 8 + %12 = load i32* %i, align 4 + %idxprom17 = sext i32 %12 to i64 + %arrayidx18 = getelementptr inbounds [32 x i64]* getelementptr inbounds (%struct.Info* @info, i32 0, i32 11), i32 0, i64 %idxprom17 + store i64 8, i64* %arrayidx18, align 8 + store i64 -963638028680427187, i64* getelementptr inbounds (%struct.S1998* @s1998, i32 0, i32 2), align 8 + store i64 7510542175772455554, i64* getelementptr inbounds ([5 x %struct.S1998]* @a1998, i32 0, i64 2, i32 2), align 8 + %13 = load i32* %i, align 4 + %inc19 = add nsw i32 %13, 1 + store i32 %inc19, i32* %i, align 4 + %14 = load i32* %i, align 4 + %idxprom20 = sext i32 %14 to i64 + %arrayidx21 = getelementptr inbounds [32 x i8*]* getelementptr inbounds (%struct.Info* @info, i32 0, i32 5), i32 0, i64 %idxprom20 + store i8* bitcast (double* getelementptr inbounds (%struct.S1998* @s1998, i32 0, i32 3) to i8*), i8** %arrayidx21, align 8 + %15 = load i32* %i, align 4 + %idxprom22 = sext i32 %15 to i64 + %arrayidx23 = getelementptr inbounds [32 x i64]* getelementptr inbounds (%struct.Info* @info, i32 0, i32 7), i32 0, i64 %idxprom22 + store i64 8, i64* %arrayidx23, align 8 + %16 = load i32* %i, align 4 + %idxprom24 = sext i32 %16 to i64 + %arrayidx25 = getelementptr inbounds [32 x i64]* getelementptr inbounds 
(%struct.Info* @info, i32 0, i32 11), i32 0, i64 %idxprom24 + store i64 16, i64* %arrayidx25, align 8 + store double 0xC0F8783300000000, double* getelementptr inbounds (%struct.S1998* @s1998, i32 0, i32 3), align 16 + store double 0xC10DF3CCC0000000, double* getelementptr inbounds ([5 x %struct.S1998]* @a1998, i32 0, i64 2, i32 3), align 16 + %17 = load i32* %i, align 4 + %inc26 = add nsw i32 %17, 1 + store i32 %inc26, i32* %i, align 4 + %18 = load i32* %i, align 4 + %idxprom27 = sext i32 %18 to i64 + %arrayidx28 = getelementptr inbounds [32 x i8*]* getelementptr inbounds (%struct.Info* @info, i32 0, i32 5), i32 0, i64 %idxprom27 + store i8* bitcast (i16* getelementptr inbounds (%struct.S1998* @s1998, i32 0, i32 4) to i8*), i8** %arrayidx28, align 8 + %19 = load i32* %i, align 4 + %idxprom29 = sext i32 %19 to i64 + %arrayidx30 = getelementptr inbounds [32 x i64]* getelementptr inbounds (%struct.Info* @info, i32 0, i32 7), i32 0, i64 %idxprom29 + store i64 2, i64* %arrayidx30, align 8 + %20 = load i32* %i, align 4 + %idxprom31 = sext i32 %20 to i64 + %arrayidx32 = getelementptr inbounds [32 x i64]* getelementptr inbounds (%struct.Info* @info, i32 0, i32 11), i32 0, i64 %idxprom31 + store i64 2, i64* %arrayidx32, align 8 + store i16 -15897, i16* getelementptr inbounds (%struct.S1998* @s1998, i32 0, i32 4), align 2 + store i16 30935, i16* getelementptr inbounds ([5 x %struct.S1998]* @a1998, i32 0, i64 2, i32 4), align 2 + %21 = load i32* %i, align 4 + %inc33 = add nsw i32 %21, 1 + store i32 %inc33, i32* %i, align 4 + store i32 -419541644, i32* getelementptr inbounds (%struct.S1998* @s1998, i32 0, i32 5), align 4 + store i32 2125926812, i32* getelementptr inbounds ([5 x %struct.S1998]* @a1998, i32 0, i64 2, i32 5), align 4 + %22 = load i32* %j, align 4 + %inc34 = add nsw i32 %22, 1 + store i32 %inc34, i32* %j, align 4 + %23 = load i32* %i, align 4 + %idxprom35 = sext i32 %23 to i64 + %arrayidx36 = getelementptr inbounds [32 x i8*]* getelementptr inbounds (%struct.Info* @info, i32 0, i32 5), i32 0, i64 %idxprom35 + store i8* bitcast (double* getelementptr inbounds (%struct.S1998* @s1998, i32 0, i32 6, i64 4, i32 0, i64 0) to i8*), i8** %arrayidx36, align 8 + %24 = load i32* %i, align 4 + %idxprom37 = sext i32 %24 to i64 + %arrayidx38 = getelementptr inbounds [32 x i64]* getelementptr inbounds (%struct.Info* @info, i32 0, i32 7), i32 0, i64 %idxprom37 + store i64 8, i64* %arrayidx38, align 8 + %25 = load i32* %i, align 4 + %idxprom39 = sext i32 %25 to i64 + %arrayidx40 = getelementptr inbounds [32 x i64]* getelementptr inbounds (%struct.Info* @info, i32 0, i32 11), i32 0, i64 %idxprom39 + store i64 8, i64* %arrayidx40, align 8 + store double 0xC0FC765780000000, double* getelementptr inbounds (%struct.S1998* @s1998, i32 0, i32 6, i64 4, i32 0, i64 0), align 8 + store double 0xC1025CD7A0000000, double* getelementptr inbounds ([5 x %struct.S1998]* @a1998, i32 0, i64 2, i32 6, i64 4, i32 0, i64 0), align 8 + %26 = load i32* %i, align 4 + %inc41 = add nsw i32 %26, 1 + store i32 %inc41, i32* %i, align 4 + %bf.load = load i32* getelementptr inbounds (%struct.S1998* @s1998, i32 0, i32 6, i64 4, i32 1), align 8 + %bf.clear = and i32 %bf.load, 7 + %bf.set = or i32 %bf.clear, 16 + store i32 %bf.set, i32* getelementptr inbounds (%struct.S1998* @s1998, i32 0, i32 6, i64 4, i32 1), align 8 + %bf.load42 = load i32* getelementptr inbounds ([5 x %struct.S1998]* @a1998, i32 0, i64 2, i32 6, i64 4, i32 1), align 8 + %bf.clear43 = and i32 %bf.load42, 7 + %bf.set44 = or i32 %bf.clear43, 24 + store i32 %bf.set44, 
i32* getelementptr inbounds ([5 x %struct.S1998]* @a1998, i32 0, i64 2, i32 6, i64 4, i32 1), align 8 + %27 = load i32* %j, align 4 + %inc45 = add nsw i32 %27, 1 + store i32 %inc45, i32* %j, align 4 + %bf.load46 = load i16* getelementptr inbounds (%struct.S1998* @s1998, i32 0, i32 6, i64 4, i32 2), align 4 + %bf.clear47 = and i16 %bf.load46, 127 + store i16 %bf.clear47, i16* getelementptr inbounds (%struct.S1998* @s1998, i32 0, i32 6, i64 4, i32 2), align 4 + %bf.load48 = load i16* getelementptr inbounds ([5 x %struct.S1998]* @a1998, i32 0, i64 2, i32 6, i64 4, i32 2), align 4 + %bf.clear49 = and i16 %bf.load48, 127 + store i16 %bf.clear49, i16* getelementptr inbounds ([5 x %struct.S1998]* @a1998, i32 0, i64 2, i32 6, i64 4, i32 2), align 4 + %28 = load i32* %j, align 4 + %inc50 = add nsw i32 %28, 1 + store i32 %inc50, i32* %j, align 4 + %bf.load51 = load i32* getelementptr inbounds (%struct.S1998* @s1998, i32 0, i32 6, i64 4, i32 3), align 8 + %bf.clear52 = and i32 %bf.load51, 63 + store i32 %bf.clear52, i32* getelementptr inbounds (%struct.S1998* @s1998, i32 0, i32 6, i64 4, i32 3), align 8 + %bf.load53 = load i32* getelementptr inbounds ([5 x %struct.S1998]* @a1998, i32 0, i64 2, i32 6, i64 4, i32 3), align 8 + %bf.clear54 = and i32 %bf.load53, 63 + %bf.set55 = or i32 %bf.clear54, 64 + store i32 %bf.set55, i32* getelementptr inbounds ([5 x %struct.S1998]* @a1998, i32 0, i64 2, i32 6, i64 4, i32 3), align 8 + %29 = load i32* %j, align 4 + %inc56 = add nsw i32 %29, 1 + store i32 %inc56, i32* %j, align 4 + %bf.load57 = load i24* bitcast ([3 x i8]* getelementptr inbounds (%struct.S1998* @s1998, i32 0, i32 6, i64 4, i32 4) to i24*), align 4 + %bf.clear58 = and i24 %bf.load57, 63 + store i24 %bf.clear58, i24* bitcast ([3 x i8]* getelementptr inbounds (%struct.S1998* @s1998, i32 0, i32 6, i64 4, i32 4) to i24*), align 4 + %bf.load59 = load i24* bitcast ([3 x i8]* getelementptr inbounds ([5 x %struct.S1998]* @a1998, i32 0, i64 2, i32 6, i64 4, i32 4) to i24*), align 4 + %bf.clear60 = and i24 %bf.load59, 63 + store i24 %bf.clear60, i24* bitcast ([3 x i8]* getelementptr inbounds ([5 x %struct.S1998]* @a1998, i32 0, i64 2, i32 6, i64 4, i32 4) to i24*), align 4 + %30 = load i32* %j, align 4 + %inc61 = add nsw i32 %30, 1 + store i32 %inc61, i32* %j, align 4 + %31 = load i32* %i, align 4 + %idxprom62 = sext i32 %31 to i64 + %arrayidx63 = getelementptr inbounds [32 x i8*]* getelementptr inbounds (%struct.Info* @info, i32 0, i32 5), i32 0, i64 %idxprom62 + store i8* getelementptr inbounds (%struct.S1998* @s1998, i32 0, i32 6, i64 4, i32 5, i64 5), i8** %arrayidx63, align 8 + %32 = load i32* %i, align 4 + %idxprom64 = sext i32 %32 to i64 + %arrayidx65 = getelementptr inbounds [32 x i64]* getelementptr inbounds (%struct.Info* @info, i32 0, i32 7), i32 0, i64 %idxprom64 + store i64 1, i64* %arrayidx65, align 8 + %33 = load i32* %i, align 4 + %idxprom66 = sext i32 %33 to i64 + %arrayidx67 = getelementptr inbounds [32 x i64]* getelementptr inbounds (%struct.Info* @info, i32 0, i32 11), i32 0, i64 %idxprom66 + store i64 1, i64* %arrayidx67, align 8 + store i8 -83, i8* getelementptr inbounds (%struct.S1998* @s1998, i32 0, i32 6, i64 4, i32 5, i64 5), align 1 + store i8 -67, i8* getelementptr inbounds ([5 x %struct.S1998]* @a1998, i32 0, i64 2, i32 6, i64 4, i32 5, i64 5), align 1 + %34 = load i32* %i, align 4 + %inc68 = add nsw i32 %34, 1 + store i32 %inc68, i32* %i, align 4 + %35 = load i32* %i, align 4 + %idxprom69 = sext i32 %35 to i64 + %arrayidx70 = getelementptr inbounds [32 x i8*]* getelementptr 
inbounds (%struct.Info* @info, i32 0, i32 5), i32 0, i64 %idxprom69 + store i8* getelementptr inbounds (%struct.S1998* @s1998, i32 0, i32 6, i64 4, i32 5, i64 1), i8** %arrayidx70, align 8 + %36 = load i32* %i, align 4 + %idxprom71 = sext i32 %36 to i64 + %arrayidx72 = getelementptr inbounds [32 x i64]* getelementptr inbounds (%struct.Info* @info, i32 0, i32 7), i32 0, i64 %idxprom71 + store i64 1, i64* %arrayidx72, align 8 + %37 = load i32* %i, align 4 + %idxprom73 = sext i32 %37 to i64 + %arrayidx74 = getelementptr inbounds [32 x i64]* getelementptr inbounds (%struct.Info* @info, i32 0, i32 11), i32 0, i64 %idxprom73 + store i64 1, i64* %arrayidx74, align 8 + store i8 34, i8* getelementptr inbounds (%struct.S1998* @s1998, i32 0, i32 6, i64 4, i32 5, i64 1), align 1 + store i8 64, i8* getelementptr inbounds ([5 x %struct.S1998]* @a1998, i32 0, i64 2, i32 6, i64 4, i32 5, i64 1), align 1 + %38 = load i32* %i, align 4 + %inc75 = add nsw i32 %38, 1 + store i32 %inc75, i32* %i, align 4 + %39 = load i32* %i, align 4 + %idxprom76 = sext i32 %39 to i64 + %arrayidx77 = getelementptr inbounds [32 x i8*]* getelementptr inbounds (%struct.Info* @info, i32 0, i32 5), i32 0, i64 %idxprom76 + store i8* bitcast (i32* getelementptr inbounds (%struct.S1998* @s1998, i32 0, i32 6, i64 4, i32 6, i64 3) to i8*), i8** %arrayidx77, align 8 + %40 = load i32* %i, align 4 + %idxprom78 = sext i32 %40 to i64 + %arrayidx79 = getelementptr inbounds [32 x i64]* getelementptr inbounds (%struct.Info* @info, i32 0, i32 7), i32 0, i64 %idxprom78 + store i64 4, i64* %arrayidx79, align 8 + %41 = load i32* %i, align 4 + %idxprom80 = sext i32 %41 to i64 + %arrayidx81 = getelementptr inbounds [32 x i64]* getelementptr inbounds (%struct.Info* @info, i32 0, i32 11), i32 0, i64 %idxprom80 + store i64 4, i64* %arrayidx81, align 8 + store i32 -3, i32* getelementptr inbounds (%struct.S1998* @s1998, i32 0, i32 6, i64 4, i32 6, i64 3), align 4 + store i32 -3, i32* getelementptr inbounds ([5 x %struct.S1998]* @a1998, i32 0, i64 2, i32 6, i64 4, i32 6, i64 3), align 4 + %42 = load i32* %i, align 4 + %inc82 = add nsw i32 %42, 1 + store i32 %inc82, i32* %i, align 4 + %43 = load i32* %i, align 4 + %idxprom83 = sext i32 %43 to i64 + %arrayidx84 = getelementptr inbounds [32 x i8*]* getelementptr inbounds (%struct.Info* @info, i32 0, i32 5), i32 0, i64 %idxprom83 + store i8* getelementptr inbounds (%struct.S1998* @s1998, i32 0, i32 6, i64 4, i32 7), i8** %arrayidx84, align 8 + %44 = load i32* %i, align 4 + %idxprom85 = sext i32 %44 to i64 + %arrayidx86 = getelementptr inbounds [32 x i64]* getelementptr inbounds (%struct.Info* @info, i32 0, i32 7), i32 0, i64 %idxprom85 + store i64 1, i64* %arrayidx86, align 8 + %45 = load i32* %i, align 4 + %idxprom87 = sext i32 %45 to i64 + %arrayidx88 = getelementptr inbounds [32 x i64]* getelementptr inbounds (%struct.Info* @info, i32 0, i32 11), i32 0, i64 %idxprom87 + store i64 1, i64* %arrayidx88, align 8 + store i8 106, i8* getelementptr inbounds (%struct.S1998* @s1998, i32 0, i32 6, i64 4, i32 7), align 1 + store i8 -102, i8* getelementptr inbounds ([5 x %struct.S1998]* @a1998, i32 0, i64 2, i32 6, i64 4, i32 7), align 1 + %46 = load i32* %i, align 4 + %inc89 = add nsw i32 %46, 1 + store i32 %inc89, i32* %i, align 4 + %47 = load i32* %i, align 4 + %idxprom90 = sext i32 %47 to i64 + %arrayidx91 = getelementptr inbounds [32 x i8*]* getelementptr inbounds (%struct.Info* @info, i32 0, i32 5), i32 0, i64 %idxprom90 + store i8* bitcast (i16* getelementptr inbounds (%struct.S1998* @s1998, i32 0, i32 7) to 
i8*), i8** %arrayidx91, align 8 + %48 = load i32* %i, align 4 + %idxprom92 = sext i32 %48 to i64 + %arrayidx93 = getelementptr inbounds [32 x i64]* getelementptr inbounds (%struct.Info* @info, i32 0, i32 7), i32 0, i64 %idxprom92 + store i64 2, i64* %arrayidx93, align 8 + %49 = load i32* %i, align 4 + %idxprom94 = sext i32 %49 to i64 + %arrayidx95 = getelementptr inbounds [32 x i64]* getelementptr inbounds (%struct.Info* @info, i32 0, i32 11), i32 0, i64 %idxprom94 + store i64 2, i64* %arrayidx95, align 8 + store i16 29665, i16* getelementptr inbounds (%struct.S1998* @s1998, i32 0, i32 7), align 2 + store i16 7107, i16* getelementptr inbounds ([5 x %struct.S1998]* @a1998, i32 0, i64 2, i32 7), align 2 + %50 = load i32* %i, align 4 + %inc96 = add nsw i32 %50, 1 + store i32 %inc96, i32* %i, align 4 + %51 = load i32* %i, align 4 + %idxprom97 = sext i32 %51 to i64 + %arrayidx98 = getelementptr inbounds [32 x i8*]* getelementptr inbounds (%struct.Info* @info, i32 0, i32 5), i32 0, i64 %idxprom97 + store i8* getelementptr inbounds (%struct.S1998* @s1998, i32 0, i32 8), i8** %arrayidx98, align 8 + %52 = load i32* %i, align 4 + %idxprom99 = sext i32 %52 to i64 + %arrayidx100 = getelementptr inbounds [32 x i64]* getelementptr inbounds (%struct.Info* @info, i32 0, i32 7), i32 0, i64 %idxprom99 + store i64 1, i64* %arrayidx100, align 8 + %53 = load i32* %i, align 4 + %idxprom101 = sext i32 %53 to i64 + %arrayidx102 = getelementptr inbounds [32 x i64]* getelementptr inbounds (%struct.Info* @info, i32 0, i32 11), i32 0, i64 %idxprom101 + store i64 1, i64* %arrayidx102, align 8 + store i8 52, i8* getelementptr inbounds (%struct.S1998* @s1998, i32 0, i32 8), align 1 + store i8 -86, i8* getelementptr inbounds ([5 x %struct.S1998]* @a1998, i32 0, i64 2, i32 8), align 1 + %54 = load i32* %i, align 4 + %inc103 = add nsw i32 %54, 1 + store i32 %inc103, i32* %i, align 4 + %55 = load i32* %i, align 4 + %idxprom104 = sext i32 %55 to i64 + %arrayidx105 = getelementptr inbounds [32 x i8*]* getelementptr inbounds (%struct.Info* @info, i32 0, i32 5), i32 0, i64 %idxprom104 + store i8* bitcast (i32* getelementptr inbounds (%struct.S1998* @s1998, i32 0, i32 9) to i8*), i8** %arrayidx105, align 8 + %56 = load i32* %i, align 4 + %idxprom106 = sext i32 %56 to i64 + %arrayidx107 = getelementptr inbounds [32 x i64]* getelementptr inbounds (%struct.Info* @info, i32 0, i32 7), i32 0, i64 %idxprom106 + store i64 4, i64* %arrayidx107, align 8 + %57 = load i32* %i, align 4 + %idxprom108 = sext i32 %57 to i64 + %arrayidx109 = getelementptr inbounds [32 x i64]* getelementptr inbounds (%struct.Info* @info, i32 0, i32 11), i32 0, i64 %idxprom108 + store i64 4, i64* %arrayidx109, align 8 + store i32 -54118453, i32* getelementptr inbounds (%struct.S1998* @s1998, i32 0, i32 9), align 4 + store i32 1668755823, i32* getelementptr inbounds ([5 x %struct.S1998]* @a1998, i32 0, i64 2, i32 9), align 4 + %58 = load i32* %i, align 4 + %inc110 = add nsw i32 %58, 1 + store i32 %inc110, i32* %i, align 4 + store i32 %inc110, i32* %tmp + %59 = load i32* %tmp + %60 = load i32* %i, align 4 + store i32 %60, i32* getelementptr inbounds (%struct.Info* @info, i32 0, i32 0), align 4 + %61 = load i32* %j, align 4 + store i32 %61, i32* getelementptr inbounds (%struct.Info* @info, i32 0, i32 1), align 4 + %62 = bitcast %struct.S1998* %agg.tmp111 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %62, i8* bitcast (%struct.S1998* @s1998 to i8*), i64 5168, i32 16, i1 false) + %63 = bitcast %struct.S1998* %agg.tmp112 to i8* + call void 
@llvm.memcpy.p0i8.p0i8.i64(i8* %63, i8* bitcast (%struct.S1998* getelementptr inbounds ([5 x %struct.S1998]* @a1998, i32 0, i64 2) to i8*), i64 5168, i32 16, i1 false) + call void @check1998(%struct.S1998* sret %agg.tmp, %struct.S1998* byval align 16 %agg.tmp111, %struct.S1998* getelementptr inbounds ([5 x %struct.S1998]* @a1998, i32 0, i64 1), %struct.S1998* byval align 16 %agg.tmp112) + call void @checkx1998(%struct.S1998* byval align 16 %agg.tmp) + %64 = bitcast %struct.S1998* %agg.tmp113 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %64, i8* bitcast (%struct.S1998* @s1998 to i8*), i64 5168, i32 16, i1 false) + %65 = bitcast %struct.S1998* %agg.tmp114 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %65, i8* bitcast (%struct.S1998* getelementptr inbounds ([5 x %struct.S1998]* @a1998, i32 0, i64 2) to i8*), i64 5168, i32 16, i1 false) + %66 = bitcast %struct.S1998* %agg.tmp115 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %66, i8* bitcast (%struct.S1998* getelementptr inbounds ([5 x %struct.S1998]* @a1998, i32 0, i64 2) to i8*), i64 5168, i32 16, i1 false) + call void (i32, ...)* @check1998va(i32 signext 1, double 1.000000e+00, %struct.S1998* byval align 16 %agg.tmp113, i64 2, %struct.S1998* byval align 16 %agg.tmp114, %struct.S1998* byval align 16 %agg.tmp115) + %67 = bitcast %struct.S1998* %agg.tmp116 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %67, i8* bitcast (%struct.S1998* @s1998 to i8*), i64 5168, i32 16, i1 false) + %68 = bitcast %struct.S1998* %agg.tmp117 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %68, i8* bitcast (%struct.S1998* @s1998 to i8*), i64 5168, i32 16, i1 false) + %69 = bitcast %struct.S1998* %agg.tmp118 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %69, i8* bitcast (%struct.S1998* getelementptr inbounds ([5 x %struct.S1998]* @a1998, i32 0, i64 2) to i8*), i64 5168, i32 16, i1 false) + %70 = bitcast %struct.S1998* %agg.tmp119 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %70, i8* bitcast (%struct.S1998* @s1998 to i8*), i64 5168, i32 16, i1 false) + call void (i32, ...)* @check1998va(i32 signext 2, %struct.S1998* byval align 16 %agg.tmp116, %struct.S1998* byval align 16 %agg.tmp117, ppc_fp128 0xM40000000000000000000000000000000, %struct.S1998* byval align 16 %agg.tmp118, %struct.S1998* byval align 16 %agg.tmp119) + ret void +} + +declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) + +declare void @check1998(%struct.S1998* sret, %struct.S1998* byval align 16, %struct.S1998*, %struct.S1998* byval align 16) +declare void @check1998va(i32 signext, ...) +declare void @checkx1998(%struct.S1998* byval align 16 %arg) + diff --git a/test/CodeGen/PowerPC/resolvefi-disp.ll b/test/CodeGen/PowerPC/resolvefi-disp.ll new file mode 100644 index 000000000000..ca42bcd767a0 --- /dev/null +++ b/test/CodeGen/PowerPC/resolvefi-disp.ll @@ -0,0 +1,71 @@ +; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -print-after=localstackalloc <%s >%t 2>&1 && FileCheck <%t %s + +; Due to a bug in isFrameOffsetLegal we ended up with resolveFrameIndex creating +; addresses with out-of-range displacements. Verify that this no longer happens. 
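+; (D-form memory accesses encode a signed 16-bit displacement, so anything at +; or above 32768 is unencodable; the patterns below cover every decimal +; magnitude from 32768 upward.)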
+; CHECK-NOT: LD {{3276[8-9]}} +; CHECK-NOT: LD {{327[7-9][0-9]}} +; CHECK-NOT: LD {{32[8-9][0-9][0-9]}} +; CHECK-NOT: LD {{3[3-9][0-9][0-9][0-9]}} +; CHECK-NOT: LD {{[4-9][0-9][0-9][0-9][0-9]}} +; CHECK-NOT: LD {{[1-9][0-9][0-9][0-9][0-9][0-9]+}} + +target datalayout = "e-m:e-i64:64-n32:64" +target triple = "powerpc64le-unknown-linux-gnu" + +%struct.S2760 = type { <2 x float>, %struct.anon, i32, [28 x i8] } +%struct.anon = type { [11 x %struct.anon.0], i64, [6 x { i64, i64 }], [24 x i8] } +%struct.anon.0 = type { [30 x %union.U4DI], i8, [0 x i16], [30 x i8] } +%union.U4DI = type { <4 x i64> } + +@s2760 = external global %struct.S2760 +@fails = external global i32 + +define void @check2760(%struct.S2760* noalias sret %agg.result, %struct.S2760* byval align 16, %struct.S2760* %arg1, %struct.S2760* byval align 16) { +entry: + %arg0 = alloca %struct.S2760, align 32 + %arg2 = alloca %struct.S2760, align 32 + %arg1.addr = alloca %struct.S2760*, align 8 + %ret = alloca %struct.S2760, align 32 + %b1 = alloca %struct.S2760, align 32 + %b2 = alloca %struct.S2760, align 32 + %2 = bitcast %struct.S2760* %arg0 to i8* + %3 = bitcast %struct.S2760* %0 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* %3, i64 11104, i32 16, i1 false) + %4 = bitcast %struct.S2760* %arg2 to i8* + %5 = bitcast %struct.S2760* %1 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %4, i8* %5, i64 11104, i32 16, i1 false) + store %struct.S2760* %arg1, %struct.S2760** %arg1.addr, align 8 + %6 = bitcast %struct.S2760* %ret to i8* + call void @llvm.memset.p0i8.i64(i8* %6, i8 0, i64 11104, i32 32, i1 false) + %7 = bitcast %struct.S2760* %b1 to i8* + call void @llvm.memset.p0i8.i64(i8* %7, i8 0, i64 11104, i32 32, i1 false) + %8 = bitcast %struct.S2760* %b2 to i8* + call void @llvm.memset.p0i8.i64(i8* %8, i8 0, i64 11104, i32 32, i1 false) + %b = getelementptr inbounds %struct.S2760* %arg0, i32 0, i32 1 + %g = getelementptr inbounds %struct.anon* %b, i32 0, i32 1 + %9 = load i64* %g, align 8 + %10 = load i64* getelementptr inbounds (%struct.S2760* @s2760, i32 0, i32 1, i32 1), align 8 + %cmp = icmp ne i64 %9, %10 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %11 = load i32* @fails, align 4 + %inc = add nsw i32 %11, 1 + store i32 %inc, i32* @fails, align 4 + br label %if.end + +if.end: ; preds = %if.then, %entry + %12 = load i64* getelementptr inbounds (%struct.S2760* @s2760, i32 0, i32 1, i32 1), align 8 + %b3 = getelementptr inbounds %struct.S2760* %ret, i32 0, i32 1 + %g4 = getelementptr inbounds %struct.anon* %b3, i32 0, i32 1 + store i64 %12, i64* %g4, align 8 + %13 = bitcast %struct.S2760* %agg.result to i8* + %14 = bitcast %struct.S2760* %ret to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %13, i8* %14, i64 11104, i32 32, i1 false) + ret void +} + +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) + +declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) + diff --git a/test/CodeGen/PowerPC/sections.ll b/test/CodeGen/PowerPC/sections.ll index 0ff4a89ff379..d77dfddd0f90 100644 --- a/test/CodeGen/PowerPC/sections.ll +++ b/test/CodeGen/PowerPC/sections.ll @@ -1,8 +1,12 @@ ; Test to make sure that bss sections are printed with '.section' directive. 
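+; With -relocation-model=pic a .got2 section is emitted as well; the PIC prefix checks that it appears before .bss.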
; RUN: llc < %s -mtriple=powerpc-unknown-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=powerpc-unknown-linux-gnu -relocation-model=pic | FileCheck %s -check-prefix=PIC @A = global i32 0 ; CHECK: .section .bss,"aw",@nobits ; CHECK: .globl A +; PIC: .section .got2,"aw",@progbits +; PIC: .section .bss,"aw",@nobits +; PIC: .globl A diff --git a/test/CodeGen/PowerPC/stack-realign.ll b/test/CodeGen/PowerPC/stack-realign.ll index 1c7a36aeeabf..a59fceb5bdd0 100644 --- a/test/CodeGen/PowerPC/stack-realign.ll +++ b/test/CodeGen/PowerPC/stack-realign.ll @@ -1,5 +1,7 @@ ; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 < %s | FileCheck %s ; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -disable-fp-elim < %s | FileCheck -check-prefix=CHECK-FP %s +; RUN: llc -mtriple=powerpc-unknown-linux-gnu -disable-fp-elim < %s | FileCheck -check-prefix=CHECK-32 %s +; RUN: llc -mtriple=powerpc-unknown-linux-gnu -disable-fp-elim -relocation-model=pic < %s | FileCheck -check-prefix=CHECK-32-PIC %s target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" target triple = "powerpc64-unknown-linux-gnu" @@ -7,6 +9,8 @@ target triple = "powerpc64-unknown-linux-gnu" declare void @bar(i32*) +@barbaz = external global i32 + define void @goo(%struct.s* byval nocapture readonly %a) { entry: %x = alloca [2 x i32], align 32 @@ -16,8 +20,9 @@ entry: store i32 %0, i32* %arrayidx, align 32 %b = getelementptr inbounds %struct.s* %a, i64 0, i32 1 %1 = load i32* %b, align 4 + %2 = load i32* @barbaz, align 4 %arrayidx2 = getelementptr inbounds [2 x i32]* %x, i64 0, i64 1 - store i32 %1, i32* %arrayidx2, align 4 + store i32 %2, i32* %arrayidx2, align 4 call void @bar(i32* %arrayidx) ret void } @@ -69,6 +74,24 @@ entry: ; CHECK-FP-DAG: mtlr 0 ; CHECK-FP: blr +; CHECK-32-LABEL: @goo +; CHECK-32-DAG: mflr 0 +; CHECK-32-DAG: rlwinm [[REG:[0-9]+]], 1, 0, 27, 31 +; CHECK-32-DAG: stw 30, -8(1) +; CHECK-32-DAG: mr 30, 1 +; CHECK-32-DAG: stw 0, 4(1) +; CHECK-32-DAG: subfic 0, [[REG]], -64 +; CHECK-32: stwux 1, 1, 0 + +; CHECK-32-PIC-LABEL: @goo +; CHECK-32-PIC-DAG: mflr 0 +; CHECK-32-PIC-DAG: rlwinm [[REG:[0-9]+]], 1, 0, 27, 31 +; CHECK-32-PIC-DAG: stw 29, -12(1) +; CHECK-32-PIC-DAG: mr 29, 1 +; CHECK-32-PIC-DAG: stw 0, 4(1) +; CHECK-32-PIC-DAG: subfic 0, [[REG]], -64 +; CHECK-32-PIC: stwux 1, 1, 0 + ; The large-frame-size case. define void @hoo(%struct.s* byval nocapture readonly %a) { entry: @@ -99,6 +122,34 @@ entry: ; CHECK: blr +; CHECK-32-LABEL: @hoo + +; CHECK-32-DAG: lis [[REG1:[0-9]+]], -13 +; CHECK-32-DAG: rlwinm [[REG3:[0-9]+]], 1, 0, 27, 31 +; CHECK-32-DAG: mflr 0 +; CHECK-32-DAG: ori [[REG2:[0-9]+]], [[REG1]], 51904 +; CHECK-32-DAG: stw 30, -8(1) +; CHECK-32-DAG: mr 30, 1 +; CHECK-32-DAG: stw 0, 4(1) +; CHECK-32-DAG: subfc 0, [[REG3]], [[REG2]] +; CHECK-32: stwux 1, 1, 0 + +; CHECK-32: blr + +; CHECK-32-PIC-LABEL: @hoo + +; CHECK-32-PIC-DAG: lis [[REG1:[0-9]+]], -13 +; CHECK-32-PIC-DAG: rlwinm [[REG3:[0-9]+]], 1, 0, 27, 31 +; CHECK-32-PIC-DAG: mflr 0 +; CHECK-32-PIC-DAG: ori [[REG2:[0-9]+]], [[REG1]], 51904 +; CHECK-32-PIC-DAG: stw 29, -12(1) +; CHECK-32-PIC-DAG: mr 29, 1 +; CHECK-32-PIC-DAG: stw 0, 4(1) +; CHECK-32-PIC-DAG: subfc 0, [[REG3]], [[REG2]] +; CHECK-32: stwux 1, 1, 0 + +; CHECK-32: blr + ; Make sure that the FP save area is still allocated correctly relative to ; where r30 is saved. 
define void @loo(%struct.s* byval nocapture readonly %a) { diff --git a/test/CodeGen/PowerPC/svr4-redzone.ll b/test/CodeGen/PowerPC/svr4-redzone.ll index 7c51b67aeecb..bee3ac32b648 100644 --- a/test/CodeGen/PowerPC/svr4-redzone.ll +++ b/test/CodeGen/PowerPC/svr4-redzone.ll @@ -36,4 +36,4 @@ entry: ; PPC32: stwu 1, -240(1) ; PPC64-LABEL: bigstack: -; PPC64: stdu 1, -352(1) +; PPC64: stdu 1, -288(1) diff --git a/test/CodeGen/PowerPC/varargs-struct-float.ll b/test/CodeGen/PowerPC/varargs-struct-float.ll index fb1835f580b2..0fd9fc50892e 100644 --- a/test/CodeGen/PowerPC/varargs-struct-float.ll +++ b/test/CodeGen/PowerPC/varargs-struct-float.ll @@ -16,8 +16,8 @@ entry: ret void } -; CHECK: stfs {{[0-9]+}}, 60(1) -; CHECK: ld 4, 56(1) +; CHECK: stfs {{[0-9]+}}, 116(1) +; CHECK: lwz 4, 116(1) ; CHECK: bl declare void @testvaSf1(i32, ...) diff --git a/test/CodeGen/PowerPC/vec_cmp.ll b/test/CodeGen/PowerPC/vec_cmp.ll index 4bce8c80fc6a..2733089fcb10 100644 --- a/test/CodeGen/PowerPC/vec_cmp.ll +++ b/test/CodeGen/PowerPC/vec_cmp.ll @@ -36,7 +36,7 @@ define <8 x i8> @v8si8_cmp(<8 x i8> %x, <8 x i8> %y) nounwind readnone { ; CHECK: vcmpequh {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}} -; Adicional tests for v16i8 since it is a altivec native type +; Additional tests for v16i8 since it is an altivec native type define <16 x i8> @v16si8_cmp_eq(<16 x i8> %x, <16 x i8> %y) nounwind readnone { %cmp = icmp eq <16 x i8> %x, %y @@ -165,7 +165,7 @@ define <4 x i16> @v4si16_cmp(<4 x i16> %x, <4 x i16> %y) nounwind readnone { ; CHECK: vcmpequw {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}} -; Adicional tests for v8i16 since it is an altivec native type +; Additional tests for v8i16 since it is an altivec native type define <8 x i16> @v8si16_cmp_eq(<8 x i16> %x, <8 x i16> %y) nounwind readnone { entry: @@ -298,7 +298,7 @@ define <2 x i32> @v2si32_cmp(<2 x i32> %x, <2 x i32> %y) nounwind readnone { ; CHECK: vcmpequw {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}} -; Adicional tests for v4si32 since it is an altivec native type +; Additional tests for v4si32 since it is an altivec native type define <4 x i32> @v4si32_cmp_eq(<4 x i32> %x, <4 x i32> %y) nounwind readnone { entry: @@ -449,7 +449,7 @@ entry: ; CHECK: vcmpeqfp {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}} -; Adicional tests for v4f32 since it is a altivec native type +; Additional tests for v4f32 since it is an altivec native type define <4 x float> @v4f32_cmp_eq(<4 x float> %x, <4 x float> %y) nounwind readnone { entry: diff --git a/test/CodeGen/PowerPC/vsx.ll b/test/CodeGen/PowerPC/vsx.ll index f5ac577a75da..2f226e1f614c 100644 --- a/test/CodeGen/PowerPC/vsx.ll +++ b/test/CodeGen/PowerPC/vsx.ll @@ -634,7 +634,7 @@ define <2 x i32> @test80(i32 %v) { ; CHECK-DAG: addi [[R1:[0-9]+]], 3, 3 ; CHECK-DAG: addi [[R2:[0-9]+]], 1, -16 ; CHECK-DAG: addi [[R3:[0-9]+]], 3, 2 -; CHECK: std [[R1]], 8([[R2]]) +; CHECK: std [[R1]], -8(1) ; CHECK: std [[R3]], -16(1) ; CHECK: lxvd2x 34, 0, [[R2]] ; CHECK-NOT: stxvd2x diff --git a/test/CodeGen/R600/add_i64.ll b/test/CodeGen/R600/add_i64.ll index c9eaedae4a31..f733d9040421 100644 --- a/test/CodeGen/R600/add_i64.ll +++ b/test/CodeGen/R600/add_i64.ll @@ -70,9 +70,9 @@ define void @test_v2i64_vreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> add } ; SI-LABEL: @trunc_i64_add_to_i32 -; SI: S_LOAD_DWORD [[SREG0:s[0-9]+]], -; SI: S_LOAD_DWORD [[SREG1:s[0-9]+]], -; SI: S_ADD_I32 [[SRESULT:s[0-9]+]], [[SREG1]], [[SREG0]] +; SI: S_LOAD_DWORDX2 s{{\[}}[[SREG0:[0-9]+]] +; SI: S_LOAD_DWORDX2 s{{\[}}[[SREG1:[0-9]+]] +; SI: S_ADD_I32 [[SRESULT:s[0-9]+]], s[[SREG1]], s[[SREG0]] ;
SI-NOT: ADDC ; SI: V_MOV_B32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] ; SI: BUFFER_STORE_DWORD [[VRESULT]], diff --git a/test/CodeGen/R600/and.ll b/test/CodeGen/R600/and.ll index cf1148123a2a..e20037e6bb67 100644 --- a/test/CodeGen/R600/and.ll +++ b/test/CodeGen/R600/and.ll @@ -80,6 +80,15 @@ define void @s_and_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { ret void } +; FIXME: Should use SGPRs +; FUNC-LABEL: @s_and_i1 +; SI: V_AND_B32 +define void @s_and_i1(i1 addrspace(1)* %out, i1 %a, i1 %b) { + %and = and i1 %a, %b + store i1 %and, i1 addrspace(1)* %out + ret void +} + ; FUNC-LABEL: @s_and_constant_i64 ; SI: S_AND_B64 define void @s_and_constant_i64(i64 addrspace(1)* %out, i64 %a) { diff --git a/test/CodeGen/R600/array-ptr-calc-i32.ll b/test/CodeGen/R600/array-ptr-calc-i32.ll index 3230353c36c7..a2b697823519 100644 --- a/test/CodeGen/R600/array-ptr-calc-i32.ll +++ b/test/CodeGen/R600/array-ptr-calc-i32.ll @@ -1,4 +1,5 @@ -; RUN: llc -verify-machineinstrs -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI %s +; RUN: llc -verify-machineinstrs -march=r600 -mcpu=SI -mattr=-promote-alloca < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI %s +; RUN: llc -verify-machineinstrs -march=r600 -mcpu=SI -mattr=+promote-alloca < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s declare i32 @llvm.SI.tid() nounwind readnone declare void @llvm.AMDGPU.barrier.local() nounwind noduplicate @@ -9,13 +10,20 @@ declare void @llvm.AMDGPU.barrier.local() nounwind noduplicate ; be 32-bits. ; SI-LABEL: @test_private_array_ptr_calc: -; SI: V_ADD_I32_e32 [[PTRREG:v[0-9]+]] + +; FIXME: We end up with zero argument for ADD, because +; SIRegisterInfo::eliminateFrameIndex() blindly replaces the frame index +; with the appropriate offset. We should fold this into the store. +; SI-ALLOCA: V_ADD_I32_e32 [[PTRREG:v[0-9]+]], 0, v{{[0-9]+}} +; SI-ALLOCA: BUFFER_STORE_DWORD {{v[0-9]+}}, s[{{[0-9]+:[0-9]+}}], [[PTRREG]] ; ; FIXME: The AMDGPUPromoteAlloca pass should be able to convert this ; alloca to a vector. 
It currently fails because it does not know how ; to interpret: ; getelementptr [4 x i32]* %alloca, i32 1, i32 %b -; SI: DS_WRITE_B32 {{v[0-9]+}}, [[PTRREG]] + +; SI-PROMOTE: V_ADD_I32_e32 [[PTRREG:v[0-9]+]] +; SI-PROMOTE: DS_WRITE_B32 {{v[0-9]+}}, [[PTRREG]] define void @test_private_array_ptr_calc(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) { %alloca = alloca [4 x i32], i32 4, align 16 %tid = call i32 @llvm.SI.tid() readnone diff --git a/test/CodeGen/R600/bfi_int.ll b/test/CodeGen/R600/bfi_int.ll index bbfe856fc930..d18702a1de98 100644 --- a/test/CodeGen/R600/bfi_int.ll +++ b/test/CodeGen/R600/bfi_int.ll @@ -38,7 +38,7 @@ entry: ; R600-CHECK: @bfi_sha256_ma ; R600-CHECK: XOR_INT * [[DST:T[0-9]+\.[XYZW]]], KC0[2].Z, KC0[2].W ; R600-CHECK: BFI_INT * {{T[0-9]+\.[XYZW]}}, {{[[DST]]|PV\.[XYZW]}}, KC0[3].X, KC0[2].W -; SI-CHECK: V_XOR_B32_e64 [[DST:v[0-9]+]], {{[sv][0-9]+, v[0-9]+}} +; SI-CHECK: V_XOR_B32_e32 [[DST:v[0-9]+]], {{[sv][0-9]+, v[0-9]+}} ; SI-CHECK: V_BFI_B32 {{v[0-9]+}}, [[DST]], {{[sv][0-9]+, [sv][0-9]+}} define void @bfi_sha256_ma(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) { diff --git a/test/CodeGen/R600/big_alu.ll b/test/CodeGen/R600/big_alu.ll index 6b683769fe06..511e8ef62951 100644 --- a/test/CodeGen/R600/big_alu.ll +++ b/test/CodeGen/R600/big_alu.ll @@ -101,7 +101,7 @@ IF137: ; preds = %main_body %88 = insertelement <4 x float> %87, float %32, i32 2 %89 = insertelement <4 x float> %88, float 0.000000e+00, i32 3 %90 = call float @llvm.AMDGPU.dp4(<4 x float> %85, <4 x float> %89) - %91 = call float @llvm.AMDGPU.rsq(float %90) + %91 = call float @llvm.AMDGPU.rsq.f32(float %90) %92 = fmul float %30, %91 %93 = fmul float %31, %91 %94 = fmul float %32, %91 @@ -344,7 +344,7 @@ ENDIF136: ; preds = %main_body, %ENDIF15 %325 = insertelement <4 x float> %324, float %318, i32 2 %326 = insertelement <4 x float> %325, float 0.000000e+00, i32 3 %327 = call float @llvm.AMDGPU.dp4(<4 x float> %322, <4 x float> %326) - %328 = call float @llvm.AMDGPU.rsq(float %327) + %328 = call float @llvm.AMDGPU.rsq.f32(float %327) %329 = fmul float %314, %328 %330 = fmul float %316, %328 %331 = fmul float %318, %328 @@ -377,7 +377,7 @@ ENDIF136: ; preds = %main_body, %ENDIF15 %358 = insertelement <4 x float> %357, float %45, i32 2 %359 = insertelement <4 x float> %358, float 0.000000e+00, i32 3 %360 = call float @llvm.AMDGPU.dp4(<4 x float> %355, <4 x float> %359) - %361 = call float @llvm.AMDGPU.rsq(float %360) + %361 = call float @llvm.AMDGPU.rsq.f32(float %360) %362 = fmul float %45, %361 %363 = call float @fabs(float %362) %364 = fmul float %176, 0x3FECCCCCC0000000 @@ -403,7 +403,7 @@ ENDIF136: ; preds = %main_body, %ENDIF15 %384 = insertelement <4 x float> %383, float %45, i32 2 %385 = insertelement <4 x float> %384, float 0.000000e+00, i32 3 %386 = call float @llvm.AMDGPU.dp4(<4 x float> %381, <4 x float> %385) - %387 = call float @llvm.AMDGPU.rsq(float %386) + %387 = call float @llvm.AMDGPU.rsq.f32(float %386) %388 = fmul float %45, %387 %389 = call float @fabs(float %388) %390 = fmul float %176, 0x3FF51EB860000000 @@ -1041,7 +1041,7 @@ IF179: ; preds = %ENDIF175 %896 = insertelement <4 x float> %895, float %45, i32 2 %897 = insertelement <4 x float> %896, float 0.000000e+00, i32 3 %898 = call float @llvm.AMDGPU.dp4(<4 x float> %893, <4 x float> %897) - %899 = call float @llvm.AMDGPU.rsq(float %898) + %899 = call float @llvm.AMDGPU.rsq.f32(float %898) %900 = fmul float %45, %899 %901 = call float @fabs(float %900) %902 = fmul float %176, 
0x3FECCCCCC0000000 @@ -1150,7 +1150,7 @@ ENDIF178: ; preds = %ENDIF175, %IF179 declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1 ; Function Attrs: readnone -declare float @llvm.AMDGPU.rsq(float) #1 +declare float @llvm.AMDGPU.rsq.f32(float) #1 ; Function Attrs: readnone declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) #1 diff --git a/test/CodeGen/R600/concat_vectors.ll b/test/CodeGen/R600/concat_vectors.ll new file mode 100644 index 000000000000..9abc5a627c1c --- /dev/null +++ b/test/CodeGen/R600/concat_vectors.ll @@ -0,0 +1,249 @@ +; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +; FUNC-LABEL: @test_concat_v1i32 +; SI-NOT: MOVREL +define void @test_concat_v1i32(<2 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) nounwind { + %concat = shufflevector <1 x i32> %a, <1 x i32> %b, <2 x i32> <i32 0, i32 1> + store <2 x i32> %concat, <2 x i32> addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: @test_concat_v2i32 +; SI-NOT: MOVREL +define void @test_concat_v2i32(<4 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind { + %concat = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + store <4 x i32> %concat, <4 x i32> addrspace(1)* %out, align 16 + ret void +} + +; FUNC-LABEL: @test_concat_v4i32 +; SI-NOT: MOVREL +define void @test_concat_v4i32(<8 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) nounwind { + %concat = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + store <8 x i32> %concat, <8 x i32> addrspace(1)* %out, align 32 + ret void +} + +; FUNC-LABEL: @test_concat_v8i32 +; SI-NOT: MOVREL +define void @test_concat_v8i32(<16 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) nounwind { + %concat = shufflevector <8 x i32> %a, <8 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + store <16 x i32> %concat, <16 x i32> addrspace(1)* %out, align 64 + ret void +} + +; FUNC-LABEL: @test_concat_v16i32 +; SI-NOT: MOVREL +define void @test_concat_v16i32(<32 x i32> addrspace(1)* %out, <16 x i32> %a, <16 x i32> %b) nounwind { + %concat = shufflevector <16 x i32> %a, <16 x i32> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> + store <32 x i32> %concat, <32 x i32> addrspace(1)* %out, align 128 + ret void +} + +; FUNC-LABEL: @test_concat_v1f32 +; SI-NOT: MOVREL +define void @test_concat_v1f32(<2 x float> addrspace(1)* %out, <1 x float> %a, <1 x float> %b) nounwind { + %concat = shufflevector <1 x float> %a, <1 x float> %b, <2 x i32> <i32 0, i32 1> + store <2 x float> %concat, <2 x float> addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: @test_concat_v2f32 +; SI-NOT: MOVREL +define void @test_concat_v2f32(<4 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind { + %concat = shufflevector <2 x float> %a, <2 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + store <4 x float> %concat, <4 x float> addrspace(1)* %out, align 16 + ret void +} + +; FUNC-LABEL: @test_concat_v4f32 +; SI-NOT: MOVREL +define void @test_concat_v4f32(<8 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) nounwind { + %concat = shufflevector <4 x float> %a, <4 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + store <8 x float> %concat, <8 x float> addrspace(1)* %out, align 32 + ret void +} + +; FUNC-LABEL: @test_concat_v8f32 +; SI-NOT: MOVREL +define void @test_concat_v8f32(<16 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) nounwind { + %concat = shufflevector <8 x float> %a, <8 x float> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + store <16 x float> %concat, <16 x float> addrspace(1)* %out, align 64 + ret void +} + +; FUNC-LABEL: @test_concat_v16f32 +; SI-NOT: MOVREL +define void @test_concat_v16f32(<32 x float> addrspace(1)*
%out, <16 x float> %a, <16 x float> %b) nounwind { + %concat = shufflevector <16 x float> %a, <16 x float> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> + store <32 x float> %concat, <32 x float> addrspace(1)* %out, align 128 + ret void +} + +; FUNC-LABEL: @test_concat_v1i64 +; SI-NOT: MOVREL +define void @test_concat_v1i64(<2 x double> addrspace(1)* %out, <1 x double> %a, <1 x double> %b) nounwind { + %concat = shufflevector <1 x double> %a, <1 x double> %b, <2 x i32> <i32 0, i32 1> + store <2 x double> %concat, <2 x double> addrspace(1)* %out, align 16 + ret void +} + +; FUNC-LABEL: @test_concat_v2i64 +; SI-NOT: MOVREL +define void @test_concat_v2i64(<4 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind { + %concat = shufflevector <2 x double> %a, <2 x double> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + store <4 x double> %concat, <4 x double> addrspace(1)* %out, align 32 + ret void +} + +; FUNC-LABEL: @test_concat_v4i64 +; SI-NOT: MOVREL +define void @test_concat_v4i64(<8 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind { + %concat = shufflevector <4 x double> %a, <4 x double> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + store <8 x double> %concat, <8 x double> addrspace(1)* %out, align 64 + ret void +} + +; FUNC-LABEL: @test_concat_v8i64 +; SI-NOT: MOVREL +define void @test_concat_v8i64(<16 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind { + %concat = shufflevector <8 x double> %a, <8 x double> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + store <16 x double> %concat, <16 x double> addrspace(1)* %out, align 128 + ret void +} + +; FUNC-LABEL: @test_concat_v16i64 +; SI-NOT: MOVREL +define void @test_concat_v16i64(<32 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind { + %concat = shufflevector <16 x double> %a, <16 x double> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> + store <32 x double> %concat, <32 x double> addrspace(1)* %out, align 256 + ret void +} + +; FUNC-LABEL: @test_concat_v1f64 +; SI-NOT: MOVREL +define void @test_concat_v1f64(<2 x double> addrspace(1)* %out, <1 x double> %a, <1 x double> %b) nounwind { + %concat = shufflevector <1 x double> %a, <1 x double> %b, <2 x i32> <i32 0, i32 1> + store <2 x double> %concat, <2 x double> addrspace(1)* %out, align 16 + ret void +} + +; FUNC-LABEL: @test_concat_v2f64 +; SI-NOT: MOVREL +define void @test_concat_v2f64(<4 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind { + %concat = shufflevector <2 x double> %a, <2 x double> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + store <4 x double> %concat, <4 x double> addrspace(1)* %out, align 32 + ret void +} + +; FUNC-LABEL: @test_concat_v4f64 +; SI-NOT: MOVREL +define void @test_concat_v4f64(<8 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind { + %concat = shufflevector <4 x double> %a, <4 x double> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + store <8 x double> %concat, <8 x double> addrspace(1)* %out, align 64 + ret void +} + +; FUNC-LABEL: @test_concat_v8f64 +; SI-NOT: MOVREL +define void @test_concat_v8f64(<16 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind { + %concat = shufflevector <8 x double> %a, <8 x double> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + store <16 x double> %concat, <16 x double> addrspace(1)* %out, align 128 + ret void +} + +; FUNC-LABEL: @test_concat_v16f64 +; SI-NOT: MOVREL +define void @test_concat_v16f64(<32 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind { + %concat = shufflevector <16 x double> %a, <16 x double> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> + store <32 x double> %concat, <32 x double> addrspace(1)* %out, align 256 + ret void +} + +; FUNC-LABEL: @test_concat_v1i1 +; SI-NOT: MOVREL +define void @test_concat_v1i1(<2 x i1> addrspace(1)* %out,
<1 x i1> %a, <1 x i1> %b) nounwind { + %concat = shufflevector <1 x i1> %a, <1 x i1> %b, <2 x i32> <i32 0, i32 1> + store <2 x i1> %concat, <2 x i1> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: @test_concat_v2i1 +; SI-NOT: MOVREL +define void @test_concat_v2i1(<4 x i1> addrspace(1)* %out, <2 x i1> %a, <2 x i1> %b) nounwind { + %concat = shufflevector <2 x i1> %a, <2 x i1> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + store <4 x i1> %concat, <4 x i1> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: @test_concat_v4i1 +; SI-NOT: MOVREL +define void @test_concat_v4i1(<8 x i1> addrspace(1)* %out, <4 x i1> %a, <4 x i1> %b) nounwind { + %concat = shufflevector <4 x i1> %a, <4 x i1> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + store <8 x i1> %concat, <8 x i1> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: @test_concat_v8i1 +; SI-NOT: MOVREL +define void @test_concat_v8i1(<16 x i1> addrspace(1)* %out, <8 x i1> %a, <8 x i1> %b) nounwind { + %concat = shufflevector <8 x i1> %a, <8 x i1> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + store <16 x i1> %concat, <16 x i1> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: @test_concat_v16i1 +; SI-NOT: MOVREL +define void @test_concat_v16i1(<32 x i1> addrspace(1)* %out, <16 x i1> %a, <16 x i1> %b) nounwind { + %concat = shufflevector <16 x i1> %a, <16 x i1> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> + store <32 x i1> %concat, <32 x i1> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: @test_concat_v32i1 +; SI-NOT: MOVREL +define void @test_concat_v32i1(<64 x i1> addrspace(1)* %out, <32 x i1> %a, <32 x i1> %b) nounwind { + %concat = shufflevector <32 x i1> %a, <32 x i1> %b, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> + store <64 x i1> %concat, <64 x i1> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: @test_concat_v1i16 +; SI-NOT: MOVREL +define void @test_concat_v1i16(<2 x i16> addrspace(1)* %out, <1 x i16> %a, <1 x i16> %b) nounwind { + %concat = shufflevector <1 x i16> %a, <1 x i16> %b, <2 x i32> <i32 0, i32 1> + store <2 x i16> %concat, <2 x i16> addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @test_concat_v2i16 +; SI-NOT: MOVREL +define void @test_concat_v2i16(<4 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) nounwind { + %concat = shufflevector <2 x i16> %a, <2 x i16> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + store <4 x i16> %concat, <4 x i16> addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: @test_concat_v4i16 +; SI-NOT: MOVREL +define void @test_concat_v4i16(<8 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b) nounwind { + %concat = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + store <8 x i16> %concat, <8 x i16> addrspace(1)* %out, align 16 + ret void +} + +; FUNC-LABEL: @test_concat_v8i16 +; SI-NOT: MOVREL +define void @test_concat_v8i16(<16 x i16> addrspace(1)* %out, <8 x i16> %a, <8 x i16> %b) nounwind { + %concat = shufflevector <8 x i16> %a, <8 x i16> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + store <16 x i16> %concat, <16 x i16> addrspace(1)* %out, align 32 + ret void +} + +; FUNC-LABEL: @test_concat_v16i16 +; SI-NOT: MOVREL +define void @test_concat_v16i16(<32 x i16> addrspace(1)* %out, <16 x i16> %a, <16 x i16> %b) nounwind { + %concat = shufflevector <16 x i16> %a, <16 x i16> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> + store <32 x i16> %concat, <32 x i16> addrspace(1)* %out, align 64 + ret void +} diff --git a/test/CodeGen/R600/copy-illegal-type.ll b/test/CodeGen/R600/copy-illegal-type.ll new file mode 100644 index 000000000000..f7c2321ae8fe --- /dev/null +++ b/test/CodeGen/R600/copy-illegal-type.ll @@ -0,0 +1,166 @@ +; RUN: llc -march=r600 -mcpu=tahiti < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +; FUNC-LABEL: @test_copy_v4i8 +; SI: BUFFER_LOAD_DWORD [[REG:v[0-9]+]] +; SI: BUFFER_STORE_DWORD [[REG]] +; SI: S_ENDPGM +define void
@test_copy_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind { + %val = load <4 x i8> addrspace(1)* %in, align 4 + store <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @test_copy_v4i8_x2 +; SI: BUFFER_LOAD_DWORD [[REG:v[0-9]+]] +; SI: BUFFER_STORE_DWORD [[REG]] +; SI: BUFFER_STORE_DWORD [[REG]] +; SI: S_ENDPGM +define void @test_copy_v4i8_x2(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind { + %val = load <4 x i8> addrspace(1)* %in, align 4 + store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4 + store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4 + ret void +} + +; FUNC-LABEL: @test_copy_v4i8_x3 +; SI: BUFFER_LOAD_DWORD [[REG:v[0-9]+]] +; SI: BUFFER_STORE_DWORD [[REG]] +; SI: BUFFER_STORE_DWORD [[REG]] +; SI: BUFFER_STORE_DWORD [[REG]] +; SI: S_ENDPGM +define void @test_copy_v4i8_x3(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind { + %val = load <4 x i8> addrspace(1)* %in, align 4 + store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4 + store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4 + store <4 x i8> %val, <4 x i8> addrspace(1)* %out2, align 4 + ret void +} + +; FUNC-LABEL: @test_copy_v4i8_x4 +; SI: BUFFER_LOAD_DWORD [[REG:v[0-9]+]] +; SI: BUFFER_STORE_DWORD [[REG]] +; SI: BUFFER_STORE_DWORD [[REG]] +; SI: BUFFER_STORE_DWORD [[REG]] +; SI: BUFFER_STORE_DWORD [[REG]] +; SI: S_ENDPGM +define void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %out3, <4 x i8> addrspace(1)* %in) nounwind { + %val = load <4 x i8> addrspace(1)* %in, align 4 + store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4 + store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4 + store <4 x i8> %val, <4 x i8> addrspace(1)* %out2, align 4 + store <4 x i8> %val, <4 x i8> addrspace(1)* %out3, align 4 + ret void +} + +; FUNC-LABEL: @test_copy_v4i8_extra_use +; SI: BUFFER_LOAD_UBYTE +; SI: BUFFER_LOAD_UBYTE +; SI: BUFFER_LOAD_UBYTE +; SI: BUFFER_LOAD_UBYTE +; SI-DAG: V_ADD +; SI-DAG: V_ADD +; SI-DAG: V_ADD +; SI-DAG: V_ADD +; SI-DAG: BUFFER_STORE_BYTE +; SI-DAG: BUFFER_STORE_BYTE +; SI-DAG: BUFFER_STORE_BYTE +; SI-DAG: BUFFER_STORE_BYTE +; SI-DAG: BUFFER_STORE_BYTE +; SI-DAG: BUFFER_STORE_BYTE +; SI-DAG: BUFFER_STORE_BYTE +; SI-DAG: BUFFER_STORE_BYTE + +; After scalarizing of v4i8 loads is fixed, the XSI checks below should apply.
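+; No RUN line passes -check-prefix=XSI, so FileCheck ignores these lines for now.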
+; XSI: BUFFER_LOAD_DWORD +; XSI: V_BFE +; XSI: V_ADD +; XSI: V_ADD +; XSI: V_ADD +; XSI: BUFFER_STORE_DWORD +; XSI: BUFFER_STORE_DWORD + +; SI: S_ENDPGM +define void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind { + %val = load <4 x i8> addrspace(1)* %in, align 4 + %add = add <4 x i8> %val, <i8 9, i8 9, i8 9, i8 9> + store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4 + store <4 x i8> %add, <4 x i8> addrspace(1)* %out1, align 4 + ret void +} + +; FUNC-LABEL: @test_copy_v4i8_x2_extra_use +; SI: BUFFER_LOAD_UBYTE +; SI: BUFFER_LOAD_UBYTE +; SI: BUFFER_LOAD_UBYTE +; SI: BUFFER_LOAD_UBYTE +; SI-DAG: V_ADD +; SI-DAG: V_ADD +; SI-DAG: V_ADD +; SI-DAG: V_ADD +; SI-DAG: BUFFER_STORE_BYTE +; SI-DAG: BUFFER_STORE_BYTE +; SI-DAG: BUFFER_STORE_BYTE +; SI-DAG: BUFFER_STORE_BYTE +; SI-DAG: BUFFER_STORE_BYTE +; SI-DAG: BUFFER_STORE_BYTE +; SI-DAG: BUFFER_STORE_BYTE +; SI-DAG: BUFFER_STORE_BYTE +; SI-DAG: BUFFER_STORE_BYTE +; SI-DAG: BUFFER_STORE_BYTE +; SI-DAG: BUFFER_STORE_BYTE +; SI-DAG: BUFFER_STORE_BYTE + +; XSI: BUFFER_LOAD_DWORD +; XSI: BFE +; XSI: BUFFER_STORE_DWORD +; XSI: V_ADD +; XSI: BUFFER_STORE_DWORD +; XSI-NEXT: BUFFER_STORE_DWORD + +; SI: S_ENDPGM +define void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind { + %val = load <4 x i8> addrspace(1)* %in, align 4 + %add = add <4 x i8> %val, <i8 9, i8 9, i8 9, i8 9> + store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4 + store <4 x i8> %add, <4 x i8> addrspace(1)* %out1, align 4 + store <4 x i8> %val, <4 x i8> addrspace(1)* %out2, align 4 + ret void +} + +; FUNC-LABEL: @test_copy_v3i8 +; SI-NOT: BFE +; SI-NOT: BFI +; SI: S_ENDPGM +define void @test_copy_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind { + %val = load <3 x i8> addrspace(1)* %in, align 4 + store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @test_copy_v4i8_volatile_load +; SI: BUFFER_LOAD_UBYTE +; SI: BUFFER_LOAD_UBYTE +; SI: BUFFER_LOAD_UBYTE +; SI: BUFFER_LOAD_UBYTE +; SI: S_ENDPGM +define void @test_copy_v4i8_volatile_load(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind { + %val = load volatile <4 x i8> addrspace(1)* %in, align 4 + store <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @test_copy_v4i8_volatile_store +; SI: BUFFER_LOAD_UBYTE +; SI: BUFFER_LOAD_UBYTE +; SI: BUFFER_LOAD_UBYTE +; SI: BUFFER_LOAD_UBYTE +; SI: BUFFER_STORE_BYTE +; SI: BUFFER_STORE_BYTE +; SI: BUFFER_STORE_BYTE +; SI: BUFFER_STORE_BYTE +; SI: S_ENDPGM +define void @test_copy_v4i8_volatile_store(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind { + %val = load <4 x i8> addrspace(1)* %in, align 4 + store volatile <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/R600/ctlz_zero_undef.ll b/test/CodeGen/R600/ctlz_zero_undef.ll index 15b5188efd60..1340ef98c605 100644 --- a/test/CodeGen/R600/ctlz_zero_undef.ll +++ b/test/CodeGen/R600/ctlz_zero_undef.ll @@ -1,4 +1,5 @@ ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s declare i32 @llvm.ctlz.i32(i32, i1) nounwind readnone declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1) nounwind readnone @@ -10,6 +11,8 @@ declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) nounwind readnone ;
SI: V_MOV_B32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] ; SI: BUFFER_STORE_DWORD [[VRESULT]], ; SI: S_ENDPGM +; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]] +; EG: FFBH_UINT {{\*? *}}[[RESULT]] define void @s_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind { %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone store i32 %ctlz, i32 addrspace(1)* %out, align 4 @@ -21,6 +24,8 @@ define void @s_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nou ; SI: V_FFBH_U32_e32 [[RESULT:v[0-9]+]], [[VAL]] ; SI: BUFFER_STORE_DWORD [[RESULT]], ; SI: S_ENDPGM +; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]] +; EG: FFBH_UINT {{\*? *}}[[RESULT]] define void @v_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { %val = load i32 addrspace(1)* %valptr, align 4 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone @@ -34,6 +39,9 @@ define void @v_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace ; SI: V_FFBH_U32_e32 ; SI: BUFFER_STORE_DWORDX2 ; SI: S_ENDPGM +; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}} +; EG: FFBH_UINT {{\*? *}}[[RESULT]] +; EG: FFBH_UINT {{\*? *}}[[RESULT]] define void @v_ctlz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind { %val = load <2 x i32> addrspace(1)* %valptr, align 8 %ctlz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %val, i1 true) nounwind readnone @@ -49,6 +57,11 @@ define void @v_ctlz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x ; SI: V_FFBH_U32_e32 ; SI: BUFFER_STORE_DWORDX4 ; SI: S_ENDPGM +; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}} +; EG: FFBH_UINT {{\*? *}}[[RESULT]] +; EG: FFBH_UINT {{\*? *}}[[RESULT]] +; EG: FFBH_UINT {{\*? *}}[[RESULT]] +; EG: FFBH_UINT {{\*? *}}[[RESULT]] define void @v_ctlz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind { %val = load <4 x i32> addrspace(1)* %valptr, align 16 %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %val, i1 true) nounwind readnone diff --git a/test/CodeGen/R600/ctpop.ll b/test/CodeGen/R600/ctpop.ll index e4d11e003696..22a3022145f1 100644 --- a/test/CodeGen/R600/ctpop.ll +++ b/test/CodeGen/R600/ctpop.ll @@ -43,7 +43,7 @@ define void @v_ctpop_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noali ; SI: V_MOV_B32_e32 [[VZERO:v[0-9]+]], 0 ; SI: V_BCNT_U32_B32_e32 [[MIDRESULT:v[0-9]+]], [[VAL1]], [[VZERO]] ; SI-NOT: ADD -; SI: V_BCNT_U32_B32_e64 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]] +; SI: V_BCNT_U32_B32_e32 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]] ; SI: BUFFER_STORE_DWORD [[RESULT]], ; SI: S_ENDPGM @@ -252,3 +252,33 @@ define void @v_ctpop_i32_add_vvar_inv(i32 addrspace(1)* noalias %out, i32 addrsp store i32 %add, i32 addrspace(1)* %out, align 4 ret void } + +; FIXME: We currently disallow SALU instructions in all branches, +; but there are some cases when they should be allowed.
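+; Until then, the in-branch test below checks for the VALU form (V_BCNT_U32_B32) rather than S_BCNT.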
+ +; FUNC-LABEL: @ctpop_i32_in_br +; SI: BUFFER_LOAD_DWORD [[VAL:v[0-9]+]], +; SI: V_BCNT_U32_B32_e64 [[RESULT:v[0-9]+]], [[VAL]], 0 +; SI: BUFFER_STORE_DWORD [[RESULT]], +; SI: S_ENDPGM +; EG: BCNT_INT +define void @ctpop_i32_in_br(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %cond) { +entry: + %0 = icmp eq i32 %cond, 0 + br i1 %0, label %if, label %else + +if: + %1 = load i32 addrspace(1)* %in + %2 = call i32 @llvm.ctpop.i32(i32 %1) + br label %endif + +else: + %3 = getelementptr i32 addrspace(1)* %in, i32 1 + %4 = load i32 addrspace(1)* %3 + br label %endif + +endif: + %5 = phi i32 [%2, %if], [%4, %else] + store i32 %5, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/R600/ctpop64.ll b/test/CodeGen/R600/ctpop64.ll index 798d8f55ecc1..b36ecc68d895 100644 --- a/test/CodeGen/R600/ctpop64.ll +++ b/test/CodeGen/R600/ctpop64.ll @@ -89,3 +89,34 @@ define void @v_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> addrs store <4 x i32> %truncctpop, <4 x i32> addrspace(1)* %out, align 16 ret void } + +; FIXME: We currently disallow SALU instructions in all branches, +; but there are some cases when they should be allowed. + +; FUNC-LABEL: @ctpop_i64_in_br +; SI: V_BCNT_U32_B32_e64 [[BCNT_LO:v[0-9]+]], v{{[0-9]+}}, 0 +; SI: V_BCNT_U32_B32_e32 v[[BCNT:[0-9]+]], v{{[0-9]+}}, [[BCNT_LO]] +; SI: V_MOV_B32_e32 v[[ZERO:[0-9]+]], 0 +; SI: BUFFER_STORE_DWORDX2 v[ +; SI: [[BCNT]]:[[ZERO]]] +; SI: S_ENDPGM +define void @ctpop_i64_in_br(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i32 %cond) { +entry: + %0 = icmp eq i32 %cond, 0 + br i1 %0, label %if, label %else + +if: + %1 = load i64 addrspace(1)* %in + %2 = call i64 @llvm.ctpop.i64(i64 %1) + br label %endif + +else: + %3 = getelementptr i64 addrspace(1)* %in, i32 1 + %4 = load i64 addrspace(1)* %3 + br label %endif + +endif: + %5 = phi i64 [%2, %if], [%4, %else] + store i64 %5, i64 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/R600/cttz_zero_undef.ll b/test/CodeGen/R600/cttz_zero_undef.ll index cf44f8e60d01..9c4a3558d094 100644 --- a/test/CodeGen/R600/cttz_zero_undef.ll +++ b/test/CodeGen/R600/cttz_zero_undef.ll @@ -1,4 +1,5 @@ ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s declare i32 @llvm.cttz.i32(i32, i1) nounwind readnone declare <2 x i32> @llvm.cttz.v2i32(<2 x i32>, i1) nounwind readnone @@ -10,6 +11,8 @@ declare <4 x i32> @llvm.cttz.v4i32(<4 x i32>, i1) nounwind readnone ; SI: V_MOV_B32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] ; SI: BUFFER_STORE_DWORD [[VRESULT]], ; SI: S_ENDPGM +; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]] +; EG: FFBL_INT {{\*? *}}[[RESULT]] define void @s_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind { %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone store i32 %cttz, i32 addrspace(1)* %out, align 4 @@ -21,6 +24,8 @@ define void @s_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nou ; SI: V_FFBL_B32_e32 [[RESULT:v[0-9]+]], [[VAL]] ; SI: BUFFER_STORE_DWORD [[RESULT]], ; SI: S_ENDPGM +; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]] +; EG: FFBL_INT {{\*? 
*}}[[RESULT]] define void @v_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { %val = load i32 addrspace(1)* %valptr, align 4 %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone @@ -34,6 +39,9 @@ define void @v_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace ; SI: V_FFBL_B32_e32 ; SI: BUFFER_STORE_DWORDX2 ; SI: S_ENDPGM +; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}} +; EG: FFBL_INT {{\*? *}}[[RESULT]] +; EG: FFBL_INT {{\*? *}}[[RESULT]] define void @v_cttz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind { %val = load <2 x i32> addrspace(1)* %valptr, align 8 %cttz = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %val, i1 true) nounwind readnone @@ -49,6 +57,11 @@ define void @v_cttz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x ; SI: V_FFBL_B32_e32 ; SI: BUFFER_STORE_DWORDX4 ; SI: S_ENDPGM +; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}} +; EG: FFBL_INT {{\*? *}}[[RESULT]] +; EG: FFBL_INT {{\*? *}}[[RESULT]] +; EG: FFBL_INT {{\*? *}}[[RESULT]] +; EG: FFBL_INT {{\*? *}}[[RESULT]] define void @v_cttz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind { %val = load <4 x i32> addrspace(1)* %valptr, align 16 %cttz = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %val, i1 true) nounwind readnone diff --git a/test/CodeGen/R600/cvt_f32_ubyte.ll b/test/CodeGen/R600/cvt_f32_ubyte.ll index fe97a4485626..06a601065c3e 100644 --- a/test/CodeGen/R600/cvt_f32_ubyte.ll +++ b/test/CodeGen/R600/cvt_f32_ubyte.ll @@ -43,7 +43,11 @@ define void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> } ; SI-LABEL: @load_v4i8_to_v4f32: -; SI: BUFFER_LOAD_DWORD [[LOADREG:v[0-9]+]], +; We can't use BUFFER_LOAD_DWORD here, because the load is byte aligned, and +; BUFFER_LOAD_DWORD requires dword alignment. 
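+; The checks below match the legalized form: two halfword loads combined with V_OR_B32 into [[LOADREG]].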
+; SI: BUFFER_LOAD_USHORT +; SI: BUFFER_LOAD_USHORT +; SI: V_OR_B32_e32 [[LOADREG:v[0-9]+]] ; SI-NOT: BFE ; SI-NOT: LSHR ; SI-DAG: V_CVT_F32_UBYTE3_e32 v[[HIRESULT:[0-9]+]], [[LOADREG]] diff --git a/test/CodeGen/R600/default-fp-mode.ll b/test/CodeGen/R600/default-fp-mode.ll new file mode 100644 index 000000000000..b24a7a246fda --- /dev/null +++ b/test/CodeGen/R600/default-fp-mode.ll @@ -0,0 +1,29 @@ +; RUN: llc -march=r600 -mcpu=SI -mattr=-fp32-denormals,+fp64-denormals < %s | FileCheck -check-prefix=FP64-DENORMAL -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=SI -mattr=+fp32-denormals,-fp64-denormals < %s | FileCheck -check-prefix=FP32-DENORMAL -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=SI -mattr=+fp32-denormals,+fp64-denormals < %s | FileCheck -check-prefix=BOTH-DENORMAL -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=SI -mattr=-fp32-denormals,-fp64-denormals < %s | FileCheck -check-prefix=NO-DENORMAL -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=DEFAULT -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=SI -mattr=-fp32-denormals < %s | FileCheck -check-prefix=DEFAULT -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=SI -mattr=+fp64-denormals < %s | FileCheck -check-prefix=DEFAULT -check-prefix=FUNC %s + +; FUNC-LABEL: @test_kernel + +; DEFAULT: FloatMode: 192 +; DEFAULT: IeeeMode: 0 + +; FP64-DENORMAL: FloatMode: 192 +; FP64-DENORMAL: IeeeMode: 0 + +; FP32-DENORMAL: FloatMode: 48 +; FP32-DENORMAL: IeeeMode: 0 + +; BOTH-DENORMAL: FloatMode: 240 +; BOTH-DENORMAL: IeeeMode: 0 + +; NO-DENORMAL: FloatMode: 0 +; NO-DENORMAL: IeeeMode: 0 +define void @test_kernel(float addrspace(1)* %out0, double addrspace(1)* %out1) nounwind { + store float 0.0, float addrspace(1)* %out0 + store double 0.0, double addrspace(1)* %out1 + ret void +} diff --git a/test/CodeGen/R600/fcmp64.ll b/test/CodeGen/R600/fcmp64.ll index bcc7a8c8567a..8cbe9f686648 100644 --- a/test/CodeGen/R600/fcmp64.ll +++ b/test/CodeGen/R600/fcmp64.ll @@ -53,7 +53,7 @@ define void @fge_f64(double addrspace(1)* %out, double addrspace(1)* %in1, } ; CHECK: @fne_f64 -; CHECK: V_CMP_NEQ_F64_e64 {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} +; CHECK: V_CMP_NEQ_F64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} define void @fne_f64(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2) { diff --git a/test/CodeGen/R600/fdiv.ll b/test/CodeGen/R600/fdiv.ll index 3d21524de0f4..20db65c5eb60 100644 --- a/test/CodeGen/R600/fdiv.ll +++ b/test/CodeGen/R600/fdiv.ll @@ -1,20 +1,37 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK -; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 %s +; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s ; These tests check that fdiv is expanded correctly and also test that the ; scheduler is scheduling the RECIP_IEEE and MUL_IEEE instructions in separate ; instruction groups. 
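; On these targets fdiv is expanded to a reciprocal followed by a multiply, x/y -> x * (1/y), hence the RECIP_IEEE/V_RCP_F32 and MUL_IEEE/V_MUL_F32 pairs in the checks.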
-; R600-CHECK: @fdiv_v2f32 -; R600-CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Z -; R600-CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Y -; R600-CHECK-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS -; R600-CHECK-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS -; SI-CHECK: @fdiv_v2f32 -; SI-CHECK-DAG: V_RCP_F32 -; SI-CHECK-DAG: V_MUL_F32 -; SI-CHECK-DAG: V_RCP_F32 -; SI-CHECK-DAG: V_MUL_F32 +; FUNC-LABEL: @fdiv_f32 +; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Z +; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Y +; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS +; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS + +; SI-DAG: V_RCP_F32 +; SI-DAG: V_MUL_F32 +define void @fdiv_f32(float addrspace(1)* %out, float %a, float %b) { +entry: + %0 = fdiv float %a, %b + store float %0, float addrspace(1)* %out + ret void +} + + + +; FUNC-LABEL: @fdiv_v2f32 +; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Z +; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Y +; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS +; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS + +; SI-DAG: V_RCP_F32 +; SI-DAG: V_MUL_F32 +; SI-DAG: V_RCP_F32 +; SI-DAG: V_MUL_F32 define void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) { entry: %0 = fdiv <2 x float> %a, %b @@ -22,24 +39,24 @@ entry: ret void } -; R600-CHECK: @fdiv_v4f32 -; R600-CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; R600-CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; R600-CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; R600-CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; R600-CHECK-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS -; R600-CHECK-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS -; R600-CHECK-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS -; R600-CHECK-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS -; SI-CHECK: @fdiv_v4f32 -; SI-CHECK-DAG: V_RCP_F32 -; SI-CHECK-DAG: V_MUL_F32 -; SI-CHECK-DAG: V_RCP_F32 -; SI-CHECK-DAG: V_MUL_F32 -; SI-CHECK-DAG: V_RCP_F32 -; SI-CHECK-DAG: V_MUL_F32 -; SI-CHECK-DAG: V_RCP_F32 -; SI-CHECK-DAG: V_MUL_F32 +; FUNC-LABEL: @fdiv_v4f32 +; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS +; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS +; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS +; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS + +; SI-DAG: V_RCP_F32 +; SI-DAG: V_MUL_F32 +; SI-DAG: V_RCP_F32 +; SI-DAG: V_MUL_F32 +; SI-DAG: V_RCP_F32 +; SI-DAG: V_MUL_F32 +; SI-DAG: V_RCP_F32 +; SI-DAG: V_MUL_F32 define void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { %b_ptr = getelementptr <4 x float> addrspace(1)* %in, i32 1 %a = load <4 x float> addrspace(1) * %in diff --git a/test/CodeGen/R600/fma.ll b/test/CodeGen/R600/fma.ll index 51e9d29a5ca2..d72ffeceb921 100644 --- a/test/CodeGen/R600/fma.ll +++ b/test/CodeGen/R600/fma.ll @@ -1,8 +1,15 @@ -; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s +; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI 
-check-prefix=FUNC %s -; CHECK: @fma_f32 -; CHECK: V_FMA_F32 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}} +declare float @llvm.fma.f32(float, float, float) nounwind readnone +declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) nounwind readnone +declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) nounwind readnone +declare double @llvm.fma.f64(double, double, double) nounwind readnone +declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) nounwind readnone +declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>) nounwind readnone + +; FUNC-LABEL: @fma_f32 +; SI: V_FMA_F32 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}} define void @fma_f32(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2, float addrspace(1)* %in3) { %r0 = load float addrspace(1)* %in1 @@ -13,11 +20,36 @@ define void @fma_f32(float addrspace(1)* %out, float addrspace(1)* %in1, ret void } -declare float @llvm.fma.f32(float, float, float) +; FUNC-LABEL: @fma_v2f32 +; SI: V_FMA_F32 +; SI: V_FMA_F32 +define void @fma_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in1, + <2 x float> addrspace(1)* %in2, <2 x float> addrspace(1)* %in3) { + %r0 = load <2 x float> addrspace(1)* %in1 + %r1 = load <2 x float> addrspace(1)* %in2 + %r2 = load <2 x float> addrspace(1)* %in3 + %r3 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %r0, <2 x float> %r1, <2 x float> %r2) + store <2 x float> %r3, <2 x float> addrspace(1)* %out + ret void +} -; CHECK: @fma_f64 -; CHECK: V_FMA_F64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} +; FUNC-LABEL: @fma_v4f32 +; SI: V_FMA_F32 +; SI: V_FMA_F32 +; SI: V_FMA_F32 +; SI: V_FMA_F32 +define void @fma_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in1, + <4 x float> addrspace(1)* %in2, <4 x float> addrspace(1)* %in3) { + %r0 = load <4 x float> addrspace(1)* %in1 + %r1 = load <4 x float> addrspace(1)* %in2 + %r2 = load <4 x float> addrspace(1)* %in3 + %r3 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %r0, <4 x float> %r1, <4 x float> %r2) + store <4 x float> %r3, <4 x float> addrspace(1)* %out + ret void +} +; FUNC-LABEL: @fma_f64 +; SI: V_FMA_F64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} define void @fma_f64(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2, double addrspace(1)* %in3) { %r0 = load double addrspace(1)* %in1 @@ -28,4 +60,30 @@ define void @fma_f64(double addrspace(1)* %out, double addrspace(1)* %in1, ret void } -declare double @llvm.fma.f64(double, double, double) +; FUNC-LABEL: @fma_v2f64 +; SI: V_FMA_F64 +; SI: V_FMA_F64 +define void @fma_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1, + <2 x double> addrspace(1)* %in2, <2 x double> addrspace(1)* %in3) { + %r0 = load <2 x double> addrspace(1)* %in1 + %r1 = load <2 x double> addrspace(1)* %in2 + %r2 = load <2 x double> addrspace(1)* %in3 + %r3 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %r0, <2 x double> %r1, <2 x double> %r2) + store <2 x double> %r3, <2 x double> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: @fma_v4f64 +; SI: V_FMA_F64 +; SI: V_FMA_F64 +; SI: V_FMA_F64 +; SI: V_FMA_F64 +define void @fma_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in1, + <4 x double> addrspace(1)* %in2, <4 x double> addrspace(1)* %in3) { + %r0 = load <4 x double> addrspace(1)* %in1 + %r1 = load <4 x double> addrspace(1)* %in2 + %r2 = load <4 x 
double> addrspace(1)* %in3 + %r3 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %r0, <4 x double> %r1, <4 x double> %r2) + store <4 x double> %r3, <4 x double> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/R600/fnearbyint.ll b/test/CodeGen/R600/fnearbyint.ll new file mode 100644 index 000000000000..1c1d7315189f --- /dev/null +++ b/test/CodeGen/R600/fnearbyint.ll @@ -0,0 +1,57 @@ +; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s +; RUN: llc -march=r600 -mcpu=bonaire -verify-machineinstrs < %s + +; This should have exactly the same output as the test for rint, +; so there is no need to check anything. + +declare float @llvm.nearbyint.f32(float) #0 +declare <2 x float> @llvm.nearbyint.v2f32(<2 x float>) #0 +declare <4 x float> @llvm.nearbyint.v4f32(<4 x float>) #0 +declare double @llvm.nearbyint.f64(double) #0 +declare <2 x double> @llvm.nearbyint.v2f64(<2 x double>) #0 +declare <4 x double> @llvm.nearbyint.v4f64(<4 x double>) #0 + + +define void @fnearbyint_f32(float addrspace(1)* %out, float %in) #1 { +entry: + %0 = call float @llvm.nearbyint.f32(float %in) + store float %0, float addrspace(1)* %out + ret void +} + +define void @fnearbyint_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) #1 { +entry: + %0 = call <2 x float> @llvm.nearbyint.v2f32(<2 x float> %in) + store <2 x float> %0, <2 x float> addrspace(1)* %out + ret void +} + +define void @fnearbyint_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) #1 { +entry: + %0 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %in) + store <4 x float> %0, <4 x float> addrspace(1)* %out + ret void +} + +define void @nearbyint_f64(double addrspace(1)* %out, double %in) { +entry: + %0 = call double @llvm.nearbyint.f64(double %in) + store double %0, double addrspace(1)* %out + ret void +} +define void @nearbyint_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) { +entry: + %0 = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %in) + store <2 x double> %0, <2 x double> addrspace(1)* %out + ret void +} + +define void @nearbyint_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) { +entry: + %0 = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> %in) + store <4 x double> %0, <4 x double> addrspace(1)* %out + ret void +} + +attributes #0 = { nounwind readonly } +attributes #1 = { nounwind } diff --git a/test/CodeGen/R600/fp16_to_fp.ll b/test/CodeGen/R600/fp16_to_fp.ll new file mode 100644 index 000000000000..777eadc34ead --- /dev/null +++ b/test/CodeGen/R600/fp16_to_fp.ll @@ -0,0 +1,28 @@ +; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +declare float @llvm.convert.from.fp16.f32(i16) nounwind readnone +declare double @llvm.convert.from.fp16.f64(i16) nounwind readnone + +; SI-LABEL: @test_convert_fp16_to_fp32: +; SI: BUFFER_LOAD_USHORT [[VAL:v[0-9]+]] +; SI: V_CVT_F32_F16_e32 [[RESULT:v[0-9]+]], [[VAL]] +; SI: BUFFER_STORE_DWORD [[RESULT]] +define void @test_convert_fp16_to_fp32(float addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind { + %val = load i16 addrspace(1)* %in, align 2 + %cvt = call float @llvm.convert.from.fp16.f32(i16 %val) nounwind readnone + store float %cvt, float addrspace(1)* %out, align 4 + ret void +} + + +; SI-LABEL: @test_convert_fp16_to_fp64: +; SI: BUFFER_LOAD_USHORT [[VAL:v[0-9]+]] +; SI: V_CVT_F32_F16_e32 [[RESULT32:v[0-9]+]], [[VAL]] +; SI: V_CVT_F64_F32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[RESULT32]] +; SI: BUFFER_STORE_DWORDX2 [[RESULT]] +define void @test_convert_fp16_to_fp64(double addrspace(1)* noalias
%out, i16 addrspace(1)* noalias %in) nounwind { + %val = load i16 addrspace(1)* %in, align 2 + %cvt = call double @llvm.convert.from.fp16.f64(i16 %val) nounwind readnone + store double %cvt, double addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/R600/fp32_to_fp16.ll b/test/CodeGen/R600/fp32_to_fp16.ll new file mode 100644 index 000000000000..6b5ff00b5f60 --- /dev/null +++ b/test/CodeGen/R600/fp32_to_fp16.ll @@ -0,0 +1,14 @@ +; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +declare i16 @llvm.convert.to.fp16.f32(float) nounwind readnone + +; SI-LABEL: @test_convert_fp32_to_fp16: +; SI: BUFFER_LOAD_DWORD [[VAL:v[0-9]+]] +; SI: V_CVT_F16_F32_e32 [[RESULT:v[0-9]+]], [[VAL]] +; SI: BUFFER_STORE_SHORT [[RESULT]] +define void @test_convert_fp32_to_fp16(i16 addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { + %val = load float addrspace(1)* %in, align 4 + %cvt = call i16 @llvm.convert.to.fp16.f32(float %val) nounwind readnone + store i16 %cvt, i16 addrspace(1)* %out, align 2 + ret void +} diff --git a/test/CodeGen/R600/fp_to_sint.ll b/test/CodeGen/R600/fp_to_sint.ll index 8302b4f8233e..235045aaaaaa 100644 --- a/test/CodeGen/R600/fp_to_sint.ll +++ b/test/CodeGen/R600/fp_to_sint.ll @@ -1,31 +1,206 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK -; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK - -; R600-CHECK: @fp_to_sint_v2i32 -; R600-CHECK: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} -; R600-CHECK: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} -; SI-CHECK: @fp_to_sint_v2i32 -; SI-CHECK: V_CVT_I32_F32_e32 -; SI-CHECK: V_CVT_I32_F32_e32 +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s --check-prefix=EG --check-prefix=FUNC +; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck %s --check-prefix=SI --check-prefix=FUNC + +; FUNC-LABEL: @fp_to_sint_v2i32 +; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} +; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} +; SI: V_CVT_I32_F32_e32 +; SI: V_CVT_I32_F32_e32 define void @fp_to_sint_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> %in) { %result = fptosi <2 x float> %in to <2 x i32> store <2 x i32> %result, <2 x i32> addrspace(1)* %out ret void } -; R600-CHECK: @fp_to_sint_v4i32 -; R600-CHECK: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} -; R600-CHECK: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW]}} -; R600-CHECK: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} -; R600-CHECK: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} -; SI-CHECK: @fp_to_sint_v4i32 -; SI-CHECK: V_CVT_I32_F32_e32 -; SI-CHECK: V_CVT_I32_F32_e32 -; SI-CHECK: V_CVT_I32_F32_e32 -; SI-CHECK: V_CVT_I32_F32_e32 +; FUNC-LABEL: @fp_to_sint_v4i32 +; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} +; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW]}} +; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} +; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} +; SI: V_CVT_I32_F32_e32 +; SI: V_CVT_I32_F32_e32 +; SI: V_CVT_I32_F32_e32 +; SI: V_CVT_I32_F32_e32 define void @fp_to_sint_v4i32(<4 x i32> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { %value = load <4 x float> addrspace(1) * %in %result = fptosi <4 x float> %value to <4 x i32> store <4 x i32> %result, <4 x i32> addrspace(1)* %out ret void } + +; FUNC-LABEL: @fp_to_sint_i64 + +; EG-DAG: AND_INT +; EG-DAG: LSHR +; EG-DAG: SUB_INT +; EG-DAG: AND_INT +; EG-DAG: ASHR +; EG-DAG: AND_INT +; EG-DAG: 
OR_INT +; EG-DAG: SUB_INT +; EG-DAG: LSHL +; EG-DAG: LSHL +; EG-DAG: SUB_INT +; EG-DAG: LSHR +; EG-DAG: LSHR +; EG-DAG: SETGT_UINT +; EG-DAG: SETGT_INT +; EG-DAG: XOR_INT +; EG-DAG: XOR_INT +; EG: SUB_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDGE_INT +; EG-DAG: CNDGE_INT + +; Check that the compiler doesn't crash with a "cannot select" error +; SI: S_ENDPGM +define void @fp_to_sint_i64 (i64 addrspace(1)* %out, float %in) { +entry: + %0 = fptosi float %in to i64 + store i64 %0, i64 addrspace(1)* %out + ret void +} + +; FUNC: @fp_to_sint_v2i64 +; EG-DAG: AND_INT +; EG-DAG: LSHR +; EG-DAG: SUB_INT +; EG-DAG: AND_INT +; EG-DAG: ASHR +; EG-DAG: AND_INT +; EG-DAG: OR_INT +; EG-DAG: SUB_INT +; EG-DAG: LSHL +; EG-DAG: LSHL +; EG-DAG: SUB_INT +; EG-DAG: LSHR +; EG-DAG: LSHR +; EG-DAG: SETGT_UINT +; EG-DAG: SETGT_INT +; EG-DAG: XOR_INT +; EG-DAG: XOR_INT +; EG-DAG: SUB_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDGE_INT +; EG-DAG: CNDGE_INT +; EG-DAG: AND_INT +; EG-DAG: LSHR +; EG-DAG: SUB_INT +; EG-DAG: AND_INT +; EG-DAG: ASHR +; EG-DAG: AND_INT +; EG-DAG: OR_INT +; EG-DAG: SUB_INT +; EG-DAG: LSHL +; EG-DAG: LSHL +; EG-DAG: SUB_INT +; EG-DAG: LSHR +; EG-DAG: LSHR +; EG-DAG: SETGT_UINT +; EG-DAG: SETGT_INT +; EG-DAG: XOR_INT +; EG-DAG: XOR_INT +; EG-DAG: SUB_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDGE_INT +; EG-DAG: CNDGE_INT + +; SI: S_ENDPGM +define void @fp_to_sint_v2i64(<2 x i64> addrspace(1)* %out, <2 x float> %x) { + %conv = fptosi <2 x float> %x to <2 x i64> + store <2 x i64> %conv, <2 x i64> addrspace(1)* %out + ret void +} + +; FUNC: @fp_to_sint_v4i64 +; EG-DAG: AND_INT +; EG-DAG: LSHR +; EG-DAG: SUB_INT +; EG-DAG: AND_INT +; EG-DAG: ASHR +; EG-DAG: AND_INT +; EG-DAG: OR_INT +; EG-DAG: SUB_INT +; EG-DAG: LSHL +; EG-DAG: LSHL +; EG-DAG: SUB_INT +; EG-DAG: LSHR +; EG-DAG: LSHR +; EG-DAG: SETGT_UINT +; EG-DAG: SETGT_INT +; EG-DAG: XOR_INT +; EG-DAG: XOR_INT +; EG-DAG: SUB_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDGE_INT +; EG-DAG: CNDGE_INT +; EG-DAG: AND_INT +; EG-DAG: LSHR +; EG-DAG: SUB_INT +; EG-DAG: AND_INT +; EG-DAG: ASHR +; EG-DAG: AND_INT +; EG-DAG: OR_INT +; EG-DAG: SUB_INT +; EG-DAG: LSHL +; EG-DAG: LSHL +; EG-DAG: SUB_INT +; EG-DAG: LSHR +; EG-DAG: LSHR +; EG-DAG: SETGT_UINT +; EG-DAG: SETGT_INT +; EG-DAG: XOR_INT +; EG-DAG: XOR_INT +; EG-DAG: SUB_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDGE_INT +; EG-DAG: CNDGE_INT +; EG-DAG: AND_INT +; EG-DAG: LSHR +; EG-DAG: SUB_INT +; EG-DAG: AND_INT +; EG-DAG: ASHR +; EG-DAG: AND_INT +; EG-DAG: OR_INT +; EG-DAG: SUB_INT +; EG-DAG: LSHL +; EG-DAG: LSHL +; EG-DAG: SUB_INT +; EG-DAG: LSHR +; EG-DAG: LSHR +; EG-DAG: SETGT_UINT +; EG-DAG: SETGT_INT +; EG-DAG: XOR_INT +; EG-DAG: XOR_INT +; EG-DAG: SUB_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDGE_INT +; EG-DAG: CNDGE_INT +; EG-DAG: AND_INT +; EG-DAG: LSHR +; EG-DAG: SUB_INT +; EG-DAG: AND_INT +; EG-DAG: ASHR +; EG-DAG: AND_INT +; EG-DAG: OR_INT +; EG-DAG: SUB_INT +; EG-DAG: LSHL +; EG-DAG: LSHL +; EG-DAG: SUB_INT +; EG-DAG: LSHR +; EG-DAG: LSHR +; EG-DAG: SETGT_UINT +; EG-DAG: SETGT_INT +; EG-DAG: XOR_INT +; EG-DAG: XOR_INT +; EG-DAG: SUB_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDGE_INT +; EG-DAG: CNDGE_INT + +; SI: S_ENDPGM +define void @fp_to_sint_v4i64(<4 x i64> addrspace(1)* %out, <4 x float> %x) { + %conv = fptosi <4 x float> %x to <4 x i64> + store <4 x i64> %conv, <4 x i64> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/R600/fp_to_sint_i64.ll b/test/CodeGen/R600/fp_to_sint_i64.ll deleted file mode 100644 index ec3e19804c57..000000000000 --- a/test/CodeGen/R600/fp_to_sint_i64.ll +++ /dev/null @@ -1,12 +0,0 @@ -; FIXME: 
Merge into fp_to_sint.ll when EG/NI supports 64-bit types -; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI %s - -; SI-LABEL: @fp_to_sint_i64 -; Check that the compiler doesn't crash with a "cannot select" error -; SI: S_ENDPGM -define void @fp_to_sint_i64 (i64 addrspace(1)* %out, float %in) { -entry: - %0 = fptosi float %in to i64 - store i64 %0, i64 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/fp_to_uint.ll b/test/CodeGen/R600/fp_to_uint.ll index 77db43b39c5f..a13018bdfecf 100644 --- a/test/CodeGen/R600/fp_to_uint.ll +++ b/test/CodeGen/R600/fp_to_uint.ll @@ -1,12 +1,11 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK -; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s --check-prefix=EG --check-prefix=FUNC +; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck %s --check-prefix=SI --check-prefix=FUNC -; R600-CHECK: @fp_to_uint_v2i32 -; R600-CHECK: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} -; R600-CHECK: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; SI-CHECK: @fp_to_uint_v2i32 -; SI-CHECK: V_CVT_U32_F32_e32 -; SI-CHECK: V_CVT_U32_F32_e32 +; FUNC-LABEL: @fp_to_uint_v2i32 +; EG: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} +; EG: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; SI: V_CVT_U32_F32_e32 +; SI: V_CVT_U32_F32_e32 define void @fp_to_uint_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> %in) { %result = fptoui <2 x float> %in to <2 x i32> @@ -14,16 +13,15 @@ define void @fp_to_uint_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> %in) { ret void } -; R600-CHECK: @fp_to_uint_v4i32 -; R600-CHECK: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} -; R600-CHECK: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; R600-CHECK: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} -; R600-CHECK: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} -; SI-CHECK: @fp_to_uint_v4i32 -; SI-CHECK: V_CVT_U32_F32_e32 -; SI-CHECK: V_CVT_U32_F32_e32 -; SI-CHECK: V_CVT_U32_F32_e32 -; SI-CHECK: V_CVT_U32_F32_e32 +; FUNC-LABEL: @fp_to_uint_v4i32 +; EG: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} +; EG: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} +; EG: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} +; SI: V_CVT_U32_F32_e32 +; SI: V_CVT_U32_F32_e32 +; SI: V_CVT_U32_F32_e32 +; SI: V_CVT_U32_F32_e32 define void @fp_to_uint_v4i32(<4 x i32> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { %value = load <4 x float> addrspace(1) * %in @@ -31,3 +29,179 @@ define void @fp_to_uint_v4i32(<4 x i32> addrspace(1)* %out, <4 x float> addrspac store <4 x i32> %result, <4 x i32> addrspace(1)* %out ret void } +; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG --check-prefix=FUNC %s + +; FUNC: @fp_to_uint_i64 +; EG-DAG: AND_INT +; EG-DAG: LSHR +; EG-DAG: SUB_INT +; EG-DAG: AND_INT +; EG-DAG: ASHR +; EG-DAG: AND_INT +; EG-DAG: OR_INT +; EG-DAG: SUB_INT +; EG-DAG: LSHL +; EG-DAG: LSHL +; EG-DAG: SUB_INT +; EG-DAG: LSHR +; EG-DAG: LSHR +; EG-DAG: SETGT_UINT +; EG-DAG: SETGT_INT +; EG-DAG: XOR_INT +; EG-DAG: XOR_INT +; EG: SUB_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDGE_INT +; EG-DAG: CNDGE_INT + +; SI: S_ENDPGM +define void 
@fp_to_uint_i64(i64 addrspace(1)* %out, float %x) { + %conv = fptoui float %x to i64 + store i64 %conv, i64 addrspace(1)* %out + ret void +} + +; FUNC: @fp_to_uint_v2i64 +; EG-DAG: AND_INT +; EG-DAG: LSHR +; EG-DAG: SUB_INT +; EG-DAG: AND_INT +; EG-DAG: ASHR +; EG-DAG: AND_INT +; EG-DAG: OR_INT +; EG-DAG: SUB_INT +; EG-DAG: LSHL +; EG-DAG: LSHL +; EG-DAG: SUB_INT +; EG-DAG: LSHR +; EG-DAG: LSHR +; EG-DAG: SETGT_UINT +; EG-DAG: SETGT_INT +; EG-DAG: XOR_INT +; EG-DAG: XOR_INT +; EG-DAG: SUB_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDGE_INT +; EG-DAG: CNDGE_INT +; EG-DAG: AND_INT +; EG-DAG: LSHR +; EG-DAG: SUB_INT +; EG-DAG: AND_INT +; EG-DAG: ASHR +; EG-DAG: AND_INT +; EG-DAG: OR_INT +; EG-DAG: SUB_INT +; EG-DAG: LSHL +; EG-DAG: LSHL +; EG-DAG: SUB_INT +; EG-DAG: LSHR +; EG-DAG: LSHR +; EG-DAG: SETGT_UINT +; EG-DAG: SETGT_INT +; EG-DAG: XOR_INT +; EG-DAG: XOR_INT +; EG-DAG: SUB_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDGE_INT +; EG-DAG: CNDGE_INT + +; SI: S_ENDPGM +define void @fp_to_uint_v2i64(<2 x i64> addrspace(1)* %out, <2 x float> %x) { + %conv = fptoui <2 x float> %x to <2 x i64> + store <2 x i64> %conv, <2 x i64> addrspace(1)* %out + ret void +} + +; FUNC: @fp_to_uint_v4i64 +; EG-DAG: AND_INT +; EG-DAG: LSHR +; EG-DAG: SUB_INT +; EG-DAG: AND_INT +; EG-DAG: ASHR +; EG-DAG: AND_INT +; EG-DAG: OR_INT +; EG-DAG: SUB_INT +; EG-DAG: LSHL +; EG-DAG: LSHL +; EG-DAG: SUB_INT +; EG-DAG: LSHR +; EG-DAG: LSHR +; EG-DAG: SETGT_UINT +; EG-DAG: SETGT_INT +; EG-DAG: XOR_INT +; EG-DAG: XOR_INT +; EG-DAG: SUB_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDGE_INT +; EG-DAG: CNDGE_INT +; EG-DAG: AND_INT +; EG-DAG: LSHR +; EG-DAG: SUB_INT +; EG-DAG: AND_INT +; EG-DAG: ASHR +; EG-DAG: AND_INT +; EG-DAG: OR_INT +; EG-DAG: SUB_INT +; EG-DAG: LSHL +; EG-DAG: LSHL +; EG-DAG: SUB_INT +; EG-DAG: LSHR +; EG-DAG: LSHR +; EG-DAG: SETGT_UINT +; EG-DAG: SETGT_INT +; EG-DAG: XOR_INT +; EG-DAG: XOR_INT +; EG-DAG: SUB_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDGE_INT +; EG-DAG: CNDGE_INT +; EG-DAG: AND_INT +; EG-DAG: LSHR +; EG-DAG: SUB_INT +; EG-DAG: AND_INT +; EG-DAG: ASHR +; EG-DAG: AND_INT +; EG-DAG: OR_INT +; EG-DAG: SUB_INT +; EG-DAG: LSHL +; EG-DAG: LSHL +; EG-DAG: SUB_INT +; EG-DAG: LSHR +; EG-DAG: LSHR +; EG-DAG: SETGT_UINT +; EG-DAG: SETGT_INT +; EG-DAG: XOR_INT +; EG-DAG: XOR_INT +; EG-DAG: SUB_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDGE_INT +; EG-DAG: CNDGE_INT +; EG-DAG: AND_INT +; EG-DAG: LSHR +; EG-DAG: SUB_INT +; EG-DAG: AND_INT +; EG-DAG: ASHR +; EG-DAG: AND_INT +; EG-DAG: OR_INT +; EG-DAG: SUB_INT +; EG-DAG: LSHL +; EG-DAG: LSHL +; EG-DAG: SUB_INT +; EG-DAG: LSHR +; EG-DAG: LSHR +; EG-DAG: SETGT_UINT +; EG-DAG: SETGT_INT +; EG-DAG: XOR_INT +; EG-DAG: XOR_INT +; EG-DAG: SUB_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDGE_INT +; EG-DAG: CNDGE_INT + +; SI: S_ENDPGM +define void @fp_to_uint_v4i64(<4 x i64> addrspace(1)* %out, <4 x float> %x) { + %conv = fptoui <4 x float> %x to <4 x i64> + store <4 x i64> %conv, <4 x i64> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/R600/ftrunc.ll b/test/CodeGen/R600/ftrunc.ll index 3cd1deb921fc..0d7d4679fe3d 100644 --- a/test/CodeGen/R600/ftrunc.ll +++ b/test/CodeGen/R600/ftrunc.ll @@ -1,110 +1,119 @@ -; RUN: llc -march=r600 -mcpu=bonaire < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG --check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI --check-prefix=FUNC %s -declare double 
@llvm.trunc.f64(double) nounwind readnone -declare <2 x double> @llvm.trunc.v2f64(<2 x double>) nounwind readnone -declare <3 x double> @llvm.trunc.v3f64(<3 x double>) nounwind readnone -declare <4 x double> @llvm.trunc.v4f64(<4 x double>) nounwind readnone -declare <8 x double> @llvm.trunc.v8f64(<8 x double>) nounwind readnone -declare <16 x double> @llvm.trunc.v16f64(<16 x double>) nounwind readnone +declare float @llvm.trunc.f32(float) nounwind readnone +declare <2 x float> @llvm.trunc.v2f32(<2 x float>) nounwind readnone +declare <3 x float> @llvm.trunc.v3f32(<3 x float>) nounwind readnone +declare <4 x float> @llvm.trunc.v4f32(<4 x float>) nounwind readnone +declare <8 x float> @llvm.trunc.v8f32(<8 x float>) nounwind readnone +declare <16 x float> @llvm.trunc.v16f32(<16 x float>) nounwind readnone -; FUNC-LABEL: @v_ftrunc_f64: -; CI: V_TRUNC_F64_e32 -; SI: V_BFE_I32 {{v[0-9]+}}, {{v[0-9]+}}, 20, 11 -; SI: S_ENDPGM -define void @v_ftrunc_f64(double addrspace(1)* %out, double addrspace(1)* %in) { - %x = load double addrspace(1)* %in, align 8 - %y = call double @llvm.trunc.f64(double %x) nounwind readnone - store double %y, double addrspace(1)* %out, align 8 +; FUNC-LABEL: @ftrunc_f32: +; EG: TRUNC +; SI: V_TRUNC_F32_e32 +define void @ftrunc_f32(float addrspace(1)* %out, float %x) { + %y = call float @llvm.trunc.f32(float %x) nounwind readnone + store float %y, float addrspace(1)* %out ret void } -; FUNC-LABEL: @ftrunc_f64: -; CI: V_TRUNC_F64_e32 - -; SI: S_BFE_I32 [[SEXP:s[0-9]+]], {{s[0-9]+}}, 0xb0014 -; SI: S_ADD_I32 s{{[0-9]+}}, [[SEXP]], 0xfffffc01 -; SI: S_LSHR_B64 -; SI: S_NOT_B64 -; SI: S_AND_B64 -; SI: S_AND_B32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000 -; SI: CMP_LT_I32 -; SI: CNDMASK_B32 -; SI: CNDMASK_B32 -; SI: CMP_GT_I32 -; SI: CNDMASK_B32 -; SI: CNDMASK_B32 -; SI: S_ENDPGM -define void @ftrunc_f64(double addrspace(1)* %out, double %x) { - %y = call double @llvm.trunc.f64(double %x) nounwind readnone - store double %y, double addrspace(1)* %out - ret void -} - -; FUNC-LABEL: @ftrunc_v2f64: -; CI: V_TRUNC_F64_e32 -; CI: V_TRUNC_F64_e32 -define void @ftrunc_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) { - %y = call <2 x double> @llvm.trunc.v2f64(<2 x double> %x) nounwind readnone - store <2 x double> %y, <2 x double> addrspace(1)* %out +; FUNC-LABEL: @ftrunc_v2f32: +; EG: TRUNC +; EG: TRUNC +; SI: V_TRUNC_F32_e32 +; SI: V_TRUNC_F32_e32 +define void @ftrunc_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %x) { + %y = call <2 x float> @llvm.trunc.v2f32(<2 x float> %x) nounwind readnone + store <2 x float> %y, <2 x float> addrspace(1)* %out ret void } -; FIXME-FUNC-LABEL: @ftrunc_v3f64: -; FIXME-CI: V_TRUNC_F64_e32 -; FIXME-CI: V_TRUNC_F64_e32 -; FIXME-CI: V_TRUNC_F64_e32 -; define void @ftrunc_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) { -; %y = call <3 x double> @llvm.trunc.v3f64(<3 x double> %x) nounwind readnone -; store <3 x double> %y, <3 x double> addrspace(1)* %out +; FIXME-FUNC-LABEL: @ftrunc_v3f32: +; FIXME-EG: TRUNC +; FIXME-EG: TRUNC +; FIXME-EG: TRUNC +; FIXME-SI: V_TRUNC_F32_e32 +; FIXME-SI: V_TRUNC_F32_e32 +; FIXME-SI: V_TRUNC_F32_e32 +; define void @ftrunc_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %x) { +; %y = call <3 x float> @llvm.trunc.v3f32(<3 x float> %x) nounwind readnone +; store <3 x float> %y, <3 x float> addrspace(1)* %out ; ret void ; } -; FUNC-LABEL: @ftrunc_v4f64: -; CI: V_TRUNC_F64_e32 -; CI: V_TRUNC_F64_e32 -; CI: V_TRUNC_F64_e32 -; CI: V_TRUNC_F64_e32 -define void @ftrunc_v4f64(<4 x double> addrspace(1)* %out, <4 x 
double> %x) { - %y = call <4 x double> @llvm.trunc.v4f64(<4 x double> %x) nounwind readnone - store <4 x double> %y, <4 x double> addrspace(1)* %out +; FUNC-LABEL: @ftrunc_v4f32: +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; SI: V_TRUNC_F32_e32 +; SI: V_TRUNC_F32_e32 +; SI: V_TRUNC_F32_e32 +; SI: V_TRUNC_F32_e32 +define void @ftrunc_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %x) { + %y = call <4 x float> @llvm.trunc.v4f32(<4 x float> %x) nounwind readnone + store <4 x float> %y, <4 x float> addrspace(1)* %out ret void } -; FUNC-LABEL: @ftrunc_v8f64: -; CI: V_TRUNC_F64_e32 -; CI: V_TRUNC_F64_e32 -; CI: V_TRUNC_F64_e32 -; CI: V_TRUNC_F64_e32 -; CI: V_TRUNC_F64_e32 -; CI: V_TRUNC_F64_e32 -; CI: V_TRUNC_F64_e32 -; CI: V_TRUNC_F64_e32 -define void @ftrunc_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) { - %y = call <8 x double> @llvm.trunc.v8f64(<8 x double> %x) nounwind readnone - store <8 x double> %y, <8 x double> addrspace(1)* %out +; FUNC-LABEL: @ftrunc_v8f32: +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; SI: V_TRUNC_F32_e32 +; SI: V_TRUNC_F32_e32 +; SI: V_TRUNC_F32_e32 +; SI: V_TRUNC_F32_e32 +; SI: V_TRUNC_F32_e32 +; SI: V_TRUNC_F32_e32 +; SI: V_TRUNC_F32_e32 +; SI: V_TRUNC_F32_e32 +define void @ftrunc_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %x) { + %y = call <8 x float> @llvm.trunc.v8f32(<8 x float> %x) nounwind readnone + store <8 x float> %y, <8 x float> addrspace(1)* %out ret void } -; FUNC-LABEL: @ftrunc_v16f64: -; CI: V_TRUNC_F64_e32 -; CI: V_TRUNC_F64_e32 -; CI: V_TRUNC_F64_e32 -; CI: V_TRUNC_F64_e32 -; CI: V_TRUNC_F64_e32 -; CI: V_TRUNC_F64_e32 -; CI: V_TRUNC_F64_e32 -; CI: V_TRUNC_F64_e32 -; CI: V_TRUNC_F64_e32 -; CI: V_TRUNC_F64_e32 -; CI: V_TRUNC_F64_e32 -; CI: V_TRUNC_F64_e32 -; CI: V_TRUNC_F64_e32 -; CI: V_TRUNC_F64_e32 -; CI: V_TRUNC_F64_e32 -; CI: V_TRUNC_F64_e32 -define void @ftrunc_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) { - %y = call <16 x double> @llvm.trunc.v16f64(<16 x double> %x) nounwind readnone - store <16 x double> %y, <16 x double> addrspace(1)* %out +; FUNC-LABEL: @ftrunc_v16f32: +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; SI: V_TRUNC_F32_e32 +; SI: V_TRUNC_F32_e32 +; SI: V_TRUNC_F32_e32 +; SI: V_TRUNC_F32_e32 +; SI: V_TRUNC_F32_e32 +; SI: V_TRUNC_F32_e32 +; SI: V_TRUNC_F32_e32 +; SI: V_TRUNC_F32_e32 +; SI: V_TRUNC_F32_e32 +; SI: V_TRUNC_F32_e32 +; SI: V_TRUNC_F32_e32 +; SI: V_TRUNC_F32_e32 +; SI: V_TRUNC_F32_e32 +; SI: V_TRUNC_F32_e32 +; SI: V_TRUNC_F32_e32 +; SI: V_TRUNC_F32_e32 +define void @ftrunc_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %x) { + %y = call <16 x float> @llvm.trunc.v16f32(<16 x float> %x) nounwind readnone + store <16 x float> %y, <16 x float> addrspace(1)* %out ret void } diff --git a/test/CodeGen/R600/gv-const-addrspace.ll b/test/CodeGen/R600/gv-const-addrspace.ll index db64a6fe8c7f..e0ac317f9986 100644 --- a/test/CodeGen/R600/gv-const-addrspace.ll +++ b/test/CodeGen/R600/gv-const-addrspace.ll @@ -4,11 +4,11 @@ @b = internal addrspace(2) constant [1 x i16] [ i16 7 ], align 2 -; XXX: Test on SI once 64-bit adds are supportes. - @float_gv = internal unnamed_addr addrspace(2) constant [5 x float] [float 0.0, float 1.0, float 2.0, float 3.0, float 4.0], align 4 ; FUNC-LABEL: @float +; FIXME: We should be using S_LOAD_DWORD here. 
+; SI: BUFFER_LOAD_DWORD ; EG-DAG: MOV {{\** *}}T2.X ; EG-DAG: MOV {{\** *}}T3.X @@ -29,6 +29,9 @@ entry: ; FUNC-LABEL: @i32 +; FIXME: We should be using S_LOAD_DWORD here. +; SI: BUFFER_LOAD_DWORD + ; EG-DAG: MOV {{\** *}}T2.X ; EG-DAG: MOV {{\** *}}T3.X ; EG-DAG: MOV {{\** *}}T4.X @@ -50,6 +53,7 @@ entry: @struct_foo_gv = internal unnamed_addr addrspace(2) constant [1 x %struct.foo] [ %struct.foo { float 16.0, [5 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4] } ] ; FUNC-LABEL: @struct_foo_gv_load +; SI: S_LOAD_DWORD define void @struct_foo_gv_load(i32 addrspace(1)* %out, i32 %index) { %gep = getelementptr inbounds [1 x %struct.foo] addrspace(2)* @struct_foo_gv, i32 0, i32 0, i32 1, i32 %index @@ -64,9 +68,30 @@ define void @struct_foo_gv_load(i32 addrspace(1)* %out, i32 %index) { <1 x i32> ] ; FUNC-LABEL: @array_v1_gv_load +; FIXME: We should be using S_LOAD_DWORD here. +; SI: BUFFER_LOAD_DWORD define void @array_v1_gv_load(<1 x i32> addrspace(1)* %out, i32 %index) { %gep = getelementptr inbounds [4 x <1 x i32>] addrspace(2)* @array_v1_gv, i32 0, i32 %index %load = load <1 x i32> addrspace(2)* %gep, align 4 store <1 x i32> %load, <1 x i32> addrspace(1)* %out, align 4 ret void } + +define void @gv_addressing_in_branch(float addrspace(1)* %out, i32 %index, i32 %a) { +entry: + %0 = icmp eq i32 0, %a + br i1 %0, label %if, label %else + +if: + %1 = getelementptr inbounds [5 x float] addrspace(2)* @float_gv, i32 0, i32 %index + %2 = load float addrspace(2)* %1 + store float %2, float addrspace(1)* %out + br label %endif + +else: + store float 1.0, float addrspace(1)* %out + br label %endif + +endif: + ret void +} diff --git a/test/CodeGen/R600/half.ll b/test/CodeGen/R600/half.ll new file mode 100644 index 000000000000..42aa4faa99f4 --- /dev/null +++ b/test/CodeGen/R600/half.ll @@ -0,0 +1,61 @@ +; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s + +define void @test_load_store(half addrspace(1)* %in, half addrspace(1)* %out) { +; CHECK-LABEL: @test_load_store +; CHECK: BUFFER_LOAD_USHORT [[TMP:v[0-9]+]] +; CHECK: BUFFER_STORE_SHORT [[TMP]] + %val = load half addrspace(1)* %in + store half %val, half addrspace(1) * %out + ret void +} + +define void @test_bitcast_from_half(half addrspace(1)* %in, i16 addrspace(1)* %out) { +; CHECK-LABEL: @test_bitcast_from_half +; CHECK: BUFFER_LOAD_USHORT [[TMP:v[0-9]+]] +; CHECK: BUFFER_STORE_SHORT [[TMP]] + %val = load half addrspace(1) * %in + %val_int = bitcast half %val to i16 + store i16 %val_int, i16 addrspace(1)* %out + ret void +} + +define void @test_bitcast_to_half(half addrspace(1)* %out, i16 addrspace(1)* %in) { +; CHECK-LABEL: @test_bitcast_to_half +; CHECK: BUFFER_LOAD_USHORT [[TMP:v[0-9]+]] +; CHECK: BUFFER_STORE_SHORT [[TMP]] + %val = load i16 addrspace(1)* %in + %val_fp = bitcast i16 %val to half + store half %val_fp, half addrspace(1)* %out + ret void +} + +define void @test_extend32(half addrspace(1)* %in, float addrspace(1)* %out) { +; CHECK-LABEL: @test_extend32 +; CHECK: V_CVT_F32_F16_e32 + + %val16 = load half addrspace(1)* %in + %val32 = fpext half %val16 to float + store float %val32, float addrspace(1)* %out + ret void +} + +define void @test_extend64(half addrspace(1)* %in, double addrspace(1)* %out) { +; CHECK-LABEL: @test_extend64 +; CHECK: V_CVT_F32_F16_e32 +; CHECK: V_CVT_F64_F32_e32 + + %val16 = load half addrspace(1)* %in + %val64 = fpext half %val16 to double + store double %val64, double addrspace(1)* %out + ret void +} + +define void @test_trunc32(float addrspace(1)* %in, half addrspace(1)* %out) { +; CHECK-LABEL: @test_trunc32 
+; CHECK: V_CVT_F16_F32_e32 + + %val32 = load float addrspace(1)* %in + %val16 = fptrunc float %val32 to half + store half %val16, half addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/R600/indirect-private-64.ll b/test/CodeGen/R600/indirect-private-64.ll index b127b7ede2e8..5747434935b3 100644 --- a/test/CodeGen/R600/indirect-private-64.ll +++ b/test/CodeGen/R600/indirect-private-64.ll @@ -1,10 +1,16 @@ -; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=r600 -mcpu=SI -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI %s +; RUN: llc -march=r600 -mcpu=SI -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s + declare void @llvm.AMDGPU.barrier.local() noduplicate nounwind ; SI-LABEL: @private_access_f64_alloca: -; SI: DS_WRITE_B64 -; SI: DS_READ_B64 + +; SI-ALLOCA: BUFFER_STORE_DWORDX2 +; SI-ALLOCA: BUFFER_LOAD_DWORDX2 + +; SI-PROMOTE: DS_WRITE_B64 +; SI-PROMOTE: DS_READ_B64 define void @private_access_f64_alloca(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in, i32 %b) nounwind { %val = load double addrspace(1)* %in, align 8 %array = alloca double, i32 16, align 8 @@ -17,10 +23,18 @@ define void @private_access_f64_alloca(double addrspace(1)* noalias %out, double } ; SI-LABEL: @private_access_v2f64_alloca: -; SI: DS_WRITE_B64 -; SI: DS_WRITE_B64 -; SI: DS_READ_B64 -; SI: DS_READ_B64 + +; SI-ALLOCA: BUFFER_STORE_DWORDX4 +; SI-ALLOCA: BUFFER_LOAD_DWORDX4 + +; SI-PROMOTE: DS_WRITE_B32 +; SI-PROMOTE: DS_WRITE_B32 +; SI-PROMOTE: DS_WRITE_B32 +; SI-PROMOTE: DS_WRITE_B32 +; SI-PROMOTE: DS_READ_B32 +; SI-PROMOTE: DS_READ_B32 +; SI-PROMOTE: DS_READ_B32 +; SI-PROMOTE: DS_READ_B32 define void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out, <2 x double> addrspace(1)* noalias %in, i32 %b) nounwind { %val = load <2 x double> addrspace(1)* %in, align 16 %array = alloca <2 x double>, i32 16, align 16 @@ -33,8 +47,12 @@ define void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out } ; SI-LABEL: @private_access_i64_alloca: -; SI: DS_WRITE_B64 -; SI: DS_READ_B64 + +; SI-ALLOCA: BUFFER_STORE_DWORDX2 +; SI-ALLOCA: BUFFER_LOAD_DWORDX2 + +; SI-PROMOTE: DS_WRITE_B64 +; SI-PROMOTE: DS_READ_B64 define void @private_access_i64_alloca(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i32 %b) nounwind { %val = load i64 addrspace(1)* %in, align 8 %array = alloca i64, i32 16, align 8 @@ -47,10 +65,18 @@ define void @private_access_i64_alloca(i64 addrspace(1)* noalias %out, i64 addrs } ; SI-LABEL: @private_access_v2i64_alloca: -; SI: DS_WRITE_B64 -; SI: DS_WRITE_B64 -; SI: DS_READ_B64 -; SI: DS_READ_B64 + +; SI-ALLOCA: BUFFER_STORE_DWORDX4 +; SI-ALLOCA: BUFFER_LOAD_DWORDX4 + +; SI-PROMOTE: DS_WRITE_B32 +; SI-PROMOTE: DS_WRITE_B32 +; SI-PROMOTE: DS_WRITE_B32 +; SI-PROMOTE: DS_WRITE_B32 +; SI-PROMOTE: DS_READ_B32 +; SI-PROMOTE: DS_READ_B32 +; SI-PROMOTE: DS_READ_B32 +; SI-PROMOTE: DS_READ_B32 define void @private_access_v2i64_alloca(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in, i32 %b) nounwind { %val = load <2 x i64> addrspace(1)* %in, align 16 %array = alloca <2 x i64>, i32 16, align 16 diff --git a/test/CodeGen/R600/input-mods.ll b/test/CodeGen/R600/input-mods.ll new file mode 100644 index 000000000000..13bfbab85695 --- /dev/null +++ b/test/CodeGen/R600/input-mods.ll @@ -0,0 +1,26 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s 
--check-prefix=EG-CHECK +;RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM-CHECK + +;EG-CHECK-LABEL: @test +;EG-CHECK: EXP_IEEE * +;CM-CHECK-LABEL: @test +;CM-CHECK: EXP_IEEE T{{[0-9]+}}.X, -|T{{[0-9]+}}.X| +;CM-CHECK: EXP_IEEE T{{[0-9]+}}.Y (MASKED), -|T{{[0-9]+}}.X| +;CM-CHECK: EXP_IEEE T{{[0-9]+}}.Z (MASKED), -|T{{[0-9]+}}.X| +;CM-CHECK: EXP_IEEE * T{{[0-9]+}}.W (MASKED), -|T{{[0-9]+}}.X| + +define void @test(<4 x float> inreg %reg0) #0 { + %r0 = extractelement <4 x float> %reg0, i32 0 + %r1 = call float @llvm.fabs.f32(float %r0) + %r2 = fsub float -0.000000e+00, %r1 + %r3 = call float @llvm.exp2.f32(float %r2) + %vec = insertelement <4 x float> undef, float %r3, i32 0 + call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0) + ret void +} + +declare float @llvm.exp2.f32(float) readnone +declare float @llvm.fabs.f32(float) readnone +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { "ShaderType"="0" } diff --git a/test/CodeGen/R600/large-constant-initializer.ll b/test/CodeGen/R600/large-constant-initializer.ll index 552cd05e1373..191b5c3de912 100644 --- a/test/CodeGen/R600/large-constant-initializer.ll +++ b/test/CodeGen/R600/large-constant-initializer.ll @@ -1,6 +1,5 @@ -; XFAIL: * -; REQUIRES: asserts ; RUN: llc -march=r600 -mcpu=SI < %s +; CHECK: S_ENDPGM @gv = external unnamed_addr addrspace(2) constant [239 x i32], align 4 diff --git a/test/CodeGen/R600/llvm.AMDGPU.abs.ll b/test/CodeGen/R600/llvm.AMDGPU.abs.ll new file mode 100644 index 000000000000..a0a47b7c4701 --- /dev/null +++ b/test/CodeGen/R600/llvm.AMDGPU.abs.ll @@ -0,0 +1,48 @@ +; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +declare i32 @llvm.AMDGPU.abs(i32) nounwind readnone + +; Legacy name +declare i32 @llvm.AMDIL.abs.i32(i32) nounwind readnone + +; FUNC-LABEL: @s_abs_i32 +; SI: S_SUB_I32 +; SI: S_MAX_I32 +; SI: S_ENDPGM + +; EG: SUB_INT +; EG: MAX_INT +define void @s_abs_i32(i32 addrspace(1)* %out, i32 %src) nounwind { + %abs = call i32 @llvm.AMDGPU.abs(i32 %src) nounwind readnone + store i32 %abs, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @v_abs_i32 +; SI: V_SUB_I32_e32 +; SI: V_MAX_I32_e32 +; SI: S_ENDPGM + +; EG: SUB_INT +; EG: MAX_INT +define void @v_abs_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %src) nounwind { + %val = load i32 addrspace(1)* %src, align 4 + %abs = call i32 @llvm.AMDGPU.abs(i32 %val) nounwind readnone + store i32 %abs, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @abs_i32_legacy_amdil +; SI: V_SUB_I32_e32 +; SI: V_MAX_I32_e32 +; SI: S_ENDPGM + +; EG: SUB_INT +; EG: MAX_INT +define void @abs_i32_legacy_amdil(i32 addrspace(1)* %out, i32 addrspace(1)* %src) nounwind { + %val = load i32 addrspace(1)* %src, align 4 + %abs = call i32 @llvm.AMDIL.abs.i32(i32 %val) nounwind readnone + store i32 %abs, i32 addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/R600/llvm.AMDGPU.barrier.global.ll b/test/CodeGen/R600/llvm.AMDGPU.barrier.global.ll new file mode 100644 index 000000000000..47f5255e5012 --- /dev/null +++ b/test/CodeGen/R600/llvm.AMDGPU.barrier.global.ll @@ -0,0 +1,28 @@ +; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +; FUNC-LABEL: @test_barrier_global +; EG: 
GROUP_BARRIER +; SI: S_BARRIER + +define void @test_barrier_global(i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.r600.read.tidig.x() + %1 = getelementptr i32 addrspace(1)* %out, i32 %0 + store i32 %0, i32 addrspace(1)* %1 + call void @llvm.AMDGPU.barrier.global() + %2 = call i32 @llvm.r600.read.local.size.x() + %3 = sub i32 %2, 1 + %4 = sub i32 %3, %0 + %5 = getelementptr i32 addrspace(1)* %out, i32 %4 + %6 = load i32 addrspace(1)* %5 + store i32 %6, i32 addrspace(1)* %1 + ret void +} + +declare void @llvm.AMDGPU.barrier.global() + +declare i32 @llvm.r600.read.tidig.x() #0 +declare i32 @llvm.r600.read.local.size.x() #0 + +attributes #0 = { readnone } diff --git a/test/CodeGen/R600/llvm.AMDGPU.barrier.local.ll b/test/CodeGen/R600/llvm.AMDGPU.barrier.local.ll index 8d3c9ca22300..7203675bb47b 100644 --- a/test/CodeGen/R600/llvm.AMDGPU.barrier.local.ll +++ b/test/CodeGen/R600/llvm.AMDGPU.barrier.local.ll @@ -1,8 +1,11 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s +; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s -; CHECK: GROUP_BARRIER +; FUNC-LABEL: @test_barrier_local +; EG: GROUP_BARRIER +; SI: S_BARRIER -define void @test(i32 addrspace(1)* %out) { +define void @test_barrier_local(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tidig.x() %1 = getelementptr i32 addrspace(1)* %out, i32 %0 @@ -17,8 +20,9 @@ entry: ret void } -declare i32 @llvm.r600.read.tidig.x() #0 declare void @llvm.AMDGPU.barrier.local() + +declare i32 @llvm.r600.read.tidig.x() #0 declare i32 @llvm.r600.read.local.size.x() #0 attributes #0 = { readnone } diff --git a/test/CodeGen/R600/llvm.AMDGPU.clamp.ll b/test/CodeGen/R600/llvm.AMDGPU.clamp.ll new file mode 100644 index 000000000000..d608953a0dd2 --- /dev/null +++ b/test/CodeGen/R600/llvm.AMDGPU.clamp.ll @@ -0,0 +1,28 @@ +; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +declare float @llvm.AMDGPU.clamp.f32(float, float, float) nounwind readnone +declare float @llvm.AMDIL.clamp.f32(float, float, float) nounwind readnone + +; FUNC-LABEL: @clamp_0_1_f32 +; SI: S_LOAD_DWORD [[ARG:s[0-9]+]], +; SI: V_ADD_F32_e64 [[RESULT:v[0-9]+]], [[ARG]], 0, 1, 0 +; SI: BUFFER_STORE_DWORD [[RESULT]] +; SI: S_ENDPGM + +; EG: MOV_SAT +define void @clamp_0_1_f32(float addrspace(1)* %out, float %src) nounwind { + %clamp = call float @llvm.AMDGPU.clamp.f32(float %src, float 0.0, float 1.0) nounwind readnone + store float %clamp, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @clamp_0_1_amdil_legacy_f32 +; SI: S_LOAD_DWORD [[ARG:s[0-9]+]], +; SI: V_ADD_F32_e64 [[RESULT:v[0-9]+]], [[ARG]], 0, 1, 0 +; SI: BUFFER_STORE_DWORD [[RESULT]] +define void @clamp_0_1_amdil_legacy_f32(float addrspace(1)* %out, float %src) nounwind { + %clamp = call float @llvm.AMDIL.clamp.f32(float %src, float 0.0, float 1.0) nounwind readnone + store float %clamp, float addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/R600/llvm.AMDGPU.div_fixup.ll b/test/CodeGen/R600/llvm.AMDGPU.div_fixup.ll new file mode 100644 index 000000000000..c8c73573e073 --- /dev/null +++ b/test/CodeGen/R600/llvm.AMDGPU.div_fixup.ll @@ -0,0 +1,27 @@ +; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +declare float 
@llvm.AMDGPU.div.fixup.f32(float, float, float) nounwind readnone
+declare double @llvm.AMDGPU.div.fixup.f64(double, double, double) nounwind readnone
+
+; SI-LABEL: @test_div_fixup_f32:
+; SI-DAG: S_LOAD_DWORD [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: S_LOAD_DWORD [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
+; SI-DAG: V_MOV_B32_e32 [[VC:v[0-9]+]], [[SC]]
+; SI-DAG: S_LOAD_DWORD [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
+; SI: V_MOV_B32_e32 [[VB:v[0-9]+]], [[SB]]
+; SI: V_DIV_FIXUP_F32 [[RESULT:v[0-9]+]], [[SA]], [[VB]], [[VC]]
+; SI: BUFFER_STORE_DWORD [[RESULT]],
+; SI: S_ENDPGM
+define void @test_div_fixup_f32(float addrspace(1)* %out, float %a, float %b, float %c) nounwind {
+  %result = call float @llvm.AMDGPU.div.fixup.f32(float %a, float %b, float %c) nounwind readnone
+  store float %result, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: @test_div_fixup_f64:
+; SI: V_DIV_FIXUP_F64
+define void @test_div_fixup_f64(double addrspace(1)* %out, double %a, double %b, double %c) nounwind {
+  %result = call double @llvm.AMDGPU.div.fixup.f64(double %a, double %b, double %c) nounwind readnone
+  store double %result, double addrspace(1)* %out, align 8
+  ret void
+}
diff --git a/test/CodeGen/R600/llvm.AMDGPU.div_fmas.ll b/test/CodeGen/R600/llvm.AMDGPU.div_fmas.ll
new file mode 100644
index 000000000000..4f1e827c2cbd
--- /dev/null
+++ b/test/CodeGen/R600/llvm.AMDGPU.div_fmas.ll
@@ -0,0 +1,27 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+declare float @llvm.AMDGPU.div.fmas.f32(float, float, float) nounwind readnone
+declare double @llvm.AMDGPU.div.fmas.f64(double, double, double) nounwind readnone
+
+; SI-LABEL: @test_div_fmas_f32:
+; SI-DAG: S_LOAD_DWORD [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: S_LOAD_DWORD [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
+; SI-DAG: V_MOV_B32_e32 [[VC:v[0-9]+]], [[SC]]
+; SI-DAG: S_LOAD_DWORD [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
+; SI: V_MOV_B32_e32 [[VB:v[0-9]+]], [[SB]]
+; SI: V_DIV_FMAS_F32 [[RESULT:v[0-9]+]], [[SA]], [[VB]], [[VC]]
+; SI: BUFFER_STORE_DWORD [[RESULT]],
+; SI: S_ENDPGM
+define void @test_div_fmas_f32(float addrspace(1)* %out, float %a, float %b, float %c) nounwind {
+  %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float %c) nounwind readnone
+  store float %result, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: @test_div_fmas_f64:
+; SI: V_DIV_FMAS_F64
+define void @test_div_fmas_f64(double addrspace(1)* %out, double %a, double %b, double %c) nounwind {
+  %result = call double @llvm.AMDGPU.div.fmas.f64(double %a, double %b, double %c) nounwind readnone
+  store double %result, double addrspace(1)* %out, align 8
+  ret void
+}
diff --git a/test/CodeGen/R600/llvm.AMDGPU.div_scale.ll b/test/CodeGen/R600/llvm.AMDGPU.div_scale.ll
new file mode 100644
index 000000000000..527c8da10a3c
--- /dev/null
+++ b/test/CodeGen/R600/llvm.AMDGPU.div_scale.ll
@@ -0,0 +1,48 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+declare { float, i1 } @llvm.AMDGPU.div.scale.f32(float, float, i1) nounwind readnone
+declare { double, i1 } @llvm.AMDGPU.div.scale.f64(double, double, i1) nounwind readnone
+
+; SI-LABEL: @test_div_scale_f32_1:
+; SI: V_DIV_SCALE_F32
+define void @test_div_scale_f32_1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr) nounwind {
+  %a = load float addrspace(1)* %aptr, align 4
+  %b = load float addrspace(1)* %bptr, align 4
+  %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 false) nounwind readnone
+  %result0 = extractvalue { float, i1 } %result, 0
+  store float %result0, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: @test_div_scale_f32_2:
+; SI: V_DIV_SCALE_F32
+define void @test_div_scale_f32_2(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr) nounwind {
+  %a = load float addrspace(1)* %aptr, align 4
+  %b = load float addrspace(1)* %bptr, align 4
+  %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 true) nounwind readnone
+  %result0 = extractvalue { float, i1 } %result, 0
+  store float %result0, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: @test_div_scale_f64_1:
+; SI: V_DIV_SCALE_F64
+define void @test_div_scale_f64_1(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %bptr, double addrspace(1)* %cptr) nounwind {
+  %a = load double addrspace(1)* %aptr, align 8
+  %b = load double addrspace(1)* %bptr, align 8
+  %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 false) nounwind readnone
+  %result0 = extractvalue { double, i1 } %result, 0
+  store double %result0, double addrspace(1)* %out, align 8
+  ret void
+}
+
+; SI-LABEL: @test_div_scale_f64_2:
+; SI: V_DIV_SCALE_F64
+define void @test_div_scale_f64_2(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %bptr, double addrspace(1)* %cptr) nounwind {
+  %a = load double addrspace(1)* %aptr, align 8
+  %b = load double addrspace(1)* %bptr, align 8
+  %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 true) nounwind readnone
+  %result0 = extractvalue { double, i1 } %result, 0
+  store double %result0, double addrspace(1)* %out, align 8
+  ret void
+}
diff --git a/test/CodeGen/R600/llvm.AMDGPU.fract.ll b/test/CodeGen/R600/llvm.AMDGPU.fract.ll
new file mode 100644
index 000000000000..72ec1c57571e
--- /dev/null
+++ b/test/CodeGen/R600/llvm.AMDGPU.fract.ll
@@ -0,0 +1,27 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+declare float @llvm.AMDGPU.fract.f32(float) nounwind readnone
+
+; Legacy name
+declare float @llvm.AMDIL.fraction.f32(float) nounwind readnone
+
+; FUNC-LABEL: @fract_f32
+; SI: V_FRACT_F32
+; EG: FRACT
+define void @fract_f32(float addrspace(1)* %out, float addrspace(1)* %src) nounwind {
+  %val = load float addrspace(1)* %src, align 4
+  %fract = call float @llvm.AMDGPU.fract.f32(float %val) nounwind readnone
+  store float %fract, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @fract_f32_legacy_amdil
+; SI: V_FRACT_F32
+; EG: FRACT
+define void @fract_f32_legacy_amdil(float addrspace(1)* %out, float addrspace(1)* %src) nounwind {
+  %val = load float addrspace(1)* %src, align 4
+  %fract = call float @llvm.AMDIL.fraction.f32(float %val) nounwind readnone
+  store float %fract, float addrspace(1)* %out, align 4
+  ret void
+}
diff --git a/test/CodeGen/R600/llvm.AMDGPU.kill.ll b/test/CodeGen/R600/llvm.AMDGPU.kill.ll
index 4ab6a8ae09f1..1f82ffb53f1d 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.kill.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.kill.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI %s
+; RUN: llc -march=r600 -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
 ;
SI-LABEL: @kill_gs_const ; SI-NOT: V_CMPX_LE_F32 diff --git a/test/CodeGen/R600/llvm.AMDGPU.legacy.rsq.ll b/test/CodeGen/R600/llvm.AMDGPU.legacy.rsq.ll new file mode 100644 index 000000000000..51964eefa64f --- /dev/null +++ b/test/CodeGen/R600/llvm.AMDGPU.legacy.rsq.ll @@ -0,0 +1,13 @@ +; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +declare float @llvm.AMDGPU.legacy.rsq(float) nounwind readnone + +; FUNC-LABEL: @rsq_legacy_f32 +; SI: V_RSQ_LEGACY_F32_e32 +; EG: RECIPSQRT_IEEE +define void @rsq_legacy_f32(float addrspace(1)* %out, float %src) nounwind { + %rsq = call float @llvm.AMDGPU.legacy.rsq(float %src) nounwind readnone + store float %rsq, float addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/R600/llvm.AMDGPU.rcp.f64.ll b/test/CodeGen/R600/llvm.AMDGPU.rcp.f64.ll new file mode 100644 index 000000000000..b5dda0ce81f9 --- /dev/null +++ b/test/CodeGen/R600/llvm.AMDGPU.rcp.f64.ll @@ -0,0 +1,30 @@ +; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare double @llvm.AMDGPU.rcp.f64(double) nounwind readnone +declare double @llvm.sqrt.f64(double) nounwind readnone + +; FUNC-LABEL: @rcp_f64 +; SI: V_RCP_F64_e32 +define void @rcp_f64(double addrspace(1)* %out, double %src) nounwind { + %rcp = call double @llvm.AMDGPU.rcp.f64(double %src) nounwind readnone + store double %rcp, double addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: @rcp_pat_f64 +; SI: V_RCP_F64_e32 +define void @rcp_pat_f64(double addrspace(1)* %out, double %src) nounwind { + %rcp = fdiv double 1.0, %src + store double %rcp, double addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: @rsq_rcp_pat_f64 +; SI-UNSAFE: V_RSQ_F64_e32 +; SI-SAFE-NOT: V_RSQ_F64_e32 +define void @rsq_rcp_pat_f64(double addrspace(1)* %out, double %src) nounwind { + %sqrt = call double @llvm.sqrt.f64(double %src) nounwind readnone + %rcp = call double @llvm.AMDGPU.rcp.f64(double %sqrt) nounwind readnone + store double %rcp, double addrspace(1)* %out, align 8 + ret void +} diff --git a/test/CodeGen/R600/llvm.AMDGPU.rcp.ll b/test/CodeGen/R600/llvm.AMDGPU.rcp.ll new file mode 100644 index 000000000000..8d5d66e149ba --- /dev/null +++ b/test/CodeGen/R600/llvm.AMDGPU.rcp.ll @@ -0,0 +1,65 @@ +; RUN: llc -march=r600 -mcpu=SI -mattr=-fp32-denormals -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=SI-UNSAFE -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=SI -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s + +; XUN: llc -march=r600 -mcpu=SI -mattr=+fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE-SPDENORM -check-prefix=SI -check-prefix=FUNC %s + +declare float @llvm.AMDGPU.rcp.f32(float) nounwind readnone +declare double @llvm.AMDGPU.rcp.f64(double) nounwind readnone + + +declare float @llvm.sqrt.f32(float) nounwind readnone +declare double @llvm.sqrt.f64(double) nounwind readnone + +; FUNC-LABEL: @rcp_f32 +; SI: V_RCP_F32_e32 +define void @rcp_f32(float addrspace(1)* %out, float %src) nounwind { + %rcp = call float @llvm.AMDGPU.rcp.f32(float %src) nounwind readnone + store float %rcp, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @rcp_f64 +; SI: V_RCP_F64_e32 +define void @rcp_f64(double addrspace(1)* %out, double %src) 
nounwind { + %rcp = call double @llvm.AMDGPU.rcp.f64(double %src) nounwind readnone + store double %rcp, double addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: @rcp_pat_f32 +; SI-SAFE: V_RCP_F32_e32 +; XSI-SAFE-SPDENORM-NOT: V_RCP_F32_e32 +define void @rcp_pat_f32(float addrspace(1)* %out, float %src) nounwind { + %rcp = fdiv float 1.0, %src + store float %rcp, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @rcp_pat_f64 +; SI: V_RCP_F64_e32 +define void @rcp_pat_f64(double addrspace(1)* %out, double %src) nounwind { + %rcp = fdiv double 1.0, %src + store double %rcp, double addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: @rsq_rcp_pat_f32 +; SI-UNSAFE: V_RSQ_F32_e32 +; SI-SAFE: V_SQRT_F32_e32 +; SI-SAFE: V_RCP_F32_e32 +define void @rsq_rcp_pat_f32(float addrspace(1)* %out, float %src) nounwind { + %sqrt = call float @llvm.sqrt.f32(float %src) nounwind readnone + %rcp = call float @llvm.AMDGPU.rcp.f32(float %sqrt) nounwind readnone + store float %rcp, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @rsq_rcp_pat_f64 +; SI-UNSAFE: V_RSQ_F64_e32 +; SI-SAFE-NOT: V_RSQ_F64_e32 +define void @rsq_rcp_pat_f64(double addrspace(1)* %out, double %src) nounwind { + %sqrt = call double @llvm.sqrt.f64(double %src) nounwind readnone + %rcp = call double @llvm.AMDGPU.rcp.f64(double %sqrt) nounwind readnone + store double %rcp, double addrspace(1)* %out, align 8 + ret void +} diff --git a/test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.f64.ll b/test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.f64.ll new file mode 100644 index 000000000000..100d6ff77707 --- /dev/null +++ b/test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.f64.ll @@ -0,0 +1,11 @@ +; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare double @llvm.AMDGPU.rsq.clamped.f64(double) nounwind readnone + +; FUNC-LABEL: @rsq_clamped_f64 +; SI: V_RSQ_CLAMP_F64_e32 +define void @rsq_clamped_f64(double addrspace(1)* %out, double %src) nounwind { + %rsq_clamped = call double @llvm.AMDGPU.rsq.clamped.f64(double %src) nounwind readnone + store double %rsq_clamped, double addrspace(1)* %out, align 8 + ret void +} diff --git a/test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.ll b/test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.ll new file mode 100644 index 000000000000..683df7355ac6 --- /dev/null +++ b/test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.ll @@ -0,0 +1,14 @@ +; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + + +declare float @llvm.AMDGPU.rsq.clamped.f32(float) nounwind readnone + +; FUNC-LABEL: @rsq_clamped_f32 +; SI: V_RSQ_CLAMP_F32_e32 +; EG: RECIPSQRT_CLAMPED +define void @rsq_clamped_f32(float addrspace(1)* %out, float %src) nounwind { + %rsq_clamped = call float @llvm.AMDGPU.rsq.clamped.f32(float %src) nounwind readnone + store float %rsq_clamped, float addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/R600/llvm.AMDGPU.rsq.ll b/test/CodeGen/R600/llvm.AMDGPU.rsq.ll new file mode 100644 index 000000000000..27cf6b28fd66 --- /dev/null +++ b/test/CodeGen/R600/llvm.AMDGPU.rsq.ll @@ -0,0 +1,13 @@ +; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +declare float @llvm.AMDGPU.rsq.f32(float) nounwind 
readnone + +; FUNC-LABEL: @rsq_f32 +; SI: V_RSQ_F32_e32 +; EG: RECIPSQRT_IEEE +define void @rsq_f32(float addrspace(1)* %out, float %src) nounwind { + %rsq = call float @llvm.AMDGPU.rsq.f32(float %src) nounwind readnone + store float %rsq, float addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/R600/llvm.AMDGPU.trig_preop.ll b/test/CodeGen/R600/llvm.AMDGPU.trig_preop.ll new file mode 100644 index 000000000000..1c736d447ea9 --- /dev/null +++ b/test/CodeGen/R600/llvm.AMDGPU.trig_preop.ll @@ -0,0 +1,29 @@ +; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +declare double @llvm.AMDGPU.trig.preop.f64(double, i32) nounwind readnone + +; SI-LABEL: @test_trig_preop_f64: +; SI-DAG: BUFFER_LOAD_DWORD [[SEG:v[0-9]+]] +; SI-DAG: BUFFER_LOAD_DWORDX2 [[SRC:v\[[0-9]+:[0-9]+\]]], +; SI: V_TRIG_PREOP_F64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[SRC]], [[SEG]] +; SI: BUFFER_STORE_DWORDX2 [[RESULT]], +; SI: S_ENDPGM +define void @test_trig_preop_f64(double addrspace(1)* %out, double addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { + %a = load double addrspace(1)* %aptr, align 8 + %b = load i32 addrspace(1)* %bptr, align 4 + %result = call double @llvm.AMDGPU.trig.preop.f64(double %a, i32 %b) nounwind readnone + store double %result, double addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL: @test_trig_preop_f64_imm_segment: +; SI: BUFFER_LOAD_DWORDX2 [[SRC:v\[[0-9]+:[0-9]+\]]], +; SI: V_TRIG_PREOP_F64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[SRC]], 7 +; SI: BUFFER_STORE_DWORDX2 [[RESULT]], +; SI: S_ENDPGM +define void @test_trig_preop_f64_imm_segment(double addrspace(1)* %out, double addrspace(1)* %aptr) nounwind { + %a = load double addrspace(1)* %aptr, align 8 + %result = call double @llvm.AMDGPU.trig.preop.f64(double %a, i32 7) nounwind readnone + store double %result, double addrspace(1)* %out, align 8 + ret void +} diff --git a/test/CodeGen/R600/llvm.SI.gather4.ll b/test/CodeGen/R600/llvm.SI.gather4.ll new file mode 100644 index 000000000000..8402faaa4dca --- /dev/null +++ b/test/CodeGen/R600/llvm.SI.gather4.ll @@ -0,0 +1,508 @@ +;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s + +;CHECK-LABEL: @gather4_v2 +;CHECK: IMAGE_GATHER4 {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_v2() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.v2i32(<2 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: @gather4 +;CHECK: IMAGE_GATHER4 {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: @gather4_cl 
+;CHECK: IMAGE_GATHER4_CL {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_cl() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.cl.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: @gather4_l +;CHECK: IMAGE_GATHER4_L {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_l() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.l.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: @gather4_b +;CHECK: IMAGE_GATHER4_B {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_b() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.b.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: @gather4_b_cl +;CHECK: IMAGE_GATHER4_B_CL {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_b_cl() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.b.cl.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: @gather4_b_cl_v8 +;CHECK: IMAGE_GATHER4_B_CL {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_b_cl_v8() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.b.cl.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: @gather4_lz_v2 +;CHECK: IMAGE_GATHER4_LZ {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} 
+define void @gather4_lz_v2() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.lz.v2i32(<2 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: @gather4_lz +;CHECK: IMAGE_GATHER4_LZ {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_lz() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.lz.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + + + +;CHECK-LABEL: @gather4_o +;CHECK: IMAGE_GATHER4_O {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_o() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: @gather4_cl_o +;CHECK: IMAGE_GATHER4_CL_O {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_cl_o() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.cl.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: @gather4_cl_o_v8 +;CHECK: IMAGE_GATHER4_CL_O {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_cl_o_v8() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.cl.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: @gather4_l_o +;CHECK: IMAGE_GATHER4_L_O {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_l_o() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.l.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x 
i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: @gather4_l_o_v8 +;CHECK: IMAGE_GATHER4_L_O {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_l_o_v8() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.l.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: @gather4_b_o +;CHECK: IMAGE_GATHER4_B_O {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_b_o() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.b.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: @gather4_b_o_v8 +;CHECK: IMAGE_GATHER4_B_O {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_b_o_v8() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.b.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: @gather4_b_cl_o +;CHECK: IMAGE_GATHER4_B_CL_O {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_b_cl_o() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.b.cl.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: @gather4_lz_o +;CHECK: IMAGE_GATHER4_LZ_O {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_lz_o() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.lz.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + 
%r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + + + +;CHECK-LABEL: @gather4_c +;CHECK: IMAGE_GATHER4_C {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_c() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.c.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: @gather4_c_cl +;CHECK: IMAGE_GATHER4_C_CL {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_c_cl() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.c.cl.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: @gather4_c_cl_v8 +;CHECK: IMAGE_GATHER4_C_CL {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_c_cl_v8() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.c.cl.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: @gather4_c_l +;CHECK: IMAGE_GATHER4_C_L {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_c_l() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.c.l.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: @gather4_c_l_v8 +;CHECK: IMAGE_GATHER4_C_L {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_c_l_v8() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.c.l.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, 
i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: @gather4_c_b +;CHECK: IMAGE_GATHER4_C_B {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_c_b() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.c.b.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: @gather4_c_b_v8 +;CHECK: IMAGE_GATHER4_C_B {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_c_b_v8() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.c.b.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: @gather4_c_b_cl +;CHECK: IMAGE_GATHER4_C_B_CL {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_c_b_cl() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.c.b.cl.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: @gather4_c_lz +;CHECK: IMAGE_GATHER4_C_LZ {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_c_lz() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.c.lz.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + + + +;CHECK-LABEL: @gather4_c_o +;CHECK: IMAGE_GATHER4_C_O {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_c_o() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.c.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + 
ret void +} + +;CHECK-LABEL: @gather4_c_o_v8 +;CHECK: IMAGE_GATHER4_C_O {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_c_o_v8() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.c.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: @gather4_c_cl_o +;CHECK: IMAGE_GATHER4_C_CL_O {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_c_cl_o() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.c.cl.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: @gather4_c_l_o +;CHECK: IMAGE_GATHER4_C_L_O {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_c_l_o() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.c.l.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: @gather4_c_b_o +;CHECK: IMAGE_GATHER4_C_B_O {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_c_b_o() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.c.b.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: @gather4_c_b_cl_o +;CHECK: IMAGE_GATHER4_C_B_CL_O {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_c_b_cl_o() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.c.b.cl.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: @gather4_c_lz_o +;CHECK: IMAGE_GATHER4_C_LZ_O 
{{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_c_lz_o() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.c.lz.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: @gather4_c_lz_o_v8 +;CHECK: IMAGE_GATHER4_C_LZ_O {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_c_lz_o_v8() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.c.lz.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + + + +declare <4 x float> @llvm.SI.gather4.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.cl.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.l.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.b.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.b.cl.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.b.cl.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.lz.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.lz.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 + +declare <4 x float> @llvm.SI.gather4.o.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.cl.o.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.cl.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.l.o.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.l.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.b.o.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.b.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.b.cl.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.lz.o.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, 
i32) #1 + +declare <4 x float> @llvm.SI.gather4.c.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.c.cl.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.c.cl.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.c.l.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.c.l.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.c.b.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.c.b.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.c.b.cl.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.c.lz.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 + +declare <4 x float> @llvm.SI.gather4.c.o.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.c.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.c.cl.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.c.l.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.c.b.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.c.b.cl.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.c.lz.o.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.c.lz.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { "ShaderType"="0" } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/R600/llvm.SI.getlod.ll b/test/CodeGen/R600/llvm.SI.getlod.ll new file mode 100644 index 000000000000..a7a17ec3fffa --- /dev/null +++ b/test/CodeGen/R600/llvm.SI.getlod.ll @@ -0,0 +1,44 @@ +;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s + +;CHECK-LABEL: @getlod +;CHECK: IMAGE_GET_LOD {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, -1, 0, 0, 0, 0, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @getlod() #0 { +main_body: + %r = call <4 x float> @llvm.SI.getlod.i32(i32 undef, <32 x i8> undef, <16 x i8> undef, i32 15, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r0, float %r1) + ret void +} + +;CHECK-LABEL: @getlod_v2 +;CHECK: IMAGE_GET_LOD {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @getlod_v2() #0 { +main_body: + %r = call <4 x float> @llvm.SI.getlod.v2i32(<2 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 15, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> 
%r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r0, float %r1) + ret void +} + +;CHECK-LABEL: @getlod_v4 +;CHECK: IMAGE_GET_LOD {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @getlod_v4() #0 { +main_body: + %r = call <4 x float> @llvm.SI.getlod.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 15, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r0, float %r1) + ret void +} + + +declare <4 x float> @llvm.SI.getlod.i32(i32, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.getlod.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.getlod.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { "ShaderType"="0" } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/R600/llvm.SI.image.ll b/test/CodeGen/R600/llvm.SI.image.ll new file mode 100644 index 000000000000..eac0b8eead3a --- /dev/null +++ b/test/CodeGen/R600/llvm.SI.image.ll @@ -0,0 +1,49 @@ +;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s + +;CHECK-LABEL: @image_load +;CHECK: IMAGE_LOAD {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @image_load() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.load.v4i32(<4 x i32> undef, <8 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: @image_load_mip +;CHECK: IMAGE_LOAD_MIP {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @image_load_mip() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.load.mip.v4i32(<4 x i32> undef, <8 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: @getresinfo +;CHECK: IMAGE_GET_RESINFO {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} +define void @getresinfo() #0 { +main_body: + %r = call <4 x float> @llvm.SI.getresinfo.i32(i32 undef, <8 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +declare <4 x float> @llvm.SI.image.load.v4i32(<4 x i32>, <8 x i32>, i32, i32, i32, i32, 
i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.load.mip.v4i32(<4 x i32>, <8 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.getresinfo.i32(i32, <8 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { "ShaderType"="0" } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/R600/llvm.SI.image.sample.ll b/test/CodeGen/R600/llvm.SI.image.sample.ll new file mode 100644 index 000000000000..14dff7eb5fea --- /dev/null +++ b/test/CodeGen/R600/llvm.SI.image.sample.ll @@ -0,0 +1,289 @@ +;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s + +;CHECK-LABEL: @sample +;CHECK: IMAGE_SAMPLE {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: @sample_cl +;CHECK: IMAGE_SAMPLE_CL {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_cl() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: @sample_d +;CHECK: IMAGE_SAMPLE_D {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_d() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.d.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: @sample_d_cl +;CHECK: IMAGE_SAMPLE_D_CL {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_d_cl() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.d.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: @sample_l +;CHECK: IMAGE_SAMPLE_L {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, 
{{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_l() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.l.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: @sample_b +;CHECK: IMAGE_SAMPLE_B {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_b() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.b.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: @sample_b_cl +;CHECK: IMAGE_SAMPLE_B_CL {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_b_cl() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.b.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: @sample_lz +;CHECK: IMAGE_SAMPLE_LZ {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_lz() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.lz.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: @sample_cd +;CHECK: IMAGE_SAMPLE_CD {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_cd() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.cd.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: @sample_cd_cl +;CHECK: IMAGE_SAMPLE_CD_CL {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_cd_cl() #0 { +main_body: + %r = call <4 x float> 
@llvm.SI.image.sample.cd.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: @sample_c +;CHECK: IMAGE_SAMPLE_C {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: @sample_c_cl +;CHECK: IMAGE_SAMPLE_C_CL {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c_cl() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: @sample_c_d +;CHECK: IMAGE_SAMPLE_C_D {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c_d() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.d.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: @sample_c_d_cl +;CHECK: IMAGE_SAMPLE_C_D_CL {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c_d_cl() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.d.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: @sample_c_l +;CHECK: IMAGE_SAMPLE_C_L {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c_l() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.l.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, 
i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: @sample_c_b +;CHECK: IMAGE_SAMPLE_C_B {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c_b() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.b.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: @sample_c_b_cl +;CHECK: IMAGE_SAMPLE_C_B_CL {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c_b_cl() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.b.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: @sample_c_lz +;CHECK: IMAGE_SAMPLE_C_LZ {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c_lz() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.lz.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: @sample_c_cd +;CHECK: IMAGE_SAMPLE_C_CD {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c_cd() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.cd.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: @sample_c_cd_cl +;CHECK: IMAGE_SAMPLE_C_CD_CL {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c_cd_cl() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.cd.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x 
float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + + +declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.d.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.d.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.l.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.b.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.b.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.lz.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.cd.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.cd.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 + +declare <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.c.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.c.d.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.c.d.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.c.l.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.c.b.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.c.b.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.c.lz.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.c.cd.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.c.cd.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { "ShaderType"="0" } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/R600/llvm.SI.image.sample.o.ll b/test/CodeGen/R600/llvm.SI.image.sample.o.ll new file mode 100644 index 000000000000..ed3ef9140143 --- /dev/null +++ b/test/CodeGen/R600/llvm.SI.image.sample.o.ll @@ -0,0 +1,289 @@ +;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s + +;CHECK-LABEL: @sample +;CHECK: IMAGE_SAMPLE_O {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample() #0 { +main_body: + %r = call <4 x float> 
@llvm.SI.image.sample.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: @sample_cl +;CHECK: IMAGE_SAMPLE_CL_O {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_cl() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: @sample_d +;CHECK: IMAGE_SAMPLE_D_O {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_d() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.d.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: @sample_d_cl +;CHECK: IMAGE_SAMPLE_D_CL_O {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_d_cl() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.d.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: @sample_l +;CHECK: IMAGE_SAMPLE_L_O {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_l() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.l.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: @sample_b +;CHECK: IMAGE_SAMPLE_B_O {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_b() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.b.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, 
i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: @sample_b_cl +;CHECK: IMAGE_SAMPLE_B_CL_O {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_b_cl() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.b.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: @sample_lz +;CHECK: IMAGE_SAMPLE_LZ_O {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_lz() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.lz.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: @sample_cd +;CHECK: IMAGE_SAMPLE_CD_O {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_cd() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.cd.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: @sample_cd_cl +;CHECK: IMAGE_SAMPLE_CD_CL_O {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_cd_cl() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.cd.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: @sample_c +;CHECK: IMAGE_SAMPLE_C_O {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = 
extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: @sample_c_cl +;CHECK: IMAGE_SAMPLE_C_CL_O {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c_cl() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: @sample_c_d +;CHECK: IMAGE_SAMPLE_C_D_O {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c_d() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.d.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: @sample_c_d_cl +;CHECK: IMAGE_SAMPLE_C_D_CL_O {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c_d_cl() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.d.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: @sample_c_l +;CHECK: IMAGE_SAMPLE_C_L_O {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c_l() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.l.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: @sample_c_b +;CHECK: IMAGE_SAMPLE_C_B_O {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c_b() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.b.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void 
@llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: @sample_c_b_cl +;CHECK: IMAGE_SAMPLE_C_B_CL_O {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c_b_cl() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.b.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: @sample_c_lz +;CHECK: IMAGE_SAMPLE_C_LZ_O {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c_lz() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.lz.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: @sample_c_cd +;CHECK: IMAGE_SAMPLE_C_CD_O {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c_cd() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.cd.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: @sample_c_cd_cl +;CHECK: IMAGE_SAMPLE_C_CD_CL_O {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c_cd_cl() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.cd.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + + +declare <4 x float> @llvm.SI.image.sample.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.d.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.d.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.l.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 
+declare <4 x float> @llvm.SI.image.sample.b.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.b.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.lz.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.cd.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.cd.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 + +declare <4 x float> @llvm.SI.image.sample.c.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.c.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.c.d.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.c.d.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.c.l.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.c.b.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.c.b.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.c.lz.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.c.cd.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.c.cd.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { "ShaderType"="0" } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/R600/llvm.amdgpu.dp4.ll b/test/CodeGen/R600/llvm.amdgpu.dp4.ll new file mode 100644 index 000000000000..812b6a40ee59 --- /dev/null +++ b/test/CodeGen/R600/llvm.amdgpu.dp4.ll @@ -0,0 +1,11 @@ +; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s + +declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) nounwind readnone + +define void @test_dp4(float addrspace(1)* %out, <4 x float> addrspace(1)* %a, <4 x float> addrspace(1)* %b) nounwind { + %src0 = load <4 x float> addrspace(1)* %a, align 16 + %src1 = load <4 x float> addrspace(1)* %b, align 16 + %dp4 = call float @llvm.AMDGPU.dp4(<4 x float> %src0, <4 x float> %src1) nounwind readnone + store float %dp4, float addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/R600/llvm.amdgpu.kilp.ll b/test/CodeGen/R600/llvm.amdgpu.kilp.ll new file mode 100644 index 000000000000..1b8b1bfd2089 --- /dev/null +++ b/test/CodeGen/R600/llvm.amdgpu.kilp.ll @@ -0,0 +1,20 @@ +; RUN: llc -march=r600 -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +; SI-LABEL: @kilp_gs_const +; SI: S_MOV_B64 exec, 0 +define void @kilp_gs_const() #0 { +main_body: + %0 = icmp ule i32 0, 3 + %1 = select i1 %0, float 1.000000e+00, float -1.000000e+00 + call void @llvm.AMDGPU.kilp(float %1) + %2 = icmp ule i32 3, 0 + %3 = select i1 %2, float 1.000000e+00, float -1.000000e+00 + call void 
@llvm.AMDGPU.kilp(float %3) + ret void +} + +declare void @llvm.AMDGPU.kilp(float) + +attributes #0 = { "ShaderType"="2" } + +!0 = metadata !{metadata !"const", null, i32 1} diff --git a/test/CodeGen/R600/llvm.amdgpu.lrp.ll b/test/CodeGen/R600/llvm.amdgpu.lrp.ll new file mode 100644 index 000000000000..c493a016e330 --- /dev/null +++ b/test/CodeGen/R600/llvm.amdgpu.lrp.ll @@ -0,0 +1,12 @@ +; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare float @llvm.AMDGPU.lrp(float, float, float) nounwind readnone + +; FUNC-LABEL: @test_lrp +; SI: V_SUB_F32 +; SI: V_MAD_F32 +define void @test_lrp(float addrspace(1)* %out, float %src0, float %src1, float %src2) nounwind { + %mad = call float @llvm.AMDGPU.lrp(float %src0, float %src1, float %src2) nounwind readnone + store float %mad, float addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/R600/llvm.exp2.ll b/test/CodeGen/R600/llvm.exp2.ll index 13bfbab85695..119d5ef49a5e 100644 --- a/test/CodeGen/R600/llvm.exp2.ll +++ b/test/CodeGen/R600/llvm.exp2.ll @@ -1,26 +1,79 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG-CHECK -;RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM-CHECK +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG-CHECK --check-prefix=FUNC +;RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM-CHECK --check-prefix=FUNC +;RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s --check-prefix=SI-CHECK --check-prefix=FUNC -;EG-CHECK-LABEL: @test -;EG-CHECK: EXP_IEEE * -;CM-CHECK-LABEL: @test -;CM-CHECK: EXP_IEEE T{{[0-9]+}}.X, -|T{{[0-9]+}}.X| -;CM-CHECK: EXP_IEEE T{{[0-9]+}}.Y (MASKED), -|T{{[0-9]+}}.X| -;CM-CHECK: EXP_IEEE T{{[0-9]+}}.Z (MASKED), -|T{{[0-9]+}}.X| -;CM-CHECK: EXP_IEEE * T{{[0-9]+}}.W (MASKED), -|T{{[0-9]+}}.X| +;FUNC-LABEL: @test +;EG-CHECK: EXP_IEEE +;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} +;SI-CHECK: V_EXP_F32 -define void @test(<4 x float> inreg %reg0) #0 { - %r0 = extractelement <4 x float> %reg0, i32 0 - %r1 = call float @llvm.fabs.f32(float %r0) - %r2 = fsub float -0.000000e+00, %r1 - %r3 = call float @llvm.exp2.f32(float %r2) - %vec = insertelement <4 x float> undef, float %r3, i32 0 - call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0) +define void @test(float addrspace(1)* %out, float %in) { +entry: + %0 = call float @llvm.exp2.f32(float %in) + store float %0, float addrspace(1)* %out ret void } -declare float @llvm.exp2.f32(float) readnone -declare float @llvm.fabs.f32(float) readnone -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) +;FUNC-LABEL: @testv2 +;EG-CHECK: EXP_IEEE +;EG-CHECK: EXP_IEEE +; FIXME: We should be able to merge these packets together on Cayman so we +; have a maximum of 4 instructions. 
+;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} +;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} +;SI-CHECK: V_EXP_F32 +;SI-CHECK: V_EXP_F32 + +define void @testv2(<2 x float> addrspace(1)* %out, <2 x float> %in) { +entry: + %0 = call <2 x float> @llvm.exp2.v2f32(<2 x float> %in) + store <2 x float> %0, <2 x float> addrspace(1)* %out + ret void +} -attributes #0 = { "ShaderType"="0" } +;FUNC-LABEL: @testv4 +;EG-CHECK: EXP_IEEE +;EG-CHECK: EXP_IEEE +;EG-CHECK: EXP_IEEE +;EG-CHECK: EXP_IEEE +; FIXME: We should be able to merge these packets together on Cayman so we +; have a maximum of 4 instructions. +;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} +;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} +;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} +;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} +;SI-CHECK: V_EXP_F32 +;SI-CHECK: V_EXP_F32 +;SI-CHECK: V_EXP_F32 +;SI-CHECK: V_EXP_F32 +define void @testv4(<4 x float> addrspace(1)* %out, <4 x float> %in) { +entry: + %0 = call <4 x float> @llvm.exp2.v4f32(<4 x float> %in) + store <4 x float> %0, <4 x float> addrspace(1)* %out + ret void +} + +declare float @llvm.exp2.f32(float) readnone +declare <2 x float> @llvm.exp2.v2f32(<2 x float>) readnone +declare <4 x float> @llvm.exp2.v4f32(<4 x float>) readnone diff --git a/test/CodeGen/R600/llvm.log2.ll b/test/CodeGen/R600/llvm.log2.ll new file mode 100644 index 000000000000..4cba2d44a5c3 --- /dev/null +++ b/test/CodeGen/R600/llvm.log2.ll @@ -0,0 +1,79 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG-CHECK --check-prefix=FUNC +;RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM-CHECK --check-prefix=FUNC +;RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s --check-prefix=SI-CHECK --check-prefix=FUNC + +;FUNC-LABEL: @test +;EG-CHECK: LOG_IEEE +;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} +;SI-CHECK: V_LOG_F32 + +define void @test(float addrspace(1)* %out, float %in) { +entry: + %0 = call float @llvm.log2.f32(float %in) + store float %0, float addrspace(1)* %out + ret void +} + +;FUNC-LABEL: @testv2 +;EG-CHECK: LOG_IEEE +;EG-CHECK: LOG_IEEE +; FIXME: We should be able to merge these packets together on Cayman so we +; have a maximum of 4 instructions. 
+;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} +;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} +;SI-CHECK: V_LOG_F32 +;SI-CHECK: V_LOG_F32 + +define void @testv2(<2 x float> addrspace(1)* %out, <2 x float> %in) { +entry: + %0 = call <2 x float> @llvm.log2.v2f32(<2 x float> %in) + store <2 x float> %0, <2 x float> addrspace(1)* %out + ret void +} + +;FUNC-LABEL: @testv4 +;EG-CHECK: LOG_IEEE +;EG-CHECK: LOG_IEEE +;EG-CHECK: LOG_IEEE +;EG-CHECK: LOG_IEEE +; FIXME: We should be able to merge these packets together on Cayman so we +; have a maximum of 4 instructions. +;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} +;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} +;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} +;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} +;SI-CHECK: V_LOG_F32 +;SI-CHECK: V_LOG_F32 +;SI-CHECK: V_LOG_F32 +;SI-CHECK: V_LOG_F32 +define void @testv4(<4 x float> addrspace(1)* %out, <4 x float> %in) { +entry: + %0 = call <4 x float> @llvm.log2.v4f32(<4 x float> %in) + store <4 x float> %0, <4 x float> addrspace(1)* %out + ret void +} + +declare float @llvm.log2.f32(float) readnone +declare <2 x float> @llvm.log2.v2f32(<2 x float>) readnone +declare <4 x float> @llvm.log2.v4f32(<4 x float>) readnone diff --git a/test/CodeGen/R600/llvm.sin.ll b/test/CodeGen/R600/llvm.sin.ll index 41c363cc871f..53006bad5c4b 100644 --- a/test/CodeGen/R600/llvm.sin.ll +++ b/test/CodeGen/R600/llvm.sin.ll @@ -1,5 +1,6 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s -check-prefix=EG -check-prefix=FUNC -;RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s -check-prefix=SI -check-prefix=FUNC +;RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +;RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=SI-SAFE -check-prefix=FUNC %s +;RUN: llc -march=r600 -mcpu=SI -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI -check-prefix=SI-UNSAFE -check-prefix=FUNC %s ;FUNC-LABEL: test ;EG: MULADD_IEEE * @@ -8,6 +9,7 @@ ;EG: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}} ;EG-NOT: SIN ;SI: V_MUL_F32 +;SI: V_FRACT_F32 ;SI: V_SIN_F32 ;SI-NOT: V_SIN_F32 @@ -17,6 +19,22 @@ define void @test(float addrspace(1)* %out, float %x) #1 { ret void } +;FUNC-LABEL: testf +;SI-UNSAFE: 4.774 +;SI-UNSAFE: V_MUL_F32 +;SI-SAFE: V_MUL_F32 +;SI-SAFE: V_MUL_F32 +;SI: V_FRACT_F32 +;SI: V_SIN_F32 +;SI-NOT: V_SIN_F32 + +define void @testf(float addrspace(1)* %out, float %x) #1 { + %y = fmul float 3.0, %x + %sin = call float @llvm.sin.f32(float %y) + store float %sin, float addrspace(1)* %out + ret void +} + ;FUNC-LABEL: testv ;EG: SIN * 
T{{[0-9]+\.[XYZW], PV\.[XYZW]}} ;EG: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}} diff --git a/test/CodeGen/R600/load.ll b/test/CodeGen/R600/load.ll index 1486c4dbff66..8905fbd3aeb6 100644 --- a/test/CodeGen/R600/load.ll +++ b/test/CodeGen/R600/load.ll @@ -254,8 +254,8 @@ entry: ; load a v2f32 value from the global address space ; FUNC-LABEL: @load_v2f32 +; R600-CHECK: MEM_RAT ; R600-CHECK: VTX_READ_64 - ; SI-CHECK: BUFFER_LOAD_DWORDX2 define void @load_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) { entry: @@ -265,9 +265,7 @@ entry: } ; FUNC-LABEL: @load_i64 -; R600-CHECK: MEM_RAT -; R600-CHECK: MEM_RAT - +; R600-CHECK: VTX_READ_64 ; SI-CHECK: BUFFER_LOAD_DWORDX2 define void @load_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { entry: @@ -696,8 +694,7 @@ entry: ; R600-CHECK: LDS_READ_RET ; R600-CHECK: LDS_READ_RET ; SI-CHECK: S_MOV_B32 m0 -; SI-CHECK: DS_READ_B32 -; SI-CHECK: DS_READ_B32 +; SI-CHECK: DS_READ_B64 define void @load_v2f32_local(<2 x float> addrspace(1)* %out, <2 x float> addrspace(3)* %in) { entry: %0 = load <2 x float> addrspace(3)* %in diff --git a/test/CodeGen/R600/or.ll b/test/CodeGen/R600/or.ll index 9878366a8a80..3c3b475d077c 100644 --- a/test/CodeGen/R600/or.ll +++ b/test/CodeGen/R600/or.ll @@ -116,9 +116,9 @@ define void @vector_or_i64_imm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 } ; SI-LABEL: @trunc_i64_or_to_i32 -; SI: S_LOAD_DWORD [[SREG0:s[0-9]+]], -; SI: S_LOAD_DWORD [[SREG1:s[0-9]+]], -; SI: S_OR_B32 [[SRESULT:s[0-9]+]], [[SREG1]], [[SREG0]] +; SI: S_LOAD_DWORDX2 s{{\[}}[[SREG0:[0-9]+]] +; SI: S_LOAD_DWORDX2 s{{\[}}[[SREG1:[0-9]+]] +; SI: S_OR_B32 [[SRESULT:s[0-9]+]], s[[SREG1]], s[[SREG0]] ; SI: V_MOV_B32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] ; SI: BUFFER_STORE_DWORD [[VRESULT]], define void @trunc_i64_or_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) { @@ -127,3 +127,19 @@ define void @trunc_i64_or_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) { store i32 %trunc, i32 addrspace(1)* %out, align 8 ret void } + +; EG-CHECK: @or_i1 +; EG-CHECK: OR_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], PS}} + +; SI-CHECK: @or_i1 +; SI-CHECK: S_OR_B64 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] +define void @or_i1(float addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* %in1) { + %a = load float addrspace(1) * %in0 + %b = load float addrspace(1) * %in1 + %acmp = fcmp oge float %a, 0.000000e+00 + %bcmp = fcmp oge float %b, 0.000000e+00 + %or = or i1 %acmp, %bcmp + %result = select i1 %or, float %a, float %b + store float %result, float addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/R600/private-memory-atomics.ll b/test/CodeGen/R600/private-memory-atomics.ll new file mode 100644 index 000000000000..def4f9dee521 --- /dev/null +++ b/test/CodeGen/R600/private-memory-atomics.ll @@ -0,0 +1,31 @@ +; RUN: llc -verify-machineinstrs -march=r600 -mcpu=SI < %s + +; This works because promote allocas pass replaces these with LDS atomics. + +; Private atomics have no real use, but at least shouldn't crash on it. 
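+;
+; atomicrmw add atomically adds its operand to the pointed-to memory and
+; yields the value held there before the update; cmpxchg yields a { i32, i1 }
+; pair of the loaded value and a success flag, which is why @cmpxchg_private
+; extracts element 0 before storing the result.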
+define void @atomicrmw_private(i32 addrspace(1)* %out, i32 %in) nounwind { +entry: + %tmp = alloca [2 x i32] + %tmp1 = getelementptr [2 x i32]* %tmp, i32 0, i32 0 + %tmp2 = getelementptr [2 x i32]* %tmp, i32 0, i32 1 + store i32 0, i32* %tmp1 + store i32 1, i32* %tmp2 + %tmp3 = getelementptr [2 x i32]* %tmp, i32 0, i32 %in + %tmp4 = atomicrmw add i32* %tmp3, i32 7 acq_rel + store i32 %tmp4, i32 addrspace(1)* %out + ret void +} + +define void @cmpxchg_private(i32 addrspace(1)* %out, i32 %in) nounwind { +entry: + %tmp = alloca [2 x i32] + %tmp1 = getelementptr [2 x i32]* %tmp, i32 0, i32 0 + %tmp2 = getelementptr [2 x i32]* %tmp, i32 0, i32 1 + store i32 0, i32* %tmp1 + store i32 1, i32* %tmp2 + %tmp3 = getelementptr [2 x i32]* %tmp, i32 0, i32 %in + %tmp4 = cmpxchg i32* %tmp3, i32 0, i32 1 acq_rel monotonic + %val = extractvalue { i32, i1 } %tmp4, 0 + store i32 %val, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/R600/private-memory-broken.ll b/test/CodeGen/R600/private-memory-broken.ll new file mode 100644 index 000000000000..40860858eb0f --- /dev/null +++ b/test/CodeGen/R600/private-memory-broken.ll @@ -0,0 +1,20 @@ +; RUN: not llc -verify-machineinstrs -march=r600 -mcpu=SI %s -o /dev/null 2>&1 | FileCheck %s + +; Make sure promote alloca pass doesn't crash + +; CHECK: unsupported call + +declare i32 @foo(i32*) nounwind + +define void @call_private(i32 addrspace(1)* %out, i32 %in) nounwind { +entry: + %tmp = alloca [2 x i32] + %tmp1 = getelementptr [2 x i32]* %tmp, i32 0, i32 0 + %tmp2 = getelementptr [2 x i32]* %tmp, i32 0, i32 1 + store i32 0, i32* %tmp1 + store i32 1, i32* %tmp2 + %tmp3 = getelementptr [2 x i32]* %tmp, i32 0, i32 %in + %val = call i32 @foo(i32* %tmp3) nounwind + store i32 %val, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/R600/private-memory.ll b/test/CodeGen/R600/private-memory.ll index c60c05975600..3ce8c2cb03d4 100644 --- a/test/CodeGen/R600/private-memory.ll +++ b/test/CodeGen/R600/private-memory.ll @@ -1,17 +1,23 @@ -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s --check-prefix=R600-CHECK --check-prefix=FUNC -; RUN: llc -verify-machineinstrs -march=r600 -mcpu=SI < %s | FileCheck %s --check-prefix=SI-CHECK --check-prefix=FUNC +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefix=R600 -check-prefix=FUNC +; RUN: llc -mattr=+promote-alloca -verify-machineinstrs -march=r600 -mcpu=SI < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC +; RUN: llc -mattr=-promote-alloca -verify-machineinstrs -march=r600 -mcpu=SI < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC + +declare i32 @llvm.r600.read.tidig.x() nounwind readnone ; FUNC-LABEL: @mova_same_clause -; R600-CHECK: LDS_WRITE -; R600-CHECK: LDS_WRITE -; R600-CHECK: LDS_READ -; R600-CHECK: LDS_READ +; R600: LDS_WRITE +; R600: LDS_WRITE +; R600: LDS_READ +; R600: LDS_READ -; SI-CHECK: DS_WRITE_B32 -; SI-CHECK: DS_WRITE_B32 -; SI-CHECK: DS_READ_B32 -; SI-CHECK: DS_READ_B32 +; SI-PROMOTE: DS_WRITE_B32 +; SI-PROMOTE: DS_WRITE_B32 +; SI-PROMOTE: DS_READ_B32 +; SI-PROMOTE: DS_READ_B32 + +; SI-ALLOCA: BUFFER_STORE_DWORD v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s{{[0-9]+}} +; SI-ALLOCA: BUFFER_STORE_DWORD v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s{{[0-9]+}} define void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) { entry: %stack = alloca [5 x i32], align 4 @@ -40,8 +46,9 @@ entry: ; this. 
; FUNC-LABEL: @multiple_structs
-; R600-CHECK-NOT: MOVA_INT
-; SI-CHECK-NOT: V_MOVREL
+; R600-NOT: MOVA_INT
+; SI-NOT: V_MOVREL
+; SI-NOT: V_MOVREL
%struct.point = type { i32, i32 }
define void @multiple_structs(i32 addrspace(1)* %out) {
@@ -70,8 +77,8 @@ entry:
; MOVA instructions.
; FUNC-LABEL: @direct_loop
-; R600-CHECK-NOT: MOVA_INT
-; SI-CHECK-NOT: V_MOVREL
+; R600-NOT: MOVA_INT
+; SI-NOT: V_MOVREL
define void @direct_loop(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
entry:
@@ -107,9 +114,11 @@ for.end:
; FUNC-LABEL: @short_array
-; R600-CHECK: MOVA_INT
+; R600: MOVA_INT
-; SI-CHECK: V_MOVRELS_B32_e32
+; SI-PROMOTE: BUFFER_STORE_SHORT v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s{{[0-9]+}}
+; SI-PROMOTE: BUFFER_STORE_SHORT v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s{{[0-9]+}}
+; SI-PROMOTE: BUFFER_LOAD_SSHORT v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] + v{{[0-9]+}}, s{{[0-9]+}}
define void @short_array(i32 addrspace(1)* %out, i32 %index) {
entry:
  %0 = alloca [2 x i16]
@@ -126,10 +135,10 @@ entry:
; FUNC-LABEL: @char_array
-; R600-CHECK: MOVA_INT
+; R600: MOVA_INT
-; SI-CHECK: V_OR_B32_e32 v{{[0-9]}}, 0x100
-; SI-CHECK: V_MOVRELS_B32_e32
+; SI-DAG: BUFFER_STORE_BYTE v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s{{[0-9]+}}, 0x0
+; SI-DAG: BUFFER_STORE_BYTE v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s{{[0-9]+}}, 0x1
define void @char_array(i32 addrspace(1)* %out, i32 %index) {
entry:
  %0 = alloca [2 x i8]
@@ -148,11 +157,11 @@ entry:
; Make sure we don't overwrite workitem information with private memory
; FUNC-LABEL: @work_item_info
-; R600-CHECK-NOT: MOV T0.X
+; R600-NOT: MOV T0.X
; Additional check in case the move ends up in the last slot
-; R600-CHECK-NOT: MOV * TO.X
+; R600-NOT: MOV * T0.X
-; SI-CHECK-NOT: V_MOV_B32_e{{(32|64)}} v0
+; SI-NOT: V_MOV_B32_e{{(32|64)}} v0
define void @work_item_info(i32 addrspace(1)* %out, i32 %in) {
entry:
  %0 = alloca [2 x i32]
@@ -173,8 +182,8 @@ entry:
; FUNC-LABEL: @no_overlap
; R600_CHECK: MOV
; R600_CHECK: [[CHAN:[XYZW]]]+
-; R600-CHECK-NOT: [[CHAN]]+
-; SI-CHECK: V_MOV_B32_e32 v3
+; R600-NOT: [[CHAN]]+
+; SI: V_MOV_B32_e32 v3
define void @no_overlap(i32 addrspace(1)* %out, i32 %in) {
entry:
  %0 = alloca [3 x i8], align 1
@@ -199,6 +208,85 @@ entry:
  ret void
}
+define void @char_array_array(i32 addrspace(1)* %out, i32 %index) {
+entry:
+  %alloca = alloca [2 x [2 x i8]]
+  %gep0 = getelementptr [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 0
+  %gep1 = getelementptr [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 1
+  store i8 0, i8* %gep0
+  store i8 1, i8* %gep1
+  %gep2 = getelementptr [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 %index
+  %load = load i8* %gep2
+  %sext = sext i8 %load to i32
+  store i32 %sext, i32 addrspace(1)* %out
+  ret void
+}
+
+define void @i32_array_array(i32 addrspace(1)* %out, i32 %index) {
+entry:
+  %alloca = alloca [2 x [2 x i32]]
+  %gep0 = getelementptr [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 0
+  %gep1 = getelementptr [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 1
+  store i32 0, i32* %gep0
+  store i32 1, i32* %gep1
+  %gep2 = getelementptr [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 %index
+  %load = load i32* %gep2
+  store i32 %load, i32 addrspace(1)* %out
+  ret void
+}
+define void @i64_array_array(i64 addrspace(1)* %out, i32 %index) {
+entry:
+  %alloca = alloca [2 x [2 x i64]]
+  %gep0 = getelementptr [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 0
+  %gep1 = getelementptr [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 1
+  store i64 0, i64* %gep0
+  store i64 1, i64* %gep1
+  %gep2 = getelementptr [2 x [2 x i64]]* %alloca,
i32 0, i32 0, i32 %index + %load = load i64* %gep2 + store i64 %load, i64 addrspace(1)* %out + ret void +} + +%struct.pair32 = type { i32, i32 } + +define void @struct_array_array(i32 addrspace(1)* %out, i32 %index) { +entry: + %alloca = alloca [2 x [2 x %struct.pair32]] + %gep0 = getelementptr [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 0, i32 1 + %gep1 = getelementptr [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 1, i32 1 + store i32 0, i32* %gep0 + store i32 1, i32* %gep1 + %gep2 = getelementptr [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 %index, i32 0 + %load = load i32* %gep2 + store i32 %load, i32 addrspace(1)* %out + ret void +} + +define void @struct_pair32_array(i32 addrspace(1)* %out, i32 %index) { +entry: + %alloca = alloca [2 x %struct.pair32] + %gep0 = getelementptr [2 x %struct.pair32]* %alloca, i32 0, i32 0, i32 1 + %gep1 = getelementptr [2 x %struct.pair32]* %alloca, i32 0, i32 1, i32 0 + store i32 0, i32* %gep0 + store i32 1, i32* %gep1 + %gep2 = getelementptr [2 x %struct.pair32]* %alloca, i32 0, i32 %index, i32 0 + %load = load i32* %gep2 + store i32 %load, i32 addrspace(1)* %out + ret void +} + +define void @select_private(i32 addrspace(1)* %out, i32 %in) nounwind { +entry: + %tmp = alloca [2 x i32] + %tmp1 = getelementptr [2 x i32]* %tmp, i32 0, i32 0 + %tmp2 = getelementptr [2 x i32]* %tmp, i32 0, i32 1 + store i32 0, i32* %tmp1 + store i32 1, i32* %tmp2 + %cmp = icmp eq i32 %in, 0 + %sel = select i1 %cmp, i32* %tmp1, i32* %tmp2 + %load = load i32* %sel + store i32 %load, i32 addrspace(1)* %out + ret void +} -declare i32 @llvm.r600.read.tidig.x() nounwind readnone diff --git a/test/CodeGen/R600/pv.ll b/test/CodeGen/R600/pv.ll index f322bc71c6bd..55eb56d3fb1d 100644 --- a/test/CodeGen/R600/pv.ll +++ b/test/CodeGen/R600/pv.ll @@ -103,7 +103,7 @@ main_body: %95 = insertelement <4 x float> %94, float 0.000000e+00, i32 3 %96 = call float @llvm.AMDGPU.dp4(<4 x float> %91, <4 x float> %95) %97 = call float @fabs(float %96) - %98 = call float @llvm.AMDGPU.rsq(float %97) + %98 = call float @llvm.AMDGPU.rsq.f32(float %97) %99 = fmul float %4, %98 %100 = fmul float %5, %98 %101 = fmul float %6, %98 @@ -225,7 +225,7 @@ declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1 declare float @fabs(float) #2 ; Function Attrs: readnone -declare float @llvm.AMDGPU.rsq(float) #1 +declare float @llvm.AMDGPU.rsq.f32(float) #1 ; Function Attrs: readnone declare float @llvm.AMDIL.clamp.(float, float, float) #1 diff --git a/test/CodeGen/R600/reorder-stores.ll b/test/CodeGen/R600/reorder-stores.ll new file mode 100644 index 000000000000..be2fcc6849fb --- /dev/null +++ b/test/CodeGen/R600/reorder-stores.ll @@ -0,0 +1,104 @@ +; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI %s + +; SI-LABEL: @no_reorder_v2f64_global_load_store +; SI: BUFFER_LOAD_DWORDX2 +; SI: BUFFER_LOAD_DWORDX2 +; SI: BUFFER_LOAD_DWORDX2 +; SI: BUFFER_LOAD_DWORDX2 +; SI: BUFFER_STORE_DWORDX2 +; SI: BUFFER_STORE_DWORDX2 +; SI: BUFFER_STORE_DWORDX2 +; SI: BUFFER_STORE_DWORDX2 +; SI: S_ENDPGM +define void @no_reorder_v2f64_global_load_store(<2 x double> addrspace(1)* nocapture %x, <2 x double> addrspace(1)* nocapture %y) nounwind { + %tmp1 = load <2 x double> addrspace(1)* %x, align 16 + %tmp4 = load <2 x double> addrspace(1)* %y, align 16 + store <2 x double> %tmp4, <2 x double> addrspace(1)* %x, align 16 + store <2 x double> %tmp1, <2 x double> addrspace(1)* %y, align 16 + ret void +} + +; SI-LABEL: @no_reorder_scalarized_v2f64_local_load_store +; SI: DS_READ_B64 +; SI: 
DS_READ_B64
+; SI: DS_WRITE_B64
+; SI: DS_WRITE_B64
+; SI: S_ENDPGM
+define void @no_reorder_scalarized_v2f64_local_load_store(<2 x double> addrspace(3)* nocapture %x, <2 x double> addrspace(3)* nocapture %y) nounwind {
+  %tmp1 = load <2 x double> addrspace(3)* %x, align 16
+  %tmp4 = load <2 x double> addrspace(3)* %y, align 16
+  store <2 x double> %tmp4, <2 x double> addrspace(3)* %x, align 16
+  store <2 x double> %tmp1, <2 x double> addrspace(3)* %y, align 16
+  ret void
+}
+
+; SI-LABEL: @no_reorder_split_v8i32_global_load_store
+; SI: BUFFER_LOAD_DWORD
+; SI: BUFFER_LOAD_DWORD
+; SI: BUFFER_LOAD_DWORD
+; SI: BUFFER_LOAD_DWORD
+
+; SI: BUFFER_LOAD_DWORD
+; SI: BUFFER_LOAD_DWORD
+; SI: BUFFER_LOAD_DWORD
+; SI: BUFFER_LOAD_DWORD
+
+; SI: BUFFER_LOAD_DWORD
+; SI: BUFFER_LOAD_DWORD
+; SI: BUFFER_LOAD_DWORD
+; SI: BUFFER_LOAD_DWORD
+
+; SI: BUFFER_LOAD_DWORD
+; SI: BUFFER_LOAD_DWORD
+; SI: BUFFER_LOAD_DWORD
+; SI: BUFFER_LOAD_DWORD
+
+
+; SI: BUFFER_STORE_DWORD
+; SI: BUFFER_STORE_DWORD
+; SI: BUFFER_STORE_DWORD
+; SI: BUFFER_STORE_DWORD
+
+; SI: BUFFER_STORE_DWORD
+; SI: BUFFER_STORE_DWORD
+; SI: BUFFER_STORE_DWORD
+; SI: BUFFER_STORE_DWORD
+
+; SI: BUFFER_STORE_DWORD
+; SI: BUFFER_STORE_DWORD
+; SI: BUFFER_STORE_DWORD
+; SI: BUFFER_STORE_DWORD
+
+; SI: BUFFER_STORE_DWORD
+; SI: BUFFER_STORE_DWORD
+; SI: BUFFER_STORE_DWORD
+; SI: BUFFER_STORE_DWORD
+; SI: S_ENDPGM
define void @no_reorder_split_v8i32_global_load_store(<8 x i32> addrspace(1)* nocapture %x, <8 x i32> addrspace(1)* nocapture %y) nounwind {
+  %tmp1 = load <8 x i32> addrspace(1)* %x, align 32
+  %tmp4 = load <8 x i32> addrspace(1)* %y, align 32
+  store <8 x i32> %tmp4, <8 x i32> addrspace(1)* %x, align 32
+  store <8 x i32> %tmp1, <8 x i32> addrspace(1)* %y, align 32
+  ret void
+}
+
+; SI-LABEL: @no_reorder_extload_64
+; SI: DS_READ_B64
+; SI: DS_READ_B64
+; SI: DS_WRITE_B64
+; SI-NOT: DS_READ
+; SI: DS_WRITE_B64
+; SI: S_ENDPGM
+define void @no_reorder_extload_64(<2 x i32> addrspace(3)* nocapture %x, <2 x i32> addrspace(3)* nocapture %y) nounwind {
+  %tmp1 = load <2 x i32> addrspace(3)* %x, align 8
+  %tmp4 = load <2 x i32> addrspace(3)* %y, align 8
+  %tmp1ext = zext <2 x i32> %tmp1 to <2 x i64>
+  %tmp4ext = zext <2 x i32> %tmp4 to <2 x i64>
+  %tmp7 = add <2 x i64> %tmp1ext, <i64 1, i64 1>
+  %tmp9 = add <2 x i64> %tmp4ext, <i64 1, i64 1>
+  %trunctmp9 = trunc <2 x i64> %tmp9 to <2 x i32>
+  %trunctmp7 = trunc <2 x i64> %tmp7 to <2 x i32>
+  store <2 x i32> %trunctmp9, <2 x i32> addrspace(3)* %x, align 8
+  store <2 x i32> %trunctmp7, <2 x i32> addrspace(3)* %y, align 8
+  ret void
+}
diff --git a/test/CodeGen/R600/rsq.ll b/test/CodeGen/R600/rsq.ll
index 87c05701104f..3069f62724b7 100644
--- a/test/CodeGen/R600/rsq.ll
+++ b/test/CodeGen/R600/rsq.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=r600 -mcpu=SI -mattr=-fp32-denormals -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI-UNSAFE -check-prefix=SI %s
+; RUN: llc -march=r600 -mcpu=SI -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI %s
declare float @llvm.sqrt.f32(float) nounwind readnone
declare double @llvm.sqrt.f64(double) nounwind readnone
@@ -15,7 +16,8 @@ define void @rsq_f32(float addrspace(1)* noalias %out, float addrspace(1)* noali
}
; SI-LABEL: @rsq_f64
-; SI: V_RSQ_F64_e32
+; SI-UNSAFE: V_RSQ_F64_e32
+; SI-SAFE: V_SQRT_F64_e32
; SI: S_ENDPGM
define void @rsq_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) nounwind {
%val = load double addrspace(1)* %in, align 4 diff --git a/test/CodeGen/R600/saddo.ll b/test/CodeGen/R600/saddo.ll new file mode 100644 index 000000000000..c80480e85512 --- /dev/null +++ b/test/CodeGen/R600/saddo.ll @@ -0,0 +1,62 @@ +; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs< %s + +declare { i32, i1 } @llvm.sadd.with.overflow.i32(i32, i32) nounwind readnone +declare { i64, i1 } @llvm.sadd.with.overflow.i64(i64, i64) nounwind readnone + +; FUNC-LABEL: @saddo_i64_zext +define void @saddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { + %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind + %val = extractvalue { i64, i1 } %sadd, 0 + %carry = extractvalue { i64, i1 } %sadd, 1 + %ext = zext i1 %carry to i64 + %add2 = add i64 %val, %ext + store i64 %add2, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: @s_saddo_i32 +define void @s_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind { + %sadd = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b) nounwind + %val = extractvalue { i32, i1 } %sadd, 0 + %carry = extractvalue { i32, i1 } %sadd, 1 + store i32 %val, i32 addrspace(1)* %out, align 4 + store i1 %carry, i1 addrspace(1)* %carryout + ret void +} + +; FUNC-LABEL: @v_saddo_i32 +define void @v_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { + %a = load i32 addrspace(1)* %aptr, align 4 + %b = load i32 addrspace(1)* %bptr, align 4 + %sadd = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b) nounwind + %val = extractvalue { i32, i1 } %sadd, 0 + %carry = extractvalue { i32, i1 } %sadd, 1 + store i32 %val, i32 addrspace(1)* %out, align 4 + store i1 %carry, i1 addrspace(1)* %carryout + ret void +} + +; FUNC-LABEL: @s_saddo_i64 +define void @s_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind { + %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind + %val = extractvalue { i64, i1 } %sadd, 0 + %carry = extractvalue { i64, i1 } %sadd, 1 + store i64 %val, i64 addrspace(1)* %out, align 8 + store i1 %carry, i1 addrspace(1)* %carryout + ret void +} + +; FUNC-LABEL: @v_saddo_i64 +; SI: V_ADD_I32 +; SI: V_ADDC_U32 +define void @v_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { + %a = load i64 addrspace(1)* %aptr, align 4 + %b = load i64 addrspace(1)* %bptr, align 4 + %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind + %val = extractvalue { i64, i1 } %sadd, 0 + %carry = extractvalue { i64, i1 } %sadd, 1 + store i64 %val, i64 addrspace(1)* %out, align 8 + store i1 %carry, i1 addrspace(1)* %carryout + ret void +} diff --git a/test/CodeGen/R600/select-i1.ll b/test/CodeGen/R600/select-i1.ll new file mode 100644 index 000000000000..009dd7f68dea --- /dev/null +++ b/test/CodeGen/R600/select-i1.ll @@ -0,0 +1,14 @@ +; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +; FIXME: This should go in existing select.ll test, except the current testcase there is broken on SI + +; FUNC-LABEL: @select_i1 +; SI: V_CNDMASK_B32 +; SI-NOT: V_CNDMASK_B32 +define void @select_i1(i1 addrspace(1)* %out, i32 %cond, i1 %a, i1 %b) nounwind { + %cmp = icmp ugt i32 %cond, 5 + %sel = select i1 %cmp, i1 %a, i1 %b + 
store i1 %sel, i1 addrspace(1)* %out, align 4
+  ret void
+}
+
diff --git a/test/CodeGen/R600/select.ll b/test/CodeGen/R600/select.ll
index f9401424ac12..7d5156834b9d 100644
--- a/test/CodeGen/R600/select.ll
+++ b/test/CodeGen/R600/select.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
; Normally icmp + select is optimized to select_cc; when this happens, the
; DAGLegalizer never sees the select and doesn't have a chance to legalize it.
@@ -6,13 +7,13 @@
; In order to avoid the select_cc optimization, this test case calculates the
; condition for the select in a separate basic block.
-; CHECK-LABEL: @select
-; CHECK-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.X
-; CHECK-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.X
-; CHECK-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY
-; CHECK-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY
-; CHECK-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XYZW
-; CHECK-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XYZW
+; FUNC-LABEL: @select
+; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.X
+; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.X
+; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY
+; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY
+; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XYZW
+; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XYZW
define void @select (i32 addrspace(1)* %i32out, float addrspace(1)* %f32out,
<2 x i32> addrspace(1)* %v2i32out, <2 x float> addrspace(1)* %v2f32out,
<4 x i32> addrspace(1)* %v4i32out, <4 x float> addrspace(1)* %v4f32out,
diff --git a/test/CodeGen/R600/select64.ll b/test/CodeGen/R600/select64.ll
index 6b87d9865ad6..dba25e3bd21e 100644
--- a/test/CodeGen/R600/select64.ll
+++ b/test/CodeGen/R600/select64.ll
@@ -13,3 +13,38 @@ entry:
store i64 %1, i64 addrspace(1)* %out
ret void
}
+
+; CHECK-LABEL: @select_trunc_i64
+; CHECK: V_CNDMASK_B32
+; CHECK-NOT: V_CNDMASK_B32
+define void @select_trunc_i64(i32 addrspace(1)* %out, i32 %cond, i64 %in) nounwind {
+  %cmp = icmp ugt i32 %cond, 5
+  %sel = select i1 %cmp, i64 0, i64 %in
+  %trunc = trunc i64 %sel to i32
+  store i32 %trunc, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: @select_trunc_i64_2
+; CHECK: V_CNDMASK_B32
+; CHECK-NOT: V_CNDMASK_B32
+define void @select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 %a, i64 %b) nounwind {
+  %cmp = icmp ugt i32 %cond, 5
+  %sel = select i1 %cmp, i64 %a, i64 %b
+  %trunc = trunc i64 %sel to i32
+  store i32 %trunc, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: @v_select_trunc_i64_2
+; CHECK: V_CNDMASK_B32
+; CHECK-NOT: V_CNDMASK_B32
+define void @v_select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
+  %cmp = icmp ugt i32 %cond, 5
+  %a = load i64 addrspace(1)* %aptr, align 8
+  %b = load i64 addrspace(1)* %bptr, align 8
+  %sel = select i1 %cmp, i64 %a, i64 %b
+  %trunc = trunc i64 %sel to i32
+  store i32 %trunc, i32 addrspace(1)* %out, align 4
+  ret void
+}
diff --git a/test/CodeGen/R600/selectcc-opt.ll b/test/CodeGen/R600/selectcc-opt.ll
index 834c03069522..bdb6867850ba 100644
--- a/test/CodeGen/R600/selectcc-opt.ll
+++ b/test/CodeGen/R600/selectcc-opt.ll
@@ -1,8 +1,10 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC
%s -; CHECK: @test_a -; CHECK-NOT: CND -; CHECK: SET{{[NEQGTL]+}}_DX10 + +; FUNC-LABEL: @test_a +; EG-NOT: CND +; EG: SET{{[NEQGTL]+}}_DX10 define void @test_a(i32 addrspace(1)* %out, float %in) { entry: @@ -28,10 +30,10 @@ ENDIF: ; Same as test_a, but the branch labels are swapped to produce the inverse cc ; for the icmp instruction -; CHECK: @test_b -; CHECK: SET{{[GTEQN]+}}_DX10 -; CHECK-NEXT: PRED_ -; CHECK-NEXT: ALU clause starting +; EG-LABEL: @test_b +; EG: SET{{[GTEQN]+}}_DX10 +; EG-NEXT: PRED_ +; EG-NEXT: ALU clause starting define void @test_b(i32 addrspace(1)* %out, float %in) { entry: %0 = fcmp olt float %in, 0.0 @@ -54,8 +56,8 @@ ENDIF: } ; Test a CND*_INT instruction with float true/false values -; CHECK: @test_c -; CHECK: CND{{[GTE]+}}_INT +; EG-LABEL: @test_c +; EG: CND{{[GTE]+}}_INT define void @test_c(float addrspace(1)* %out, i32 %in) { entry: %0 = icmp sgt i32 %in, 0 @@ -63,3 +65,15 @@ entry: store float %1, float addrspace(1)* %out ret void } + +; FUNC-LABEL: @selectcc_bool +; SI: V_CMP_NE_I32 +; SI-NEXT: V_CNDMASK_B32_e64 +; SI-NOT: CMP +; SI-NOT: CNDMASK +define void @selectcc_bool(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %icmp0 = icmp ne i32 %a, %b + %ext = select i1 %icmp0, i32 -1, i32 0 + store i32 %ext, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/R600/setcc-opt.ll b/test/CodeGen/R600/setcc-opt.ll new file mode 100644 index 000000000000..8e831e409191 --- /dev/null +++ b/test/CodeGen/R600/setcc-opt.ll @@ -0,0 +1,15 @@ +; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +; SI-LABEL: @sext_bool_icmp_ne +; SI: V_CMP_NE_I32 +; SI-NEXT: V_CNDMASK_B32 +; SI-NOT: V_CMP_NE_I32 +; SI-NOT: V_CNDMASK_B32 +; SI: S_ENDPGM +define void @sext_bool_icmp_ne(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %icmp0 = icmp ne i32 %a, %b + %ext = sext i1 %icmp0 to i32 + %icmp1 = icmp ne i32 %ext, 0 + store i1 %icmp1, i1 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/R600/seto.ll b/test/CodeGen/R600/seto.ll index e90e7886a6a6..cc942c10a91e 100644 --- a/test/CodeGen/R600/seto.ll +++ b/test/CodeGen/R600/seto.ll @@ -1,7 +1,7 @@ ;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s ;CHECK-LABEL: @main -;CHECK: V_CMP_O_F32_e64 s[0:1], {{[sv][0-9]+, [sv][0-9]+}}, 0, 0 +;CHECK: V_CMP_O_F32_e32 vcc, {{[sv][0-9]+, v[0-9]+}} define void @main(float %p) { main_body: diff --git a/test/CodeGen/R600/setuo.ll b/test/CodeGen/R600/setuo.ll index 3b1db8b062bf..33007fc754b8 100644 --- a/test/CodeGen/R600/setuo.ll +++ b/test/CodeGen/R600/setuo.ll @@ -1,7 +1,7 @@ ;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s ;CHECK-LABEL: @main -;CHECK: V_CMP_U_F32_e64 s[0:1], {{[sv][0-9]+, [sv][0-9]+}}, 0, 0 +;CHECK: V_CMP_U_F32_e32 vcc, {{[sv][0-9]+, v[0-9]+}} define void @main(float %p) { main_body: diff --git a/test/CodeGen/R600/sgpr-copy.ll b/test/CodeGen/R600/sgpr-copy.ll index c581d86b99bd..c7d5bf90644e 100644 --- a/test/CodeGen/R600/sgpr-copy.ll +++ b/test/CodeGen/R600/sgpr-copy.ll @@ -70,7 +70,7 @@ main_body: %55 = fadd float %54, %53 %56 = fmul float %45, %45 %57 = fadd float %55, %56 - %58 = call float @llvm.AMDGPU.rsq(float %57) + %58 = call float @llvm.AMDGPU.rsq.f32(float %57) %59 = fmul float %43, %58 %60 = fmul float %44, %58 %61 = fmul float %45, %58 @@ -212,7 +212,7 @@ declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1 declare <4 x float> @llvm.SI.sample.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32) #1 ; Function Attrs: readnone -declare 
float @llvm.AMDGPU.rsq(float) #3 +declare float @llvm.AMDGPU.rsq.f32(float) #3 ; Function Attrs: readnone declare float @llvm.AMDIL.exp.(float) #3 diff --git a/test/CodeGen/R600/si-sgpr-spill.ll b/test/CodeGen/R600/si-sgpr-spill.ll index b34a757d9b65..53a096513bbc 100644 --- a/test/CodeGen/R600/si-sgpr-spill.ll +++ b/test/CodeGen/R600/si-sgpr-spill.ll @@ -203,7 +203,7 @@ main_body: %198 = fadd float %197, %196 %199 = fmul float %97, %97 %200 = fadd float %198, %199 - %201 = call float @llvm.AMDGPU.rsq(float %200) + %201 = call float @llvm.AMDGPU.rsq.f32(float %200) %202 = fmul float %95, %201 %203 = fmul float %96, %201 %204 = fmul float %202, %29 @@ -384,7 +384,7 @@ IF67: ; preds = %LOOP65 %355 = fadd float %354, %353 %356 = fmul float %352, %352 %357 = fadd float %355, %356 - %358 = call float @llvm.AMDGPU.rsq(float %357) + %358 = call float @llvm.AMDGPU.rsq.f32(float %357) %359 = fmul float %350, %358 %360 = fmul float %351, %358 %361 = fmul float %352, %358 @@ -512,7 +512,7 @@ IF67: ; preds = %LOOP65 %483 = fadd float %482, %481 %484 = fmul float %109, %109 %485 = fadd float %483, %484 - %486 = call float @llvm.AMDGPU.rsq(float %485) + %486 = call float @llvm.AMDGPU.rsq.f32(float %485) %487 = fmul float %107, %486 %488 = fmul float %108, %486 %489 = fmul float %109, %486 @@ -541,7 +541,7 @@ IF67: ; preds = %LOOP65 %512 = fadd float %511, %510 %513 = fmul float %97, %97 %514 = fadd float %512, %513 - %515 = call float @llvm.AMDGPU.rsq(float %514) + %515 = call float @llvm.AMDGPU.rsq.f32(float %514) %516 = fmul float %95, %515 %517 = fmul float %96, %515 %518 = fmul float %97, %515 @@ -658,7 +658,7 @@ declare i32 @llvm.SI.tid() #2 declare float @ceil(float) #3 ; Function Attrs: readnone -declare float @llvm.AMDGPU.rsq(float) #2 +declare float @llvm.AMDGPU.rsq.f32(float) #2 ; Function Attrs: nounwind readnone declare <4 x float> @llvm.SI.sampled.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32) #1 @@ -887,7 +887,7 @@ main_body: %212 = fadd float %211, %210 %213 = fmul float %209, %209 %214 = fadd float %212, %213 - %215 = call float @llvm.AMDGPU.rsq(float %214) + %215 = call float @llvm.AMDGPU.rsq.f32(float %214) %216 = fmul float %205, %215 %217 = fmul float %207, %215 %218 = fmul float %209, %215 @@ -1123,7 +1123,7 @@ IF189: ; preds = %LOOP %434 = fsub float -0.000000e+00, %433 %435 = fadd float 0x3FF00068E0000000, %434 %436 = call float @llvm.AMDIL.clamp.(float %435, float 0.000000e+00, float 1.000000e+00) - %437 = call float @llvm.AMDGPU.rsq(float %436) + %437 = call float @llvm.AMDGPU.rsq.f32(float %436) %438 = fmul float %437, %436 %439 = fsub float -0.000000e+00, %436 %440 = call float @llvm.AMDGPU.cndlt(float %439, float %438, float 0.000000e+00) @@ -1147,7 +1147,7 @@ IF189: ; preds = %LOOP %458 = fadd float %457, %456 %459 = fmul float %455, %455 %460 = fadd float %458, %459 - %461 = call float @llvm.AMDGPU.rsq(float %460) + %461 = call float @llvm.AMDGPU.rsq.f32(float %460) %462 = fmul float %451, %461 %463 = fmul float %453, %461 %464 = fmul float %455, %461 @@ -1257,7 +1257,7 @@ ENDIF197: ; preds = %IF189, %IF198 %559 = fadd float %558, %557 %560 = fmul float %556, %556 %561 = fadd float %559, %560 - %562 = call float @llvm.AMDGPU.rsq(float %561) + %562 = call float @llvm.AMDGPU.rsq.f32(float %561) %563 = fmul float %562, %561 %564 = fsub float -0.000000e+00, %561 %565 = call float @llvm.AMDGPU.cndlt(float %564, float %563, float 0.000000e+00) diff --git a/test/CodeGen/R600/simplify-demanded-bits-build-pair.ll b/test/CodeGen/R600/simplify-demanded-bits-build-pair.ll index 
dee432664e89..e6f8ce8ef0ee 100644 --- a/test/CodeGen/R600/simplify-demanded-bits-build-pair.ll +++ b/test/CodeGen/R600/simplify-demanded-bits-build-pair.ll @@ -1,6 +1,5 @@ -; RUN: llc -verify-machineinstrs -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI %s - ; XFAIL: * +; RUN: llc -verify-machineinstrs -march=r600 -mcpu=SI -mattr=-promote-alloca < %s | FileCheck -check-prefix=SI %s ; 64-bit select was originally lowered with a build_pair, and this ; could be simplified to 1 cndmask instead of 2, but that broken when @@ -16,8 +15,8 @@ define void @trunc_select_i64(i32 addrspace(1)* %out, i64 %a, i64 %b, i32 %c) { ; FIXME: Fix truncating store for local memory ; SI-LABEL: @trunc_load_alloca_i64: -; SI: DS_READ_B32 -; SI-NOT: DS_READ_B64 +; SI: V_MOVRELS_B32 +; SI-NOT: V_MOVRELS_B32 ; SI: S_ENDPGM define void @trunc_load_alloca_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) { %idx = add i32 %a, %b diff --git a/test/CodeGen/R600/ssubo.ll b/test/CodeGen/R600/ssubo.ll new file mode 100644 index 000000000000..b330276ae9e7 --- /dev/null +++ b/test/CodeGen/R600/ssubo.ll @@ -0,0 +1,64 @@ +; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs< %s + +declare { i32, i1 } @llvm.ssub.with.overflow.i32(i32, i32) nounwind readnone +declare { i64, i1 } @llvm.ssub.with.overflow.i64(i64, i64) nounwind readnone + +; FUNC-LABEL: @ssubo_i64_zext +define void @ssubo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { + %ssub = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %a, i64 %b) nounwind + %val = extractvalue { i64, i1 } %ssub, 0 + %carry = extractvalue { i64, i1 } %ssub, 1 + %ext = zext i1 %carry to i64 + %add2 = add i64 %val, %ext + store i64 %add2, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: @s_ssubo_i32 +define void @s_ssubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind { + %ssub = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 %a, i32 %b) nounwind + %val = extractvalue { i32, i1 } %ssub, 0 + %carry = extractvalue { i32, i1 } %ssub, 1 + store i32 %val, i32 addrspace(1)* %out, align 4 + store i1 %carry, i1 addrspace(1)* %carryout + ret void +} + +; FUNC-LABEL: @v_ssubo_i32 +define void @v_ssubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { + %a = load i32 addrspace(1)* %aptr, align 4 + %b = load i32 addrspace(1)* %bptr, align 4 + %ssub = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 %a, i32 %b) nounwind + %val = extractvalue { i32, i1 } %ssub, 0 + %carry = extractvalue { i32, i1 } %ssub, 1 + store i32 %val, i32 addrspace(1)* %out, align 4 + store i1 %carry, i1 addrspace(1)* %carryout + ret void +} + +; FUNC-LABEL: @s_ssubo_i64 +; SI: S_SUB_I32 +; SI: S_SUBB_U32 +define void @s_ssubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind { + %ssub = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %a, i64 %b) nounwind + %val = extractvalue { i64, i1 } %ssub, 0 + %carry = extractvalue { i64, i1 } %ssub, 1 + store i64 %val, i64 addrspace(1)* %out, align 8 + store i1 %carry, i1 addrspace(1)* %carryout + ret void +} + +; FUNC-LABEL: @v_ssubo_i64 +; SI: V_SUB_I32_e32 +; SI: V_SUBB_U32_e32 +define void @v_ssubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { + %a = load i64 addrspace(1)* %aptr, align 4 + %b = load i64 addrspace(1)* %bptr, align 4 + %ssub = 
call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %a, i64 %b) nounwind + %val = extractvalue { i64, i1 } %ssub, 0 + %carry = extractvalue { i64, i1 } %ssub, 1 + store i64 %val, i64 addrspace(1)* %out, align 8 + store i1 %carry, i1 addrspace(1)* %carryout + ret void +} diff --git a/test/CodeGen/R600/store.ll b/test/CodeGen/R600/store.ll index c0c8ccc15a3d..dd275338d7c0 100644 --- a/test/CodeGen/R600/store.ll +++ b/test/CodeGen/R600/store.ll @@ -263,8 +263,7 @@ entry: ; CM-CHECK: LDS_WRITE ; CM-CHECK: LDS_WRITE ; SI-CHECK-LABEL: @store_local_v2i32 -; SI-CHECK: DS_WRITE_B32 -; SI-CHECK: DS_WRITE_B32 +; SI-CHECK: DS_WRITE_B64 define void @store_local_v2i32(<2 x i32> addrspace(3)* %out, <2 x i32> %in) { entry: store <2 x i32> %in, <2 x i32> addrspace(3)* %out diff --git a/test/CodeGen/R600/sub.ll b/test/CodeGen/R600/sub.ll index 58523d068e5e..8e64148142d2 100644 --- a/test/CodeGen/R600/sub.ll +++ b/test/CodeGen/R600/sub.ll @@ -1,5 +1,7 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG --check-prefix=FUNC %s -;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s +;RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +;RUN: llc -march=r600 -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare i32 @llvm.r600.read.tidig.x() readnone ;FUNC-LABEL: @test2 ;EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} @@ -37,23 +39,37 @@ define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { ret void } -;FUNC_LABEL: @test5 +; FUNC-LABEL: @s_sub_i64: +; SI: S_SUB_I32 +; SI: S_SUBB_U32 -;EG-DAG: SETGE_UINT -;EG-DAG: CNDE_INT -;EG-DAG: SUB_INT -;EG-DAG: SUB_INT -;EG-DAG: SUB_INT +; EG-DAG: SETGE_UINT +; EG-DAG: CNDE_INT +; EG-DAG: SUB_INT +; EG-DAG: SUB_INT +; EG-DAG: SUB_INT +define void @s_sub_i64(i64 addrspace(1)* noalias %out, i64 %a, i64 %b) nounwind { + %result = sub i64 %a, %b + store i64 %result, i64 addrspace(1)* %out, align 8 + ret void +} -;SI: S_NOT_B64 -;SI-DAG: S_ADD_I32 -;SI-DAG: S_ADDC_U32 -;SI-DAG: S_ADD_I32 -;SI-DAG: S_ADDC_U32 +; FUNC-LABEL: @v_sub_i64: +; SI: V_SUB_I32_e32 +; SI: V_SUBB_U32_e32 -define void @test5(i64 addrspace(1)* %out, i64 %a, i64 %b) { -entry: - %0 = sub i64 %a, %b - store i64 %0, i64 addrspace(1)* %out +; EG-DAG: SETGE_UINT +; EG-DAG: CNDE_INT +; EG-DAG: SUB_INT +; EG-DAG: SUB_INT +; EG-DAG: SUB_INT +define void @v_sub_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %inA, i64 addrspace(1)* noalias %inB) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() readnone + %a_ptr = getelementptr i64 addrspace(1)* %inA, i32 %tid + %b_ptr = getelementptr i64 addrspace(1)* %inB, i32 %tid + %a = load i64 addrspace(1)* %a_ptr + %b = load i64 addrspace(1)* %b_ptr + %result = sub i64 %a, %b + store i64 %result, i64 addrspace(1)* %out, align 8 ret void } diff --git a/test/CodeGen/R600/uaddo.ll b/test/CodeGen/R600/uaddo.ll index 3b69687b362e..a80e502eef2a 100644 --- a/test/CodeGen/R600/uaddo.ll +++ b/test/CodeGen/R600/uaddo.ll @@ -1,8 +1,10 @@ -; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs< %s +declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32) nounwind readnone declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64) nounwind readnone -; 
SI-LABEL: @uaddo_i64_zext +; FUNC-LABEL: @uaddo_i64_zext ; SI: ADD ; SI: ADDC ; SI: ADDC @@ -15,3 +17,53 @@ define void @uaddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { store i64 %add2, i64 addrspace(1)* %out, align 8 ret void } + +; FUNC-LABEL: @s_uaddo_i32 +; SI: S_ADD_I32 +define void @s_uaddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind { + %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b) nounwind + %val = extractvalue { i32, i1 } %uadd, 0 + %carry = extractvalue { i32, i1 } %uadd, 1 + store i32 %val, i32 addrspace(1)* %out, align 4 + store i1 %carry, i1 addrspace(1)* %carryout + ret void +} + +; FUNC-LABEL: @v_uaddo_i32 +; SI: V_ADD_I32 +define void @v_uaddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { + %a = load i32 addrspace(1)* %aptr, align 4 + %b = load i32 addrspace(1)* %bptr, align 4 + %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b) nounwind + %val = extractvalue { i32, i1 } %uadd, 0 + %carry = extractvalue { i32, i1 } %uadd, 1 + store i32 %val, i32 addrspace(1)* %out, align 4 + store i1 %carry, i1 addrspace(1)* %carryout + ret void +} + +; FUNC-LABEL: @s_uaddo_i64 +; SI: S_ADD_I32 +; SI: S_ADDC_U32 +define void @s_uaddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind { + %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) nounwind + %val = extractvalue { i64, i1 } %uadd, 0 + %carry = extractvalue { i64, i1 } %uadd, 1 + store i64 %val, i64 addrspace(1)* %out, align 8 + store i1 %carry, i1 addrspace(1)* %carryout + ret void +} + +; FUNC-LABEL: @v_uaddo_i64 +; SI: V_ADD_I32 +; SI: V_ADDC_U32 +define void @v_uaddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { + %a = load i64 addrspace(1)* %aptr, align 4 + %b = load i64 addrspace(1)* %bptr, align 4 + %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) nounwind + %val = extractvalue { i64, i1 } %uadd, 0 + %carry = extractvalue { i64, i1 } %uadd, 1 + store i64 %val, i64 addrspace(1)* %out, align 8 + store i1 %carry, i1 addrspace(1)* %carryout + ret void +} diff --git a/test/CodeGen/R600/udivrem.ll b/test/CodeGen/R600/udivrem.ll new file mode 100644 index 000000000000..5f5753adca3f --- /dev/null +++ b/test/CodeGen/R600/udivrem.ll @@ -0,0 +1,358 @@ +; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG --check-prefix=FUNC %s + +; FUNC-LABEL: @test_udivrem +; EG: RECIP_UINT +; EG-DAG: MULHI +; EG-DAG: MULLO_INT +; EG-DAG: SUB_INT +; EG: CNDE_INT +; EG: MULHI +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG: CNDE_INT +; EG: MULHI +; EG: MULLO_INT +; EG: SUB_INT +; EG-DAG: SETGE_UINT +; EG-DAG: SETGE_UINT +; EG: AND_INT +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT + +; SI: V_RCP_IFLAG_F32_e32 [[RCP:v[0-9]+]] +; SI-DAG: V_MUL_HI_U32 [[RCP_HI:v[0-9]+]], [[RCP]] +; SI-DAG: V_MUL_LO_I32 [[RCP_LO:v[0-9]+]], [[RCP]] +; SI-DAG: V_SUB_I32_e32 [[NEG_RCP_LO:v[0-9]+]], 0, [[RCP_LO]] +; SI: V_CNDMASK_B32_e64 +; SI: V_MUL_HI_U32 [[E:v[0-9]+]], {{v[0-9]+}}, [[RCP]] +; SI-DAG: V_ADD_I32_e32 [[RCP_A_E:v[0-9]+]], [[E]], [[RCP]] +; SI-DAG: V_SUBREV_I32_e32 [[RCP_S_E:v[0-9]+]], [[E]], [[RCP]] +; SI: V_CNDMASK_B32_e64 +; 
SI: V_MUL_HI_U32 [[Quotient:v[0-9]+]] +; SI: V_MUL_LO_I32 [[Num_S_Remainder:v[0-9]+]] +; SI-DAG: V_SUB_I32_e32 [[Remainder:v[0-9]+]], {{[vs][0-9]+}}, [[Num_S_Remainder]] +; SI-DAG: V_CNDMASK_B32_e64 +; SI-DAG: V_CNDMASK_B32_e64 +; SI: V_AND_B32_e32 [[Tmp1:v[0-9]+]] +; SI-DAG: V_ADD_I32_e32 [[Quotient_A_One:v[0-9]+]], 1, [[Quotient]] +; SI-DAG: V_SUBREV_I32_e32 [[Quotient_S_One:v[0-9]+]], +; SI-DAG: V_CNDMASK_B32_e64 +; SI-DAG: V_CNDMASK_B32_e64 +; SI-DAG: V_ADD_I32_e32 [[Remainder_A_Den:v[0-9]+]], +; SI-DAG: V_SUBREV_I32_e32 [[Remainder_S_Den:v[0-9]+]], +; SI-DAG: V_CNDMASK_B32_e64 +; SI-DAG: V_CNDMASK_B32_e64 +; SI: S_ENDPGM +define void @test_udivrem(i32 addrspace(1)* %out, i32 %x, i32 %y) { + %result0 = udiv i32 %x, %y + store i32 %result0, i32 addrspace(1)* %out + %result1 = urem i32 %x, %y + store i32 %result1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: @test_udivrem_v2 +; EG-DAG: RECIP_UINT +; EG-DAG: MULHI +; EG-DAG: MULLO_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: MULHI +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: MULHI +; EG-DAG: MULLO_INT +; EG-DAG: SUB_INT +; EG-DAG: SETGE_UINT +; EG-DAG: SETGE_UINT +; EG-DAG: AND_INT +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +; EG-DAG: RECIP_UINT +; EG-DAG: MULHI +; EG-DAG: MULLO_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: MULHI +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: MULHI +; EG-DAG: MULLO_INT +; EG-DAG: SUB_INT +; EG-DAG: SETGE_UINT +; EG-DAG: SETGE_UINT +; EG-DAG: AND_INT +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT + +; SI-DAG: V_RCP_IFLAG_F32_e32 [[FIRST_RCP:v[0-9]+]] +; SI-DAG: V_MUL_HI_U32 [[FIRST_RCP_HI:v[0-9]+]], [[FIRST_RCP]] +; SI-DAG: V_MUL_LO_I32 [[FIRST_RCP_LO:v[0-9]+]], [[FIRST_RCP]] +; SI-DAG: V_SUB_I32_e32 [[FIRST_NEG_RCP_LO:v[0-9]+]], 0, [[FIRST_RCP_LO]] +; SI-DAG: V_CNDMASK_B32_e64 +; SI-DAG: V_MUL_HI_U32 [[FIRST_E:v[0-9]+]], {{v[0-9]+}}, [[FIRST_RCP]] +; SI-DAG: V_ADD_I32_e32 [[FIRST_RCP_A_E:v[0-9]+]], [[FIRST_E]], [[FIRST_RCP]] +; SI-DAG: V_SUBREV_I32_e32 [[FIRST_RCP_S_E:v[0-9]+]], [[FIRST_E]], [[FIRST_RCP]] +; SI-DAG: V_CNDMASK_B32_e64 +; SI-DAG: V_MUL_HI_U32 [[FIRST_Quotient:v[0-9]+]] +; SI-DAG: V_MUL_LO_I32 [[FIRST_Num_S_Remainder:v[0-9]+]] +; SI-DAG: V_SUB_I32_e32 [[FIRST_Remainder:v[0-9]+]], {{[vs][0-9]+}}, [[FIRST_Num_S_Remainder]] +; SI-DAG: V_CNDMASK_B32_e64 +; SI-DAG: V_CNDMASK_B32_e64 +; SI-DAG: V_AND_B32_e32 [[FIRST_Tmp1:v[0-9]+]] +; SI-DAG: V_ADD_I32_e32 [[FIRST_Quotient_A_One:v[0-9]+]], {{.*}}, [[FIRST_Quotient]] +; SI-DAG: V_SUBREV_I32_e32 [[FIRST_Quotient_S_One:v[0-9]+]], +; SI-DAG: V_CNDMASK_B32_e64 +; SI-DAG: V_CNDMASK_B32_e64 +; SI-DAG: V_ADD_I32_e32 [[FIRST_Remainder_A_Den:v[0-9]+]], +; SI-DAG: V_SUBREV_I32_e32 [[FIRST_Remainder_S_Den:v[0-9]+]], +; SI-DAG: V_CNDMASK_B32_e64 +; SI-DAG: V_CNDMASK_B32_e64 +; SI-DAG: V_RCP_IFLAG_F32_e32 [[SECOND_RCP:v[0-9]+]] +; SI-DAG: V_MUL_HI_U32 [[SECOND_RCP_HI:v[0-9]+]], [[SECOND_RCP]] +; SI-DAG: V_MUL_LO_I32 [[SECOND_RCP_LO:v[0-9]+]], [[SECOND_RCP]] +; SI-DAG: V_SUB_I32_e32 [[SECOND_NEG_RCP_LO:v[0-9]+]], 0, [[SECOND_RCP_LO]] +; SI-DAG: V_CNDMASK_B32_e64 +; SI-DAG: V_MUL_HI_U32 [[SECOND_E:v[0-9]+]], {{v[0-9]+}}, [[SECOND_RCP]] +; SI-DAG: V_ADD_I32_e32 [[SECOND_RCP_A_E:v[0-9]+]], [[SECOND_E]], [[SECOND_RCP]] +; SI-DAG: V_SUBREV_I32_e32 [[SECOND_RCP_S_E:v[0-9]+]], 
[[SECOND_E]], [[SECOND_RCP]] +; SI-DAG: V_CNDMASK_B32_e64 +; SI-DAG: V_MUL_HI_U32 [[SECOND_Quotient:v[0-9]+]] +; SI-DAG: V_MUL_LO_I32 [[SECOND_Num_S_Remainder:v[0-9]+]] +; SI-DAG: V_SUB_I32_e32 [[SECOND_Remainder:v[0-9]+]], {{[vs][0-9]+}}, [[SECOND_Num_S_Remainder]] +; SI-DAG: V_CNDMASK_B32_e64 +; SI-DAG: V_CNDMASK_B32_e64 +; SI-DAG: V_AND_B32_e32 [[SECOND_Tmp1:v[0-9]+]] +; SI-DAG: V_ADD_I32_e32 [[SECOND_Quotient_A_One:v[0-9]+]], {{.*}}, [[SECOND_Quotient]] +; SI-DAG: V_SUBREV_I32_e32 [[SECOND_Quotient_S_One:v[0-9]+]], +; SI-DAG: V_CNDMASK_B32_e64 +; SI-DAG: V_CNDMASK_B32_e64 +; SI-DAG: V_ADD_I32_e32 [[SECOND_Remainder_A_Den:v[0-9]+]], +; SI-DAG: V_SUBREV_I32_e32 [[SECOND_Remainder_S_Den:v[0-9]+]], +; SI-DAG: V_CNDMASK_B32_e64 +; SI-DAG: V_CNDMASK_B32_e64 +; SI: S_ENDPGM +define void @test_udivrem_v2(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) { + %result0 = udiv <2 x i32> %x, %y + store <2 x i32> %result0, <2 x i32> addrspace(1)* %out + %result1 = urem <2 x i32> %x, %y + store <2 x i32> %result1, <2 x i32> addrspace(1)* %out + ret void +} + + +; FUNC-LABEL: @test_udivrem_v4 +; EG-DAG: RECIP_UINT +; EG-DAG: MULHI +; EG-DAG: MULLO_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: MULHI +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: MULHI +; EG-DAG: MULLO_INT +; EG-DAG: SUB_INT +; EG-DAG: SETGE_UINT +; EG-DAG: SETGE_UINT +; EG-DAG: AND_INT +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +; EG-DAG: RECIP_UINT +; EG-DAG: MULHI +; EG-DAG: MULLO_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: MULHI +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: MULHI +; EG-DAG: MULLO_INT +; EG-DAG: SUB_INT +; EG-DAG: SETGE_UINT +; EG-DAG: SETGE_UINT +; EG-DAG: AND_INT +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +; EG-DAG: RECIP_UINT +; EG-DAG: MULHI +; EG-DAG: MULLO_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: MULHI +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: MULHI +; EG-DAG: MULLO_INT +; EG-DAG: SUB_INT +; EG-DAG: SETGE_UINT +; EG-DAG: SETGE_UINT +; EG-DAG: AND_INT +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +; EG-DAG: RECIP_UINT +; EG-DAG: MULHI +; EG-DAG: MULLO_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: MULHI +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: MULHI +; EG-DAG: MULLO_INT +; EG-DAG: SUB_INT +; EG-DAG: SETGE_UINT +; EG-DAG: SETGE_UINT +; EG-DAG: AND_INT +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT + +; SI-DAG: V_RCP_IFLAG_F32_e32 [[FIRST_RCP:v[0-9]+]] +; SI-DAG: V_MUL_HI_U32 [[FIRST_RCP_HI:v[0-9]+]], [[FIRST_RCP]] +; SI-DAG: V_MUL_LO_I32 [[FIRST_RCP_LO:v[0-9]+]], [[FIRST_RCP]] +; SI-DAG: V_SUB_I32_e32 [[FIRST_NEG_RCP_LO:v[0-9]+]], 0, [[FIRST_RCP_LO]] +; SI-DAG: V_CNDMASK_B32_e64 +; SI-DAG: V_MUL_HI_U32 [[FIRST_E:v[0-9]+]], {{v[0-9]+}}, [[FIRST_RCP]] +; SI-DAG: V_ADD_I32_e32 [[FIRST_RCP_A_E:v[0-9]+]], [[FIRST_E]], [[FIRST_RCP]] +; SI-DAG: V_SUBREV_I32_e32 [[FIRST_RCP_S_E:v[0-9]+]], [[FIRST_E]], [[FIRST_RCP]] +; SI-DAG: V_CNDMASK_B32_e64 +; SI-DAG: V_MUL_HI_U32 [[FIRST_Quotient:v[0-9]+]] +; SI-DAG: V_MUL_LO_I32 
[[FIRST_Num_S_Remainder:v[0-9]+]] +; SI-DAG: V_SUB_I32_e32 [[FIRST_Remainder:v[0-9]+]], {{[vs][0-9]+}}, [[FIRST_Num_S_Remainder]] +; SI-DAG: V_CNDMASK_B32_e64 +; SI-DAG: V_CNDMASK_B32_e64 +; SI-DAG: V_AND_B32_e32 [[FIRST_Tmp1:v[0-9]+]] +; SI-DAG: V_ADD_I32_e32 [[FIRST_Quotient_A_One:v[0-9]+]], {{.*}}, [[FIRST_Quotient]] +; SI-DAG: V_SUBREV_I32_e32 [[FIRST_Quotient_S_One:v[0-9]+]], +; SI-DAG: V_CNDMASK_B32_e64 +; SI-DAG: V_CNDMASK_B32_e64 +; SI-DAG: V_ADD_I32_e32 [[FIRST_Remainder_A_Den:v[0-9]+]], +; SI-DAG: V_SUBREV_I32_e32 [[FIRST_Remainder_S_Den:v[0-9]+]], +; SI-DAG: V_CNDMASK_B32_e64 +; SI-DAG: V_CNDMASK_B32_e64 +; SI-DAG: V_RCP_IFLAG_F32_e32 [[SECOND_RCP:v[0-9]+]] +; SI-DAG: V_MUL_HI_U32 [[SECOND_RCP_HI:v[0-9]+]], [[SECOND_RCP]] +; SI-DAG: V_MUL_LO_I32 [[SECOND_RCP_LO:v[0-9]+]], [[SECOND_RCP]] +; SI-DAG: V_SUB_I32_e32 [[SECOND_NEG_RCP_LO:v[0-9]+]], 0, [[SECOND_RCP_LO]] +; SI-DAG: V_CNDMASK_B32_e64 +; SI-DAG: V_MUL_HI_U32 [[SECOND_E:v[0-9]+]], {{v[0-9]+}}, [[SECOND_RCP]] +; SI-DAG: V_ADD_I32_e32 [[SECOND_RCP_A_E:v[0-9]+]], [[SECOND_E]], [[SECOND_RCP]] +; SI-DAG: V_SUBREV_I32_e32 [[SECOND_RCP_S_E:v[0-9]+]], [[SECOND_E]], [[SECOND_RCP]] +; SI-DAG: V_CNDMASK_B32_e64 +; SI-DAG: V_MUL_HI_U32 [[SECOND_Quotient:v[0-9]+]] +; SI-DAG: V_MUL_LO_I32 [[SECOND_Num_S_Remainder:v[0-9]+]] +; SI-DAG: V_SUB_I32_e32 [[SECOND_Remainder:v[0-9]+]], {{[vs][0-9]+}}, [[SECOND_Num_S_Remainder]] +; SI-DAG: V_CNDMASK_B32_e64 +; SI-DAG: V_CNDMASK_B32_e64 +; SI-DAG: V_AND_B32_e32 [[SECOND_Tmp1:v[0-9]+]] +; SI-DAG: V_ADD_I32_e32 [[SECOND_Quotient_A_One:v[0-9]+]], {{.*}}, [[SECOND_Quotient]] +; SI-DAG: V_SUBREV_I32_e32 [[SECOND_Quotient_S_One:v[0-9]+]], +; SI-DAG: V_CNDMASK_B32_e64 +; SI-DAG: V_CNDMASK_B32_e64 +; SI-DAG: V_ADD_I32_e32 [[SECOND_Remainder_A_Den:v[0-9]+]], +; SI-DAG: V_SUBREV_I32_e32 [[SECOND_Remainder_S_Den:v[0-9]+]], +; SI-DAG: V_CNDMASK_B32_e64 +; SI-DAG: V_CNDMASK_B32_e64 +; SI-DAG: V_RCP_IFLAG_F32_e32 [[THIRD_RCP:v[0-9]+]] +; SI-DAG: V_MUL_HI_U32 [[THIRD_RCP_HI:v[0-9]+]], [[THIRD_RCP]] +; SI-DAG: V_MUL_LO_I32 [[THIRD_RCP_LO:v[0-9]+]], [[THIRD_RCP]] +; SI-DAG: V_SUB_I32_e32 [[THIRD_NEG_RCP_LO:v[0-9]+]], 0, [[THIRD_RCP_LO]] +; SI-DAG: V_CNDMASK_B32_e64 +; SI-DAG: V_MUL_HI_U32 [[THIRD_E:v[0-9]+]], {{v[0-9]+}}, [[THIRD_RCP]] +; SI-DAG: V_ADD_I32_e32 [[THIRD_RCP_A_E:v[0-9]+]], [[THIRD_E]], [[THIRD_RCP]] +; SI-DAG: V_SUBREV_I32_e32 [[THIRD_RCP_S_E:v[0-9]+]], [[THIRD_E]], [[THIRD_RCP]] +; SI-DAG: V_CNDMASK_B32_e64 +; SI-DAG: V_MUL_HI_U32 [[THIRD_Quotient:v[0-9]+]] +; SI-DAG: V_MUL_LO_I32 [[THIRD_Num_S_Remainder:v[0-9]+]] +; SI-DAG: V_SUB_I32_e32 [[THIRD_Remainder:v[0-9]+]], {{[vs][0-9]+}}, [[THIRD_Num_S_Remainder]] +; SI-DAG: V_CNDMASK_B32_e64 +; SI-DAG: V_CNDMASK_B32_e64 +; SI-DAG: V_AND_B32_e32 [[THIRD_Tmp1:v[0-9]+]] +; SI-DAG: V_ADD_I32_e32 [[THIRD_Quotient_A_One:v[0-9]+]], {{.*}}, [[THIRD_Quotient]] +; SI-DAG: V_SUBREV_I32_e32 [[THIRD_Quotient_S_One:v[0-9]+]], +; SI-DAG: V_CNDMASK_B32_e64 +; SI-DAG: V_CNDMASK_B32_e64 +; SI-DAG: V_ADD_I32_e32 [[THIRD_Remainder_A_Den:v[0-9]+]], +; SI-DAG: V_SUBREV_I32_e32 [[THIRD_Remainder_S_Den:v[0-9]+]], +; SI-DAG: V_CNDMASK_B32_e64 +; SI-DAG: V_CNDMASK_B32_e64 +; SI-DAG: V_RCP_IFLAG_F32_e32 [[FOURTH_RCP:v[0-9]+]] +; SI-DAG: V_MUL_HI_U32 [[FOURTH_RCP_HI:v[0-9]+]], [[FOURTH_RCP]] +; SI-DAG: V_MUL_LO_I32 [[FOURTH_RCP_LO:v[0-9]+]], [[FOURTH_RCP]] +; SI-DAG: V_SUB_I32_e32 [[FOURTH_NEG_RCP_LO:v[0-9]+]], 0, [[FOURTH_RCP_LO]] +; SI-DAG: V_CNDMASK_B32_e64 +; SI-DAG: V_MUL_HI_U32 [[FOURTH_E:v[0-9]+]], {{v[0-9]+}}, [[FOURTH_RCP]] +; SI-DAG: V_ADD_I32_e32 
[[FOURTH_RCP_A_E:v[0-9]+]], [[FOURTH_E]], [[FOURTH_RCP]] +; SI-DAG: V_SUBREV_I32_e32 [[FOURTH_RCP_S_E:v[0-9]+]], [[FOURTH_E]], [[FOURTH_RCP]] +; SI-DAG: V_CNDMASK_B32_e64 +; SI-DAG: V_MUL_HI_U32 [[FOURTH_Quotient:v[0-9]+]] +; SI-DAG: V_MUL_LO_I32 [[FOURTH_Num_S_Remainder:v[0-9]+]] +; SI-DAG: V_SUB_I32_e32 [[FOURTH_Remainder:v[0-9]+]], {{[vs][0-9]+}}, [[FOURTH_Num_S_Remainder]] +; SI-DAG: V_CNDMASK_B32_e64 +; SI-DAG: V_CNDMASK_B32_e64 +; SI-DAG: V_AND_B32_e32 [[FOURTH_Tmp1:v[0-9]+]] +; SI-DAG: V_ADD_I32_e32 [[FOURTH_Quotient_A_One:v[0-9]+]], {{.*}}, [[FOURTH_Quotient]] +; SI-DAG: V_SUBREV_I32_e32 [[FOURTH_Quotient_S_One:v[0-9]+]], +; SI-DAG: V_CNDMASK_B32_e64 +; SI-DAG: V_CNDMASK_B32_e64 +; SI-DAG: V_ADD_I32_e32 [[FOURTH_Remainder_A_Den:v[0-9]+]], +; SI-DAG: V_SUBREV_I32_e32 [[FOURTH_Remainder_S_Den:v[0-9]+]], +; SI-DAG: V_CNDMASK_B32_e64 +; SI-DAG: V_CNDMASK_B32_e64 +; SI: S_ENDPGM +define void @test_udivrem_v4(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) { + %result0 = udiv <4 x i32> %x, %y + store <4 x i32> %result0, <4 x i32> addrspace(1)* %out + %result1 = urem <4 x i32> %x, %y + store <4 x i32> %result1, <4 x i32> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/R600/usubo.ll b/test/CodeGen/R600/usubo.ll new file mode 100644 index 000000000000..d57a2c7f773e --- /dev/null +++ b/test/CodeGen/R600/usubo.ll @@ -0,0 +1,66 @@ +; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs< %s + +declare { i32, i1 } @llvm.usub.with.overflow.i32(i32, i32) nounwind readnone +declare { i64, i1 } @llvm.usub.with.overflow.i64(i64, i64) nounwind readnone + +; FUNC-LABEL: @usubo_i64_zext +define void @usubo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { + %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b) nounwind + %val = extractvalue { i64, i1 } %usub, 0 + %carry = extractvalue { i64, i1 } %usub, 1 + %ext = zext i1 %carry to i64 + %add2 = add i64 %val, %ext + store i64 %add2, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: @s_usubo_i32 +; SI: S_SUB_I32 +define void @s_usubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind { + %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b) nounwind + %val = extractvalue { i32, i1 } %usub, 0 + %carry = extractvalue { i32, i1 } %usub, 1 + store i32 %val, i32 addrspace(1)* %out, align 4 + store i1 %carry, i1 addrspace(1)* %carryout + ret void +} + +; FUNC-LABEL: @v_usubo_i32 +; SI: V_SUBREV_I32_e32 +define void @v_usubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { + %a = load i32 addrspace(1)* %aptr, align 4 + %b = load i32 addrspace(1)* %bptr, align 4 + %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b) nounwind + %val = extractvalue { i32, i1 } %usub, 0 + %carry = extractvalue { i32, i1 } %usub, 1 + store i32 %val, i32 addrspace(1)* %out, align 4 + store i1 %carry, i1 addrspace(1)* %carryout + ret void +} + +; FUNC-LABEL: @s_usubo_i64 +; SI: S_SUB_I32 +; SI: S_SUBB_U32 +define void @s_usubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind { + %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b) nounwind + %val = extractvalue { i64, i1 } %usub, 0 + %carry = extractvalue { i64, i1 } %usub, 1 + store i64 %val, i64 addrspace(1)* %out, align 8 + store i1 %carry, i1 addrspace(1)* %carryout + ret void +} + 
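+; Editorial note, not part of the original test (our reading of the check
+; prefixes, stated as an assumption): the S_* opcodes checked in the
+; @s_usubo_* functions above are SI scalar-ALU instructions, selected
+; because the operands arrive as uniform kernel arguments, while the V_*
+; opcodes checked below are vector-ALU instructions, needed because the
+; operands are loaded per work-item and may diverge.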
+; FUNC-LABEL: @v_usubo_i64 +; SI: V_SUB_I32 +; SI: V_SUBB_U32 +define void @v_usubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { + %a = load i64 addrspace(1)* %aptr, align 4 + %b = load i64 addrspace(1)* %bptr, align 4 + %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b) nounwind + %val = extractvalue { i64, i1 } %usub, 0 + %carry = extractvalue { i64, i1 } %usub, 1 + store i64 %val, i64 addrspace(1)* %out, align 8 + store i1 %carry, i1 addrspace(1)* %carryout + ret void +} diff --git a/test/CodeGen/R600/vector-alloca.ll b/test/CodeGen/R600/vector-alloca.ll index 6543f6d05933..ec1995f68089 100644 --- a/test/CodeGen/R600/vector-alloca.ll +++ b/test/CodeGen/R600/vector-alloca.ll @@ -1,5 +1,6 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG --check-prefix=FUNC %s -; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=verde -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=verde -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC %s ; FUNC-LABEL: @vector_read ; EG: MOV @@ -53,7 +54,7 @@ entry: ; This test should be optimized to: ; store i32 0, i32 addrspace(1)* %out ; FUNC-LABEL: @bitcast_gep -; CHECK: STORE_RAW +; EG: STORE_RAW define void @bitcast_gep(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) { entry: %0 = alloca [4 x i32] diff --git a/test/CodeGen/R600/work-item-intrinsics.ll b/test/CodeGen/R600/work-item-intrinsics.ll index 90079b005bb1..01236590742a 100644 --- a/test/CodeGen/R600/work-item-intrinsics.ll +++ b/test/CodeGen/R600/work-item-intrinsics.ll @@ -127,12 +127,12 @@ entry: ret void } -; The tgid values are stored in ss offset by the number of user ss. -; Currently we always use exactly 2 user ss for the pointer to the +; The tgid values are stored in sgprs offset by the number of user sgprs. +; Currently we always use exactly 2 user sgprs for the pointer to the ; kernel arguments, but this may change in the future. 
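; Editorial note: the updated checks below accordingly expect the tgid
; values in s4/s5/s6 rather than s2/s3/s4; the two extra sgprs ahead of
; them are presumably additional reserved user sgprs (an assumption, not
; stated by this test).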
; SI-CHECK: @tgid_x -; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], s2 +; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], s4 ; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]] define void @tgid_x (i32 addrspace(1)* %out) { entry: @@ -142,7 +142,7 @@ entry: } ; SI-CHECK: @tgid_y -; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], s3 +; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], s5 ; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]] define void @tgid_y (i32 addrspace(1)* %out) { entry: @@ -152,7 +152,7 @@ entry: } ; SI-CHECK: @tgid_z -; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], s4 +; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], s6 ; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]] define void @tgid_z (i32 addrspace(1)* %out) { entry: diff --git a/test/CodeGen/R600/xor.ll b/test/CodeGen/R600/xor.ll index 004304173990..e14bd7127231 100644 --- a/test/CodeGen/R600/xor.ll +++ b/test/CodeGen/R600/xor.ll @@ -42,7 +42,7 @@ define void @xor_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in ;EG-CHECK: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], PS}} ;SI-CHECK: @xor_i1 -;SI-CHECK: S_XOR_B64 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] +;SI-CHECK: V_XOR_B32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define void @xor_i1(float addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* %in1) { %a = load float addrspace(1) * %in0 @@ -130,3 +130,29 @@ define void @vector_not_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i64 store i64 %result, i64 addrspace(1)* %out ret void } + +; Test that we have a pattern to match xor inside a branch. +; Note that in the future the backend may be smart enough to +; use an SALU instruction for this. + +; SI-CHECK-LABEL: @xor_cf +; SI-CHECK: V_XOR +; SI-CHECK: V_XOR +define void @xor_cf(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b) { +entry: + %0 = icmp eq i64 %a, 0 + br i1 %0, label %if, label %else + +if: + %1 = xor i64 %a, %b + br label %endif + +else: + %2 = load i64 addrspace(1)* %in + br label %endif + +endif: + %3 = phi i64 [%1, %if], [%2, %else] + store i64 %3, i64 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/Thumb2/tpsoft.ll b/test/CodeGen/Thumb2/tpsoft.ll new file mode 100644 index 000000000000..6ab8bf01761b --- /dev/null +++ b/test/CodeGen/Thumb2/tpsoft.ll @@ -0,0 +1,54 @@ +; RUN: llc %s -mtriple=thumbv7-linux-gnueabi -o - | \ +; RUN: FileCheck -check-prefix=ELFASM %s +; RUN: llc %s -mtriple=thumbebv7-linux-gnueabi -o - | \ +; RUN: FileCheck -check-prefix=ELFASM %s +; RUN: llc %s -mtriple=thumbv7-linux-gnueabi -filetype=obj -o - | \ +; RUN: llvm-readobj -s -sd | FileCheck -check-prefix=ELFOBJ -check-prefix=ELFOBJ-LE %s +; RUN: llc %s -mtriple=thumbebv7-linux-gnueabi -filetype=obj -o - | \ +; RUN: llvm-readobj -s -sd | FileCheck -check-prefix=ELFOBJ -check-prefix=ELFOBJ-BE %s + +;; Make sure that bl __aeabi_read_tp is materialized and fixed up correctly +;; in the obj case. 
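+;;
+;; Editorial note (an assumption, since the test does not say why): with a
+;; generic CPU the backend cannot assume a hardware thread-pointer
+;; register, so reads of thread_local globals such as @i are lowered to a
+;; call to the EABI helper __aeabi_read_tp, and the SectionData checks
+;; verify that the resulting Thumb-2 BL gets the right fixup in both
+;; endiannesses.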
+ +@i = external thread_local global i32 +@a = external global i8 +@b = external global [10 x i8] + +define arm_aapcs_vfpcc i32 @main() nounwind { +entry: + %0 = load i32* @i, align 4 + switch i32 %0, label %bb2 [ + i32 12, label %bb + i32 13, label %bb1 + ] + +bb: ; preds = %entry + %1 = tail call arm_aapcs_vfpcc i32 @foo(i8* @a) nounwind + ret i32 %1 +; ELFASM: bl __aeabi_read_tp + + +; ELFOBJ: Sections [ +; ELFOBJ: Section { +; ELFOBJ: Name: .text +; ELFOBJ-LE: SectionData ( +;;; BL __aeabi_read_tp is ---------+ +;;; V +; ELFOBJ-LE-NEXT: 0000: 2DE90048 0E487844 0168FFF7 FEFF4058 +; ELFOBJ-BE: SectionData ( +;;; BL __aeabi_read_tp is ---------+ +;;; V +; ELFOBJ-BE-NEXT: 0000: E92D4800 480E4478 6801F7FF FFFE5840 + + +bb1: ; preds = %entry + %2 = tail call arm_aapcs_vfpcc i32 @bar(i32* bitcast ([10 x i8]* @b to i32*)) nounwind + ret i32 %2 + +bb2: ; preds = %entry + ret i32 -1 +} + +declare arm_aapcs_vfpcc i32 @foo(i8*) + +declare arm_aapcs_vfpcc i32 @bar(i32*) diff --git a/test/CodeGen/X86/2007-05-05-Personality.ll b/test/CodeGen/X86/2007-05-05-Personality.ll index 5b8fe72b5d0f..b99c58c6e4af 100644 --- a/test/CodeGen/X86/2007-05-05-Personality.ll +++ b/test/CodeGen/X86/2007-05-05-Personality.ll @@ -1,12 +1,14 @@ ; RUN: llc < %s -mtriple=i686-pc-linux-gnu -o - | FileCheck %s --check-prefix=LIN -; RUN: llc < %s -mtriple=x86_64-pc-windows-gnu -o - | FileCheck %s --check-prefix=LIN ; RUN: llc < %s -mtriple=i386-pc-mingw32 -o - | FileCheck %s --check-prefix=WIN ; RUN: llc < %s -mtriple=i686-pc-windows-gnu -o - | FileCheck %s --check-prefix=WIN +; RUN: llc < %s -mtriple=x86_64-pc-windows-gnu -o - | FileCheck %s --check-prefix=WIN64 ; LIN: .cfi_personality 0, __gnat_eh_personality ; LIN: .cfi_lsda 0, .Lexception0 ; WIN: .cfi_personality 0, ___gnat_eh_personality ; WIN: .cfi_lsda 0, Lexception0 +; WIN64: .seh_handler __gnat_eh_personality +; WIN64: .seh_handlerdata @error = external global i8 @@ -15,7 +17,7 @@ entry: invoke void @raise() to label %eh_then unwind label %unwind -unwind: ; preds = %entry +unwind: ; preds = %entry %eh_ptr = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gnat_eh_personality to i8*) catch i8* @error %eh_select = extractvalue { i8*, i32 } %eh_ptr, 1 diff --git a/test/CodeGen/X86/2007-09-18-ShuffleXformBug.ll b/test/CodeGen/X86/2007-09-18-ShuffleXformBug.ll deleted file mode 100644 index 0ae1897e60e9..000000000000 --- a/test/CodeGen/X86/2007-09-18-ShuffleXformBug.ll +++ /dev/null @@ -1,30 +0,0 @@ -; RUN: llc < %s -march=x86 -mattr=+sse2 | grep -- -86 - -define i16 @f(<4 x float>* %tmp116117.i1061.i) nounwind { -entry: - alloca [4 x <4 x float>] ; <[4 x <4 x float>]*>:0 [#uses=167] - alloca [4 x <4 x float>] ; <[4 x <4 x float>]*>:1 [#uses=170] - alloca [4 x <4 x i32>] ; <[4 x <4 x i32>]*>:2 [#uses=12] - %.sub6235.i = getelementptr [4 x <4 x float>]* %0, i32 0, i32 0 ; <<4 x float>*> [#uses=76] - %.sub.i = getelementptr [4 x <4 x float>]* %1, i32 0, i32 0 ; <<4 x float>*> [#uses=59] - - %tmp124.i1062.i = getelementptr <4 x float>* %tmp116117.i1061.i, i32 63 ; <<4 x float>*> [#uses=1] - %tmp125.i1063.i = load <4 x float>* %tmp124.i1062.i ; <<4 x float>> [#uses=5] - %tmp828.i1077.i = shufflevector <4 x float> %tmp125.i1063.i, <4 x float> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 > ; <<4 x float>> [#uses=4] - %tmp704.i1085.i = load <4 x float>* %.sub6235.i ; <<4 x float>> [#uses=1] - %tmp712.i1086.i = call <4 x float> @llvm.x86.sse.max.ps( <4 x float> %tmp704.i1085.i, <4 x float> %tmp828.i1077.i ) ; <<4 x float>> [#uses=1] - store <4 x float> 
%tmp712.i1086.i, <4 x float>* %.sub.i - - %tmp2587.i1145.gep.i = getelementptr [4 x <4 x float>]* %1, i32 0, i32 0, i32 2 ; [#uses=1] - %tmp5334.i = load float* %tmp2587.i1145.gep.i ; [#uses=5] - %tmp2723.i1170.i = insertelement <4 x float> undef, float %tmp5334.i, i32 2 ; <<4 x float>> [#uses=5] - store <4 x float> %tmp2723.i1170.i, <4 x float>* %.sub6235.i - - %tmp1406.i1367.i = shufflevector <4 x float> %tmp2723.i1170.i, <4 x float> undef, <4 x i32> < i32 2, i32 2, i32 2, i32 2 > ; <<4 x float>> [#uses=1] - %tmp84.i1413.i = load <4 x float>* %.sub6235.i ; <<4 x float>> [#uses=1] - %tmp89.i1415.i = fmul <4 x float> %tmp84.i1413.i, %tmp1406.i1367.i ; <<4 x float>> [#uses=1] - store <4 x float> %tmp89.i1415.i, <4 x float>* %.sub.i - ret i16 0 -} - -declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) diff --git a/test/CodeGen/X86/2008-02-06-LoadFoldingBug.ll b/test/CodeGen/X86/2008-02-06-LoadFoldingBug.ll index d2d5149de3aa..35857b7e01e6 100644 --- a/test/CodeGen/X86/2008-02-06-LoadFoldingBug.ll +++ b/test/CodeGen/X86/2008-02-06-LoadFoldingBug.ll @@ -1,5 +1,6 @@ -; RUN: llc < %s -march=x86 -mattr=+sse2 | grep xor | grep CPI +; RUN: llc < %s -march=x86 -mattr=+sse2 | FileCheck %s +; CHECK: xorpd {{.*}}{{LCPI0_0|__xmm@}} define void @casin({ double, double }* sret %agg.result, double %z.0, double %z.1) nounwind { entry: %memtmp = alloca { double, double }, align 8 ; <{ double, double }*> [#uses=3] diff --git a/test/CodeGen/X86/2009-06-03-Win64SpillXMM.ll b/test/CodeGen/X86/2009-06-03-Win64SpillXMM.ll index 1259cf47b2bc..dfb98bb1ab39 100644 --- a/test/CodeGen/X86/2009-06-03-Win64SpillXMM.ll +++ b/test/CodeGen/X86/2009-06-03-Win64SpillXMM.ll @@ -1,7 +1,7 @@ ; RUN: llc -mcpu=generic -mtriple=x86_64-mingw32 < %s | FileCheck %s ; CHECK: subq $40, %rsp -; CHECK: movaps %xmm8, (%rsp) -; CHECK: movaps %xmm7, 16(%rsp) +; CHECK: movaps %xmm8, 16(%rsp) +; CHECK: movaps %xmm7, (%rsp) define i32 @a() nounwind { entry: diff --git a/test/CodeGen/X86/2010-01-08-Atomic64Bug.ll b/test/CodeGen/X86/2010-01-08-Atomic64Bug.ll index f9bf3109ea10..850f678c9c2c 100644 --- a/test/CodeGen/X86/2010-01-08-Atomic64Bug.ll +++ b/test/CodeGen/X86/2010-01-08-Atomic64Bug.ll @@ -11,9 +11,9 @@ entry: ; CHECK: movl 4([[REG]]), %edx ; CHECK: LBB0_1: ; CHECK: movl %eax, %ebx -; CHECK: addl {{%[a-z]+}}, %ebx +; CHECK: addl $1, %ebx ; CHECK: movl %edx, %ecx -; CHECK: adcl {{%[a-z]+}}, %ecx +; CHECK: adcl $0, %ecx ; CHECK: lock ; CHECK-NEXT: cmpxchg8b ([[REG]]) ; CHECK-NEXT: jne diff --git a/test/CodeGen/X86/2012-11-30-misched-dbg.ll b/test/CodeGen/X86/2012-11-30-misched-dbg.ll index 650839a657da..36667def6110 100644 --- a/test/CodeGen/X86/2012-11-30-misched-dbg.ll +++ b/test/CodeGen/X86/2012-11-30-misched-dbg.ll @@ -69,15 +69,15 @@ declare i32 @__sprintf_chk(i8*, i32, i64, i8*, ...) 
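; Editorial note on the hunk below (field meanings inferred from the
; DW_TAG_lexical_block comments, so treat this as an assumption): each
; lexical-block metadata node is rewritten so that the file reference
; (!14) becomes the second operand, ahead of the parent scope, with the
; line, column, and unique-id fields left unchanged.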
!1 = metadata !{metadata !2} !2 = metadata !{} !4 = metadata !{i32 786688, metadata !5, metadata !"num1", metadata !14, i32 815, metadata !15, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [num1] [line 815] -!5 = metadata !{i32 786443, metadata !6, i32 815, i32 0, metadata !14, i32 177} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c] -!6 = metadata !{i32 786443, metadata !7, i32 812, i32 0, metadata !14, i32 176} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c] -!7 = metadata !{i32 786443, metadata !8, i32 807, i32 0, metadata !14, i32 175} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c] -!8 = metadata !{i32 786443, metadata !9, i32 440, i32 0, metadata !14, i32 94} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c] -!9 = metadata !{i32 786443, metadata !10, i32 435, i32 0, metadata !14, i32 91} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c] -!10 = metadata !{i32 786443, metadata !11, i32 434, i32 0, metadata !14, i32 90} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c] -!11 = metadata !{i32 786443, metadata !12, i32 250, i32 0, metadata !14, i32 24} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c] -!12 = metadata !{i32 786443, metadata !13, i32 249, i32 0, metadata !14, i32 23} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c] -!13 = metadata !{i32 786443, metadata !2, i32 221, i32 0, metadata !14, i32 19} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c] +!5 = metadata !{i32 786443, metadata !14, metadata !6, i32 815, i32 0, i32 177} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c] +!6 = metadata !{i32 786443, metadata !14, metadata !7, i32 812, i32 0, i32 176} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c] +!7 = metadata !{i32 786443, metadata !14, metadata !8, i32 807, i32 0, i32 175} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c] +!8 = metadata !{i32 786443, metadata !14, metadata !9, i32 440, i32 0, i32 94} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c] +!9 = metadata !{i32 786443, metadata !14, metadata !10, i32 435, i32 0, i32 91} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c] +!10 = metadata !{i32 786443, metadata !14, metadata !11, i32 434, i32 0, i32 90} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c] +!11 = metadata !{i32 786443, metadata !14, metadata !12, i32 250, i32 0, i32 24} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c] +!12 = metadata !{i32 786443, metadata !14, metadata !13, i32 249, i32 0, i32 23} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c] +!13 = metadata !{i32 786443, metadata !14, metadata !2, i32 221, i32 0, i32 19} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c] !14 = metadata !{i32 786473, metadata !19} ; [ DW_TAG_file_type ] !15 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 160, i64 8, i32 0, i32 0, metadata !16, metadata !17, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 160, align 8, offset 0] [from char] !16 = metadata !{i32 786468, null, null, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ 
DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char] diff --git a/test/CodeGen/X86/Atomics-64.ll b/test/CodeGen/X86/Atomics-64.ll index f9c25fc82261..c392e947407e 100644 --- a/test/CodeGen/X86/Atomics-64.ll +++ b/test/CodeGen/X86/Atomics-64.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -march=x86-64 > %t.x86-64 -; RUN: llc < %s -march=x86 > %t.x86 +; RUN: llc < %s -march=x86 -mattr=cx16 > %t.x86 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128" target triple = "x86_64-apple-darwin8" diff --git a/test/CodeGen/X86/add-of-carry.ll b/test/CodeGen/X86/add-of-carry.ll index 1513fcba774b..9c24be4289ff 100644 --- a/test/CodeGen/X86/add-of-carry.ll +++ b/test/CodeGen/X86/add-of-carry.ll @@ -4,7 +4,7 @@ define i32 @test1(i32 %sum, i32 %x) nounwind readnone ssp { entry: ; CHECK-LABEL: test1: -; CHECK: cmpl %ecx, %eax +; CHECK: cmpl %ecx, %eax ; CHECK-NOT: addl ; CHECK: adcl $0, %eax %add4 = add i32 %x, %sum diff --git a/test/CodeGen/X86/address-type-promotion-constantexpr.ll b/test/CodeGen/X86/address-type-promotion-constantexpr.ll new file mode 100644 index 000000000000..32f29bd3cad9 --- /dev/null +++ b/test/CodeGen/X86/address-type-promotion-constantexpr.ll @@ -0,0 +1,16 @@ +; RUN: llc < %s -mtriple=x86_64-pc-linux + +; PR20314 is a crashing bug. This program does nothing with the load, so just check that the return is 0. + +@c = common global [2 x i32] zeroinitializer, align 4 +@a = common global i32 0, align 4 +@b = internal unnamed_addr constant [2 x i8] c"\01\00", align 1 + +; CHECK-LABEL: main +; CHECK: xor %eax, %eax +define i32 @main() { +entry: + %foo = load i8* getelementptr ([2 x i8]* @b, i64 0, i64 sext (i8 or (i8 zext (i1 icmp eq (i32* getelementptr inbounds ([2 x i32]* @c, i64 0, i64 1), i32* @a) to i8), i8 1) to i64)), align 1 + ret i32 0 +} + diff --git a/test/CodeGen/X86/atomic-load-store-wide.ll b/test/CodeGen/X86/atomic-load-store-wide.ll index 17e04f059034..7352d5a58006 100644 --- a/test/CodeGen/X86/atomic-load-store-wide.ll +++ b/test/CodeGen/X86/atomic-load-store-wide.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=x86 -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -mcpu=corei7 -march=x86 -verify-machineinstrs | FileCheck %s ; 64-bit load/store on x86-32 ; FIXME: The generated code can be substantially improved. 
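; Editorial note (assumption): -mcpu=corei7 is added above so that SSE2 is
; available; with SSE2 a 64-bit atomic load or store on x86-32 can
; presumably be selected as a single 8-byte SSE move rather than a
; cmpxchg8b loop or a runtime-library call.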
diff --git a/test/CodeGen/X86/atomic-minmax-i6432.ll b/test/CodeGen/X86/atomic-minmax-i6432.ll index 1cfbc49ab1c9..ffb7a3fd6f64 100644 --- a/test/CodeGen/X86/atomic-minmax-i6432.ll +++ b/test/CodeGen/X86/atomic-minmax-i6432.ll @@ -1,6 +1,5 @@ -; RUN: llc -march=x86 -mattr=+cmov -mtriple=i386-pc-linux -verify-machineinstrs < %s | FileCheck %s -check-prefix=LINUX -; RUN: llc -march=x86 -mattr=-cmov -mtriple=i386-pc-linux -verify-machineinstrs < %s | FileCheck %s -check-prefix=NOCMOV -; RUN: llc -march=x86 -mtriple=i386-macosx -relocation-model=pic -verify-machineinstrs < %s | FileCheck %s -check-prefix=PIC +; RUN: llc -march=x86 -mattr=+cmov,cx16 -mtriple=i386-pc-linux -verify-machineinstrs < %s | FileCheck %s -check-prefix=LINUX +; RUN: llc -march=x86 -mattr=cx16 -mtriple=i386-macosx -relocation-model=pic -verify-machineinstrs < %s | FileCheck %s -check-prefix=PIC @sc64 = external global i64 @@ -9,87 +8,39 @@ define void @atomic_maxmin_i6432() { %1 = atomicrmw max i64* @sc64, i64 5 acquire ; LINUX: [[LABEL:.LBB[0-9]+_[0-9]+]] ; LINUX: cmpl -; LINUX: setl -; LINUX: cmpl -; LINUX: setl +; LINUX: seta ; LINUX: cmovne ; LINUX: cmovne ; LINUX: lock ; LINUX-NEXT: cmpxchg8b ; LINUX: jne [[LABEL]] -; NOCMOV: [[LABEL:.LBB[0-9]+_[0-9]+]] -; NOCMOV: cmpl -; NOCMOV: setl -; NOCMOV: cmpl -; NOCMOV: setl -; NOCMOV: jne -; NOCMOV: jne -; NOCMOV: lock -; NOCMOV-NEXT: cmpxchg8b -; NOCMOV: jne [[LABEL]] %2 = atomicrmw min i64* @sc64, i64 6 acquire ; LINUX: [[LABEL:.LBB[0-9]+_[0-9]+]] ; LINUX: cmpl -; LINUX: setg -; LINUX: cmpl -; LINUX: setg +; LINUX: setb ; LINUX: cmovne ; LINUX: cmovne ; LINUX: lock ; LINUX-NEXT: cmpxchg8b ; LINUX: jne [[LABEL]] -; NOCMOV: [[LABEL:.LBB[0-9]+_[0-9]+]] -; NOCMOV: cmpl -; NOCMOV: setg -; NOCMOV: cmpl -; NOCMOV: setg -; NOCMOV: jne -; NOCMOV: jne -; NOCMOV: lock -; NOCMOV-NEXT: cmpxchg8b -; NOCMOV: jne [[LABEL]] %3 = atomicrmw umax i64* @sc64, i64 7 acquire ; LINUX: [[LABEL:.LBB[0-9]+_[0-9]+]] ; LINUX: cmpl -; LINUX: setb -; LINUX: cmpl -; LINUX: setb +; LINUX: seta ; LINUX: cmovne ; LINUX: cmovne ; LINUX: lock ; LINUX-NEXT: cmpxchg8b ; LINUX: jne [[LABEL]] -; NOCMOV: [[LABEL:.LBB[0-9]+_[0-9]+]] -; NOCMOV: cmpl -; NOCMOV: setb -; NOCMOV: cmpl -; NOCMOV: setb -; NOCMOV: jne -; NOCMOV: jne -; NOCMOV: lock -; NOCMOV-NEXT: cmpxchg8b -; NOCMOV: jne [[LABEL]] %4 = atomicrmw umin i64* @sc64, i64 8 acquire ; LINUX: [[LABEL:.LBB[0-9]+_[0-9]+]] ; LINUX: cmpl -; LINUX: seta -; LINUX: cmpl -; LINUX: seta +; LINUX: setb ; LINUX: cmovne ; LINUX: cmovne ; LINUX: lock ; LINUX-NEXT: cmpxchg8b ; LINUX: jne [[LABEL]] -; NOCMOV: [[LABEL:.LBB[0-9]+_[0-9]+]] -; NOCMOV: cmpl -; NOCMOV: seta -; NOCMOV: cmpl -; NOCMOV: seta -; NOCMOV: jne -; NOCMOV: jne -; NOCMOV: lock -; NOCMOV-NEXT: cmpxchg8b -; NOCMOV: jne [[LABEL]] ret void } @@ -98,8 +49,8 @@ define void @atomic_maxmin_i6432() { define void @tf_bug(i8* %ptr) nounwind { ; PIC-LABEL: tf_bug: -; PIC: movl _id-L1$pb( -; PIC: movl (_id-L1$pb)+4( +; PIC-DAG: movl _id-L1$pb( +; PIC-DAG: movl (_id-L1$pb)+4( %tmp1 = atomicrmw add i64* @id, i64 1 seq_cst %tmp2 = add i64 %tmp1, 1 %tmp3 = bitcast i8* %ptr to i64* diff --git a/test/CodeGen/X86/atomic-ops-ancient-64.ll b/test/CodeGen/X86/atomic-ops-ancient-64.ll new file mode 100644 index 000000000000..508d83b0ffe1 --- /dev/null +++ b/test/CodeGen/X86/atomic-ops-ancient-64.ll @@ -0,0 +1,44 @@ +; RUN: llc -mtriple=i386-linux-gnu %s -o - | FileCheck %s +; XFAIL: * + +define i64 @test_add(i64* %addr, i64 %inc) { +; CHECK-LABEL: test_add: +; CHECK: calll __sync_fetch_and_add_8 + %old = atomicrmw add i64* %addr, 
i64 %inc seq_cst + ret i64 %old +} + +define i64 @test_sub(i64* %addr, i64 %inc) { +; CHECK-LABEL: test_sub: +; CHECK: calll __sync_fetch_and_sub_8 + %old = atomicrmw sub i64* %addr, i64 %inc seq_cst + ret i64 %old +} + +define i64 @test_and(i64* %andr, i64 %inc) { +; CHECK-LABEL: test_and: +; CHECK: calll __sync_fetch_and_and_8 + %old = atomicrmw and i64* %andr, i64 %inc seq_cst + ret i64 %old +} + +define i64 @test_or(i64* %orr, i64 %inc) { +; CHECK-LABEL: test_or: +; CHECK: calll __sync_fetch_and_or_8 + %old = atomicrmw or i64* %orr, i64 %inc seq_cst + ret i64 %old +} + +define i64 @test_xor(i64* %xorr, i64 %inc) { +; CHECK-LABEL: test_xor: +; CHECK: calll __sync_fetch_and_xor_8 + %old = atomicrmw xor i64* %xorr, i64 %inc seq_cst + ret i64 %old +} + +define i64 @test_nand(i64* %nandr, i64 %inc) { +; CHECK-LABEL: test_nand: +; CHECK: calll __sync_fetch_and_nand_8 + %old = atomicrmw nand i64* %nandr, i64 %inc seq_cst + ret i64 %old +} diff --git a/test/CodeGen/X86/atomic128.ll b/test/CodeGen/X86/atomic128.ll new file mode 100644 index 000000000000..741d2904229d --- /dev/null +++ b/test/CodeGen/X86/atomic128.ll @@ -0,0 +1,316 @@ +; RUN: llc < %s -mtriple=x86_64-apple-macosx10.9 -verify-machineinstrs -mattr=cx16 | FileCheck %s + +@var = global i128 0 + +define i128 @val_compare_and_swap(i128* %p, i128 %oldval, i128 %newval) { +; CHECK-LABEL: val_compare_and_swap: +; CHECK: movq %rsi, %rax +; CHECK: movq %rcx, %rbx +; CHECK: movq %r8, %rcx +; CHECK: lock +; CHECK: cmpxchg16b (%rdi) + + %pair = cmpxchg i128* %p, i128 %oldval, i128 %newval acquire acquire + %val = extractvalue { i128, i1 } %pair, 0 + ret i128 %val +} + +define void @fetch_and_nand(i128* %p, i128 %bits) { +; CHECK-LABEL: fetch_and_nand: +; CHECK-DAG: movq %rdx, [[INCHI:%[a-z0-9]+]] +; CHECK-DAG: movq (%rdi), %rax +; CHECK-DAG: movq 8(%rdi), %rdx + +; CHECK: [[LOOP:.?LBB[0-9]+_[0-9]+]]: +; CHECK: movq %rdx, %rcx +; CHECK: andq [[INCHI]], %rcx +; CHECK: movq %rax, %rbx + ; INCLO equivalent comes in in %rsi, so it makes sense it stays there. +; CHECK: andq %rsi, %rbx +; CHECK: notq %rbx +; CHECK: notq %rcx +; CHECK: lock +; CHECK: cmpxchg16b (%rdi) +; CHECK: jne [[LOOP]] + +; CHECK: movq %rax, _var +; CHECK: movq %rdx, _var+8 + %val = atomicrmw nand i128* %p, i128 %bits release + store i128 %val, i128* @var, align 16 + ret void +} + +define void @fetch_and_or(i128* %p, i128 %bits) { +; CHECK-LABEL: fetch_and_or: +; CHECK-DAG: movq %rdx, [[INCHI:%[a-z0-9]+]] +; CHECK-DAG: movq (%rdi), %rax +; CHECK-DAG: movq 8(%rdi), %rdx + +; CHECK: [[LOOP:.?LBB[0-9]+_[0-9]+]]: +; CHECK: movq %rax, %rbx + ; INCLO equivalent comes in in %rsi, so it makes sense it stays there. +; CHECK: orq %rsi, %rbx +; CHECK: movq %rdx, %rcx +; CHECK: orq [[INCHI]], %rcx +; CHECK: lock +; CHECK: cmpxchg16b (%rdi) +; CHECK: jne [[LOOP]] + +; CHECK: movq %rax, _var +; CHECK: movq %rdx, _var+8 + + %val = atomicrmw or i128* %p, i128 %bits seq_cst + store i128 %val, i128* @var, align 16 + ret void +} + +define void @fetch_and_add(i128* %p, i128 %bits) { +; CHECK-LABEL: fetch_and_add: +; CHECK-DAG: movq %rdx, [[INCHI:%[a-z0-9]+]] +; CHECK-DAG: movq (%rdi), %rax +; CHECK-DAG: movq 8(%rdi), %rdx + +; CHECK: [[LOOP:.?LBB[0-9]+_[0-9]+]]: +; CHECK: movq %rax, %rbx + ; INCLO equivalent comes in in %rsi, so it makes sense it stays there. 
+; CHECK: addq %rsi, %rbx +; CHECK: movq %rdx, %rcx +; CHECK: adcq [[INCHI]], %rcx +; CHECK: lock +; CHECK: cmpxchg16b (%rdi) +; CHECK: jne [[LOOP]] + +; CHECK: movq %rax, _var +; CHECK: movq %rdx, _var+8 + + %val = atomicrmw add i128* %p, i128 %bits seq_cst + store i128 %val, i128* @var, align 16 + ret void +} + +define void @fetch_and_sub(i128* %p, i128 %bits) { +; CHECK-LABEL: fetch_and_sub: +; CHECK-DAG: movq %rdx, [[INCHI:%[a-z0-9]+]] +; CHECK-DAG: movq (%rdi), %rax +; CHECK-DAG: movq 8(%rdi), %rdx + +; CHECK: [[LOOP:.?LBB[0-9]+_[0-9]+]]: +; CHECK: movq %rax, %rbx + ; INCLO equivalent comes in in %rsi, so it makes sense it stays there. +; CHECK: subq %rsi, %rbx +; CHECK: movq %rdx, %rcx +; CHECK: sbbq [[INCHI]], %rcx +; CHECK: lock +; CHECK: cmpxchg16b (%rdi) +; CHECK: jne [[LOOP]] + +; CHECK: movq %rax, _var +; CHECK: movq %rdx, _var+8 + + %val = atomicrmw sub i128* %p, i128 %bits seq_cst + store i128 %val, i128* @var, align 16 + ret void +} + +define void @fetch_and_min(i128* %p, i128 %bits) { +; CHECK-LABEL: fetch_and_min: +; CHECK-DAG: movq %rdx, [[INCHI:%[a-z0-9]+]] +; CHECK-DAG: movq (%rdi), %rax +; CHECK-DAG: movq 8(%rdi), %rdx + +; CHECK: [[LOOP:.?LBB[0-9]+_[0-9]+]]: +; CHECK: cmpq %rsi, %rax +; CHECK: setbe [[CMP:%[a-z0-9]+]] +; CHECK: cmpq [[INCHI]], %rdx +; CHECK: setle [[HICMP:%[a-z0-9]+]] +; CHECK: je [[USE_LO:.?LBB[0-9]+_[0-9]+]] + +; CHECK: movb [[HICMP]], [[CMP]] +; CHECK: [[USE_LO]]: +; CHECK: testb [[CMP]], [[CMP]] +; CHECK: movq %rsi, %rbx +; CHECK: cmovneq %rax, %rbx +; CHECK: movq [[INCHI]], %rcx +; CHECK: cmovneq %rdx, %rcx +; CHECK: lock +; CHECK: cmpxchg16b (%rdi) +; CHECK: jne [[LOOP]] + +; CHECK: movq %rax, _var +; CHECK: movq %rdx, _var+8 + + %val = atomicrmw min i128* %p, i128 %bits seq_cst + store i128 %val, i128* @var, align 16 + ret void +} + +define void @fetch_and_max(i128* %p, i128 %bits) { +; CHECK-LABEL: fetch_and_max: +; CHECK-DAG: movq %rdx, [[INCHI:%[a-z0-9]+]] +; CHECK-DAG: movq (%rdi), %rax +; CHECK-DAG: movq 8(%rdi), %rdx + +; CHECK: [[LOOP:.?LBB[0-9]+_[0-9]+]]: +; CHECK: cmpq %rsi, %rax +; CHECK: setae [[CMP:%[a-z0-9]+]] +; CHECK: cmpq [[INCHI]], %rdx +; CHECK: setge [[HICMP:%[a-z0-9]+]] +; CHECK: je [[USE_LO:.?LBB[0-9]+_[0-9]+]] + +; CHECK: movb [[HICMP]], [[CMP]] +; CHECK: [[USE_LO]]: +; CHECK: testb [[CMP]], [[CMP]] +; CHECK: movq %rsi, %rbx +; CHECK: cmovneq %rax, %rbx +; CHECK: movq [[INCHI]], %rcx +; CHECK: cmovneq %rdx, %rcx +; CHECK: lock +; CHECK: cmpxchg16b (%rdi) +; CHECK: jne [[LOOP]] + +; CHECK: movq %rax, _var +; CHECK: movq %rdx, _var+8 + + %val = atomicrmw max i128* %p, i128 %bits seq_cst + store i128 %val, i128* @var, align 16 + ret void +} + +define void @fetch_and_umin(i128* %p, i128 %bits) { +; CHECK-LABEL: fetch_and_umin: +; CHECK-DAG: movq %rdx, [[INCHI:%[a-z0-9]+]] +; CHECK-DAG: movq (%rdi), %rax +; CHECK-DAG: movq 8(%rdi), %rdx + +; CHECK: [[LOOP:.?LBB[0-9]+_[0-9]+]]: +; CHECK: cmpq %rsi, %rax +; CHECK: setbe [[CMP:%[a-z0-9]+]] +; CHECK: cmpq [[INCHI]], %rdx +; CHECK: setbe [[HICMP:%[a-z0-9]+]] +; CHECK: je [[USE_LO:.?LBB[0-9]+_[0-9]+]] + +; CHECK: movb [[HICMP]], [[CMP]] +; CHECK: [[USE_LO]]: +; CHECK: testb [[CMP]], [[CMP]] +; CHECK: movq %rsi, %rbx +; CHECK: cmovneq %rax, %rbx +; CHECK: movq [[INCHI]], %rcx +; CHECK: cmovneq %rdx, %rcx +; CHECK: lock +; CHECK: cmpxchg16b (%rdi) +; CHECK: jne [[LOOP]] + +; CHECK: movq %rax, _var +; CHECK: movq %rdx, _var+8 + + %val = atomicrmw umin i128* %p, i128 %bits seq_cst + store i128 %val, i128* @var, align 16 + ret void +} + +define void @fetch_and_umax(i128* %p, i128 %bits) { 
+; CHECK-LABEL: fetch_and_umax: +; CHECK-DAG: movq %rdx, [[INCHI:%[a-z0-9]+]] +; CHECK-DAG: movq (%rdi), %rax +; CHECK-DAG: movq 8(%rdi), %rdx + +; CHECK: [[LOOP:.?LBB[0-9]+_[0-9]+]]: +; CHECK: cmpq %rax, %rsi +; CHECK: setb [[CMP:%[a-z0-9]+]] +; CHECK: cmpq [[INCHI]], %rdx +; CHECK: seta [[HICMP:%[a-z0-9]+]] +; CHECK: je [[USE_LO:.?LBB[0-9]+_[0-9]+]] + +; CHECK: movb [[HICMP]], [[CMP]] +; CHECK: [[USE_LO]]: +; CHECK: testb [[CMP]], [[CMP]] +; CHECK: movq %rsi, %rbx +; CHECK: cmovneq %rax, %rbx +; CHECK: movq [[INCHI]], %rcx +; CHECK: cmovneq %rdx, %rcx +; CHECK: lock +; CHECK: cmpxchg16b (%rdi) +; CHECK: jne [[LOOP]] + +; CHECK: movq %rax, _var +; CHECK: movq %rdx, _var+8 + + %val = atomicrmw umax i128* %p, i128 %bits seq_cst + store i128 %val, i128* @var, align 16 + ret void +} + +define i128 @atomic_load_seq_cst(i128* %p) { +; CHECK-LABEL: atomic_load_seq_cst: +; CHECK: xorl %eax, %eax +; CHECK: xorl %edx, %edx +; CHECK: xorl %ebx, %ebx +; CHECK: xorl %ecx, %ecx +; CHECK: lock +; CHECK: cmpxchg16b (%rdi) + + %r = load atomic i128* %p seq_cst, align 16 + ret i128 %r +} + +define i128 @atomic_load_relaxed(i128* %p) { +; CHECK: atomic_load_relaxed: +; CHECK: xorl %eax, %eax +; CHECK: xorl %edx, %edx +; CHECK: xorl %ebx, %ebx +; CHECK: xorl %ecx, %ecx +; CHECK: lock +; CHECK: cmpxchg16b (%rdi) + + %r = load atomic i128* %p monotonic, align 16 + ret i128 %r +} + +define void @atomic_store_seq_cst(i128* %p, i128 %in) { +; CHECK-LABEL: atomic_store_seq_cst: +; CHECK: movq %rdx, %rcx +; CHECK: movq %rsi, %rbx +; CHECK: movq (%rdi), %rax +; CHECK: movq 8(%rdi), %rdx + +; CHECK: [[LOOP:.?LBB[0-9]+_[0-9]+]]: +; CHECK: lock +; CHECK: cmpxchg16b (%rdi) +; CHECK: jne [[LOOP]] +; CHECK-NOT: callq ___sync_lock_test_and_set_16 + + store atomic i128 %in, i128* %p seq_cst, align 16 + ret void +} + +define void @atomic_store_release(i128* %p, i128 %in) { +; CHECK-LABEL: atomic_store_release: +; CHECK: movq %rdx, %rcx +; CHECK: movq %rsi, %rbx +; CHECK: movq (%rdi), %rax +; CHECK: movq 8(%rdi), %rdx + +; CHECK: [[LOOP:.?LBB[0-9]+_[0-9]+]]: +; CHECK: lock +; CHECK: cmpxchg16b (%rdi) +; CHECK: jne [[LOOP]] + + store atomic i128 %in, i128* %p release, align 16 + ret void +} + +define void @atomic_store_relaxed(i128* %p, i128 %in) { +; CHECK-LABEL: atomic_store_relaxed: +; CHECK: movq %rdx, %rcx +; CHECK: movq %rsi, %rbx +; CHECK: movq (%rdi), %rax +; CHECK: movq 8(%rdi), %rdx + +; CHECK: [[LOOP:.?LBB[0-9]+_[0-9]+]]: +; CHECK: lock +; CHECK: cmpxchg16b (%rdi) +; CHECK: jne [[LOOP]] + + store atomic i128 %in, i128* %p unordered, align 16 + ret void +} diff --git a/test/CodeGen/X86/atomic16.ll b/test/CodeGen/X86/atomic16.ll index 45d3ff46a040..faaa4c49d39b 100644 --- a/test/CodeGen/X86/atomic16.ll +++ b/test/CodeGen/X86/atomic16.ll @@ -4,8 +4,8 @@ @sc16 = external global i16 define void @atomic_fetch_add16() nounwind { -; X64: atomic_fetch_add16 -; X32: atomic_fetch_add16 +; X64-LABEL: atomic_fetch_add16 +; X32-LABEL: atomic_fetch_add16 entry: ; 32-bit %t1 = atomicrmw add i16* @sc16, i16 1 acquire @@ -34,8 +34,8 @@ entry: } define void @atomic_fetch_sub16() nounwind { -; X64: atomic_fetch_sub16 -; X32: atomic_fetch_sub16 +; X64-LABEL: atomic_fetch_sub16 +; X32-LABEL: atomic_fetch_sub16 %t1 = atomicrmw sub i16* @sc16, i16 1 acquire ; X64: lock ; X64: decw @@ -62,18 +62,18 @@ define void @atomic_fetch_sub16() nounwind { } define void @atomic_fetch_and16() nounwind { -; X64: atomic_fetch_and16 -; X32: atomic_fetch_and16 +; X64-LABEL: atomic_fetch_and16 +; X32-LABEL: atomic_fetch_and16 %t1 = atomicrmw and i16* @sc16, 
i16 3 acquire ; X64: lock ; X64: andw $3, {{.*}} # encoding: [0xf0,0x66 ; X32: lock ; X32: andw $3 %t2 = atomicrmw and i16* @sc16, i16 5 acquire -; X64: andw +; X64: andl ; X64: lock ; X64: cmpxchgw -; X32: andw +; X32: andl ; X32: lock ; X32: cmpxchgw %t3 = atomicrmw and i16* @sc16, i16 %t2 acquire @@ -87,18 +87,18 @@ define void @atomic_fetch_and16() nounwind { } define void @atomic_fetch_or16() nounwind { -; X64: atomic_fetch_or16 -; X32: atomic_fetch_or16 +; X64-LABEL: atomic_fetch_or16 +; X32-LABEL: atomic_fetch_or16 %t1 = atomicrmw or i16* @sc16, i16 3 acquire ; X64: lock ; X64: orw $3, {{.*}} # encoding: [0xf0,0x66 ; X32: lock ; X32: orw $3 %t2 = atomicrmw or i16* @sc16, i16 5 acquire -; X64: orw +; X64: orl ; X64: lock ; X64: cmpxchgw -; X32: orw +; X32: orl ; X32: lock ; X32: cmpxchgw %t3 = atomicrmw or i16* @sc16, i16 %t2 acquire @@ -112,18 +112,18 @@ define void @atomic_fetch_or16() nounwind { } define void @atomic_fetch_xor16() nounwind { -; X64: atomic_fetch_xor16 -; X32: atomic_fetch_xor16 +; X64-LABEL: atomic_fetch_xor16 +; X32-LABEL: atomic_fetch_xor16 %t1 = atomicrmw xor i16* @sc16, i16 3 acquire ; X64: lock ; X64: xorw $3, {{.*}} # encoding: [0xf0,0x66 ; X32: lock ; X32: xorw $3 %t2 = atomicrmw xor i16* @sc16, i16 5 acquire -; X64: xorw +; X64: xorl ; X64: lock ; X64: cmpxchgw -; X32: xorw +; X32: xorl ; X32: lock ; X32: cmpxchgw %t3 = atomicrmw xor i16* @sc16, i16 %t2 acquire @@ -137,15 +137,15 @@ define void @atomic_fetch_xor16() nounwind { } define void @atomic_fetch_nand16(i16 %x) nounwind { -; X64: atomic_fetch_nand16 -; X32: atomic_fetch_nand16 +; X64-LABEL: atomic_fetch_nand16 +; X32-LABEL: atomic_fetch_nand16 %t1 = atomicrmw nand i16* @sc16, i16 %x acquire -; X64: andw -; X64: notw +; X64: andl +; X64: notl ; X64: lock ; X64: cmpxchgw -; X32: andw -; X32: notw +; X32: andl +; X32: notl ; X32: lock ; X32: cmpxchgw ret void @@ -155,12 +155,16 @@ define void @atomic_fetch_nand16(i16 %x) nounwind { define void @atomic_fetch_max16(i16 %x) nounwind { %t1 = atomicrmw max i16* @sc16, i16 %x acquire -; X64: cmpw +; X64: movswl +; X64: movswl +; X64: subl ; X64: cmov ; X64: lock ; X64: cmpxchgw -; X32: cmpw +; X32: movswl +; X32: movswl +; X32: subl ; X32: cmov ; X32: lock ; X32: cmpxchgw @@ -171,12 +175,16 @@ define void @atomic_fetch_max16(i16 %x) nounwind { define void @atomic_fetch_min16(i16 %x) nounwind { %t1 = atomicrmw min i16* @sc16, i16 %x acquire -; X64: cmpw +; X64: movswl +; X64: movswl +; X64: subl ; X64: cmov ; X64: lock ; X64: cmpxchgw -; X32: cmpw +; X32: movswl +; X32: movswl +; X32: subl ; X32: cmov ; X32: lock ; X32: cmpxchgw @@ -187,12 +195,16 @@ define void @atomic_fetch_min16(i16 %x) nounwind { define void @atomic_fetch_umax16(i16 %x) nounwind { %t1 = atomicrmw umax i16* @sc16, i16 %x acquire -; X64: cmpw +; X64: movzwl +; X64: movzwl +; X64: subl ; X64: cmov ; X64: lock ; X64: cmpxchgw -; X32: cmpw +; X32: movzwl +; X32: movzwl +; X32: subl ; X32: cmov ; X32: lock ; X32: cmpxchgw @@ -203,11 +215,16 @@ define void @atomic_fetch_umax16(i16 %x) nounwind { define void @atomic_fetch_umin16(i16 %x) nounwind { %t1 = atomicrmw umin i16* @sc16, i16 %x acquire -; X64: cmpw +; X64: movzwl +; X64: movzwl +; X64: subl ; X64: cmov ; X64: lock ; X64: cmpxchgw -; X32: cmpw + +; X32: movzwl +; X32: movzwl +; X32: subl ; X32: cmov ; X32: lock ; X32: cmpxchgw diff --git a/test/CodeGen/X86/atomic32.ll b/test/CodeGen/X86/atomic32.ll index 474c0e6a9852..4f2cbe0ce2d3 100644 --- a/test/CodeGen/X86/atomic32.ll +++ b/test/CodeGen/X86/atomic32.ll @@ -5,8 +5,8 @@ @sc32 = 
external global i32 define void @atomic_fetch_add32() nounwind { -; X64: atomic_fetch_add32 -; X32: atomic_fetch_add32 +; X64-LABEL: atomic_fetch_add32: +; X32-LABEL: atomic_fetch_add32: entry: ; 32-bit %t1 = atomicrmw add i32* @sc32, i32 1 acquire @@ -35,8 +35,8 @@ entry: } define void @atomic_fetch_sub32() nounwind { -; X64: atomic_fetch_sub32 -; X32: atomic_fetch_sub32 +; X64-LABEL: atomic_fetch_sub32: +; X32-LABEL: atomic_fetch_sub32: %t1 = atomicrmw sub i32* @sc32, i32 1 acquire ; X64: lock ; X64: decl @@ -63,8 +63,8 @@ define void @atomic_fetch_sub32() nounwind { } define void @atomic_fetch_and32() nounwind { -; X64: atomic_fetch_and32 -; X32: atomic_fetch_and32 +; X64-LABEL: atomic_fetch_and32: +; X32-LABEL: atomic_fetch_and32: %t1 = atomicrmw and i32* @sc32, i32 3 acquire ; X64: lock ; X64: andl $3 @@ -88,8 +88,8 @@ define void @atomic_fetch_and32() nounwind { } define void @atomic_fetch_or32() nounwind { -; X64: atomic_fetch_or32 -; X32: atomic_fetch_or32 +; X64-LABEL: atomic_fetch_or32: +; X32-LABEL: atomic_fetch_or32: %t1 = atomicrmw or i32* @sc32, i32 3 acquire ; X64: lock ; X64: orl $3 @@ -113,8 +113,8 @@ define void @atomic_fetch_or32() nounwind { } define void @atomic_fetch_xor32() nounwind { -; X64: atomic_fetch_xor32 -; X32: atomic_fetch_xor32 +; X64-LABEL: atomic_fetch_xor32: +; X32-LABEL: atomic_fetch_xor32: %t1 = atomicrmw xor i32* @sc32, i32 3 acquire ; X64: lock ; X64: xorl $3 @@ -138,8 +138,8 @@ define void @atomic_fetch_xor32() nounwind { } define void @atomic_fetch_nand32(i32 %x) nounwind { -; X64: atomic_fetch_nand32 -; X32: atomic_fetch_nand32 +; X64-LABEL: atomic_fetch_nand32: +; X32-LABEL: atomic_fetch_nand32: %t1 = atomicrmw nand i32* @sc32, i32 %x acquire ; X64: andl ; X64: notl @@ -155,19 +155,22 @@ define void @atomic_fetch_nand32(i32 %x) nounwind { } define void @atomic_fetch_max32(i32 %x) nounwind { +; X64-LABEL: atomic_fetch_max32: +; X32-LABEL: atomic_fetch_max32: + %t1 = atomicrmw max i32* @sc32, i32 %x acquire -; X64: cmpl +; X64: subl ; X64: cmov ; X64: lock ; X64: cmpxchgl -; X32: cmpl +; X32: subl ; X32: cmov ; X32: lock ; X32: cmpxchgl -; NOCMOV: cmpl -; NOCMOV: jl +; NOCMOV: subl +; NOCMOV: jge ; NOCMOV: lock ; NOCMOV: cmpxchgl ret void @@ -177,19 +180,23 @@ define void @atomic_fetch_max32(i32 %x) nounwind { } define void @atomic_fetch_min32(i32 %x) nounwind { +; X64-LABEL: atomic_fetch_min32: +; X32-LABEL: atomic_fetch_min32: +; NOCMOV-LABEL: atomic_fetch_min32: + %t1 = atomicrmw min i32* @sc32, i32 %x acquire -; X64: cmpl +; X64: subl ; X64: cmov ; X64: lock ; X64: cmpxchgl -; X32: cmpl +; X32: subl ; X32: cmov ; X32: lock ; X32: cmpxchgl -; NOCMOV: cmpl -; NOCMOV: jg +; NOCMOV: subl +; NOCMOV: jle ; NOCMOV: lock ; NOCMOV: cmpxchgl ret void @@ -199,19 +206,23 @@ define void @atomic_fetch_min32(i32 %x) nounwind { } define void @atomic_fetch_umax32(i32 %x) nounwind { +; X64-LABEL: atomic_fetch_umax32: +; X32-LABEL: atomic_fetch_umax32: +; NOCMOV-LABEL: atomic_fetch_umax32: + %t1 = atomicrmw umax i32* @sc32, i32 %x acquire -; X64: cmpl +; X64: subl ; X64: cmov ; X64: lock ; X64: cmpxchgl -; X32: cmpl +; X32: subl ; X32: cmov ; X32: lock ; X32: cmpxchgl -; NOCMOV: cmpl -; NOCMOV: jb +; NOCMOV: subl +; NOCMOV: ja ; NOCMOV: lock ; NOCMOV: cmpxchgl ret void @@ -221,19 +232,23 @@ define void @atomic_fetch_umax32(i32 %x) nounwind { } define void @atomic_fetch_umin32(i32 %x) nounwind { +; X64-LABEL: atomic_fetch_umin32: +; X32-LABEL: atomic_fetch_umin32: +; NOCMOV-LABEL: atomic_fetch_umin32: + %t1 = atomicrmw umin i32* @sc32, i32 %x acquire -; X64: cmpl 
+; X64: subl ; X64: cmov ; X64: lock ; X64: cmpxchgl -; X32: cmpl +; X32: subl ; X32: cmov ; X32: lock ; X32: cmpxchgl -; NOCMOV: cmpl -; NOCMOV: ja +; NOCMOV: subl +; NOCMOV: jb ; NOCMOV: lock ; NOCMOV: cmpxchgl ret void @@ -243,6 +258,9 @@ define void @atomic_fetch_umin32(i32 %x) nounwind { } define void @atomic_fetch_cmpxchg32() nounwind { +; X64-LABEL: atomic_fetch_cmpxchg32: +; X32-LABEL: atomic_fetch_cmpxchg32: + %t1 = cmpxchg i32* @sc32, i32 0, i32 1 acquire acquire ; X64: lock ; X64: cmpxchgl @@ -254,6 +272,9 @@ define void @atomic_fetch_cmpxchg32() nounwind { } define void @atomic_fetch_store32(i32 %x) nounwind { +; X64-LABEL: atomic_fetch_store32: +; X32-LABEL: atomic_fetch_store32: + store atomic i32 %x, i32* @sc32 release, align 4 ; X64-NOT: lock ; X64: movl @@ -265,6 +286,9 @@ define void @atomic_fetch_store32(i32 %x) nounwind { } define void @atomic_fetch_swap32(i32 %x) nounwind { +; X64-LABEL: atomic_fetch_swap32: +; X32-LABEL: atomic_fetch_swap32: + %t1 = atomicrmw xchg i32* @sc32, i32 %x acquire ; X64-NOT: lock ; X64: xchgl diff --git a/test/CodeGen/X86/atomic64.ll b/test/CodeGen/X86/atomic64.ll index 4f55edc05676..11b4e6864da6 100644 --- a/test/CodeGen/X86/atomic64.ll +++ b/test/CodeGen/X86/atomic64.ll @@ -3,7 +3,8 @@ @sc64 = external global i64 define void @atomic_fetch_add64() nounwind { -; X64: atomic_fetch_add64 +; X64-LABEL: atomic_fetch_add64: +; X32-LABEL: atomic_fetch_add64: entry: %t1 = atomicrmw add i64* @sc64, i64 1 acquire ; X64: lock @@ -22,7 +23,8 @@ entry: } define void @atomic_fetch_sub64() nounwind { -; X64: atomic_fetch_sub64 +; X64-LABEL: atomic_fetch_sub64: +; X32-LABEL: atomic_fetch_sub64: %t1 = atomicrmw sub i64* @sc64, i64 1 acquire ; X64: lock ; X64: decq @@ -40,7 +42,8 @@ define void @atomic_fetch_sub64() nounwind { } define void @atomic_fetch_and64() nounwind { -; X64: atomic_fetch_and64 +; X64-LABEL: atomic_fetch_and64: +; X32-LABEL: atomic_fetch_and64: %t1 = atomicrmw and i64* @sc64, i64 3 acquire ; X64: lock ; X64: andq $3 @@ -56,7 +59,8 @@ define void @atomic_fetch_and64() nounwind { } define void @atomic_fetch_or64() nounwind { -; X64: atomic_fetch_or64 +; X64-LABEL: atomic_fetch_or64: +; X32-LABEL: atomic_fetch_or64: %t1 = atomicrmw or i64* @sc64, i64 3 acquire ; X64: lock ; X64: orq $3 @@ -72,7 +76,8 @@ define void @atomic_fetch_or64() nounwind { } define void @atomic_fetch_xor64() nounwind { -; X64: atomic_fetch_xor64 +; X64-LABEL: atomic_fetch_xor64: +; X32-LABEL: atomic_fetch_xor64: %t1 = atomicrmw xor i64* @sc64, i64 3 acquire ; X64: lock ; X64: xorq $3 @@ -88,8 +93,8 @@ define void @atomic_fetch_xor64() nounwind { } define void @atomic_fetch_nand64(i64 %x) nounwind { -; X64: atomic_fetch_nand64 -; X32: atomic_fetch_nand64 +; X64-LABEL: atomic_fetch_nand64: +; X32-LABEL: atomic_fetch_nand64: %t1 = atomicrmw nand i64* @sc64, i64 %x acquire ; X64: andq ; X64: notq @@ -107,8 +112,10 @@ define void @atomic_fetch_nand64(i64 %x) nounwind { } define void @atomic_fetch_max64(i64 %x) nounwind { +; X64-LABEL: atomic_fetch_max64: +; X32-LABEL: atomic_fetch_max64: %t1 = atomicrmw max i64* @sc64, i64 %x acquire -; X64: cmpq +; X64: subq ; X64: cmov ; X64: lock ; X64: cmpxchgq @@ -126,8 +133,10 @@ define void @atomic_fetch_max64(i64 %x) nounwind { } define void @atomic_fetch_min64(i64 %x) nounwind { +; X64-LABEL: atomic_fetch_min64: +; X32-LABEL: atomic_fetch_min64: %t1 = atomicrmw min i64* @sc64, i64 %x acquire -; X64: cmpq +; X64: subq ; X64: cmov ; X64: lock ; X64: cmpxchgq @@ -145,8 +154,10 @@ define void @atomic_fetch_min64(i64 %x) nounwind { } 
define void @atomic_fetch_umax64(i64 %x) nounwind { +; X64-LABEL: atomic_fetch_umax64: +; X32-LABEL: atomic_fetch_umax64: %t1 = atomicrmw umax i64* @sc64, i64 %x acquire -; X64: cmpq +; X64: subq ; X64: cmov ; X64: lock ; X64: cmpxchgq @@ -164,8 +175,10 @@ define void @atomic_fetch_umax64(i64 %x) nounwind { } define void @atomic_fetch_umin64(i64 %x) nounwind { +; X64-LABEL: atomic_fetch_umin64: +; X32-LABEL: atomic_fetch_umin64: %t1 = atomicrmw umin i64* @sc64, i64 %x acquire -; X64: cmpq +; X64: subq ; X64: cmov ; X64: lock ; X64: cmpxchgq @@ -183,6 +196,8 @@ define void @atomic_fetch_umin64(i64 %x) nounwind { } define void @atomic_fetch_cmpxchg64() nounwind { +; X64-LABEL: atomic_fetch_cmpxchg64: +; X32-LABEL: atomic_fetch_cmpxchg64: %t1 = cmpxchg i64* @sc64, i64 0, i64 1 acquire acquire ; X64: lock ; X64: cmpxchgq @@ -194,6 +209,8 @@ define void @atomic_fetch_cmpxchg64() nounwind { } define void @atomic_fetch_store64(i64 %x) nounwind { +; X64-LABEL: atomic_fetch_store64: +; X32-LABEL: atomic_fetch_store64: store atomic i64 %x, i64* @sc64 release, align 8 ; X64-NOT: lock ; X64: movq @@ -205,6 +222,8 @@ define void @atomic_fetch_store64(i64 %x) nounwind { } define void @atomic_fetch_swap64(i64 %x) nounwind { +; X64-LABEL: atomic_fetch_swap64: +; X32-LABEL: atomic_fetch_swap64: %t1 = atomicrmw xchg i64* @sc64, i64 %x acquire ; X64-NOT: lock ; X64: xchgq diff --git a/test/CodeGen/X86/atomic6432.ll b/test/CodeGen/X86/atomic6432.ll index c0f7267abe77..1c4b0f43bf76 100644 --- a/test/CodeGen/X86/atomic6432.ll +++ b/test/CodeGen/X86/atomic6432.ll @@ -3,7 +3,8 @@ @sc64 = external global i64 define void @atomic_fetch_add64() nounwind { -; X32: atomic_fetch_add64 +; X64-LABEL: atomic_fetch_add64: +; X32-LABEL: atomic_fetch_add64: entry: %t1 = atomicrmw add i64* @sc64, i64 1 acquire ; X32: addl @@ -30,20 +31,21 @@ entry: } define void @atomic_fetch_sub64() nounwind { -; X32: atomic_fetch_sub64 +; X64-LABEL: atomic_fetch_sub64: +; X32-LABEL: atomic_fetch_sub64: %t1 = atomicrmw sub i64* @sc64, i64 1 acquire -; X32: subl -; X32: sbbl +; X32: addl $-1 +; X32: adcl $-1 ; X32: lock ; X32: cmpxchg8b %t2 = atomicrmw sub i64* @sc64, i64 3 acquire -; X32: subl -; X32: sbbl +; X32: addl $-3 +; X32: adcl $-1 ; X32: lock ; X32: cmpxchg8b %t3 = atomicrmw sub i64* @sc64, i64 5 acquire -; X32: subl -; X32: sbbl +; X32: addl $-5 +; X32: adcl $-1 ; X32: lock ; X32: cmpxchg8b %t4 = atomicrmw sub i64* @sc64, i64 %t3 acquire @@ -56,15 +58,16 @@ } define void @atomic_fetch_and64() nounwind { -; X32: atomic_fetch_and64 +; X64-LABEL: atomic_fetch_and64: +; X32-LABEL: atomic_fetch_and64: %t1 = atomicrmw and i64* @sc64, i64 3 acquire -; X32: andl -; X32: andl +; X32: andl $3 +; X32-NOT: andl ; X32: lock ; X32: cmpxchg8b - %t2 = atomicrmw and i64* @sc64, i64 5 acquire -; X32: andl -; X32: andl + %t2 = atomicrmw and i64* @sc64, i64 4294967297 acquire +; X32: andl $1 +; X32: andl $1 ; X32: lock ; X32: cmpxchg8b %t3 = atomicrmw and i64* @sc64, i64 %t2 acquire @@ -77,15 +80,16 @@ } define void @atomic_fetch_or64() nounwind { -; X32: atomic_fetch_or64 +; X64-LABEL: atomic_fetch_or64: +; X32-LABEL: atomic_fetch_or64: %t1 = atomicrmw or i64* @sc64, i64 3 acquire -; X32: orl -; X32: orl +; X32: orl $3 +; X32-NOT: orl ; X32: lock ; X32: cmpxchg8b - %t2 = atomicrmw or i64* @sc64, i64 5 acquire -; X32: orl -; X32: orl + %t2 = atomicrmw or i64* @sc64, i64 4294967297 acquire +; X32: orl $1 +; X32: orl $1 ; X32: lock ; X32: cmpxchg8b %t3 = atomicrmw 
or i64* @sc64, i64 %t2 acquire @@ -98,15 +102,16 @@ } define void @atomic_fetch_xor64() nounwind { -; X32: atomic_fetch_xor64 +; X64-LABEL: atomic_fetch_xor64: +; X32-LABEL: atomic_fetch_xor64: %t1 = atomicrmw xor i64* @sc64, i64 3 acquire ; X32: xorl -; X32: xorl +; X32-NOT: xorl ; X32: lock ; X32: cmpxchg8b - %t2 = atomicrmw xor i64* @sc64, i64 5 acquire -; X32: xorl -; X32: xorl + %t2 = atomicrmw xor i64* @sc64, i64 4294967297 acquire +; X32: xorl $1 +; X32: xorl $1 ; X32: lock ; X32: cmpxchg8b %t3 = atomicrmw xor i64* @sc64, i64 %t2 acquire @@ -119,7 +124,8 @@ } define void @atomic_fetch_nand64(i64 %x) nounwind { -; X32: atomic_fetch_nand64 +; X64-LABEL: atomic_fetch_nand64: +; X32-LABEL: atomic_fetch_nand64: %t1 = atomicrmw nand i64* @sc64, i64 %x acquire ; X32: andl ; X32: andl @@ -132,10 +138,11 @@ } define void @atomic_fetch_max64(i64 %x) nounwind { +; X64-LABEL: atomic_fetch_max64: +; X32-LABEL: atomic_fetch_max64: %t1 = atomicrmw max i64* @sc64, i64 %x acquire -; X32: cmpl -; X32: cmpl -; X32: cmov +; X32: subl +; X32: subl ; X32: cmov ; X32: cmov ; X32: lock @@ -145,10 +152,11 @@ } define void @atomic_fetch_min64(i64 %x) nounwind { +; X64-LABEL: atomic_fetch_min64: +; X32-LABEL: atomic_fetch_min64: %t1 = atomicrmw min i64* @sc64, i64 %x acquire -; X32: cmpl -; X32: cmpl -; X32: cmov +; X32: subl +; X32: subl ; X32: cmov ; X32: cmov ; X32: lock @@ -158,10 +166,11 @@ } define void @atomic_fetch_umax64(i64 %x) nounwind { +; X64-LABEL: atomic_fetch_umax64: +; X32-LABEL: atomic_fetch_umax64: %t1 = atomicrmw umax i64* @sc64, i64 %x acquire -; X32: cmpl -; X32: cmpl -; X32: cmov +; X32: subl +; X32: subl ; X32: cmov ; X32: cmov ; X32: lock @@ -171,10 +180,11 @@ } define void @atomic_fetch_umin64(i64 %x) nounwind { +; X64-LABEL: atomic_fetch_umin64: +; X32-LABEL: atomic_fetch_umin64: %t1 = atomicrmw umin i64* @sc64, i64 %x acquire -; X32: cmpl -; X32: cmpl -; X32: cmov +; X32: subl +; X32: subl ; X32: cmov ; X32: cmov ; X32: lock @@ -184,6 +194,8 @@ } define void @atomic_fetch_cmpxchg64() nounwind { +; X64-LABEL: atomic_fetch_cmpxchg64: +; X32-LABEL: atomic_fetch_cmpxchg64: %t1 = cmpxchg i64* @sc64, i64 0, i64 1 acquire acquire ; X32: lock ; X32: cmpxchg8b @@ -192,6 +204,8 @@ } define void @atomic_fetch_store64(i64 %x) nounwind { +; X64-LABEL: atomic_fetch_store64: +; X32-LABEL: atomic_fetch_store64: store atomic i64 %x, i64* @sc64 release, align 8 ; X32: lock ; X32: cmpxchg8b @@ -200,6 +214,8 @@ } define void @atomic_fetch_swap64(i64 %x) nounwind { +; X64-LABEL: atomic_fetch_swap64: +; X32-LABEL: atomic_fetch_swap64: %t1 = atomicrmw xchg i64* @sc64, i64 %x acquire ; X32: lock ; X32: xchg8b diff --git a/test/CodeGen/X86/atomic8.ll b/test/CodeGen/X86/atomic8.ll index 203b26f0ab9a..5eef9b295e80 100644 --- a/test/CodeGen/X86/atomic8.ll +++ b/test/CodeGen/X86/atomic8.ll @@ -4,8 +4,8 @@ @sc8 = external global i8 define void @atomic_fetch_add8() nounwind { -; X64: atomic_fetch_add8 -; X32: atomic_fetch_add8 +; X64-LABEL: atomic_fetch_add8: +; X32-LABEL: atomic_fetch_add8: entry: ; 32-bit %t1 = atomicrmw add i8* @sc8, i8 1 acquire @@ -34,8 +34,8 @@ 
entry: } define void @atomic_fetch_sub8() nounwind { -; X64: atomic_fetch_sub8 -; X32: atomic_fetch_sub8 +; X64-LABEL: atomic_fetch_sub8: +; X32-LABEL: atomic_fetch_sub8: %t1 = atomicrmw sub i8* @sc8, i8 1 acquire ; X64: lock ; X64: decb @@ -62,8 +62,8 @@ define void @atomic_fetch_sub8() nounwind { } define void @atomic_fetch_and8() nounwind { -; X64: atomic_fetch_and8 -; X32: atomic_fetch_and8 +; X64-LABEL: atomic_fetch_and8: +; X32-LABEL: atomic_fetch_and8: %t1 = atomicrmw and i8* @sc8, i8 3 acquire ; X64: lock ; X64: andb $3 @@ -87,8 +87,8 @@ define void @atomic_fetch_and8() nounwind { } define void @atomic_fetch_or8() nounwind { -; X64: atomic_fetch_or8 -; X32: atomic_fetch_or8 +; X64-LABEL: atomic_fetch_or8: +; X32-LABEL: atomic_fetch_or8: %t1 = atomicrmw or i8* @sc8, i8 3 acquire ; X64: lock ; X64: orb $3 @@ -112,8 +112,8 @@ define void @atomic_fetch_or8() nounwind { } define void @atomic_fetch_xor8() nounwind { -; X64: atomic_fetch_xor8 -; X32: atomic_fetch_xor8 +; X64-LABEL: atomic_fetch_xor8: +; X32-LABEL: atomic_fetch_xor8: %t1 = atomicrmw xor i8* @sc8, i8 3 acquire ; X64: lock ; X64: xorb $3 @@ -137,8 +137,8 @@ define void @atomic_fetch_xor8() nounwind { } define void @atomic_fetch_nand8(i8 %x) nounwind { -; X64: atomic_fetch_nand8 -; X32: atomic_fetch_nand8 +; X64-LABEL: atomic_fetch_nand8: +; X32-LABEL: atomic_fetch_nand8: %t1 = atomicrmw nand i8* @sc8, i8 %x acquire ; X64: andb ; X64: notb @@ -154,14 +154,18 @@ define void @atomic_fetch_nand8(i8 %x) nounwind { } define void @atomic_fetch_max8(i8 %x) nounwind { +; X64-LABEL: atomic_fetch_max8: +; X32-LABEL: atomic_fetch_max8: %t1 = atomicrmw max i8* @sc8, i8 %x acquire -; X64: cmpb -; X64: cmov +; X64: movsbl +; X64: movsbl +; X64: subl ; X64: lock ; X64: cmpxchgb -; X32: cmpb -; X32: cmov +; X32: movsbl +; X32: movsbl +; X32: subl ; X32: lock ; X32: cmpxchgb ret void @@ -170,14 +174,18 @@ define void @atomic_fetch_max8(i8 %x) nounwind { } define void @atomic_fetch_min8(i8 %x) nounwind { +; X64-LABEL: atomic_fetch_min8: +; X32-LABEL: atomic_fetch_min8: %t1 = atomicrmw min i8* @sc8, i8 %x acquire -; X64: cmpb -; X64: cmov +; X64: movsbl +; X64: movsbl +; X64: subl ; X64: lock ; X64: cmpxchgb -; X32: cmpb -; X32: cmov +; X32: movsbl +; X32: movsbl +; X32: subl ; X32: lock ; X32: cmpxchgb ret void @@ -186,14 +194,18 @@ define void @atomic_fetch_min8(i8 %x) nounwind { } define void @atomic_fetch_umax8(i8 %x) nounwind { +; X64-LABEL: atomic_fetch_umax8: +; X32-LABEL: atomic_fetch_umax8: %t1 = atomicrmw umax i8* @sc8, i8 %x acquire -; X64: cmpb -; X64: cmov +; X64: movzbl +; X64: movzbl +; X64: subl ; X64: lock ; X64: cmpxchgb -; X32: cmpb -; X32: cmov +; X32: movzbl +; X32: movzbl +; X32: subl ; X32: lock ; X32: cmpxchgb ret void @@ -202,13 +214,18 @@ define void @atomic_fetch_umax8(i8 %x) nounwind { } define void @atomic_fetch_umin8(i8 %x) nounwind { +; X64-LABEL: atomic_fetch_umin8: +; X32-LABEL: atomic_fetch_umin8: %t1 = atomicrmw umin i8* @sc8, i8 %x acquire -; X64: cmpb -; X64: cmov +; X64: movzbl +; X64: movzbl +; X64: subl ; X64: lock ; X64: cmpxchgb -; X32: cmpb -; X32: cmov + +; X32: movzbl +; X32: movzbl +; X32: subl ; X32: lock ; X32: cmpxchgb ret void @@ -217,6 +234,8 @@ define void @atomic_fetch_umin8(i8 %x) nounwind { } define void @atomic_fetch_cmpxchg8() nounwind { +; X64-LABEL: atomic_fetch_cmpxchg8: +; X32-LABEL: atomic_fetch_cmpxchg8: %t1 = cmpxchg i8* @sc8, i8 0, i8 1 acquire acquire ; X64: lock ; X64: cmpxchgb @@ -228,6 +247,8 @@ define void @atomic_fetch_cmpxchg8() nounwind { } define void 
@atomic_fetch_store8(i8 %x) nounwind { +; X64-LABEL: atomic_fetch_store8: +; X32-LABEL: atomic_fetch_store8: store atomic i8 %x, i8* @sc8 release, align 4 ; X64-NOT: lock ; X64: movb @@ -239,6 +260,8 @@ define void @atomic_fetch_store8(i8 %x) nounwind { } define void @atomic_fetch_swap8(i8 %x) nounwind { +; X64-LABEL: atomic_fetch_swap8: +; X32-LABEL: atomic_fetch_swap8: %t1 = atomicrmw xchg i8* @sc8, i8 %x acquire ; X64-NOT: lock ; X64: xchgb diff --git a/test/CodeGen/X86/atomic_op.ll b/test/CodeGen/X86/atomic_op.ll index cb639abadd6e..d0ab28aa61f9 100644 --- a/test/CodeGen/X86/atomic_op.ll +++ b/test/CodeGen/X86/atomic_op.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -mcpu=generic -march=x86 -mattr=+cmov -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -mcpu=generic -march=x86 -mattr=+cmov,cx16 -verify-machineinstrs | FileCheck %s target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" @@ -110,19 +110,19 @@ entry: %17 = extractvalue { i32, i1 } %pair17, 0 store i32 %17, i32* %old ; CHECK: movl [[R17atomic:.*]], %eax - ; CHECK: movl $1401, %[[R17mask:[a-z]*]] - ; CHECK: andl %eax, %[[R17mask]] - ; CHECK: notl %[[R17mask]] + ; CHECK: movl %eax, %[[R17mask:[a-z]*]] + ; CHECK: notl %[[R17mask]] + ; CHECK: orl $-1402, %[[R17mask]] ; CHECK: lock ; CHECK: cmpxchgl %[[R17mask]], [[R17atomic]] ; CHECK: jne ; CHECK: movl %eax, %18 = atomicrmw nand i32* %val2, i32 1401 monotonic store i32 %18, i32* %old - ; CHECK: andl - ; CHECK: andl ; CHECK: notl ; CHECK: notl + ; CHECK: orl $252645135 + ; CHECK: orl $252645135 ; CHECK: lock ; CHECK: cmpxchg8b %19 = atomicrmw nand i64* %temp64, i64 17361641481138401520 monotonic diff --git a/test/CodeGen/X86/avx-blend.ll b/test/CodeGen/X86/avx-blend.ll index 43cdf7edf70a..d2a22d709474 100644 --- a/test/CodeGen/X86/avx-blend.ll +++ b/test/CodeGen/X86/avx-blend.ll @@ -110,7 +110,7 @@ define <8 x i64> @vsel_i648(<8 x i64> %v1, <8 x i64> %v2) { ;CHECK-LABEL: vsel_double4: ;CHECK-NOT: vinsertf128 -;CHECK: vshufpd $10 +;CHECK: vblendpd $10 ;CHECK-NEXT: ret define <4 x double> @vsel_double4(<4 x double> %v1, <4 x double> %v2) { %vsel = select <4 x i1> , <4 x double> %v1, <4 x double> %v2 diff --git a/test/CodeGen/X86/avx-intel-ocl.ll b/test/CodeGen/X86/avx-intel-ocl.ll index 7337815a39ac..3e051bff768d 100644 --- a/test/CodeGen/X86/avx-intel-ocl.ll +++ b/test/CodeGen/X86/avx-intel-ocl.ll @@ -7,21 +7,21 @@ declare <16 x float> @func_float16_ptr(<16 x float>, <16 x float> *) declare <16 x float> @func_float16(<16 x float>, <16 x float>) declare i32 @func_int(i32, i32) -; WIN64: testf16_inp +; WIN64-LABEL: testf16_inp ; WIN64: vaddps {{.*}}, {{%ymm[0-1]}} ; WIN64: vaddps {{.*}}, {{%ymm[0-1]}} ; WIN64: leaq {{.*}}(%rsp), %rcx ; WIN64: call ; WIN64: ret -; X32: testf16_inp +; X32-LABEL: testf16_inp ; X32: movl %eax, (%esp) ; X32: vaddps {{.*}}, {{%ymm[0-1]}} ; X32: vaddps {{.*}}, {{%ymm[0-1]}} ; X32: call ; X32: ret -; X64: testf16_inp +; X64-LABEL: testf16_inp ; X64: vaddps {{.*}}, {{%ymm[0-1]}} ; X64: vaddps {{.*}}, {{%ymm[0-1]}} ; X64: leaq {{.*}}(%rsp), %rdi @@ -41,14 +41,14 @@ define <16 x float> @testf16_inp(<16 x float> %a, <16 x float> %b) nounwind { ;test calling conventions - preserved registers ; preserved ymm6-ymm15 -; WIN64: testf16_regs +; WIN64-LABEL: testf16_regs ; WIN64: call ; WIN64: vaddps {{%ymm[6-7]}}, {{%ymm[0-1]}}, {{%ymm[0-1]}} ; WIN64: vaddps {{%ymm[6-7]}}, {{%ymm[0-1]}}, {{%ymm[0-1]}} ; WIN64: ret ; preserved ymm8-ymm15 -; X64: testf16_regs +; X64-LABEL: testf16_regs ; 
X64: call ; X64: vaddps {{%ymm[8-9]}}, {{%ymm[0-1]}}, {{%ymm[0-1]}} ; X64: vaddps {{%ymm[8-9]}}, {{%ymm[0-1]}}, {{%ymm[0-1]}} @@ -65,28 +65,30 @@ define <16 x float> @testf16_regs(<16 x float> %a, <16 x float> %b) nounwind { } ; test calling conventions - prolog and epilog -; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill -; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill -; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill -; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill -; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill -; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill -; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill -; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill -; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill -; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill +; WIN64-LABEL: test_prolog_epilog +; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}} # 32-byte Spill +; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}} # 32-byte Spill +; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}} # 32-byte Spill +; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}} # 32-byte Spill +; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}} # 32-byte Spill +; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}} # 32-byte Spill +; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}} # 32-byte Spill +; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}} # 32-byte Spill +; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}} # 32-byte Spill +; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}} # 32-byte Spill ; WIN64: call -; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload -; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload -; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload -; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload -; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload -; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload -; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload -; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload -; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload -; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload - +; WIN64: vmovaps {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload +; WIN64: vmovaps {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload +; WIN64: vmovaps {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload +; WIN64: vmovaps {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload +; WIN64: vmovaps {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload +; WIN64: vmovaps {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload +; WIN64: vmovaps {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload +; WIN64: vmovaps {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload +; WIN64: vmovaps {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload +; WIN64: vmovaps {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload + +; X64-LABEL: test_prolog_epilog ; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Folded Spill ; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Folded Spill ; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Folded Spill @@ -111,12 +113,14 @@ 
define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x fl ; test functions with integer parameters ; pass parameters on stack for 32-bit platform +; X32-LABEL: test_int ; X32: movl {{.*}}, 4(%esp) ; X32: movl {{.*}}, (%esp) ; X32: call ; X32: addl {{.*}}, %eax ; pass parameters in registers for 64-bit platform +; X64-LABEL: test_int ; X64: leal {{.*}}, %edi ; X64: movl {{.*}}, %esi ; X64: call @@ -128,21 +132,21 @@ define i32 @test_int(i32 %a, i32 %b) nounwind { ret i32 %c } -; WIN64: test_float4 +; WIN64-LABEL: test_float4 ; WIN64-NOT: vzeroupper ; WIN64: call ; WIN64-NOT: vzeroupper ; WIN64: call ; WIN64: ret -; X64: test_float4 +; X64-LABEL: test_float4 ; X64-NOT: vzeroupper ; X64: call ; X64-NOT: vzeroupper ; X64: call ; X64: ret -; X32: test_float4 +; X32-LABEL: test_float4 ; X32: vzeroupper ; X32: call ; X32: vzeroupper diff --git a/test/CodeGen/X86/avx-shuffle.ll b/test/CodeGen/X86/avx-shuffle.ll index f3f7e554a33b..4a996d79815c 100644 --- a/test/CodeGen/X86/avx-shuffle.ll +++ b/test/CodeGen/X86/avx-shuffle.ll @@ -25,7 +25,7 @@ define <4 x i64> @test3(<4 x i64> %a, <4 x i64> %b) nounwind { %c = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %c ; CHECK-LABEL: test3: -; CHECK: vperm2f128 +; CHECK: vblendpd ; CHECK: ret } diff --git a/test/CodeGen/X86/avx-splat.ll b/test/CodeGen/X86/avx-splat.ll index 5d0781531f4d..b1b2f8b97a73 100644 --- a/test/CodeGen/X86/avx-splat.ll +++ b/test/CodeGen/X86/avx-splat.ll @@ -43,13 +43,10 @@ entry: ret <4 x double> %vecinit6.i } -; Test this simple opt: +; Test this turns into a broadcast: ; shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0> -; To: -; shuffle (vload ptr)), undef, <1, 1, 1, 1> -; CHECK: vmovdqa -; CHECK-NEXT: vpshufd $-1 -; CHECK-NEXT: vinsertf128 $1 +; +; CHECK: vbroadcastss define <8 x float> @funcE() nounwind { allocas: %udx495 = alloca [18 x [18 x float]], align 32 diff --git a/test/CodeGen/X86/avx-vperm2f128.ll b/test/CodeGen/X86/avx-vperm2f128.ll index caa21e5bacfe..c20775bacad2 100644 --- a/test/CodeGen/X86/avx-vperm2f128.ll +++ b/test/CodeGen/X86/avx-vperm2f128.ll @@ -9,7 +9,7 @@ entry: } ; CHECK: _B -; CHECK: vperm2f128 $48 +; CHECK: vblendps $240 define <8 x float> @B(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { entry: %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> diff --git a/test/CodeGen/X86/avx-vshufp.ll b/test/CodeGen/X86/avx-vshufp.ll index 45883b717380..ad3dbc1ed893 100644 --- a/test/CodeGen/X86/avx-vshufp.ll +++ b/test/CodeGen/X86/avx-vshufp.ll @@ -32,14 +32,14 @@ entry: ret <8 x i32> %shuffle } -; CHECK: vshufpd $10, %ymm +; CHECK: vblendpd $10, %ymm define <4 x double> @B(<4 x double> %a, <4 x double> %b) nounwind uwtable readnone ssp { entry: %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %shuffle } -; CHECK: vshufpd $10, (%{{.*}}), %ymm +; CHECK: vblendpd $10, (%{{.*}}), %ymm define <4 x double> @B2(<4 x double>* %a, <4 x double>* %b) nounwind uwtable readnone ssp { entry: %a2 = load <4 x double>* %a @@ -48,14 +48,14 @@ entry: ret <4 x double> %shuffle } -; CHECK: vshufpd $10, %ymm +; CHECK: vblendpd $10, %ymm define <4 x i64> @B3(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp { entry: %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle } -; CHECK: vshufpd $10, (%{{.*}}), %ymm +; CHECK: vblendpd $10, (%{{.*}}), %ymm define <4 x i64> @B4(<4 x i64>* %a, <4 x i64>* %b) nounwind uwtable readnone ssp { entry: %a2 = load <4 x i64>* %a @@ -71,7 
+71,7 @@ entry: ret <8 x float> %shuffle } -; CHECK: vshufpd $2, %ymm +; CHECK: vblendpd $2, %ymm define <4 x double> @D(<4 x double> %a, <4 x double> %b) nounwind uwtable readnone ssp { entry: %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> diff --git a/test/CodeGen/X86/avx512-inc-dec.ll b/test/CodeGen/X86/avx512-inc-dec.ll new file mode 100644 index 000000000000..f04ca878f434 --- /dev/null +++ b/test/CodeGen/X86/avx512-inc-dec.ll @@ -0,0 +1,13 @@ +; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s + +;CHECK-LABEL: test +;CHECK-NOT: dec +;CHECK-NOT: inc +;CHECK: ret +define i32 @test(i32 %a, i32 %b) { + %a1 = add i32 %a, -1 + %b1 = add i32 %b, 1 + %res = mul i32 %a1, %b1 + ret i32 %res +} + diff --git a/test/CodeGen/X86/avx512-intrinsics.ll b/test/CodeGen/X86/avx512-intrinsics.ll index 51175390a258..6f34d4596f9f 100644 --- a/test/CodeGen/X86/avx512-intrinsics.ll +++ b/test/CodeGen/X86/avx512-intrinsics.ll @@ -134,17 +134,17 @@ declare <4 x float> @llvm.x86.avx512.rcp28.ss(<4 x float>, <4 x float>, <4 x flo define <8 x double> @test_sqrt_pd_512(<8 x double> %a0) { ; CHECK: vsqrtpd - %res = call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %a0) ; <<8 x double>> [#uses=1] + %res = call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 4) ; <<8 x double>> [#uses=1] ret <8 x double> %res } -declare <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double>) nounwind readnone +declare <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double>, <8 x double>, i8, i32) nounwind readnone define <16 x float> @test_sqrt_ps_512(<16 x float> %a0) { ; CHECK: vsqrtps - %res = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %a0) ; <<16 x float>> [#uses=1] + %res = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 4) ; <<16 x float>> [#uses=1] ret <16 x float> %res } -declare <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float>) nounwind readnone +declare <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float>, <16 x float>, i16, i32) nounwind readnone define <4 x float> @test_sqrt_ss(<4 x float> %a0, <4 x float> %a1) { ; CHECK: vsqrtss {{.*}}encoding: [0x62 @@ -594,6 +594,13 @@ define <16 x float> @test_vpermt2ps(<16 x float>%x, <16 x float>%y, <16 x i32>%p ret <16 x float> %res } +define <16 x float> @test_vpermt2ps_mask(<16 x float>%x, <16 x float>%y, <16 x i32>%perm, i16 %mask) { +; CHECK-LABEL: test_vpermt2ps_mask: +; CHECK: vpermt2ps %zmm1, %zmm2, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x49,0x7f,0xc1] + %res = call <16 x float> @llvm.x86.avx512.mask.vpermt.ps.512(<16 x i32>%perm, <16 x float>%x, <16 x float>%y, i16 %mask) + ret <16 x float> %res +} + declare <16 x float> @llvm.x86.avx512.mask.vpermt.ps.512(<16 x i32>, <16 x float>, <16 x float>, i16) define <8 x i64> @test_vmovntdqa(i8 *%x) { diff --git a/test/CodeGen/X86/avx512-shuffle.ll b/test/CodeGen/X86/avx512-shuffle.ll index 23ddc3a6c1dc..b99e89a9a546 100644 --- a/test/CodeGen/X86/avx512-shuffle.ll +++ b/test/CodeGen/X86/avx512-shuffle.ll @@ -56,6 +56,16 @@ define <8 x double> @test5(<8 x double> %a, <8 x double> %b) nounwind { ret <8 x double> %c } +; The reg variant of vpermt2 with a writemask +; CHECK-LABEL: test5m: +; CHECK: vpermt2pd {{.* {%k[1-7]} {z}}} +define <8 x double> @test5m(<8 x double> %a, <8 x double> %b, i8 %mask) nounwind { + %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> + %m = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1>
%m, <8 x double> %c, <8 x double> zeroinitializer + ret <8 x double> %res +} + ; CHECK-LABEL: test6: ; CHECK: vpermq $30 ; CHECK: ret @@ -72,6 +82,27 @@ define <8 x i64> @test7(<8 x i64> %a, <8 x i64> %b) nounwind { ret <8 x i64> %c } +; The reg variant of vpermt2 with a writemask +; CHECK-LABEL: test7m: +; CHECK: vpermt2q {{.* {%k[1-7]} {z}}} +define <8 x i64> @test7m(<8 x i64> %a, <8 x i64> %b, i8 %mask) nounwind { + %c = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> + %m = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %m, <8 x i64> %c, <8 x i64> zeroinitializer + ret <8 x i64> %res +} + +; The mem variant of vpermt2 with a writemask +; CHECK-LABEL: test7mm: +; CHECK: vpermt2q {{\(.*\).* {%k[1-7]} {z}}} +define <8 x i64> @test7mm(<8 x i64> %a, <8 x i64> *%pb, i8 %mask) nounwind { + %b = load <8 x i64>* %pb + %c = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> + %m = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %m, <8 x i64> %c, <8 x i64> zeroinitializer + ret <8 x i64> %res +} + ; CHECK-LABEL: test8: ; CHECK: vpermt2d ; CHECK: ret @@ -80,6 +111,27 @@ define <16 x i32> @test8(<16 x i32> %a, <16 x i32> %b) nounwind { ret <16 x i32> %c } +; The reg variant of vpermt2 with a writemask +; CHECK-LABEL: test8m: +; CHECK: vpermt2d {{.* {%k[1-7]} {z}}} +define <16 x i32> @test8m(<16 x i32> %a, <16 x i32> %b, i16 %mask) nounwind { + %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> + %m = bitcast i16 %mask to <16 x i1> + %res = select <16 x i1> %m, <16 x i32> %c, <16 x i32> zeroinitializer + ret <16 x i32> %res +} + +; The mem variant of vpermt2 with a writemask +; CHECK-LABEL: test8mm: +; CHECK: vpermt2d {{\(.*\).* {%k[1-7]} {z}}} +define <16 x i32> @test8mm(<16 x i32> %a, <16 x i32> *%pb, i16 %mask) nounwind { + %b = load <16 x i32> * %pb + %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> + %m = bitcast i16 %mask to <16 x i1> + %res = select <16 x i1> %m, <16 x i32> %c, <16 x i32> zeroinitializer + ret <16 x i32> %res +} + ; CHECK-LABEL: test9: ; CHECK: vpermt2ps ; CHECK: ret @@ -88,6 +140,16 @@ define <16 x float> @test9(<16 x float> %a, <16 x float> %b) nounwind { ret <16 x float> %c } +; The reg variant of vpermt2 with a writemask +; CHECK-LABEL: test9m: +; CHECK: vpermt2ps {{.*}} {%k{{.}}} {z} +define <16 x float> @test9m(<16 x float> %a, <16 x float> %b, i16 %mask) nounwind { + %c = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> + %m = bitcast i16 %mask to <16 x i1> + %res = select <16 x i1> %m, <16 x float> %c, <16 x float> zeroinitializer + ret <16 x float> %res +} + ; CHECK-LABEL: test10: ; CHECK: vpermt2ps ( ; CHECK: ret diff --git a/test/CodeGen/X86/bswap-vector.ll b/test/CodeGen/X86/bswap-vector.ll index 3c931db2e279..9dc960d7779f 100644 --- a/test/CodeGen/X86/bswap-vector.ll +++ b/test/CodeGen/X86/bswap-vector.ll @@ -1,6 +1,7 @@ ; RUN: llc < %s -mcpu=x86-64 | FileCheck %s -check-prefix=CHECK-NOSSSE3 ; RUN: llc < %s -mcpu=core2 | FileCheck %s -check-prefix=CHECK-SSSE3 ; RUN: llc < %s -mcpu=core-avx2 | FileCheck %s -check-prefix=CHECK-AVX2 +; RUN: llc < %s -mcpu=core-avx2 -x86-experimental-vector-widening-legalization | FileCheck %s -check-prefix=CHECK-WIDE-AVX2 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" @@ -31,6 +32,10 @@ entry: ; CHECK-AVX2-LABEL: @test1 ; CHECK-AVX2: vpshufb ; CHECK-AVX2-NEXT: retq + +; CHECK-WIDE-AVX2-LABEL: @test1 +; CHECK-WIDE-AVX2: 
vpshufb +; CHECK-WIDE-AVX2-NEXT: retq } define <4 x i32> @test2(<4 x i32> %v) #0 { @@ -52,6 +57,10 @@ entry: ; CHECK-AVX2-LABEL: @test2 ; CHECK-AVX2: vpshufb ; CHECK-AVX2-NEXT: retq + +; CHECK-WIDE-AVX2-LABEL: @test2 +; CHECK-WIDE-AVX2: vpshufb +; CHECK-WIDE-AVX2-NEXT: retq } define <2 x i64> @test3(<2 x i64> %v) #0 { @@ -71,6 +80,10 @@ entry: ; CHECK-AVX2-LABEL: @test3 ; CHECK-AVX2: vpshufb ; CHECK-AVX2-NEXT: retq + +; CHECK-WIDE-AVX2-LABEL: @test3 +; CHECK-WIDE-AVX2: vpshufb +; CHECK-WIDE-AVX2-NEXT: retq } declare <16 x i16> @llvm.bswap.v16i16(<16 x i16>) @@ -90,6 +103,10 @@ entry: ; CHECK-AVX2-LABEL: @test4 ; CHECK-AVX2: vpshufb ; CHECK-AVX2-NEXT: retq + +; CHECK-WIDE-AVX2-LABEL: @test4 +; CHECK-WIDE-AVX2: vpshufb +; CHECK-WIDE-AVX2-NEXT: retq } define <8 x i32> @test5(<8 x i32> %v) #0 { @@ -105,6 +122,10 @@ entry: ; CHECK-AVX2-LABEL: @test5 ; CHECK-AVX2: vpshufb ; CHECK-AVX2-NEXT: retq + +; CHECK-WIDE-AVX2-LABEL: @test5 +; CHECK-WIDE-AVX2: vpshufb +; CHECK-WIDE-AVX2-NEXT: retq } define <4 x i64> @test6(<4 x i64> %v) #0 { @@ -120,6 +141,10 @@ entry: ; CHECK-AVX2-LABEL: @test6 ; CHECK-AVX2: vpshufb ; CHECK-AVX2-NEXT: retq + +; CHECK-WIDE-AVX2-LABEL: @test6 +; CHECK-WIDE-AVX2: vpshufb +; CHECK-WIDE-AVX2-NEXT: retq } declare <4 x i16> @llvm.bswap.v4i16(<4 x i16>) @@ -138,6 +163,10 @@ entry: ; CHECK-AVX2: vpshufb ; CHECK-AVX2: vpsrld $16 ; CHECK-AVX2-NEXT: retq + +; CHECK-WIDE-AVX2-LABEL: @test7 +; CHECK-WIDE-AVX2: vpshufb +; CHECK-WIDE-AVX2-NEXT: retq } attributes #0 = { nounwind uwtable } diff --git a/test/CodeGen/X86/coff-comdat.ll b/test/CodeGen/X86/coff-comdat.ll new file mode 100644 index 000000000000..bf27b2fff1fa --- /dev/null +++ b/test/CodeGen/X86/coff-comdat.ll @@ -0,0 +1,92 @@ +; RUN: llc -mtriple i386-pc-win32 < %s | FileCheck %s + +$f1 = comdat any +@v1 = global i32 0, comdat $f1 +define void @f1() comdat $f1 { + ret void +} + +$f2 = comdat exactmatch +@v2 = global i32 0, comdat $f2 +define void @f2() comdat $f2 { + ret void +} + +$f3 = comdat largest +@v3 = global i32 0, comdat $f3 +define void @f3() comdat $f3 { + ret void +} + +$f4 = comdat noduplicates +@v4 = global i32 0, comdat $f4 +define void @f4() comdat $f4 { + ret void +} + +$f5 = comdat samesize +@v5 = global i32 0, comdat $f5 +define void @f5() comdat $f5 { + ret void +} + +$f6 = comdat samesize +@v6 = global i32 0, comdat $f6 +@f6 = global i32 0, comdat $f6 + +$"\01@f7@0" = comdat any +define x86_fastcallcc void @"\01@v7@0"() comdat $"\01@f7@0" { + ret void +} +define x86_fastcallcc void @"\01@f7@0"() comdat $"\01@f7@0" { + ret void +} + +$f8 = comdat any +define x86_fastcallcc void @v8() comdat $f8 { + ret void +} +define x86_fastcallcc void @f8() comdat $f8 { + ret void +} + +$vftable = comdat largest + +@some_name = private unnamed_addr constant [2 x i8*] zeroinitializer, comdat $vftable +@vftable = alias getelementptr([2 x i8*]* @some_name, i32 0, i32 1) + +; CHECK: .section .text,"xr",discard,_f1 +; CHECK: .globl _f1 +; CHECK: .section .text,"xr",same_contents,_f2 +; CHECK: .globl _f2 +; CHECK: .section .text,"xr",largest,_f3 +; CHECK: .globl _f3 +; CHECK: .section .text,"xr",one_only,_f4 +; CHECK: .globl _f4 +; CHECK: .section .text,"xr",same_size,_f5 +; CHECK: .globl _f5 +; CHECK: .section .text,"xr",associative,@f7@0 +; CHECK: .globl @v7@0 +; CHECK: .section .text,"xr",discard,@f7@0 +; CHECK: .globl @f7@0 +; CHECK: .section .text,"xr",associative,@f8@0 +; CHECK: .globl @v8@0 +; CHECK: .section .text,"xr",discard,@f8@0 +; CHECK: .globl @f8@0 +; CHECK: .section .bss,"bw",associative,_f1 +; CHECK: .globl _v1 
+; CHECK: .section .bss,"bw",associative,_f2 +; CHECK: .globl _v2 +; CHECK: .section .bss,"bw",associative,_f3 +; CHECK: .globl _v3 +; CHECK: .section .bss,"bw",associative,_f4 +; CHECK: .globl _v4 +; CHECK: .section .bss,"bw",associative,_f5 +; CHECK: .globl _v5 +; CHECK: .section .bss,"bw",associative,_f6 +; CHECK: .globl _v6 +; CHECK: .section .bss,"bw",same_size,_f6 +; CHECK: .globl _f6 +; CHECK: .section .rdata,"rd",largest,_vftable +; CHECK: .globl _vftable +; CHECK: _vftable = L_some_name+4 diff --git a/test/CodeGen/X86/coff-comdat2.ll b/test/CodeGen/X86/coff-comdat2.ll new file mode 100644 index 000000000000..6744b5b02ad7 --- /dev/null +++ b/test/CodeGen/X86/coff-comdat2.ll @@ -0,0 +1,9 @@ +; RUN: not llc %s -o /dev/null 2>&1 | FileCheck %s + +target datalayout = "e-m:w-p:32:32-i64:64-f80:32-n8:16:32-S32" +target triple = "i686-pc-windows-msvc" + +$foo = comdat largest +@foo = global i32 0 +@bar = global i32 0, comdat $foo +; CHECK: Associative COMDAT symbol 'foo' is not a key for it's COMDAT. diff --git a/test/CodeGen/X86/coff-comdat3.ll b/test/CodeGen/X86/coff-comdat3.ll new file mode 100644 index 000000000000..76e464b27547 --- /dev/null +++ b/test/CodeGen/X86/coff-comdat3.ll @@ -0,0 +1,8 @@ +; RUN: not llc %s -o /dev/null 2>&1 | FileCheck %s + +target datalayout = "e-m:w-p:32:32-i64:64-f80:32-n8:16:32-S32" +target triple = "i686-pc-windows-msvc" + +$foo = comdat largest +@bar = global i32 0, comdat $foo +; CHECK: Associative COMDAT symbol 'foo' does not exist. diff --git a/test/CodeGen/X86/combine-or.ll b/test/CodeGen/X86/combine-or.ll index 572aded5e9a3..df3b9015adda 100644 --- a/test/CodeGen/X86/combine-or.ll +++ b/test/CodeGen/X86/combine-or.ll @@ -74,7 +74,7 @@ define <4 x i32> @test6(<4 x i32> %a, <4 x i32> %b) { } ; CHECK-LABEL: test6 ; CHECK-NOT: xorps -; CHECK: shufps +; CHECK: blendps $12 ; CHECK-NEXT: ret @@ -86,7 +86,7 @@ define <4 x i32> @test7(<4 x i32> %a, <4 x i32> %b) { } ; CHECK-LABEL: test7 ; CHECK-NOT: xorps -; CHECK: shufps +; CHECK: blendps $12 ; CHECK-NEXT: ret @@ -266,4 +266,16 @@ define <2 x i64> @test21(<2 x i64> %a, <2 x i64> %b) { ; CHECK-NEXT: pslldq ; CHECK-NEXT: ret +; Verify that the DAGCombiner doesn't crash in the attempt to check if a shuffle +; with illegal type has a legal mask. Method 'isShuffleMaskLegal' only knows how to +; handle legal vector value types. +define <4 x i8> @test_crash(<4 x i8> %a, <4 x i8> %b) { + %shuf1 = shufflevector <4 x i8> %a, <4 x i8> zeroinitializer, <4 x i32> + %shuf2 = shufflevector <4 x i8> %b, <4 x i8> zeroinitializer, <4 x i32> + %or = or <4 x i8> %shuf1, %shuf2 + ret <4 x i8> %or +} +; CHECK-LABEL: test_crash +; CHECK: movsd +; CHECK: ret diff --git a/test/CodeGen/X86/combine-vec-shuffle-2.ll b/test/CodeGen/X86/combine-vec-shuffle-2.ll new file mode 100644 index 000000000000..877d38260d61 --- /dev/null +++ b/test/CodeGen/X86/combine-vec-shuffle-2.ll @@ -0,0 +1,253 @@ +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 | FileCheck %s + +; Check that DAGCombiner correctly folds the following pairs of shuffles +; using the following rules: +; 1. shuffle(shuffle(x, y), undef) -> x +; 2. shuffle(shuffle(x, y), undef) -> y +; 3. shuffle(shuffle(x, y), undef) -> shuffle(x, undef) +; 4. shuffle(shuffle(x, y), undef) -> shuffle(undef, y) +; +; Rules 3. and 4. are used only if the resulting shuffle mask is legal. 
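To make rules 1-4 concrete, here is a minimal standalone C++ sketch (illustrative only, not LLVM source; the mask values below are invented) of the lane-by-lane composition these folds rely on:

#include <array>
#include <cstdio>

// A 4-lane shuffle mask: lanes 0-3 pick from x, lanes 4-7 pick from y,
// and -1 marks an undef lane.
using Mask = std::array<int, 4>;

// Compose shuffle(shuffle(x, y, Inner), undef, Outer) into a single mask.
Mask foldMasks(const Mask &Inner, const Mask &Outer) {
  Mask Result{};
  for (int i = 0; i != 4; ++i) {
    int Idx = Outer[i];
    // Outer lanes that are undef, or that read the outer shuffle's undef
    // second operand, stay undef; all other lanes resolve through Inner.
    Result[i] = (Idx < 0 || Idx > 3) ? -1 : Inner[Idx];
  }
  return Result;
}

int main() {
  // Invented example: inner mask <0,1,6,7>, outer mask <0,1,0,1>.
  // The composed mask <0,1,0,1> reads only x, so rule 3 applies and the
  // pair folds to shuffle(x, undef, <0,1,0,1>).
  Mask M = foldMasks({0, 1, 6, 7}, {0, 1, 0, 1});
  printf("folded mask: <%d,%d,%d,%d>\n", M[0], M[1], M[2], M[3]);
  return 0;
}

Rules 1 and 2 are the special cases where the composed mask is the identity over x or over y; rules 3 and 4 fire only when every defined lane of the composed mask lands in the same operand.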
+ +define <4 x i32> @test1(<4 x i32> %A, <4 x i32> %B) { + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test1 +; Mask: [3,0,0,1] +; CHECK: pshufd $67 +; CHECK-NEXT: ret + + +define <4 x i32> @test2(<4 x i32> %A, <4 x i32> %B) { + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test2 +; Mask: [2,0,0,3] +; CHECK: pshufd $-62 +; CHECK-NEXT: ret + + +define <4 x i32> @test3(<4 x i32> %A, <4 x i32> %B) { + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test3 +; Mask: [2,0,0,3] +; CHECK: pshufd $-62 +; CHECK-NEXT: ret + + +define <4 x i32> @test4(<4 x i32> %A, <4 x i32> %B) { + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test4 +; Mask: [0,0,0,1] +; CHECK: pshufd $64 +; CHECK-NEXT: ret + + +define <4 x i32> @test5(<4 x i32> %A, <4 x i32> %B) { + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test5 +; Mask: [1,1] +; CHECK: movhlps +; CHECK-NEXT: ret + + +define <4 x i32> @test6(<4 x i32> %A, <4 x i32> %B) { + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test6 +; Mask: [2,0,0,0] +; CHECK: pshufd $2 +; CHECK-NEXT: ret + + +define <4 x i32> @test7(<4 x i32> %A, <4 x i32> %B) { + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test7 +; Mask: [0,2,0,2] +; CHECK: pshufd $-120 +; CHECK-NEXT: ret + + +define <4 x i32> @test8(<4 x i32> %A, <4 x i32> %B) { + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test8 +; Mask: [1,0,3,0] +; CHECK: pshufd $49 +; CHECK-NEXT: ret + + +define <4 x i32> @test9(<4 x i32> %A, <4 x i32> %B) { + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test9 +; Mask: [1,3,0,2] +; CHECK: pshufd $-115 +; CHECK-NEXT: ret + + +define <4 x i32> @test10(<4 x i32> %A, <4 x i32> %B) { + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test10 +; Mask: [1,0,1,0] +; CHECK: pshufd $17 +; CHECK-NEXT: ret + + +define <4 x i32> @test11(<4 x i32> %A, <4 x i32> %B) { + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test11 +; Mask: [1,0,2,1] +; CHECK: pshufd $97 +; CHECK-NEXT: ret + + +define <4 x i32> @test12(<4 x i32> %A, <4 x i32> %B) { + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test12 +; Mask: [0,0,0,0] +; CHECK: pshufd $0 +; CHECK-NEXT: ret + + +; The following pair of shuffles is folded into vector %A. 
+define <4 x i32> @test13(<4 x i32> %A, <4 x i32> %B) { + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test13 +; CHECK-NOT: pshufd +; CHECK: ret + + +; The following pair of shuffles is folded into vector %B. +define <4 x i32> @test14(<4 x i32> %A, <4 x i32> %B) { + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test14 +; CHECK-NOT: pshufd +; CHECK: ret + + +; Verify that we don't optimize the following cases. We expect more than one shuffle. + +define <4 x i32> @test15(<4 x i32> %A, <4 x i32> %B) { + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test15 +; CHECK: shufps $114 +; CHECK-NEXT: pshufd $-58 +; CHECK-NEXT: ret + + +define <4 x i32> @test16(<4 x i32> %A, <4 x i32> %B) { + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test16 +; CHECK: blendps $10 +; CHECK-NEXT: pshufd $-58 +; CHECK-NEXT: ret + + +define <4 x i32> @test17(<4 x i32> %A, <4 x i32> %B) { + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test17 +; CHECK: shufps $120 +; CHECK-NEXT: pshufd $-58 +; CHECK-NEXT: ret + + +define <4 x i32> @test18(<4 x i32> %A, <4 x i32> %B) { + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test18 +; CHECK: blendps $11 +; CHECK-NEXT: pshufd $-59 +; CHECK-NEXT: ret + +define <4 x i32> @test19(<4 x i32> %A, <4 x i32> %B) { + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test19 +; CHECK: shufps $-104 +; CHECK-NEXT: pshufd $2 +; CHECK-NEXT: ret + + +define <4 x i32> @test20(<4 x i32> %A, <4 x i32> %B) { + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test20 +; CHECK: shufps $11 +; CHECK-NEXT: pshufd $-58 +; CHECK-NEXT: ret + + +define <4 x i32> @test21(<4 x i32> %A, <4 x i32> %B) { + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test21 +; CHECK: shufps $120 +; CHECK-NEXT: pshufd $-60 +; CHECK-NEXT: ret + + +define <4 x i32> @test22(<4 x i32> %A, <4 x i32> %B) { + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test22 +; CHECK: blendps $11 +; CHECK-NEXT: pshufd $-43 +; CHECK-NEXT: ret + diff --git a/test/CodeGen/X86/combine-vec-shuffle-3.ll b/test/CodeGen/X86/combine-vec-shuffle-3.ll new file mode 100644 index 000000000000..bd2d34ca189a --- /dev/null +++ b/test/CodeGen/X86/combine-vec-shuffle-3.ll @@ -0,0 +1,380 @@ +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 | FileCheck %s + +define <4 x float> @test1(<4 x float> %a, <4 x float> %b) { + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> + ret <4 x float> %2 +} +; 
CHECK-LABEL: test1 +; Mask: [0,1,2,3] +; CHECK: movaps +; CHECK: ret + +define <4 x float> @test2(<4 x float> %a, <4 x float> %b) { + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> + ret <4 x float> %2 +} +; CHECK-LABEL: test2 +; Mask: [0,5,6,7] +; CHECK: movss +; CHECK: ret + +define <4 x float> @test3(<4 x float> %a, <4 x float> %b) { + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> + ret <4 x float> %2 +} +; CHECK-LABEL: test3 +; Mask: [0,1,4,5] +; CHECK: movlhps +; CHECK: ret + +define <4 x float> @test4(<4 x float> %a, <4 x float> %b) { + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> + ret <4 x float> %2 +} +; CHECK-LABEL: test4 +; Mask: [6,7,2,3] +; CHECK: movhlps +; CHECK-NEXT: ret + +define <4 x float> @test5(<4 x float> %a, <4 x float> %b) { + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> + ret <4 x float> %2 +} +; CHECK-LABEL: test5 +; Mask: [4,1,6,7] +; CHECK: blendps $13 +; CHECK: ret + + +define <4 x i32> @test6(<4 x i32> %a, <4 x i32> %b) { + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test6 +; Mask: [4,5,6,7] +; CHECK: movaps +; CHECK: ret + +define <4 x i32> @test7(<4 x i32> %a, <4 x i32> %b) { + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test7 +; Mask: [0,5,6,7] +; CHECK: movss +; CHECK: ret + +define <4 x i32> @test8(<4 x i32> %a, <4 x i32> %b) { + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test8 +; Mask: [0,1,4,5] +; CHECK: movlhps +; CHECK: ret + +define <4 x i32> @test9(<4 x i32> %a, <4 x i32> %b) { + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test9 +; Mask: [6,7,2,3] +; CHECK: movhlps +; CHECK-NEXT: ret + +define <4 x i32> @test10(<4 x i32> %a, <4 x i32> %b) { + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test10 +; Mask: [4,1,6,7] +; CHECK: blendps +; CHECK: ret + +define <4 x float> @test11(<4 x float> %a, <4 x float> %b) { + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> + %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> + ret <4 x float> %2 +} +; CHECK-LABEL: test11 +; Mask: [0,1,2,3] +; CHECK-NOT: movaps +; CHECK-NOT: blendps +; CHECK: ret + +define <4 x float> @test12(<4 x float> %a, <4 x float> %b) { + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> + %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> + ret <4 x float> %2 +} +; CHECK-LABEL: test12 +; Mask: [0,5,6,7] +; CHECK: movss +; CHECK: ret + +define <4 x float> @test13(<4 x float> %a, <4 x float> %b) { + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> + %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> + ret <4 x float> %2 +} +; CHECK-LABEL: test13 +; Mask: [0,1,4,5] +; CHECK: movlhps +; CHECK: ret + +define <4 x float> @test14(<4 x float> %a, <4 x float> %b) { + %1 = shufflevector <4 x 
float> %a, <4 x float> %b, <4 x i32> + %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> + ret <4 x float> %2 +} +; CHECK-LABEL: test14 +; Mask: [6,7,2,3] +; CHECK: movhlps +; CHECK: ret + +define <4 x float> @test15(<4 x float> %a, <4 x float> %b) { + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> + %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> + ret <4 x float> %2 +} +; CHECK-LABEL: test15 +; Mask: [4,1,6,7] +; CHECK: blendps $13 +; CHECK: ret + +define <4 x i32> @test16(<4 x i32> %a, <4 x i32> %b) { + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test16 +; Mask: [0,1,2,3] +; CHECK-NOT: movaps +; CHECK-NOT: blendps +; CHECK: ret + +define <4 x i32> @test17(<4 x i32> %a, <4 x i32> %b) { + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test17 +; Mask: [0,5,6,7] +; CHECK: movss +; CHECK: ret + +define <4 x i32> @test18(<4 x i32> %a, <4 x i32> %b) { + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test18 +; Mask: [0,1,4,5] +; CHECK: movlhps +; CHECK: ret + +define <4 x i32> @test19(<4 x i32> %a, <4 x i32> %b) { + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test19 +; Mask: [6,7,2,3] +; CHECK: movhlps +; CHECK: ret + +define <4 x i32> @test20(<4 x i32> %a, <4 x i32> %b) { + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test20 +; Mask: [4,1,6,7] +; CHECK: blendps $13 +; CHECK: ret + +; Check some negative cases. +define <4 x float> @test1b(<4 x float> %a, <4 x float> %b) { + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> + ret <4 x float> %2 +} +; CHECK-LABEL: test1b +; CHECK: shufps +; CHECK: shufps +; CHECK: ret + +define <4 x float> @test2b(<4 x float> %a, <4 x float> %b) { + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> + ret <4 x float> %2 +} +; CHECK-LABEL: test2b +; CHECK: shufps +; CHECK: pshufd +; CHECK: ret + +define <4 x float> @test3b(<4 x float> %a, <4 x float> %b) { + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> + ret <4 x float> %2 +} +; CHECK-LABEL: test3b +; CHECK: shufps +; CHECK: shufps +; CHECK: ret + +define <4 x float> @test4b(<4 x float> %a, <4 x float> %b) { + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> + ret <4 x float> %2 +} +; CHECK-LABEL: test4b +; CHECK: shufps +; CHECK: shufps +; CHECK: ret + + +; Verify that we correctly fold shuffles even when we use illegal vector types. 
+define <4 x i8> @test1c(<4 x i8>* %a, <4 x i8>* %b) { + %A = load <4 x i8>* %a + %B = load <4 x i8>* %b + %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> + %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> + ret <4 x i8> %2 +} +; CHECK-LABEL: test1c +; Mask: [0,5,6,7] +; CHECK: movss +; CHECK-NEXT: ret + +define <4 x i8> @test2c(<4 x i8>* %a, <4 x i8>* %b) { + %A = load <4 x i8>* %a + %B = load <4 x i8>* %b + %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> + %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> + ret <4 x i8> %2 +} +; CHECK-LABEL: test2c +; Mask: [0,1,4,5] +; CHECK: movlhps +; CHECK-NEXT: ret + +define <4 x i8> @test3c(<4 x i8>* %a, <4 x i8>* %b) { + %A = load <4 x i8>* %a + %B = load <4 x i8>* %b + %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> + %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> + ret <4 x i8> %2 +} +; CHECK-LABEL: test3c +; Mask: [6,7,2,3] +; CHECK: movhlps +; CHECK-NEXT: ret + +define <4 x i8> @test4c(<4 x i8>* %a, <4 x i8>* %b) { + %A = load <4 x i8>* %a + %B = load <4 x i8>* %b + %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> + %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> + ret <4 x i8> %2 +} +; CHECK-LABEL: test4c +; Mask: [4,1,6,7] +; CHECK: blendps $13 +; CHECK: ret + +; The following test cases are generated from this C++ code +; +;__m128 blend_01(__m128 a, __m128 b) +;{ +; __m128 s = a; +; s = _mm_blend_ps( s, b, 1<<0 ); +; s = _mm_blend_ps( s, b, 1<<1 ); +; return s; +;} +; +;__m128 blend_02(__m128 a, __m128 b) +;{ +; __m128 s = a; +; s = _mm_blend_ps( s, b, 1<<0 ); +; s = _mm_blend_ps( s, b, 1<<2 ); +; return s; +;} +; +;__m128 blend_123(__m128 a, __m128 b) +;{ +; __m128 s = a; +; s = _mm_blend_ps( s, b, 1<<1 ); +; s = _mm_blend_ps( s, b, 1<<2 ); +; s = _mm_blend_ps( s, b, 1<<3 ); +; return s; +;} + +; Ideally, we should collapse the following shuffles into a single one. 
+ +define <4 x float> @blend_01(<4 x float> %a, <4 x float> %b) { + %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> + %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> + ret <4 x float> %shuffle6 +} +; CHECK-LABEL: blend_01 +; CHECK: movsd +; CHECK-NEXT: ret + +define <4 x float> @blend_02(<4 x float> %a, <4 x float> %b) { + %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> + %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> + ret <4 x float> %shuffle6 +} +; CHECK-LABEL: blend_02 +; CHECK: blendps $5 +; CHECK-NEXT: ret + +define <4 x float> @blend_123(<4 x float> %a, <4 x float> %b) { + %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> + %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> + %shuffle12 = shufflevector <4 x float> %shuffle6, <4 x float> %b, <4 x i32> + ret <4 x float> %shuffle12 +} +; CHECK-LABEL: blend_123 +; CHECK: movss +; CHECK: ret + +define <4 x i32> @test_movhl_1(<4 x i32> %a, <4 x i32> %b) { + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test_movhl_1 +; CHECK: movhlps +; CHECK-NEXT: ret + +define <4 x i32> @test_movhl_2(<4 x i32> %a, <4 x i32> %b) { + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test_movhl_2 +; CHECK: movhlps +; CHECK-NEXT: ret + +define <4 x i32> @test_movhl_3(<4 x i32> %a, <4 x i32> %b) { + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test_movhl_3 +; CHECK: movhlps +; CHECK-NEXT: ret + diff --git a/test/CodeGen/X86/combine-vec-shuffle-4.ll b/test/CodeGen/X86/combine-vec-shuffle-4.ll new file mode 100644 index 000000000000..0ddec2c12fb5 --- /dev/null +++ b/test/CodeGen/X86/combine-vec-shuffle-4.ll @@ -0,0 +1,237 @@ +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 | FileCheck %s + +; Verify that we fold shuffles according to rule: +; (shuffle(shuffle A, Undef, M0), B, M1) -> (shuffle A, B, M2) + +define <4 x float> @test1(<4 x float> %a, <4 x float> %b) { + %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> + ret <4 x float> %2 +} +; CHECK-LABEL: test1 +; Mask: [4,5,2,3] +; CHECK: movsd +; CHECK: ret + +define <4 x float> @test2(<4 x float> %a, <4 x float> %b) { + %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> + ret <4 x float> %2 +} +; CHECK-LABEL: test2 +; Mask: [0,1,4,5] +; CHECK: movlhps +; CHECK: ret + +define <4 x float> @test3(<4 x float> %a, <4 x float> %b) { + %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> + ret <4 x float> %2 +} +; CHECK-LABEL: test3 +; Mask: [0,1,4,u] +; CHECK: movlhps +; CHECK: ret + +define <4 x float> @test4(<4 x float> %a, <4 x float> %b) { + %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> + ret <4 x float> %2 +} +; CHECK-LABEL: test4 +; Mask: [6,7,2,3] +; CHECK: movhlps +; CHECK-NEXT: ret + +define <4 x float> @test5(<4 x float> %a, <4 x float> %b) { + %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> + %2 = shufflevector <4 x float> %1, 
<4 x float> %b, <4 x i32> + ret <4 x float> %2 +} +; CHECK-LABEL: test5 +; Mask: [0,1,6,7] +; CHECK: blendps $12 +; CHECK: ret + +; Verify that we fold shuffles according to rule: +; (shuffle(shuffle A, Undef, M0), A, M1) -> (shuffle A, Undef, M2) + +define <4 x float> @test6(<4 x float> %a) { + %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> + %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> + ret <4 x float> %2 +} +; CHECK-LABEL: test6 +; Mask: [0,1,2,3] +; CHECK-NOT: pshufd +; CHECK-NOT: shufps +; CHECK-NOT: movlhps +; CHECK: ret + +define <4 x float> @test7(<4 x float> %a) { + %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> + %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> + ret <4 x float> %2 +} +; CHECK-LABEL: test7 +; Mask: [0,1,0,1] +; CHECK-NOT: pshufd +; CHECK-NOT: shufps +; CHECK: movlhps +; CHECK-NEXT: ret + +define <4 x float> @test8(<4 x float> %a) { + %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> + %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> + ret <4 x float> %2 +} +; CHECK-LABEL: test8 +; Mask: [0,1,0,u] +; CHECK-NOT: pshufd +; CHECK-NOT: shufps +; CHECK: movlhps +; CHECK-NEXT: ret + +define <4 x float> @test9(<4 x float> %a) { + %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> + %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> + ret <4 x float> %2 +} +; CHECK-LABEL: test9 +; Mask: [2,3,2,3] +; CHECK-NOT: movlhps +; CHECK-NOT: palignr +; CHECK: movhlps +; CHECK-NEXT: ret + +define <4 x float> @test10(<4 x float> %a) { + %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> + %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> + ret <4 x float> %2 +} +; CHECK-LABEL: test10 +; Mask: [0,1,2,3] +; CHECK-NOT: pshufd +; CHECK-NOT: shufps +; CHECK-NOT: movlhps +; CHECK: ret + +define <4 x float> @test11(<4 x float> %a, <4 x float> %b) { + %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> + %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> + ret <4 x float> %2 +} +; CHECK-LABEL: test11 +; Mask: [4,5,2,3] +; CHECK: movsd +; CHECK: ret + +define <4 x float> @test12(<4 x float> %a, <4 x float> %b) { + %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> + %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> + ret <4 x float> %2 +} +; CHECK-LABEL: test12 +; Mask: [0,1,4,5] +; CHECK: movlhps +; CHECK: ret + +define <4 x float> @test13(<4 x float> %a, <4 x float> %b) { + %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> + %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> + ret <4 x float> %2 +} +; CHECK-LABEL: test13 +; Mask: [0,1,4,u] +; CHECK: movlhps +; CHECK: ret + +define <4 x float> @test14(<4 x float> %a, <4 x float> %b) { + %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> + %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> + ret <4 x float> %2 +} +; CHECK-LABEL: test14 +; Mask: [6,7,2,3] +; CHECK: movhlps +; CHECK-NEXT: ret + +define <4 x float> @test15(<4 x float> %a, <4 x float> %b) { + %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> + %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> + ret <4 x float> %2 +} +; CHECK-LABEL: test15 +; Mask: [0,1,6,7] +; CHECK: blendps $12 +; CHECK: ret + +; Verify that shuffles are canonicalized according to rules: +; shuffle(B, shuffle(A, Undef)) -> shuffle(shuffle(A, Undef), B) +; +; This allows to trigger the following combine rule: +; (shuffle(shuffle A, Undef, M0), A, M1) -> (shuffle 
A, Undef, M2) +; +; As a result, all the shuffle pairs in each function below should be +; combined into a single legal shuffle operation. + +define <4 x float> @test16(<4 x float> %a) { + %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> + %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> + ret <4 x float> %2 +} +; CHECK-LABEL: test16 +; Mask: [0,1,2,3] +; CHECK-NOT: pshufd +; CHECK-NOT: shufps +; CHECK-NOT: movlhps +; CHECK: ret + +define <4 x float> @test17(<4 x float> %a) { + %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> + %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> + ret <4 x float> %2 +} +; CHECK-LABEL: test17 +; Mask: [0,1,0,1] +; CHECK-NOT: pshufd +; CHECK-NOT: shufps +; CHECK: movlhps +; CHECK-NEXT: ret + +define <4 x float> @test18(<4 x float> %a) { + %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> + %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> + ret <4 x float> %2 +} +; CHECK-LABEL: test18 +; Mask: [0,1,0,u] +; CHECK-NOT: pshufd +; CHECK-NOT: shufps +; CHECK: movlhps +; CHECK-NEXT: ret + +define <4 x float> @test19(<4 x float> %a) { + %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> + %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> + ret <4 x float> %2 +} +; CHECK-LABEL: test19 +; Mask: [2,3,2,3] +; CHECK-NOT: movlhps +; CHECK-NOT: palignr +; CHECK: movhlps +; CHECK-NEXT: ret + +define <4 x float> @test20(<4 x float> %a) { + %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> + %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> + ret <4 x float> %2 +} +; CHECK-LABEL: test20 +; Mask: [0,1,2,3] +; CHECK-NOT: pshufd +; CHECK-NOT: shufps +; CHECK-NOT: movlhps +; CHECK: ret + diff --git a/test/CodeGen/X86/combine-vec-shuffle-5.ll b/test/CodeGen/X86/combine-vec-shuffle-5.ll new file mode 100644 index 000000000000..16c45efe4be6 --- /dev/null +++ b/test/CodeGen/X86/combine-vec-shuffle-5.ll @@ -0,0 +1,257 @@ +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 | FileCheck %s + +; Verify that the DAGCombiner correctly folds all the shufflevector pairs +; into a single shuffle operation. 
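For orientation, each "Mask: [...]" annotation in the tests below pairs with the single instruction the CHECK lines expect. A small hedged C++ sketch of that correspondence (the table simply mirrors this file's comments, not LLVM's actual instruction selection logic):

#include <array>
#include <cstdio>

using Mask = std::array<int, 4>;  // lanes 0-3 from operand A, lanes 4-7 from operand B

// Mirrors the mask -> CHECK pattern pairs annotated in the tests below.
const char *expectedInsn(const Mask &M) {
  if (M == Mask{0, 1, 2, 3}) return "movaps";       // plain copy of A
  if (M == Mask{0, 5, 6, 7}) return "movss";        // scalar lane 0 of A into B
  if (M == Mask{0, 1, 4, 5}) return "movlhps";      // low halves of A and B
  if (M == Mask{6, 7, 2, 3}) return "movhlps";      // B's high half into lanes 0-1 of A
  if (M == Mask{4, 1, 6, 7}) return "blendps $13";  // set immediate bits pick lanes from B
  return "shufps/pshufd";                           // generic two-shuffle fallback
}

int main() {
  Mask M{4, 1, 6, 7};  // lanes 0, 2, 3 come from B -> immediate 0b1101 = 13
  printf("[4,1,6,7] -> %s\n", expectedInsn(M));
  return 0;
}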
+ +define <4 x float> @test1(<4 x float> %a, <4 x float> %b) { + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> + %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> + ret <4 x float> %2 +} +; CHECK-LABEL: test1 +; Mask: [0,1,2,3] +; CHECK: movaps +; CHECK: ret + +define <4 x float> @test2(<4 x float> %a, <4 x float> %b) { + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> + %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> + ret <4 x float> %2 +} +; CHECK-LABEL: test2 +; Mask: [0,5,6,7] +; CHECK: movss +; CHECK: ret + +define <4 x float> @test3(<4 x float> %a, <4 x float> %b) { + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> + %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> + ret <4 x float> %2 +} +; CHECK-LABEL: test3 +; Mask: [0,1,4,5] +; CHECK: movlhps +; CHECK: ret + +define <4 x float> @test4(<4 x float> %a, <4 x float> %b) { + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> + %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> + ret <4 x float> %2 +} +; CHECK-LABEL: test4 +; Mask: [6,7,2,3] +; CHECK: movhlps +; CHECK-NEXT: ret + +define <4 x float> @test5(<4 x float> %a, <4 x float> %b) { + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> + %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> + ret <4 x float> %2 +} +; CHECK-LABEL: test5 +; Mask: [4,1,6,7] +; CHECK: blendps $13 +; CHECK: ret + + +define <4 x i32> @test6(<4 x i32> %a, <4 x i32> %b) { + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> + %2 = shufflevector <4 x i32> %b, <4 x i32> %1, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test6 +; Mask: [4,5,6,7] +; CHECK: movaps +; CHECK: ret + +define <4 x i32> @test7(<4 x i32> %a, <4 x i32> %b) { + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> + %2 = shufflevector <4 x i32> %b, <4 x i32> %1, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test7 +; Mask: [0,5,6,7] +; CHECK: movss +; CHECK: ret + +define <4 x i32> @test8(<4 x i32> %a, <4 x i32> %b) { + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> + %2 = shufflevector <4 x i32> %b, <4 x i32> %1, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test8 +; Mask: [0,1,4,5] +; CHECK: movlhps +; CHECK: ret + +define <4 x i32> @test9(<4 x i32> %a, <4 x i32> %b) { + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> + %2 = shufflevector <4 x i32> %b, <4 x i32> %1, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test9 +; Mask: [6,7,2,3] +; CHECK: movhlps +; CHECK-NEXT: ret + +define <4 x i32> @test10(<4 x i32> %a, <4 x i32> %b) { + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> + %2 = shufflevector <4 x i32> %b, <4 x i32> %1, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test10 +; Mask: [4,1,6,7] +; CHECK: blendps +; CHECK: ret + +define <4 x float> @test11(<4 x float> %a, <4 x float> %b) { + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> + %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> + ret <4 x float> %2 +} +; CHECK-LABEL: test11 +; Mask: [0,1,2,3] +; CHECK-NOT: movaps +; CHECK-NOT: blendps +; CHECK: ret + +define <4 x float> @test12(<4 x float> %a, <4 x float> %b) { + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> + %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> + ret <4 x float> %2 +} +; CHECK-LABEL: test12 +; Mask: [0,5,6,7] +; CHECK: movss +; CHECK: ret + +define <4 x float> @test13(<4 x float> %a, <4 x float> %b) { + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> + %2 = shufflevector <4 x float> %a, 
<4 x float> %1, <4 x i32> + ret <4 x float> %2 +} +; CHECK-LABEL: test13 +; Mask: [0,1,4,5] +; CHECK: movlhps +; CHECK: ret + +define <4 x float> @test14(<4 x float> %a, <4 x float> %b) { + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> + %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> + ret <4 x float> %2 +} +; CHECK-LABEL: test14 +; Mask: [6,7,2,3] +; CHECK: movhlps +; CHECK: ret + +define <4 x float> @test15(<4 x float> %a, <4 x float> %b) { + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> + %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> + ret <4 x float> %2 +} +; CHECK-LABEL: test15 +; Mask: [4,1,6,7] +; CHECK: blendps $13 +; CHECK: ret + +define <4 x i32> @test16(<4 x i32> %a, <4 x i32> %b) { + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> + %2 = shufflevector <4 x i32> %a, <4 x i32> %1, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test16 +; Mask: [0,1,2,3] +; CHECK-NOT: movaps +; CHECK-NOT: blendps +; CHECK: ret + +define <4 x i32> @test17(<4 x i32> %a, <4 x i32> %b) { + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> + %2 = shufflevector <4 x i32> %a, <4 x i32> %1, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test17 +; Mask: [0,5,6,7] +; CHECK: movss +; CHECK: ret + +define <4 x i32> @test18(<4 x i32> %a, <4 x i32> %b) { + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> + %2 = shufflevector <4 x i32> %a, <4 x i32> %1, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test18 +; Mask: [0,1,4,5] +; CHECK: movlhps +; CHECK: ret + +define <4 x i32> @test19(<4 x i32> %a, <4 x i32> %b) { + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> + %2 = shufflevector <4 x i32> %a, <4 x i32> %1, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test19 +; Mask: [6,7,2,3] +; CHECK: movhlps +; CHECK: ret + +define <4 x i32> @test20(<4 x i32> %a, <4 x i32> %b) { + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> + %2 = shufflevector <4 x i32> %a, <4 x i32> %1, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test20 +; Mask: [4,1,6,7] +; CHECK: blendps $13 +; CHECK: ret + +; Verify that we correctly fold shuffles even when we use illegal vector types. 
+define <4 x i8> @test1c(<4 x i8>* %a, <4 x i8>* %b) { + %A = load <4 x i8>* %a + %B = load <4 x i8>* %b + %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> + %2 = shufflevector <4 x i8> %B, <4 x i8> %1, <4 x i32> + ret <4 x i8> %2 +} +; CHECK-LABEL: test1c +; Mask: [0,5,6,7] +; CHECK: movss +; CHECK-NEXT: ret + +define <4 x i8> @test2c(<4 x i8>* %a, <4 x i8>* %b) { + %A = load <4 x i8>* %a + %B = load <4 x i8>* %b + %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> + %2 = shufflevector <4 x i8> %B, <4 x i8> %1, <4 x i32> + ret <4 x i8> %2 +} +; CHECK-LABEL: test2c +; Mask: [0,1,4,5] +; CHECK: movlhps +; CHECK-NEXT: ret + +define <4 x i8> @test3c(<4 x i8>* %a, <4 x i8>* %b) { + %A = load <4 x i8>* %a + %B = load <4 x i8>* %b + %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> + %2 = shufflevector <4 x i8> %B, <4 x i8> %1, <4 x i32> + ret <4 x i8> %2 +} +; CHECK-LABEL: test3c +; Mask: [6,7,2,3] +; CHECK: movhlps +; CHECK: ret + +define <4 x i8> @test4c(<4 x i8>* %a, <4 x i8>* %b) { + %A = load <4 x i8>* %a + %B = load <4 x i8>* %b + %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> + %2 = shufflevector <4 x i8> %B, <4 x i8> %1, <4 x i32> + ret <4 x i8> %2 +} +; CHECK-LABEL: test4c +; Mask: [4,1,6,7] +; CHECK: blendps $13 +; CHECK: ret + diff --git a/test/CodeGen/X86/constant-pool-remat-0.ll b/test/CodeGen/X86/constant-pool-remat-0.ll index 4a0110896ced..e42a87c6acde 100644 --- a/test/CodeGen/X86/constant-pool-remat-0.ll +++ b/test/CodeGen/X86/constant-pool-remat-0.ll @@ -1,7 +1,7 @@ ; REQUIRES: asserts ; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s ; RUN: llc < %s -mtriple=x86_64-linux -regalloc=greedy | FileCheck %s -; RUN: llc < %s -march=x86 -mattr=+sse2 | FileCheck %s +; RUN: llc < %s -mtriple=i386-linux -mattr=+sse2 | FileCheck %s ; CHECK: LCPI ; CHECK: LCPI ; CHECK: LCPI diff --git a/test/CodeGen/X86/constant-pool-sharing.ll b/test/CodeGen/X86/constant-pool-sharing.ll index 26318dd6c558..3682165e3a25 100644 --- a/test/CodeGen/X86/constant-pool-sharing.ll +++ b/test/CodeGen/X86/constant-pool-sharing.ll @@ -1,12 +1,13 @@ -; RUN: llc < %s -mtriple=x86_64-linux -mcpu=corei7 | FileCheck %s -; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=corei7 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-linux -mcpu=corei7 | FileCheck %s --check-prefix=COMMON --check-prefix=LINUX +; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=corei7 | FileCheck %s --check-prefix=COMMON --check-prefix=MSVC ; llc should share constant pool entries between this integer vector ; and this floating-point vector since they have the same encoding. 
-; CHECK: LCPI0_0(%rip), %xmm0 -; CHECK: movaps %xmm0, ({{%rdi|%rcx}}) -; CHECK: movaps %xmm0, ({{%rsi|%rdx}}) +; LINUX: LCPI0_0(%rip), %xmm0 +; MSVC: __xmm@40000000400000004000000040000000(%rip), %xmm0 +; COMMON: movaps %xmm0, ({{%rdi|%rcx}}) +; COMMON: movaps %xmm0, ({{%rsi|%rdx}}) define void @foo(<4 x i32>* %p, <4 x float>* %q, i1 %t) nounwind { entry: diff --git a/test/CodeGen/X86/cvt16.ll b/test/CodeGen/X86/cvt16.ll new file mode 100644 index 000000000000..4d920e2d23d2 --- /dev/null +++ b/test/CodeGen/X86/cvt16.ll @@ -0,0 +1,89 @@ +; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=-f16c | FileCheck %s -check-prefix=CHECK -check-prefix=LIBCALL +; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=+f16c | FileCheck %s -check-prefix=CHECK -check-prefix=F16C +; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -soft-float=1 -mattr=-f16c | FileCheck %s -check-prefix=CHECK -check-prefix=SOFTFLOAT +; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -soft-float=1 -mattr=+f16c | FileCheck %s -check-prefix=CHECK -check-prefix=SOFTFLOAT + +; This is a test for float to half float conversions on x86-64. +; +; If flag -soft-float is set, or if there is no F16C support, then: +; 1) half float to float conversions are +; translated into calls to __gnu_h2f_ieee defined +; by the compiler runtime library; +; 2) float to half float conversions are translated into calls +; to __gnu_f2h_ieee which is expected to be defined by the +; compiler runtime library. +; +; Otherwise (we have F16C support): +; 1) half float to float conversions are translated using +; vcvtph2ps instructions; +; 2) float to half float conversions are translated using +; vcvtps2ph instructions + + +define void @test1(float %src, i16* %dest) { + %1 = tail call i16 @llvm.convert.to.fp16.f32(float %src) + store i16 %1, i16* %dest, align 2 + ret void +} +; CHECK-LABEL: test1 +; LIBCALL: callq __gnu_f2h_ieee +; SOFTFLOAT: callq __gnu_f2h_ieee +; F16C: vcvtps2ph +; CHECK: ret + + +define float @test2(i16* nocapture %src) { + %1 = load i16* %src, align 2 + %2 = tail call float @llvm.convert.from.fp16.f32(i16 %1) + ret float %2 +} +; CHECK-LABEL: test2: +; LIBCALL: jmp __gnu_h2f_ieee +; SOFTFLOAT: callq __gnu_h2f_ieee +; F16C: vcvtph2ps +; F16C: ret + + +define float @test3(float %src) nounwind uwtable readnone { + %1 = tail call i16 @llvm.convert.to.fp16.f32(float %src) + %2 = tail call float @llvm.convert.from.fp16.f32(i16 %1) + ret float %2 +} + +; CHECK-LABEL: test3: +; LIBCALL: callq __gnu_f2h_ieee +; LIBCALL: jmp __gnu_h2f_ieee +; SOFTFLOAT: callq __gnu_f2h_ieee +; SOFTFLOAT: callq __gnu_h2f_ieee +; F16C: vcvtps2ph +; F16C-NEXT: vcvtph2ps +; F16C: ret + +define double @test4(i16* nocapture %src) { + %1 = load i16* %src, align 2 + %2 = tail call double @llvm.convert.from.fp16.f64(i16 %1) + ret double %2 +} +; CHECK-LABEL: test4: +; LIBCALL: callq __gnu_h2f_ieee +; LIBCALL: cvtss2sd +; SOFTFLOAT: callq __gnu_h2f_ieee +; SOFTFLOAT: callq __extendsfdf2 +; F16C: vcvtph2ps +; F16C: vcvtss2sd +; F16C: ret + + +define i16 @test5(double %src) { + %val = tail call i16 @llvm.convert.to.fp16.f64(double %src) + ret i16 %val +} +; CHECK-LABEL: test5: +; LIBCALL: jmp __truncdfhf2 +; SOFTFLOAT: callq __truncdfhf2 +; F16C: jmp __truncdfhf2 + +declare float @llvm.convert.from.fp16.f32(i16) nounwind readnone +declare i16 @llvm.convert.to.fp16.f32(float) nounwind readnone +declare double @llvm.convert.from.fp16.f64(i16) nounwind readnone
+declare i16 @llvm.convert.to.fp16.f64(double) nounwind readnone
diff --git a/test/CodeGen/X86/elf-comdat.ll b/test/CodeGen/X86/elf-comdat.ll
new file mode 100644
index 000000000000..c7e6df7d64f0
--- /dev/null
+++ b/test/CodeGen/X86/elf-comdat.ll
@@ -0,0 +1,11 @@
+; RUN: llc -mtriple x86_64-pc-linux-gnu < %s | FileCheck %s
+
+$f = comdat any
+@v = global i32 0, comdat $f
+define void @f() comdat $f {
+  ret void
+}
+; CHECK: .section .text.f,"axG",@progbits,f,comdat
+; CHECK: .globl f
+; CHECK: .section .bss.v,"aGw",@nobits,f,comdat
+; CHECK: .globl v
diff --git a/test/CodeGen/X86/elf-comdat2.ll b/test/CodeGen/X86/elf-comdat2.ll
new file mode 100644
index 000000000000..209da39ed881
--- /dev/null
+++ b/test/CodeGen/X86/elf-comdat2.ll
@@ -0,0 +1,12 @@
+; RUN: llc -mtriple x86_64-pc-linux-gnu < %s | FileCheck %s
+
+$foo = comdat any
+@bar = global i32 42, comdat $foo
+@foo = global i32 42
+
+; CHECK: .type bar,@object
+; CHECK-NEXT: .section .data.bar,"aGw",@progbits,foo,comdat
+; CHECK-NEXT: .globl bar
+; CHECK: .type foo,@object
+; CHECK-NEXT: .data
+; CHECK-NEXT: .globl foo
diff --git a/test/CodeGen/X86/fast-isel-select-cmov.ll b/test/CodeGen/X86/fast-isel-select-cmov.ll
new file mode 100644
index 000000000000..8008e283ad60
--- /dev/null
+++ b/test/CodeGen/X86/fast-isel-select-cmov.ll
@@ -0,0 +1,62 @@
+; RUN: llc < %s -fast-isel -fast-isel-abort -mtriple=x86_64-apple-darwin10 | FileCheck %s
+
+; Test conditional move for the supported types (i16, i32, and i64) and
+; condition input (argument or cmp). Currently i8 is not supported.
+
+define zeroext i16 @select_cmov_i16(i1 zeroext %cond, i16 zeroext %a, i16 zeroext %b) {
+; CHECK-LABEL: select_cmov_i16
+; CHECK: testb $1, %dil
+; CHECK-NEXT: cmovew %dx, %si
+; CHECK-NEXT: movzwl %si, %eax
+  %1 = select i1 %cond, i16 %a, i16 %b
+  ret i16 %1
+}
+
+define zeroext i16 @select_cmp_cmov_i16(i16 zeroext %a, i16 zeroext %b) {
+; CHECK-LABEL: select_cmp_cmov_i16
+; CHECK: cmpw %si, %di
+; CHECK-NEXT: cmovbw %di, %si
+; CHECK-NEXT: movzwl %si, %eax
+  %1 = icmp ult i16 %a, %b
+  %2 = select i1 %1, i16 %a, i16 %b
+  ret i16 %2
+}
+
+define i32 @select_cmov_i32(i1 zeroext %cond, i32 %a, i32 %b) {
+; CHECK-LABEL: select_cmov_i32
+; CHECK: testb $1, %dil
+; CHECK-NEXT: cmovel %edx, %esi
+; CHECK-NEXT: movl %esi, %eax
+  %1 = select i1 %cond, i32 %a, i32 %b
+  ret i32 %1
+}
+
+define i32 @select_cmp_cmov_i32(i32 %a, i32 %b) {
+; CHECK-LABEL: select_cmp_cmov_i32
+; CHECK: cmpl %esi, %edi
+; CHECK-NEXT: cmovbl %edi, %esi
+; CHECK-NEXT: movl %esi, %eax
+  %1 = icmp ult i32 %a, %b
+  %2 = select i1 %1, i32 %a, i32 %b
+  ret i32 %2
+}
+
+define i64 @select_cmov_i64(i1 zeroext %cond, i64 %a, i64 %b) {
+; CHECK-LABEL: select_cmov_i64
+; CHECK: testb $1, %dil
+; CHECK-NEXT: cmoveq %rdx, %rsi
+; CHECK-NEXT: movq %rsi, %rax
+  %1 = select i1 %cond, i64 %a, i64 %b
+  ret i64 %1
+}
+
+define i64 @select_cmp_cmov_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: select_cmp_cmov_i64
+; CHECK: cmpq %rsi, %rdi
+; CHECK-NEXT: cmovbq %rdi, %rsi
+; CHECK-NEXT: movq %rsi, %rax
+  %1 = icmp ult i64 %a, %b
+  %2 = select i1 %1, i64 %a, i64 %b
+  ret i64 %2
+}
+
diff --git a/test/CodeGen/X86/fast-isel-select-cmov2.ll b/test/CodeGen/X86/fast-isel-select-cmov2.ll
new file mode 100644
index 000000000000..658098fe7c7a
--- /dev/null
+++ b/test/CodeGen/X86/fast-isel-select-cmov2.ll
@@ -0,0 +1,255 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -fast-isel -fast-isel-abort | FileCheck %s
+
+; Test all the cmp predicates that can feed an integer
conditional move. + +define i64 @select_fcmp_false_cmov(double %a, double %b, i64 %c, i64 %d) { +; CHECK-LABEL: select_fcmp_false_cmov +; CHECK: movq %rsi, %rax +; CHECK-NEXT: retq + %1 = fcmp false double %a, %b + %2 = select i1 %1, i64 %c, i64 %d + ret i64 %2 +} + +define i64 @select_fcmp_oeq_cmov(double %a, double %b, i64 %c, i64 %d) { +; CHECK-LABEL: select_fcmp_oeq_cmov +; CHECK: ucomisd %xmm1, %xmm0 +; CHECK-NEXT: setnp %al +; CHECK-NEXT: sete %cl +; CHECK-NEXT: testb %al, %cl +; CHECK-NEXT: cmoveq %rsi, %rdi + %1 = fcmp oeq double %a, %b + %2 = select i1 %1, i64 %c, i64 %d + ret i64 %2 +} + +define i64 @select_fcmp_ogt_cmov(double %a, double %b, i64 %c, i64 %d) { +; CHECK-LABEL: select_fcmp_ogt_cmov +; CHECK: ucomisd %xmm1, %xmm0 +; CHECK-NEXT: cmovbeq %rsi, %rdi + %1 = fcmp ogt double %a, %b + %2 = select i1 %1, i64 %c, i64 %d + ret i64 %2 +} + +define i64 @select_fcmp_oge_cmov(double %a, double %b, i64 %c, i64 %d) { +; CHECK-LABEL: select_fcmp_oge_cmov +; CHECK: ucomisd %xmm1, %xmm0 +; CHECK-NEXT: cmovbq %rsi, %rdi + %1 = fcmp oge double %a, %b + %2 = select i1 %1, i64 %c, i64 %d + ret i64 %2 +} + +define i64 @select_fcmp_olt_cmov(double %a, double %b, i64 %c, i64 %d) { +; CHECK-LABEL: select_fcmp_olt_cmov +; CHECK: ucomisd %xmm0, %xmm1 +; CHECK-NEXT: cmovbeq %rsi, %rdi + %1 = fcmp olt double %a, %b + %2 = select i1 %1, i64 %c, i64 %d + ret i64 %2 +} + +define i64 @select_fcmp_ole_cmov(double %a, double %b, i64 %c, i64 %d) { +; CHECK-LABEL: select_fcmp_ole_cmov +; CHECK: ucomisd %xmm0, %xmm1 +; CHECK-NEXT: cmovbq %rsi, %rdi + %1 = fcmp ole double %a, %b + %2 = select i1 %1, i64 %c, i64 %d + ret i64 %2 +} + +define i64 @select_fcmp_one_cmov(double %a, double %b, i64 %c, i64 %d) { +; CHECK-LABEL: select_fcmp_one_cmov +; CHECK: ucomisd %xmm1, %xmm0 +; CHECK-NEXT: cmoveq %rsi, %rdi + %1 = fcmp one double %a, %b + %2 = select i1 %1, i64 %c, i64 %d + ret i64 %2 +} + +define i64 @select_fcmp_ord_cmov(double %a, double %b, i64 %c, i64 %d) { +; CHECK-LABEL: select_fcmp_ord_cmov +; CHECK: ucomisd %xmm1, %xmm0 +; CHECK-NEXT: cmovpq %rsi, %rdi + %1 = fcmp ord double %a, %b + %2 = select i1 %1, i64 %c, i64 %d + ret i64 %2 +} + +define i64 @select_fcmp_uno_cmov(double %a, double %b, i64 %c, i64 %d) { +; CHECK-LABEL: select_fcmp_uno_cmov +; CHECK: ucomisd %xmm1, %xmm0 +; CHECK-NEXT: cmovnpq %rsi, %rdi + %1 = fcmp uno double %a, %b + %2 = select i1 %1, i64 %c, i64 %d + ret i64 %2 +} + +define i64 @select_fcmp_ueq_cmov(double %a, double %b, i64 %c, i64 %d) { +; CHECK-LABEL: select_fcmp_ueq_cmov +; CHECK: ucomisd %xmm1, %xmm0 +; CHECK-NEXT: cmovneq %rsi, %rdi + %1 = fcmp ueq double %a, %b + %2 = select i1 %1, i64 %c, i64 %d + ret i64 %2 +} + +define i64 @select_fcmp_ugt_cmov(double %a, double %b, i64 %c, i64 %d) { +; CHECK-LABEL: select_fcmp_ugt_cmov +; CHECK: ucomisd %xmm0, %xmm1 +; CHECK-NEXT: cmovaeq %rsi, %rdi + %1 = fcmp ugt double %a, %b + %2 = select i1 %1, i64 %c, i64 %d + ret i64 %2 +} + +define i64 @select_fcmp_uge_cmov(double %a, double %b, i64 %c, i64 %d) { +; CHECK-LABEL: select_fcmp_uge_cmov +; CHECK: ucomisd %xmm0, %xmm1 +; CHECK-NEXT: cmovaq %rsi, %rdi + %1 = fcmp uge double %a, %b + %2 = select i1 %1, i64 %c, i64 %d + ret i64 %2 +} + +define i64 @select_fcmp_ult_cmov(double %a, double %b, i64 %c, i64 %d) { +; CHECK-LABEL: select_fcmp_ult_cmov +; CHECK: ucomisd %xmm1, %xmm0 +; CHECK-NEXT: cmovaeq %rsi, %rdi + %1 = fcmp ult double %a, %b + %2 = select i1 %1, i64 %c, i64 %d + ret i64 %2 +} + +define i64 @select_fcmp_ule_cmov(double %a, double %b, i64 %c, i64 %d) { +; CHECK-LABEL: 
select_fcmp_ule_cmov +; CHECK: ucomisd %xmm1, %xmm0 +; CHECK-NEXT: cmovaq %rsi, %rdi + %1 = fcmp ule double %a, %b + %2 = select i1 %1, i64 %c, i64 %d + ret i64 %2 +} + +define i64 @select_fcmp_une_cmov(double %a, double %b, i64 %c, i64 %d) { +; CHECK-LABEL: select_fcmp_une_cmov +; CHECK: ucomisd %xmm1, %xmm0 +; CHECK-NEXT: setp %al +; CHECK-NEXT: setne %cl +; CHECK-NEXT: orb %al, %cl +; CHECK-NEXT: cmoveq %rsi, %rdi + %1 = fcmp une double %a, %b + %2 = select i1 %1, i64 %c, i64 %d + ret i64 %2 +} + +define i64 @select_fcmp_true_cmov(double %a, double %b, i64 %c, i64 %d) { +; CHECK-LABEL: select_fcmp_true_cmov +; CHECK: movq %rdi, %rax + %1 = fcmp true double %a, %b + %2 = select i1 %1, i64 %c, i64 %d + ret i64 %2 +} + +define i64 @select_icmp_eq_cmov(i64 %a, i64 %b, i64 %c, i64 %d) { +; CHECK-LABEL: select_icmp_eq_cmov +; CHECK: cmpq %rsi, %rdi +; CHECK-NEXT: cmovneq %rcx, %rdx +; CHECK-NEXT: movq %rdx, %rax + %1 = icmp eq i64 %a, %b + %2 = select i1 %1, i64 %c, i64 %d + ret i64 %2 +} + +define i64 @select_icmp_ne_cmov(i64 %a, i64 %b, i64 %c, i64 %d) { +; CHECK-LABEL: select_icmp_ne_cmov +; CHECK: cmpq %rsi, %rdi +; CHECK-NEXT: cmoveq %rcx, %rdx +; CHECK-NEXT: movq %rdx, %rax + %1 = icmp ne i64 %a, %b + %2 = select i1 %1, i64 %c, i64 %d + ret i64 %2 +} + +define i64 @select_icmp_ugt_cmov(i64 %a, i64 %b, i64 %c, i64 %d) { +; CHECK-LABEL: select_icmp_ugt_cmov +; CHECK: cmpq %rsi, %rdi +; CHECK-NEXT: cmovbeq %rcx, %rdx +; CHECK-NEXT: movq %rdx, %rax + %1 = icmp ugt i64 %a, %b + %2 = select i1 %1, i64 %c, i64 %d + ret i64 %2 +} + + +define i64 @select_icmp_uge_cmov(i64 %a, i64 %b, i64 %c, i64 %d) { +; CHECK-LABEL: select_icmp_uge_cmov +; CHECK: cmpq %rsi, %rdi +; CHECK-NEXT: cmovbq %rcx, %rdx +; CHECK-NEXT: movq %rdx, %rax + %1 = icmp uge i64 %a, %b + %2 = select i1 %1, i64 %c, i64 %d + ret i64 %2 +} + +define i64 @select_icmp_ult_cmov(i64 %a, i64 %b, i64 %c, i64 %d) { +; CHECK-LABEL: select_icmp_ult_cmov +; CHECK: cmpq %rsi, %rdi +; CHECK-NEXT: cmovaeq %rcx, %rdx +; CHECK-NEXT: movq %rdx, %rax + %1 = icmp ult i64 %a, %b + %2 = select i1 %1, i64 %c, i64 %d + ret i64 %2 +} + +define i64 @select_icmp_ule_cmov(i64 %a, i64 %b, i64 %c, i64 %d) { +; CHECK-LABEL: select_icmp_ule_cmov +; CHECK: cmpq %rsi, %rdi +; CHECK-NEXT: cmovaq %rcx, %rdx +; CHECK-NEXT: movq %rdx, %rax + %1 = icmp ule i64 %a, %b + %2 = select i1 %1, i64 %c, i64 %d + ret i64 %2 +} + +define i64 @select_icmp_sgt_cmov(i64 %a, i64 %b, i64 %c, i64 %d) { +; CHECK-LABEL: select_icmp_sgt_cmov +; CHECK: cmpq %rsi, %rdi +; CHECK-NEXT: cmovleq %rcx, %rdx +; CHECK-NEXT: movq %rdx, %rax + %1 = icmp sgt i64 %a, %b + %2 = select i1 %1, i64 %c, i64 %d + ret i64 %2 +} + +define i64 @select_icmp_sge_cmov(i64 %a, i64 %b, i64 %c, i64 %d) { +; CHECK-LABEL: select_icmp_sge_cmov +; CHECK: cmpq %rsi, %rdi +; CHECK-NEXT: cmovlq %rcx, %rdx +; CHECK-NEXT: movq %rdx, %rax + %1 = icmp sge i64 %a, %b + %2 = select i1 %1, i64 %c, i64 %d + ret i64 %2 +} + +define i64 @select_icmp_slt_cmov(i64 %a, i64 %b, i64 %c, i64 %d) { +; CHECK-LABEL: select_icmp_slt_cmov +; CHECK: cmpq %rsi, %rdi +; CHECK-NEXT: cmovgeq %rcx, %rdx +; CHECK-NEXT: movq %rdx, %rax + %1 = icmp slt i64 %a, %b + %2 = select i1 %1, i64 %c, i64 %d + ret i64 %2 +} + +define i64 @select_icmp_sle_cmov(i64 %a, i64 %b, i64 %c, i64 %d) { +; CHECK-LABEL: select_icmp_sle_cmov +; CHECK: cmpq %rsi, %rdi +; CHECK-NEXT: cmovgq %rcx, %rdx +; CHECK-NEXT: movq %rdx, %rax + %1 = icmp sle i64 %a, %b + %2 = select i1 %1, i64 %c, i64 %d + ret i64 %2 +} + diff --git a/test/CodeGen/X86/fast-isel-select-cmp.ll 
b/test/CodeGen/X86/fast-isel-select-cmp.ll new file mode 100644 index 000000000000..1af30e9f32fe --- /dev/null +++ b/test/CodeGen/X86/fast-isel-select-cmp.ll @@ -0,0 +1,50 @@ +; RUN: llc < %s -O0 -mtriple=x86_64-apple-darwin10 | FileCheck %s + +; Test if we do not fold the cmp into select if the instructions are in +; different basic blocks. + +define i32 @select_cmp_cmov_i32(i32 %a, i32 %b) { +; CHECK-LABEL: select_cmp_cmov_i32 +; CHECK-LABEL: continue +; CHECK-NOT: cmp + %1 = icmp ult i32 %a, %b + br i1 %1, label %continue, label %exit + +continue: + %2 = select i1 %1, i32 %a, i32 %b + ret i32 %2 + +exit: + ret i32 -1 +} + +define float @select_fcmp_oeq_f32(float %a, float %b, float %c, float %d) { +; CHECK-LABEL: select_fcmp_oeq_f32 +; CHECK-LABEL: continue +; CHECK-NOT: cmp + %1 = fcmp oeq float %a, %b + br i1 %1, label %continue, label %exit + +continue: + %2 = select i1 %1, float %c, float %d + ret float %2 + +exit: + ret float -1.0 +} + +define float @select_fcmp_one_f32(float %a, float %b, float %c, float %d) { +; CHECK-LABEL: select_fcmp_one_f32 +; CHECK-LABEL: continue +; CHECK-NOT: ucomi + %1 = fcmp one float %a, %b + br i1 %1, label %continue, label %exit + +continue: + %2 = select i1 %1, float %c, float %d + ret float %2 + +exit: + ret float -1.0 +} + diff --git a/test/CodeGen/X86/fast-isel-select-pseudo-cmov.ll b/test/CodeGen/X86/fast-isel-select-pseudo-cmov.ll new file mode 100644 index 000000000000..1ec4d64fe209 --- /dev/null +++ b/test/CodeGen/X86/fast-isel-select-pseudo-cmov.ll @@ -0,0 +1,138 @@ +; RUN: llc < %s -mtriple=x86_64-apple-darwin10 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -fast-isel -fast-isel-abort | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -mcpu=corei7-avx | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -fast-isel -fast-isel-abort -mcpu=corei7-avx | FileCheck %s + + +define float @select_fcmp_one_f32(float %a, float %b, float %c, float %d) { +; CHECK-LABEL: select_fcmp_one_f32 +; CHECK: ucomiss %xmm1, %xmm0 +; CHECK-NEXT: jne [[BB:LBB[0-9]+_2]] +; CHECK: [[BB]] +; CHECK-NEXT: movaps %xmm2, %xmm0 + %1 = fcmp one float %a, %b + %2 = select i1 %1, float %c, float %d + ret float %2 +} + +define double @select_fcmp_one_f64(double %a, double %b, double %c, double %d) { +; CHECK-LABEL: select_fcmp_one_f64 +; CHECK: ucomisd %xmm1, %xmm0 +; CHECK-NEXT: jne [[BB:LBB[0-9]+_2]] +; CHECK: [[BB]] +; CHECK-NEXT: movaps %xmm2, %xmm0 + %1 = fcmp one double %a, %b + %2 = select i1 %1, double %c, double %d + ret double %2 +} + +define float @select_icmp_eq_f32(i64 %a, i64 %b, float %c, float %d) { +; CHECK-LABEL: select_icmp_eq_f32 +; CHECK: cmpq %rsi, %rdi +; CHECK-NEXT: je [[BB:LBB[0-9]+_2]] +; CHECK: [[BB]] +; CHECK-NEXT: retq + %1 = icmp eq i64 %a, %b + %2 = select i1 %1, float %c, float %d + ret float %2 +} + +define float @select_icmp_ne_f32(i64 %a, i64 %b, float %c, float %d) { +; CHECK-LABEL: select_icmp_ne_f32 +; CHECK: cmpq %rsi, %rdi +; CHECK-NEXT: jne [[BB:LBB[0-9]+_2]] +; CHECK: [[BB]] +; CHECK-NEXT: retq + %1 = icmp ne i64 %a, %b + %2 = select i1 %1, float %c, float %d + ret float %2 +} + +define float @select_icmp_ugt_f32(i64 %a, i64 %b, float %c, float %d) { +; CHECK-LABEL: select_icmp_ugt_f32 +; CHECK: cmpq %rsi, %rdi +; CHECK-NEXT: ja [[BB:LBB[0-9]+_2]] +; CHECK: [[BB]] +; CHECK-NEXT: retq + %1 = icmp ugt i64 %a, %b + %2 = select i1 %1, float %c, float %d + ret float %2 +} + +define float @select_icmp_uge_f32(i64 %a, i64 %b, float %c, float %d) { +; CHECK-LABEL: select_icmp_uge_f32 +; CHECK: cmpq %rsi, 
%rdi +; CHECK-NEXT: jae [[BB:LBB[0-9]+_2]] +; CHECK: [[BB]] +; CHECK-NEXT: retq + %1 = icmp uge i64 %a, %b + %2 = select i1 %1, float %c, float %d + ret float %2 +} + +define float @select_icmp_ult_f32(i64 %a, i64 %b, float %c, float %d) { +; CHECK-LABEL: select_icmp_ult_f32 +; CHECK: cmpq %rsi, %rdi +; CHECK-NEXT: jb [[BB:LBB[0-9]+_2]] +; CHECK: [[BB]] +; CHECK-NEXT: retq + %1 = icmp ult i64 %a, %b + %2 = select i1 %1, float %c, float %d + ret float %2 +} + +define float @select_icmp_ule_f32(i64 %a, i64 %b, float %c, float %d) { +; CHECK-LABEL: select_icmp_ule_f32 +; CHECK: cmpq %rsi, %rdi +; CHECK-NEXT: jbe [[BB:LBB[0-9]+_2]] +; CHECK: [[BB]] +; CHECK-NEXT: retq + %1 = icmp ule i64 %a, %b + %2 = select i1 %1, float %c, float %d + ret float %2 +} + +define float @select_icmp_sgt_f32(i64 %a, i64 %b, float %c, float %d) { +; CHECK-LABEL: select_icmp_sgt_f32 +; CHECK: cmpq %rsi, %rdi +; CHECK-NEXT: jg [[BB:LBB[0-9]+_2]] +; CHECK: [[BB]] +; CHECK-NEXT: retq + %1 = icmp sgt i64 %a, %b + %2 = select i1 %1, float %c, float %d + ret float %2 +} + +define float @select_icmp_sge_f32(i64 %a, i64 %b, float %c, float %d) { +; CHECK-LABEL: select_icmp_sge_f32 +; CHECK: cmpq %rsi, %rdi +; CHECK-NEXT: jge [[BB:LBB[0-9]+_2]] +; CHECK: [[BB]] +; CHECK-NEXT: retq + %1 = icmp sge i64 %a, %b + %2 = select i1 %1, float %c, float %d + ret float %2 +} + +define float @select_icmp_slt_f32(i64 %a, i64 %b, float %c, float %d) { +; CHECK-LABEL: select_icmp_slt_f32 +; CHECK: cmpq %rsi, %rdi +; CHECK-NEXT: jl [[BB:LBB[0-9]+_2]] +; CHECK: [[BB]] +; CHECK-NEXT: retq + %1 = icmp slt i64 %a, %b + %2 = select i1 %1, float %c, float %d + ret float %2 +} + +define float @select_icmp_sle_f32(i64 %a, i64 %b, float %c, float %d) { +; CHECK-LABEL: select_icmp_sle_f32 +; CHECK: cmpq %rsi, %rdi +; CHECK-NEXT: jle [[BB:LBB[0-9]+_2]] +; CHECK: [[BB]] +; CHECK-NEXT: retq + %1 = icmp sle i64 %a, %b + %2 = select i1 %1, float %c, float %d + ret float %2 +} + diff --git a/test/CodeGen/X86/fast-isel-select-sse.ll b/test/CodeGen/X86/fast-isel-select-sse.ll new file mode 100644 index 000000000000..3c03a0312f5e --- /dev/null +++ b/test/CodeGen/X86/fast-isel-select-sse.ll @@ -0,0 +1,391 @@ +; RUN: llc < %s -mtriple=x86_64-apple-darwin10 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -fast-isel -fast-isel-abort | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -mcpu=corei7-avx | FileCheck %s --check-prefix=AVX +; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -fast-isel -fast-isel-abort -mcpu=corei7-avx | FileCheck %s --check-prefix=AVX + +; Test all cmp predicates that can be used with SSE. 
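Since SSE has no conditional move between XMM registers, every test that follows checks the same blend idiom: cmpss/cmpsd materializes an all-ones or all-zeros mask, and the and/andn/or triple selects between the two operands. A minimal integer sketch of that identity (illustrative only, not from the patch):

define i32 @select_via_mask(i32 %mask, i32 %c, i32 %d) {
  ; %mask is assumed to be 0 or -1, as a cmpss/cmpsd lane result would be.
  %t = and i32 %mask, %c       ; take %c where the mask is all-ones
  %notmask = xor i32 %mask, -1
  %f = and i32 %notmask, %d    ; take %d where the mask is all-zeros
  %r = or i32 %t, %f           ; (mask & c) | (~mask & d)
  ret i32 %r
}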
+ +define float @select_fcmp_oeq_f32(float %a, float %b, float %c, float %d) { +; CHECK-LABEL: select_fcmp_oeq_f32 +; CHECK: cmpeqss %xmm1, %xmm0 +; CHECK-NEXT: andps %xmm0, %xmm2 +; CHECK-NEXT: andnps %xmm3, %xmm0 +; CHECK-NEXT: orps %xmm2, %xmm0 +; AVX-LABEL: select_fcmp_oeq_f32 +; AVX: vcmpeqss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vandps %xmm2, %xmm0, %xmm1 +; AVX-NEXT: vandnps %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 + %1 = fcmp oeq float %a, %b + %2 = select i1 %1, float %c, float %d + ret float %2 +} + +define double @select_fcmp_oeq_f64(double %a, double %b, double %c, double %d) { +; CHECK-LABEL: select_fcmp_oeq_f64 +; CHECK: cmpeqsd %xmm1, %xmm0 +; CHECK-NEXT: andpd %xmm0, %xmm2 +; CHECK-NEXT: andnpd %xmm3, %xmm0 +; CHECK-NEXT: orpd %xmm2, %xmm0 +; AVX-LABEL: select_fcmp_oeq_f64 +; AVX: vcmpeqsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vandpd %xmm2, %xmm0, %xmm1 +; AVX-NEXT: vandnpd %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vorpd %xmm1, %xmm0, %xmm0 + %1 = fcmp oeq double %a, %b + %2 = select i1 %1, double %c, double %d + ret double %2 +} + +define float @select_fcmp_ogt_f32(float %a, float %b, float %c, float %d) { +; CHECK-LABEL: select_fcmp_ogt_f32 +; CHECK: cmpltss %xmm0, %xmm1 +; CHECK-NEXT: andps %xmm1, %xmm2 +; CHECK-NEXT: andnps %xmm3, %xmm1 +; CHECK-NEXT: orps %xmm2, %xmm1 +; AVX-LABEL: select_fcmp_ogt_f32 +; AVX: vcmpltss %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vandps %xmm2, %xmm0, %xmm1 +; AVX-NEXT: vandnps %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 + %1 = fcmp ogt float %a, %b + %2 = select i1 %1, float %c, float %d + ret float %2 +} + +define double @select_fcmp_ogt_f64(double %a, double %b, double %c, double %d) { +; CHECK-LABEL: select_fcmp_ogt_f64 +; CHECK: cmpltsd %xmm0, %xmm1 +; CHECK-NEXT: andpd %xmm1, %xmm2 +; CHECK-NEXT: andnpd %xmm3, %xmm1 +; CHECK-NEXT: orpd %xmm2, %xmm1 +; AVX-LABEL: select_fcmp_ogt_f64 +; AVX: vcmpltsd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vandpd %xmm2, %xmm0, %xmm1 +; AVX-NEXT: vandnpd %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vorpd %xmm1, %xmm0, %xmm0 + %1 = fcmp ogt double %a, %b + %2 = select i1 %1, double %c, double %d + ret double %2 +} + +define float @select_fcmp_oge_f32(float %a, float %b, float %c, float %d) { +; CHECK-LABEL: select_fcmp_oge_f32 +; CHECK: cmpless %xmm0, %xmm1 +; CHECK-NEXT: andps %xmm1, %xmm2 +; CHECK-NEXT: andnps %xmm3, %xmm1 +; CHECK-NEXT: orps %xmm2, %xmm1 +; AVX-LABEL: select_fcmp_oge_f32 +; AVX: vcmpless %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vandps %xmm2, %xmm0, %xmm1 +; AVX-NEXT: vandnps %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 + %1 = fcmp oge float %a, %b + %2 = select i1 %1, float %c, float %d + ret float %2 +} + +define double @select_fcmp_oge_f64(double %a, double %b, double %c, double %d) { +; CHECK-LABEL: select_fcmp_oge_f64 +; CHECK: cmplesd %xmm0, %xmm1 +; CHECK-NEXT: andpd %xmm1, %xmm2 +; CHECK-NEXT: andnpd %xmm3, %xmm1 +; CHECK-NEXT: orpd %xmm2, %xmm1 +; AVX-LABEL: select_fcmp_oge_f64 +; AVX: vcmplesd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vandpd %xmm2, %xmm0, %xmm1 +; AVX-NEXT: vandnpd %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vorpd %xmm1, %xmm0, %xmm0 + %1 = fcmp oge double %a, %b + %2 = select i1 %1, double %c, double %d + ret double %2 +} + +define float @select_fcmp_olt_f32(float %a, float %b, float %c, float %d) { +; CHECK-LABEL: select_fcmp_olt_f32 +; CHECK: cmpltss %xmm1, %xmm0 +; CHECK-NEXT: andps %xmm0, %xmm2 +; CHECK-NEXT: andnps %xmm3, %xmm0 +; CHECK-NEXT: orps %xmm2, %xmm0 +; AVX-LABEL: select_fcmp_olt_f32 +; AVX: vcmpltss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vandps %xmm2, %xmm0, %xmm1 +; AVX-NEXT: 
vandnps %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 + %1 = fcmp olt float %a, %b + %2 = select i1 %1, float %c, float %d + ret float %2 +} + +define double @select_fcmp_olt_f64(double %a, double %b, double %c, double %d) { +; CHECK-LABEL: select_fcmp_olt_f64 +; CHECK: cmpltsd %xmm1, %xmm0 +; CHECK-NEXT: andpd %xmm0, %xmm2 +; CHECK-NEXT: andnpd %xmm3, %xmm0 +; CHECK-NEXT: orpd %xmm2, %xmm0 +; AVX-LABEL: select_fcmp_olt_f64 +; AVX: vcmpltsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vandpd %xmm2, %xmm0, %xmm1 +; AVX-NEXT: vandnpd %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vorpd %xmm1, %xmm0, %xmm0 + %1 = fcmp olt double %a, %b + %2 = select i1 %1, double %c, double %d + ret double %2 +} + +define float @select_fcmp_ole_f32(float %a, float %b, float %c, float %d) { +; CHECK-LABEL: select_fcmp_ole_f32 +; CHECK: cmpless %xmm1, %xmm0 +; CHECK-NEXT: andps %xmm0, %xmm2 +; CHECK-NEXT: andnps %xmm3, %xmm0 +; CHECK-NEXT: orps %xmm2, %xmm0 +; AVX-LABEL: select_fcmp_ole_f32 +; AVX: vcmpless %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vandps %xmm2, %xmm0, %xmm1 +; AVX-NEXT: vandnps %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 + %1 = fcmp ole float %a, %b + %2 = select i1 %1, float %c, float %d + ret float %2 +} + +define double @select_fcmp_ole_f64(double %a, double %b, double %c, double %d) { +; CHECK-LABEL: select_fcmp_ole_f64 +; CHECK: cmplesd %xmm1, %xmm0 +; CHECK-NEXT: andpd %xmm0, %xmm2 +; CHECK-NEXT: andnpd %xmm3, %xmm0 +; CHECK-NEXT: orpd %xmm2, %xmm0 +; AVX-LABEL: select_fcmp_ole_f64 +; AVX: vcmplesd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vandpd %xmm2, %xmm0, %xmm1 +; AVX-NEXT: vandnpd %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vorpd %xmm1, %xmm0, %xmm0 + %1 = fcmp ole double %a, %b + %2 = select i1 %1, double %c, double %d + ret double %2 +} + +define float @select_fcmp_ord_f32(float %a, float %b, float %c, float %d) { +; CHECK-LABEL: select_fcmp_ord_f32 +; CHECK: cmpordss %xmm1, %xmm0 +; CHECK-NEXT: andps %xmm0, %xmm2 +; CHECK-NEXT: andnps %xmm3, %xmm0 +; CHECK-NEXT: orps %xmm2, %xmm0 +; AVX-LABEL: select_fcmp_ord_f32 +; AVX: vcmpordss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vandps %xmm2, %xmm0, %xmm1 +; AVX-NEXT: vandnps %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 + %1 = fcmp ord float %a, %b + %2 = select i1 %1, float %c, float %d + ret float %2 +} + +define double @select_fcmp_ord_f64(double %a, double %b, double %c, double %d) { +; CHECK-LABEL: select_fcmp_ord_f64 +; CHECK: cmpordsd %xmm1, %xmm0 +; CHECK-NEXT: andpd %xmm0, %xmm2 +; CHECK-NEXT: andnpd %xmm3, %xmm0 +; CHECK-NEXT: orpd %xmm2, %xmm0 +; AVX-LABEL: select_fcmp_ord_f64 +; AVX: vcmpordsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vandpd %xmm2, %xmm0, %xmm1 +; AVX-NEXT: vandnpd %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vorpd %xmm1, %xmm0, %xmm0 + %1 = fcmp ord double %a, %b + %2 = select i1 %1, double %c, double %d + ret double %2 +} + +define float @select_fcmp_uno_f32(float %a, float %b, float %c, float %d) { +; CHECK-LABEL: select_fcmp_uno_f32 +; CHECK: cmpunordss %xmm1, %xmm0 +; CHECK-NEXT: andps %xmm0, %xmm2 +; CHECK-NEXT: andnps %xmm3, %xmm0 +; CHECK-NEXT: orps %xmm2, %xmm0 +; AVX-LABEL: select_fcmp_uno_f32 +; AVX: vcmpunordss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vandps %xmm2, %xmm0, %xmm1 +; AVX-NEXT: vandnps %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 + %1 = fcmp uno float %a, %b + %2 = select i1 %1, float %c, float %d + ret float %2 +} + +define double @select_fcmp_uno_f64(double %a, double %b, double %c, double %d) { +; CHECK-LABEL: select_fcmp_uno_f64 +; CHECK: cmpunordsd %xmm1, %xmm0 +; CHECK-NEXT: andpd %xmm0, %xmm2 +; CHECK-NEXT: 
andnpd %xmm3, %xmm0 +; CHECK-NEXT: orpd %xmm2, %xmm0 +; AVX-LABEL: select_fcmp_uno_f64 +; AVX: vcmpunordsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vandpd %xmm2, %xmm0, %xmm1 +; AVX-NEXT: vandnpd %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vorpd %xmm1, %xmm0, %xmm0 + %1 = fcmp uno double %a, %b + %2 = select i1 %1, double %c, double %d + ret double %2 +} + +define float @select_fcmp_ugt_f32(float %a, float %b, float %c, float %d) { +; CHECK-LABEL: select_fcmp_ugt_f32 +; CHECK: cmpnless %xmm1, %xmm0 +; CHECK-NEXT: andps %xmm0, %xmm2 +; CHECK-NEXT: andnps %xmm3, %xmm0 +; CHECK-NEXT: orps %xmm2, %xmm0 +; AVX-LABEL: select_fcmp_ugt_f32 +; AVX: vcmpnless %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vandps %xmm2, %xmm0, %xmm1 +; AVX-NEXT: vandnps %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 + %1 = fcmp ugt float %a, %b + %2 = select i1 %1, float %c, float %d + ret float %2 +} + +define double @select_fcmp_ugt_f64(double %a, double %b, double %c, double %d) { +; CHECK-LABEL: select_fcmp_ugt_f64 +; CHECK: cmpnlesd %xmm1, %xmm0 +; CHECK-NEXT: andpd %xmm0, %xmm2 +; CHECK-NEXT: andnpd %xmm3, %xmm0 +; CHECK-NEXT: orpd %xmm2, %xmm0 +; AVX-LABEL: select_fcmp_ugt_f64 +; AVX: vcmpnlesd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vandpd %xmm2, %xmm0, %xmm1 +; AVX-NEXT: vandnpd %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vorpd %xmm1, %xmm0, %xmm0 + %1 = fcmp ugt double %a, %b + %2 = select i1 %1, double %c, double %d + ret double %2 +} + +define float @select_fcmp_uge_f32(float %a, float %b, float %c, float %d) { +; CHECK-LABEL: select_fcmp_uge_f32 +; CHECK: cmpnltss %xmm1, %xmm0 +; CHECK-NEXT: andps %xmm0, %xmm2 +; CHECK-NEXT: andnps %xmm3, %xmm0 +; CHECK-NEXT: orps %xmm2, %xmm0 +; AVX-LABEL: select_fcmp_uge_f32 +; AVX: vcmpnltss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vandps %xmm2, %xmm0, %xmm1 +; AVX-NEXT: vandnps %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 + %1 = fcmp uge float %a, %b + %2 = select i1 %1, float %c, float %d + ret float %2 +} + +define double @select_fcmp_uge_f64(double %a, double %b, double %c, double %d) { +; CHECK-LABEL: select_fcmp_uge_f64 +; CHECK: cmpnltsd %xmm1, %xmm0 +; CHECK-NEXT: andpd %xmm0, %xmm2 +; CHECK-NEXT: andnpd %xmm3, %xmm0 +; CHECK-NEXT: orpd %xmm2, %xmm0 +; AVX-LABEL: select_fcmp_uge_f64 +; AVX: vcmpnltsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vandpd %xmm2, %xmm0, %xmm1 +; AVX-NEXT: vandnpd %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vorpd %xmm1, %xmm0, %xmm0 + %1 = fcmp uge double %a, %b + %2 = select i1 %1, double %c, double %d + ret double %2 +} + +define float @select_fcmp_ult_f32(float %a, float %b, float %c, float %d) { +; CHECK-LABEL: select_fcmp_ult_f32 +; CHECK: cmpnless %xmm0, %xmm1 +; CHECK-NEXT: andps %xmm1, %xmm2 +; CHECK-NEXT: andnps %xmm3, %xmm1 +; CHECK-NEXT: orps %xmm2, %xmm1 +; AVX-LABEL: select_fcmp_ult_f32 +; AVX: vcmpnless %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vandps %xmm2, %xmm0, %xmm1 +; AVX-NEXT: vandnps %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 + %1 = fcmp ult float %a, %b + %2 = select i1 %1, float %c, float %d + ret float %2 +} + +define double @select_fcmp_ult_f64(double %a, double %b, double %c, double %d) { +; CHECK-LABEL: select_fcmp_ult_f64 +; CHECK: cmpnlesd %xmm0, %xmm1 +; CHECK-NEXT: andpd %xmm1, %xmm2 +; CHECK-NEXT: andnpd %xmm3, %xmm1 +; CHECK-NEXT: orpd %xmm2, %xmm1 +; AVX-LABEL: select_fcmp_ult_f64 +; AVX: vcmpnlesd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vandpd %xmm2, %xmm0, %xmm1 +; AVX-NEXT: vandnpd %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vorpd %xmm1, %xmm0, %xmm0 + %1 = fcmp ult double %a, %b + %2 = select i1 %1, double %c, double %d + ret double %2 +} + +define float 
@select_fcmp_ule_f32(float %a, float %b, float %c, float %d) { +; CHECK-LABEL: select_fcmp_ule_f32 +; CHECK: cmpnltss %xmm0, %xmm1 +; CHECK-NEXT: andps %xmm1, %xmm2 +; CHECK-NEXT: andnps %xmm3, %xmm1 +; CHECK-NEXT: orps %xmm2, %xmm1 +; AVX-LABEL: select_fcmp_ule_f32 +; AVX: vcmpnltss %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vandps %xmm2, %xmm0, %xmm1 +; AVX-NEXT: vandnps %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 + %1 = fcmp ule float %a, %b + %2 = select i1 %1, float %c, float %d + ret float %2 +} + +define double @select_fcmp_ule_f64(double %a, double %b, double %c, double %d) { +; CHECK-LABEL: select_fcmp_ule_f64 +; CHECK: cmpnltsd %xmm0, %xmm1 +; CHECK-NEXT: andpd %xmm1, %xmm2 +; CHECK-NEXT: andnpd %xmm3, %xmm1 +; CHECK-NEXT: orpd %xmm2, %xmm1 +; AVX-LABEL: select_fcmp_ule_f64 +; AVX: vcmpnltsd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vandpd %xmm2, %xmm0, %xmm1 +; AVX-NEXT: vandnpd %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vorpd %xmm1, %xmm0, %xmm0 + %1 = fcmp ule double %a, %b + %2 = select i1 %1, double %c, double %d + ret double %2 +} + +define float @select_fcmp_une_f32(float %a, float %b, float %c, float %d) { +; CHECK-LABEL: select_fcmp_une_f32 +; CHECK: cmpneqss %xmm1, %xmm0 +; CHECK-NEXT: andps %xmm0, %xmm2 +; CHECK-NEXT: andnps %xmm3, %xmm0 +; CHECK-NEXT: orps %xmm2, %xmm0 +; AVX-LABEL: select_fcmp_une_f32 +; AVX: vcmpneqss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vandps %xmm2, %xmm0, %xmm1 +; AVX-NEXT: vandnps %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 + %1 = fcmp une float %a, %b + %2 = select i1 %1, float %c, float %d + ret float %2 +} + +define double @select_fcmp_une_f64(double %a, double %b, double %c, double %d) { +; CHECK-LABEL: select_fcmp_une_f64 +; CHECK: cmpneqsd %xmm1, %xmm0 +; CHECK-NEXT: andpd %xmm0, %xmm2 +; CHECK-NEXT: andnpd %xmm3, %xmm0 +; CHECK-NEXT: orpd %xmm2, %xmm0 +; AVX-LABEL: select_fcmp_une_f64 +; AVX: vcmpneqsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vandpd %xmm2, %xmm0, %xmm1 +; AVX-NEXT: vandnpd %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vorpd %xmm1, %xmm0, %xmm0 + %1 = fcmp une double %a, %b + %2 = select i1 %1, double %c, double %d + ret double %2 +} + diff --git a/test/CodeGen/X86/fast-isel-select.ll b/test/CodeGen/X86/fast-isel-select.ll index 53158bc5396d..7b3c99f13cca 100644 --- a/test/CodeGen/X86/fast-isel-select.ll +++ b/test/CodeGen/X86/fast-isel-select.ll @@ -4,10 +4,10 @@ ; lsb is zero. ; -; CHECK-LABEL: fastisel_select: +; CHECK-LABEL: fastisel_select: ; CHECK: subb {{%[a-z0-9]+}}, [[RES:%[a-z0-9]+]] ; CHECK: testb $1, [[RES]] -; CHECK: cmovel +; CHECK: cmovnel %edi, %esi define i32 @fastisel_select(i1 %exchSub2211_, i1 %trunc_8766) { %shuffleInternal15257_8932 = sub i1 %exchSub2211_, %trunc_8766 %counter_diff1345 = select i1 %shuffleInternal15257_8932, i32 1204476887, i32 0 diff --git a/test/CodeGen/X86/float-asmprint.ll b/test/CodeGen/X86/float-asmprint.ll index 4aeae7fe0469..5de9700fc064 100644 --- a/test/CodeGen/X86/float-asmprint.ll +++ b/test/CodeGen/X86/float-asmprint.ll @@ -16,8 +16,9 @@ ; CHECK-NEXT: .size ; CHECK: varppc128: -; CHECK-NEXT: .quad 0 # ppc_fp128 -0 -; CHECK-NEXT: .quad -9223372036854775808 +; For ppc_fp128, the high double always comes first. 
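Background for the reordered .quad checks that follow: ppc_fp128 is a pair of doubles, and for -0.0 the high double carries the sign (bit pattern 0x8000000000000000, printed as the signed quad -9223372036854775808) while the low double is +0.0. A sketch of a global with that value (assumed to match the test's input; ppc_fp128 literals use the 0xM prefix, high-double bits first):

@neg_zero_ppc128 = global ppc_fp128 0xM80000000000000000000000000000000, align 16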
+; CHECK-NEXT: .quad -9223372036854775808 # ppc_fp128 -0 +; CHECK-NEXT: .quad 0 ; CHECK-NEXT: .size ; CHECK: var80: diff --git a/test/CodeGen/X86/fma.ll b/test/CodeGen/X86/fma.ll index 47252ec0bf3d..2eb152b078ef 100644 --- a/test/CodeGen/X86/fma.ll +++ b/test/CodeGen/X86/fma.ll @@ -43,8 +43,8 @@ entry: } ; Test FMA3 variant selection -; CHECK: fma3_select231ssX: -; CHECK: vfmadd231ss xmm +; CHECK-FMA-INST: fma3_select231ssX: +; CHECK-FMA-INST: vfmadd231ss %xmm define float @fma3_select231ssX(float %x, float %y) #0 { entry: br label %while.body @@ -58,8 +58,8 @@ while.end: ; preds = %while.body, %entry } ; Test FMA3 variant selection -; CHECK: fma3_select231pdY: -; CHECK: vfmadd231pd ymm +; CHECK-FMA-INST: fma3_select231pdY: +; CHECK-FMA-INST: vfmadd231pd %ymm define <4 x double> @fma3_select231pdY(<4 x double> %x, <4 x double> %y) #0 { entry: br label %while.body diff --git a/test/CodeGen/X86/gcc_except_table.ll b/test/CodeGen/X86/gcc_except_table.ll index 8c328ec58f93..a732eb1efbd7 100644 --- a/test/CodeGen/X86/gcc_except_table.ll +++ b/test/CodeGen/X86/gcc_except_table.ll @@ -13,14 +13,14 @@ define i32 @main() uwtable optsize ssp { ; APPLE: GCC_except_table0: ; APPLE: Lexception0: -; MINGW64: .cfi_startproc -; MINGW64: .cfi_personality 0, __gxx_personality_v0 -; MINGW64: .cfi_lsda 0, .Lexception0 -; MINGW64: .cfi_def_cfa_offset 16 +; MINGW64: .seh_proc +; MINGW64: .seh_handler __gxx_personality_v0 +; MINGW64: .seh_setframe 5, 0 ; MINGW64: callq _Unwind_Resume -; MINGW64: .cfi_endproc +; MINGW64: .seh_handlerdata ; MINGW64: GCC_except_table0: ; MINGW64: Lexception0: +; MINGW64: .seh_endproc ; MINGW32: .cfi_startproc ; MINGW32: .cfi_personality 0, ___gxx_personality_v0 diff --git a/test/CodeGen/X86/haddsub-undef.ll b/test/CodeGen/X86/haddsub-undef.ll new file mode 100644 index 000000000000..954a9d994e61 --- /dev/null +++ b/test/CodeGen/X86/haddsub-undef.ll @@ -0,0 +1,325 @@ +; RUN: llc < %s -march=x86-64 -mcpu=corei7 -mattr=+ssse3 | FileCheck %s -check-prefix=CHECK -check-prefix=SSE +; RUN: llc < %s -march=x86-64 -mcpu=corei7-avx | FileCheck %s -check-prefix=CHECK -check-prefix=AVX +; RUN: llc < %s -march=x86-64 -mcpu=core-avx2 | FileCheck %s -check-prefix=CHECK -check-prefix=AVX2 + +; Verify that we correctly fold horizontal binop even in the presence of UNDEFs. 
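As a reference for the folds being tested: haddps computes <a0+a1, a2+a3, b0+b1, b2+b3>, so an extract/fadd/insert chain can become one horizontal add whenever its defined lanes match that layout, and undef lanes may hold anything. A fully defined version of the pattern, as a sketch mirroring the tests below:

define <4 x float> @hadd_all_lanes(<4 x float> %a, <4 x float> %b) {
  %a0 = extractelement <4 x float> %a, i32 0
  %a1 = extractelement <4 x float> %a, i32 1
  %s0 = fadd float %a0, %a1                        ; lane 0 = a0+a1
  %a2 = extractelement <4 x float> %a, i32 2
  %a3 = extractelement <4 x float> %a, i32 3
  %s1 = fadd float %a2, %a3                        ; lane 1 = a2+a3
  %b0 = extractelement <4 x float> %b, i32 0
  %b1 = extractelement <4 x float> %b, i32 1
  %s2 = fadd float %b0, %b1                        ; lane 2 = b0+b1
  %b2 = extractelement <4 x float> %b, i32 2
  %b3 = extractelement <4 x float> %b, i32 3
  %s3 = fadd float %b2, %b3                        ; lane 3 = b2+b3
  %v0 = insertelement <4 x float> undef, float %s0, i32 0
  %v1 = insertelement <4 x float> %v0, float %s1, i32 1
  %v2 = insertelement <4 x float> %v1, float %s2, i32 2
  %v3 = insertelement <4 x float> %v2, float %s3, i32 3
  ret <4 x float> %v3
}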
+ +define <4 x float> @test1_undef(<4 x float> %a, <4 x float> %b) { + %vecext = extractelement <4 x float> %a, i32 0 + %vecext1 = extractelement <4 x float> %a, i32 1 + %add = fadd float %vecext, %vecext1 + %vecinit = insertelement <4 x float> undef, float %add, i32 0 + %vecext2 = extractelement <4 x float> %a, i32 2 + %vecext3 = extractelement <4 x float> %a, i32 3 + %add4 = fadd float %vecext2, %vecext3 + %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 1 + %vecext10 = extractelement <4 x float> %b, i32 2 + %vecext11 = extractelement <4 x float> %b, i32 3 + %add12 = fadd float %vecext10, %vecext11 + %vecinit13 = insertelement <4 x float> %vecinit5, float %add12, i32 3 + ret <4 x float> %vecinit13 +} +; CHECK-LABEL: test1_undef +; SSE: haddps +; AVX: vhaddps +; AVX2: vhaddps +; CHECK-NEXT: ret + + +define <4 x float> @test2_undef(<4 x float> %a, <4 x float> %b) { + %vecext = extractelement <4 x float> %a, i32 0 + %vecext1 = extractelement <4 x float> %a, i32 1 + %add = fadd float %vecext, %vecext1 + %vecinit = insertelement <4 x float> undef, float %add, i32 0 + %vecext6 = extractelement <4 x float> %b, i32 0 + %vecext7 = extractelement <4 x float> %b, i32 1 + %add8 = fadd float %vecext6, %vecext7 + %vecinit9 = insertelement <4 x float> %vecinit, float %add8, i32 2 + %vecext10 = extractelement <4 x float> %b, i32 2 + %vecext11 = extractelement <4 x float> %b, i32 3 + %add12 = fadd float %vecext10, %vecext11 + %vecinit13 = insertelement <4 x float> %vecinit9, float %add12, i32 3 + ret <4 x float> %vecinit13 +} +; CHECK-LABEL: test2_undef +; SSE: haddps +; AVX: vhaddps +; AVX2: vhaddps +; CHECK-NEXT: ret + + +define <4 x float> @test3_undef(<4 x float> %a, <4 x float> %b) { + %vecext = extractelement <4 x float> %a, i32 0 + %vecext1 = extractelement <4 x float> %a, i32 1 + %add = fadd float %vecext, %vecext1 + %vecinit = insertelement <4 x float> undef, float %add, i32 0 + %vecext2 = extractelement <4 x float> %a, i32 2 + %vecext3 = extractelement <4 x float> %a, i32 3 + %add4 = fadd float %vecext2, %vecext3 + %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 1 + %vecext6 = extractelement <4 x float> %b, i32 0 + %vecext7 = extractelement <4 x float> %b, i32 1 + %add8 = fadd float %vecext6, %vecext7 + %vecinit9 = insertelement <4 x float> %vecinit5, float %add8, i32 2 + ret <4 x float> %vecinit9 +} +; CHECK-LABEL: test3_undef +; SSE: haddps +; AVX: vhaddps +; AVX2: vhaddps +; CHECK-NEXT: ret + + +define <4 x float> @test4_undef(<4 x float> %a, <4 x float> %b) { + %vecext = extractelement <4 x float> %a, i32 0 + %vecext1 = extractelement <4 x float> %a, i32 1 + %add = fadd float %vecext, %vecext1 + %vecinit = insertelement <4 x float> undef, float %add, i32 0 + ret <4 x float> %vecinit +} +; CHECK-LABEL: test4_undef +; CHECK-NOT: haddps +; CHECK: ret + + +define <2 x double> @test5_undef(<2 x double> %a, <2 x double> %b) { + %vecext = extractelement <2 x double> %a, i32 0 + %vecext1 = extractelement <2 x double> %a, i32 1 + %add = fadd double %vecext, %vecext1 + %vecinit = insertelement <2 x double> undef, double %add, i32 0 + ret <2 x double> %vecinit +} +; CHECK-LABEL: test5_undef +; CHECK-NOT: haddpd +; CHECK: ret + + +define <4 x float> @test6_undef(<4 x float> %a, <4 x float> %b) { + %vecext = extractelement <4 x float> %a, i32 0 + %vecext1 = extractelement <4 x float> %a, i32 1 + %add = fadd float %vecext, %vecext1 + %vecinit = insertelement <4 x float> undef, float %add, i32 0 + %vecext2 = extractelement <4 x float> %a, i32 2 + %vecext3 = extractelement 
<4 x float> %a, i32 3 + %add4 = fadd float %vecext2, %vecext3 + %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 1 + ret <4 x float> %vecinit5 +} +; CHECK-LABEL: test6_undef +; SSE: haddps +; AVX: vhaddps +; AVX2: vhaddps +; CHECK-NEXT: ret + + +define <4 x float> @test7_undef(<4 x float> %a, <4 x float> %b) { + %vecext = extractelement <4 x float> %b, i32 0 + %vecext1 = extractelement <4 x float> %b, i32 1 + %add = fadd float %vecext, %vecext1 + %vecinit = insertelement <4 x float> undef, float %add, i32 2 + %vecext2 = extractelement <4 x float> %b, i32 2 + %vecext3 = extractelement <4 x float> %b, i32 3 + %add4 = fadd float %vecext2, %vecext3 + %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 3 + ret <4 x float> %vecinit5 +} +; CHECK-LABEL: test7_undef +; SSE: haddps +; AVX: vhaddps +; AVX2: vhaddps +; CHECK-NEXT: ret + + +define <4 x float> @test8_undef(<4 x float> %a, <4 x float> %b) { + %vecext = extractelement <4 x float> %a, i32 0 + %vecext1 = extractelement <4 x float> %a, i32 1 + %add = fadd float %vecext, %vecext1 + %vecinit = insertelement <4 x float> undef, float %add, i32 0 + %vecext2 = extractelement <4 x float> %a, i32 2 + %vecext3 = extractelement <4 x float> %a, i32 3 + %add4 = fadd float %vecext2, %vecext3 + %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 2 + ret <4 x float> %vecinit5 +} +; CHECK-LABEL: test8_undef +; CHECK-NOT: haddps +; CHECK: ret + + +define <4 x float> @test9_undef(<4 x float> %a, <4 x float> %b) { + %vecext = extractelement <4 x float> %a, i32 0 + %vecext1 = extractelement <4 x float> %a, i32 1 + %add = fadd float %vecext, %vecext1 + %vecinit = insertelement <4 x float> undef, float %add, i32 0 + %vecext2 = extractelement <4 x float> %b, i32 2 + %vecext3 = extractelement <4 x float> %b, i32 3 + %add4 = fadd float %vecext2, %vecext3 + %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 3 + ret <4 x float> %vecinit5 +} +; CHECK-LABEL: test9_undef +; CHECK: haddps +; CHECK-NEXT: ret + +define <8 x float> @test10_undef(<8 x float> %a, <8 x float> %b) { + %vecext = extractelement <8 x float> %a, i32 0 + %vecext1 = extractelement <8 x float> %a, i32 1 + %add = fadd float %vecext, %vecext1 + %vecinit = insertelement <8 x float> undef, float %add, i32 0 + %vecext2 = extractelement <8 x float> %b, i32 2 + %vecext3 = extractelement <8 x float> %b, i32 3 + %add4 = fadd float %vecext2, %vecext3 + %vecinit5 = insertelement <8 x float> %vecinit, float %add4, i32 3 + ret <8 x float> %vecinit5 +} +; CHECK-LABEL: test10_undef +; SSE: haddps +; AVX: vhaddps +; AVX2: vhaddps +; CHECK-NOT: haddps +; CHECK: ret + +define <8 x float> @test11_undef(<8 x float> %a, <8 x float> %b) { + %vecext = extractelement <8 x float> %a, i32 0 + %vecext1 = extractelement <8 x float> %a, i32 1 + %add = fadd float %vecext, %vecext1 + %vecinit = insertelement <8 x float> undef, float %add, i32 0 + %vecext2 = extractelement <8 x float> %b, i32 4 + %vecext3 = extractelement <8 x float> %b, i32 5 + %add4 = fadd float %vecext2, %vecext3 + %vecinit5 = insertelement <8 x float> %vecinit, float %add4, i32 6 + ret <8 x float> %vecinit5 +} +; CHECK-LABEL: test11_undef +; SSE-NOT: haddps +; AVX: vhaddps +; AVX2: vhaddps +; CHECK: ret + +define <8 x float> @test12_undef(<8 x float> %a, <8 x float> %b) { + %vecext = extractelement <8 x float> %a, i32 0 + %vecext1 = extractelement <8 x float> %a, i32 1 + %add = fadd float %vecext, %vecext1 + %vecinit = insertelement <8 x float> undef, float %add, i32 0 + %vecext2 = extractelement <8 x float> 
%a, i32 2
+  %vecext3 = extractelement <8 x float> %a, i32 3
+  %add4 = fadd float %vecext2, %vecext3
+  %vecinit5 = insertelement <8 x float> %vecinit, float %add4, i32 1
+  ret <8 x float> %vecinit5
+}
+; CHECK-LABEL: test12_undef
+; SSE: haddps
+; AVX: vhaddps
+; AVX2: vhaddps
+; CHECK-NOT: haddps
+; CHECK: ret
+
+define <8 x float> @test13_undef(<8 x float> %a, <8 x float> %b) {
+  %vecext = extractelement <8 x float> %a, i32 0
+  %vecext1 = extractelement <8 x float> %a, i32 1
+  %add1 = fadd float %vecext, %vecext1
+  %vecinit1 = insertelement <8 x float> undef, float %add1, i32 0
+  %vecext2 = extractelement <8 x float> %a, i32 2
+  %vecext3 = extractelement <8 x float> %a, i32 3
+  %add2 = fadd float %vecext2, %vecext3
+  %vecinit2 = insertelement <8 x float> %vecinit1, float %add2, i32 1
+  %vecext4 = extractelement <8 x float> %a, i32 4
+  %vecext5 = extractelement <8 x float> %a, i32 5
+  %add3 = fadd float %vecext4, %vecext5
+  %vecinit3 = insertelement <8 x float> %vecinit2, float %add3, i32 2
+  %vecext6 = extractelement <8 x float> %a, i32 6
+  %vecext7 = extractelement <8 x float> %a, i32 7
+  %add4 = fadd float %vecext6, %vecext7
+  %vecinit4 = insertelement <8 x float> %vecinit3, float %add4, i32 3
+  ret <8 x float> %vecinit4
+}
+; CHECK-LABEL: test13_undef
+; SSE: haddps
+; SSE-NOT: haddps
+; AVX: vhaddps
+; AVX2: vhaddps
+; CHECK-NOT: haddps
+; CHECK: ret
+
+define <8 x i32> @test14_undef(<8 x i32> %a, <8 x i32> %b) {
+  %vecext = extractelement <8 x i32> %a, i32 0
+  %vecext1 = extractelement <8 x i32> %a, i32 1
+  %add = add i32 %vecext, %vecext1
+  %vecinit = insertelement <8 x i32> undef, i32 %add, i32 0
+  %vecext2 = extractelement <8 x i32> %b, i32 2
+  %vecext3 = extractelement <8 x i32> %b, i32 3
+  %add4 = add i32 %vecext2, %vecext3
+  %vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 3
+  ret <8 x i32> %vecinit5
+}
+; CHECK-LABEL: test14_undef
+; SSE: phaddd
+; AVX: vphaddd
+; AVX2: vphaddd
+; CHECK-NOT: phaddd
+; CHECK: ret
+
+; On AVX2, the following sequence can be folded into a single horizontal add.
+; If the Subtarget doesn't support AVX2, we emit two scalar adds followed by
+; vector inserts rather than two packed integer horizontal adds.
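For the 256-bit integer case the half-by-half layout is what matters; the lane equation below is an editorial summary of vphaddd, not part of the patch:

; vphaddd with sources a and b produces, per 128-bit half:
;   < a0+a1, a2+a3, b0+b1, b2+b3, a4+a5, a6+a7, b4+b5, b6+b7 >
; @test15_undef defines lane 0 (a0+a1) and lane 6 (b4+b5), one in each
; 128-bit half, so a single 256-bit AVX2 vphaddd can produce both at once;
; without AVX2 two phaddd would be needed, and scalar code wins.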
+define <8 x i32> @test15_undef(<8 x i32> %a, <8 x i32> %b) {
+  %vecext = extractelement <8 x i32> %a, i32 0
+  %vecext1 = extractelement <8 x i32> %a, i32 1
+  %add = add i32 %vecext, %vecext1
+  %vecinit = insertelement <8 x i32> undef, i32 %add, i32 0
+  %vecext2 = extractelement <8 x i32> %b, i32 4
+  %vecext3 = extractelement <8 x i32> %b, i32 5
+  %add4 = add i32 %vecext2, %vecext3
+  %vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 6
+  ret <8 x i32> %vecinit5
+}
+; CHECK-LABEL: test15_undef
+; SSE-NOT: phaddd
+; AVX-NOT: vphaddd
+; AVX2: vphaddd
+; CHECK: ret
+
+define <8 x i32> @test16_undef(<8 x i32> %a, <8 x i32> %b) {
+  %vecext = extractelement <8 x i32> %a, i32 0
+  %vecext1 = extractelement <8 x i32> %a, i32 1
+  %add = add i32 %vecext, %vecext1
+  %vecinit = insertelement <8 x i32> undef, i32 %add, i32 0
+  %vecext2 = extractelement <8 x i32> %a, i32 2
+  %vecext3 = extractelement <8 x i32> %a, i32 3
+  %add4 = add i32 %vecext2, %vecext3
+  %vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 1
+  ret <8 x i32> %vecinit5
+}
+; CHECK-LABEL: test16_undef
+; SSE: phaddd
+; AVX: vphaddd
+; AVX2: vphaddd
+; CHECK-NOT: haddps
+; CHECK: ret
+
+define <8 x i32> @test17_undef(<8 x i32> %a, <8 x i32> %b) {
+  %vecext = extractelement <8 x i32> %a, i32 0
+  %vecext1 = extractelement <8 x i32> %a, i32 1
+  %add1 = add i32 %vecext, %vecext1
+  %vecinit1 = insertelement <8 x i32> undef, i32 %add1, i32 0
+  %vecext2 = extractelement <8 x i32> %a, i32 2
+  %vecext3 = extractelement <8 x i32> %a, i32 3
+  %add2 = add i32 %vecext2, %vecext3
+  %vecinit2 = insertelement <8 x i32> %vecinit1, i32 %add2, i32 1
+  %vecext4 = extractelement <8 x i32> %a, i32 4
+  %vecext5 = extractelement <8 x i32> %a, i32 5
+  %add3 = add i32 %vecext4, %vecext5
+  %vecinit3 = insertelement <8 x i32> %vecinit2, i32 %add3, i32 2
+  %vecext6 = extractelement <8 x i32> %a, i32 6
+  %vecext7 = extractelement <8 x i32> %a, i32 7
+  %add4 = add i32 %vecext6, %vecext7
+  %vecinit4 = insertelement <8 x i32> %vecinit3, i32 %add4, i32 3
+  ret <8 x i32> %vecinit4
+}
+; CHECK-LABEL: test17_undef
+; SSE: phaddd
+; AVX: vphaddd
+; AVX2: vphaddd
+; CHECK-NOT: haddps
+; CHECK: ret
+
diff --git a/test/CodeGen/X86/half.ll b/test/CodeGen/X86/half.ll
new file mode 100644
index 000000000000..1dcf93939b8b
--- /dev/null
+++ b/test/CodeGen/X86/half.ll
@@ -0,0 +1,69 @@
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=-f16c | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-LIBCALL
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=+f16c | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-FP16
+
+define void @test_load_store(half* %in, half* %out) {
+; CHECK-LABEL: test_load_store:
+; CHECK: movw (%rdi), [[TMP:%[a-z0-9]+]]
+; CHECK: movw [[TMP]], (%rsi)
+  %val = load half* %in
+  store half %val, half* %out
+  ret void
+}
+
+define i16 @test_bitcast_from_half(half* %addr) {
+; CHECK-LABEL: test_bitcast_from_half:
+; CHECK: movzwl (%rdi), %eax
+  %val = load half* %addr
+  %val_int = bitcast half %val to i16
+  ret i16 %val_int
+}
+
+define void @test_bitcast_to_half(half* %addr, i16 %in) {
+; CHECK-LABEL: test_bitcast_to_half:
+; CHECK: movw %si, (%rdi)
+  %val_fp = bitcast i16 %in to half
+  store half %val_fp, half* %addr
+  ret void
+}
+
+define float @test_extend32(half* %addr) {
+; CHECK-LABEL: test_extend32:
+
+; CHECK-LIBCALL: jmp __gnu_h2f_ieee
+; CHECK-FP16: vcvtph2ps
+  %val16 = load half* %addr
+  %val32 = fpext half %val16 to float
+  ret float %val32
+}
+
+define double
@test_extend64(half* %addr) { +; CHECK-LABEL: test_extend64: + +; CHECK-LIBCALL: callq __gnu_h2f_ieee +; CHECK-LIBCALL: cvtss2sd +; CHECK-FP16: vcvtph2ps +; CHECK-FP16: vcvtss2sd + %val16 = load half* %addr + %val32 = fpext half %val16 to double + ret double %val32 +} + +define void @test_trunc32(float %in, half* %addr) { +; CHECK-LABEL: test_trunc32: + +; CHECK-LIBCALL: callq __gnu_f2h_ieee +; CHECK-FP16: vcvtps2ph + %val16 = fptrunc float %in to half + store half %val16, half* %addr + ret void +} + +define void @test_trunc64(double %in, half* %addr) { +; CHECK-LABEL: test_trunc64: + +; CHECK-LIBCALL: callq __truncdfhf2 +; CHECK-FP16: callq __truncdfhf2 + %val16 = fptrunc double %in to half + store half %val16, half* %addr + ret void +} diff --git a/test/CodeGen/X86/lower-bitcast.ll b/test/CodeGen/X86/lower-bitcast.ll index 769831ee8185..f47161e5520c 100644 --- a/test/CodeGen/X86/lower-bitcast.ll +++ b/test/CodeGen/X86/lower-bitcast.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=x86-64 -mcpu=core2 -mattr=+sse2 | FileCheck %s +; RUN: llc < %s -march=x86-64 -mcpu=core2 -mattr=+sse2 -x86-experimental-vector-widening-legalization | FileCheck %s --check-prefix=CHECK-WIDE define double @test1(double %A) { @@ -9,14 +10,19 @@ define double @test1(double %A) { } ; FIXME: Ideally we should be able to fold the entire body of @test1 into a ; single paddd instruction. At the moment we produce the sequence -; pshufd+paddq+pshufd. - +; pshufd+paddq+pshufd. This is fixed with the widening legalization. +; ; CHECK-LABEL: test1 ; CHECK-NOT: movsd ; CHECK: pshufd ; CHECK-NEXT: paddd ; CHECK-NEXT: pshufd ; CHECK-NEXT: ret +; +; CHECK-WIDE-LABEL: test1 +; CHECK-WIDE-NOT: movsd +; CHECK-WIDE: paddd +; CHECK-WIDE-NEXT: ret define double @test2(double %A, double %B) { @@ -30,6 +36,11 @@ define double @test2(double %A, double %B) { ; CHECK-NOT: movsd ; CHECK: paddd ; CHECK-NEXT: ret +; +; CHECK-WIDE-LABEL: test2 +; CHECK-WIDE-NOT: movsd +; CHECK-WIDE: paddd +; CHECK-WIDE-NEXT: ret define i64 @test3(i64 %A) { @@ -43,6 +54,12 @@ define i64 @test3(i64 %A) { ; CHECK: addps ; CHECK-NOT: pshufd ; CHECK: ret +; +; CHECK-WIDE-LABEL: test3 +; CHECK-WIDE-NOT: pshufd +; CHECK-WIDE: addps +; CHECK-WIDE-NOT: pshufd +; CHECK-WIDE: ret define i64 @test4(i64 %A) { @@ -52,13 +69,20 @@ define i64 @test4(i64 %A) { ret i64 %2 } ; FIXME: At the moment we still produce the sequence pshufd+paddq+pshufd. -; Ideally, we should fold that sequence into a single paddd. - +; Ideally, we should fold that sequence into a single paddd. This is fixed with +; the widening legalization. +; ; CHECK-LABEL: test4 ; CHECK: pshufd ; CHECK-NEXT: paddq ; CHECK-NEXT: pshufd ; CHECK: ret +; +; CHECK-WIDE-LABEL: test4 +; CHECK-WIDE: movd %{{rdi|rcx}}, +; CHECK-WIDE-NEXT: paddd +; CHECK-WIDE-NEXT: movd {{.*}}, %rax +; CHECK-WIDE: ret define double @test5(double %A) { @@ -70,6 +94,10 @@ define double @test5(double %A) { ; CHECK-LABEL: test5 ; CHECK: addps ; CHECK-NEXT: ret +; +; CHECK-WIDE-LABEL: test5 +; CHECK-WIDE: addps +; CHECK-WIDE-NEXT: ret define double @test6(double %A) { @@ -79,14 +107,20 @@ define double @test6(double %A) { ret double %2 } ; FIXME: Ideally we should be able to fold the entire body of @test6 into a -; single paddw instruction. - +; single paddw instruction. This is fixed with the widening legalization. 
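The FIXME comments in this file all concern the same shape of IR, sketched below (a reconstruction; the function bodies are elided from this hunk). The default legalizer promotes the illegal <4 x i16> elements individually, producing the punpcklwd/pshufb traffic around paddw, whereas -x86-experimental-vector-widening-legalization widens the vector to <8 x i16> so a single paddw suffices.

; A sketch of the pattern @test6 exercises (assumed body, not from the patch):
define double @bitcast_add_v4i16(double %A) {
  %v = bitcast double %A to <4 x i16>  ; reinterpret the 64 bits as 4 x i16
  %sum = add <4 x i16> %v, %v          ; <4 x i16> is an illegal type here
  %r = bitcast <4 x i16> %sum to double
  ret double %r
}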
+; ; CHECK-LABEL: test6 ; CHECK-NOT: movsd ; CHECK: punpcklwd ; CHECK-NEXT: paddw ; CHECK-NEXT: pshufb ; CHECK-NEXT: ret +; +; CHECK-WIDE-LABEL: test6 +; CHECK-WIDE-NOT: mov +; CHECK-WIDE-NOT: punpcklwd +; CHECK-WIDE: paddw +; CHECK-WIDE-NEXT: ret define double @test7(double %A, double %B) { @@ -101,6 +135,12 @@ define double @test7(double %A, double %B) { ; CHECK-NOT: punpcklwd ; CHECK: paddw ; CHECK-NEXT: ret +; +; CHECK-WIDE-LABEL: test7 +; CHECK-WIDE-NOT: movsd +; CHECK-WIDE-NOT: punpcklwd +; CHECK-WIDE: paddw +; CHECK-WIDE-NEXT: ret define double @test8(double %A) { @@ -111,14 +151,20 @@ define double @test8(double %A) { } ; FIXME: Ideally we should be able to fold the entire body of @test8 into a ; single paddb instruction. At the moment we produce the sequence -; pshufd+paddw+pshufd. - +; pshufd+paddw+pshufd. This is fixed with the widening legalization. +; ; CHECK-LABEL: test8 ; CHECK-NOT: movsd ; CHECK: punpcklbw ; CHECK-NEXT: paddb ; CHECK-NEXT: pshufb ; CHECK-NEXT: ret +; +; CHECK-WIDE-LABEL: test8 +; CHECK-WIDE-NOT: movsd +; CHECK-WIDE-NOT: punpcklbw +; CHECK-WIDE: paddb +; CHECK-WIDE-NEXT: ret define double @test9(double %A, double %B) { @@ -133,4 +179,10 @@ define double @test9(double %A, double %B) { ; CHECK-NOT: punpcklbw ; CHECK: paddb ; CHECK-NEXT: ret +; +; CHECK-WIDE-LABEL: test9 +; CHECK-WIDE-NOT: movsd +; CHECK-WIDE-NOT: punpcklbw +; CHECK-WIDE: paddb +; CHECK-WIDE-NEXT: ret diff --git a/test/CodeGen/X86/macho-comdat.ll b/test/CodeGen/X86/macho-comdat.ll new file mode 100644 index 000000000000..3c2d997b4594 --- /dev/null +++ b/test/CodeGen/X86/macho-comdat.ll @@ -0,0 +1,6 @@ +; RUN: not llc -mtriple x86_64-apple-darwin < %s 2> %t +; RUN: FileCheck < %t %s + +$f = comdat any +@v = global i32 0, comdat $f +; CHECK: LLVM ERROR: MachO doesn't support COMDATs, 'f' cannot be lowered. diff --git a/test/CodeGen/X86/null-streamer.ll b/test/CodeGen/X86/null-streamer.ll index 7c0e82f08f93..fa77fcb1d138 100644 --- a/test/CodeGen/X86/null-streamer.ll +++ b/test/CodeGen/X86/null-streamer.ll @@ -1,6 +1,7 @@ ; Check the MCNullStreamer operates correctly, at least on a minimal test case. ; ; RUN: llc -filetype=null -o %t -march=x86 %s +; RUN: llc -filetype=null -o %t -mtriple=i686-cygwin %s define void @f0() { ret void @@ -9,3 +10,20 @@ define void @f0() { define void @f1() { ret void } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!11, !13} + +!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !" 
", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !9, metadata !2, metadata !""} +!1 = metadata !{metadata !"", metadata !""} +!2 = metadata !{} +!3 = metadata !{metadata !4} +!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"", metadata !"", metadata !"", i32 2, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 ()* null, null, null, metadata !2, i32 2} +!5 = metadata !{i32 786473, metadata !1} +!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} +!7 = metadata !{metadata !8} +!8 = metadata !{i32 786468, null, null, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} +!9 = metadata !{metadata !10} +!10 = metadata !{i32 786484, i32 0, null, metadata !"i", metadata !"i", metadata !"_ZL1i", metadata !5, i32 1, metadata !8, i32 1, i32 1, null, null} +!11 = metadata !{i32 2, metadata !"Dwarf Version", i32 3} +!13 = metadata !{i32 1, metadata !"Debug Info Version", i32 1} diff --git a/test/CodeGen/X86/pr20020.ll b/test/CodeGen/X86/pr20020.ll new file mode 100644 index 000000000000..83dae369dd75 --- /dev/null +++ b/test/CodeGen/X86/pr20020.ll @@ -0,0 +1,73 @@ +; RUN: llc < %s -mtriple=x86_64-apple-macosx -disable-lsr -post-RA-scheduler=1 -break-anti-dependencies=critical | FileCheck %s + +; In PR20020, the critical anti-dependency breaker algorithm mistakenly +; changes the register operands of an 'xorl %eax, %eax' to 'xorl %ecx, %ecx' +; and then immediately reloads %rcx with a value based on the wrong %rax + +; CHECK-NOT: xorl %ecx, %ecx +; CHECK: leaq 1(%rax), %rcx + + +%struct.planet = type { double, double, double } + +; Function Attrs: nounwind ssp uwtable +define void @advance(i32 %nbodies, %struct.planet* nocapture %bodies) #0 { +entry: + %cmp4 = icmp sgt i32 %nbodies, 0 + br i1 %cmp4, label %for.body.preheader, label %for.end38 + +for.body.preheader: ; preds = %entry + %gep = getelementptr %struct.planet* %bodies, i64 1, i32 1 + %gep13 = bitcast double* %gep to %struct.planet* + %0 = add i32 %nbodies, -1 + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.inc20 + %iv19 = phi i32 [ %0, %for.body.preheader ], [ %iv.next, %for.inc20 ] + %iv = phi %struct.planet* [ %gep13, %for.body.preheader ], [ %gep14, %for.inc20 ] + %iv9 = phi i64 [ %iv.next10, %for.inc20 ], [ 0, %for.body.preheader ] + %iv.next10 = add nuw nsw i64 %iv9, 1 + %1 = trunc i64 %iv.next10 to i32 + %cmp22 = icmp slt i32 %1, %nbodies + br i1 %cmp22, label %for.body3.lr.ph, label %for.inc20 + +for.body3.lr.ph: ; preds = %for.body + %x = getelementptr inbounds %struct.planet* %bodies, i64 %iv9, i32 0 + %y = getelementptr inbounds %struct.planet* %bodies, i64 %iv9, i32 1 + %vx = getelementptr inbounds %struct.planet* %bodies, i64 %iv9, i32 2 + br label %for.body3 + +for.body3: ; preds = %for.body3, %for.body3.lr.ph + %iv20 = phi i32 [ %iv.next21, %for.body3 ], [ %iv19, %for.body3.lr.ph ] + %iv15 = phi %struct.planet* [ %gep16, %for.body3 ], [ %iv, %for.body3.lr.ph ] + %iv1517 = bitcast %struct.planet* %iv15 to double* + %2 = load double* %x, align 8 + %gep18 = getelementptr double* %iv1517, i64 -1 + %3 = load double* %gep18, align 8 + %sub = fsub double %2, %3 + %4 = load double* %y, align 8 + %5 = load double* %iv1517, align 8 + %sub8 = fsub double %4, %5 + %add10 = fadd double %sub, %sub8 + %call = tail call double @sqrt(double %sub8) #2 + store double %add10, double* %vx, align 8 + %gep16 = getelementptr %struct.planet* %iv15, i64 1 + %iv.next21 = add i32 
%iv20, -1 + %exitcond = icmp eq i32 %iv.next21, 0 + br i1 %exitcond, label %for.inc20, label %for.body3 + +for.inc20: ; preds = %for.body3, %for.body + %lftr.wideiv11 = trunc i64 %iv.next10 to i32 + %gep14 = getelementptr %struct.planet* %iv, i64 1 + %iv.next = add i32 %iv19, -1 + %exitcond12 = icmp eq i32 %lftr.wideiv11, %nbodies + br i1 %exitcond12, label %for.end38, label %for.body + +for.end38: ; preds = %for.inc20, %entry + ret void +} + +; Function Attrs: nounwind +declare double @sqrt(double) #1 + +attributes #0 = { "no-frame-pointer-elim-non-leaf" } diff --git a/test/CodeGen/X86/pr20088.ll b/test/CodeGen/X86/pr20088.ll new file mode 100644 index 000000000000..3a829622424c --- /dev/null +++ b/test/CodeGen/X86/pr20088.ll @@ -0,0 +1,9 @@ +; RUN: llc < %s -march=x86-64 -mattr=+avx | FileCheck %s + +declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>) + +define <16 x i8> @foo(<16 x i8> %x) { +; CHECK: vpblendvb + %res = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> zeroinitializer, <16 x i8> , <16 x i8> %x) + ret <16 x i8> %res; +} diff --git a/test/CodeGen/X86/pr5145.ll b/test/CodeGen/X86/pr5145.ll index d048db8a850d..32a797ba138a 100644 --- a/test/CodeGen/X86/pr5145.ll +++ b/test/CodeGen/X86/pr5145.ll @@ -5,29 +5,29 @@ define void @atomic_maxmin_i8() { ; CHECK: atomic_maxmin_i8 %1 = atomicrmw max i8* @sc8, i8 5 acquire ; CHECK: [[LABEL1:\.?LBB[0-9]+_[0-9]+]]: -; CHECK: cmpb -; CHECK: cmovl +; CHECK: movsbl +; CHECK: cmpl ; CHECK: lock ; CHECK-NEXT: cmpxchgb ; CHECK: jne [[LABEL1]] %2 = atomicrmw min i8* @sc8, i8 6 acquire ; CHECK: [[LABEL3:\.?LBB[0-9]+_[0-9]+]]: -; CHECK: cmpb -; CHECK: cmovg +; CHECK: movsbl +; CHECK: cmpl ; CHECK: lock ; CHECK-NEXT: cmpxchgb ; CHECK: jne [[LABEL3]] %3 = atomicrmw umax i8* @sc8, i8 7 acquire ; CHECK: [[LABEL5:\.?LBB[0-9]+_[0-9]+]]: -; CHECK: cmpb -; CHECK: cmovb +; CHECK: movzbl +; CHECK: cmpl ; CHECK: lock ; CHECK-NEXT: cmpxchgb ; CHECK: jne [[LABEL5]] %4 = atomicrmw umin i8* @sc8, i8 8 acquire ; CHECK: [[LABEL7:\.?LBB[0-9]+_[0-9]+]]: -; CHECK: cmpb -; CHECK: cmova +; CHECK: movzbl +; CHECK: cmpl ; CHECK: lock ; CHECK-NEXT: cmpxchgb ; CHECK: jne [[LABEL7]] diff --git a/test/CodeGen/X86/pshufd-combine-crash.ll b/test/CodeGen/X86/pshufd-combine-crash.ll new file mode 100644 index 000000000000..84c69e32bcc3 --- /dev/null +++ b/test/CodeGen/X86/pshufd-combine-crash.ll @@ -0,0 +1,14 @@ +; RUN: llc < %s -march=x86-64 -mcpu=corei7 -debug + +; REQUIRES: asserts + +; Test that the dag combiner doesn't assert if we try to replace a sequence of two +; v4f32 X86ISD::PSHUFD nodes with a single PSHUFD. + + +define <4 x float> @test(<4 x float> %V) { + %1 = shufflevector <4 x float> %V, <4 x float> undef, <4 x i32> + %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> + ret <4 x float> %2 +} + diff --git a/test/CodeGen/X86/rdpmc.ll b/test/CodeGen/X86/rdpmc.ll new file mode 100644 index 000000000000..7f1ca469c0b6 --- /dev/null +++ b/test/CodeGen/X86/rdpmc.ll @@ -0,0 +1,22 @@ +; RUN: llc < %s -march=x86-64 -mcpu=generic | FileCheck %s --check-prefix=CHECK --check-prefix=X86-64 +; RUN: llc < %s -march=x86 -mcpu=generic | FileCheck %s --check-prefix=CHECK --check-prefix=X86 + +; Verify that we correctly lower the "Read Performance-Monitoring Counters" +; x86 builtin. 
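The checks below follow from the instruction's definition: rdpmc returns the selected counter split across EDX:EAX. On x86-64 the i64 result is reassembled as (hi << 32) | lo, hence the expected shlq and or; on 32-bit x86 the EDX:EAX pair already matches the i64 return convention, so no combine is needed. The combine, as an IR sketch (illustrative only, not from the patch):

define i64 @combine_edx_eax(i32 %lo, i32 %hi) {
  %lo64 = zext i32 %lo to i64
  %hi64 = zext i32 %hi to i64
  %hish = shl i64 %hi64, 32   ; the shlq the X86-64 checks look for
  %res = or i64 %hish, %lo64  ; the matching or
  ret i64 %res
}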
+
+
+define i64 @test_builtin_read_pmc(i32 %ID) {
+  %1 = tail call i64 @llvm.x86.rdpmc(i32 %ID)
+  ret i64 %1
+}
+; CHECK-LABEL: test_builtin_read_pmc
+; CHECK: rdpmc
+; X86-NOT: shlq
+; X86-NOT: or
+; X86-64: shlq
+; X86-64: or
+; CHECK-NOT: mov
+; CHECK: ret
+
+declare i64 @llvm.x86.rdpmc(i32 %ID)
+
diff --git a/test/CodeGen/X86/select.ll b/test/CodeGen/X86/select.ll
index cdd258d92031..654e8652cfcb 100644
--- a/test/CodeGen/X86/select.ll
+++ b/test/CodeGen/X86/select.ll
@@ -357,3 +357,11 @@ define i8 @test18(i32 %x, i8 zeroext %a, i8 zeroext %b) nounwind {
 ; ATOM: cmpl $15, %edi
 ; ATOM: cmovgel %edx
 }
+
+; CHECK-LABEL: @trunc_select_miscompile
+; CHECK-NOT: sarb
+define i32 @trunc_select_miscompile(i32 %a, i1 zeroext %cc) {
+  %tmp1 = select i1 %cc, i32 3, i32 2
+  %tmp2 = shl i32 %a, %tmp1
+  ret i32 %tmp2
+}
\ No newline at end of file
diff --git a/test/CodeGen/X86/shift-parts.ll b/test/CodeGen/X86/shift-parts.ll
index ce4f538f4de4..763da6397101 100644
--- a/test/CodeGen/X86/shift-parts.ll
+++ b/test/CodeGen/X86/shift-parts.ll
@@ -1,17 +1,19 @@
-; RUN: llc < %s -march=x86-64 | grep shrdq
+; RUN: llc -march=x86-64 < %s | FileCheck %s
 ; PR4736
 
 %0 = type { i32, i8, [35 x i8] }
 
 @g_144 = external global %0, align 8 ; <%0*> [#uses=1]
 
+; CHECK: shrdq
+
-define i32 @int87(i32 %uint64p_8) nounwind {
+define i32 @int87(i32 %uint64p_8, i1 %cond) nounwind {
 entry:
   %srcval4 = load i320* bitcast (%0* @g_144 to i320*), align 8 ; <i320> [#uses=1]
   br label %for.cond
 
 for.cond: ; preds = %for.cond, %entry
-  %call3.in.in.in.v = select i1 undef, i320 192, i320 128 ; <i320> [#uses=1]
+  %call3.in.in.in.v = select i1 %cond, i320 192, i320 128 ; <i320> [#uses=1]
   %call3.in.in.in = lshr i320 %srcval4, %call3.in.in.in.v ; <i320> [#uses=1]
   %call3.in = trunc i320 %call3.in.in.in to i32 ; <i32> [#uses=1]
   %tobool = icmp eq i32 %call3.in, 0 ; <i1> [#uses=1]
diff --git a/test/CodeGen/X86/shuffle-combine-crash.ll b/test/CodeGen/X86/shuffle-combine-crash.ll
new file mode 100644
index 000000000000..6ab7b97e6a7b
--- /dev/null
+++ b/test/CodeGen/X86/shuffle-combine-crash.ll
@@ -0,0 +1,30 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7
+
+; Verify that DAGCombiner does not crash when checking if it is
+; safe to fold the shuffles in function @sample_test according to the rule
+;   (shuffle (shuffle A, Undef, M0), Undef, M1) -> (shuffle A, Undef, M2)
+;
+; The DAGCombiner avoids folding shuffles if the resulting shuffle dag node
+; is not legal for the target, that is, the shuffle must have a legal type
+; and a legal mask.
+;
+; Before, the DAGCombiner forgot to check whether the resulting shuffle was
+; legal. It instead just called 'X86TargetLowering::isShuffleMaskLegal';
+; however, that was not enough, since that method always expects a valid
+; vector type as input.
+; As a consequence, compiling the function below would have caused a crash.
+
+define void @sample_test() {
+  br i1 undef, label %5, label %1
+
+;