| /* Copyright 2023 The OpenXLA Authors. |
| |
| Licensed under the Apache License, Version 2.0 (the "License"); |
| you may not use this file except in compliance with the License. |
| You may obtain a copy of the License at |
| |
| http://www.apache.org/licenses/LICENSE-2.0 |
| |
| Unless required by applicable law or agreed to in writing, software |
| distributed under the License is distributed on an "AS IS" BASIS, |
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| See the License for the specific language governing permissions and |
| limitations under the License. |
| ==============================================================================*/ |
| |
| #include <memory> |
| #include <optional> |
| #include <string> |
| #include <utility> |
| #include <vector> |
| |
| #include <gmock/gmock.h> |
| #include <gtest/gtest.h> |
| #include "absl/memory/memory.h" |
| #include "absl/status/status_matchers.h" |
| #include "absl/status/statusor.h" |
| #include "absl/strings/string_view.h" |
| #include "mlir/Dialect/Func/IR/FuncOps.h" |
| #include "mlir/IR/BuiltinOps.h" |
| #include "mlir/IR/MLIRContext.h" |
| #include "mlir/Parser/Parser.h" |
| #include "xla/hlo/builder/xla_computation.h" |
| #include "xla/hlo/ir/hlo_module.h" |
| #include "xla/hlo/parser/hlo_parser.h" |
| #include "xla/layout.h" |
| #include "xla/literal.h" |
| #include "xla/literal_util.h" |
| #include "xla/mlir_hlo/mhlo/IR/hlo_ops.h" |
| #include "xla/pjrt/compiled_memory_stats.h" |
| #include "xla/pjrt/gpu/se_gpu_pjrt_client.h" |
| #include "xla/pjrt/gpu/se_gpu_pjrt_compiler.h" |
| #include "xla/pjrt/maybe_owning_mlir_module.h" |
| #include "xla/pjrt/pjrt_client.h" |
| #include "xla/pjrt/pjrt_compiler.h" |
| #include "xla/pjrt/pjrt_executable.h" |
| #include "xla/pjrt/plugin/xla_gpu/xla_gpu_client_options.h" |
| #include "xla/service/compiler.h" |
| #include "xla/shape.h" |
| #include "xla/shape_util.h" |
| #include "xla/tests/literal_test_util.h" |
| #include "xla/tsl/platform/statusor.h" |
| #include "xla/xla_data.pb.h" |
| #include "tsl/platform/casts.h" |
| |
| namespace xla { |
| namespace { |
| |
| using ::absl_testing::IsOkAndHolds; |
| using ::testing::SizeIs; |
| |
| constexpr absl::string_view kProgram = R"(HloModule Computation |
| |
| ENTRY Computation() -> s32[] { |
| ROOT result = s32[] constant(2) |
| })"; |
| |
| constexpr absl::string_view mlir_str = R"mlir( |
| module { |
| func.func @main() -> tensor<i32> { |
| %0 = mhlo.constant dense<2> : tensor<i32> |
| return %0 : tensor<i32> |
| } |
| })mlir"; |
| |
| absl::StatusOr<xla::XlaComputation> GetXlaComputation( |
| absl::string_view program) { |
| TF_ASSIGN_OR_RETURN(auto hlo_module, |
| xla::ParseAndReturnUnverifiedModule(program, {})); |
| |
| return XlaComputation(hlo_module->ToProto()); |
| } |
| |
| void ValidateResult( |
| std::vector<std::vector<std::unique_ptr<PjRtBuffer>>>& result) { |
| ASSERT_EQ(result.size(), 1); |
| std::vector<std::unique_ptr<xla::PjRtBuffer>>& result_buffers = result[0]; |
| ASSERT_EQ(result_buffers.size(), 1); |
| TF_ASSERT_OK_AND_ASSIGN(std::shared_ptr<xla::Literal> result_literal, |
| result_buffers[0]->ToLiteral().Await()); |
| EXPECT_TRUE( |
| LiteralTestUtil::Equal(LiteralUtil::CreateR0(2), *result_literal)); |
| } |
| |
| TEST(StreamExecutorGpuCompilerTest, SuccessAotCompileMlirAndLoad) { |
| TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<PjRtClient> client, |
| GetStreamExecutorGpuClient(GpuClientOptions())); |
| auto se_client = absl::WrapUnique( |
| tensorflow::down_cast<StreamExecutorGpuClient*>(client.release())); |
| Compiler::GpuTargetConfig gpu_target_config = xla::Compiler::GpuTargetConfig( |
| se_client->client()->backend().default_stream_executor()); |
| StreamExecutorGpuCompiler compiler(se_client->platform_id(), |
| se_client->client()->platform()->id()); |
| |
| auto context = std::make_unique<mlir::MLIRContext>(); |
| context->loadDialect<mlir::mhlo::MhloDialect, mlir::func::FuncDialect>(); |
| auto mlir_module = |
| mlir::parseSourceString<mlir::ModuleOp>(mlir_str, context.get()); |
| TF_ASSERT_OK_AND_ASSIGN(auto topology, se_client->GetTopologyDescription()); |
| xla::CompileOptions opts; |
| opts.gpu_target_config = gpu_target_config; |
| |
| TF_ASSERT_OK_AND_ASSIGN( |
| auto executable, |
| compiler.Compile( |
| opts, |
| MaybeOwningMlirModule(std::move(context), std::move(mlir_module)), |
| *topology, /*client=*/nullptr)); |
| EXPECT_THAT(executable->GetHloModules(), IsOkAndHolds(SizeIs(1))); |
| TF_ASSERT_OK_AND_ASSIGN( |
| auto loaded_executable, |
| se_client->Load(std::move(executable), LoadOptions())); |
| |
| TF_ASSERT_OK_AND_ASSIGN( |
| std::vector<std::vector<std::unique_ptr<PjRtBuffer>>> result, |
| loaded_executable->Execute(/*argument_handles=*/{{}}, {})); |
| ValidateResult(result); |
| } |
| |
| TEST(StreamExecutorGpuCompilerTest, SuccessAotCompileXlaAndLoad) { |
| TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<PjRtClient> client, |
| GetStreamExecutorGpuClient(GpuClientOptions())); |
| auto se_client = absl::WrapUnique( |
| tensorflow::down_cast<StreamExecutorGpuClient*>(client.release())); |
| Compiler::GpuTargetConfig gpu_target_config{ |
| se_client->client()->backend().default_stream_executor()}; |
| StreamExecutorGpuCompiler compiler(se_client->platform_id(), |
| se_client->client()->platform()->id()); |
| |
| TF_ASSERT_OK_AND_ASSIGN(XlaComputation computation, |
| GetXlaComputation(kProgram)); |
| TF_ASSERT_OK_AND_ASSIGN(const PjRtTopologyDescription* topology, |
| se_client->GetTopologyDescription()); |
| xla::CompileOptions opts; |
| opts.gpu_target_config = gpu_target_config; |
| |
| TF_ASSERT_OK_AND_ASSIGN( |
| std::unique_ptr<PjRtExecutable> executable, |
| compiler.Compile(opts, computation, *topology, /*client=*/nullptr)); |
| EXPECT_THAT(executable->GetHloModules(), IsOkAndHolds(SizeIs(1))); |
| TF_ASSERT_OK_AND_ASSIGN( |
| std::unique_ptr<PjRtLoadedExecutable> loaded_executable, |
| se_client->Load(std::move(executable), LoadOptions())); |
| TF_ASSERT_OK_AND_ASSIGN( |
| std::vector<std::vector<std::unique_ptr<PjRtBuffer>>> result, |
| loaded_executable->Execute(/*argument_handles=*/{{}}, {})); |
| ValidateResult(result); |
| } |
| |
| TEST(StreamExecutorGpuCompilerTest, SuccessLoadFromSerializedExecutable) { |
| TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<PjRtClient> client, |
| GetStreamExecutorGpuClient(GpuClientOptions())); |
| auto se_client = absl::WrapUnique( |
| tensorflow::down_cast<StreamExecutorGpuClient*>(client.release())); |
| StreamExecutorGpuCompiler compiler(se_client->platform_id(), |
| se_client->client()->platform()->id()); |
| xla::CompileOptions opts; |
| opts.gpu_target_config = Compiler::GpuTargetConfig( |
| se_client->client()->backend().default_stream_executor()); |
| |
| TF_ASSERT_OK_AND_ASSIGN(XlaComputation computation, |
| GetXlaComputation(kProgram)); |
| TF_ASSERT_OK_AND_ASSIGN(const PjRtTopologyDescription* topology, |
| se_client->GetTopologyDescription()); |
| TF_ASSERT_OK_AND_ASSIGN( |
| std::unique_ptr<PjRtExecutable> executable, |
| compiler.Compile(opts, computation, *topology, /*client=*/nullptr)); |
| EXPECT_THAT(executable->GetHloModules(), IsOkAndHolds(SizeIs(1))); |
| |
| // Serialize the executable and load it. |
| TF_ASSERT_OK_AND_ASSIGN(std::string serialized_executable, |
| executable->SerializeExecutable()); |
| TF_ASSERT_OK_AND_ASSIGN( |
| auto loaded_executable, |
| se_client->LoadSerialized(serialized_executable, std::nullopt, |
| LoadOptions())); |
| |
| TF_ASSERT_OK_AND_ASSIGN( |
| auto result, loaded_executable->Execute(/*argument_handles=*/{{}}, {})); |
| ValidateResult(result); |
| } |
| |
| constexpr absl::string_view kProgramIdentity = R"(HloModule Identity |
| |
| ENTRY main { |
| ROOT Arg_0.1 = s32[1]{0} parameter(0) |
| })"; |
| |
| TEST(StreamExecutorGpuCompilerTest, SuccessSerializeDeserialize) { |
| TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<PjRtClient> client, |
| GetStreamExecutorGpuClient(GpuClientOptions())); |
| auto se_client = absl::WrapUnique( |
| tensorflow::down_cast<StreamExecutorGpuClient*>(client.release())); |
| StreamExecutorGpuCompiler compiler(se_client->platform_id(), |
| se_client->client()->platform()->id()); |
| xla::CompileOptions opts; |
| opts.gpu_target_config = Compiler::GpuTargetConfig( |
| se_client->client()->backend().default_stream_executor()); |
| |
| TF_ASSERT_OK_AND_ASSIGN(XlaComputation computation, |
| GetXlaComputation(kProgramIdentity)); |
| TF_ASSERT_OK_AND_ASSIGN(const PjRtTopologyDescription* topology, |
| se_client->GetTopologyDescription()); |
| TF_ASSERT_OK_AND_ASSIGN( |
| std::unique_ptr<PjRtExecutable> executable, |
| compiler.Compile(opts, computation, *topology, /*client=*/nullptr)); |
| EXPECT_THAT(executable->GetHloModules(), IsOkAndHolds(SizeIs(1))); |
| TF_ASSERT_OK_AND_ASSIGN( |
| std::unique_ptr<PjRtLoadedExecutable> loaded_executable, |
| se_client->Load(std::move(executable), LoadOptions())); |
| |
| // Serialize the executable and deserialize it without failure. |
| TF_ASSERT_OK_AND_ASSIGN(std::string serialized_executable, |
| se_client->SerializeExecutable(*loaded_executable)); |
| TF_ASSERT_OK_AND_ASSIGN( |
| auto deserialized_executable, |
| se_client->LoadSerializedExecutable(serialized_executable, std::nullopt, |
| LoadOptions())); |
| |
| EXPECT_EQ(deserialized_executable->GetExecutable()->name(), "Identity"); |
| } |
| |
| constexpr char const* kD2HProgramTupleOutput = R"( |
| HloModule f |
| |
| ENTRY main.5 { |
| p = s32[4]{0} parameter(0) |
| cc = s32[4] custom-call(p), |
| custom_call_target="annotate_device_placement", |
| frontend_attributes={_xla_buffer_placement="pinned_host"} |
| ROOT tuple = (s32[4]{0}, s32[4]{0}) tuple(s32[4]{0} p, s32[4]{0} cc) |
| } |
| )"; |
| TEST(StreamExecutorGpuCompilerTest, UnloadedExecutableMemoryStats) { |
| TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<PjRtClient> client, |
| GetStreamExecutorGpuClient(GpuClientOptions())); |
| auto se_client = absl::WrapUnique( |
| tensorflow::down_cast<StreamExecutorGpuClient*>(client.release())); |
| StreamExecutorGpuCompiler compiler(se_client->platform_id(), |
| se_client->client()->platform()->id()); |
| xla::CompileOptions options; |
| options.gpu_target_config = Compiler::GpuTargetConfig( |
| se_client->client()->backend().default_stream_executor()); |
| |
| // Build the output shape with the correct memory space set. |
| Shape shape = ShapeUtil::MakeShapeWithDenseLayout(S32, {4}, {0}); |
| Shape host_shape = shape; |
| host_shape.mutable_layout()->set_memory_space(Layout::kHostMemorySpace); |
| Shape out_shape = ShapeUtil::MakeTupleShape({shape, host_shape}); |
| options.executable_build_options.set_result_layout(out_shape); |
| |
| TF_ASSERT_OK_AND_ASSIGN(XlaComputation computation, |
| GetXlaComputation(kD2HProgramTupleOutput)); |
| TF_ASSERT_OK_AND_ASSIGN(const PjRtTopologyDescription* topology, |
| se_client->GetTopologyDescription()); |
| TF_ASSERT_OK_AND_ASSIGN( |
| std::unique_ptr<PjRtExecutable> executable, |
| compiler.Compile(options, computation, *topology, /*client=*/nullptr)); |
| EXPECT_THAT(executable->GetHloModules(), IsOkAndHolds(SizeIs(1))); |
| |
| TF_ASSERT_OK_AND_ASSIGN(CompiledMemoryStats compiled_memory_stats, |
| executable->GetCompiledMemoryStats()); |
| |
| EXPECT_EQ(compiled_memory_stats.argument_size_in_bytes, 16); |
| EXPECT_EQ(compiled_memory_stats.output_size_in_bytes, 32); |
| EXPECT_GT(compiled_memory_stats.temp_size_in_bytes, 0); |
| EXPECT_EQ(compiled_memory_stats.host_temp_size_in_bytes, 0); |
| EXPECT_EQ(compiled_memory_stats.host_output_size_in_bytes, 16); |
| EXPECT_GT(compiled_memory_stats.peak_memory_in_bytes, 0); |
| } |
| |
| TEST(StreamExecutorGpuCompilerTest, AutoLayoutIsSupported) { |
| const char* hlo_text = R"( |
| HloModule DotLayout, |
| entry_computation_layout={(f32[2,3,5],f32[3,4,5])->f32[5,2,4]{2,1,0}} |
| |
| ENTRY dot { |
| p0 = f32[2,3,5]{2,1,0} parameter(0) |
| p1 = f32[3,4,5]{2,1,0} parameter(1) |
| ROOT dot.1330.10585 = f32[5,2,4]{2,1,0} dot(p0, p1), |
| lhs_batch_dims={2}, lhs_contracting_dims={1}, |
| rhs_batch_dims={2}, rhs_contracting_dims={0} |
| })"; |
| |
| TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<PjRtClient> client, |
| GetStreamExecutorGpuClient(GpuClientOptions())); |
| auto se_client = absl::WrapUnique( |
| tensorflow::down_cast<StreamExecutorGpuClient*>(client.release())); |
| StreamExecutorGpuCompiler compiler(se_client->platform_id(), |
| se_client->client()->platform()->id()); |
| TF_ASSERT_OK_AND_ASSIGN(const PjRtTopologyDescription* topology, |
| se_client->GetTopologyDescription()); |
| TF_ASSERT_OK_AND_ASSIGN( |
| std::unique_ptr<HloModule> m, |
| ParseAndReturnUnverifiedModule( |
| hlo_text, {}, HloParserOptions().set_fill_missing_layouts(false))); |
| |
| CompileOptions compile_options; |
| compile_options.executable_build_options.mutable_debug_options() |
| ->set_xla_pjrt_allow_auto_layout_in_hlo(true); |
| XlaComputation computation = m->ToProto(); |
| TF_ASSERT_OK_AND_ASSIGN( |
| auto executable, compiler.Compile(compile_options, computation, *topology, |
| /*client=*/nullptr)); |
| TF_ASSERT_OK_AND_ASSIGN(auto layouts, executable->GetParameterLayouts()); |
| // Check that the assigned layouts are not default. |
| EXPECT_NE(layouts[0]->ToString(), "{2,1,0}"); |
| EXPECT_NE(layouts[1]->ToString(), "{2,1,0}"); |
| } |
| |
| } // namespace |
| } // namespace xla |