blob: 912f2ea9179b967bb815aea2c953c55cb84d8678 [file] [edit]
/* Copyright 2023 The OpenXLA Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include <memory>
#include <optional>
#include <string>
#include <utility>
#include <vector>
#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "absl/memory/memory.h"
#include "absl/status/status_matchers.h"
#include "absl/status/statusor.h"
#include "absl/strings/string_view.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/MLIRContext.h"
#include "mlir/Parser/Parser.h"
#include "xla/hlo/builder/xla_computation.h"
#include "xla/hlo/ir/hlo_module.h"
#include "xla/hlo/parser/hlo_parser.h"
#include "xla/layout.h"
#include "xla/literal.h"
#include "xla/literal_util.h"
#include "xla/mlir_hlo/mhlo/IR/hlo_ops.h"
#include "xla/pjrt/compiled_memory_stats.h"
#include "xla/pjrt/gpu/se_gpu_pjrt_client.h"
#include "xla/pjrt/gpu/se_gpu_pjrt_compiler.h"
#include "xla/pjrt/maybe_owning_mlir_module.h"
#include "xla/pjrt/pjrt_client.h"
#include "xla/pjrt/pjrt_compiler.h"
#include "xla/pjrt/pjrt_executable.h"
#include "xla/pjrt/plugin/xla_gpu/xla_gpu_client_options.h"
#include "xla/service/compiler.h"
#include "xla/shape.h"
#include "xla/shape_util.h"
#include "xla/tests/literal_test_util.h"
#include "xla/tsl/platform/statusor.h"
#include "xla/xla_data.pb.h"
#include "tsl/platform/casts.h"
namespace xla {
namespace {
using ::absl_testing::IsOkAndHolds;
using ::testing::SizeIs;
// HLO text used by the compile/load tests below: a module named "Computation"
// whose parameterless entry computation returns the s32 scalar constant 2.
constexpr absl::string_view kProgram = R"(HloModule Computation
ENTRY Computation() -> s32[] {
ROOT result = s32[] constant(2)
})";
// MLIR text for a module with a single `@main` function that takes no
// arguments and returns the i32 tensor constant 2 (built with
// `mhlo.constant`).
constexpr absl::string_view mlir_str = R"mlir(
module {
func.func @main() -> tensor<i32> {
%0 = mhlo.constant dense<2> : tensor<i32>
return %0 : tensor<i32>
}
})mlir";
// Builds an XlaComputation from HLO text.
//
// `program` is parsed with ParseAndReturnUnverifiedModule (using a
// default-constructed second argument); the parsed module is converted to its
// proto form and wrapped in an XlaComputation. A parse error is returned to
// the caller as a non-OK status.
absl::StatusOr<xla::XlaComputation> GetXlaComputation(
    absl::string_view program) {
  TF_ASSIGN_OR_RETURN(auto parsed_module,
                      xla::ParseAndReturnUnverifiedModule(program, {}));
  auto module_proto = parsed_module->ToProto();
  return XlaComputation(std::move(module_proto));
}
void ValidateResult(
std::vector<std::vector<std::unique_ptr<PjRtBuffer>>>& result) {
ASSERT_EQ(result.size(), 1);
std::vector<std::unique_ptr<xla::PjRtBuffer>>& result_buffers = result[0];
ASSERT_EQ(result_buffers.size(), 1);
TF_ASSERT_OK_AND_ASSIGN(std::shared_ptr<xla::Literal> result_literal,
result_buffers[0]->ToLiteral().Await());
EXPECT_TRUE(
LiteralTestUtil::Equal(LiteralUtil::CreateR0(2), *result_literal));
}
// Verifies that an MLIR (MHLO) module can be compiled with no client passed to
// the compiler, and that the resulting executable can then be loaded into a
// GPU client, executed, and produces the expected scalar result.
TEST(StreamExecutorGpuCompilerTest, SuccessAotCompileMlirAndLoad) {
  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<PjRtClient> client,
                          GetStreamExecutorGpuClient(GpuClientOptions()));
  // Transfer ownership into a pointer of the concrete client type so the
  // StreamExecutor-specific accessors used below are reachable.
  auto se_client = absl::WrapUnique(
      tensorflow::down_cast<StreamExecutorGpuClient*>(client.release()));
  Compiler::GpuTargetConfig gpu_target_config = xla::Compiler::GpuTargetConfig(
      se_client->client()->backend().default_stream_executor());
  StreamExecutorGpuCompiler compiler(se_client->platform_id(),
                                     se_client->client()->platform()->id());

  // Parse the MLIR program in a context that knows the dialects it uses.
  auto context = std::make_unique<mlir::MLIRContext>();
  context->loadDialect<mlir::mhlo::MhloDialect, mlir::func::FuncDialect>();
  auto mlir_module =
      mlir::parseSourceString<mlir::ModuleOp>(mlir_str, context.get());
  // parseSourceString yields a null module when parsing fails; stop here
  // instead of handing a null module to the compiler.
  ASSERT_TRUE(mlir_module) << "Failed to parse the MLIR test program.";

  TF_ASSERT_OK_AND_ASSIGN(auto topology, se_client->GetTopologyDescription());
  xla::CompileOptions opts;
  opts.gpu_target_config = gpu_target_config;
  // The module takes ownership of both the context and the parsed op.
  TF_ASSERT_OK_AND_ASSIGN(
      auto executable,
      compiler.Compile(
          opts,
          MaybeOwningMlirModule(std::move(context), std::move(mlir_module)),
          *topology, /*client=*/nullptr));
  EXPECT_THAT(executable->GetHloModules(), IsOkAndHolds(SizeIs(1)));

  // Load the compiled executable into the client and run it.
  TF_ASSERT_OK_AND_ASSIGN(
      auto loaded_executable,
      se_client->Load(std::move(executable), LoadOptions()));
  TF_ASSERT_OK_AND_ASSIGN(
      std::vector<std::vector<std::unique_ptr<PjRtBuffer>>> result,
      loaded_executable->Execute(/*argument_handles=*/{{}}, {}));
  ValidateResult(result);
}
// Compiles the HLO program in kProgram with no client passed to the compiler,
// then loads the executable into a GPU client, runs it, and validates the
// output.
TEST(StreamExecutorGpuCompilerTest, SuccessAotCompileXlaAndLoad) {
  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<PjRtClient> base_client,
                          GetStreamExecutorGpuClient(GpuClientOptions()));
  // Re-own the client through its concrete type.
  auto gpu_client = absl::WrapUnique(
      tensorflow::down_cast<StreamExecutorGpuClient*>(base_client.release()));

  // Target configuration derived from the client's default stream executor.
  auto* stream_executor =
      gpu_client->client()->backend().default_stream_executor();
  Compiler::GpuTargetConfig target_config{stream_executor};
  StreamExecutorGpuCompiler compiler(gpu_client->platform_id(),
                                     gpu_client->client()->platform()->id());

  TF_ASSERT_OK_AND_ASSIGN(XlaComputation hlo_computation,
                          GetXlaComputation(kProgram));
  TF_ASSERT_OK_AND_ASSIGN(const PjRtTopologyDescription* topology_desc,
                          gpu_client->GetTopologyDescription());

  xla::CompileOptions compile_options;
  compile_options.gpu_target_config = target_config;
  TF_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<PjRtExecutable> unloaded,
      compiler.Compile(compile_options, hlo_computation, *topology_desc,
                       /*client=*/nullptr));
  EXPECT_THAT(unloaded->GetHloModules(), IsOkAndHolds(SizeIs(1)));

  // Load, execute with no arguments, and check the returned buffer.
  TF_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<PjRtLoadedExecutable> loaded,
      gpu_client->Load(std::move(unloaded), LoadOptions()));
  TF_ASSERT_OK_AND_ASSIGN(
      std::vector<std::vector<std::unique_ptr<PjRtBuffer>>> outputs,
      loaded->Execute(/*argument_handles=*/{{}}, {}));
  ValidateResult(outputs);
}
// Compiles kProgram with no client passed to the compiler, serializes the
// resulting (unloaded) executable, and checks that the GPU client can load the
// serialized bytes and run them to the expected result.
TEST(StreamExecutorGpuCompilerTest, SuccessLoadFromSerializedExecutable) {
  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<PjRtClient> base_client,
                          GetStreamExecutorGpuClient(GpuClientOptions()));
  // Re-own the client through its concrete type.
  auto gpu_client = absl::WrapUnique(
      tensorflow::down_cast<StreamExecutorGpuClient*>(base_client.release()));
  StreamExecutorGpuCompiler compiler(gpu_client->platform_id(),
                                     gpu_client->client()->platform()->id());

  // Target configuration derived from the client's default stream executor.
  xla::CompileOptions compile_options;
  auto* stream_executor =
      gpu_client->client()->backend().default_stream_executor();
  compile_options.gpu_target_config =
      Compiler::GpuTargetConfig(stream_executor);

  TF_ASSERT_OK_AND_ASSIGN(XlaComputation hlo_computation,
                          GetXlaComputation(kProgram));
  TF_ASSERT_OK_AND_ASSIGN(const PjRtTopologyDescription* topology_desc,
                          gpu_client->GetTopologyDescription());
  TF_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<PjRtExecutable> unloaded,
      compiler.Compile(compile_options, hlo_computation, *topology_desc,
                       /*client=*/nullptr));
  EXPECT_THAT(unloaded->GetHloModules(), IsOkAndHolds(SizeIs(1)));

  // Round-trip through the serialized form, then load and execute it.
  TF_ASSERT_OK_AND_ASSIGN(std::string serialized_bytes,
                          unloaded->SerializeExecutable());
  TF_ASSERT_OK_AND_ASSIGN(
      auto loaded,
      gpu_client->LoadSerialized(serialized_bytes, std::nullopt,
                                 LoadOptions()));
  TF_ASSERT_OK_AND_ASSIGN(
      auto outputs, loaded->Execute(/*argument_handles=*/{{}}, {}));
  ValidateResult(outputs);
}
// HLO text for a module named "Identity" whose entry computation's root is its
// single s32[1] parameter. The module name is what
// SuccessSerializeDeserialize checks after the round trip.
constexpr absl::string_view kProgramIdentity = R"(HloModule Identity
ENTRY main {
ROOT Arg_0.1 = s32[1]{0} parameter(0)
})";
// Compiles and loads kProgramIdentity, serializes the loaded executable
// through the client, deserializes it again, and checks that the deserialized
// executable still reports the module name "Identity".
TEST(StreamExecutorGpuCompilerTest, SuccessSerializeDeserialize) {
  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<PjRtClient> base_client,
                          GetStreamExecutorGpuClient(GpuClientOptions()));
  // Re-own the client through its concrete type.
  auto gpu_client = absl::WrapUnique(
      tensorflow::down_cast<StreamExecutorGpuClient*>(base_client.release()));
  StreamExecutorGpuCompiler compiler(gpu_client->platform_id(),
                                     gpu_client->client()->platform()->id());

  // Target configuration derived from the client's default stream executor.
  xla::CompileOptions compile_options;
  auto* stream_executor =
      gpu_client->client()->backend().default_stream_executor();
  compile_options.gpu_target_config =
      Compiler::GpuTargetConfig(stream_executor);

  TF_ASSERT_OK_AND_ASSIGN(XlaComputation identity_computation,
                          GetXlaComputation(kProgramIdentity));
  TF_ASSERT_OK_AND_ASSIGN(const PjRtTopologyDescription* topology_desc,
                          gpu_client->GetTopologyDescription());
  TF_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<PjRtExecutable> unloaded,
      compiler.Compile(compile_options, identity_computation, *topology_desc,
                       /*client=*/nullptr));
  EXPECT_THAT(unloaded->GetHloModules(), IsOkAndHolds(SizeIs(1)));
  TF_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<PjRtLoadedExecutable> loaded,
      gpu_client->Load(std::move(unloaded), LoadOptions()));

  // Both the serialize and the deserialize step must succeed.
  TF_ASSERT_OK_AND_ASSIGN(std::string serialized_bytes,
                          gpu_client->SerializeExecutable(*loaded));
  TF_ASSERT_OK_AND_ASSIGN(
      auto round_tripped,
      gpu_client->LoadSerializedExecutable(serialized_bytes, std::nullopt,
                                           LoadOptions()));
  EXPECT_EQ(round_tripped->GetExecutable()->name(), "Identity");
}
// HLO text for a module whose entry computation takes one s32[4] parameter
// and returns a two-element tuple: the parameter itself and the result of an
// "annotate_device_placement" custom call on it tagged with
// _xla_buffer_placement="pinned_host".
constexpr const char* kD2HProgramTupleOutput = R"(
HloModule f
ENTRY main.5 {
p = s32[4]{0} parameter(0)
cc = s32[4] custom-call(p),
custom_call_target="annotate_device_placement",
frontend_attributes={_xla_buffer_placement="pinned_host"}
ROOT tuple = (s32[4]{0}, s32[4]{0}) tuple(s32[4]{0} p, s32[4]{0} cc)
}
)";
// Compiles kD2HProgramTupleOutput with no client passed to the compiler and
// checks the memory statistics reported by the unloaded executable, including
// the host-side output size for the tuple element placed in host memory.
TEST(StreamExecutorGpuCompilerTest, UnloadedExecutableMemoryStats) {
  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<PjRtClient> base_client,
                          GetStreamExecutorGpuClient(GpuClientOptions()));
  // Re-own the client through its concrete type.
  auto gpu_client = absl::WrapUnique(
      tensorflow::down_cast<StreamExecutorGpuClient*>(base_client.release()));
  StreamExecutorGpuCompiler compiler(gpu_client->platform_id(),
                                     gpu_client->client()->platform()->id());

  // Target configuration derived from the client's default stream executor.
  xla::CompileOptions compile_options;
  auto* stream_executor =
      gpu_client->client()->backend().default_stream_executor();
  compile_options.gpu_target_config =
      Compiler::GpuTargetConfig(stream_executor);

  // Result layout: a tuple of two s32[4] shapes, the second one marked as
  // living in the host memory space.
  const Shape base_shape = ShapeUtil::MakeShapeWithDenseLayout(S32, {4}, {0});
  Shape host_shape = base_shape;
  host_shape.mutable_layout()->set_memory_space(Layout::kHostMemorySpace);
  compile_options.executable_build_options.set_result_layout(
      ShapeUtil::MakeTupleShape({base_shape, host_shape}));

  TF_ASSERT_OK_AND_ASSIGN(XlaComputation d2h_computation,
                          GetXlaComputation(kD2HProgramTupleOutput));
  TF_ASSERT_OK_AND_ASSIGN(const PjRtTopologyDescription* topology_desc,
                          gpu_client->GetTopologyDescription());
  TF_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<PjRtExecutable> unloaded,
      compiler.Compile(compile_options, d2h_computation, *topology_desc,
                       /*client=*/nullptr));
  EXPECT_THAT(unloaded->GetHloModules(), IsOkAndHolds(SizeIs(1)));

  TF_ASSERT_OK_AND_ASSIGN(CompiledMemoryStats stats,
                          unloaded->GetCompiledMemoryStats());
  EXPECT_EQ(stats.argument_size_in_bytes, 16);
  EXPECT_EQ(stats.output_size_in_bytes, 32);
  EXPECT_GT(stats.temp_size_in_bytes, 0);
  EXPECT_EQ(stats.host_temp_size_in_bytes, 0);
  EXPECT_EQ(stats.host_output_size_in_bytes, 16);
  EXPECT_GT(stats.peak_memory_in_bytes, 0);
}
// Verifies that, with the xla_pjrt_allow_auto_layout_in_hlo debug option
// enabled and the parameter layouts left unspecified in the HLO entry
// computation layout, the compiled executable reports parameter layouts that
// differ from the default "{2,1,0}".
TEST(StreamExecutorGpuCompilerTest, AutoLayoutIsSupported) {
  // The entry_computation_layout below gives no layout for the two
  // parameters; only the result layout is spelled out.
  const char* hlo_text = R"(
HloModule DotLayout,
entry_computation_layout={(f32[2,3,5],f32[3,4,5])->f32[5,2,4]{2,1,0}}
ENTRY dot {
p0 = f32[2,3,5]{2,1,0} parameter(0)
p1 = f32[3,4,5]{2,1,0} parameter(1)
ROOT dot.1330.10585 = f32[5,2,4]{2,1,0} dot(p0, p1),
lhs_batch_dims={2}, lhs_contracting_dims={1},
rhs_batch_dims={2}, rhs_contracting_dims={0}
})";
  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<PjRtClient> client,
                          GetStreamExecutorGpuClient(GpuClientOptions()));
  // Re-own the client through its concrete type.
  auto se_client = absl::WrapUnique(
      tensorflow::down_cast<StreamExecutorGpuClient*>(client.release()));
  StreamExecutorGpuCompiler compiler(se_client->platform_id(),
                                     se_client->client()->platform()->id());
  TF_ASSERT_OK_AND_ASSIGN(const PjRtTopologyDescription* topology,
                          se_client->GetTopologyDescription());

  // Parse without filling in missing layouts so the unspecified parameter
  // layouts stay unspecified.
  TF_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<HloModule> m,
      ParseAndReturnUnverifiedModule(
          hlo_text, {}, HloParserOptions().set_fill_missing_layouts(false)));

  CompileOptions compile_options;
  compile_options.executable_build_options.mutable_debug_options()
      ->set_xla_pjrt_allow_auto_layout_in_hlo(true);
  XlaComputation computation = m->ToProto();
  TF_ASSERT_OK_AND_ASSIGN(
      auto executable, compiler.Compile(compile_options, computation, *topology,
                                        /*client=*/nullptr));

  TF_ASSERT_OK_AND_ASSIGN(auto layouts, executable->GetParameterLayouts());
  // The program has two parameters; make sure both layouts are present before
  // indexing into the vector.
  ASSERT_THAT(layouts, SizeIs(2));
  // Check that the assigned layouts are not default.
  EXPECT_NE(layouts[0]->ToString(), "{2,1,0}");
  EXPECT_NE(layouts[1]->ToString(), "{2,1,0}");
}
} // namespace
} // namespace xla