/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_CONV_ALGORITHM_PICKER_H_
#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_CONV_ALGORITHM_PICKER_H_
#include <optional>
#include <string>
#include <string_view>
#include <variant>
#include <vector>
#include "absl/time/time.h"
#include "tensorflow/compiler/xla/autotune_results.pb.h"
#include "tensorflow/compiler/xla/hlo/ir/hlo_instructions.h"
#include "tensorflow/compiler/xla/hlo/ir/hlo_module.h"
#include "tensorflow/compiler/xla/service/compiler.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_conv_runner.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_serializable_autotuner.h"
#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
#include "tensorflow/compiler/xla/stream_executor/device_memory_allocator.h"
#include "tensorflow/compiler/xla/stream_executor/stream_executor.h"
#include "tensorflow/tsl/protobuf/autotuning.pb.h"
#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA)
#include "tensorflow/compiler/xla/stream_executor/gpu/redzone_allocator.h"
#endif
namespace xla {
namespace gpu {
// Modifies CustomCalls to cudnn convolutions, choosing the best algorithm for
// each and adding explicit scratch space to the CustomCalls.
//
// It supports two modes: device and deviceless.
//
// In device mode, we run autotuning on the attached device and store the
// autotune results for later reuse.
//
// In deviceless mode, we are given a description of the target device instead
// of the device itself, and we rewrite convolutions using previously stored
// autotune results. If the required autotune result is not stored, the
// convolution may end up with a suboptimal algorithm.
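//
// A minimal usage sketch for device mode (DeviceConfig and HloPassPipeline
// come from the surrounding XLA codebase; the exact shape of AutotuningConfig
// may vary between versions):
//
//   se::StreamExecutor* stream_exec = ...;  // executor for the target GPU
//   se::DeviceMemoryAllocator* allocator = ...;
//   AutotuningConfig config = DeviceConfig{stream_exec, allocator};
//   HloPassPipeline pipeline("gpu-conv-autotuning");
//   pipeline.AddPass<GpuConvAlgorithmPicker>(config);
//   TF_RETURN_IF_ERROR(pipeline.Run(module).status());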
class GpuConvAlgorithmPicker : public HloModulePass {
public:
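// Manages the process-wide cache of autotune results: WriteAutotuneResults
// serializes the cache into `results`, LoadAutotuneResults restores the cache
// from a previously serialized `results`, and ClearAutotuneResults empties it.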
static void ClearAutotuneResults();
static Status WriteAutotuneResults(AutotuneResults* results);
static Status LoadAutotuneResults(const AutotuneResults& results);
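//
// A sketch of the save/restore round trip (how the proto is persisted
// between runs is up to the caller):
//
//   AutotuneResults results;
//   TF_RETURN_IF_ERROR(GpuConvAlgorithmPicker::WriteAutotuneResults(&results));
//   // ... persist results.SerializeAsString(), reload it in a later run ...
//   TF_RETURN_IF_ERROR(GpuConvAlgorithmPicker::LoadAutotuneResults(results));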
explicit GpuConvAlgorithmPicker(AutotuningConfig config) : config_(config) {}
absl::string_view name() const override {
return "gpu-conv-algorithm-picker";
}
using HloPassInterface::Run;
StatusOr<bool> Run(
HloModule* module,
const absl::flat_hash_set<absl::string_view>& execution_threads) override;
private:
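// Runs the picker on every eligible convolution custom-call in `computation`;
// returns true if any instruction was changed.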
StatusOr<bool> RunOnComputation(HloComputation* computation);
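// Autotunes a single convolution custom-call and rewrites it in place with
// the chosen algorithm and an explicit scratch-space allocation.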
StatusOr<bool> RunOnInstruction(HloInstruction* instr);
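// Returns the best algorithm for `instr`, consulting the autotune cache
// before running any measurements on the device.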
StatusOr<tensorflow::AutotuneResult> PickBestAlgorithm(
const HloCustomCallInstruction* instr);
#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA)
// Simple bundle of an algorithm and its output, for comparing results across
// autotuned algorithms.
struct ReferenceResult {
stream_executor::dnn::AlgorithmDesc algorithm;
stream_executor::DeviceMemoryBase buffer;
};
// Debug information about the instruction we are autotuning.
struct AutotuneInstructionInfo {
std::string instr_str;
std::string module_str;
explicit AutotuneInstructionInfo(const HloCustomCallInstruction* instr)
: instr_str(instr->ToString()),
module_str(instr->GetModule()->ToString()) {}
};
// Execution environment for autotuning. Runtime autotuning needs runtime
// information, such as input/output buffers, in order to run candidate
// algorithms. An instance can be built from the instruction being autotuned
// via FromInstruction.
struct AutotuneRuntimeArguments {
const Shape result_shape;
const HloModuleConfig hlo_module_config;
std::vector<se::DeviceMemoryBase> operand_buffers;
se::DeviceMemoryBase result_buffer;
se::RedzoneAllocator* input_output_allocator;
const GpuConvConfig gpu_conv_config;
std::string canonical_hlo;
static StatusOr<AutotuneRuntimeArguments> FromInstruction(
const HloCustomCallInstruction* instr,
se::DeviceMemoryAllocator* allocator, se::StreamExecutor* stream,
se::RedzoneAllocator* input_output_allocator);
};
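// For example (a sketch, with error handling elided):
//
//   TF_ASSIGN_OR_RETURN(
//       AutotuneRuntimeArguments runtime_arguments,
//       AutotuneRuntimeArguments::FromInstruction(
//           instr, allocator, stream_exec, &input_output_allocator));

// Measures a single candidate conv runner. The first candidate that runs
// successfully populates `reference_result`; subsequent candidates are
// compared against it to detect miscompares. Algorithms listed in
// `disabled_algos` are skipped.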
StatusOr<tensorflow::AutotuneResult> AutotuneOneConvRunner(
se::DeviceMemoryAllocator* allocator, se::Stream* stream,
MaybeFusedConvRunner* const runner,
std::optional<ReferenceResult>* reference_result,
absl::Span<const stream_executor::dnn::AlgorithmDesc> disabled_algos,
std::optional<AutotuneInstructionInfo> instruction_info,
const AutotuneRuntimeArguments& runtime_arguments);
// Picks the best algorithm for the CUDA platform.
StatusOr<tensorflow::AutotuneResult> PickBestAlgorithmNoCacheCuda(
const HloCustomCallInstruction* instr,
se::DeviceMemoryAllocator* allocator, se::Stream* stream,
std::optional<AutotuneInstructionInfo> instruction_info,
const AutotuneRuntimeArguments& runtime_arguments);
#endif
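// Picks the best algorithm for the ROCm platform.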
StatusOr<tensorflow::AutotuneResult> PickBestAlgorithmNoCacheRocm(
const HloCustomCallInstruction* instr,
se::DeviceMemoryAllocator* allocator, se::Stream* stream);
private:
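// Device or deviceless configuration controlling how autotuning runs.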
AutotuningConfig config_;
};
} // namespace gpu
} // namespace xla
#endif // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_CONV_ALGORITHM_PICKER_H_