blob: c806bd1fe71cd433b08fc96b991517d0edd109b3 [file] [log] [blame]
// Copyright 2012 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// A class that attempts to disassemble a function into basic blocks.
//
// Given a function block (dubbed macro block), this disassembler attempts to
// cut it up into sequences of contiguous instruction runs and data blocks. A
// contiguous instruction run is defined as a set of instructions that under
// normal operation will always run from start to end. This class requires that
// all external references to addresses within a function block have an
// associated label.
#ifndef SYZYGY_BLOCK_GRAPH_BASIC_BLOCK_DECOMPOSER_H_
#define SYZYGY_BLOCK_GRAPH_BASIC_BLOCK_DECOMPOSER_H_
#include <set>
#include <string>
#include "base/callback.h"
#include "base/strings/string_piece.h"
#include "syzygy/block_graph/basic_block.h"
#include "syzygy/block_graph/basic_block_subgraph.h"
#include "syzygy/block_graph/block_graph.h"
#include "syzygy/core/address.h"
#include "syzygy/core/disassembler.h"
#include "distorm.h" // NOLINT
namespace block_graph {
// This class re-disassembles an already-processed code block (referred to
// herein as a macro block) and breaks it up into basic blocks.
//
// A basic block is defined here as one of:
//
// 1) A series of code instructions that will be executed contiguously.
// 2) A chunk of data (as determined by it being labeled as such).
// 3) Padding (or unreachable code)
//
// The break-down into basic blocks happens in six passes:
//
// 1) Code disassembly starting from the block's inbound code references. This
// carves all of the basic code blocks and creates control flow (successor)
// relationships. While the basic blocks are being created, intra-block
// successors cannot be resolved and are instead referenced by block offset;
// inter-block successors are immediately resolved.
// 2) Data block construction to carve out statically embedded data.
// 3) Padding block construction to fill any gaps.
// 4) Copying all inbound references (referrers) to their corresponding
// basic block.
// 5) Copying all references originating in the block to their corresponding
// basic block.
// 6) Wiring up all unresolved intra-block successors.
//
// The block to be decomposed must have been produced by a successful
// PE decomposition (or some facsimile thereof).
class BasicBlockDecomposer {
public:
typedef core::AbsoluteAddress AbsoluteAddress;
typedef block_graph::BasicBlock BasicBlock;
typedef BasicBlock::BasicBlockType BasicBlockType;
typedef block_graph::BlockGraph BlockGraph;
typedef BlockGraph::Offset Offset;
typedef block_graph::BasicBlockSubGraph BasicBlockSubGraph;
// Initialize the BasicBlockDecomposer instance.
// @param block The block to be decomposed
// @param subgraph The basic-block sub-graph data structure to populate.
// This can be NULL if the results of the decomposition aren't
// necessary.
BasicBlockDecomposer(const BlockGraph::Block* block,
BasicBlockSubGraph* subgraph);
// Decomposes a function macro block into its constituent basic blocks.
//
// Immediately following a successful decomposition of a block to
// basic-blocks, subgraph will contain all the basic-blocks found in the
// source block and exactly one block description: that of the source block.
//
// Following decomposition, additional block descriptions can be created,
// new basic blocks added, and basic blocks shuffled between the descriptions.
// The subgraph can then be coalesced back into the BlockGraph from
// which the original block came.
//
// This can set various status bits indicating the reason for the failure.
bool Decompose();
// @returns true if the decomposition failed because of unsupported
// instructions.
bool contains_unsupported_instructions() const {
return contains_unsupported_instructions_;
}
protected:
typedef std::map<Offset, BasicBlockReference> BasicBlockReferenceMap;
typedef core::AddressSpace<Offset, size_t, BasicBlock*> BBAddressSpace;
typedef BlockGraph::Block::SourceRange SourceRange;
typedef BlockGraph::Size Size;
typedef std::set<Offset> JumpTargets;
// Returns the source range that coincides with the data range
// [@p offset, [@p offset + @p size) in the original block.
SourceRange GetSourceRange(Offset offset, Size size) const;
// Find the basic block, and corresponding byte-range, that contains the
// given offset.
// @param offset the starting offset you with the returned basic-block/range
// to contain.
// @param basic_block the basic-block containing @p offset.
// @param range the byte-range in which @p basic_offset resides, which
// contains @p offset.
bool FindBasicBlock(Offset offset,
BasicBlock** basic_block,
BBAddressSpace::Range* range) const;
// Find the basic block that begins at the given offset.
// @param offset The starting offset of the basic block you want to find.
// @pre The basic block subgraph is derived from an original block (and
// thus has an address space) and has been broken down into all of its
// constituent basic blocks (i.e., post disassembly and basic-block
// splitting).
BasicBlock* GetBasicBlockAt(Offset offset) const;
// Helper function to end the current basic block and begin a new one
// at @p offset. This is only to be used during disassembly.
bool EndCurrentBasicBlock(Offset end_offset);
// Walk the function's code in a linear fashion, decomposing the block into
// code and data basic blocks forming an original address space.
bool Disassemble();
// Determines the code range of the block, and creates any data blocks. This
// will return false if an invalid block layout is encountered.
// @param end Will be filled in with the end of the code range.
bool GetCodeRangeAndCreateDataBasicBlocks(Offset* end);
// Performs an initial linear pass at disassembling the code bytes of the
// block into rudimentary basic blocks. The initial set of basic blocks are
// each terminated at branch points. A subsequent pass will further split
// basic blocks at branch destinations, see SplitCodeBlocksAtBranchTargets().
bool ParseInstructions();
// @name Helpers for ParseInstructions().
// @{
// Initializes jump_targets_ to the set of referenced code locations.
// This covers all locations which are externally referenced, as well as
// those that are internally referenced via jump tables. These jump targets
// may be otherwise un-discoverable through disassembly.
// @param code_end_offset the first offset above @p code_begin_offset at
// which the bytes are no longer code.
void InitJumpTargets(Offset code_end_offset);
// Decode the bytes at @p offset into @p instruction. This function takes
// into consideration the range of offsets which denote code.
// @param offset The offset of into block_ at which to start decoding.
// @param code_end_offset The offset at which the bytes cease to be code.
// @param instruction this value will be populated on success.
// @returns true on success; false otherwise.
// @note Used by ParseInstructions().
bool DecodeInstruction(Offset offset,
Offset code_end_offset,
Instruction* instruction) const;
// Called for each instruction, this creates the Instruction object
// corresponding to @p instruction, or terminates the current basic block
// if @p instruction is a branch point.
// @param instruction the decoded instruction.
// @param offset the offset at which @p instruction occurs in the block.
// @returns true on success; false otherwise.
// @note Used by ParseInstructions().
bool HandleInstruction(const Instruction& instruction, Offset offset);
// @}
// @name Validation functions.
// @{
// Verifies that every identified jump target in the original code block
// resolves to the start of a basic code block in the original code blocks
// basic-block address space. This is protected for unit-testing purposes.
void CheckAllJumpTargetsStartABasicCodeBlock() const;
// Verifies that the address space derived from the original code block
// fully covers the original code block. This is protected for unit-testing
// purposes.
void CheckHasCompleteBasicBlockCoverage() const;
// Verifies that all basic blocks in the address space derived from the
// original code block have valid successors or end in an instruction that
// does not yield successors. This is protected for unit-testing purposes.
void CheckAllControlFlowIsValid() const;
// Verifies that all labels in the original block are present in the
// decomposed basic-block subgraph.
void CheckAllLabelsArePreserved() const;
// @}
// Split code basic-blocks at branch targets such that no basic-block
// has a reference that it not to its head.
void SplitCodeBlocksAtBranchTargets();
// Propagate the referrers from the original block into the basic blocks
// so that referrers can be tracked as the basic blocks are manipulated.
void CopyExternalReferrers();
// Helper function to populate @p refs with the set of references originating
// from its source range in the original block.
void CopyReferences(Offset item_offset,
Size item_size,
BasicBlockReferenceMap* refs);
// Propagate the references from the original block into the basic blocks
// so that they can be tracked as the basic blocks are manipulated.
void CopyReferences();
// Resolve intra-block control flow references and referrers.
void ResolveSuccessors();
// Convert any unreachable code basic block into padding basic blocks.
void MarkUnreachableCodeAsPadding();
// Inserts a basic block range into the decomposition.
bool InsertBasicBlockRange(Offset offset,
size_t size,
BasicBlockType type);
// The block being disassembled.
const BlockGraph::Block* const block_;
// The basic-block sub-graph to which the block will be decomposed.
BasicBlockSubGraph* subgraph_;
// The layout of the original block into basic blocks in subgraph_.
BBAddressSpace original_address_space_;
// Tracks locations our conditional branches jump to. Used to fix up basic
// blocks by breaking up those that have a jump target in the middle.
JumpTargets jump_targets_;
// The start offset of the current basic block during a walk.
Offset current_block_start_;
// The list of instructions in the current basic block.
BasicBlock::Instructions current_instructions_;
// The set of successors for the current basic block.
BasicBlock::Successors current_successors_;
// A debugging flag indicating whether the decomposition results should be
// CHECKed.
bool check_decomposition_results_;
// If no explicit subgraph was provided then we need to use one as scratch
// space in order to do some work.
std::unique_ptr<BasicBlockSubGraph> scratch_subgraph_;
// Decomposition failure flags.
bool contains_unsupported_instructions_;
};
} // namespace block_graph
#endif // SYZYGY_BLOCK_GRAPH_BASIC_BLOCK_DECOMPOSER_H_