blob: 00476f41f10638a7bd6e79e58b88484a0dccc717 [file] [log] [blame]
#!/usr/bin/env vpython3
# Copyright 2024 The Chromium Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Script to extract edits from clang spanification tool output.
The edits have the following format:
```
e lhs_node1 rhs_node2 # Edge from node 1 to node 2.
e lhs_node2 rhs_node3 # Edge from node 2 to node 3.
e lhs_node3 rhs_node4 # Edge from node 3 to node 4.
...
s node_1 # Source node of the graph that triggers a
# rewrite (i.e. buffer usage)
...
i node_4 # Sink node. A rewrite from a source
# requires the ultimate end nodes to be
# sink. They represent nodes we know can
# be rewrite because the buffer's size
# is known.
...
f lhs_node rhs_node replacement # Span frontier replacement applied if
# lhs_node is not rewritten but rhs_node
# is.
...
r node_1 replacement # A replacement associated with a node.
```
Where all the `*node*` are abstract ID that represents a node in the graph.
Real example:
```
s 0008244:DBKYJas7
s 0008303:GWkNbhQ4
e 0001450:8-AxbSn3 0008303:GWkNbhQ4
e 0001518:BUQKDaXe 0008244:DBKYJas7
e 0001518:L97i_bwg 0008303:GWkNbhQ4
f 0001450:8-AxbSn3 0008303:GWkNbhQ4 r:::../../base/memory/shared_memory_mapping.h:::8684:::0:::0:::.data()
f 0001518:BUQKDaXe 0008244:DBKYJas7 r:::../../base/memory/shared_memory_mapping.h:::8535:::15:::0:::(data() + size()).data()
f 0001518:L97i_bwg 0008303:GWkNbhQ4 r:::../../base/memory/shared_memory_mapping.h:::8686:::15:::0:::(data() + size()).data()
r 0001518:BUQKDaXe include-user-header:::../../base/containers/checked_iterators.h:::-1:::-1:::base/containers/span.h
r 0001518:BUQKDaXe r:::../../base/containers/checked_iterators.h:::1518:::9:::0:::base::span<const unsigned char>
r 0001946:gKWdIpwv r:::../../base/containers/checked_iterators.h:::1946:::9:::0:::base::span<const unsigned char>
```
**Important Note on "r:::" Replacement Directive:**
The `replacement_directive` strings starting with `r:::` (which can appear in
`r` lines or as the third argument in `f` lines) have been extended and are
slightly different from the format accepted by apply_edits.py. There is an
additional `precedence` field before the replacement text.
This `<precedence>` value is used by `extract_edits.py` to merge conflicting
insertions. If multiple `r` directives are insertions (i.e., `<length>` is
"0") and target the exact same file and offset:
Their `<text>` components are merged into a single replacement.
Directives with a lower numerical `<precedence>` value have their text
inserted *earlier* (further to the left) in the final merged text.
The `<precedence>` field is **removed** by `extract_edits.py` from all `r`
directives before they are included in the final list of edits output by this
script.
extract_edits.py takes input that is concatenated from multiple tool
invocations and extract just the edits with the following steps:
1- Construct the adjacency list of nodes
(a pairs of nodes represents an edge in the directed graph)
2- Determine whether size info is available for a given source node.
3- Run `DFS` starting from source nodes whose size info is available and emit
edits for reachable nodes.
4- Adapt dereference expressions and add data changes where necessary.
extract_edits.py would then emit the following output:
<edit1>
<edit2>
<edit3>
...
Where the edit is either a replacement or an include directive.
For more details about how the tool works, see the doc here:
https://docs.google.com/document/d/1hUPe21CDdbT6_YFHl03KWlcZqhNIPBAfC-5N5DDY2OE/
"""
import sys
import urllib.parse
from os.path import expanduser
import pprint
from collections import defaultdict
# The connected components in the graph. This is useful to split the rewrite
# into atomic changes.
class Component:
all = set()
def __init__(self) -> None:
# Changes associated with the connected component.
self.changes = set()
# Frontier changes are either accepted or rejected. The two dictionaries
# are used to detect conflicts in the frontier changes. This might
# happen in rare cases where C++ macros are used. In case of conflict,
# the whole component is discarded.
self.frontier_changes_accepted = set()
self.frontier_changes_rejected = set()
# `Component.all` can be used to iterate over all components.
Component.all.add(self)
class Node:
# Mapping in between the node's key and the node.
key_to_node = dict()
def __init__(self, key) -> None:
self.key = key
self.replacements = set()
# Neighbors of the node in the graph. The graph is directed,
# flowing from lhs to rhs.
self.neighbors_directed = set()
self.neighbors_undirected = set()
# Property to track whether the node is "connected" to a source node.
# This is set from DFS(...)
self.visited = False
# The size info is available for a node if all the paths through the
# graph are leading to a sink. This is initially set to None, and then
# set by ComputeSizeInfoAvailable(...).
self.size_info_available = None
# Track whether this node is currently on the stack of the
# `ComputeSizeInfoAvailable` recursive function. This is used to detect
# cycles in the graph.
self.size_info_visiting = False
# Identify the connected component this node belongs to. This is set in
# the main function.
self.component = None
def add_replacement(self, replacement: str):
assert_valid_replacement(replacement)
self.replacements.add(replacement)
# Static method to get a node from a replacement key.
@classmethod
def from_key(cls: type, key: str):
# Deduplicate nodes, as they will appear multiple times in the input.
node = Node.key_to_node.get(key)
if node is not None:
return node
node = Node(key)
Node.key_to_node[key] = node
return node
def __repr__(self) -> str:
result = [
f"Node {hash(self)} {{",
f" key: {self.key}",
f" size_info_available: {self.size_info_available}",
f" neighbors_directed: {pprint.pformat([hash(n) for n in self.neighbors_directed], indent=4)}",
"}",
]
return "\n".join(result)
# This is not parsable by from_key but is useful for debugging the
# graph of nodes.
def to_debug_string(self) -> str:
return repr(self)
# Static method to get all nodes.
@classmethod
def all(cls: type):
return cls.key_to_node.values()
def DFS(node: Node):
"""
Explore the graph in depth-first search from the given node. Identify edits
to apply.
Args:
node: The current node being processed.
"""
# Only visit nodes once:
if (node.visited):
return
node.visited = True
for replacement in node.replacements:
node.component.changes.add(replacement)
for neighbour in node.neighbors_directed:
DFS(neighbour)
def ComputeSizeInfoAvailable(node: Node):
"""
Determines whether size information is available for a source node and its
neighbors_directed. Updates the node's size_info_available attribute.
Args:
node: The current node's being processed.
"""
# Memoization: node.size_info_available has already been computed. Return.
if node.size_info_available:
return
# If there are no dependencies, the size info is definitely not available
# for this node.
if not node.neighbors_directed:
node.size_info_available = False
return
# Cycle: If the node is currently being visited, it means it depends on
# itself, and there's a cycle. We can't determine the size info for this
# node with the current implementation.
if node.size_info_visiting:
return
# The size info is available for a node if all the paths through the graph
# are leading to a sink. Locally, it means all the dependencies have their
# size info available.
node.size_info_visiting = True
for neighbour in node.neighbors_directed:
ComputeSizeInfoAvailable(neighbour)
node.size_info_visiting = False
# This node can be rewritten if all of its dependencies can.
# Dependencies with `size_info_available == None` are nodes that are part
# of an isolated cycle. Isolated cycle are rewritten.
node.size_info_available = not any(
neighbour.size_info_available == False
for neighbour in node.neighbors_directed)
# Assert a replacement follows the expected format:
# - r:::<file path>:::<offset>:::<length>:::<replacement text>
# - include-user-header:::<file path>:::-1:::-1:::<include text>
# - include-system-header:::<file path>:::-1:::-1:::<include text>
def assert_valid_replacement(replacement: str):
try:
parts = replacement.split(':::')
directive_type = parts[0]
assert directive_type in [
'r', 'include-user-header', 'include-system-header'
], f"Unknown directive type '{directive_type}'"
assert len(parts) > 1
assert parts[1] != '', "File path must not be empty."
if directive_type == 'r':
assert len(
parts
) == 6, f"Directive 'r' must have 6 parts, got {len(parts)}"
# Validate offset.
assert parts[2].isdigit()
# Validate length.
assert parts[3].isdigit()
# Validate precedence. Can be a negative or positive integer.
try:
int(parts[4]) # Check if it's a valid integer representation
except ValueError:
raise AssertionError(
f"Precedence '{parts[4]}' must be a valid integer string.")
else:
assert len(parts) == 5
except:
# Augment the error with the replacement text for better debugging.
assert False, f"Invalid replacement: \"{replacement}\""
def merge_insertions_and_remove_precedence_field(changes: set) -> set:
"""
Merges conflicting insertions at the same code location.
The merge order is determined as follows:
Handles "associativity" by grouping insertions based on precedence sign.
The final text is formed by concatenating all positive-precedence
insertions (i.e., "closing" parts), then zero-precedence, then all
negative-precedence insertions ("opening" parts). This ensures that
a closing bracket from a left expression is placed before an opening
bracket from a right expression (e.g., `...>[...`).
Also removes the precedence field from the final replacement directives.
"""
replacements_by_range = defaultdict(list)
result = set()
for change in changes:
assert_valid_replacement(change)
parts = change.split(':::')
directive_type = parts[0]
if directive_type == 'r':
_, file_path, offset, length, precedence, text = parts
precedence = int(precedence)
# Key identifies the exact code range being replaced
key = (file_path, offset, length)
replacements_by_range[key].append((precedence, text))
else:
result.add(change)
for key, candidates in replacements_by_range.items():
assert candidates, "A key should always have at least one candidate."
file_path, offset, length = key
if len(candidates) == 1:
# No conflict.
_, text = candidates[0]
reconstructed_directive = f"r:::{file_path}:::{offset}:::{length}:::{text}"
result.add(reconstructed_directive)
continue
if int(length) == 0:
# Conflicting insertion detected.
# Assert uniqueness of precedence values to ensure determinism.
precedences = [p for p, _ in candidates]
assert len(precedences) == len(
set(precedences)
), "Conflicting insertions need to have unique precedece values."
merged_texts = [t for _, t in sorted(candidates)]
reconstructed_directive = f"r:::{file_path}:::{offset}:::{length}:::{''.join(merged_texts)}"
result.add(reconstructed_directive)
else:
# Conflicting non-insertion replacement. This is an unresolvable
# conflict for now. Just remove the precedence field.
for _, text in candidates:
reconstructed_directive = f"r:::{file_path}:::{offset}:::{length}:::{text}"
result.add(reconstructed_directive)
return result
def main():
# Since the tool is invoked from multiple compile units, we are using sets
# to deduplicate what was visible from multiple compile units.
# A set of source nodes that trigger the rewrite.
sources = set()
# A set of sink nodes. A rewrite from a source requires all the end nodes
# to be sink. They represent nodes where the rewrite could be applied,
# because the size info is available.
sinks = set()
# Change to apply at the edge in between rewritten and non-rewritten nodes.
frontiers = set()
# Collect from every compile units the nodes and edges of the graph:
for line in sys.stdin:
line = line.rstrip('\n\r')
# The first character of the line denotes the type of the line:
# - 'r': Replacement associated with a node.
# - 'e': Edge in between two nodes.
# - 's': Source node of the graph triggering the rewrite.
# - 'f': Span frontier change.
# - 'i': Sink node. A rewrite from a source requires the ultimate end
# nodes to be sink. They represent nodes we know can be rewrite
# because the buffer's size is known.
assert line[0] in ['r', 'e', 's', 'i', 'f'], "Unknown line type: " +\
line[0] + " in line: " + line
# Replacement associated with a node:
if line[0] == 'r':
(_, key, replacement) = line.split(' ', 2)
Node.from_key(key).add_replacement(replacement)
continue
# Sink node:
if line[0] == 'i':
(_, key) = line.split(' ')
sinks.add(key)
continue
# Source node:
if line[0] == 's':
(_, key) = line.split(' ')
sources.add(key)
continue
# Edge in between two nodes:
if line[0] == 'e':
(_, lhs_key, rhs_key) = line.split(' ')
lhs = Node.from_key(lhs_key)
rhs = Node.from_key(rhs_key)
# Directed edge:
lhs.neighbors_directed.add(rhs)
# Undirected edge:
lhs.neighbors_undirected.add(rhs)
rhs.neighbors_undirected.add(lhs)
continue
# Span frontier change:
if line[0] == 'f':
frontiers.add(line)
continue
assert False, "Unreachable code"
# Mark the sink nodes as rewritable.
for sink in sinks:
Node.from_key(sink).size_info_available = True
# Mark the source nodes:
source_nodes = []
for source in sources:
source_node = Node.from_key(source)
source_nodes.append(source_node)
# Determine whether size information is available from this source.
ComputeSizeInfoAvailable(source_node)
# Identify all the connected components in the undirected graph. This is
# exploring the graph in depth-first search and assigning the same component
# to each node in the connected component.
for node in Node.all():
if node.component is not None:
continue
new_component = Component()
stack = [node]
while stack:
current = stack.pop()
if current.component is not None:
continue
current.component = new_component
for neighbor in current.neighbors_undirected:
stack.append(neighbor)
# Collect the changes to apply. Starting from sources nodes whose size info
# could be determined.
for node in source_nodes:
# Collect the changes to apply. We start from sources nodes whose size
# info is available and explore the graph in depth-first search.
if node.size_info_available:
DFS(node)
# At the edge in between rewritten and non-rewritten nodes, we need
# to add a call to `.data()` to access the pointer from the span:
for frontier in frontiers:
(_, lhs_key, rhs_key, replacement) = frontier.split(' ', 3)
lhs_node = Node.from_key(lhs_key)
rhs_node = Node.from_key(rhs_key)
apply_frontier = rhs_node.visited and not lhs_node.visited
if apply_frontier:
lhs_node.component.frontier_changes_accepted.add(replacement)
else:
lhs_node.component.frontier_changes_rejected.add(replacement)
# Do or do not, there is no try. Discard components with conflicting
# frontier changes. This happens in rare cases where C++ macros are used.
# The whole component is discarded in case of conflict, because we can't
# satisfy the constraints.
for component in Component.all:
if component.frontier_changes_accepted & component.frontier_changes_rejected:
component.changes.clear()
continue;
component.changes |= component.frontier_changes_accepted
# Emit the changes:
# - ~/scratch/patches.txt: A summary of each atomic change.
# - ~/scratch/patch_<patch_index>: Write each atomic change.
# - stdout: Print a bundle of all the changes. This is usually piped to
# "./tools/clang/scripts/apply_edits.py" to apply the changes.
summary_filename = expanduser('~/scratch/patches.txt')
summary_file = open(summary_filename, 'w')
component_with_changes = [
component for component in Component.all if len(component.changes) > 0
]
for index, component in enumerate(component_with_changes):
merged_component_changes = merge_insertions_and_remove_precedence_field(
component.changes)
for text in merged_component_changes:
print(text)
summary_file.write(f'patch_{index}: {len(merged_component_changes)}\n')
with open(expanduser(f'~/scratch/patch_{index}.txt'), 'w') as f:
f.write('\n'.join(merged_component_changes))
summary_file.close()
return 0
if __name__ == '__main__':
sys.exit(main())