blob: 0df0fc48454e651717380feceb4a6297b4e6c5da [file] [log] [blame]
#!/usr/bin/env vpython3
# Copyright 2024 The Chromium Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Script to extract edits from clang spanification tool output.
The edits have the following format:
...
{lhs_node1};{rhs_node1}
{node_n}
{lhs_node2};{rhs_node2}
...
...
Where lhs_node, rhs_node, and node_n represent a node's text representation
generated using the spanification tool's Node::ToString() function.
The string representation has the following format:
`{is_buffer\,r:::<file path>:::<offset>:::<length>
:::<replacement text>\,include-user-header:::<file path>:::-1:::-1
:::<include text>\,size_info_available\,is_data_change\,is_deref_node}`
where `is_buffer`,`size_info_available`, `is_data_change` and `is_deref_node`
are booleans represented as 0 or 1.
extract_edits.py takes input that is concatenated from multiple tool
invocations and extract just the edits with the following steps:
1- Construct the adjacency list of nodes
(a pairs of nodes represents an edge in the directed graph)
2- Determine whether size info is available for a given buffer node.
3- Run `DFS` starting from buffer nodes whose size info is available and emit
edits for reachable nodes.
4- Adapt dereference expressions and add data changes where necessary.
extract_edits.py would then emit the following output:
<edit1>
<edit2>
<edit3>
...
Where the edit is either a replacement or an include directive.
For more details about how the tool works, see the doc here:
https://docs.google.com/document/d/1hUPe21CDdbT6_YFHl03KWlcZqhNIPBAfC-5N5DDY2OE/
"""
import sys
import resource
from os.path import expanduser
class Node:
# Mapping in between the replacement directive and the node.
key_to_node = dict()
def __init__(self, is_buffer, replacement, include_directive,
size_info_available, is_deref_node, is_data_change) -> None:
self.is_buffer = is_buffer
self.replacement = replacement
self.include_directive = include_directive
# We need to also rewrite deref expressions of the form:
# |*buf = something;| into |buf[0] = something;|
# for that, create a link between buf and the deref expression.
self.is_deref_node = is_deref_node
self.is_data_change = is_data_change
# Neighbors of the node in the graph.
self.neighbors = set()
# Property to tracker whether the node is "connected" to a buffer node.
# This is set from DFS(...)
self.visited = False
# See SizeInfoAvailable(...) for more details.
self.size_info_available = size_info_available
self.size_info_step = False
# Changes associated with the node. The whole connected component are
# sharing the same set. See DFS(...) for more details.
self.changes = None
def __eq__(self, other):
if isinstance(other, Node):
return self.replacement == other.replacement
return False
def __hash__(self) -> int:
return hash((self.replacement, self.include_directive))
# Static method to get a node from a replacement key.
def from_key(replacement: str):
return Node.key_to_node.get(replacement)
# Static method to create a node from its string representation. This
# deduplicate nodes by storing them in a dictionary.
def from_string(txt: str):
# Skipping the first and last character that correspond to the curly
# braces denoting the start and end of a serialized node.
x = txt[1:-1].split('\\,')
# Expect exactly 6 elements that correspond to the following node
# attributes:
# - is_buffer
# - replacement
# - include_directive
# - size_info_available
# - is_deref_node
# - is_data_change
assert len(x) == 6
node = Node(*x)
# Deduplicate nodes, as they might appear multiple times in the input.
if (Node.key_to_node.get(node.replacement) is None):
Node.key_to_node[node.replacement] = node
return Node.key_to_node[node.replacement]
# This is not parsable by from_string but is useful for debugging the graph
# of nodes.
def to_debug_string(self) -> str:
# include_directory already includes explanatory text so we don't have a
# string before its value.
result = "is_buffer:{},replacement:{},{},size_info_available:{}".format(
self.is_buffer, self.replacement, self.include_directive,
self.size_info_available)
result += "is_deref_node:{},is_data_change:{},".format(
self.is_deref_node, self.is_data_change)
# Recursively get neighbors.
result += "neighbors:"
neighbors = "{"
for node in self.neighbors:
if len(neighbors) > 1:
neighbors += ", "
neighbors += node.to_debug_string()
neighbors += "}"
return result + neighbors
# Static method to get all nodes.
def all():
return Node.key_to_node.values()
def DFS(node: Node, changes: set):
"""
Explore the graph in depth-first search from the given node. Identify edits
to apply.
Args:
node: The current node being processed.
changes: A set to collect the identified changes.
"""
# Only visit nodes once:
if (node.visited):
return
node.visited = True
node.changes = changes
if not node.replacement.endswith('<empty>'):
changes.add(node.replacement)
changes.add(node.include_directive)
for neighbour in node.neighbors:
DFS(neighbour, changes)
def SizeInfoAvailable(node: Node):
"""
Determines whether size information is available for a buffer node and its
neighbors. Updates the node's size_info_available attribute.
Args:
node: The current node's being processed.
Returns:
None if a cycle is detected,
'1' if size information is available for the node and its neighbors,
'0' otherwise,
"""
# If we reached a node that contains size info, just return that.
if node.size_info_available == '1':
return '1'
# Cycle detection: If the node is currently being visited, there's a cycle.
# We can't determine the size info for this node.
if node.size_info_step == 'visiting':
return None
# Memoization: If the node has already been visited, return the result
# immediately.
if node.size_info_step == 'visited':
return node.size_info_available
node.size_info_step = 'visiting'
size_info_available = '0'
# Check neighbors. If any neighbor doesn't have size info or there's a
# cycle, the current node also doesn't.
for neighbour in node.neighbors:
# Break as soon as we encounter a neighbor for which size info is not
# available.
if SizeInfoAvailable(neighbour) == '0':
size_info_available = '0'
break
size_info_available = '1'
node.size_info_available = size_info_available
node.size_info_step = 'visited'
return size_info_available
def main():
# Collect from every compile units the nodes and edges of the graph:
for line in sys.stdin:
line = line.rstrip('\n\r')
nodes = line.split(';')
# If there's only one node, it's a buffer node.
if len(nodes) == 1:
Node.from_string(nodes[0]).is_buffer = '1'
continue
# Else, parse the edge between two nodes:
assert len(nodes) == 2
lhs = Node.from_string(nodes[0])
rhs = Node.from_string(nodes[1])
lhs.neighbors.add(rhs)
# Determine whether size information is available for each buffer node:
for node in Node.all():
if node.is_buffer == '1':
SizeInfoAvailable(node)
# Collect the changes to apply. Starting from buffers nodes whose size info
# could be determined.
for node in Node.all():
# We only want to rewrite components connected to a buffer node.
if node.is_buffer != '1':
continue
# Some buffers might not have their size info available. We can't
# rewrite those.
if node.size_info_available != '1':
continue
# Collect the changes to apply. We start from buffer nodes whose size
# info is available and explore the graph in depth-first search.
changes = set()
DFS(node, changes)
# Iterate over the deref_nodes and then check if their only neighbor was
# visited. Visited nodes here are nodes who's type was rewritten to span.
# In that case, the deref expression needs to be adapted(rewritten)
for node in Node.all():
if node.is_deref_node == '1':
neighbor = list(node.neighbors)[0]
if neighbor.visited:
neighbor.changes.add(node.replacement)
# At the edge in between rewritten and non-rewritten nodes, we need
# to add a call to `.data()` to access the pointer from the span:
for node in Node.all():
if node.is_data_change != '1':
continue
# A data change needs to be added if lhs was not rewritten and rhs was.
# The lhs key is stored in the data_change_node's include_directive.
# Check if the lhs key is in the set of visited nodes (i.e lhs was
# rewritten). If lhs was rewritten, we don't need to add `.data()`
if Node.from_key(node.include_directive).visited:
continue
# Expect a single neighbor.
# TODO(357433195): In practice, this is not always the case. Investigate
# why and add the assertion back:
# assert len(node.neighbors) == 1, "and node: " + node.to_debug_string()
# If the rhs node was visited (i.e rewritten), then we need to apply the
# data change.
neighbor = list(node.neighbors)[0]
if neighbor.visited:
# In this case, rhs was rewritten, and lhs was not, we need to add
# the corresponding `.data()`
neighbor.changes.add(node.replacement)
# Emit the changes:
# - ~/scratch/patches.txt: A summary of each atomic change.
# - ~/scratch/patch_<patch_index>: Write each atomic change.
# - stdout: Print a bundle of all the changes. This is usually piped to
# "./tools/clang/scripts/apply_edits.py" to apply the changes.
summary_filename = expanduser('~/scratch/patches.txt')
summary_file = open(summary_filename, 'w')
change_index = 0
for node in Node.all():
if node.changes is None or len(node.changes) == 0:
continue
for text in node.changes:
print(text)
summary_file.write(f'patch_{change_index}: {len(node.changes)}\n')
with open(expanduser(f'~/scratch/patch_{change_index}.txt'), 'w') as f:
f.write('\n'.join(node.changes))
node.changes.clear()
change_index += 1
summary_file.close()
return 0
if __name__ == '__main__':
sys.exit(main())