tools/clang/spanify/extract_edits.py - chromium/src - Git at Google

 #!/usr/bin/env vpython3
 # Copyright 2024 The Chromium Authors
 # Use of this source code is governed by a BSD-style license that can be
 # found in the LICENSE file.
 """Script to extract edits from clang spanification tool output.

 The edits have the following format:
     ...
     {lhs_node1};{rhs_node1}
     {node_n}
     {lhs_node2};{rhs_node2}
     ...
     ...
 Where lhs_node, rhs_node, and node_n represent a node's text representation
 generated using the spanification tool's Node::ToString() function.

 The string representation has the following format:
 `{is_buffer\,r:::<file path>:::<offset>:::<length>
 :::<replacement text>\,include-user-header:::<file path>:::-1:::-1
 :::<include text>\,size_info_available\,is_data_change\,is_deref_node}`

 where `is_buffer`,`size_info_available`, `is_data_change` and `is_deref_node`
 are booleans represented as  0 or 1.

 extract_edits.py takes input that is concatenated from multiple tool
 invocations and extract just the edits with the following steps:
 1- Construct the adjacency list of nodes
    (a pairs of nodes represents an edge in the directed graph)

 2- Determine whether size info is available for a given buffer node.

 3- Run `DFS` starting from buffer nodes whose size info is available and emit
    edits for reachable nodes.

 4- Adapt dereference expressions and add data changes where necessary.

 extract_edits.py would then emit the following output:
     <edit1>
     <edit2>
     <edit3>
     ...
 Where the edit is either a replacement or an include directive.

 For more details about how the tool works, see the doc here:
 https://docs.google.com/document/d/1hUPe21CDdbT6_YFHl03KWlcZqhNIPBAfC-5N5DDY2OE/
 """

 import sys

 import resource
 from os.path import expanduser


 class Node:
     # Mapping in between the replacement directive and the node.
     key_to_node = dict()

     def __init__(self, is_buffer, replacement, include_directive,
                  size_info_available, is_deref_node, is_data_change) -> None:
         self.is_buffer = is_buffer
         self.replacement = replacement
         self.include_directive = include_directive
         # We need to also rewrite deref expressions of the form:
         # |*buf = something;| into |buf[0] = something;|
         # for that, create a link between buf and the deref expression.
         self.is_deref_node = is_deref_node
         self.is_data_change = is_data_change

         # Neighbors of the node in the graph.
         self.neighbors = set()

         # Property to tracker whether the node is "connected" to a buffer node.
         # This is set from DFS(...)
         self.visited = False

         # See SizeInfoAvailable(...) for more details.
         self.size_info_available = size_info_available
         self.size_info_step = False

         # Changes associated with the node. The whole connected component are
         # sharing the same set. See DFS(...) for more details.
         self.changes = None

     def __eq__(self, other):
         if isinstance(other, Node):
             return self.replacement == other.replacement
         return False

     def __hash__(self) -> int:
         return hash((self.replacement, self.include_directive))

     # Static method to get a node from a replacement key.
     def from_key(replacement: str):
         return Node.key_to_node.get(replacement)

     # Static method to create a node from its string representation. This
     # deduplicate nodes by storing them in a dictionary.
     def from_string(txt: str):
         # Skipping the first and last character that correspond to the curly
         # braces denoting the start and end of a serialized node.
         x = txt[1:-1].split('\\,')
         # Expect exactly 6 elements that correspond to the following node
         # attributes:
         # - is_buffer
         # - replacement
         # - include_directive
         # - size_info_available
         # - is_deref_node
         # - is_data_change
         assert len(x) == 6

         node = Node(*x)

         # Deduplicate nodes, as they might appear multiple times in the input.
         if (Node.key_to_node.get(node.replacement) is None):
             Node.key_to_node[node.replacement] = node

         return Node.key_to_node[node.replacement]

     # This is not parsable by from_string but is useful for debugging the graph
     # of nodes.
     def to_debug_string(self) -> str:
         # include_directory already includes explanatory text so we don't have a
         # string before its value.
         result = "is_buffer:{},replacement:{},{},size_info_available:{}".format(
             self.is_buffer, self.replacement, self.include_directive,
             self.size_info_available)
         result += "is_deref_node:{},is_data_change:{},".format(
             self.is_deref_node, self.is_data_change)
         # Recursively get neighbors.
         result += "neighbors:"
         neighbors = "{"
         for node in self.neighbors:
             if len(neighbors) > 1:
                 neighbors += ", "
             neighbors += node.to_debug_string()
         neighbors += "}"
         return result + neighbors

     # Static method to get all nodes.
     def all():
         return Node.key_to_node.values()


 def DFS(node: Node, changes: set):
     """
     Explore the graph in depth-first search from the given node. Identify edits
     to apply.

     Args:
         node: The current node being processed.
         changes: A set to collect the identified changes.
     """
     # Only visit nodes once:
     if (node.visited):
         return
     node.visited = True
     node.changes = changes

     if not node.replacement.endswith('<empty>'):
         changes.add(node.replacement)
         changes.add(node.include_directive)

     for neighbour in node.neighbors:
         DFS(neighbour, changes)


 def SizeInfoAvailable(node: Node):
     """
     Determines whether size information is available for a buffer node and its
     neighbors. Updates the node's size_info_available attribute.

     Args:
         node: The current node's being processed.

     Returns:
         None if a cycle is detected,
         '1' if size information is available for the node and its neighbors,
         '0' otherwise,
     """

     # If we reached a node that contains size info, just return that.
     if node.size_info_available == '1':
         return '1'

     # Cycle detection: If the node is currently being visited, there's a cycle.
     # We can't determine the size info for this node.
     if node.size_info_step == 'visiting':
         return None

     # Memoization: If the node has already been visited, return the result
     # immediately.
     if node.size_info_step == 'visited':
         return node.size_info_available

     node.size_info_step = 'visiting'
     size_info_available = '0'
     # Check neighbors. If any neighbor doesn't have size info or there's a
     # cycle, the current node also doesn't.
     for neighbour in node.neighbors:
         # Break as soon as we encounter a neighbor for which size info is not
         # available.
         if SizeInfoAvailable(neighbour) == '0':
             size_info_available = '0'
             break
         size_info_available = '1'

     node.size_info_available = size_info_available
     node.size_info_step = 'visited'
     return size_info_available


 def main():
     # Collect from every compile units the nodes and edges of the graph:
     for line in sys.stdin:
         line = line.rstrip('\n\r')
         nodes = line.split(';')

         # If there's only one node, it's a buffer node.
         if len(nodes) == 1:
             Node.from_string(nodes[0]).is_buffer = '1'
             continue

         # Else, parse the edge between two nodes:
         assert len(nodes) == 2
         lhs = Node.from_string(nodes[0])
         rhs = Node.from_string(nodes[1])

         lhs.neighbors.add(rhs)

     # Determine whether size information is available for each buffer node:
     for node in Node.all():
         if node.is_buffer == '1':
             SizeInfoAvailable(node)

     # Collect the changes to apply. Starting from buffers nodes whose size info
     # could be determined.
     for node in Node.all():

         # We only want to rewrite components connected to a buffer node.
         if node.is_buffer != '1':
             continue

         # Some buffers might not have their size info available. We can't
         # rewrite those.
         if node.size_info_available != '1':
             continue

         # Collect the changes to apply. We start from buffer nodes whose size
         # info is available and explore the graph in depth-first search.
         changes = set()
         DFS(node, changes)

     # Iterate over the deref_nodes and then check if their only neighbor was
     # visited. Visited nodes here are nodes who's type was rewritten to span.
     # In that case, the deref expression needs to be adapted(rewritten)
     for node in Node.all():
         if node.is_deref_node == '1':
             neighbor = list(node.neighbors)[0]
             if neighbor.visited:
                 neighbor.changes.add(node.replacement)

     # At the edge in between rewritten and non-rewritten nodes, we need
     # to add a call to `.data()` to access the pointer from the span:
     for node in Node.all():

         if node.is_data_change != '1':
             continue

         # A data change needs to be added if lhs was not rewritten and rhs was.
         # The lhs key is stored in the data_change_node's include_directive.
         # Check if the lhs key is in the set of visited nodes (i.e lhs was
         # rewritten). If lhs was rewritten, we don't need to add `.data()`
         if Node.from_key(node.include_directive).visited:
             continue

         # Expect a single neighbor.
         # TODO(357433195): In practice, this is not always the case. Investigate
         # why and add the assertion back:
         # assert len(node.neighbors) == 1, "and node: " + node.to_debug_string()

         # If the rhs node was visited (i.e rewritten), then we need to apply the
         # data change.
         neighbor = list(node.neighbors)[0]
         if neighbor.visited:
             # In this case, rhs was rewritten, and lhs was not, we need to add
             # the corresponding `.data()`
             neighbor.changes.add(node.replacement)

     # Emit the changes:
     # - ~/scratch/patches.txt: A summary of each atomic change.
     # - ~/scratch/patch_<patch_index>: Write each atomic change.
     # - stdout: Print a bundle of all the changes. This is usually piped to
     #           "./tools/clang/scripts/apply_edits.py" to apply the changes.

     summary_filename = expanduser('~/scratch/patches.txt')
     summary_file = open(summary_filename, 'w')

     change_index = 0
     for node in Node.all():
         if node.changes is None or len(node.changes) == 0:
             continue

         for text in node.changes:
             print(text)

         summary_file.write(f'patch_{change_index}: {len(node.changes)}\n')

         with open(expanduser(f'~/scratch/patch_{change_index}.txt'), 'w') as f:
             f.write('\n'.join(node.changes))
         node.changes.clear()

         change_index += 1

     summary_file.close()

     return 0

 if __name__ == '__main__':
     sys.exit(main())
	#!/usr/bin/env vpython3
	# Copyright 2024 The Chromium Authors
	# Use of this source code is governed by a BSD-style license that can be
	# found in the LICENSE file.
	"""Script to extract edits from clang spanification tool output.

	The edits have the following format:
	...
	{lhs_node1};{rhs_node1}
	{node_n}
	{lhs_node2};{rhs_node2}
	...
	...
	Where lhs_node, rhs_node, and node_n represent a node's text representation
	generated using the spanification tool's Node::ToString() function.

	The string representation has the following format:
	`{is_buffer\,r:::<file path>:::<offset>:::<length>
	:::<replacement text>\,include-user-header:::<file path>:::-1:::-1
	:::<include text>\,size_info_available\,is_data_change\,is_deref_node}`

	where `is_buffer`,`size_info_available`, `is_data_change` and `is_deref_node`
	are booleans represented as 0 or 1.

	extract_edits.py takes input that is concatenated from multiple tool
	invocations and extract just the edits with the following steps:
	1- Construct the adjacency list of nodes
	(a pairs of nodes represents an edge in the directed graph)

	2- Determine whether size info is available for a given buffer node.

	3- Run `DFS` starting from buffer nodes whose size info is available and emit
	edits for reachable nodes.

	4- Adapt dereference expressions and add data changes where necessary.

	extract_edits.py would then emit the following output:
	<edit1>
	<edit2>
	<edit3>
	...
	Where the edit is either a replacement or an include directive.

	For more details about how the tool works, see the doc here:
	https://docs.google.com/document/d/1hUPe21CDdbT6_YFHl03KWlcZqhNIPBAfC-5N5DDY2OE/
	"""

	import sys

	import resource
	from os.path import expanduser


	class Node:
	# Mapping in between the replacement directive and the node.
	key_to_node = dict()

	def __init__(self, is_buffer, replacement, include_directive,
	size_info_available, is_deref_node, is_data_change) -> None:
	self.is_buffer = is_buffer
	self.replacement = replacement
	self.include_directive = include_directive
	# We need to also rewrite deref expressions of the form:
	# \|*buf = something;\| into \|buf[0] = something;\|
	# for that, create a link between buf and the deref expression.
	self.is_deref_node = is_deref_node
	self.is_data_change = is_data_change

	# Neighbors of the node in the graph.
	self.neighbors = set()

	# Property to tracker whether the node is "connected" to a buffer node.
	# This is set from DFS(...)
	self.visited = False

	# See SizeInfoAvailable(...) for more details.
	self.size_info_available = size_info_available
	self.size_info_step = False

	# Changes associated with the node. The whole connected component are
	# sharing the same set. See DFS(...) for more details.
	self.changes = None

	def __eq__(self, other):
	if isinstance(other, Node):
	return self.replacement == other.replacement
	return False

	def __hash__(self) -> int:
	return hash((self.replacement, self.include_directive))

	# Static method to get a node from a replacement key.
	def from_key(replacement: str):
	return Node.key_to_node.get(replacement)

	# Static method to create a node from its string representation. This
	# deduplicate nodes by storing them in a dictionary.
	def from_string(txt: str):
	# Skipping the first and last character that correspond to the curly
	# braces denoting the start and end of a serialized node.
	x = txt[1:-1].split('\\,')
	# Expect exactly 6 elements that correspond to the following node
	# attributes:
	# - is_buffer
	# - replacement
	# - include_directive
	# - size_info_available
	# - is_deref_node
	# - is_data_change
	assert len(x) == 6

	node = Node(*x)

	# Deduplicate nodes, as they might appear multiple times in the input.
	if (Node.key_to_node.get(node.replacement) is None):
	Node.key_to_node[node.replacement] = node

	return Node.key_to_node[node.replacement]

	# This is not parsable by from_string but is useful for debugging the graph
	# of nodes.
	def to_debug_string(self) -> str:
	# include_directory already includes explanatory text so we don't have a
	# string before its value.
	result = "is_buffer:{},replacement:{},{},size_info_available:{}".format(
	self.is_buffer, self.replacement, self.include_directive,
	self.size_info_available)
	result += "is_deref_node:{},is_data_change:{},".format(
	self.is_deref_node, self.is_data_change)
	# Recursively get neighbors.
	result += "neighbors:"
	neighbors = "{"
	for node in self.neighbors:
	if len(neighbors) > 1:
	neighbors += ", "
	neighbors += node.to_debug_string()
	neighbors += "}"
	return result + neighbors

	# Static method to get all nodes.
	def all():
	return Node.key_to_node.values()


	def DFS(node: Node, changes: set):
	"""
	Explore the graph in depth-first search from the given node. Identify edits
	to apply.

	Args:
	node: The current node being processed.
	changes: A set to collect the identified changes.
	"""
	# Only visit nodes once:
	if (node.visited):
	return
	node.visited = True
	node.changes = changes

	if not node.replacement.endswith('<empty>'):
	changes.add(node.replacement)
	changes.add(node.include_directive)

	for neighbour in node.neighbors:
	DFS(neighbour, changes)


	def SizeInfoAvailable(node: Node):
	"""
	Determines whether size information is available for a buffer node and its
	neighbors. Updates the node's size_info_available attribute.

	Args:
	node: The current node's being processed.

	Returns:
	None if a cycle is detected,
	'1' if size information is available for the node and its neighbors,
	'0' otherwise,
	"""

	# If we reached a node that contains size info, just return that.
	if node.size_info_available == '1':
	return '1'

	# Cycle detection: If the node is currently being visited, there's a cycle.
	# We can't determine the size info for this node.
	if node.size_info_step == 'visiting':
	return None

	# Memoization: If the node has already been visited, return the result
	# immediately.
	if node.size_info_step == 'visited':
	return node.size_info_available

	node.size_info_step = 'visiting'
	size_info_available = '0'
	# Check neighbors. If any neighbor doesn't have size info or there's a
	# cycle, the current node also doesn't.
	for neighbour in node.neighbors:
	# Break as soon as we encounter a neighbor for which size info is not
	# available.
	if SizeInfoAvailable(neighbour) == '0':
	size_info_available = '0'
	break
	size_info_available = '1'

	node.size_info_available = size_info_available
	node.size_info_step = 'visited'
	return size_info_available


	def main():
	# Collect from every compile units the nodes and edges of the graph:
	for line in sys.stdin:
	line = line.rstrip('\n\r')
	nodes = line.split(';')

	# If there's only one node, it's a buffer node.
	if len(nodes) == 1:
	Node.from_string(nodes[0]).is_buffer = '1'
	continue

	# Else, parse the edge between two nodes:
	assert len(nodes) == 2
	lhs = Node.from_string(nodes[0])
	rhs = Node.from_string(nodes[1])

	lhs.neighbors.add(rhs)

	# Determine whether size information is available for each buffer node:
	for node in Node.all():
	if node.is_buffer == '1':
	SizeInfoAvailable(node)

	# Collect the changes to apply. Starting from buffers nodes whose size info
	# could be determined.
	for node in Node.all():

	# We only want to rewrite components connected to a buffer node.
	if node.is_buffer != '1':
	continue

	# Some buffers might not have their size info available. We can't
	# rewrite those.
	if node.size_info_available != '1':
	continue

	# Collect the changes to apply. We start from buffer nodes whose size
	# info is available and explore the graph in depth-first search.
	changes = set()
	DFS(node, changes)

	# Iterate over the deref_nodes and then check if their only neighbor was
	# visited. Visited nodes here are nodes who's type was rewritten to span.
	# In that case, the deref expression needs to be adapted(rewritten)
	for node in Node.all():
	if node.is_deref_node == '1':
	neighbor = list(node.neighbors)[0]
	if neighbor.visited:
	neighbor.changes.add(node.replacement)

	# At the edge in between rewritten and non-rewritten nodes, we need
	# to add a call to `.data()` to access the pointer from the span:
	for node in Node.all():

	if node.is_data_change != '1':
	continue

	# A data change needs to be added if lhs was not rewritten and rhs was.
	# The lhs key is stored in the data_change_node's include_directive.
	# Check if the lhs key is in the set of visited nodes (i.e lhs was
	# rewritten). If lhs was rewritten, we don't need to add `.data()`
	if Node.from_key(node.include_directive).visited:
	continue

	# Expect a single neighbor.
	# TODO(357433195): In practice, this is not always the case. Investigate
	# why and add the assertion back:
	# assert len(node.neighbors) == 1, "and node: " + node.to_debug_string()

	# If the rhs node was visited (i.e rewritten), then we need to apply the
	# data change.
	neighbor = list(node.neighbors)[0]
	if neighbor.visited:
	# In this case, rhs was rewritten, and lhs was not, we need to add
	# the corresponding `.data()`
	neighbor.changes.add(node.replacement)

	# Emit the changes:
	# - ~/scratch/patches.txt: A summary of each atomic change.
	# - ~/scratch/patch_<patch_index>: Write each atomic change.
	# - stdout: Print a bundle of all the changes. This is usually piped to
	# "./tools/clang/scripts/apply_edits.py" to apply the changes.

	summary_filename = expanduser('~/scratch/patches.txt')
	summary_file = open(summary_filename, 'w')

	change_index = 0
	for node in Node.all():
	if node.changes is None or len(node.changes) == 0:
	continue

	for text in node.changes:
	print(text)

	summary_file.write(f'patch_{change_index}: {len(node.changes)}\n')

	with open(expanduser(f'~/scratch/patch_{change_index}.txt'), 'w') as f:
	f.write('\n'.join(node.changes))
	node.changes.clear()

	change_index += 1

	summary_file.close()

	return 0

	if __name__ == '__main__':
	sys.exit(main())