tools/traffic_annotation/scripts/extractor.py - chromium/src - Git at Google

 #!/usr/bin/env python3
 # Copyright 2019 The Chromium Authors. All rights reserved.
 # Use of this source code is governed by a BSD-style license that can be
 # found in the LICENSE file.

 """
 Extracts network traffic annotation definitions from C++ source code.
 """

 from __future__ import print_function

 import argparse
 import re
 import sys
 import traceback

 from annotation_tools import NetworkTrafficAnnotationTools
 from annotation_tokenizer import Tokenizer, SourceCodeParsingError

 from enum import Enum
 from pathlib import Path
 from typing import List, Dict, NamedTuple


 class AnnotationType(Enum):
   """Kinds of annotation this script can report.

   Each member's value is the type string that
   Annotation.extractor_output_string() writes for that annotation.
   """
   # Mapped from DefineNetworkTrafficAnnotation() in C++ and from
   # NetworkTrafficAnnotationTag.createComplete() in Java. Also used for the
   # placeholder annotations created in extract_annotations().
   COMPLETE = 'Definition'
   # Mapped from DefinePartialNetworkTrafficAnnotation().
   PARTIAL = 'Partial'
   # Mapped from CompleteNetworkTrafficAnnotation().
   COMPLETING = 'Completing'
   # Mapped from BranchedCompleteNetworkTrafficAnnotation().
   BRANCHED_COMPLETING = 'BranchedCompleting'
   # Mapped from CreateMutableNetworkTrafficAnnotationTag(); the arguments of
   # that call are never parsed.
   MUTABLE = 'Mutable'


 class Language(NamedTuple):
   """Info on how to parse a given programming language's source code.

   Instances are selected by file extension through LANGUAGE_MAPPING.
   """
   # Human-readable name, for debugging.
   name: str
   # Maps definition function names to the type of annotation they define.
   annotation_types: Dict[str, AnnotationType]
   # Compiled regex that matches the start of an annotation definition call.
   # Capture group 1 of this regex should contain a function name that is a key
   # of annotation_types; Annotation.parse_definition() looks it up there.
   call_detection_regex: re.Pattern

 # Exit code returned by main() when a source file raises
 # SourceCodeParsingError. Other runtime errors return 1.
 EX_PARSE_ERROR = 2

 # Language definition for C++ source files. Keys are the C++ functions whose
 # calls define an annotation; values are the annotation type each one creates.
 CPP_ANNOTATION_TYPES = {
     'DefineNetworkTrafficAnnotation': AnnotationType.COMPLETE,
     'DefinePartialNetworkTrafficAnnotation': AnnotationType.PARTIAL,
     'CompleteNetworkTrafficAnnotation': AnnotationType.COMPLETING,
     'BranchedCompleteNetworkTrafficAnnotation':
     AnnotationType.BRANCHED_COMPLETING,
     'CreateMutableNetworkTrafficAnnotationTag': AnnotationType.MUTABLE,
 }

 # Verbose-mode pattern text: one of the tracked function names, then "(".
 _CPP_CALL_PATTERN = (r'''
     \b
     # Look for one of the tracked function names.
     # Capture group 1: function name.
     (
       ''' + '|'.join(CPP_ANNOTATION_TYPES) + r'''
     )
     # Followed by a left-paren.
     \s*
     \(
   ''')

 CPP_LANGUAGE = Language(
     name='C++',
     annotation_types=CPP_ANNOTATION_TYPES,
     call_detection_regex=re.compile(_CPP_CALL_PATTERN,
                                     re.VERBOSE | re.DOTALL))

 # Language definition for Java source files. Keys are the methods of
 # NetworkTrafficAnnotationTag whose calls define an annotation.
 JAVA_ANNOTATION_TYPES = {
     'createComplete': AnnotationType.COMPLETE,
 }

 # Verbose-mode pattern text: "NetworkTrafficAnnotationTag.<method>", then "(".
 _JAVA_CALL_PATTERN = (r'''
     \b
     # Look for a string like NetworkTrafficAnnotationTag.<methodName>
     NetworkTrafficAnnotationTag \s* \. \s*
     # Capture group 1: method name.
     (
       ''' + '|'.join(JAVA_ANNOTATION_TYPES) + r'''
     )
     # Followed by a left-paren.
     \s*
     \(
   ''')

 JAVA_LANGUAGE = Language(
     name='Java',
     annotation_types=JAVA_ANNOTATION_TYPES,
     call_detection_regex=re.compile(_JAVA_CALL_PATTERN,
                                     re.VERBOSE | re.DOTALL))

 # Maps file extensions to their Language definition. Keys include the leading
 # dot because extract_annotations() looks them up by Path.suffix.
 LANGUAGE_MAPPING: Dict[str, Language] = {
     '.cc': CPP_LANGUAGE,
     # .mm files are parsed with the C++ language definition.
     '.mm': CPP_LANGUAGE,
     '.java': JAVA_LANGUAGE,
 }

 # Regex that matches an annotation that should only be used in test files.
 # Capture group 1 is the optional "PARTIAL_" prefix.
 TEST_ANNOTATION_REGEX = re.compile(
     r'\b(PARTIAL_)?TRAFFIC_ANNOTATION_FOR_TESTS\b')

 # Regex that matches a placeholder annotation for a few whitelisted files.
 MISSING_ANNOTATION_REGEX = re.compile(r'\bMISSING_TRAFFIC_ANNOTATION\b')

 # Regex that matches placeholder annotations for unsupported platforms that
 # don't require Network Traffic Annotations compliance. (e.g. iOS)
 NO_ANNOTATION_REGEX = re.compile(r'\bNO_TRAFFIC_ANNOTATION_YET\b')

 # Set of supported file extensions for source code (each includes the dot).
 SUPPORTED_EXTENSIONS = set(LANGUAGE_MAPPING.keys())


 class Annotation:
   """A network traffic annotation definition found in a source file.

   Attributes:
     language: Language used to parse the definition.
     file_path: Path to the file that contains this annotation.
     line_number: Line of |file_path| on which the annotation was found.
     type_name: AnnotationType of the annotation.
     unique_id: Unique id string of the annotation.
     extra_id: Second id string; only parsed for the Partial and
         BranchedCompleting types, empty otherwise.
     text: Text of the annotation (the last string argument of a definition).
   """

   def __init__(self,
                language: Language,
                file_path: Path,
                line_number: int,
                type_name: AnnotationType,
                unique_id: str = '',
                extra_id: str = '',
                text: str = ''):
     """Constructs an Annotation object with the given field values.

     Args:
       language: Language definition of the file being parsed.
       file_path: Path to the file that contains this annotation.
       line_number: Line number of the annotation within the file.
       type_name: Initial AnnotationType; parse_definition() overwrites it.
       unique_id: Unique id of the annotation.
       extra_id: Extra id, used by some annotation types.
       text: Text of the annotation.
     """
     self.language = language
     self.file_path = file_path
     self.line_number = line_number
     self.type_name = type_name
     self.unique_id = unique_id
     self.extra_id = extra_id
     self.text = text

   def parse_definition(self, re_match: re.Match) -> None:
     """Parses the annotation and populates object fields.

     Args:
       re_match: A Match obtained from the Language's call_detection_regex.

     Raises:
       KeyError: if capture group 1 is not a key of language.annotation_types.
     """
     definition_function = re_match.group(1)
     self.type_name = self.language.annotation_types[definition_function]

     # The match ends on the opening paren, so everything after it is the
     # argument list. Parsing it populates |unique_id|, |text| and (possibly)
     # |extra_id|.
     self._parse_body(re_match.string[re_match.end():])

   def extractor_output_string(self) -> str:
     """Returns a string formatted for output.

     The fields are written one per line, in a fixed order, between a start
     marker line and an end marker line.
     """
     fields = [
         '==== NEW ANNOTATION ====',
         self.file_path,
         self.line_number,
         self.type_name.value,
         self.unique_id,
         self.extra_id,
         self.text,
         '==== ANNOTATION ENDS ====',
     ]
     return '\n'.join(str(field) for field in fields)

   def _parse_body(self, body: str) -> None:
     """Tokenizes and parses the arguments given to the definition function.

     Args:
       body: Source text that starts right after the call's opening paren.

     Raises:
       AssertionError: if the closing-paren token evaluates as false.
     """
     # Don't bother parsing CreateMutableNetworkTrafficAnnotationTag(), we don't
     # care about its arguments anyways.
     if self.type_name == AnnotationType.MUTABLE:
       return

     tokenizer = Tokenizer(body, self.file_path, self.line_number)

     # unique_id
     self.unique_id = self._parse_string(tokenizer)
     tokenizer.advance('comma')

     # extra_id (Partial/BranchedCompleting)
     if self.type_name in (AnnotationType.PARTIAL,
                           AnnotationType.BRANCHED_COMPLETING):
       self.extra_id = self._parse_string(tokenizer)
       tokenizer.advance('comma')

     # partial_annotation (Completing/BranchedCompleting)
     if self.type_name in (AnnotationType.COMPLETING,
                           AnnotationType.BRANCHED_COMPLETING):
       # Skip the |partial_annotation| argument. It can be a variable_name, or a
       # FunctionName(), so skip the parentheses if they're there.
       tokenizer.advance('symbol')
       if tokenizer.maybe_advance('left_paren'):
         tokenizer.advance('right_paren')
       tokenizer.advance('comma')

     # proto text
     self.text = self._parse_string(tokenizer)

     # The function call should end here without any more arguments. The
     # tokenizer call is kept out of an assert statement so that the token is
     # still consumed and checked when Python runs with -O (which strips
     # asserts together with any call made inside them).
     if not tokenizer.advance('right_paren'):
       raise AssertionError('Expected the annotation definition to end here.')

   def _parse_string(self, tokenizer: Tokenizer) -> str:
     """Parses a string value.

     It could be a string literal by itself, or multiple string literals
     concatenated together with '+'.

     Args:
       tokenizer: Tokenizer positioned at the first string literal.

     Returns:
       The literals joined with a newline between each concatenated pair.
     """
     pieces = [tokenizer.advance('string_literal')]
     # Each 'plus' token must be followed by another string literal.
     while tokenizer.maybe_advance('plus') is not None:
       pieces.append(tokenizer.advance('string_literal'))
     return '\n'.join(pieces)


 def get_line_number_at(string: str, pos: int) -> int:
   """Finds the line number for the char at position |pos|. 1-indexed.

   Args:
     string: Text to scan.
     pos: Index into |string|; treated like the end bound of a slice.

   Returns:
     One plus the number of newline characters in string[:pos].
   """
   # This is still O(n), but we only run it once for each annotation
   # definition, so the effect on total runtime is negligible. str.count()
   # with bounds avoids copying the prefix and compiling a regex on each call.
   return 1 + string.count('\n', 0, pos)


 def is_inside_comment(string, pos):
   """Tells whether position |pos| of |string| appears to be in a comment.

   This is a bit naive. Only single-line comments (// ...) are recognized,
   block comments (/* ...  */) are not, and any "//" earlier on the line
   counts, wherever it occurs.

   Args:
     string: string to scan.
     pos: position within the string.

   Returns:
     True if |string[pos]| looks like it's inside a C++ comment.
   """
   # TODO(crbug/966883): Add multi-line comment support.
   # Read backwards from |pos| (inclusive) and keep only the part that lies on
   # the same line. "//" reads the same in both directions, so it can be
   # searched for directly in the reversed text.
   same_line_reversed, _, _ = string[pos::-1].partition('\n')
   return '//' in same_line_reversed


 def extract_annotations(file_path: Path) -> List['Annotation']:
   """Extracts and returns annotations from the file at |file_path|.

   Args:
     file_path: Path to a source file whose suffix is a key of
         LANGUAGE_MAPPING.

   Returns:
     A list of Annotation objects: parsed definition calls first, then test
     annotations, then MISSING_TRAFFIC_ANNOTATION placeholders, then
     NO_TRAFFIC_ANNOTATION_YET placeholders. Within each group the annotations
     are in file order.

   Raises:
     ValueError: if the file extension is not in LANGUAGE_MAPPING.
     Errors raised while parsing a definition propagate to the caller.
   """
   language = LANGUAGE_MAPPING.get(file_path.suffix)
   if language is None:
     raise ValueError("Unrecognized extension '{}' for file '{}'.".format(
         file_path.suffix, str(file_path)))

   # Decode explicitly so the result does not depend on the platform's default
   # encoding. NOTE(review): assumes the source files are UTF-8 -- confirm.
   contents = file_path.read_text(encoding='utf-8')

   def outside_comments(regex):
     """Yields (match, line_number) for each match not inside a // comment."""
     for re_match in regex.finditer(contents):
       if is_inside_comment(contents, re_match.start()):
         continue
       yield re_match, get_line_number_at(contents, re_match.start())

   defs = []

   # Check for function calls (e.g. DefineNetworkTrafficAnnotation(...))
   for re_match, line_number in outside_comments(
       language.call_detection_regex):
     # The type passed here is a placeholder; parse_definition() sets the real
     # one from the matched function name.
     annotation = Annotation(language, file_path, line_number,
                             AnnotationType.COMPLETE)
     annotation.parse_definition(re_match)
     defs.append(annotation)

   # Check for test annotations (e.g. TRAFFIC_ANNOTATION_FOR_TESTS)
   for re_match, line_number in outside_comments(TEST_ANNOTATION_REGEX):
     # Group 1 is the optional "PARTIAL_" prefix.
     if re_match.group(1):
       type_name = AnnotationType.PARTIAL
       unique_id = 'test_partial'
       extra_id = 'test'
     else:
       type_name = AnnotationType.COMPLETE
       unique_id = 'test'
       extra_id = ''
     defs.append(
         Annotation(language,
                    file_path,
                    line_number,
                    type_name=type_name,
                    unique_id=unique_id,
                    extra_id=extra_id,
                    text='Traffic annotation for unit, browser and other tests'))

   # Check for MISSING_TRAFFIC_ANNOTATION, then NO_TRAFFIC_ANNOTATION_YET. Both
   # are reported as complete annotations with a fixed id and text.
   placeholders = (
       (MISSING_ANNOTATION_REGEX, 'missing',
        'Function called without traffic annotation.'),
       (NO_ANNOTATION_REGEX, 'undefined', 'Nothing here yet.'),
   )
   for regex, unique_id, text in placeholders:
     for _, line_number in outside_comments(regex):
       defs.append(
           Annotation(language,
                      file_path,
                      line_number,
                      type_name=AnnotationType.COMPLETE,
                      unique_id=unique_id,
                      text=text))

   return defs


 def main() -> int:
   """Parses the command line, extracts annotations and prints them.

   Returns:
     0 on success, or EX_PARSE_ERROR as soon as one file raises
     SourceCodeParsingError (no annotation is printed in that case).
   """
   parser = argparse.ArgumentParser()
   parser.add_argument(
       '--options-file',
       help='optional file to read options from')
   # First pass: only --options-file is known; everything else lands in argv.
   args, argv = parser.parse_known_args()
   if args.options_file:
     # The whitespace-separated contents of the options file replace the
     # remaining command-line arguments. The context manager closes the file
     # handle instead of leaking it.
     with open(args.options_file) as options_file:
       argv = options_file.read().split()

   parser.add_argument(
       '--build-path',
       type=Path,
       help='Specifies a compiled build directory, e.g. out/Debug.')
   parser.add_argument(
       '--generate-compdb', action='store_true',
       help='Generate a new compile_commands.json before running')
   parser.add_argument(
       '--no-filter', action='store_true',
       help='Do not filter files based on compdb entries')
   parser.add_argument('file_paths',
                       nargs='+',
                       type=Path,
                       help='List of files to process.')

   args = parser.parse_args(argv)

   # compdb_files is only bound (and only read below) when filtering is on.
   if not args.no_filter:
     tools = NetworkTrafficAnnotationTools(args.build_path)
     compdb_files = tools.GetCompDBFiles(args.generate_compdb)

   annotation_definitions = []

   # Parse all the files.
   # TODO(crbug/966883): Do this in parallel.
   for file_path in args.file_paths:
     if not args.no_filter and file_path.resolve() not in compdb_files:
       continue
     try:
       annotation_definitions.extend(extract_annotations(file_path))
     except SourceCodeParsingError:
       traceback.print_exc()
       return EX_PARSE_ERROR

   # Print output.
   for annotation in annotation_definitions:
     print(annotation.extractor_output_string())

   # If all files were successfully checked for annotations but none of them had
   # any, print something so that the traffic_annotation_auditor knows there was
   # no error so that the files get checked for deleted annotations.
   if not annotation_definitions:
     print('No annotations in these files.')
   return 0


 # Run main() only when executed as a script, and use its return value as the
 # process exit status.
 if __name__ == '__main__':
   sys.exit(main())
	#!/usr/bin/env python3
	# Copyright 2019 The Chromium Authors. All rights reserved.
	# Use of this source code is governed by a BSD-style license that can be
	# found in the LICENSE file.

	"""
	Extracts network traffic annotation definitions from C++ source code.
	"""

	from __future__ import print_function

	import argparse
	import re
	import sys
	import traceback

	from annotation_tools import NetworkTrafficAnnotationTools
	from annotation_tokenizer import Tokenizer, SourceCodeParsingError

	from enum import Enum
	from pathlib import Path
	from typing import List, Dict, NamedTuple


	class AnnotationType(Enum):
	  """Kinds of annotation this script can report.

	  Each member's value is the type string that
	  Annotation.extractor_output_string() writes for that annotation.
	  """
	  # Mapped from DefineNetworkTrafficAnnotation() in C++ and from
	  # NetworkTrafficAnnotationTag.createComplete() in Java. Also used for the
	  # placeholder annotations created in extract_annotations().
	  COMPLETE = 'Definition'
	  # Mapped from DefinePartialNetworkTrafficAnnotation().
	  PARTIAL = 'Partial'
	  # Mapped from CompleteNetworkTrafficAnnotation().
	  COMPLETING = 'Completing'
	  # Mapped from BranchedCompleteNetworkTrafficAnnotation().
	  BRANCHED_COMPLETING = 'BranchedCompleting'
	  # Mapped from CreateMutableNetworkTrafficAnnotationTag(); the arguments of
	  # that call are never parsed.
	  MUTABLE = 'Mutable'


	class Language(NamedTuple):
	  """Info on how to parse a given programming language's source code.

	  Instances are selected by file extension through LANGUAGE_MAPPING.
	  """
	  # Human-readable name, for debugging.
	  name: str
	  # Maps definition function names to the type of annotation they define.
	  annotation_types: Dict[str, AnnotationType]
	  # Compiled regex that matches the start of an annotation definition call.
	  # Capture group 1 of this regex should contain a function name that is a
	  # key of annotation_types; Annotation.parse_definition() looks it up there.
	  call_detection_regex: re.Pattern

	# Exit code returned by main() when a source file raises
	# SourceCodeParsingError. Other runtime errors return 1.
	EX_PARSE_ERROR = 2

	# Language definition for C++ source files. Keys are the C++ functions whose
	# calls define an annotation; values are the annotation type each one creates.
	CPP_ANNOTATION_TYPES = {
	    'DefineNetworkTrafficAnnotation': AnnotationType.COMPLETE,
	    'DefinePartialNetworkTrafficAnnotation': AnnotationType.PARTIAL,
	    'CompleteNetworkTrafficAnnotation': AnnotationType.COMPLETING,
	    'BranchedCompleteNetworkTrafficAnnotation':
	        AnnotationType.BRANCHED_COMPLETING,
	    'CreateMutableNetworkTrafficAnnotationTag': AnnotationType.MUTABLE,
	}

	# The function names are joined with a plain '|' so that they form a regex
	# alternation; an escaped pipe would only match a literal '|' character.
	CPP_LANGUAGE = Language(
	    name='C++',
	    annotation_types=CPP_ANNOTATION_TYPES,
	    call_detection_regex=re.compile(
	        r'''
	    \b
	    # Look for one of the tracked function names.
	    # Capture group 1: function name.
	    (
	      ''' + '|'.join(CPP_ANNOTATION_TYPES) + r'''
	    )
	    # Followed by a left-paren.
	    \s*
	    \(
	    ''', re.VERBOSE | re.DOTALL))

	# Language definition for Java source files. Keys are the methods of
	# NetworkTrafficAnnotationTag whose calls define an annotation.
	JAVA_ANNOTATION_TYPES = {
	    'createComplete': AnnotationType.COMPLETE,
	}

	# The method names are joined with a plain '|' so that they form a regex
	# alternation; an escaped pipe would only match a literal '|' character.
	JAVA_LANGUAGE = Language(
	    name='Java',
	    annotation_types=JAVA_ANNOTATION_TYPES,
	    call_detection_regex=re.compile(
	        r'''
	    \b
	    # Look for a string like NetworkTrafficAnnotationTag.<methodName>
	    NetworkTrafficAnnotationTag \s* \. \s*
	    # Capture group 1: method name.
	    (
	      ''' + '|'.join(JAVA_ANNOTATION_TYPES) + r'''
	    )
	    # Followed by a left-paren.
	    \s*
	    \(
	    ''', re.VERBOSE | re.DOTALL))

	# Maps file extensions to their Language definition. Keys include the leading
	# dot because extract_annotations() looks them up by Path.suffix.
	LANGUAGE_MAPPING: Dict[str, Language] = {
	    '.cc': CPP_LANGUAGE,
	    # .mm files are parsed with the C++ language definition.
	    '.mm': CPP_LANGUAGE,
	    '.java': JAVA_LANGUAGE,
	}

	# Regex that matches an annotation that should only be used in test files.
	# Capture group 1 is the optional "PARTIAL_" prefix.
	TEST_ANNOTATION_REGEX = re.compile(
	    r'\b(PARTIAL_)?TRAFFIC_ANNOTATION_FOR_TESTS\b')

	# Regex that matches a placeholder annotation for a few whitelisted files.
	MISSING_ANNOTATION_REGEX = re.compile(r'\bMISSING_TRAFFIC_ANNOTATION\b')

	# Regex that matches placeholder annotations for unsupported platforms that
	# don't require Network Traffic Annotations compliance. (e.g. iOS)
	NO_ANNOTATION_REGEX = re.compile(r'\bNO_TRAFFIC_ANNOTATION_YET\b')

	# Set of supported file extensions for source code (each includes the dot).
	SUPPORTED_EXTENSIONS = set(LANGUAGE_MAPPING)


	class Annotation:
	  """A network traffic annotation definition found in a source file.

	  Attributes:
	    language: Language used to parse the definition.
	    file_path: Path to the file that contains this annotation.
	    line_number: Line of |file_path| on which the annotation was found.
	    type_name: AnnotationType of the annotation.
	    unique_id: Unique id string of the annotation.
	    extra_id: Second id string; only parsed for the Partial and
	        BranchedCompleting types, empty otherwise.
	    text: Text of the annotation (the last string argument of a definition).
	  """

	  def __init__(self,
	               language: Language,
	               file_path: Path,
	               line_number: int,
	               type_name: AnnotationType,
	               unique_id: str = '',
	               extra_id: str = '',
	               text: str = ''):
	    """Constructs an Annotation object with the given field values.

	    Args:
	      language: Language definition of the file being parsed.
	      file_path: Path to the file that contains this annotation.
	      line_number: Line number of the annotation within the file.
	      type_name: Initial AnnotationType; parse_definition() overwrites it.
	      unique_id: Unique id of the annotation.
	      extra_id: Extra id, used by some annotation types.
	      text: Text of the annotation.
	    """
	    self.language = language
	    self.file_path = file_path
	    self.line_number = line_number
	    self.type_name = type_name
	    self.unique_id = unique_id
	    self.extra_id = extra_id
	    self.text = text

	  def parse_definition(self, re_match: re.Match) -> None:
	    """Parses the annotation and populates object fields.

	    Args:
	      re_match: A Match obtained from the Language's call_detection_regex.

	    Raises:
	      KeyError: if capture group 1 is not a key of language.annotation_types.
	    """
	    definition_function = re_match.group(1)
	    self.type_name = self.language.annotation_types[definition_function]

	    # The match ends on the opening paren, so everything after it is the
	    # argument list. Parsing it populates |unique_id|, |text| and (possibly)
	    # |extra_id|.
	    self._parse_body(re_match.string[re_match.end():])

	  def extractor_output_string(self) -> str:
	    """Returns a string formatted for output.

	    The fields are written one per line, in a fixed order, between a start
	    marker line and an end marker line.
	    """
	    fields = [
	        '==== NEW ANNOTATION ====',
	        self.file_path,
	        self.line_number,
	        self.type_name.value,
	        self.unique_id,
	        self.extra_id,
	        self.text,
	        '==== ANNOTATION ENDS ====',
	    ]
	    return '\n'.join(str(field) for field in fields)

	  def _parse_body(self, body: str) -> None:
	    """Tokenizes and parses the arguments given to the definition function.

	    Args:
	      body: Source text that starts right after the call's opening paren.

	    Raises:
	      AssertionError: if the closing-paren token evaluates as false.
	    """
	    # Don't bother parsing CreateMutableNetworkTrafficAnnotationTag(), we
	    # don't care about its arguments anyways.
	    if self.type_name == AnnotationType.MUTABLE:
	      return

	    tokenizer = Tokenizer(body, self.file_path, self.line_number)

	    # unique_id
	    self.unique_id = self._parse_string(tokenizer)
	    tokenizer.advance('comma')

	    # extra_id (Partial/BranchedCompleting)
	    if self.type_name in (AnnotationType.PARTIAL,
	                          AnnotationType.BRANCHED_COMPLETING):
	      self.extra_id = self._parse_string(tokenizer)
	      tokenizer.advance('comma')

	    # partial_annotation (Completing/BranchedCompleting)
	    if self.type_name in (AnnotationType.COMPLETING,
	                          AnnotationType.BRANCHED_COMPLETING):
	      # Skip the |partial_annotation| argument. It can be a variable_name, or
	      # a FunctionName(), so skip the parentheses if they're there.
	      tokenizer.advance('symbol')
	      if tokenizer.maybe_advance('left_paren'):
	        tokenizer.advance('right_paren')
	      tokenizer.advance('comma')

	    # proto text
	    self.text = self._parse_string(tokenizer)

	    # The function call should end here without any more arguments. The
	    # tokenizer call is kept out of an assert statement so that the token is
	    # still consumed and checked when Python runs with -O (which strips
	    # asserts together with any call made inside them).
	    if not tokenizer.advance('right_paren'):
	      raise AssertionError('Expected the annotation definition to end here.')

	  def _parse_string(self, tokenizer: Tokenizer) -> str:
	    """Parses a string value.

	    It could be a string literal by itself, or multiple string literals
	    concatenated together with '+'.

	    Args:
	      tokenizer: Tokenizer positioned at the first string literal.

	    Returns:
	      The literals joined with a newline between each concatenated pair.
	    """
	    pieces = [tokenizer.advance('string_literal')]
	    # Each 'plus' token must be followed by another string literal.
	    while tokenizer.maybe_advance('plus') is not None:
	      pieces.append(tokenizer.advance('string_literal'))
	    return '\n'.join(pieces)


	def get_line_number_at(string: str, pos: int) -> int:
	  """Finds the line number for the char at position |pos|. 1-indexed.

	  Args:
	    string: Text to scan.
	    pos: Index into |string|; treated like the end bound of a slice.

	  Returns:
	    One plus the number of newline characters in string[:pos].
	  """
	  # This is still O(n), but we only run it once for each annotation
	  # definition, so the effect on total runtime is negligible. str.count()
	  # with bounds avoids copying the prefix and compiling a regex on each call.
	  return 1 + string.count('\n', 0, pos)


	def is_inside_comment(string, pos):
	  """Tells whether position |pos| of |string| appears to be in a comment.

	  This is a bit naive. Only single-line comments (// ...) are recognized,
	  block comments (/* ... */) are not, and any "//" earlier on the line
	  counts, wherever it occurs.

	  Args:
	    string: string to scan.
	    pos: position within the string.

	  Returns:
	    True if |string[pos]| looks like it's inside a C++ comment.
	  """
	  # TODO(crbug/966883): Add multi-line comment support.
	  # Read backwards from |pos| (inclusive) and keep only the part that lies on
	  # the same line. "//" reads the same in both directions, so it can be
	  # searched for directly in the reversed text.
	  same_line_reversed, _, _ = string[pos::-1].partition('\n')
	  return '//' in same_line_reversed


	def extract_annotations(file_path: Path) -> List['Annotation']:
	  """Extracts and returns annotations from the file at |file_path|.

	  Args:
	    file_path: Path to a source file whose suffix is a key of
	        LANGUAGE_MAPPING.

	  Returns:
	    A list of Annotation objects: parsed definition calls first, then test
	    annotations, then MISSING_TRAFFIC_ANNOTATION placeholders, then
	    NO_TRAFFIC_ANNOTATION_YET placeholders. Within each group the annotations
	    are in file order.

	  Raises:
	    ValueError: if the file extension is not in LANGUAGE_MAPPING.
	    Errors raised while parsing a definition propagate to the caller.
	  """
	  language = LANGUAGE_MAPPING.get(file_path.suffix)
	  if language is None:
	    raise ValueError("Unrecognized extension '{}' for file '{}'.".format(
	        file_path.suffix, str(file_path)))

	  # Decode explicitly so the result does not depend on the platform's default
	  # encoding. NOTE(review): assumes the source files are UTF-8 -- confirm.
	  contents = file_path.read_text(encoding='utf-8')

	  def outside_comments(regex):
	    """Yields (match, line_number) for each match not inside a // comment."""
	    for re_match in regex.finditer(contents):
	      if is_inside_comment(contents, re_match.start()):
	        continue
	      yield re_match, get_line_number_at(contents, re_match.start())

	  defs = []

	  # Check for function calls (e.g. DefineNetworkTrafficAnnotation(...))
	  for re_match, line_number in outside_comments(
	      language.call_detection_regex):
	    # The type passed here is a placeholder; parse_definition() sets the real
	    # one from the matched function name.
	    annotation = Annotation(language, file_path, line_number,
	                            AnnotationType.COMPLETE)
	    annotation.parse_definition(re_match)
	    defs.append(annotation)

	  # Check for test annotations (e.g. TRAFFIC_ANNOTATION_FOR_TESTS)
	  for re_match, line_number in outside_comments(TEST_ANNOTATION_REGEX):
	    # Group 1 is the optional "PARTIAL_" prefix.
	    if re_match.group(1):
	      type_name = AnnotationType.PARTIAL
	      unique_id = 'test_partial'
	      extra_id = 'test'
	    else:
	      type_name = AnnotationType.COMPLETE
	      unique_id = 'test'
	      extra_id = ''
	    defs.append(
	        Annotation(language,
	                   file_path,
	                   line_number,
	                   type_name=type_name,
	                   unique_id=unique_id,
	                   extra_id=extra_id,
	                   text='Traffic annotation for unit, browser and other tests'))

	  # Check for MISSING_TRAFFIC_ANNOTATION, then NO_TRAFFIC_ANNOTATION_YET. Both
	  # are reported as complete annotations with a fixed id and text.
	  placeholders = (
	      (MISSING_ANNOTATION_REGEX, 'missing',
	       'Function called without traffic annotation.'),
	      (NO_ANNOTATION_REGEX, 'undefined', 'Nothing here yet.'),
	  )
	  for regex, unique_id, text in placeholders:
	    for _, line_number in outside_comments(regex):
	      defs.append(
	          Annotation(language,
	                     file_path,
	                     line_number,
	                     type_name=AnnotationType.COMPLETE,
	                     unique_id=unique_id,
	                     text=text))

	  return defs


	def main() -> int:
	  """Parses the command line, extracts annotations and prints them.

	  Returns:
	    0 on success, or EX_PARSE_ERROR as soon as one file raises
	    SourceCodeParsingError (no annotation is printed in that case).
	  """
	  parser = argparse.ArgumentParser()
	  parser.add_argument(
	      '--options-file',
	      help='optional file to read options from')
	  # First pass: only --options-file is known; everything else lands in argv.
	  args, argv = parser.parse_known_args()
	  if args.options_file:
	    # The whitespace-separated contents of the options file replace the
	    # remaining command-line arguments. The context manager closes the file
	    # handle instead of leaking it.
	    with open(args.options_file) as options_file:
	      argv = options_file.read().split()

	  parser.add_argument(
	      '--build-path',
	      type=Path,
	      help='Specifies a compiled build directory, e.g. out/Debug.')
	  parser.add_argument(
	      '--generate-compdb', action='store_true',
	      help='Generate a new compile_commands.json before running')
	  parser.add_argument(
	      '--no-filter', action='store_true',
	      help='Do not filter files based on compdb entries')
	  parser.add_argument('file_paths',
	                      nargs='+',
	                      type=Path,
	                      help='List of files to process.')

	  args = parser.parse_args(argv)

	  # compdb_files is only bound (and only read below) when filtering is on.
	  if not args.no_filter:
	    tools = NetworkTrafficAnnotationTools(args.build_path)
	    compdb_files = tools.GetCompDBFiles(args.generate_compdb)

	  annotation_definitions = []

	  # Parse all the files.
	  # TODO(crbug/966883): Do this in parallel.
	  for file_path in args.file_paths:
	    if not args.no_filter and file_path.resolve() not in compdb_files:
	      continue
	    try:
	      annotation_definitions.extend(extract_annotations(file_path))
	    except SourceCodeParsingError:
	      traceback.print_exc()
	      return EX_PARSE_ERROR

	  # Print output.
	  for annotation in annotation_definitions:
	    print(annotation.extractor_output_string())

	  # If all files were successfully checked for annotations but none of them
	  # had any, print something so that the traffic_annotation_auditor knows
	  # there was no error so that the files get checked for deleted annotations.
	  if not annotation_definitions:
	    print('No annotations in these files.')
	  return 0


	# Run main() only when executed as a script, and use its return value as the
	# process exit status.
	if __name__ == '__main__':
	  sys.exit(main())