Migrate to llvm-objdump and llvm-symbolizer

The Android NDK will no longer support tools based on the GNU toolchain.

Hence, migrate ASan symbolization for Android to LLVM-based tools
such as llvm-symbolizer and llvm-objdump.

Bug: 1273402
Cq-Include-Trybots: luci.chromium.try:linux_chromium_asan_rel_ng,win-asan,mac_chromium_asan_rel_ng,android-asan
Change-Id: Icc0c5f8e64e1bb05d49f347c3316850a0ff83e55
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/3303875
Reviewed-by: Andrew Grieve <agrieve@chromium.org>
Reviewed-by: Hans Wennborg <hans@chromium.org>
Reviewed-by: Nico Weber <thakis@chromium.org>
Commit-Queue: Jonathan Wright <jonathan.wright@arm.com>
Cr-Commit-Position: refs/heads/main@{#961488}
NOKEYCHECK=True
GitOrigin-RevId: ffa736ce5f5d7b018b768c29c0cc1ff35152393a
diff --git a/README.chromium b/README.chromium
index e4e184d..bf7bf9f 100644
--- a/README.chromium
+++ b/README.chromium
@@ -19,7 +19,6 @@
 
 The scripts have been modified to better suit Chromium development. Changes
 include, but are not limited to, the following:
-Added memoization of addr2line and objdump.
 Added option to change the amount of symbolization done.
 Updated output directories to be set by environment variable or --flags
 When calling addr2line, check the symbol is a file that looks like it contains
@@ -31,16 +30,15 @@
 Added support for arch=x64 as an alias to arch=x86_64
 Added debug logging and --verbose parameter.
 Used fast ELF symbolizer for symbols.py and tombstones
-Used multiprocessing to pre-process logcat before symbolizing it
 Added code address adjustment for the debuggerd output from pre-M Android
     where relocations are packed.
 Added code to capture java stderr for better handling of native->java crashes.
 Fixed invalid using decl in logging header debug.h
 Only attempt to symbolize with ELF libraries.
 
-Changed the stack script to use llvm symbolizer instead of addr2line,
-objdump, etc, since llvm symbolizer is more efficient in finding
-function names, line numbers etc.
+Changed the stack scripts to use LLVM tools instead of the GNU toolchain.
+Additionally, use llvm-symbolizer instead of addr2line, objdump, etc., since
+llvm-symbolizer is more efficient at finding function names, line numbers, etc.
 
 Speedup symbolization by avoiding unnecessary APK manifest extraction loops.
 
diff --git a/development/scripts/stack.pydeps b/development/scripts/stack.pydeps
index ab65fc3..8a95b21 100644
--- a/development/scripts/stack.pydeps
+++ b/development/scripts/stack.pydeps
@@ -3,8 +3,7 @@
 ../../../../build/android/pylib/__init__.py
 ../../../../build/android/pylib/constants/__init__.py
 ../../../../build/android/pylib/constants/host_paths.py
-../../../../build/android/pylib/symbols/__init__.py
-../../../../build/android/pylib/symbols/elf_symbolizer.py
+../../../../tools/python/llvm_objdump.py
 ../../../../tools/python/llvm_symbolizer.py
 ../../../catapult/devil/devil/__init__.py
 ../../../catapult/devil/devil/android/__init__.py
diff --git a/development/scripts/stack_core.py b/development/scripts/stack_core.py
index e869a14..b0937db 100755
--- a/development/scripts/stack_core.py
+++ b/development/scripts/stack_core.py
@@ -463,10 +463,9 @@
         logging.debug('Identified lib: %s' % area)
         # If a calls b which further calls c and c is inlined to b, we want to
         # display "a -> b -> c" in the stack trace instead of just "a -> c"
-        # To use llvm symbolizer, the hexadecimal address has to start with 0x.
-        info = llvm_symbolizer.GetSymbolInformation(
-            os.path.join(symbol.SYMBOLS_DIR, symbol.TranslateLibPath(area)),
-            '0x' + code_addr)
+        library = os.path.join(symbol.SYMBOLS_DIR,
+                               symbol.TranslateLibPath(area))
+        info = llvm_symbolizer.GetSymbolInformation(library, int(code_addr,16))
         logging.debug('symbol information: %s' % info)
         nest_count = len(info) - 1
         for source_symbol, source_location in info:
@@ -481,15 +480,16 @@
             trace_lines.append((code_addr,
                                 source_symbol,
                                 source_location))
+
     match = _VALUE_LINE.match(line)
     if match:
       (_, addr, value, area, _, symbol_name) = match.groups()
       if area == UNKNOWN or area == HEAP or area == STACK or not area:
         value_lines.append((addr, value, '', area))
       else:
-        info = llvm_symbolizer.GetSymbolInformation(
-            os.path.join(symbol.SYMBOLS_DIR, symbol.TranslateLibPath(area)),
-            '0x' + value)
+        library = os.path.join(symbol.SYMBOLS_DIR,
+                               symbol.TranslateLibPath(area))
+        info = llvm_symbolizer.GetSymbolInformation(library, int(value,16))
         source_symbol, source_location = info.pop()
 
         value_lines.append((addr,
diff --git a/development/scripts/stack_test.py b/development/scripts/stack_test.py
index 4d53fa9..d5609b1 100755
--- a/development/scripts/stack_test.py
+++ b/development/scripts/stack_test.py
@@ -56,7 +56,7 @@
   def __init__(self, directory):
     self._lib_directory = directory
 
-  def GetSymbolInformation(self, library, address_string):
+  def GetSymbolInformation(self, library, address):
     basename = os.path.basename(library)
     local_file = os.path.join(self._lib_directory, basename)
 
@@ -67,7 +67,6 @@
 
     # If the address isn't in the library, LLVM symbolizer yields ??.
     lib_size = os.stat(local_file).st_size
-    address = int(address_string, 16)
     if address >= lib_size:
       return [('??', '??:0:0')]
 
diff --git a/development/scripts/symbol.py b/development/scripts/symbol.py
index f0a2dea..c8a36b9 100755
--- a/development/scripts/symbol.py
+++ b/development/scripts/symbol.py
@@ -34,11 +34,16 @@
                                 'build', 'android'))
 from pylib import constants
 from pylib.constants import host_paths
-from pylib.symbols import elf_symbolizer
 
+sys.path.insert(0, os.path.join(os.path.dirname(__file__),
+                                os.pardir, os.pardir, os.pardir, os.pardir,
+                                'tools', 'python'))
+from llvm_symbolizer import LLVMSymbolizer, IsValidLLVMSymbolizerTarget
+
+from llvm_objdump import LLVMObjdumper
 
 # WARNING: These global variables can be modified by other scripts!
-SYMBOLS_DIR = constants.DIR_SOURCE_ROOT
+SYMBOLS_DIR = constants.DIR_SOURCE_ROOT + os.sep
 CHROME_SYMBOLS_DIR = None
 ARCH = "arm"
 
@@ -220,8 +225,7 @@
     A list of matching library filenames for library_name.
   """
   def extant_library(filename):
-    if (os.path.exists(filename)
-        and elf_symbolizer.ContainsElfMagic(filename)):
+    if (os.path.exists(filename) and IsValidLLVMSymbolizerTarget(filename)):
       return [filename]
     return []
 
@@ -313,12 +317,15 @@
   if not lib:
     return None
 
-  addr_to_line = _CallAddr2LineForSet(lib, unique_addrs, cpu_arch)
+  symbols = SYMBOLS_DIR + lib
+
+  addr_to_line = _CallAddr2LineForSet(symbols, unique_addrs)
+
   if not addr_to_line:
     return None
 
   if get_detailed_info:
-    addr_to_objdump = _CallObjdumpForSet(lib, unique_addrs, cpu_arch)
+    addr_to_objdump = _CallObjdumpForSet(symbols, unique_addrs, cpu_arch)
     if not addr_to_objdump:
       return None
   else:
@@ -328,53 +335,27 @@
   for addr in unique_addrs:
     source_info = addr_to_line.get(addr)
     if not source_info:
-      source_info = [(None, None)]
+      source_info = [(None,None)]
+
     if addr in addr_to_objdump:
       (object_symbol, object_offset) = addr_to_objdump.get(addr)
       object_symbol_with_offset = _FormatSymbolWithOffset(object_symbol,
                                                           object_offset)
     else:
       object_symbol_with_offset = None
+
     result[addr] = [(source_symbol, source_location, object_symbol_with_offset)
         for (source_symbol, source_location) in source_info]
 
   return result
 
 
-class _MemoizedForSet:
-  """Decorator class used to memoize CallXXXForSet() results."""
-  def __init__(self, fn):
-    self.fn = fn
-    self.cache = {}
-    self.cpu_arch = None
-
-  def __call__(self, lib, unique_addrs, cpu_arch):
-    if self.cpu_arch is None:
-      self.cpu_arch = cpu_arch
-    else:
-      # Sanity check, this doesn't expect cpu_arch to change.
-      assert self.cpu_arch == cpu_arch
-
-    lib_cache = self.cache.setdefault(lib, {})
-
-    uncached_addrs = [k for k in unique_addrs if k not in lib_cache]
-    if uncached_addrs:
-      lib_cache.update((k, None) for k in uncached_addrs)
-      result = self.fn(lib, uncached_addrs, cpu_arch)
-      if result:
-        lib_cache.update(result)
-
-    return dict((k, lib_cache[k]) for k in unique_addrs if lib_cache[k])
-
-
-@_MemoizedForSet
-def _CallAddr2LineForSet(lib, unique_addrs, cpu_arch):
+def _CallAddr2LineForSet(lib, unique_addrs):
   """Look up line and symbol information for a set of addresses.
 
   Args:
     lib: library (or executable) pathname containing symbols
     unique_addrs: set of string hexidecimal addresses look up.
-    cpu_arch: Target CPU architecture.
 
   Returns:
     A dictionary of the form {addr: [(symbol, file:line)]} where
@@ -388,55 +369,23 @@
   if not lib:
     return None
 
-  symbols = SYMBOLS_DIR + lib
-  if not os.path.splitext(symbols)[1] in ['', '.so', '.apk']:
+  if not os.path.splitext(lib)[1] in ['', '.so', '.apk']:
     return None
 
-  if not os.path.isfile(symbols):
+  if not os.path.isfile(lib):
     return None
 
-  addrs = sorted(unique_addrs)
+  sorted_addrs = sorted(unique_addrs)
+
   result = {}
 
-  def _Callback(sym, addr):
-    records = []
-    while sym:  # Traverse all the inlines following the |inlined_by| chain.
-      if sym.source_path and sym.source_line:
-        location = '%s:%d' % (sym.source_path, sym.source_line)
-      else:
-        location = None
-      records += [(sym.name, location)]
-      sym = sym.inlined_by
-    result[addr] = records
+  with LLVMSymbolizer() as llvm_symbolizer:
+    for addr in sorted_addrs:
+      result[addr] = llvm_symbolizer.GetSymbolInformation(lib,addr)
 
-  symbolizer = elf_symbolizer.ELFSymbolizer(
-      elf_file_path=symbols,
-      addr2line_path=host_paths.ToolPath("addr2line", cpu_arch),
-      callback=_Callback,
-      inlines=True)
-
-  for addr in addrs:
-    symbolizer.SymbolizeAsync(int(addr, 16), addr)
-  symbolizer.Join()
   return result
 
 
-def _StripPC(addr, cpu_arch):
-  """Strips the Thumb bit a program counter address when appropriate.
-
-  Args:
-    addr: the program counter address
-    cpu_arch: Target CPU architecture.
-
-  Returns:
-    The stripped program counter address.
-  """
-  if cpu_arch == "arm":
-    return addr & ~1
-  return addr
-
-
-@_MemoizedForSet
 def _CallObjdumpForSet(lib, unique_addrs, cpu_arch):
   """Use objdump to find out the names of the containing functions.
 
@@ -448,72 +397,20 @@
   Returns:
     A dictionary of the form {addr: (string symbol, offset)}.
   """
+
   if not lib:
     return None
 
-  symbols = SYMBOLS_DIR + lib
-  if not os.path.exists(symbols):
-    return None
-
-  symbols = SYMBOLS_DIR + lib
-  if not os.path.exists(symbols):
+  if not os.path.exists(lib):
     return None
 
   result = {}
 
-  # Function lines look like:
-  #   000177b0 <android::IBinder::~IBinder()+0x2c>:
-  # We pull out the address and function first. Then we check for an optional
-  # offset. This is tricky due to functions that look like "operator+(..)+0x2c"
-  func_regexp = re.compile("(^[a-f0-9]*) \<(.*)\>:$")
-  offset_regexp = re.compile("(.*)\+0x([a-f0-9]*)")
-
-  # A disassembly line looks like:
-  #   177b2:  b510        push  {r4, lr}
-  asm_regexp = re.compile("(^[ a-f0-9]*):[ a-f0-0]*.*$")
-
-  for target_addr in unique_addrs:
-    start_addr_dec = str(_StripPC(int(target_addr, 16), cpu_arch))
-    stop_addr_dec = str(_StripPC(int(target_addr, 16), cpu_arch) + 8)
-    cmd = [host_paths.ToolPath("objdump", cpu_arch),
-           "--section=.text",
-           "--demangle",
-           "--disassemble",
-           "--start-address=" + start_addr_dec,
-           "--stop-address=" + stop_addr_dec,
-           symbols]
-
-    current_symbol = None    # The current function symbol in the disassembly.
-    current_symbol_addr = 0  # The address of the current function.
-
-    stream = subprocess.Popen(cmd, stdout=subprocess.PIPE).stdout
-    for line in stream:
-      # Is it a function line like:
-      #   000177b0 <android::IBinder::~IBinder()>:
-      components = func_regexp.match(line)
-      if components:
-        # This is a new function, so record the current function and its
-        # address.
-        current_symbol_addr = int(components.group(1), 16)
-        current_symbol = components.group(2)
-
-        # Does it have an optional offset like: "foo(..)+0x2c"?
-        components = offset_regexp.match(current_symbol)
-        if components:
-          current_symbol = components.group(1)
-          offset = components.group(2)
-          if offset:
-            current_symbol_addr -= int(offset, 16)
-
-      # Is it an disassembly line like:
-      #   177b2:  b510        push  {r4, lr}
-      components = asm_regexp.match(line)
-      if components:
-        addr = components.group(1)
-        i_addr = int(addr, 16)
-        i_target = _StripPC(int(target_addr, 16), cpu_arch)
-        if i_addr == i_target:
-          result[target_addr] = (current_symbol, i_target - current_symbol_addr)
-    stream.close()
+  with LLVMObjdumper() as llvm_objdumper:
+    for current_address in unique_addrs:
+      symbol_data = llvm_objdumper.GetSymbolInformation(lib=lib,
+                                                        address=current_address,
+                                                        cpu_arch=cpu_arch)
+      result[current_address] = (symbol_data.symbol, symbol_data.offset)
 
   return result