tools/wasm-sourcemap.py - external/github.com/emscripten-core/emscripten - Git at Google

 #!/usr/bin/env python
 # Copyright 2018 The Emscripten Authors.  All rights reserved.
 # Emscripten is available under two separate licenses, the MIT license and the
 # University of Illinois/NCSA Open Source License.  Both these licenses can be
 # found in the LICENSE file.

 """Utility tools that extracts DWARF information encoded in a wasm output
 produced by the LLVM tools, and encodes it as a wasm source map. Additionally,
 it can collect original sources, change files prefixes, and strip debug
 sections from a wasm file.
 """

 import argparse
 from collections import OrderedDict, namedtuple
 import json
 import logging
 from math import floor, log
 import os
 import re
 from subprocess import Popen, PIPE
 import sys

 sys.path.insert(1, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

 from tools.shared import asstr

 logger = logging.getLogger('wasm-sourcemap')


 def parse_args():
   parser = argparse.ArgumentParser(prog='wasm-sourcemap.py', description=__doc__)
   parser.add_argument('wasm', help='wasm file')
   parser.add_argument('-o', '--output', help='output source map')
   parser.add_argument('-p', '--prefix', nargs='*', help='replace source debug filename prefix for source map', default=[])
   parser.add_argument('-s', '--sources', action='store_true', help='read and embed source files from file system into source map')
   parser.add_argument('-l', '--load-prefix', nargs='*', help='replace source debug filename prefix for reading sources from file system (see also --sources)', default=[])
   parser.add_argument('-w', nargs='?', help='set output wasm file')
   parser.add_argument('-x', '--strip', action='store_true', help='removes debug and linking sections')
   parser.add_argument('-u', '--source-map-url', nargs='?', help='specifies sourceMappingURL section contest')
   parser.add_argument('--dwarfdump', help="path to llvm-dwarfdump executable")
   parser.add_argument('--dwarfdump-output', nargs='?', help=argparse.SUPPRESS)
   return parser.parse_args()


 class Prefixes:
   def __init__(self, args):
     prefixes = []
     for p in args:
       if '=' in p:
         prefix, replacement = p.split('=')
         prefixes.append({'prefix': prefix, 'replacement': replacement})
       else:
         prefixes.append({'prefix': p, 'replacement': None})
     self.prefixes = prefixes
     self.cache = {}

   def resolve(self, name):
     if name in self.cache:
       return self.cache[name]

     result = name
     for p in self.prefixes:
       if name.startswith(p['prefix']):
         if p['replacement'] is None:
           result = name[len(p['prefix'])::]
         else:
           result = p['replacement'] + name[len(p['prefix'])::]
         break
     self.cache[name] = result
     return result


 # SourceMapPrefixes contains resolver for file names that are:
 #  - "sources" is for names that output to source maps JSON
 #  - "load" is for paths that used to load source text
 SourceMapPrefixes = namedtuple('SourceMapPrefixes', 'sources, load')


 def encode_vlq(n):
   VLQ_CHARS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"
   x = (n << 1) if n >= 0 else ((-n << 1) + 1)
   result = ""
   while x > 31:
     result = result + VLQ_CHARS[32 + (x & 31)]
     x = x >> 5
   return result + VLQ_CHARS[x]


 def read_var_uint(wasm, pos):
   n = 0
   shift = 0
   b = ord(wasm[pos:pos + 1])
   pos = pos + 1
   while b >= 128:
     n = n | ((b - 128) << shift)
     b = ord(wasm[pos:pos + 1])
     pos = pos + 1
     shift += 7
   return n + (b << shift), pos


 def strip_debug_sections(wasm):
   logger.debug('Strip debug sections')
   pos = 8
   stripped = wasm[:pos]

   while pos < len(wasm):
     section_start = pos
     section_id, pos_ = read_var_uint(wasm, pos)
     section_size, section_body = read_var_uint(wasm, pos_)
     pos = section_body + section_size
     if section_id == 0:
       name_len, name_pos = read_var_uint(wasm, section_body)
       name_end = name_pos + name_len
       name = wasm[name_pos:name_end]
       if name == "linking" or name == "sourceMappingURL" or name.startswith("reloc..debug_") or name.startswith(".debug_"):
         continue  # skip debug related sections
     stripped = stripped + wasm[section_start:pos]

   return stripped


 def encode_uint_var(n):
   result = bytearray()
   while n > 127:
     result.append(128 | (n & 127))
     n = n >> 7
   result.append(n)
   return bytes(result)


 def append_source_mapping(wasm, url):
   logger.debug('Append sourceMappingURL section')
   section_name = "sourceMappingURL"
   section_content = encode_uint_var(len(section_name)) + section_name + encode_uint_var(len(url)) + url
   return wasm + encode_uint_var(0) + encode_uint_var(len(section_content)) + section_content


 def get_code_section_offset(wasm):
   logger.debug('Read sections index')
   pos = 8

   while pos < len(wasm):
     section_id, pos_ = read_var_uint(wasm, pos)
     section_size, pos = read_var_uint(wasm, pos_)
     if section_id == 10:
       return pos
     pos = pos + section_size


 def remove_dead_entries(entries):
   # Remove entries for dead functions. It is a heuristics to ignore data if the
   # function starting address near to 0 (is equal to its size field length).
   block_start = 0
   cur_entry = 0
   while cur_entry < len(entries):
     if not entries[cur_entry]['eos']:
       cur_entry += 1
       continue
     fn_start = entries[block_start]['address']
     # Calculate the LEB encoded function size (including size field)
     fn_size_length = floor(log(entries[cur_entry]['address'] - fn_start + 1, 128)) + 1
     min_live_offset = 1 + fn_size_length # 1 byte is for code section entries
     if fn_start < min_live_offset:
       # Remove dead code debug info block.
       del entries[block_start:cur_entry + 1]
       cur_entry = block_start
       continue
     cur_entry += 1
     block_start = cur_entry


 def read_dwarf_entries(wasm, options):
   if options.dwarfdump_output:
     output = open(options.dwarfdump_output, 'r').read()
   elif options.dwarfdump:
     logger.debug('Reading DWARF information from %s' % wasm)
     if not os.path.exists(options.dwarfdump):
       logger.error('llvm-dwarfdump not found: ' + options.dwarfdump)
       sys.exit(1)
     process = Popen([options.dwarfdump, "-debug-info", "-debug-line", wasm], stdout=PIPE)
     output, err = process.communicate()
     exit_code = process.wait()
     if exit_code != 0:
       logger.error('Error during llvm-dwarfdump execution (%s)' % exit_code)
       sys.exit(1)
   else:
     logger.error('Please specify either --dwarfdump or --dwarfdump-output')
     sys.exit(1)

   entries = []
   debug_line_chunks = re.split(r"debug_line\[(0x[0-9a-f]*)\]", asstr(output))
   maybe_debug_info_content = debug_line_chunks[0]
   for i in range(1, len(debug_line_chunks), 2):
     stmt_list = debug_line_chunks[i]
     comp_dir_match = re.search(r"DW_AT_stmt_list\s+\(" + stmt_list + r"\)\s+" +
                                r"DW_AT_comp_dir\s+\(\"([^\"]+)", maybe_debug_info_content)
     comp_dir = comp_dir_match.group(1) if comp_dir_match is not None else ""

     line_chunk = debug_line_chunks[i + 1]

     # include_directories[  1] = "/Users/yury/Work/junk/sqlite-playground/src"
     # file_names[  1]:
     #            name: "playground.c"
     #       dir_index: 1
     #        mod_time: 0x00000000
     #          length: 0x00000000
     #
     # Address            Line   Column File   ISA Discriminator Flags
     # ------------------ ------ ------ ------ --- ------------- -------------
     # 0x0000000000000006     22      0      1   0             0  is_stmt
     # 0x0000000000000007     23     10      1   0             0  is_stmt prologue_end
     # 0x000000000000000f     23      3      1   0             0
     # 0x0000000000000010     23      3      1   0             0  end_sequence
     # 0x0000000000000011     28      0      1   0             0  is_stmt

     include_directories = {'0': comp_dir}
     for dir in re.finditer(r"include_directories\[\s*(\d+)\] = \"([^\"]*)", line_chunk):
       include_directories[dir.group(1)] = dir.group(2)

     files = {}
     for file in re.finditer(r"file_names\[\s*(\d+)\]:\s+name: \"([^\"]*)\"\s+dir_index: (\d+)", line_chunk):
       dir = include_directories[file.group(3)]
       file_path = (dir + '/' if file.group(2)[0] != '/' else '') + file.group(2)
       files[file.group(1)] = file_path

     for line in re.finditer(r"\n0x([0-9a-f]+)\s+(\d+)\s+(\d+)\s+(\d+)(.*?end_sequence)?", line_chunk):
       entry = {'address': int(line.group(1), 16), 'line': int(line.group(2)), 'column': int(line.group(3)), 'file': files[line.group(4)], 'eos': line.group(5) is not None}
       if not entry['eos']:
         entries.append(entry)
       else:
         # move end of function to the last END operator
         entry['address'] -= 1
         if entries[-1]['address'] == entry['address']:
           # last entry has the same address, reusing
           entries[-1]['eos'] = True
         else:
           entries.append(entry)

   remove_dead_entries(entries)

   # return entries sorted by the address field
   return sorted(entries, key=lambda entry: entry['address'])


 def build_sourcemap(entries, code_section_offset, prefixes, collect_sources):
   sources = []
   sources_content = [] if collect_sources else None
   mappings = []

   sources_map = {}
   last_address = 0
   last_source_id = 0
   last_line = 1
   last_column = 1
   for entry in entries:
     line = entry['line']
     column = entry['column']
     # ignore entries with line 0
     if line == 0:
       continue
     # start at least at column 1
     if column == 0:
       column = 1
     address = entry['address'] + code_section_offset
     file_name = entry['file']
     source_name = prefixes.sources.resolve(file_name)
     if source_name not in sources_map:
       source_id = len(sources)
       sources_map[source_name] = source_id
       sources.append(source_name)
       if collect_sources:
         load_name = prefixes.load.resolve(file_name)
         try:
           with open(load_name, 'r') as infile:
             source_content = infile.read()
           sources_content.append(source_content)
         except:
           print('Failed to read source: %s' % load_name)
           sources_content.append(None)
     else:
       source_id = sources_map[source_name]

     address_delta = address - last_address
     source_id_delta = source_id - last_source_id
     line_delta = line - last_line
     column_delta = column - last_column
     mappings.append(encode_vlq(address_delta) + encode_vlq(source_id_delta) + encode_vlq(line_delta) + encode_vlq(column_delta))
     last_address = address
     last_source_id = source_id
     last_line = line
     last_column = column
   return OrderedDict([('version', 3),
                       ('names', []),
                       ('sources', sources),
                       ('sourcesContent', sources_content),
                       ('mappings', ','.join(mappings))])


 def main():
   options = parse_args()

   wasm_input = options.wasm
   with open(wasm_input, 'rb') as infile:
     wasm = infile.read()

   entries = read_dwarf_entries(wasm_input, options)

   code_section_offset = get_code_section_offset(wasm)

   prefixes = SourceMapPrefixes(sources=Prefixes(options.prefix), load=Prefixes(options.load_prefix))

   logger.debug('Saving to %s' % options.output)
   map = build_sourcemap(entries, code_section_offset, prefixes, options.sources)
   with open(options.output, 'w') as outfile:
     json.dump(map, outfile, separators=(',', ':'))

   if options.strip:
     wasm = strip_debug_sections(wasm)

   if options.source_map_url:
     wasm = append_source_mapping(wasm, options.source_map_url)

   if options.w:
     logger.debug('Saving wasm to %s' % options.w)
     with open(options.w, 'wb') as outfile:
       outfile.write(wasm)

   logger.debug('Done')
   return 0


 if __name__ == '__main__':
   logging.basicConfig(level=logging.DEBUG if os.environ.get('EMCC_DEBUG') else logging.INFO)
   sys.exit(main())
	#!/usr/bin/env python
	# Copyright 2018 The Emscripten Authors. All rights reserved.
	# Emscripten is available under two separate licenses, the MIT license and the
	# University of Illinois/NCSA Open Source License. Both these licenses can be
	# found in the LICENSE file.

	"""Utility tools that extracts DWARF information encoded in a wasm output
	produced by the LLVM tools, and encodes it as a wasm source map. Additionally,
	it can collect original sources, change files prefixes, and strip debug
	sections from a wasm file.
	"""

	import argparse
	from collections import OrderedDict, namedtuple
	import json
	import logging
	from math import floor, log
	import os
	import re
	from subprocess import Popen, PIPE
	import sys

	sys.path.insert(1, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

	from tools.shared import asstr

	logger = logging.getLogger('wasm-sourcemap')


	def parse_args():
	parser = argparse.ArgumentParser(prog='wasm-sourcemap.py', description=__doc__)
	parser.add_argument('wasm', help='wasm file')
	parser.add_argument('-o', '--output', help='output source map')
	parser.add_argument('-p', '--prefix', nargs='*', help='replace source debug filename prefix for source map', default=[])
	parser.add_argument('-s', '--sources', action='store_true', help='read and embed source files from file system into source map')
	parser.add_argument('-l', '--load-prefix', nargs='*', help='replace source debug filename prefix for reading sources from file system (see also --sources)', default=[])
	parser.add_argument('-w', nargs='?', help='set output wasm file')
	parser.add_argument('-x', '--strip', action='store_true', help='removes debug and linking sections')
	parser.add_argument('-u', '--source-map-url', nargs='?', help='specifies sourceMappingURL section contest')
	parser.add_argument('--dwarfdump', help="path to llvm-dwarfdump executable")
	parser.add_argument('--dwarfdump-output', nargs='?', help=argparse.SUPPRESS)
	return parser.parse_args()


	class Prefixes:
	def __init__(self, args):
	prefixes = []
	for p in args:
	if '=' in p:
	prefix, replacement = p.split('=')
	prefixes.append({'prefix': prefix, 'replacement': replacement})
	else:
	prefixes.append({'prefix': p, 'replacement': None})
	self.prefixes = prefixes
	self.cache = {}

	def resolve(self, name):
	if name in self.cache:
	return self.cache[name]

	result = name
	for p in self.prefixes:
	if name.startswith(p['prefix']):
	if p['replacement'] is None:
	result = name[len(p['prefix'])::]
	else:
	result = p['replacement'] + name[len(p['prefix'])::]
	break
	self.cache[name] = result
	return result


	# SourceMapPrefixes contains resolver for file names that are:
	# - "sources" is for names that output to source maps JSON
	# - "load" is for paths that used to load source text
	SourceMapPrefixes = namedtuple('SourceMapPrefixes', 'sources, load')


	def encode_vlq(n):
	VLQ_CHARS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"
	x = (n << 1) if n >= 0 else ((-n << 1) + 1)
	result = ""
	while x > 31:
	result = result + VLQ_CHARS[32 + (x & 31)]
	x = x >> 5
	return result + VLQ_CHARS[x]


	def read_var_uint(wasm, pos):
	n = 0
	shift = 0
	b = ord(wasm[pos:pos + 1])
	pos = pos + 1
	while b >= 128:
	n = n \| ((b - 128) << shift)
	b = ord(wasm[pos:pos + 1])
	pos = pos + 1
	shift += 7
	return n + (b << shift), pos


	def strip_debug_sections(wasm):
	logger.debug('Strip debug sections')
	pos = 8
	stripped = wasm[:pos]

	while pos < len(wasm):
	section_start = pos
	section_id, pos_ = read_var_uint(wasm, pos)
	section_size, section_body = read_var_uint(wasm, pos_)
	pos = section_body + section_size
	if section_id == 0:
	name_len, name_pos = read_var_uint(wasm, section_body)
	name_end = name_pos + name_len
	name = wasm[name_pos:name_end]
	if name == "linking" or name == "sourceMappingURL" or name.startswith("reloc..debug_") or name.startswith(".debug_"):
	continue # skip debug related sections
	stripped = stripped + wasm[section_start:pos]

	return stripped


	def encode_uint_var(n):
	result = bytearray()
	while n > 127:
	result.append(128 \| (n & 127))
	n = n >> 7
	result.append(n)
	return bytes(result)


	def append_source_mapping(wasm, url):
	logger.debug('Append sourceMappingURL section')
	section_name = "sourceMappingURL"
	section_content = encode_uint_var(len(section_name)) + section_name + encode_uint_var(len(url)) + url
	return wasm + encode_uint_var(0) + encode_uint_var(len(section_content)) + section_content


	def get_code_section_offset(wasm):
	logger.debug('Read sections index')
	pos = 8

	while pos < len(wasm):
	section_id, pos_ = read_var_uint(wasm, pos)
	section_size, pos = read_var_uint(wasm, pos_)
	if section_id == 10:
	return pos
	pos = pos + section_size


	def remove_dead_entries(entries):
	# Remove entries for dead functions. It is a heuristics to ignore data if the
	# function starting address near to 0 (is equal to its size field length).
	block_start = 0
	cur_entry = 0
	while cur_entry < len(entries):
	if not entries[cur_entry]['eos']:
	cur_entry += 1
	continue
	fn_start = entries[block_start]['address']
	# Calculate the LEB encoded function size (including size field)
	fn_size_length = floor(log(entries[cur_entry]['address'] - fn_start + 1, 128)) + 1
	min_live_offset = 1 + fn_size_length # 1 byte is for code section entries
	if fn_start < min_live_offset:
	# Remove dead code debug info block.
	del entries[block_start:cur_entry + 1]
	cur_entry = block_start
	continue
	cur_entry += 1
	block_start = cur_entry


	def read_dwarf_entries(wasm, options):
	if options.dwarfdump_output:
	output = open(options.dwarfdump_output, 'r').read()
	elif options.dwarfdump:
	logger.debug('Reading DWARF information from %s' % wasm)
	if not os.path.exists(options.dwarfdump):
	logger.error('llvm-dwarfdump not found: ' + options.dwarfdump)
	sys.exit(1)
	process = Popen([options.dwarfdump, "-debug-info", "-debug-line", wasm], stdout=PIPE)
	output, err = process.communicate()
	exit_code = process.wait()
	if exit_code != 0:
	logger.error('Error during llvm-dwarfdump execution (%s)' % exit_code)
	sys.exit(1)
	else:
	logger.error('Please specify either --dwarfdump or --dwarfdump-output')
	sys.exit(1)

	entries = []
	debug_line_chunks = re.split(r"debug_line\[(0x[0-9a-f]*)\]", asstr(output))
	maybe_debug_info_content = debug_line_chunks[0]
	for i in range(1, len(debug_line_chunks), 2):
	stmt_list = debug_line_chunks[i]
	comp_dir_match = re.search(r"DW_AT_stmt_list\s+\(" + stmt_list + r"\)\s+" +
	r"DW_AT_comp_dir\s+\(\"([^\"]+)", maybe_debug_info_content)
	comp_dir = comp_dir_match.group(1) if comp_dir_match is not None else ""

	line_chunk = debug_line_chunks[i + 1]

	# include_directories[ 1] = "/Users/yury/Work/junk/sqlite-playground/src"
	# file_names[ 1]:
	# name: "playground.c"
	# dir_index: 1
	# mod_time: 0x00000000
	# length: 0x00000000
	#
	# Address Line Column File ISA Discriminator Flags
	# ------------------ ------ ------ ------ --- ------------- -------------
	# 0x0000000000000006 22 0 1 0 0 is_stmt
	# 0x0000000000000007 23 10 1 0 0 is_stmt prologue_end
	# 0x000000000000000f 23 3 1 0 0
	# 0x0000000000000010 23 3 1 0 0 end_sequence
	# 0x0000000000000011 28 0 1 0 0 is_stmt

	include_directories = {'0': comp_dir}
	for dir in re.finditer(r"include_directories\[\s(\d+)\] = \"([^\"])", line_chunk):
	include_directories[dir.group(1)] = dir.group(2)

	files = {}
	for file in re.finditer(r"file_names\[\s(\d+)\]:\s+name: \"([^\"])\"\s+dir_index: (\d+)", line_chunk):
	dir = include_directories[file.group(3)]
	file_path = (dir + '/' if file.group(2)[0] != '/' else '') + file.group(2)
	files[file.group(1)] = file_path

	for line in re.finditer(r"\n0x([0-9a-f]+)\s+(\d+)\s+(\d+)\s+(\d+)(.*?end_sequence)?", line_chunk):
	entry = {'address': int(line.group(1), 16), 'line': int(line.group(2)), 'column': int(line.group(3)), 'file': files[line.group(4)], 'eos': line.group(5) is not None}
	if not entry['eos']:
	entries.append(entry)
	else:
	# move end of function to the last END operator
	entry['address'] -= 1
	if entries[-1]['address'] == entry['address']:
	# last entry has the same address, reusing
	entries[-1]['eos'] = True
	else:
	entries.append(entry)

	remove_dead_entries(entries)

	# return entries sorted by the address field
	return sorted(entries, key=lambda entry: entry['address'])


	def build_sourcemap(entries, code_section_offset, prefixes, collect_sources):
	sources = []
	sources_content = [] if collect_sources else None
	mappings = []

	sources_map = {}
	last_address = 0
	last_source_id = 0
	last_line = 1
	last_column = 1
	for entry in entries:
	line = entry['line']
	column = entry['column']
	# ignore entries with line 0
	if line == 0:
	continue
	# start at least at column 1
	if column == 0:
	column = 1
	address = entry['address'] + code_section_offset
	file_name = entry['file']
	source_name = prefixes.sources.resolve(file_name)
	if source_name not in sources_map:
	source_id = len(sources)
	sources_map[source_name] = source_id
	sources.append(source_name)
	if collect_sources:
	load_name = prefixes.load.resolve(file_name)
	try:
	with open(load_name, 'r') as infile:
	source_content = infile.read()
	sources_content.append(source_content)
	except:
	print('Failed to read source: %s' % load_name)
	sources_content.append(None)
	else:
	source_id = sources_map[source_name]

	address_delta = address - last_address
	source_id_delta = source_id - last_source_id
	line_delta = line - last_line
	column_delta = column - last_column
	mappings.append(encode_vlq(address_delta) + encode_vlq(source_id_delta) + encode_vlq(line_delta) + encode_vlq(column_delta))
	last_address = address
	last_source_id = source_id
	last_line = line
	last_column = column
	return OrderedDict([('version', 3),
	('names', []),
	('sources', sources),
	('sourcesContent', sources_content),
	('mappings', ','.join(mappings))])


	def main():
	options = parse_args()

	wasm_input = options.wasm
	with open(wasm_input, 'rb') as infile:
	wasm = infile.read()

	entries = read_dwarf_entries(wasm_input, options)

	code_section_offset = get_code_section_offset(wasm)

	prefixes = SourceMapPrefixes(sources=Prefixes(options.prefix), load=Prefixes(options.load_prefix))

	logger.debug('Saving to %s' % options.output)
	map = build_sourcemap(entries, code_section_offset, prefixes, options.sources)
	with open(options.output, 'w') as outfile:
	json.dump(map, outfile, separators=(',', ':'))

	if options.strip:
	wasm = strip_debug_sections(wasm)

	if options.source_map_url:
	wasm = append_source_mapping(wasm, options.source_map_url)

	if options.w:
	logger.debug('Saving wasm to %s' % options.w)
	with open(options.w, 'wb') as outfile:
	outfile.write(wasm)

	logger.debug('Done')
	return 0


	if __name__ == '__main__':
	logging.basicConfig(level=logging.DEBUG if os.environ.get('EMCC_DEBUG') else logging.INFO)
	sys.exit(main())