blob: b53e6abe3fe4dde9f38f9c7197c1678718fb8bd8 [file] [log] [blame]
# Copyright 2020 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Generates C++ code representing structured data objects from
schema.org.

This script generates C++ objects based on a JSON-LD schema file. Blink uses the
generated code to scrape data from web pages.
"""
import os
import sys
import json
import argparse
_current_dir = os.path.dirname(os.path.realpath(__file__))
# jinja2 is in chromium's third_party directory, two levels above this script.
# Insert at index 1 so it overrides system libraries while still coming after
# path[0], which is the script's own directory.
sys.path.insert(
    1, os.path.join(_current_dir, *([os.pardir] * 2 + ['third_party'])))
import jinja2
from jinja2 import Environment, PackageLoader, select_autoescape
# Templates are loaded from the generate_schema_org_code package. The two
# whitespace options keep block tags from leaving stray newlines/indentation
# in the rendered output.
env = Environment(loader=PackageLoader('generate_schema_org_code', ''))
env.trim_blocks = True
env.lstrip_blocks = True
def schema_org_id(object_name):
    """Return the full ID for object_name by prepending SCHEMA_ORG_PREFIX."""
    full_id = SCHEMA_ORG_PREFIX + object_name
    return full_id
def object_name_from_id(the_id):
    """Get the object name from an ID by dropping its leading prefix.

    The prefix is removed by length only: the_id is not checked to actually
    start with SCHEMA_ORG_PREFIX.
    """
    prefix_length = len(SCHEMA_ORG_PREFIX)
    return the_id[prefix_length:]
def get_schema_obj(obj_id, schema):
    """Find the single object in schema['@graph'] whose '@id' is obj_id.

    Returns the matching object, or None when there is no match or when more
    than one object carries that ID.
    """
    hits = list(filter(lambda node: node['@id'] == obj_id, schema['@graph']))
    if len(hits) != 1:
        return None
    return hits[0]
def is_enum_type(class_obj):
    """Return True if class_obj directly subclasses the Enumeration type.

    class_obj['rdfs:subClassOf'] may be a single parent reference or a list of
    them; the class counts as an enum when any listed parent's '@id' is the
    Enumeration ID. A class with no 'rdfs:subClassOf' entry is not an enum.
    """
    if 'rdfs:subClassOf' not in class_obj:
        # Explicit False rather than falling off the end and returning None,
        # so the predicate always yields a bool.
        return False

    parent_class = class_obj['rdfs:subClassOf']
    if isinstance(parent_class, list):
        return any(parent['@id'] == schema_org_id('Enumeration')
                   for parent in parent_class)
    return parent_class['@id'] == schema_org_id('Enumeration')
def make_entity(thing, names):
    """Build the {'name', 'name_hash'} record for a schema object.

    The name is derived from thing['@id'] and its hash is looked up in names,
    so a name missing from names raises KeyError.
    """
    # Derive the name once instead of once per dict entry.
    name = object_name_from_id(thing['@id'])
    return {
        "name": name,
        "name_hash": names[name],
    }
def make_entity_from_name(name, names):
    """Build the {'name', 'name_hash'} record for an already-known name.

    Raises KeyError if name has no entry in names.
    """
    entity = dict(name=name)
    entity["name_hash"] = names[name]
    return entity
def find_enum_options(obj_id, schema, names):
    """Return entity records for every graph object whose '@type' is obj_id.

    Each object in schema['@graph'] whose '@type' compares equal to obj_id is
    converted with make_entity; the result is an empty list when none match.
    """
    return [
        make_entity(obj, names) for obj in schema['@graph']
        if obj['@type'] == obj_id
    ]
def get_root_type(the_class, schema):
    """Walk up the hierarchy and return the base type the_class descends from.

    the_class is a reference carrying an '@id'. The result is the first of:
    the reference itself when it cannot be resolved in the schema; the Thing
    or URL object; an object whose '@type' contains the DataType ID; an
    object without a parent; the Thing object for multiple inheritance; or
    the class itself when its parent is Enumeration.
    """
    class_obj = get_schema_obj(the_class['@id'], schema)
    if class_obj is None:
        # Not resolvable to a single graph object; hand the reference back.
        return the_class

    class_id = class_obj['@id']
    # Thing is a base type. URLs are also considered a base type because a
    # struct field is used for them specifically.
    if class_id == schema_org_id('Thing') or class_id == schema_org_id('URL'):
        return class_obj

    is_data_type = ('@type' in class_obj
                    and schema_org_id('DataType') in class_obj['@type'])
    if is_data_type or 'rdfs:subClassOf' not in class_obj:
        return class_obj

    parent_class = class_obj['rdfs:subClassOf']
    # All classes that use multiple inheritance are treated as Thing.
    if isinstance(parent_class, list):
        return get_schema_obj(schema_org_id('Thing'), schema)

    # Enumeration classes are special: report the specific enum class rather
    # than climbing to Enumeration itself.
    if parent_class['@id'] == schema_org_id('Enumeration'):
        return class_obj

    return get_root_type(parent_class, schema)
def parse_property(prop, schema, names):
    """Parse out details about the property, including what type it can be.

    Returns a dict with 'name', 'name_hash', 'thing_types' and 'enum_types',
    plus a 'has_<kind>' = True entry for each primitive kind (text, date,
    time, boolean, number, date_time, url) found among the property's
    rangeIncludes types. A property with no rangeIncludes entry gets only the
    four base keys.
    """
    prop_name = object_name_from_id(prop['@id'])
    parsed_prop = {
        'name': prop_name,
        'name_hash': names[prop_name],
        'thing_types': [],
        'enum_types': [],
    }

    range_key = schema_org_id('rangeIncludes')
    if range_key not in prop:
        return parsed_prop

    # rangeIncludes is either a single type reference or a list of them.
    range_includes = prop[range_key]
    if not isinstance(range_includes, list):
        range_includes = [range_includes]

    # Root type ID -> flag set on the parsed property.
    flag_for_root = {
        schema_org_id('Text'): 'has_text',
        schema_org_id('Date'): 'has_date',
        schema_org_id('Time'): 'has_time',
        schema_org_id('Boolean'): 'has_boolean',
        schema_org_id('Number'): 'has_number',
        schema_org_id('DateTime'): 'has_date_time',
        schema_org_id('URL'): 'has_url',
    }
    thing_id = schema_org_id('Thing')

    for possible_type in range_includes:
        root_type = get_root_type(possible_type, schema)
        root_id = root_type['@id']
        if root_id == thing_id:
            # NOTE(review): this branch had no body in the reviewed text;
            # recording the ID of the possible type is a reconstruction --
            # confirm against the templates that consume 'thing_types'.
            parsed_prop['thing_types'].append(possible_type['@id'])
        elif root_id in flag_for_root:
            parsed_prop[flag_for_root[root_id]] = True
        elif is_enum_type(root_type):
            # NOTE(review): reconstructed like the Thing branch above --
            # confirm against the templates that consume 'enum_types'.
            parsed_prop['enum_types'].append(possible_type['@id'])
    return parsed_prop
def merge_with_schema(schema, overrides, thing):
    """Merge one override object into schema['@graph'], in place.

    Every existing graph entry whose '@id' equals thing['@id'] is replaced by
    thing. When no entry matches, thing is added to the graph.

    overrides is not used here; it is kept so existing callers keep working.
    """
    indices = [
        i for i, x in enumerate(schema['@graph']) if x['@id'] == thing['@id']
    ]
    for index in indices:
        schema['@graph'][index] = thing
    if not indices:
        # NOTE(review): this branch had no body in the reviewed text;
        # appending the new object is the reconstructed behaviour -- confirm.
        schema['@graph'].append(thing)
def lookup_parents(thing, schema, lookup_table):
    """Recursively looks up all the parents of thing in the schema.

    Returns the set of parent names for thing and stores it in lookup_table
    under thing's object name. Results already in lookup_table are reused, so
    each object is resolved at most once. Parents that cannot be resolved to
    a single schema object are ignored.
    """
    obj_name = object_name_from_id(thing['@id'])
    if obj_name in lookup_table:
        return lookup_table[obj_name]

    # Register the entry before recursing, so that reaching this object again
    # (shared ancestors, or a cycle) hits the early return above.
    lookup_table[obj_name] = set()

    if 'rdfs:subClassOf' in thing:
        parent_classes = thing['rdfs:subClassOf']
        if not isinstance(parent_classes, list):
            parent_classes = [parent_classes]
        for parent in parent_classes:
            parent_thing = get_schema_obj(parent['@id'], schema)
            if parent_thing is None:
                continue
            # NOTE(review): the reviewed text computed the flattened parent
            # list but the statement storing it was missing; merging it into
            # this object's set is a reconstruction -- confirm.
            lookup_table[obj_name].update(
                lookup_parents(parent_thing, schema, lookup_table))

    # NOTE(review): reconstructed. Parents are only ever collected from the
    # sets returned by recursive calls, so each set has to contain the
    # object's own name for ancestors to appear at all -- confirm that
    # consumers expect an entity to be listed among its own parents.
    lookup_table[obj_name].add(obj_name)
    return lookup_table[obj_name]
def get_template_vars_from_file(schema_file_path, overrides_file_path,
                                name_file_path):
    """Load the schema, names and optional overrides, then build template vars.

    Args:
        schema_file_path: path of the JSON schema file.
        overrides_file_path: path of a JSON overrides file, or a falsy value
            to skip overrides. Every object in its '@graph' is merged into
            the schema before the template variables are computed.
        name_file_path: path of the JSON file of names.

    Returns:
        The result of get_template_vars for the merged schema and the names.
    """
    with open(schema_file_path) as schema_file:
        schema = json.load(schema_file)

    with open(name_file_path) as names_file:
        names = json.load(names_file)

    if overrides_file_path:
        with open(overrides_file_path) as overrides_file:
            overrides = json.load(overrides_file)
        for thing in overrides['@graph']:
            merge_with_schema(schema, overrides, thing)

    return get_template_vars(schema, names)
def get_template_vars(schema, names):
"""Read the needed template variables from the schema file.

Walks schema['@graph']: 'rdfs:Class' objects are added to 'entities' and have
their parents looked up; 'rdf:Property' objects are parsed with
parse_property. Before returning, 'entities' is sorted by 'name_hash' and
'properties' by 'name'.

NOTE(review): several lines of this function are absent from the text under
review (unclosed literals and dangling call arguments are marked below), so
the exact records appended to 'enums' and 'entity_parent_lookup' cannot be
determined from here.
"""
# NOTE(review): the closing '}' of this literal is not present.
template_vars = {
'entities': [],
'properties': [],
'enums': [],
'entity_parent_lookup': []
# Filled by lookup_parents: object name -> collection of parent names.
entity_parent_lookup = {}
for thing in schema['@graph']:
if thing['@type'] == 'rdfs:Class':
template_vars['entities'].append(make_entity(thing, names))
lookup_parents(thing, schema, entity_parent_lookup)
if is_enum_type(thing):
# NOTE(review): presumably the tail of a record appended to
# template_vars['enums']; the enclosing statement is missing -- confirm.
find_enum_options(thing['@id'], schema, names)
elif thing['@type'] == 'rdf:Property':
# NOTE(review): the extra ')' closes a call whose opening line is missing,
# presumably template_vars['properties'].append( -- confirm.
parse_property(thing, schema, names))
# NOTE(review): dict.iteritems() exists only on Python 2; Python 3 needs
# .items().
for entity, parents in entity_parent_lookup.iteritems():
# NOTE(review): presumably one value of a record appended to
# template_vars['entity_parent_lookup']; the enclosing statement is missing.
[make_entity_from_name(parent, names) for parent in parents]
template_vars['entities'].sort(key=lambda p: p['name_hash'])
template_vars['properties'].sort(key=lambda p: p['name'])
return template_vars
def generate_file(file_name, template_file, template_vars):
"""Generate and write file given a template and variables to render.

Sets template_vars['header_file'] and template_vars['header_guard'] (the
upper-cased header file name plus '_H') before opening file_name for writing.
Note that template_vars is modified in place.

NOTE(review): the argument of os.path.basename( and the body of the 'with'
block are absent from the text under review.
"""
# NOTE(review): call is left open here; its argument line is missing.
template_vars['header_file'] = os.path.basename(
template_vars['header_guard'] = template_vars['header_file'].upper() + '_H'
# NOTE(review): presumably the rendered template is written to f -- the
# statement is missing; confirm.
with open(file_name, 'w') as f:
def main():
# Command-line entry point: parses the arguments, builds the template
# variables from the schema, overrides and name files, then processes each
# path given to --templates.
#
# NOTE(review): the parser.add_argument( openers and flag names for the four
# help strings below are absent from the text under review. The attributes
# read later (args.schema_file, args.overrides_file, args.name_file) indicate
# the destinations of the first three.
parser = argparse.ArgumentParser()
help=' JSON-LD schema file to use for code generation.')
help='JSON-LD schema file with overrides to support changes not in the '
'latest version. Optional.')
help='JSON file of hashed names to speed up lookups.')
help='Output directory in which to place generated code files.')
# One or more template paths.
parser.add_argument('--templates', nargs='+')
args = parser.parse_args()
template_vars = get_template_vars_from_file(
args.schema_file, args.overrides_file, args.name_file)
for template_file in args.templates:
# NOTE(review): the head of this call is missing; presumably generate_file
# is invoked with an output path built from the template's base name with
# '.tmpl' removed -- confirm.
os.path.basename(template_file.replace('.tmpl', ''))),
template_file, template_vars)
if __name__ == '__main__':