blob: b53e6abe3fe4dde9f38f9c7197c1678718fb8bd8 [file] [log] [blame]
# Copyright 2020 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Generates C++ code representing structured data objects from schema.org
This script generates C++ objects based on a JSON+LD schema file. Blink uses the
generated code to scrape schema.org data from web pages.
"""
import os
import sys
import json
import argparse
_current_dir = os.path.dirname(os.path.realpath(__file__))
# jinja2 is in chromium's third_party directory
# Insert at front to override system libraries, and after path[0] == script dir
sys.path.insert(
1, os.path.join(_current_dir, *([os.pardir] * 2 + ['third_party'])))
import jinja2
from jinja2 import Environment, PackageLoader, select_autoescape
env = Environment(loader=PackageLoader('generate_schema_org_code', ''))
env.trim_blocks = True
env.lstrip_blocks = True
SCHEMA_ORG_PREFIX = 'http://schema.org/'
def schema_org_id(object_name):
return SCHEMA_ORG_PREFIX + object_name
def object_name_from_id(the_id):
"""Get the object name from a schema.org ID."""
return the_id[len(SCHEMA_ORG_PREFIX):]
def get_schema_obj(obj_id, schema):
"""Search the schema graph for an object with the given ID."""
matches = [obj for obj in schema['@graph'] if obj['@id'] == obj_id]
return matches[0] if len(matches) == 1 else None
def is_enum_type(class_obj):
if 'rdfs:subClassOf' in class_obj:
parent_class = class_obj['rdfs:subClassOf']
if isinstance(parent_class, list):
return any(parent['@id'] == schema_org_id('Enumeration')
for parent in parent_class)
return parent_class['@id'] == schema_org_id('Enumeration')
def make_entity(thing, names):
return {
"name": object_name_from_id(thing['@id']),
"name_hash": names[object_name_from_id(thing['@id'])]
}
def make_entity_from_name(name, names):
return {"name": name, "name_hash": names[name]}
def find_enum_options(obj_id, schema, names):
return [
make_entity(obj, names) for obj in schema['@graph']
if obj['@type'] == obj_id
]
def get_root_type(the_class, schema):
"""Get the base type the class is descended from."""
class_obj = get_schema_obj(the_class['@id'], schema)
if class_obj is None:
return the_class
if class_obj['@id'] == schema_org_id('Thing'):
return class_obj
# Consider URLs to be a base type as we will use have a struct field for
# them specifically.
if class_obj['@id'] == schema_org_id('URL'):
return class_obj
if ('@type' in class_obj
and schema_org_id('DataType') in class_obj['@type']):
return class_obj
if 'rdfs:subClassOf' in class_obj:
parent_class = class_obj['rdfs:subClassOf']
# All classes that use multiple inheritance are Thing type.
if isinstance(parent_class, list):
return get_schema_obj(schema_org_id('Thing'), schema)
# Enumeration classes are treated specially. Return the specific type
# of enum this class is.
if parent_class['@id'] == schema_org_id('Enumeration'):
return class_obj
return get_root_type(parent_class, schema)
return class_obj
def parse_property(prop, schema, names):
"""Parse out details about the property, including what type it can be."""
parsed_prop = {
'name': object_name_from_id(prop['@id']),
'name_hash': names[object_name_from_id(prop['@id'])],
'thing_types': [],
'enum_types': []
}
if not schema_org_id('rangeIncludes') in prop:
return parsed_prop
rangeIncludes = prop[schema_org_id('rangeIncludes')]
if not isinstance(rangeIncludes, list):
rangeIncludes = [rangeIncludes]
for possible_type in rangeIncludes:
root_type = get_root_type(possible_type, schema)
if root_type['@id'] == schema_org_id('Thing'):
parsed_prop['thing_types'].append(possible_type['@id'])
elif root_type['@id'] == schema_org_id('Text'):
parsed_prop['has_text'] = True
elif root_type['@id'] == schema_org_id('Date'):
parsed_prop['has_date'] = True
elif root_type['@id'] == schema_org_id('Time'):
parsed_prop['has_time'] = True
elif root_type['@id'] == schema_org_id('Boolean'):
parsed_prop['has_boolean'] = True
elif root_type['@id'] == schema_org_id('Number'):
parsed_prop['has_number'] = True
elif root_type['@id'] == schema_org_id('DateTime'):
parsed_prop['has_date_time'] = True
elif root_type['@id'] == schema_org_id('URL'):
parsed_prop['has_url'] = True
elif is_enum_type(root_type):
parsed_prop['enum_types'].append(possible_type['@id'])
return parsed_prop
def merge_with_schema(schema, overrides, thing):
indices = [
i for i, x in enumerate(schema['@graph']) if x['@id'] == thing['@id']
]
for index in indices:
schema['@graph'][index] = thing
if not indices:
schema['@graph'].append(thing)
def lookup_parents(thing, schema, lookup_table):
"""Recursively looks up all the parents of thing in the schema.
Returns the parents and populates them in lookup_table. The parents list may
contain duplicates if thing has multiple inheritance trees.
"""
obj_name = object_name_from_id(thing['@id'])
if obj_name in lookup_table:
return lookup_table[obj_name]
lookup_table[obj_name] = set()
if 'rdfs:subClassOf' in thing:
parent_classes = thing['rdfs:subClassOf']
if not isinstance(parent_classes, list):
parent_classes = [parent_classes]
parent_classes = [
get_schema_obj(parent['@id'], schema) for parent in parent_classes
]
parent_classes = [
parent for parent in parent_classes if parent is not None
]
found_parents = [
lookup_parents(parent_thing, schema, lookup_table)
for parent_thing in parent_classes
]
# flatten the list
found_parents = [item for sublist in found_parents for item in sublist]
lookup_table[obj_name].update(found_parents)
lookup_table[obj_name].add(obj_name)
return lookup_table[obj_name]
def get_template_vars_from_file(schema_file_path, overrides_file_path,
name_file_path):
with open(schema_file_path) as schema_file:
schema = json.loads(schema_file.read())
with open(name_file_path) as names_file:
names = json.loads(names_file.read())
if overrides_file_path:
with open(overrides_file_path) as overrides_file:
overrides = json.loads(overrides_file.read())
for thing in overrides['@graph']:
merge_with_schema(schema, overrides, thing)
return get_template_vars(schema, names)
def get_template_vars(schema, names):
"""Read the needed template variables from the schema file."""
template_vars = {
'entities': [],
'properties': [],
'enums': [],
'entity_parent_lookup': []
}
entity_parent_lookup = {}
for thing in schema['@graph']:
if thing['@type'] == 'rdfs:Class':
template_vars['entities'].append(make_entity(thing, names))
lookup_parents(thing, schema, entity_parent_lookup)
if is_enum_type(thing):
template_vars['enums'].append({
'name':
object_name_from_id(thing['@id']),
'id':
thing['@id'],
'id_hash':
names[thing['@id']],
'options':
find_enum_options(thing['@id'], schema, names)
})
elif thing['@type'] == 'rdf:Property':
template_vars['properties'].append(
parse_property(thing, schema, names))
for entity, parents in entity_parent_lookup.iteritems():
template_vars['entity_parent_lookup'].append({
'name':
entity,
'name_hash':
names[entity],
'parents':
[make_entity_from_name(parent, names) for parent in parents]
})
template_vars['entities'].sort(key=lambda p: p['name_hash'])
template_vars['properties'].sort(key=lambda p: p['name'])
return template_vars
def generate_file(file_name, template_file, template_vars):
"""Generate and write file given a template and variables to render."""
template_vars['header_file'] = os.path.basename(
template_file[:template_file.index('.')])
template_vars['header_guard'] = template_vars['header_file'].upper() + '_H'
with open(file_name, 'w') as f:
f.write(env.get_template(template_file).render(template_vars))
def main():
parser = argparse.ArgumentParser()
parser.add_argument(
'--schema-file',
help='Schema.org JSON-LD schema file to use for code generation.')
parser.add_argument(
'--overrides-file',
help='JSON-LD schema file with overrides to support changes not in the '
'latest schema.org version. Optional.')
parser.add_argument(
'--name-file',
help='JSON file of hashed schema.org names to speed up lookups.')
parser.add_argument(
'--output-dir',
help='Output directory in which to place generated code files.')
parser.add_argument('--templates', nargs='+')
args = parser.parse_args()
template_vars = get_template_vars_from_file(
args.schema_file, args.overrides_file, args.name_file)
for template_file in args.templates:
generate_file(
os.path.join(args.output_dir,
os.path.basename(template_file.replace('.tmpl', ''))),
template_file, template_vars)
if __name__ == '__main__':
sys.exit(main())