#!/usr/bin/env python3 # # Generate Kate syntax file for CMake # # SPDX-FileCopyrightText: 2017-2024 Alex Turbov # # To install prerequisites: # # $ pip install --user click jinja2 lxml pyyaml # # To use: # # $ ./generate-cmake-syntax.py cmake.yaml > ../syntax/cmake.xml # from __future__ import annotations import functools import re from dataclasses import dataclass, field import click import jinja2 import yaml import sys from lxml import etree _TEMPLATED_NAME = re.compile(r'(?:<[^>]+>)') _PROPERTY_KEYS = [ 'global-properties' , 'directory-properties' , 'target-properties' , 'source-properties' , 'test-properties' , 'cache-properties' , 'install-properties' ] _KW_RE_LIST = ['kw', 're'] _VAR_KIND_LIST = ['variables', 'deprecated-or-internal-variables', 'environment-variables'] _CONTROL_FLOW_LIST = { 'break' , 'continue' , 'elseif' , 'else' , 'endforeach' , 'endif' , 'endwhile' , 'foreach' , 'if' , 'return' , 'while' } _VAR_REF_ENTITY = '&var_ref_re;' _HEURISTICS = [ ( {'MAX(_(COUNT|MAJOR|MINOR|PATCH|TWEAK))?', 'MIN(_(COUNT|MAJOR|MINOR|PATCH|TWEAK))?'} , 'M(AX|IN)(_(COUNT|MAJOR|MINOR|PATCH|TWEAK))?' ) , ({'OUTPUTS', 'OUTPUT_(HEADER|SOURCE)'}, 'OUTPUT(S|_(HEADER|SOURCE))') , ({'PREFIX', 'SUFFIX'}, '(PRE|SUF)FIX') , ({'CPPCHECK', 'CPPLINT'}, 'CPP(CHECK|LINT)') , ({'DEPENDS', 'PREDEPENDS'}, '(PRE)?DEPENDS') , ({'ICON', 'ICONURL'}, 'ICON(URL)?') , ( { '&var%ref%re;(_INIT)?' , 'DEBUG(_INIT)?' , 'MINSIZEREL(_INIT)?' , 'RELEASE(_INIT)?' , 'RELWITHDEBINFO(_INIT)?' } , '(DEBUG|MINSIZEREL|REL(EASE|WITHDEBINFO)|&var%ref%re;)(_INIT)?' ) , ({'RELEASE', 'RELWITHDEBINFO'}, 'REL(EASE|WITHDEBINFO)') , ({'POST', 'POSTUN', 'PRE', 'PREUN'}, 'P(RE|OST)(UN)?') , ({'AUTOPROV', 'AUTOREQ', 'AUTOREQPROV'}, 'AUTO(PROV|REQ(PROV)?)') , ({'DEFINITIONS', 'OPTIONS'}, '(DEFINI|OP)TIONS') , ({'LIB_NAMES', 'LIBRARY'}, 'LIB(_NAMES|RARY)') , ({'EXTENSIONS', 'EXTRA_FLAGS'}, 'EXT(ENSIONS|RA_FLAGS)') , ({'DISABLED', 'DISPLAY_NAME'}, 'DIS(ABLED|PLAY_NAME)') , ({'LIBRARIES', 'LINK_LIBRARIES', 'STATIC_LINK_LIBRARIES'}, '((STATIC_)?LINK_)?LIBRARIES') , ({'INCLUDE_DIRS', 'LIBRARY_DIRS'}, '(INCLUDE|LIBRARY)_DIRS') , ({'BINARY_DIR', 'SOURCE_DIR'}, '(BINARY|SOURCE)_DIR') , ({'CFLAGS(_OTHER)?', 'LDFLAGS(_OTHER)?'}, '(C|LD)FLAGS(_OTHER)?') , ({'INCLUDE_DIRECTORIES', 'LIBRARIES'}, '(INCLUDE_DIRECTO|LIBRA)RIES') , ({'POSTFLIGHT_&var%ref%re;_SCRIPT', 'PREFLIGHT_&var%ref%re;_SCRIPT'}, 'P(RE|OST)FLIGHT_&var%ref%re;_SCRIPT') , ({'DIRECTORIES', 'FRAMEWORK_DIRECTORIES'}, '(FRAMEWORK_)?DIRECTORIES') , ({'FILE_FLAG', 'FILE'}, 'FILE(_FLAG)?') , ({'DIR_PERMISSIONS', 'FILE_PERMISSIONS'}, '(DIR|FILE)_PERMISSIONS') , ({'COMPILER_LAUNCHER', 'LINKER_LAUNCHER'}, '(COMPIL|LINK)ER_LAUNCHER') , ({'COMPILER', 'COMPILE_(DEFINI|OP)TIONS'}, 'COMPILE(R|_(DEFINI|OP)TIONS)') , ({'LICENSEURL', 'LICENSE_(EXPRESSION|FILE_NAME)'}, 'LICENSE(URL|_(EXPRESSION|FILE_NAME))') , ({'NO_SONAME', 'SONAME'}, '(NO_)?SONAME') , ({'CODE_SIGN_ON_COPY', 'REMOVE_HEADERS_ON_COPY'}, '(CODE_SIGN|REMOVE_HEADERS)_ON_COPY') , ({'(REFERENCE|REFERENCEPROP_&var%ref%re;_TAG)_&var%ref%re;'}, 'REFERENCE(PROP_&var%ref%re;_TAG)?_&var%ref%re;') , ({'DISABLE_FIND_PACKAGE', 'REQUIRE_FIND_PACKAGE'}, '(DISABLE|REQUIRE)_FIND_PACKAGE') , ( {'GROUP_USING_&var%ref%re;(_SUPPORTED)?', 'LIBRARY_USING_&var%ref%re;(_SUPPORTED)?'} , '(GROUP|LIBRARY)_USING_&var%ref%re;(_SUPPORTED)?' ) , ( { 'EXE_LINKER_FLAGS_&var%ref%re;(_INIT)?' , 'MODULE_LINKER_FLAGS_&var%ref%re;(_INIT)?' , 'SHARED_LINKER_FLAGS_&var%ref%re;(_INIT)?' , 'STATIC_LINKER_FLAGS_&var%ref%re;(_INIT)?' } , '(EXE|MODULE|SHARED|STATIC)_LINKER_FLAGS_&var%ref%re;(_INIT)?' ) , ( { 'ARCHIVE_OUTPUT_DIRECTORY' , 'COMPILE_PDB_OUTPUT_DIRECTORY' , 'LIBRARY_OUTPUT_DIRECTORY' , 'PDB_OUTPUT_DIRECTORY' , 'RUNTIME_OUTPUT_DIRECTORY' } , '(ARCHIVE|(COMPILE_)?PDB|LIBRARY|RUNTIME)_OUTPUT_DIRECTORY' ) , ( { 'ARCHIVE_OUTPUT_(DIRECTORY|NAME)' , 'LIBRARY_OUTPUT_(DIRECTORY|NAME)' , 'RUNTIME_OUTPUT_(DIRECTORY|NAME)' } , '(ARCHIVE|LIBRARY|RUNTIME)_OUTPUT_(DIRECTORY|NAME)' ) , ({'ASM&var_ref_re;', 'ASM&var_ref_re;FLAGS'}, 'ASM&var_ref_re;(FLAGS)?') , ( { 'CMAKE_POLICY_DEFAULT_CMP[0-9]{4}' , 'CMAKE_POLICY_WARNING_CMP[0-9]{4}' } , 'CMAKE_POLICY_(DEFAULT|WARNING)_CMP[0-9]{4}' ) , ({'CMAKE_ARGV[0-9]+', 'CMAKE_MATCH_[0-9]+'}, 'CMAKE_(ARGV|MATCH_)[0-9]+') ] @dataclass class RePartNode: children: dict[str, RePartNode] = field(default_factory=dict, hash=False) is_leaf: bool = False @dataclass class RegexCollection: special_cases: list[str] = field(default_factory=list, hash=False) re_tree: dict[str, RePartNode] = field(default_factory=dict, hash=False) def add_case(self, regex: str) -> RegexCollection: self.special_cases.append(regex) return self def update_tree(self, name_parts: list[str]) -> RegexCollection: safe_var_ref = _VAR_REF_ENTITY.replace('_', '%') functools.reduce( lambda current, part: ( self.re_tree if current is None else current.children ).setdefault(part, RePartNode()) , ( safe_var_ref .join(name_parts) .replace(f'{safe_var_ref}_{safe_var_ref}', safe_var_ref) .split('_') ) , None ).is_leaf = True return self def try_transform_placeholder_string_to_regex(state: RegexCollection, name: str): ''' NOTE Some placeholders are not IDs, but numbers... `CMAKE_MATCH_` 4 example ''' name_parts = _TEMPLATED_NAME.split(name) match name_parts: case ['CMAKE_MATCH_' as head, ''] | ['CMAKE_ARGV' as head, ''] | ['ARGV' as head, '']: return state.add_case(head + '[0-9]+') case ['CMAKE_POLICY_DEFAULT_CMP' as head, ''] | ['CMAKE_POLICY_WARNING_CMP' as head, '']: return state.add_case(head + '[0-9]{4}') case ['', '__TRYRUN_OUTPUT']: return state.add_case(f'{_VAR_REF_ENTITY}__TRYRUN_OUTPUT') case (['ASM', ''] | ['ASM', 'FLAGS']) as asm_env: return state.add_case(f'{asm_env[0]}{_VAR_REF_ENTITY}{asm_env[1]}') return state.update_tree(name_parts) def is_first_subset_of_second(first, second): subset = set(first) fullset = set(second) return subset.issubset(fullset) def try_optimize_known_alt_groups(groups: list[str]) -> list[str]: for case in _HEURISTICS: if is_first_subset_of_second(case[0], groups): groups = sorted([*filter(lambda item: item not in case[0], groups), case[1]]) return groups def try_optimize_trailing_var_ref_regex(groups: list[str]) -> list[str]: tail_var_ref_re = '_' + _VAR_REF_ENTITY.replace('_', '%') candidates = [*filter(lambda s: s.endswith(tail_var_ref_re), groups)] return sorted([ *filter(lambda item: item not in candidates, groups) , f'({"|".join(try_optimize_known_alt_groups([s[:-len(tail_var_ref_re)] for s in candidates]))}){tail_var_ref_re}' ]) if len(candidates) > 1 else groups def build_regex(state: list[str], kv: tuple[str, RePartNode]) -> list[str]: name, value = kv match (value, len(value.children)): case (RePartNode(children={}, is_leaf=True), 0): return [*state, name] case (node, sz) if sz > 0: alt_group = try_optimize_known_alt_groups( try_optimize_trailing_var_ref_regex( functools.reduce(build_regex, node.children.items(), []) ) ) match (len(alt_group), node.is_leaf): case (1, False): return [*state, f'{name}_{alt_group[0]}'] case (1, True): return [*state, f'{name}(_{alt_group[0]})?'] case (sz, False) if sz > 0: return [*state, f'{name}_({"|".join(alt_group)})'] case (sz, True) if sz > 0: return [*state, f'{name}(_({"|".join(alt_group)}))?'] case _: raise AssertionError('Zero children?') case _: raise AssertionError(f'NOT MATCHED: {name=}→{value=}') return state def try_placeholders_to_regex(names): if not names: return None data = functools.reduce( try_transform_placeholder_string_to_regex , names , RegexCollection() ) return ( '\\b(?:' + '|'.join( try_optimize_known_alt_groups( try_optimize_trailing_var_ref_regex( functools.reduce( build_regex , data.re_tree.items() , data.special_cases ) ) ) ).replace('%', '_') + ')\\b' ) def partition_iterable(fn, iterable): true, false = [], [] for i in iterable: (false, true)[int(fn(i))].append(i) return true, false def _transform_command_set(cmd, list_name): args, args_re = partition_iterable(lambda x: _TEMPLATED_NAME.search(x) is None, cmd[list_name]) del cmd[list_name] list_name = list_name.replace('-', '_') cmd[list_name] = {k: sorted(set(v)) for k, v in zip(_KW_RE_LIST, [args, args_re])} cmd[list_name]['re'] = try_placeholders_to_regex(args_re) return cmd def transform_command(cmd): can_be_nulary = True if 'name' not in cmd: raise RuntimeError('Command have no name') if 'named-args' in cmd: new_cmd = _transform_command_set(cmd, 'named-args') assert new_cmd == cmd can_be_nulary = False if 'special-args' in cmd: new_cmd = _transform_command_set(cmd, 'special-args') assert new_cmd == cmd can_be_nulary = False if 'property-args' in cmd: new_cmd = _transform_command_set(cmd, 'property-args') assert new_cmd == cmd can_be_nulary = False cmd['nested_parentheses'] = cmd.get('nested-parentheses?', False) if 'first-arg-is-target?' in cmd: cmd['first_arg_is_target'] = cmd['first-arg-is-target?'] can_be_nulary = False if 'first-args-are-targets?' in cmd: cmd['first_args_are_targets'] = cmd['first-args-are-targets?'] can_be_nulary = False if 'has-target-name-after-kw' in cmd: cmd['has_target_name_after_kw'] = cmd['has-target-name-after-kw'] can_be_nulary = False if 'has-target-names-after-kw' in cmd: match cmd['has-target-names-after-kw']: case str(): cmd['has_target_names_after_kw'] = [cmd['has-target-names-after-kw']] case list(): cmd['has_target_names_after_kw'] = cmd['has-target-names-after-kw'] case _: raise TypeError('Unexpected type for `has-target-names-after-kw`') can_be_nulary = False if 'second-arg-is-target?' in cmd: cmd['second_arg_is_target'] = cmd['second-arg-is-target?'] can_be_nulary = False if 'nulary?' in cmd and cmd['nulary?'] and not can_be_nulary: raise RuntimeError('Command `{}` w/ args declared nulary!?'.format(cmd['name'])) if 'start-region' in cmd: cmd['start_region'] = cmd['start-region'] if 'end-region' in cmd: cmd['end_region'] = cmd['end-region'] cmd['attribute'] = 'Control Flow' if cmd['name'] in _CONTROL_FLOW_LIST else 'Command' return cmd def remove_duplicate_list_nodes(root): remap = {} items_by_kws = {} # extract duplicate keyword list for items in root.iterfind('highlighting/list'): key = '<'.join(item.text for item in items) name = items.attrib['name'] if rename := items_by_kws.get(key): remap[name] = rename items.getparent().remove(items) else: items_by_kws[key] = name # update keyword list name referenced by each rule for rule in root.iterfind('highlighting/contexts/context/keyword'): name = rule.attrib['String'] rule.attrib['String'] = remap.get(name, name) def remove_duplicate_context_nodes(root): contexts = root[0].find('contexts') # 3 levels: ctx, ctx_op and ctx_op_nested # TODO Refactor it! for _ in range(3): remap = {} duplicated = {} # remove duplicate nodes for context in contexts: name = context.attrib['name'] context.attrib['name'] = 'dummy' ref = duplicated.setdefault(etree.tostring(context), []) if ref: contexts.remove(context) else: context.attrib['name'] = name ref.append(name) remap[name] = ref[0] # update context name referenced by each rule for context in contexts: for rule in context: ref = remap.get(rule.attrib.get('context')) if ref: rule.attrib['context'] = ref def remove_duplicate_nodes(xml_string): parser = etree.XMLParser(resolve_entities=False, collect_ids=False) root = etree.fromstring(xml_string.encode(), parser=parser) remove_duplicate_list_nodes(root) remove_duplicate_context_nodes(root) # reformat comments xml = etree.tostring(root) xml = re.sub(b'(?=[^\n ])(?=[^ \n])', b'-->\n', xml) # extract DOCTYPE removed by etree.fromstring and reformat doctype = xml_string[:xml_string.find(' xml = xml[xml.find(b'