scripts/mbedtls_dev/c_parsing_helper.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131

"""Helper functions to parse C code in heavily constrained scenarios.

Currently supported functionality:

* read_function_declarations: read function declarations from a header file.
"""

# Copyright The Mbed TLS Contributors
# SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later

### WARNING: the code in this file has not been extensively reviewed yet.
### We do not think it is harmful, but it may be below our normal standards
### for robustness and maintainability.

import re
from typing import Dict, Iterable, Iterator, List, Optional, Tuple


class ArgumentInfo:
    """Information about an argument to an API function.

    Attributes:
    * decl: the declaration text, stripped of surrounding whitespace.
    * type: the argument type, with whitespace normalized.
    * name: the argument name, or None if the declaration has no name.
    * suffix: the array suffix (for example ``[32]``), or '' if none.
    """
    #pylint: disable=too-few-public-methods

    # Words that may end a type specifier. The last word of a declaration
    # is part of the type, not the argument name, if it is one of these.
    _KEYWORDS = [
        'const', 'register', 'restrict',
        'int', 'long', 'short', 'signed', 'unsigned',
    ]
    # The \b after the keyword alternation is essential: without it, a name
    # that merely starts with a keyword (e.g. "interval", "constant_len")
    # would be rejected as a name and swallowed into the type.
    _DECLARATION_RE = re.compile(
        r'(?P<type>\w[\w\s*]*?)\s*' +
        r'(?!(?:' + r'|'.join(_KEYWORDS) + r')\b)(?P<name>\b\w+\b)?' +
        r'\s*(?P<suffix>\[[^][]*\])?\Z',
        re.A | re.S)

    @classmethod
    def normalize_type(cls, typ: str) -> str:
        """Normalize whitespace in a type.

        Every run of whitespace becomes a single space, and every ``*``
        is preceded by exactly one space.
        """
        typ = re.sub(r'\s+', r' ', typ)
        typ = re.sub(r'\s*\*', r' *', typ)
        return typ

    def __init__(self, decl: str) -> None:
        """Parse the declaration of a single function argument.

        Raise ValueError, with the stripped declaration as its argument,
        if the declaration is not in a supported form.
        """
        self.decl = decl.strip()
        m = self._DECLARATION_RE.match(self.decl)
        if not m:
            raise ValueError(self.decl)
        self.type = self.normalize_type(m.group('type')) #type: str
        self.name = m.group('name') #type: Optional[str]
        # The suffix group is None when there is no array suffix.
        self.suffix = m.group('suffix') or '' #type: str


class FunctionInfo:
    """Description of one API function.

    Records where the function is declared (file name and line number),
    its qualifiers, return type and name, and its parsed arguments.
    """
    #pylint: disable=too-few-public-methods

    # Matches text whose last token is ``void``, optionally surrounded by
    # whitespace. A pointer type such as ``void *`` does not match.
    VOID_RE = re.compile(r'\s*\bvoid\s*\Z', re.A)

    def __init__(self, #pylint: disable=too-many-arguments
                 filename: str,
                 line_number: int,
                 qualifiers: Iterable[str],
                 return_type: str,
                 name: str,
                 arguments: List[str]) -> None:
        """Store the function's properties.

        Each element of `arguments` is the text of one argument
        declaration; it is parsed into an ArgumentInfo object.
        """
        # Where the declaration was found.
        self.filename, self.line_number = filename, line_number
        # Duplicate qualifiers collapse and their order is not kept.
        self.qualifiers = frozenset(qualifiers)
        self.return_type, self.name = return_type, name
        parsed_arguments = [ArgumentInfo(text) for text in arguments]
        self.arguments = parsed_arguments

    def returns_void(self) -> bool:
        """Tell whether the return type is ``void``."""
        found = self.VOID_RE.search(self.return_type)
        return found is not None


# Match one C comment.
# Note that we match both comment types, so things like // in a /*...*/
# comment are handled correctly.
# In a // comment, a backslash-newline continues the comment onto the next
# physical line. That alternative must be tried before [^\n]: otherwise
# [^\n] consumes the backslash and the comment ends at the newline.
_C_COMMENT_RE = re.compile(r'//(?:\\\n|[^\n])*|/\*.*?\*/', re.S)
_NOT_NEWLINES_RE = re.compile(r'[^\n]+')

def read_logical_lines(filename: str) -> Iterator[Tuple[int, str]]:
    """Read logical lines from a file.

    Logical lines are one or more physical lines, with balanced parentheses.

    Yield pairs (line_number, text), where line_number is the 1-based
    number of the first physical line of the logical line and text is the
    content of the physical lines joined with newlines. Comments are
    removed from the text, but the newlines they contain are kept so
    that line numbers stay accurate.

    Comment markers and parentheses inside string or character literals
    are not treated specially.

    If the parentheses are unbalanced, everything from the offending line
    to the end of the file is yielded as one final logical line.
    """
    with open(filename, encoding='utf-8') as inp:
        content = inp.read()
    # Strip comments, but keep newlines for line numbering
    content = _C_COMMENT_RE.sub(
        lambda m: _NOT_NEWLINES_RE.sub("", m.group(0)),
        content)
    lines = enumerate(content.splitlines(), 1)
    for line_number, line in lines:
        # Read a logical line, containing balanced parentheses.
        # We assume that parentheses are balanced (this should be ok
        # since comments have been stripped), otherwise there will be
        # a gigantic logical line at the end.
        paren_level = line.count('(') - line.count(')')
        while paren_level > 0:
            # Use a default rather than letting StopIteration escape:
            # inside a generator it would turn into RuntimeError (PEP 479).
            following = next(lines, None)
            if following is None:
                break
            _, more = following
            paren_level += more.count('(') - more.count(')')
            line += '\n' + more
        yield line_number, line

# Match a function declaration located at the very start of a logical line:
# optional extern/inline/static qualifiers, a return type, the function
# name, a parenthesized argument list and a semicolon.
_C_FUNCTION_DECLARATION_RE = re.compile(
    r'(?P<qualifiers>(?:(?:extern|inline|static)\b\s*)*)' +
    r'(?P<return_type>\w[\w\s*]*?)\s*' +
    r'\b(?P<name>\w+)' +
    r'\s*\((?P<arguments>.*)\)\s*;',
    re.A | re.S)

def read_function_declarations(functions: Dict[str, FunctionInfo],
                               filename: str) -> None:
    """Collect function declarations from a C header file.

    Each logical line of `filename` that matches a function declaration
    is parsed into a FunctionInfo object and stored in `functions` under
    the function's name. Lines that do not match are skipped.

    An argument list consisting of ``void``, or an empty argument list
    (``f()``), is recorded as having no arguments.

    Raise ValueError (from ArgumentInfo) if an argument declaration is
    not in a supported form.
    """
    for line_number, line in read_logical_lines(filename):
        m = _C_FUNCTION_DECLARATION_RE.match(line)
        if not m:
            continue
        qualifiers = m.group('qualifiers').split()
        return_type = m.group('return_type')
        name = m.group('name')
        argument_text = m.group('arguments')
        # VOID_RE can only match when the whole text is "void" (possibly
        # surrounded by whitespace), so it cannot match a list with commas.
        # An empty list would otherwise produce a single empty argument
        # declaration, which ArgumentInfo rejects.
        if not argument_text.strip() or \
           FunctionInfo.VOID_RE.match(argument_text):
            arguments = [] #type: List[str]
        else:
            arguments = argument_text.split(',')
        # Note: we replace any existing declaration for the same name.
        functions[name] = FunctionInfo(filename, line_number,
                                       qualifiers,
                                       return_type,
                                       name,
                                       arguments)