contrib/struct_endianess.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369

#!/usr/bin/env python3

'''Using mad regexes, automatically make sure that all structs with sub-byte
integers have matching big-endian definitions. The idea is to save a lot of
manual effort, and to automatically verify that there are no errors.
This script most certainly has numerous holes and shortcomings, but actually,
if you hit problems with it, rather adjust your coding style so that this
script can deal with it...'''

import re
import sys
import codecs
import os.path

# Start of a top-level struct definition: 'struct name {' alone on a line,
# beginning in column 0.
re_struct_start = re.compile(r'^struct\s*[a-zA-Z_][a-zA-Z_0-9]*\s*{\s*$')
# End of a top-level struct: '}' in column 0, anything but ';' (for example
# an attribute), then the terminating ';'.
re_struct_end = re.compile(r'^}[^;]*;\s*$')

# Indented 'struct {' opening a nested struct, and the matching indented
# '} [qualifiers] name;' closing it.
re_substruct_start = re.compile(r'^\s+struct\s*{\s*$')
re_substruct_end = re.compile(r'^\s+}\s*([^;]*\s)[a-zA-Z_][a-zA-Z_0-9]*\s*;\s*$')

# One integer member definition up to and including its ';'.
# group(1) is the complete type prefix including leading whitespace, the last
# group is the member list. Both the signed and the unsigned fixed-width
# types accept any width (int8_t, uint16_t, uint32_t, ...).
re_int_def = re.compile(r'(^\s*((const|unsigned|signed|char|int|long|int[0-9]+_t|uint[0-9]+_t)\s+)+\s*)([^;]*;)',
                        re.DOTALL | re.MULTILINE)
# A single member within the member list: either a plain 'name' or a
# bit-field 'name:bits', followed by ',' or ';'.
re_int_members = re.compile(r'([a-zA-Z_][a-zA-Z_0-9]*|[a-zA-Z_][a-zA-Z_0-9]*\s*:\s*[0-9]+)\s*[,;]\s*', re.DOTALL | re.MULTILINE)

# Preprocessor lines that delimit the endian-specific sections.
re_little_endian_ifdef = re.compile(r'#\s*(if|elif)\s+OSMO_IS_LITTLE_ENDIAN\s*(==\s*1\s*|)')
re_big_endian_ifdef = re.compile(r'#\s*(if|elif)\s+OSMO_IS_BIG_ENDIAN\s*')
re_else = re.compile(r'#\s*else\s*')
re_endif = re.compile(r'#\s*endif\s*')

# A C comment: a '/* ... */' block (may span lines and contain '*'), or a
# '//' comment running to the end of its line. The pattern must keep exactly
# one capturing group, because remove_c_comments() relies on re.split()
# returning text and comment alternately.
re_c_comment = re.compile(r'(/\*.*?\*/|//[^\n]*)', re.DOTALL)

def remove_c_comments(code_str):
    '''Return code_str with every match of re_c_comment cut out; all other
    text is kept untouched and in its original order.'''
    return re_c_comment.sub('', code_str)

def section_struct_body(struct_body_lines):
    '''Divide a top-level-struct body into alternating sections.

    Aim: handle each sub-struct on its own, and if there already are ifdefs for
    little and big endian, keep just the little endian bit so that big endian
    can be derived from it.

    struct_body_lines: list of the lines between the struct's opening and
    closing line. The list is not modified.

    Return: a list of lists of lines, alternating between
      - even index: "arbitrary" lines, i.e. sub-struct delimiters such as
        'struct {' and '} sub_name;' (this list may be empty),
      - odd index: "body" lines that define struct members (possibly with
        comments).
    Existing '#if OSMO_IS_LITTLE_ENDIAN' / '#elif OSMO_IS_BIG_ENDIAN' /
    '#else' / '#endif' lines are dropped, as is the content of any big endian
    section.
    '''

    # Work on a private copy: the '#else' handling below rewrites a line in
    # place, and that must not leak into the caller's list.
    lines = list(struct_body_lines)

    struct_body_parts = []
    arbitrary_part = []
    def_part = []

    def end_def():
        '''If there is any member definition recorded, flush out the recorded
        parts (arbitrary_part, then def_part) and start new ones. In short,
        cut a section boundary.'''
        nonlocal arbitrary_part
        nonlocal def_part

        if def_part:
            struct_body_parts.append(arbitrary_part)
            arbitrary_part = []
            struct_body_parts.append(def_part)
            def_part = []

    j = 0
    while j < len(lines):
        line = lines[j]

        if (re_substruct_start.fullmatch(line)
            or re_substruct_end.fullmatch(line)):
            end_def()
            arbitrary_part.append(line)
            j += 1
            continue

        if re_big_endian_ifdef.fullmatch(line):
            end_def()
            # discard big endian section
            j += 1
            while j < len(lines):
                line = lines[j]
                if re_endif.fullmatch(line):
                    end_def()
                    j += 1
                    break
                if re_little_endian_ifdef.fullmatch(line):
                    end_def()
                    # keep that start of little endian section, not j++
                    break
                if re_else.fullmatch(line):
                    # there's an '#else' after big-endian. Shim a
                    # little-endian header in, so that the outer loop picks
                    # the following lines up as the little endian section.
                    lines[j] = '#if OSMO_IS_LITTLE_ENDIAN\n'
                    break
                j += 1
            continue

        if re_little_endian_ifdef.fullmatch(line):
            end_def()
            j += 1
            while j < len(lines):
                line = lines[j]
                if re_endif.fullmatch(line):
                    end_def()
                    j += 1
                    break
                if re_big_endian_ifdef.fullmatch(line):
                    end_def()
                    # keep that start of big endian section, not j++
                    break
                if re_else.fullmatch(line):
                    # there's an '#else' after little-endian. Shim a
                    # big-endian header in, so that the outer loop discards
                    # the following lines as the big endian section.
                    lines[j] = '#if OSMO_IS_BIG_ENDIAN\n'
                    break
                def_part.append(line)
                j += 1

            continue

        def_part.append(line)
        j += 1

    # flush the last section remaining that didn't see an explicit end
    end_def()
    # end_def() only flushes arbitrary_part if there was a def_part, so:
    if arbitrary_part:
        struct_body_parts.append(arbitrary_part)

    return struct_body_parts

def struct_body_to_big_endian(body_str):
    '''Derive the big-endian variant of a little-endian struct body.

    Input: a multi-line string containing the body of a struct, i.e. without
    sub-structs and without #if OSMO_IS_BIG_ENDIAN. like

      '\tconst char *foo;\n\tuint8_t moo:3, goo:2;\n\tuint8_t loo:3;\n\tvoid *baz;\n'

    Return None if there are no sub-byte integers, to indicate that there is
    no little/big endian split required. Otherwise return a multi-line string
    of the big-endian version of this same struct body, where sub-byte ints
    are reversed at byte boundaries, and all others are copied 1:1. Comments
    and empty lines are not carried over into the returned string.

    Raise an Exception if the sub-byte members do not add up to whole bytes:
    when a member crosses a byte boundary, when a non-bit-field definition
    follows an incomplete byte, when the type changes within a byte, or when
    the body ends on an incomplete byte.'''

    # kick comments out of the code analysis. They will end up being stripped
    # from big-endian only.
    body_str = remove_c_comments(body_str)

    # one string per definition, each with its terminating ';' put back
    def_strs = ('%s;' % def_str for def_str in body_str.split(';') if def_str.strip())

    # classify defs as containing sub-byte members or not
    # defs = [ (True, 'uint8_t foo:3, bar:5;', 'uint8_t ', ['foo:3', 'bar:5']),
    #          (False, 'int baz;'),...]
    defs = []
    any_sub_byte_ints = False
    for one_def in def_strs:

        int_def = re_int_def.fullmatch(one_def)
        if not int_def:
            # not even a number, same for big and little endian
            defs.append((False, one_def))
            continue

        int_type = int_def.group(1)
        members_str = int_def.groups()[-1]
        members = [found.group(1) for found in re_int_members.finditer(members_str)]

        if any(':' in member for member in members):
            defs.append((True, one_def, int_type, members))
            any_sub_byte_ints = True
        else:
            defs.append((False, one_def))

    if not any_sub_byte_ints:
        return None

    # now the interesting part, go over the defs, and reverse the sub-byte ints
    # at byte boundaries.

    got_bits = 0
    byte_type = None
    members_within_a_byte = []
    big_endian_defs = []

    for classified_def in defs:
        if not classified_def[0]:
            # a definition without bit-fields must start on a byte boundary
            if got_bits:
                raise Exception('sub-byte members do not add up to clean byte bounds: %r' % members_within_a_byte)

            big_endian_defs.append(classified_def[1])
            continue

        _, one_def, int_type, members = classified_def

        # a byte may be continued across definitions, but only in the same type
        if byte_type and byte_type.strip() != int_type.strip():
            raise Exception('mismatching type continuation after incomplete byte: %r %r to %r'
                            % (byte_type, members_within_a_byte, int_type))
        byte_type = int_type

        for member in members:
            member_name, bits_str = member.split(':')
            member_name = member_name.strip()
            bits = int(bits_str)
            member = '%s:%d' % (member_name, bits)
            members_within_a_byte.append(member)
            got_bits += bits

            if got_bits == 8:
                # a full byte is collected: emit its members in reverse order
                big_endian_defs.append('%s%s;' % (byte_type, ', '.join(reversed(members_within_a_byte))))
                members_within_a_byte = []
                byte_type = None
                got_bits = 0

            elif got_bits > 8:
                raise Exception('sub-byte int breaks clean byte bounds: %s -- %d + %d = %d bits'
                                % (member, got_bits - bits, bits, got_bits))

    # Bit-fields still pending here belong to an incomplete last byte. They
    # must not be dropped silently from the big-endian variant, so treat this
    # exactly like an incomplete byte in the middle of the body.
    if got_bits:
        raise Exception('sub-byte members do not add up to clean byte bounds: %r' % members_within_a_byte)

    # strip empty lines, and clean the whitespace errors at line ends that we
    # might have taken in with the type names
    lines = [l.rstrip(' \t') for l in ''.join(big_endian_defs).split('\n') if l.strip()]
    return '\n'.join(lines)

def handle_struct_body(body_str):
    '''Return body_str wrapped in an OSMO_IS_LITTLE_ENDIAN /
    OSMO_IS_BIG_ENDIAN conditional together with its big-endian counterpart
    from struct_body_to_big_endian(). If that yields nothing (None or an
    empty string), return body_str unchanged.'''

    big_endian_variant = struct_body_to_big_endian(body_str)
    if not big_endian_variant:
        return body_str

    return ''.join((
        '#if OSMO_IS_LITTLE_ENDIAN\n',
        body_str,
        '#elif OSMO_IS_BIG_ENDIAN\n'
        '/* auto-generated from the little endian part above (libosmocore/contrib/struct_endianess.py) */\n',
        big_endian_variant,
        '\n#endif\n',
    ))

def _check_file(f):
    '''Rewrite the packed structs in file f so that every struct body with
    sub-byte integers has a little-endian and a matching big-endian variant.

    f: path of the file. Paths not ending in '.h', '.c' or '.cpp' are ignored.

    The file is read and written as UTF-8, and is only written back if its
    content actually changed. If the result uses OSMO_IS_LITTLE_ENDIAN and
    does not include <osmocom/core/endian.h> yet, that include is added after
    the last suitable '#include' line.

    Raise an Exception if a struct cannot be converted, or if an include is
    required but the file has no '#include' line to put it after.'''
    if not f.endswith(('.h', '.c', '.cpp')):
        return

    # 'with' closes the read handle before the same path is opened for writing
    with codecs.open(f, "r", "utf-8") as fd:
        file_lines = fd.readlines()

    # section the file into
    # [ ["no struct def"], ["struct {...};"], ["no struct def"], ... ]
    sections = []
    in_struct = False
    buf = []
    for line in file_lines:

        if not in_struct and re_struct_start.fullmatch(line):
            # flush whatever might still be in buf from before
            sections.append(buf)
            # start an in_struct section
            buf = [line]
            in_struct = True
        elif in_struct and re_struct_end.fullmatch(line):
            # add this end to the in_struct section and then start a non-struct section
            buf.append(line)
            sections.append(buf)
            in_struct = False
            buf = []
        else:
            buf.append(line)
    # flush any leftovers in buf
    if buf:
        sections.append(buf)

    # examine each struct, i.e. every second item in 'sections'
    for i in range(1, len(sections), 2):
        struct = sections[i]

        # If the struct isn't packed, we need not bother.
        # The practical use of this: in some structs we have booleans in the
        # form of
        #     integer flag:1;
        # and these don't add up to bytes, and cause errors. So let's skip all
        # non-packed structs, then all of those are out of the picture.
        if 'packed' not in struct[-1]:
            continue

        try:

            # assume the 'struct foo {' is on the first line, the closing brace
            # '} __attribute...;' on the last, and the rest are individual
            # definitions split by ';'.
            struct_body_parts = section_struct_body(struct[1:-1])

            # even parts are sub-struct delimiters and are kept as they are,
            # odd parts are member definitions and get the endian treatment
            new_struct_body_parts = []
            for j, part_lines in enumerate(struct_body_parts):
                part = ''.join(part_lines)
                if j & 1:
                    part = handle_struct_body(part)
                new_struct_body_parts.append(part)

            sections[i] = [struct[0], ''.join(new_struct_body_parts), struct[-1]]
        except Exception as e:
            raise Exception('ERROR in struct %r' % struct[0]) from e

    # phew. result.
    result = ''.join(''.join(s) for s in sections)

    # see if osmocom/core/endian.h is needed and included.
    if (not f.endswith('endian.h')
        and 'OSMO_IS_LITTLE_ENDIAN' in result
        and '#include <osmocom/core/endian.h>' not in result):
        # add the include after the last 'osmocom/core' include
        last_include_start = result.rfind('#include <osmocom/core/')
        if last_include_start < 0:
            last_include_start = result.rfind('#include <osmocom/')
        if last_include_start < 0:
            last_include_start = result.rfind('#include')

        if last_include_start < 0:
            raise Exception('do not know where to include osmocom/core/endian.h in %r' % f)

        insert_at = result.find('\n', last_include_start)
        if insert_at < 0:
            # the include is on the very last line, which lacks a newline:
            # append at the end instead of slicing with index -1
            insert_at = len(result)

        result = result[:insert_at] + '\n#include <osmocom/core/endian.h>' + result[insert_at:]

    # leave files alone that need no change, so that their timestamps stay put
    if result == ''.join(file_lines):
        return

    with codecs.open(f, "w", "utf-8") as fd:
        fd.write(result)

def check_file(f):
    '''Run _check_file() on path f. Any failure is re-raised as an Exception
    that names the file, with the original error chained as its cause.'''
    try:
        _check_file(f)
    except Exception as e:
        raise Exception('ERROR IN FILE %r' % f) from e

# The command line lists the files and directories to process; without any
# arguments, process the current working directory.
args = sys.argv[1:] or ['.']

for f in args:
    if not os.path.isdir(f):
        check_file(f)
        continue
    # a directory: visit every file below it, walking bottom-up
    for parent_path, subdirs, files in os.walk(f, topdown=False):
        for ff in files:
            check_file(os.path.join(parent_path, ff))

# vim: tabstop=4 shiftwidth=4 expandtab