aboutsummaryrefslogtreecommitdiffstats
path: root/tools/make-sminmpec.py
blob: c6cfaeca000a4ca7079371892ae984f2555cf291 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
#!/usr/bin/env python3
# create the enterprises.tsv file from
# https://www.iana.org/assignments/enterprise-numbers/enterprise-numbers
# or an offline copy
#
# Copyright 2022 by Moshe Kaplan
# Based on make-sminmpec.pl by Gerald Combs
#
# Wireshark - Network traffic analyzer
# By Gerald Combs <gerald@wireshark.org>
# Copyright 2004 Gerald Combs
#
# SPDX-License-Identifier: GPL-2.0-or-later

import argparse
import re
import urllib.request


ENTERPRISE_NUMBERS_URL = "https://www.iana.org/assignments/enterprise-numbers/enterprise-numbers"

ENTERPRISES_HEADER = """\
#
# generated from https://www.iana.org/assignments/enterprise-numbers/enterprise-numbers
# run "tools/make-sminmpec.py [infile] outfile" to regenerate
#
# The format used here is: <NUMERICAL_ID><SPACE><NAME>
# Where SPACE can be any sequence of spaces and tabs.
#
"""

DECIMAL_PATTERN = r"^(\d+)"
# up to three spaces because of formatting errors in the source
ORGANIZATION_PATTERN = r"^   ?(\S.*)"
FORMERLY_PATTERN = r" \((formerly .*)\)"


def generate_enterprise_files(file_content):
    # We only care about the "Decimal" and "Organization",
    # not the contact or email
    org_lines = []
    last_updated = ""
    end_seen = False
    for line in file_content.splitlines():
        decimal_match = re.match(DECIMAL_PATTERN, line)
        if decimal_match:
            decimal = decimal_match.group(0)
        elif re.match(ORGANIZATION_PATTERN, line):
            organization = line.strip()
            if organization.lower() == "unassigned":
                continue
            organization = re.sub(FORMERLY_PATTERN, r"\t# \1", organization)
            org_lines += [decimal + "\t" + organization]
        elif "last updated" in line.lower():
            last_updated = line
        elif "end of document" in line.lower():
            end_seen = True

    if not end_seen:
        raise Exception('"End of Document" not found. Truncated source file?')

    last_updated_line = "# " + last_updated + "\n\n"
    output = ENTERPRISES_HEADER + last_updated_line + "\n".join(org_lines) + "\n"
    return output


def main():
    parser = argparse.ArgumentParser(description="Create the enterprises.tsv file.")
    parser.add_argument('infile', nargs='?')
    parser.add_argument('outfile', nargs=1)
    parsed_args = parser.parse_args()

    if parsed_args.infile:
        with open(parsed_args.infile, encoding='utf-8') as fh:
            data = fh.read()
    else:
        with urllib.request.urlopen(ENTERPRISE_NUMBERS_URL) as f:
            if f.status != 200:
                raise Exception("request for " + ENTERPRISE_NUMBERS_URL + " failed with result code " + f.status)
            data = f.read().decode('utf-8')

    enterprises_content = generate_enterprise_files(data)
    with open(parsed_args.outfile[0], encoding='utf-8', mode='w') as fh:
        fh.write(enterprises_content)


if __name__ == "__main__":
    main()