diff options
author | Gerald Combs <gerald.combs@riverbed.com> | 2018-09-14 17:41:28 +0000 |
---|---|---|
committer | Gerald Combs <gerald@wireshark.org> | 2018-09-14 18:01:49 +0000 |
commit | 56a30766ef40812fe79a0b04461d91a7aa170825 (patch) | |
tree | 55d83265a2f3501d14e56e598d78f86e3a891d28 | |
parent | 17c81011079acd905b082b8fc8caaef0156fad59 (diff) |
make-manuf.py: Improve truncation.
If the PyICU module is available, use it to truncate manufacturer
names by grapheme clusters.
Change-Id: Ib7dcbb126809df496a534f44a47871a1b28dc539
Reviewed-on: https://code.wireshark.org/review/29660
Reviewed-by: Gerald Combs <gerald@wireshark.org>
-rwxr-xr-x | tools/make-manuf.py | 39 |
1 files changed, 31 insertions, 8 deletions
```diff
diff --git a/tools/make-manuf.py b/tools/make-manuf.py
index 693cacd112..a60fa946dd 100755
--- a/tools/make-manuf.py
+++ b/tools/make-manuf.py
@@ -32,6 +32,14 @@ if sys.version_info[0] >= 3:
 else:
     import urllib
 
+have_icu = False
+try:
+    # Use the grapheme or segments module instead?
+    import icu
+    have_icu = True
+except ImportError:
+    pass
+
 def exit_msg(msg=None, status=1):
     if msg is not None:
         sys.stderr.write(msg + '\n\n')
@@ -69,15 +77,30 @@ def shorten(manuf):
     manuf = re.sub('\W(the|incorporated|inc|plc|systems|corporation|corp|s/a|a/s|ab|ag|kg|gmbh|company|co|limited|ltd|holding|spa)(?= )', '', manuf, flags=re.IGNORECASE)
     # Remove all spaces
     manuf = re.sub('\s+', '', manuf)
-    # Truncate all names to a reasonable length, say, 8 characters.
-    # If the string contains UTF-8, this may be substantially more than 8
-    # bytes. It might also be less than 8 visible characters. Python slices
-    # unicode strings by code point, which is better than raw bytes but not
-    # as good as grapheme clusters. https://bugs.python.org/issue30717
+
+    # Truncate names to a reasonable length, say, 8 characters. If
+    # the string contains UTF-8, this may be substantially more than
+    # 8 bytes. It might also be less than 8 visible characters. Plain
+    # Python slices Unicode strings by code point, which is better
+    # than raw bytes but not as good as grapheme clusters. PyICU
+    # supports grapheme clusters. https://bugs.python.org/issue30717
     #
-    # In our case 'Savroni̇k Elektroni̇k' is truncated to 'Savroni̇', which
-    # is 7 visible characters, 8 code points, and 9 bytes.
-    manuf = manuf[:8]
+    # In our case plain Python truncates 'Savroni̇k Elektroni̇k'
+    # to 'Savroni̇', which is 7 visible characters, 8 code points,
+    # and 9 bytes.
+
+    # Truncate by code points
+    trunc_len = 8
+
+    if have_icu:
+        # Truncate by grapheme clusters
+        bi_ci = icu.BreakIterator.createCharacterInstance(icu.Locale('en_US'))
+        bi_ci.setText(manuf)
+        bounds = list(bi_ci)
+        bounds = bounds[0:8]
+        trunc_len = bounds[-1]
+
+    manuf = manuf[:trunc_len]
 
     if manuf.lower() == orig_manuf.lower():
         # Original manufacturer name was short and simple.
```