diff options
-rwxr-xr-x | tools/make-manuf.py | 55 |
1 files changed, 49 insertions, 6 deletions
diff --git a/tools/make-manuf.py b/tools/make-manuf.py index 53b8aa920b..e36902616a 100755 --- a/tools/make-manuf.py +++ b/tools/make-manuf.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- # # Wireshark - Network traffic analyzer # By Gerald Combs <gerald@wireshark.org> @@ -58,6 +57,49 @@ def open_url(url): return (body, dict(response.info())) +# These are applied after punctuation has been removed. +# More examples at https://en.wikipedia.org/wiki/Incorporation_(business) +general_terms = '|'.join([ + 'a/s', + 'ab', # Also follows "Oy", which is covered below. + 'ag', + 'b ?v', + 'closed joint stock company', + 'co', + 'company', + 'corp', + 'corporation', + 'de c ?v', # Follows "S.A.", which is covered separately below. + 'gmbh', + 'holding', + 'inc', + 'incorporated', + 'jsc', + 'kg', + 'k k', # "K.K." as in "kabushiki kaisha", but not "K+K" as in "K+K Messtechnik". + 'limited', + 'llc', + 'ltd', + 'n ?v', + 'oao', + 'open joint stock company', + 'ooo', + 'oy', + 'oyj', + 'plc', + 'pty', + 'pvt', + 's ?a ?r ?l', + 's ?a', + 's ?p ?a', + 'sp ?k', + 's ?r ?l', + 'systems', + 'the', + 'zao', + 'z ?o ?o' + ]) + def shorten(manuf): '''Convert a long manufacturer name to abbreviated and short names''' # Normalize whitespace. @@ -66,15 +108,16 @@ def shorten(manuf): # Add exactly one space on each end. # XXX This appears to be for the re.sub below. manuf = u' {} '.format(manuf) - # Convert to consistent case - manuf = manuf.title() + # Convert all caps to title case + if manuf.isupper(): + manuf = manuf.title() # Remove any punctuation # XXX Use string.punctuation? Note that it includes '-' and '*'. - manuf = re.sub(u"[',.()]", ' ', manuf) + manuf = re.sub(u"[\"',.()]", ' ', manuf) # & isn't needed when Standalone manuf = manuf.replace(" & ", " ") - # Remove any "the", "inc", "plc" ... - manuf = re.sub('\W(the|incorporated|inc|plc|systems|corporation|corp|s/a|a/s|ab|ag|kg|gmbh|company|co|limited|ltd|holding|spa)(?= )', '', manuf, flags=re.IGNORECASE) + # Remove business types and other general terms ("the", "inc", "plc", etc.) + manuf = re.sub('\W(' + general_terms + ')(?= )', '', manuf, flags=re.IGNORECASE) # Remove all spaces manuf = re.sub('\s+', '', manuf) |