diff options
author | Peter Wu <peter@lekensteyn.nl> | 2015-03-25 20:51:01 +0100 |
---|---|---|
committer | Anders Broman <a.broman58@gmail.com> | 2015-03-26 07:14:20 +0000 |
commit | 13c486c33f9d3a015aaf0e4ee0c99643575cd235 (patch) | |
tree | e642365ba3a64564c35810d70100b70da09e8330 | |
parent | 785fab7f01c98c54295596d4a25a2d2bc7343335 (diff) |
html2text.py: Python 2.5 compatibility, improve Unicode support
Add support for Python 2.5: remove the unicode_literals import (which does
not seem to be necessary), pass break_on_hyphens to TextWrapper only on
Python 2.6 and later (Python 2.5 lacks this option, so its output is less
tidy), and avoid the 'with' keyword.
While at it, fix reading Unicode text from a file in Python 2 (tested
with: echo € | tools/html2text.py) and support reading from stdin by
passing '-' as the filename.
Tested against Python 2.5.6, 2.6.6, 2.7.9, 3.2.6, 3.4.3 with the
commands from the previous html2text.py commit message, and additionally
with the Unicode character € as input (instead of the faq.py output).
Change-Id: I3de3f7a4e7cf7d702463c3a59758803843338a54
Reviewed-on: https://code.wireshark.org/review/7823
Reviewed-by: Peter Wu <peter@lekensteyn.nl>
Tested-by: Peter Wu <peter@lekensteyn.nl>
Petri-Dish: Peter Wu <peter@lekensteyn.nl>
Tested-by: Petri Dish Buildbot <buildbot-no-reply@wireshark.org>
Reviewed-by: Anders Broman <a.broman58@gmail.com>
-rwxr-xr-x | tools/html2text.py | 29 |
1 files changed, 16 insertions, 13 deletions
diff --git a/tools/html2text.py b/tools/html2text.py index 54180fabcf..3f81b191bb 100755 --- a/tools/html2text.py +++ b/tools/html2text.py @@ -20,8 +20,6 @@ # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. -from __future__ import unicode_literals - __author__ = "Peter Wu <peter@lekensteyn.nl>" __copyright__ = "Copyright 2015, Peter Wu" __license__ = "GPL (v2 or later)" @@ -68,8 +66,14 @@ class TextHTMLParser(HTMLParser): if self.list_item_prefix: initial_indent += self.list_item_prefix indent += ' ' - wrapper = TextWrapper(width=66, break_on_hyphens=False, - initial_indent=initial_indent, subsequent_indent=indent) + kwargs = { + 'width': 66, + 'initial_indent': initial_indent, + 'subsequent_indent': indent + } + if sys.version_info[0:2] >= (2, 6): + kwargs['break_on_hyphens'] = False + wrapper = TextWrapper(**kwargs) return '\n'.join(wrapper.wrap(text)) def _commit_block(self, newline='\n\n'): @@ -148,22 +152,21 @@ class TextHTMLParser(HTMLParser): def main(): htmlparser = TextHTMLParser() - if len(sys.argv) > 1: - if sys.version_info[0] >= 3: - # Python 3: read file as utf-8 - kwargs = { 'encoding': 'utf-8' } - else: - kwargs = {} - with open(sys.argv[1], **kwargs) as f: - for line in f: - htmlparser.feed(line) + if len(sys.argv) > 1 and sys.argv[1] != '-': + filename = sys.argv[1] + f = open(filename, 'rb') else: + filename = None f = sys.stdin + try: if hasattr(f, 'buffer'): # Access raw (byte) buffer in Python 3 instead of decoded one f = f.buffer # Read stdin as as Unicode string htmlparser.feed(f.read().decode('utf-8')) + finally: + if filename is not None: + f.close() htmlparser.close() if __name__ == '__main__': |