diff options
author | Gerald Combs <gerald@wireshark.org> | 2008-12-17 19:49:18 +0000 |
---|---|---|
committer | Gerald Combs <gerald@wireshark.org> | 2008-12-17 19:49:18 +0000 |
commit | f49377e0e7be6ef7d30ff1dfe685b079ef06249f (patch) | |
tree | f9301bcd79a6cdc4b5d9ff541e3fddfe0187aba2 | |
parent | 79413d1f89648fe292bd0e42391084d7b2f81b6f (diff) |
Update html2text.py to suit our needs. Add spaces in the faq.txt target
to fix a problem with OS X 10.4. Add html2text.py to the end of the
faq.txt target.
svn path=/trunk/; revision=27040
-rw-r--r-- | help/Makefile.am | 10 | ||||
-rwxr-xr-x | tools/html2text.py | 110 |
2 files changed, 89 insertions, 31 deletions
diff --git a/help/Makefile.am b/help/Makefile.am index 7ae2993a3e..3d3ffe0213 100644 --- a/help/Makefile.am +++ b/help/Makefile.am @@ -42,10 +42,12 @@ MAINTAINERCLEANFILES = \ Makefile.in # Try our best to convert the FAQ to text. -# Maybe we should use a script (e.g. http://www.aaronsw.com/2002/html2text/) instead. +# The output of html2text.py isn't as pretty as elinks, links, or lynx. If that ever changes, we +# can use it exclusively. faq.txt: $(srcdir)/faq.py $(srcdir)/faq.py >$@.tmp && \ - (( which elinks > /dev/null && elinks -dump -dump-width 72 -no-numbering -no-references < $@.tmp > $@ )|| \ - ( which links > /dev/null && links -width 72 -html-numbered-links 0 -dump $@.tmp > $@ )|| \ - ( which lynx > /dev/null && lynx -dump -width=72 -nolist -stdin -force-html < $@.tmp > $@ ) ) && \ + (( which elinks > /dev/null && elinks -dump -dump-width 72 -no-numbering -no-references < $@.tmp > $@ ) || \ + ( which links > /dev/null && links -width 72 -html-numbered-links 0 -dump $@.tmp > $@ ) || \ + ( which lynx > /dev/null && lynx -dump -width=72 -nolist -stdin -force-html < $@.tmp > $@ ) || \ + $(srcdir)/../tools/html2text.py --width=72 --no-links $@.tmp > $@ ) && \ rm -f $@.tmp diff --git a/tools/html2text.py b/tools/html2text.py index 169ab0b894..9ae6c66fb6 100755 --- a/tools/html2text.py +++ b/tools/html2text.py @@ -1,34 +1,82 @@ #!/usr/bin/env python """html2text: Turn HTML into equivalent Markdown-structured text.""" -__version__ = "2.35" +__version__ = "2.35-Wireshark" __author__ = "Aaron Swartz (me@aaronsw.com)" __copyright__ = "(C) 2004-2008 Aaron Swartz. GNU GPL 3." __contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes"] +# NOTE: +# This is a modified version of html2text.py from http://www.aaronsw.com/2002/html2text/ +# Changes: +# Options can now be configured from the command line. +# SKIP_LINKS and INPUT_ENCODING options have been added. +# The script now requires Python 2.3 + # TODO: # Support decoded entities with unifiable. 
# Relative URL resolution +# Indent sections and lists similar to elinks/links/lynx if not hasattr(__builtins__, 'True'): True, False = 1, 0 import re, sys, urllib, htmlentitydefs, codecs, StringIO, types import sgmllib sgmllib.charref = re.compile('&#([xX]?[0-9a-fA-F]+)[^0-9a-fA-F]') +from optparse import OptionParser try: from textwrap import wrap except: pass -# Use Unicode characters instead of their ascii psuedo-replacements -UNICODE_SNOB = 0 - -# Put the links after each paragraph instead of at the end. -LINKS_EACH_PARAGRAPH = 0 - -# Wrap long lines at position. 0 for no wrapping. (Requires Python 2.3.) -BODY_WIDTH = 78 - -# Don't show internal links (href="#local-anchor") -- corresponding link targets -# won't be visible in the plain text file anyway. -SKIP_INTERNAL_LINKS = False +oparser = OptionParser() +options = None +args = None + +oparser.add_option( + "--force-unicode", + action="store_true", + dest="UNICODE_SNOB", + default=False, + help="Use Unicode characters instead of their ascii psuedo-replacements. [default: False]", + ) + +oparser.add_option( + "--links-after-paragraphs", + action="store_true", + dest="LINKS_EACH_PARAGRAPH", + default=False, + help="Put the links after each paragraph instead of at the end. [default: False]", + ) + +oparser.add_option( + "--width", + type="int", + dest="BODY_WIDTH", + default=78, + help="Wrap long lines at position. 0 for no wrapping. Requires Python 2.3. [default: 78 characters]", + ) + +oparser.add_option( + "--no-internal-links", + action="store_true", + dest="SKIP_INTERNAL_LINKS", + default=False, + help='''Don't show internal links (href="#local-anchor"). Corresponding link targets won't be visible in the plain text file anyway. [default: False]''', + ) + +oparser.add_option( + "--no-links", + action="store_true", + dest="SKIP_LINKS", + default=False, + help='''Don't show links. 
[default: False]''', + ) + +oparser.add_option( + "--input-encoding", + type="string", + dest="INPUT_ENCODING", + default='utf-8', + help='''Force the encoding of the input file. [default: utf-8]''', + ) ### Entity Nonsense ### @@ -56,18 +104,22 @@ for k in unifiable.keys(): unifiable_n[name2cp(k)] = unifiable[k] def charref(name): + global options + if name[0] in ['x','X']: c = int(name[1:], 16) else: c = int(name) - if not UNICODE_SNOB and c in unifiable_n.keys(): + if not options.UNICODE_SNOB and c in unifiable_n.keys(): return unifiable_n[c] else: return unichr(c) def entityref(c): - if not UNICODE_SNOB and c in unifiable.keys(): + global options + + if not options.UNICODE_SNOB and c in unifiable.keys(): return unifiable[c] else: try: name2cp(c) @@ -103,7 +155,8 @@ def onlywhite(line): def optwrap(text): """Wrap all paragraphs in the provided text.""" - if not BODY_WIDTH: + global options + if not options.BODY_WIDTH: return text assert wrap, "Requires Python 2.3." @@ -112,7 +165,7 @@ def optwrap(text): for para in text.split("\n"): if len(para) > 0: if para[0] is not ' ' and para[0] is not '-' and para[0] is not '*': - for line in wrap(para, BODY_WIDTH): + for line in wrap(para, options.BODY_WIDTH): result += line + "\n" result += "\n" newlines = 2 @@ -156,7 +209,7 @@ class _html2text(sgmllib.SGMLParser): self.abbr_title = None # current abbreviation definition self.abbr_data = None # last inner HTML (for abbr being defined) self.abbr_list = {} # stack of abbreviations to write later - + def outtextf(self, s): self.outtext += s @@ -204,6 +257,7 @@ class _html2text(sgmllib.SGMLParser): if match: return i def handle_tag(self, tag, attrs, start): + global options attrs = fixattrs(attrs) if hn(tag): @@ -258,7 +312,7 @@ class _html2text(sgmllib.SGMLParser): attrsD = {} for (x, y) in attrs: attrsD[x] = y attrs = attrsD - if attrs.has_key('href') and not (SKIP_INTERNAL_LINKS and attrs['href'].startswith('#')): + if attrs.has_key('href') and not (options.SKIP_LINKS or 
(options.SKIP_INTERNAL_LINKS and attrs['href'].startswith('#'))): self.astack.append(attrs) self.o("[") else: @@ -381,7 +435,7 @@ class _html2text(sgmllib.SGMLParser): if not self.lastWasNL: self.out(' ') self.space = 0 - if self.a and ((self.p_p == 2 and LINKS_EACH_PARAGRAPH) or force == "end"): + if self.a and ((self.p_p == 2 and options.LINKS_EACH_PARAGRAPH) or force == "end"): if force == "end": self.out("\n") newa = [] @@ -415,6 +469,10 @@ class _html2text(sgmllib.SGMLParser): def wrapwrite(text): sys.stdout.write(text.encode('utf8')) def html2text_file(html, out=wrapwrite): + global options, args, oparser + if options is None or args is None: + (options, args) = oparser.parse_args(None, None) + h = _html2text(out) h.feed(html) h.feed("") @@ -424,8 +482,9 @@ def html2text(html): return optwrap(html2text_file(html, None)) if __name__ == "__main__": - if sys.argv[1:]: - arg = sys.argv[1] + (options, args) = oparser.parse_args() + if len(args) > 0: + arg = args[0] if arg.startswith('http://'): j = urllib.urlopen(arg) try: @@ -438,11 +497,8 @@ if __name__ == "__main__": data = text.decode(encoding) else: - encoding = 'utf8' - if len(sys.argv) > 2: - encoding = sys.argv[2] - data = open(arg, 'r').read().decode(encoding) + data = open(arg, 'r').read().decode(options.INPUT_ENCODING) else: - data = sys.stdin.read().decode('utf8') + data = sys.stdin.read().decode(options.INPUT_ENCODING) wrapwrite(html2text(data)) |