# Authors: John Dennis <jdennis@redhat.com>
#
# Copyright (C) 2007 Red Hat, Inc.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
# Public names of this module: the only ones exported by a star-import.
__all__ = [
'escape_html',
'unescape_html',
'html_to_text',
'html_document',
]
import syslog
import sys
if sys.version_info > (3,):
import html
import html.parser
import html.entities
from io import StringIO
else:
import htmllib
from StringIO import StringIO
import formatter as Formatter
import string
from types import *
#------------------------------------------------------------------------------
class TextWriter(Formatter.DumbWriter):
    '''DumbWriter variant that indents wrapped text to the current margin level.

    The left margin is ``indent_level * indent_width`` spaces wide.  The
    level is changed through new_margin(); flowing text is word-wrapped
    at ``maxcol`` and every output line starts with the margin.
    '''

    def __init__(self, file=None, maxcol=80, indent_width=4):
        '''Create a writer on *file*, wrapping at *maxcol* columns.

        *indent_width* is the number of spaces used per indentation level.
        '''
        Formatter.DumbWriter.__init__(self, file, maxcol)
        self.indent_width = indent_width
        self.indent_level = 0
        self._set_indent()

    def _set_indent(self):
        '''Recompute the margin width and margin string from the level.'''
        self.indent_col = self.indent_level * self.indent_width
        self.indent = ' ' * self.indent_col

    def new_margin(self, margin, level):
        '''Switch to indentation *level*; the *margin* argument is not used.'''
        self.indent_level = level
        self._set_indent()

    def send_label_data(self, data):
        '''Write a label followed by one space, right-aligned in the margin.

        A label wider than the margin is written as is, without padding.
        '''
        label = data + ' '
        padding = self.indent_col - len(label)
        if padding < 0:
            self.send_literal_data(label)
        else:
            self.send_literal_data(' ' * padding + label)

    def send_flowing_data(self, data):
        '''Write *data* word by word, wrapping and indenting as needed.'''
        if not data:
            return

        # A break is pending if the previous chunk ended in whitespace or
        # this chunk starts with whitespace.
        pending_break = self.atbreak or data[0] in string.whitespace
        limit = self.maxcol
        emit = self.file.write

        column = self.col
        if column == 0:
            # At the start of a line: emit the margin first.
            emit(self.indent)
            column = self.indent_col

        for token in data.split():
            width = len(token)
            if pending_break:
                if column + width >= limit:
                    # Word does not fit: start a new, indented line.
                    emit('\n' + self.indent)
                    column = self.indent_col
                else:
                    emit(' ')
                    column += 1
            emit(token)
            column += width
            pending_break = 1

        self.col = column
        self.atbreak = data[-1] in string.whitespace
if sys.version_info > (3,):
    class HTMLParserAnchor(html.parser.HTMLParser):
        '''HTML parser that forwards text to a formatter's writer and
        prints the target of each anchor, in parentheses, after its text.
        '''

        def __init__(self, formatter, strict=False, convert_charrefs=False):
            # strict and convert_charrefs are accepted but not forwarded:
            # the base class is initialised with its own defaults.
            super(HTMLParserAnchor, self).__init__()
            self.anchor_href = None
            self.formatter = formatter

        def handle_starttag(self, tag, attrs):
            '''Remember the href of an opening <a> tag.'''
            if tag != 'a':
                return
            for attr_name, attr_value in attrs:
                if attr_name == 'href':
                    self.anchor_href = attr_value

        def handle_endtag(self, tag):
            '''On </a>, emit the remembered href as "(href)" and forget it.'''
            if tag != 'a':
                return
            target = self.anchor_href
            if target is not None:
                self.formatter.writer.send_flowing_data('(' + target + ')')
            self.anchor_href = None

        def handle_data(self, data):
            '''Pass character data straight to the writer as flowing text.'''
            self.formatter.writer.send_flowing_data(data)
else:
    class HTMLParserAnchor(htmllib.HTMLParser):
        '''htmllib parser that prints the target of each anchor, in
        parentheses, after the anchor text.
        '''

        def __init__(self, formatter, verbose=0):
            htmllib.HTMLParser.__init__(self, formatter, verbose)

        def anchor_bgn(self, href, name, type):
            '''Remember the href of the anchor being opened.'''
            self.anchor = href

        def anchor_end(self):
            '''Emit " (href) " for the anchor being closed, if it had one.'''
            if not self.anchor:
                return
            self.handle_data(' (%s) ' % self.anchor)
            self.anchor = None
#------------------------------------------------------------------------------
def escape_html(s):
    '''Return *s* with the HTML-special characters replaced by entities.

    The characters ``&``, ``<``, ``>``, ``'`` and ``"`` become ``&amp;``,
    ``&lt;``, ``&gt;``, ``&apos;`` and ``&quot;`` respectively.

    None is returned as None.  This is a best-effort helper: a value
    that cannot be escaped as text (for example an integer) is returned
    unchanged instead of raising.
    '''
    if s is None:
        return None
    try:
        # The ampersand must be handled first, otherwise the ampersands
        # introduced by the other replacements would be escaped again.
        s = s.replace("&", "&amp;")
        s = s.replace("<", "&lt;")
        s = s.replace(">", "&gt;")
        s = s.replace("'", "&apos;")
        s = s.replace('"', "&quot;")
    except (AttributeError, TypeError):
        # Not a text string (no usable replace()); hand it back untouched.
        pass
    return s
def unescape_html(s):
    '''Return *s* with the five basic HTML entities turned back into characters.

    Reverses escape_html(): ``&lt;``, ``&gt;``, ``&apos;``, ``&quot;`` and
    ``&amp;`` become ``<``, ``>``, ``'``, ``"`` and ``&``.  None is
    returned as None, and a string without any ``&`` is returned as is.
    '''
    if s is None:
        return None
    if '&' not in s:
        # Nothing that could be an entity; skip the replacements.
        return s
    s = s.replace("&lt;", "<")
    s = s.replace("&gt;", ">")
    s = s.replace("&apos;", "'")
    s = s.replace("&quot;", '"')
    # The ampersand must be handled last, so that an escaped entity such
    # as "&amp;lt;" yields the text "&lt;" and not "<".
    s = s.replace("&amp;", "&")
    return s
def html_to_text(html, maxcol=80):
    '''Convert an HTML string to plain text wrapped at *maxcol* columns.

    The markup is parsed with HTMLParserAnchor and rendered through a
    TextWriter into an in-memory buffer.  Returns the resulting text,
    or None if the conversion raised; in that case the error is
    reported to syslog at LOG_ERR level.
    '''
    try:
        sink = StringIO()
        writer = TextWriter(sink, maxcol)
        parser = HTMLParserAnchor(Formatter.AbstractFormatter(writer))
        parser.feed(html)
        parser.close()
        text = sink.getvalue()
        sink.close()
    except Exception as e:
        syslog.syslog(syslog.LOG_ERR, 'cannot convert html to text: %s' % e)
        return None
    return text
def html_document(*body_components):
    '''Wrap the body components in an HTML document structure with a valid header.

    Accepts a variable number of arguments, each of which can be:

    * a string
    * a sequence of strings (tuple or list)
    * a callable object taking no parameters and returning a string
      or a sequence of strings (tuple or list).

    Returns the complete document as one string.  A component that is
    none of the above raises TypeError when the document is assembled.
    '''
    # Text base type without third-party helpers: basestring exists on
    # Python 2 only, where it covers both str and unicode.
    try:
        string_types = basestring
    except NameError:
        string_types = str

    head = '<html>\n <head>\n <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>\n </head>\n <body>\n'
    tail = '\n </body>\n</html>'

    parts = [head]
    for body_component in body_components:
        if isinstance(body_component, string_types):
            parts.append(body_component)
        elif isinstance(body_component, (tuple, list)):
            parts.extend(body_component)
        elif callable(body_component):
            result = body_component()
            if isinstance(result, (tuple, list)):
                parts.extend(result)
            else:
                parts.append(result)
        else:
            # Unsupported type: kept so that join() below rejects it
            # with a TypeError rather than silently dropping it.
            parts.append(body_component)
    parts.append(tail)
    return ''.join(parts)