diff --git a/Dockerfile b/Dockerfile
index d61009d9..c2280892 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,12 +1,12 @@
-FROM ubuntu:bionic
+FROM ubuntu:noble
 RUN apt-get update \
     && apt-get install -y --no-install-recommends \
-        python-pip python-setuptools python-wheel \
+        python3-pip python3-setuptools python3-wheel \
         locales tzdata \
         ca-certificates \
         strace gdb lsof locate net-tools htop iputils-ping dnsutils \
-        python2.7-dbg python2.7 libpython2.7 python-dbg libpython-dbg \
+        python3-dbg libpython3-dbg \
         curl nano vim tree less telnet patch \
         graphviz sqlite3 \
         dumb-init \
@@ -25,9 +25,9 @@ WORKDIR /planet
 ENTRYPOINT ["dumb-init"]
 RUN echo "#!/bin/bash -eux \n\
-python2.7 code/planet.py config/config.ini \n\
+python3.12 code/planet.py config/config.ini \n\
cd /srv/planetpython.org/ \n\
-python2.7 -mSimpleHTTPServer 8080 \n\
+python3.12 -m http.server 8080 \n\
"> /start.sh
 RUN chmod +x /start.sh
 EXPOSE 8080
diff --git a/code/planet-cache.py b/code/planet-cache.py
index 9334583a..55831f80 100755
--- a/code/planet-cache.py
+++ b/code/planet-cache.py
@@ -1,65 +1,64 @@
-#!/usr/bin/env python
-# -*- coding: UTF-8 -*-
-"""Planet cache tool.
+#!/usr/bin/env python3
+"""Planet cache tool."""
 
-"""
-
-__authors__ = [ "Scott James Remnant ",
-                "Jeff Waugh " ]
+__authors__ = ["Scott James Remnant ", "Jeff Waugh "]
 __license__ = "Python"
 
+import configparser
+import dbm
+import shelve
 import os
 import sys
 import time
-import dbhash
-import ConfigParser
 
 import planet
 
 
 def usage():
-    print "Usage: planet-cache [options] CACHEFILE [ITEMID]..."
-    print
-    print "Examine and modify information in the Planet cache."
-    print
-    print "Channel Commands:"
-    print " -C, --channel     Display known information on the channel"
-    print " -L, --list        List items in the channel"
-    print " -K, --keys        List all keys found in channel items"
-    print
-    print "Item Commands (need ITEMID):"
-    print " -I, --item        Display known information about the item(s)"
-    print " -H, --hide        Mark the item(s) as hidden"
-    print " -U, --unhide      Mark the item(s) as not hidden"
-    print
-    print "Other Options:"
-    print " -h, --help        Display this help message and exit"
+    print("Usage: planet-cache [options] CACHEFILE [ITEMID]...")
+    print()
+    print("Examine and modify information in the Planet cache.")
+    print()
+    print("Channel Commands:")
+    print(" -C, --channel     Display known information on the channel")
+    print(" -L, --list        List items in the channel")
+    print(" -K, --keys        List all keys found in channel items")
+    print()
+    print("Item Commands (need ITEMID):")
+    print(" -I, --item        Display known information about the item(s)")
+    print(" -H, --hide        Mark the item(s) as hidden")
+    print(" -U, --unhide      Mark the item(s) as not hidden")
+    print()
+    print("Other Options:")
+    print(" -h, --help        Display this help message and exit")
     sys.exit(0)
 
+
 def usage_error(msg, *args):
-    print >>sys.stderr, msg, " ".join(args)
-    print >>sys.stderr, "Perhaps you need --help ?"
+ print(msg, " ".join(args), file=sys.stderr) + print("Perhaps you need --help ?", file=sys.stderr) sys.exit(1) + def print_keys(item, title): keys = item.keys() keys.sort() - key_len = max([ len(k) for k in keys ]) + key_len = max([len(k) for k in keys]) - print title + ":" + print(title + ":") for key in keys: if item.key_type(key) == item.DATE: value = time.strftime(planet.TIMEFMT_ISO, item[key]) else: value = str(item[key]) - print " %-*s %s" % (key_len, key, fit_str(value, 74 - key_len)) + print(" %-*s %s" % (key_len, key, fit_str(value, 74 - key_len))) + def fit_str(string, length): if len(string) <= length: return string else: - return string[:length-4] + " ..." + return string[: length - 4] + " ..." if __name__ == "__main__": @@ -101,13 +100,12 @@ def fit_str(string, length): want_ids = 1 elif arg.startswith("-"): usage_error("Unknown option:", arg) + elif cache_file is None: + cache_file = arg + elif want_ids: + ids.append(arg) else: - if cache_file is None: - cache_file = arg - elif want_ids: - ids.append(arg) - else: - usage_error("Unexpected extra argument:", arg) + usage_error("Unexpected extra argument:", arg) if cache_file is None: usage_error("Missing expected cache filename") @@ -116,24 +114,23 @@ def fit_str(string, length): # Open the cache file directly to get the URL it represents try: - db = dbhash.open(cache_file) - url = db["url"] - db.close() - except dbhash.bsddb._db.DBError, e: - print >>sys.stderr, cache_file + ":", e.args[1] + with shelve.open(cache_file, "r") as db: + url = db[b"url"].decode("utf-8") + except shelve.error as e: + print(f"{cache_file}: {e!s}", file=sys.stderr) sys.exit(1) except KeyError: - print >>sys.stderr, cache_file + ": Probably not a cache file" + print(f"{cache_file}: Probably not a cache file", file=sys.stderr) sys.exit(1) # Now do it the right way :-) - my_planet = planet.Planet(ConfigParser.ConfigParser()) + my_planet = planet.Planet(configparser.ConfigParser()) my_planet.cache_directory = os.path.dirname(cache_file) channel = planet.Channel(my_planet, url) for item_id in ids: if not channel.has_item(item_id): - print >>sys.stderr, item_id + ": Not in channel" + print(item_id + ": Not in channel", file=sys.stderr) sys.exit(1) # Do the user's bidding @@ -146,49 +143,48 @@ def fit_str(string, length): print_keys(item, "Item Keys for %s" % item_id) elif command == "list": - print "Items in Channel:" + print("Items in Channel:") for item in channel.items(hidden=1, sorted=1): - print " " + item.id - print " " + time.strftime(planet.TIMEFMT_ISO, item.date) + print(" " + item.id) + print(" " + time.strftime(planet.TIMEFMT_ISO, item.date)) if hasattr(item, "title"): - print " " + fit_str(item.title, 70) + print(" " + fit_str(item.title, 70)) if hasattr(item, "hidden"): - print " (hidden)" + print(" (hidden)") elif command == "keys": keys = {} for item in channel.items(): - for key in item.keys(): + for key in item: keys[key] = 1 - keys = keys.keys() - keys.sort() + keys = sorted(keys.keys()) - print "Keys used in Channel:" + print("Keys used in Channel:") for key in keys: - print " " + key - print + print(" " + key) + print() - print "Use --item to output values of particular items." + print("Use --item to output values of particular items.") elif command == "hide": for item_id in ids: item = channel.get_item(item_id) if hasattr(item, "hidden"): - print item_id + ": Already hidden." + print(item_id + ": Already hidden.") else: item.hidden = "yes" channel.cache_write() - print "Done." 
+ print("Done.") elif command == "unhide": for item_id in ids: item = channel.get_item(item_id) if hasattr(item, "hidden"): - del(item.hidden) + del item.hidden else: - print item_id + ": Not hidden." + print(item_id + ": Not hidden.") channel.cache_write() - print "Done." + print("Done.") diff --git a/code/planet.py b/code/planet.py index 41141b67..c4c0d2eb 100755 --- a/code/planet.py +++ b/code/planet.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 """The Planet aggregator. A flexible and easy-to-use aggregator for generating websites. @@ -9,22 +9,19 @@ Requires Python 2.1, recommends 2.3. """ -__authors__ = [ "Scott James Remnant ", - "Jeff Waugh " ] +__authors__ = ["Scott James Remnant ", "Jeff Waugh "] __license__ = "Python" -import os -import sys -import time +import configparser import locale +import os import socket -import urlparse +import sys +from urllib.parse import urljoin import planet -from ConfigParser import ConfigParser - # Default configuration file path CONFIG_FILE = "config.ini" @@ -32,16 +29,15 @@ PLANET_NAME = "Unconfigured Planet" PLANET_LINK = "Unconfigured Planet" PLANET_FEED = None -OWNER_NAME = "Anonymous Coward" +OWNER_NAME = "Anonymous Coward" OWNER_EMAIL = "" -LOG_LEVEL = "WARNING" -FEED_TIMEOUT = 20 # seconds +LOG_LEVEL = "WARNING" +FEED_TIMEOUT = 20 # seconds # Default template file list TEMPLATE_FILES = "examples/basic/planet.html.tmpl" - def config_get(config, section, option, default=None, raw=0, vars=None): """Get a value from the configuration, with a default.""" if config.has_option(section, option): @@ -49,6 +45,7 @@ def config_get(config, section, option, default=None, raw=0, vars=None): else: return default + def main(): config_file = CONFIG_FILE offline = 0 @@ -56,51 +53,50 @@ def main(): for arg in sys.argv[1:]: if arg == "-h" or arg == "--help": - print "Usage: planet [options] [CONFIGFILE]" - print - print "Options:" - print " -v, --verbose DEBUG level logging during update" - print " -o, --offline Update the Planet from the cache only" - print " -h, --help Display this help message and exit" - print + print("Usage: planet [options] [CONFIGFILE]") + print() + print("Options:") + print(" -v, --verbose DEBUG level logging during update") + print(" -o, --offline Update the Planet from the cache only") + print(" -h, --help Display this help message and exit") + print() sys.exit(0) elif arg == "-v" or arg == "--verbose": verbose = 1 elif arg == "-o" or arg == "--offline": offline = 1 elif arg.startswith("-"): - print >>sys.stderr, "Unknown option:", arg + print("Unknown option:", arg, file=sys.stderr) sys.exit(1) else: config_file = arg # Read the configuration file - config = ConfigParser() + config = configparser.ConfigParser() config.read(config_file) if not config.has_section("Planet"): - print >>sys.stderr, "Configuration missing [Planet] section." 
+ print("Configuration missing [Planet] section.", file=sys.stderr) sys.exit(1) # Read the [Planet] config section - planet_name = config_get(config, "Planet", "name", PLANET_NAME) - planet_link = config_get(config, "Planet", "link", PLANET_LINK) - planet_feed = config_get(config, "Planet", "feed", PLANET_FEED) - owner_name = config_get(config, "Planet", "owner_name", OWNER_NAME) + planet_name = config_get(config, "Planet", "name", PLANET_NAME) + planet_link = config_get(config, "Planet", "link", PLANET_LINK) + planet_feed = config_get(config, "Planet", "feed", PLANET_FEED) + owner_name = config_get(config, "Planet", "owner_name", OWNER_NAME) owner_email = config_get(config, "Planet", "owner_email", OWNER_EMAIL) if verbose: log_level = "DEBUG" else: - log_level = config_get(config, "Planet", "log_level", LOG_LEVEL) - feed_timeout = config_get(config, "Planet", "feed_timeout", FEED_TIMEOUT) - template_files = config_get(config, "Planet", "template_files", - TEMPLATE_FILES).split(" ") + log_level = config_get(config, "Planet", "log_level", LOG_LEVEL) + feed_timeout = config_get(config, "Planet", "feed_timeout", FEED_TIMEOUT) + template_files = config_get(config, "Planet", "template_files", TEMPLATE_FILES).split(" ") # Default feed to the first feed for which there is a template if not planet_feed: for template_file in template_files: name = os.path.splitext(os.path.basename(template_file))[0] - if name.find('atom')>=0 or name.find('rss')>=0: - planet_feed = urlparse.urljoin(planet_link, name) + if name.find("atom") >= 0 or name.find("rss") >= 0: + planet_feed = urljoin(planet_link, name) break # Define locale @@ -108,7 +104,7 @@ def main(): # The user can specify more than one locale (separated by ":") as # fallbacks. locale_ok = False - for user_locale in config.get("Planet", "locale").split(':'): + for user_locale in config.get("Planet", "locale").split(":"): user_locale = user_locale.strip() try: locale.setlocale(locale.LC_ALL, user_locale) @@ -118,7 +114,7 @@ def main(): locale_ok = True break if not locale_ok: - print >>sys.stderr, "Unsupported locale setting." + print("Unsupported locale setting.", file=sys.stderr) sys.exit(1) # Activate logging @@ -145,10 +141,8 @@ def main(): my_planet = planet.Planet(config) my_planet.run(planet_name, planet_link, template_files, offline) - my_planet.generate_all_files(template_files, planet_name, - planet_link, planet_feed, owner_name, owner_email) + my_planet.generate_all_files(template_files, planet_name, planet_link, planet_feed, owner_name, owner_email) if __name__ == "__main__": main() - diff --git a/code/planet/__init__.py b/code/planet/__init__.py index 929920b0..7cb1a1d9 100644 --- a/code/planet/__init__.py +++ b/code/planet/__init__.py @@ -1,5 +1,4 @@ -#!/usr/bin/env python -# -*- coding: UTF-8 -*- +#!/usr/bin/env python3 """Planet aggregator library. 
 This package is a library for developing web sites or software that
@@ -8,41 +7,38 @@
 """
 
 __version__ = "2.0"
-__authors__ = [ "Scott James Remnant ",
-                "Jeff Waugh " ]
+__authors__ = ["Scott James Remnant ", "Jeff Waugh "]
 __license__ = "Python"
 
-
 # Modules available without separate import
-import cache
-import feedparser
-import sanitize
-import htmltmpl
-import sgmllib
+import shelve
+import os
+import re
+import sys
+import time
+from hashlib import md5
+from html.parser import HTMLParser
+
 try:
     import logging
 except:
     import compat_logging as logging
 
-# Limit the effect of "from planet import *"
-__all__ = ("cache", "feedparser", "htmltmpl", "logging",
-           "Planet", "Channel", "NewsItem")
-
-
-import os
-import md5
-import time
-import dbhash
-import re
-
-try:
+try:
     from xml.sax.saxutils import escape
 except:
+
     def escape(data):
-        return data.replace("&","&amp;").replace(">","&gt;").replace("<","&lt;")
+        return data.replace("&", "&amp;").replace(">", "&gt;").replace("<", "&lt;")
+
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+from . import cache, feedparser, htmltmpl, sanitize
+
+# Limit the effect of "from planet import *"
+__all__ = ("cache", "feedparser", "htmltmpl", "logging", "Planet", "Channel", "NewsItem")
 
 # Version information (for generator headers)
-VERSION = ("Planet/%s +http://www.planetplanet.org" % __version__)
+VERSION = "Planet/%s +http://www.planetplanet.org" % __version__
 
 # Default User-Agent header to send when retreiving feeds
 USER_AGENT = VERSION + " " + feedparser.USER_AGENT
 
@@ -57,7 +53,6 @@ def escape(data):
 TIMEFMT_ISO = "%Y-%m-%dT%H:%M:%S+00:00"
 TIMEFMT_822 = "%a, %d %b %Y %H:%M:%S +0000"
 
-
 # Log instance to use here
 log = logging.getLogger("planet")
 try:
@@ -66,23 +61,28 @@ def escape(data):
     log.warning = log.warn
 
 # Defaults for the template file config sections
-ENCODING        = "utf-8"
-ITEMS_PER_PAGE  = 60
-DAYS_PER_PAGE   = 0
-OUTPUT_DIR      = "output"
-DATE_FORMAT     = "%B %d, %Y %I:%M %p"
+ENCODING = "utf-8"
+ITEMS_PER_PAGE = 60
+DAYS_PER_PAGE = 0
+OUTPUT_DIR = "output"
+DATE_FORMAT = "%B %d, %Y %I:%M %p"
 NEW_DATE_FORMAT = "%B %d, %Y"
 ACTIVITY_THRESHOLD = 0
 
-class stripHtml(sgmllib.SGMLParser):
-    "remove all tags from the data"
-    def __init__(self, data):
-        sgmllib.SGMLParser.__init__(self)
-        self.result=''
-        self.feed(data)
-        self.close()
+
+class stripHtml(HTMLParser):
+    """remove all tags from the data"""
+
+    def __init__(self):
+        super().__init__()
+        self.result = []
+
     def handle_data(self, data):
-        if data: self.result+=data
+        self.result.append(data)
+
+    def get_data(self):
+        return "".join(self.result)
+
 
 def template_info(item, date_format):
     """Produce a dictionary of template information."""
@@ -95,8 +95,11 @@ def template_info(item, date_format):
         info[key + "_822"] = time.strftime(TIMEFMT_822, date)
     else:
         info[key] = item[key]
-    if 'title' in item.keys():
-        info['title_plain'] = stripHtml(info['title']).result
+    if "title" in item.keys():
+        html = stripHtml()
+        html.feed(info["title"])
+        html.close()
+        info["title_plain"] = html.get_data()
     return info
 
 
@@ -114,6 +114,7 @@ class Planet:
     filter          A regular expression that articles must match.
     exclude         A regular expression that articles must not match.
""" + def __init__(self, config): self.config = config @@ -135,16 +136,12 @@ def tmpl_config_get(self, template, option, default=None, raw=0, vars=None): return default def gather_channel_info(self, template_file="Planet"): - date_format = self.tmpl_config_get(template_file, - "date_format", DATE_FORMAT, raw=1) + date_format = self.tmpl_config_get(template_file, "date_format", DATE_FORMAT, raw=1) - activity_threshold = int(self.tmpl_config_get(template_file, - "activity_threshold", - ACTIVITY_THRESHOLD)) + activity_threshold = int(self.tmpl_config_get(template_file, "activity_threshold", ACTIVITY_THRESHOLD)) if activity_threshold: - activity_horizon = \ - time.gmtime(time.time()-86400*activity_threshold) + activity_horizon = time.gmtime(time.time() - 86400 * activity_threshold) else: activity_horizon = 0 @@ -157,25 +154,25 @@ def gather_channel_info(self, template_file="Planet"): # identify inactive feeds if activity_horizon: latest = channel.items(sorted=1) - if len(latest)==0 or latest[0].date < activity_horizon: - channels[channel]["message"] = \ - "no activity in %d days" % activity_threshold + if len(latest) == 0 or latest[0].date < activity_horizon: + channels[channel]["message"] = "no activity in %d days" % activity_threshold # report channel level errors - if not channel.url_status: continue + if not channel.url_status: + continue status = int(channel.url_status) if status == 403: - channels[channel]["message"] = "403: forbidden" + channels[channel]["message"] = "403: forbidden" elif status == 404: - channels[channel]["message"] = "404: not found" + channels[channel]["message"] = "404: not found" elif status == 408: - channels[channel]["message"] = "408: request timeout" + channels[channel]["message"] = "408: request timeout" elif status == 410: - channels[channel]["message"] = "410: gone" + channels[channel]["message"] = "410: gone" elif status == 500: - channels[channel]["message"] = "internal server error" + channels[channel]["message"] = "internal server error" elif status >= 400: - channels[channel]["message"] = "http status %s" % status + channels[channel]["message"] = "http status %s" % status return channels, channels_list @@ -184,40 +181,32 @@ def gather_items_info(self, channels, template_file="Planet", channel_list=None) prev_date = [] prev_channel = None - date_format = self.tmpl_config_get(template_file, - "date_format", DATE_FORMAT, raw=1) - items_per_page = int(self.tmpl_config_get(template_file, - "items_per_page", ITEMS_PER_PAGE)) - days_per_page = int(self.tmpl_config_get(template_file, - "days_per_page", DAYS_PER_PAGE)) - new_date_format = self.tmpl_config_get(template_file, - "new_date_format", NEW_DATE_FORMAT, raw=1) - - for newsitem in self.items(max_items=items_per_page, - max_days=days_per_page, - channels=channel_list): + date_format = self.tmpl_config_get(template_file, "date_format", DATE_FORMAT, raw=1) + items_per_page = int(self.tmpl_config_get(template_file, "items_per_page", ITEMS_PER_PAGE)) + days_per_page = int(self.tmpl_config_get(template_file, "days_per_page", DAYS_PER_PAGE)) + new_date_format = self.tmpl_config_get(template_file, "new_date_format", NEW_DATE_FORMAT, raw=1) + + for newsitem in self.items(max_items=items_per_page, max_days=days_per_page, channels=channel_list): item_info = template_info(newsitem, date_format) chan_info = channels[newsitem._channel] for k, v in chan_info.items(): item_info["channel_" + k] = v - + # Check for the start of a new day if prev_date[:3] != newsitem.date[:3]: prev_date = newsitem.date - 
item_info["new_date"] = time.strftime(new_date_format, - newsitem.date) - + item_info["new_date"] = time.strftime(new_date_format, newsitem.date) + # Check for the start of a new channel - if item_info.has_key("new_date") \ - or prev_channel != newsitem._channel: + if "new_date" in item_info or prev_channel != newsitem._channel: prev_channel = newsitem._channel item_info["new_channel"] = newsitem._channel.url - + items_list.append(item_info) return items_list - def run(self, planet_name, planet_link, template_files, offline = False): + def run(self, planet_name, planet_link, template_files, offline=False): log = logging.getLogger("planet.runner") # Create a planet @@ -225,9 +214,8 @@ def run(self, planet_name, planet_link, template_files, offline = False): if self.config.has_option("Planet", "cache_directory"): self.cache_directory = self.config.get("Planet", "cache_directory") if self.config.has_option("Planet", "new_feed_items"): - self.new_feed_items = int(self.config.get("Planet", "new_feed_items")) - self.user_agent = "%s +%s %s" % (planet_name, planet_link, - self.user_agent) + self.new_feed_items = int(self.config.get("Planet", "new_feed_items")) + self.user_agent = f"{planet_name} +{planet_link} {self.user_agent}" if self.config.has_option("Planet", "filter"): self.filter = self.config.get("Planet", "filter") @@ -242,16 +230,14 @@ def run(self, planet_name, planet_link, template_files, offline = False): # Update it try: - if not offline and not channel.url_status == '410': + if not offline and channel.url_status != "410": channel.update() except KeyboardInterrupt: raise except: log.exception("Update of <%s> failed", feed_url) - def generate_all_files(self, template_files, planet_name, - planet_link, planet_feed, owner_name, owner_email): - + def generate_all_files(self, template_files, planet_name, planet_link, planet_feed, owner_name, owner_email): log = logging.getLogger("planet.runner") # Go-go-gadget-template for template_file in template_files: @@ -262,45 +248,43 @@ def generate_all_files(self, template_files, planet_name, except htmltmpl.TemplateError: template = manager.prepare(os.path.basename(template_file)) # Read the configuration - output_dir = self.tmpl_config_get(template_file, - "output_dir", OUTPUT_DIR) - date_format = self.tmpl_config_get(template_file, - "date_format", DATE_FORMAT, raw=1) + output_dir = self.tmpl_config_get(template_file, "output_dir", OUTPUT_DIR) + date_format = self.tmpl_config_get(template_file, "date_format", DATE_FORMAT, raw=1) encoding = self.tmpl_config_get(template_file, "encoding", ENCODING) - + # We treat each template individually base = os.path.splitext(os.path.basename(template_file))[0] url = os.path.join(planet_link, base) output_file = os.path.join(output_dir, base) # Gather information - channels, channels_list = self.gather_channel_info(template_file) - items_list = self.gather_items_info(channels, template_file) + channels, channels_list = self.gather_channel_info(template_file) + items_list = self.gather_items_info(channels, template_file) # Gather item information - + # Process the template tp = htmltmpl.TemplateProcessor(html_escape=0) tp.set("Items", items_list) tp.set("Channels", channels_list) - + # Generic information - tp.set("generator", VERSION) - tp.set("name", planet_name) - tp.set("link", planet_link) - tp.set("owner_name", owner_name) + tp.set("generator", VERSION) + tp.set("name", planet_name) + tp.set("link", planet_link) + tp.set("owner_name", owner_name) tp.set("owner_email", owner_email) - tp.set("url", url) - 
+ tp.set("url", url) + if planet_feed: tp.set("feed", planet_feed) - tp.set("feedtype", planet_feed.find('rss')>=0 and 'rss' or 'atom') - + tp.set("feedtype", planet_feed.find("rss") >= 0 and "rss" or "atom") + # Update time date = time.gmtime() - tp.set("date", time.strftime(date_format, date)) - tp.set("date_iso", time.strftime(TIMEFMT_ISO, date)) - tp.set("date_822", time.strftime(TIMEFMT_822, date)) + tp.set("date", time.strftime(date_format, date)) + tp.set("date_iso", time.strftime(TIMEFMT_ISO, date)) + tp.set("date_822", time.strftime(TIMEFMT_822, date)) try: log.info("Writing %s", output_file) @@ -326,17 +310,18 @@ def channels(self, hidden=0, sorted=1): """Return the list of channels.""" channels = [] for channel in self._channels: - if hidden or not channel.has_key("hidden"): + if hidden or "hidden" not in channel: channels.append((channel.name, channel)) if sorted: channels.sort() - return [ c[-1] for c in channels ] + return [c[-1] for c in channels] def find_by_basename(self, basename): for channel in self._channels: - if basename == channel.cache_basename(): return channel + if basename == channel.cache_basename(): + return channel def subscribe(self, channel): """Subscribe the planet to the channel.""" @@ -372,55 +357,48 @@ def items(self, hidden=0, sorted=1, max_items=0, max_days=0, channels=None): """ planet_filter_re = None if self.filter: - planet_filter_re = re.compile(self.filter, re.I) + planet_filter_re = re.compile(self.filter, re.IGNORECASE) planet_exclude_re = None if self.exclude: - planet_exclude_re = re.compile(self.exclude, re.I) - + planet_exclude_re = re.compile(self.exclude, re.IGNORECASE) + items = [] seen_guids = {} - if not channels: channels=self.channels(hidden=hidden, sorted=0) + if not channels: + channels = self.channels(hidden=hidden, sorted=0) for channel in channels: for item in channel._items.values(): - if hidden or not item.has_key("hidden"): - + if hidden or "hidden" not in item: channel_filter_re = None if channel.filter: - channel_filter_re = re.compile(channel.filter, - re.I) + channel_filter_re = re.compile(channel.filter, re.IGNORECASE) channel_exclude_re = None if channel.exclude: - channel_exclude_re = re.compile(channel.exclude, - re.I) - if (planet_filter_re or planet_exclude_re \ - or channel_filter_re or channel_exclude_re): + channel_exclude_re = re.compile(channel.exclude, re.IGNORECASE) + if planet_filter_re or planet_exclude_re or channel_filter_re or channel_exclude_re: title = "" - if item.has_key("title"): + if "title" in item: title = item.title content = item.get_content("content") if planet_filter_re: - if not (planet_filter_re.search(title) \ - or planet_filter_re.search(content)): + if not (planet_filter_re.search(title) or planet_filter_re.search(content)): continue if planet_exclude_re: - if (planet_exclude_re.search(title) \ - or planet_exclude_re.search(content)): + if planet_exclude_re.search(title) or planet_exclude_re.search(content): continue if channel_filter_re: - if not (channel_filter_re.search(title) \ - or channel_filter_re.search(content)): + if not (channel_filter_re.search(title) or channel_filter_re.search(content)): continue if channel_exclude_re: - if (channel_exclude_re.search(title) \ - or channel_exclude_re.search(content)): + if channel_exclude_re.search(title) or channel_exclude_re.search(content): continue - if not seen_guids.has_key(item.id): - seen_guids[item.id] = 1; + if item.id not in seen_guids: + seen_guids[item.id] = 1 items.append((time.mktime(item.date), item.order, item)) # Sort the 
list @@ -443,7 +421,8 @@ def items(self, hidden=0, sorted=1, max_items=0, max_days=0, channels=None): items = items[:max_count] break - return [ i[-1] for i in items ] + return [i[-1] for i in items] + class Channel(cache.CachedInfo): """A list of news items. @@ -497,14 +476,26 @@ class Channel(cache.CachedInfo): Some feeds may define additional properties to those above. """ - IGNORE_KEYS = ("links", "contributors", "textinput", "cloud", "categories", - "url", "href", "url_etag", "url_modified", "tags", "itunes_explicit") + + IGNORE_KEYS = ( + "links", + "contributors", + "textinput", + "cloud", + "categories", + "url", + "href", + "url_etag", + "url_modified", + "tags", + "itunes_explicit", + ) def __init__(self, planet, url): if not os.path.isdir(planet.cache_directory): os.makedirs(planet.cache_directory) cache_filename = cache.filename(planet.cache_directory, url) - cache_file = dbhash.open(cache_filename, "c", 0666) + cache_file = shelve.open(cache_filename, "c") cache.CachedInfo.__init__(self, cache_file, url, root=1) @@ -533,7 +524,7 @@ def __init__(self, planet, url): def has_item(self, id_): """Check whether the item exists in the channel.""" - return self._items.has_key(id_) + return id_ in self._items def get_item(self, id_): """Return the item from the channel.""" @@ -546,14 +537,14 @@ def items(self, hidden=0, sorted=0): """Return the item list.""" items = [] for item in self._items.values(): - if hidden or not item.has_key("hidden"): + if hidden or "hidden" not in item: items.append((time.mktime(item.date), item.order, item)) if sorted: items.sort() items.reverse() - return [ i[-1] for i in items ] + return [i[-1] for i in items] def __iter__(self): """Iterate the sorted item list.""" @@ -563,14 +554,16 @@ def cache_read_entries(self): """Read entry information from the cache.""" keys = self._cache.keys() for key in keys: - if key.find(" ") != -1: continue - if self.has_key(key): continue + if key.find(" ") != -1: + continue + if key in self: + continue item = NewsItem(self, key) self._items[key] = item def cache_basename(self): - return cache.filename('',self._id) + return cache.filename("", self._id) def cache_write(self, sync=1): """Write channel and item information to the cache.""" @@ -583,8 +576,7 @@ def cache_write(self, sync=1): self._expired = [] def feed_information(self): - """ - Returns a description string for the feed embedded in this channel. + """Returns a description string for the feed embedded in this channel. This will usually simply be the feed url embedded in <>, but in the case where the current self.url has changed from the original @@ -596,7 +588,7 @@ def feed_information(self): if self.url == self.configured_url: return "<%s>" % self.url else: - return "<%s> (formerly <%s>)" % (self.url, self.configured_url) + return f"<{self.url}> (formerly <{self.configured_url}>)" def update(self): """Download the feed to refresh the information. @@ -604,51 +596,52 @@ def update(self): This does the actual work of pulling down the feed and if it changes updates the cached information about the feed and entries within it. 
""" - info = feedparser.parse(self.url, - etag=self.url_etag, modified=self.url_modified, - agent=self._planet.user_agent) - if info.has_key("status"): - self.url_status = str(info.status) - elif info.has_key("entries") and len(info.entries)>0: - self.url_status = str(200) - elif info.bozo and info.bozo_exception.__class__.__name__=='Timeout': - self.url_status = str(408) + info = feedparser.parse(self.url, etag=self.url_etag, modified=self.url_modified, agent=self._planet.user_agent) + + if hasattr(info, "status"): + self.url_status = str(info.status) + elif hasattr(info, "entries") and info.entries: + self.url_status = "200" + elif hasattr(info, "bozo") and info.bozo and hasattr(info, "bozo_exception"): + if info.bozo_exception.__class__.__name__ == "Timeout": + self.url_status = "408" + else: + self.url_status = "500" else: - self.url_status = str(500) + self.url_status = "500" - if self.url_status == '301' and \ - (info.has_key("entries") and len(info.entries)>0): + if self.url_status == "301" and ("entries" in info and len(info.entries) > 0): log.warning("Feed has moved from <%s> to <%s>", self.url, info.url) try: - os.link(cache.filename(self._planet.cache_directory, self.url), - cache.filename(self._planet.cache_directory, info.url)) + os.link( + cache.filename(self._planet.cache_directory, self.url), + cache.filename(self._planet.cache_directory, info.url), + ) except: pass self.url = info.url - elif self.url_status == '304': + elif self.url_status == "304": log.info("Feed %s unchanged", self.feed_information()) return - elif self.url_status == '410': + elif self.url_status == "410": log.info("Feed %s gone", self.feed_information()) self.cache_write() return - elif self.url_status == '408': + elif self.url_status == "408": log.warning("Feed %s timed out", self.feed_information()) return elif int(self.url_status) >= 400: - log.error("Error %s while updating feed %s", - self.url_status, self.feed_information()) + log.error("Error %s while updating feed %s", self.url_status, self.feed_information()) return else: log.info("Updating feed %s", self.feed_information()) - self.url_etag = info.has_key("etag") and info.etag or None - self.url_modified = info.has_key("modified") and info.modified or None + self.url_etag = "etag" in info and info.etag or None + self.url_modified = "modified" in info and info.modified or None if self.url_etag is not None: log.debug("E-Tag: %s", self.url_etag) if self.url_modified is not None: - log.debug("Last Modified: %s", - time.strftime(TIMEFMT_ISO, self.url_modified)) + log.debug("Last Modified: %s", time.strftime(TIMEFMT_ISO, self.url_modified)) self.update_info(info.feed) self.update_entries(info.entries) @@ -665,51 +658,48 @@ def update_info(self, feed): if key in self.IGNORE_KEYS or key + "_parsed" in self.IGNORE_KEYS: # Ignored fields pass - elif feed.has_key(key + "_parsed"): + elif key + "_parsed" in feed: # Ignore unparsed date fields pass elif key.endswith("_detail"): # retain name and email sub-fields - if feed[key].has_key('name') and feed[key].name: - self.set_as_string(key.replace("_detail","_name"), \ - feed[key].name) - if feed[key].has_key('email') and feed[key].email: - self.set_as_string(key.replace("_detail","_email"), \ - feed[key].email) + if "name" in feed[key] and feed[key].name: + self.set_as_string(key.replace("_detail", "_name"), feed[key].name) + if "email" in feed[key] and feed[key].email: + self.set_as_string(key.replace("_detail", "_email"), feed[key].email) elif key == "items": # Ignore items field pass elif 
key.endswith("_parsed"): # Date fields if feed[key] is not None: - self.set_as_date(key[:-len("_parsed")], feed[key]) + self.set_as_date(key[: -len("_parsed")], feed[key]) elif key == "image": # Image field: save all the information - if feed[key].has_key("url"): + if "url" in feed[key]: self.set_as_string(key + "_url", feed[key].url) - if feed[key].has_key("link"): + if "link" in feed[key]: self.set_as_string(key + "_link", feed[key].link) - if feed[key].has_key("title"): + if "title" in feed[key]: self.set_as_string(key + "_title", feed[key].title) - if feed[key].has_key("width"): + if "width" in feed[key]: self.set_as_string(key + "_width", str(feed[key].width)) - if feed[key].has_key("height"): + if "height" in feed[key]: self.set_as_string(key + "_height", str(feed[key].height)) - elif isinstance(feed[key], (str, unicode)): + elif isinstance(feed[key], str): # String fields try: - detail = key + '_detail' - if feed.has_key(detail) and feed[detail].has_key('type'): - if feed[detail].type == 'text/html': + detail = key + "_detail" + if detail in feed and "type" in feed[detail]: + if feed[detail].type == "text/html": feed[key] = sanitize.HTML(feed[key]) - elif feed[detail].type == 'text/plain': + elif feed[detail].type == "text/plain": feed[key] = escape(feed[key]) self.set_as_string(key, feed[key]) except KeyboardInterrupt: raise except: - log.exception("Ignored '%s' of <%s>, unknown format", - key, self.url) + log.exception("Ignored '%s' of <%s>, unknown format", key, self.url) def update_entries(self, entries): """Update entries from the feed. @@ -736,16 +726,14 @@ def update_entries(self, entries): feed_items = [] for entry in entries: # Try really hard to find some kind of unique identifier - if entry.has_key("id"): + if "id" in entry: entry_id = cache.utf8(entry.id) - elif entry.has_key("link"): + elif "link" in entry: entry_id = cache.utf8(entry.link) - elif entry.has_key("title"): - entry_id = (self.url + "/" - + md5.new(cache.utf8(entry.title)).hexdigest()) - elif entry.has_key("summary"): - entry_id = (self.url + "/" - + md5.new(cache.utf8(entry.summary)).hexdigest()) + elif "title" in entry: + entry_id = self.url + "/" + md5.new(cache.utf8(entry.title)).hexdigest() + elif "summary" in entry: + entry_id = self.url + "/" + md5.new(cache.utf8(entry.summary)).hexdigest() else: log.error("Unable to find or generate id, entry ignored") continue @@ -761,8 +749,11 @@ def update_entries(self, entries): feed_items.append(entry_id) # Hide excess items the first time through - if self.last_updated is None and self._planet.new_feed_items \ - and len(feed_items) > self._planet.new_feed_items: + if ( + self.last_updated is None + and self._planet.new_feed_items + and len(feed_items) > self._planet.new_feed_items + ): item.hidden = "yes" log.debug("Marked <%s> as hidden (new feed)", entry_id) @@ -779,19 +770,20 @@ def update_entries(self, entries): break elif item.id in feed_items: feed_count -= 1 - elif item._channel.url_status != '226': - del(self._items[item.id]) + elif item._channel.url_status != "226": + del self._items[item.id] self._expired.append(item) log.debug("Removed expired or replaced item <%s>", item.id) def get_name(self, key): """Return the key containing the name.""" for key in ("name", "title"): - if self.has_key(key) and self.key_type(key) != self.NULL: + if key in self and self.key_type(key) != self.NULL: return self.get_as_string(key) return "" + class NewsItem(cache.CachedInfo): """An item of news. 
@@ -830,15 +822,15 @@ class NewsItem(cache.CachedInfo): Some feeds may define additional properties to those above. """ - IGNORE_KEYS = ("categories", "contributors", "enclosures", "links", - "guidislink", "date", "tags") + + IGNORE_KEYS = ("categories", "contributors", "enclosures", "links", "guidislink", "date", "tags") def __init__(self, channel, id_): cache.CachedInfo.__init__(self, channel._cache, id_) self._channel = channel self.id = id_ - self.id_hash = md5.new(id_).hexdigest() + self.id_hash = md5(id_.encode()).hexdigest() self.date = None self.order = None self.content = None @@ -850,62 +842,62 @@ def update(self, entry): if key in self.IGNORE_KEYS or key + "_parsed" in self.IGNORE_KEYS: # Ignored fields pass - elif entry.has_key(key + "_parsed"): + elif key + "_parsed" in entry: # Ignore unparsed date fields pass elif key.endswith("_detail"): # retain name, email, and language sub-fields - if entry[key].has_key('name') and entry[key].name: - self.set_as_string(key.replace("_detail","_name"), \ - entry[key].name) - if entry[key].has_key('email') and entry[key].email: - self.set_as_string(key.replace("_detail","_email"), \ - entry[key].email) - if entry[key].has_key('language') and entry[key].language and \ - (not self._channel.has_key('language') or \ - entry[key].language != self._channel.language): - self.set_as_string(key.replace("_detail","_language"), \ - entry[key].language) + if "name" in entry[key] and entry[key].name: + self.set_as_string(key.replace("_detail", "_name"), entry[key].name) + if "email" in entry[key] and entry[key].email: + self.set_as_string(key.replace("_detail", "_email"), entry[key].email) + if ( + "language" in entry[key] + and entry[key].language + and ("language" not in self._channel or entry[key].language != self._channel.language) + ): + self.set_as_string(key.replace("_detail", "_language"), entry[key].language) elif key.endswith("_parsed"): # Date fields if entry[key] is not None: - self.set_as_date(key[:-len("_parsed")], entry[key]) + self.set_as_date(key[: -len("_parsed")], entry[key]) elif key == "source": # Source field: save both url and value - if entry[key].has_key("value"): + if "value" in entry[key]: self.set_as_string(key + "_name", entry[key].value) - if entry[key].has_key("url"): + if "url" in entry[key]: self.set_as_string(key + "_link", entry[key].url) elif key == "content": # Content field: concatenate the values value = "" for item in entry[key]: - if item.type == 'text/html': + if item.type == "text/html": item.value = sanitize.HTML(item.value) - elif item.type == 'text/plain': + elif item.type == "text/plain": item.value = escape(item.value) - if item.has_key('language') and item.language and \ - (not self._channel.has_key('language') or - item.language != self._channel.language) : + if ( + "language" in item + and item.language + and ("language" not in self._channel or item.language != self._channel.language) + ): self.set_as_string(key + "_language", item.language) value += cache.utf8(item.value) self.set_as_string(key, value) - elif isinstance(entry[key], (str, unicode)): + elif isinstance(entry[key], str): # String fields try: - detail = key + '_detail' - if entry.has_key(detail): - if entry[detail].has_key('type'): - if entry[detail].type == 'text/html': + detail = key + "_detail" + if detail in entry: + if "type" in entry[detail]: + if entry[detail].type == "text/html": entry[key] = sanitize.HTML(entry[key]) - elif entry[detail].type == 'text/plain': + elif entry[detail].type == "text/plain": entry[key] = 
escape(entry[key]) self.set_as_string(key, entry[key]) except KeyboardInterrupt: raise except: - log.exception("Ignored '%s' of <%s>, unknown format", - key, self.id) + log.exception("Ignored '%s' of <%s>, unknown format", key, self.id) # Generate the date field if we need to self.get_date("date") @@ -923,9 +915,8 @@ def get_date(self, key): entries appear in posting sequence but don't overlap entries added in previous updates and don't creep into the next one. """ - for other_key in ("updated", "modified", "published", "issued", "created"): - if self.has_key(other_key): + if other_key in self: date = self.get_as_date(other_key) break else: @@ -934,9 +925,9 @@ def get_date(self, key): if date is not None: if date > self._channel.updated: date = self._channel.updated -# elif date < self._channel.last_updated: -# date = self._channel.updated - elif self.has_key(key) and self.key_type(key) != self.NULL: + # elif date < self._channel.last_updated: + # date = self._channel.updated + elif key in self and self.key_type(key) != self.NULL: return self.get_as_date(key) else: date = self._channel.updated @@ -947,7 +938,7 @@ def get_date(self, key): def get_content(self, key): """Return the key containing the content.""" for key in ("content", "tagline", "summary"): - if self.has_key(key) and self.key_type(key) != self.NULL: + if key in self and self.key_type(key) != self.NULL: return self.get_as_string(key) return "" diff --git a/code/planet/atomstyler.py b/code/planet/atomstyler.py index 9220702c..88d3a211 100644 --- a/code/planet/atomstyler.py +++ b/code/planet/atomstyler.py @@ -1,124 +1,137 @@ -from xml.dom import minidom, Node -from urlparse import urlparse, urlunparse -from xml.parsers.expat import ExpatError -from htmlentitydefs import name2codepoint import re +from html.entities import name2codepoint +from urllib.parse import urlparse, urlunparse +from xml.dom import Node, minidom +from xml.parsers.expat import ExpatError + # select and apply an xml:base for this entry class relativize: - def __init__(self, parent): - self.score = {} - self.links = [] - self.collect_and_tally(parent) - self.base = self.select_optimal_base() - if self.base: - if not parent.hasAttribute('xml:base'): - self.rebase(parent) - parent.setAttribute('xml:base', self.base) - - # collect and tally cite, href and src attributes - def collect_and_tally(self,parent): - uri = None - if parent.hasAttribute('cite'): uri=parent.getAttribute('cite') - if parent.hasAttribute('href'): uri=parent.getAttribute('href') - if parent.hasAttribute('src'): uri=parent.getAttribute('src') - - if uri: - parts=urlparse(uri) - if parts[0].lower() == 'http': - parts = (parts[1]+parts[2]).split('/') - base = None - for i in range(1,len(parts)): - base = tuple(parts[0:i]) - self.score[base] = self.score.get(base,0) + len(base) - if base and base not in self.links: self.links.append(base) + def __init__(self, parent): + self.score = {} + self.links = [] + self.collect_and_tally(parent) + self.base = self.select_optimal_base() + if self.base: + if not parent.hasAttribute("xml:base"): + self.rebase(parent) + parent.setAttribute("xml:base", self.base) + + # collect and tally cite, href and src attributes + def collect_and_tally(self, parent): + uri = None + if parent.hasAttribute("cite"): + uri = parent.getAttribute("cite") + if parent.hasAttribute("href"): + uri = parent.getAttribute("href") + if parent.hasAttribute("src"): + uri = parent.getAttribute("src") + + if uri: + parts = urlparse(uri) + if parts[0].lower() == "http": + parts = (parts[1] + 
parts[2]).split("/") + base = None + for i in range(1, len(parts)): + base = tuple(parts[0:i]) + self.score[base] = self.score.get(base, 0) + len(base) + if base and base not in self.links: + self.links.append(base) + + for node in parent.childNodes: + if node.nodeType == Node.ELEMENT_NODE: + self.collect_and_tally(node) + + # select the xml:base with the highest score + def select_optimal_base(self): + if not self.score: + return None + for link in self.links: + self.score[link] = 0 + winner = max(self.score.values()) + if not winner: + return None + for key in self.score.keys(): + if self.score[key] == winner: + if winner == len(key): + return None + return urlunparse(("http", key[0], "/".join(key[1:]), "", "", "")) + "/" + + # rewrite cite, href and src attributes using this base + def rebase(self, parent): + uri = None + if parent.hasAttribute("cite"): + uri = parent.getAttribute("cite") + if parent.hasAttribute("href"): + uri = parent.getAttribute("href") + if parent.hasAttribute("src"): + uri = parent.getAttribute("src") + if uri and uri.startswith(self.base): + uri = uri[len(self.base) :] or "." + if parent.hasAttribute("href"): + uri = parent.setAttribute("href", uri) + if parent.hasAttribute("src"): + uri = parent.setAttribute("src", uri) + + for node in parent.childNodes: + if node.nodeType == Node.ELEMENT_NODE: + self.rebase(node) - for node in parent.childNodes: - if node.nodeType == Node.ELEMENT_NODE: - self.collect_and_tally(node) - - # select the xml:base with the highest score - def select_optimal_base(self): - if not self.score: return None - for link in self.links: - self.score[link] = 0 - winner = max(self.score.values()) - if not winner: return None - for key in self.score.keys(): - if self.score[key] == winner: - if winner == len(key): return None - return urlunparse(('http', key[0], '/'.join(key[1:]), '', '', '')) + '/' - - # rewrite cite, href and src attributes using this base - def rebase(self,parent): - uri = None - if parent.hasAttribute('cite'): uri=parent.getAttribute('cite') - if parent.hasAttribute('href'): uri=parent.getAttribute('href') - if parent.hasAttribute('src'): uri=parent.getAttribute('src') - if uri and uri.startswith(self.base): - uri = uri[len(self.base):] or '.' - if parent.hasAttribute('href'): uri=parent.setAttribute('href', uri) - if parent.hasAttribute('src'): uri=parent.setAttribute('src', uri) - - for node in parent.childNodes: - if node.nodeType == Node.ELEMENT_NODE: - self.rebase(node) # convert type="html" to type="plain" or type="xhtml" as appropriate def retype(parent): - for node in parent.childNodes: - if node.nodeType == Node.ELEMENT_NODE: - - if node.hasAttribute('type') and node.getAttribute('type') == 'html': - if len(node.childNodes)==0: - node.removeAttribute('type') - elif len(node.childNodes)==1: - - # replace html entity defs with utf-8 - chunks=re.split('&(\w+);', node.childNodes[0].nodeValue) - for i in range(1,len(chunks),2): - if chunks[i] in ['amp', 'lt', 'gt', 'apos', 'quot']: - chunks[i] ='&' + chunks[i] +';' - elif chunks[i] in name2codepoint: - chunks[i]=unichr(name2codepoint[chunks[i]]) - else: - chunks[i]='&' + chunks[i] + ';' - text = u"".join(chunks) - - try: - # see if the resulting text is a well-formed XML fragment - div = '
<div xmlns="http://www.w3.org/1999/xhtml">%s</div>
' - data = minidom.parseString((div % text.encode('utf-8'))) - - if text.find('<') < 0: - # plain text - node.removeAttribute('type') - text = data.documentElement.childNodes[0].nodeValue - node.childNodes[0].replaceWholeText(text) - - elif len(text) > 80: - # xhtml - node.setAttribute('type', 'xhtml') - node.removeChild(node.childNodes[0]) - node.appendChild(data.documentElement) - - except ExpatError: - # leave as html - pass - - else: - # recurse - retype(node) - - if parent.nodeName == 'entry': - relativize(parent) - -if __name__ == '__main__': - - # run styler on each file mention on the command line - import sys - for feed in sys.argv[1:]: - doc = minidom.parse(feed) - doc.normalize() - retype(doc.documentElement) - open(feed,'w').write(doc.toxml('utf-8')) + for node in parent.childNodes: + if node.nodeType == Node.ELEMENT_NODE: + if node.hasAttribute("type") and node.getAttribute("type") == "html": + if len(node.childNodes) == 0: + node.removeAttribute("type") + elif len(node.childNodes) == 1: + # replace html entity defs with utf-8 + chunks = re.split(r"&(\w+);", node.childNodes[0].nodeValue) + for i in range(1, len(chunks), 2): + if chunks[i] in ["amp", "lt", "gt", "apos", "quot"]: + chunks[i] = "&" + chunks[i] + ";" + elif chunks[i] in name2codepoint: + chunks[i] = chr(name2codepoint[chunks[i]]) + else: + chunks[i] = "&" + chunks[i] + ";" + text = "".join(chunks) + + try: + # see if the resulting text is a well-formed XML fragment + div = '
<div xmlns="http://www.w3.org/1999/xhtml">%s</div>
'
+                        data = minidom.parseString(div % text)
+
+                        if text.find("<") < 0:
+                            # plain text
+                            node.removeAttribute("type")
+                            text = data.documentElement.childNodes[0].nodeValue
+                            node.childNodes[0].replaceWholeText(text)
+
+                        elif len(text) > 80:
+                            # xhtml
+                            node.setAttribute("type", "xhtml")
+                            node.removeChild(node.childNodes[0])
+                            node.appendChild(data.documentElement)
+
+                    except ExpatError:
+                        # leave as html
+                        pass
+
+            else:
+                # recurse
+                retype(node)
+
+    if parent.nodeName == "entry":
+        relativize(parent)
+
+
+if __name__ == "__main__":
+    # run styler on each file mentioned on the command line
+    import sys
+
+    for feed in sys.argv[1:]:
+        doc = minidom.parse(feed)
+        doc.normalize()
+        retype(doc.documentElement)
+        open(feed, "wb").write(doc.toxml("utf-8"))
diff --git a/code/planet/cache.py b/code/planet/cache.py
index dfc529b7..7c2ef52e 100644
--- a/code/planet/cache.py
+++ b/code/planet/cache.py
@@ -1,5 +1,4 @@
-#!/usr/bin/env python
-# -*- coding: UTF-8 -*-
+#!/usr/bin/env python3
 """Item cache.
 
 Between runs of Planet we need somewhere to store the feed information
@@ -12,13 +11,13 @@
 
 import os
 import re
-
+import time
 
 # Regular expressions to sanitise cache filenames
-re_url_scheme = re.compile(r'^[^:]*://')
-re_slash = re.compile(r'[?/]+')
-re_initial_cruft = re.compile(r'^[,.]*')
-re_final_cruft = re.compile(r'[,.]*$')
+re_url_scheme = re.compile(r"^[^:]*://")
+re_slash = re.compile(r"[?/]+")
+re_initial_cruft = re.compile(r"^[,.]*")
+re_final_cruft = re.compile(r"[,.]*$")
 
 
 class CachedInfo:
@@ -33,9 +32,10 @@ class CachedInfo:
     and implement get_FIELD and set_FIELD functions which will be
     automatically called.
     """
+
     STRING = "string"
-    DATE   = "date"
-    NULL   = "null"
+    DATE = "date"
+    NULL = "null"
 
     def __init__(self, cache, id_, root=0):
         self._type = {}
@@ -56,22 +56,19 @@ def cache_key(self, key):
 
     def cache_read(self):
         """Read information from the cache."""
-        if self._root:
-            keys_key = " keys"
-        else:
-            keys_key = self._id
+        keys_key = " keys" if self._root else self._id
 
-        if self._cache.has_key(keys_key):
+        if keys_key in self._cache:
             keys = self._cache[keys_key].split(" ")
         else:
             return
 
         for key in keys:
             cache_key = self.cache_key(key)
-            if not self._cached.has_key(key) or self._cached[key]:
+            if key not in self._cached or self._cached[key]:
                 # Key either hasn't been loaded, or is one for the cache
                 self._value[key] = self._cache[cache_key]
-                self._type[key] = self._cache[cache_key + " type"]
+                self._type[key] = self._cache[f"{cache_key} type"]
                 self._cached[key] = 1
 
     def cache_write(self, sync=1):
@@ -82,42 +79,34 @@ def cache_write(self, sync=1):
         for key in self.keys():
             cache_key = self.cache_key(key)
             if not self._cached[key]:
-                if self._cache.has_key(cache_key):
+                if cache_key in self._cache:
                     # Non-cached keys need to be cleared
-                    del(self._cache[cache_key])
-                    del(self._cache[cache_key + " type"])
+                    del self._cache[cache_key]
+                    del self._cache[f"{cache_key} type"]
                 continue
 
             keys.append(key)
             self._cache[cache_key] = self._value[key]
-            self._cache[cache_key + " type"] = self._type[key]
-
-        if self._root:
-            keys_key = " keys"
-        else:
-            keys_key = self._id
+            self._cache[f"{cache_key} type"] = self._type[key]
 
+        keys_key = " keys" if self._root else self._id
         self._cache[keys_key] = " ".join(keys)
         if sync:
             self._cache.sync()
 
     def cache_clear(self, sync=1):
         """Remove information from the cache."""
-        if self._root:
-            keys_key = " keys"
-        else:
-            keys_key = self._id
+        keys_key = " keys" if self._root else self._id
 
-        if self._cache.has_key(keys_key):
-            keys = self._cache[keys_key].split(" ")
- del(self._cache[keys_key]) - else: + if keys_key not in self._cache: return + keys = self._cache[keys_key].split(" ") + del self._cache[keys_key] for key in keys: cache_key = self.cache_key(key) - del(self._cache[cache_key]) - del(self._cache[cache_key + " type"]) + del self._cache[cache_key] + del self._cache[f"{cache_key} type"] if sync: self._cache.sync() @@ -125,7 +114,7 @@ def cache_clear(self, sync=1): def has_key(self, key): """Check whether the key exists.""" key = key.replace(" ", "_") - return self._value.has_key(key) + return key in self._value def key_type(self, key): """Return the key type.""" @@ -150,6 +139,8 @@ def set(self, key, value, cached=1): if value == None: return self.set_as_null(key, value) + elif isinstance(value, time.struct_time): + return self.set_as_date(key, value) else: try: return self.set_as_string(key, value) @@ -197,9 +188,8 @@ def set_as_string(self, key, value, cached=1): def get_as_string(self, key): """Return the key as a string value.""" key = key.replace(" ", "_") - if not self.has_key(key): - raise KeyError, key - + if key not in self._value: + raise KeyError(key) return self._value[key] def set_as_date(self, key, value, cached=1): @@ -207,7 +197,7 @@ def set_as_date(self, key, value, cached=1): The date should be a 9-item tuple as returned by time.gmtime(). """ - value = " ".join([ str(s) for s in value ]) + value = " ".join([str(s) for s in value]) key = key.replace(" ", "_") self._value[key] = value @@ -217,11 +207,10 @@ def set_as_date(self, key, value, cached=1): def get_as_date(self, key): """Return the key as a date value.""" key = key.replace(" ", "_") - if not self.has_key(key): - raise KeyError, key - + if key not in self._value: + raise KeyError(key) value = self._value[key] - return tuple([ int(i) for i in value.split(" ") ]) + return tuple(int(i) for i in value.split(" ")) def set_as_null(self, key, value, cached=1): """Set the key to the null value. 
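Note (illustrative, not part of the patch): set_as_date() and get_as_date() above round-trip a time.struct_time through a space-separated string, which keeps the stored payload backend-agnostic for any shelve/dbm implementation. A small sketch of that encoding:

    import time

    stamp = time.gmtime()
    encoded = " ".join(str(part) for part in stamp)            # what set_as_date stores
    decoded = tuple(int(part) for part in encoded.split(" "))  # what get_as_date returns
    assert decoded == tuple(stamp)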
@@ -236,20 +225,18 @@ def set_as_null(self, key, value, cached=1):
     def get_as_null(self, key):
         """Return the key as the null value."""
         key = key.replace(" ", "_")
-        if not self.has_key(key):
-            raise KeyError, key
-
-        return None
+        if key not in self._value:
+            raise KeyError(key)
 
     def del_key(self, key):
         """Delete the given key."""
         key = key.replace(" ", "_")
-        if not self.has_key(key):
-            raise KeyError, key
+        if key not in self._value:
+            raise KeyError(key)
 
-        del(self._value[key])
-        del(self._type[key])
-        del(self._cached[key])
+        del self._value[key]
+        del self._type[key]
+        del self._cached[key]
 
     def keys(self):
         """Return the list of cached keys."""
@@ -261,10 +248,10 @@ def __iter__(self):
 
     # Special methods
     __contains__ = has_key
-    __setitem__  = set_as_string
-    __getitem__  = get
-    __delitem__  = del_key
-    __delattr__  = del_key
+    __setitem__ = set_as_string
+    __getitem__ = get
+    __delitem__ = del_key
+    __delattr__ = del_key
 
     def __setattr__(self, key, value):
         if key.startswith("_"):
@@ -273,10 +260,9 @@ def __setattr__(self, key, value):
             self.set(key, value)
 
     def __getattr__(self, key):
-        if self.has_key(key):
+        if key in self._value:
             return self.get(key)
-        else:
-            raise AttributeError, key
+        raise AttributeError(key)
 
 
 def filename(directory, filename):
@@ -292,15 +278,9 @@ def filename(directory, filename):
     return os.path.join(directory, filename)
 
+
 def utf8(value):
     """Return the value as a UTF-8 string."""
-    if type(value) == type(u''):
-        return value.encode("utf-8")
-    else:
-        try:
-            return unicode(value, "utf-8").encode("utf-8")
-        except UnicodeError:
-            try:
-                return unicode(value, "iso-8859-1").encode("utf-8")
-            except UnicodeError:
-                return unicode(value, "ascii", "replace").encode("utf-8")
+    if isinstance(value, str):
+        return value
+    return value.decode("utf-8") if isinstance(value, bytes) else str(value)
diff --git a/code/planet/compat_logging/__init__.py b/code/planet/compat_logging/__init__.py
index 3bd0c6d7..63d60aed 100644
--- a/code/planet/compat_logging/__init__.py
+++ b/code/planet/compat_logging/__init__.py
@@ -14,8 +14,7 @@
 # IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
 # OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 
-"""
-Logging package for Python. Based on PEP 282 and comments thereto in
+"""Logging package for Python. Based on PEP 282 and comments thereto in
 comp.lang.python, and influenced by Apache's log4j system.
 
 Should work under Python versions >= 1.5.2, except that source line
@@ -26,29 +25,35 @@
 To use, simply 'import logging' and log away!
 """
 
-import sys, os, types, time, string, cStringIO
+import io
+import os
+import string
+import sys
+import time
+import types
 
 try:
-    import thread
     import threading
+
+    import _thread as thread
 except ImportError:
     thread = None
 
-__author__  = "Vinay Sajip "
-__status__  = "beta"
+__author__ = "Vinay Sajip "
+__status__ = "beta"
 __version__ = "0.4.8.1"
-__date__    = "26 June 2003"
+__date__ = "26 June 2003"
 
-#---------------------------------------------------------------------------
+# ---------------------------------------------------------------------------
 #   Miscellaneous module data
-#---------------------------------------------------------------------------
+# ---------------------------------------------------------------------------
 
 #
-#_srcfile is used when walking the stack to check when we've got the first
+# _srcfile is used when walking the stack to check when we've got the first
 # caller stack frame.
#
-if string.lower(__file__[-4:]) in ['.pyc', '.pyo']:
-    _srcfile = __file__[:-4] + '.py'
+if __file__[-4:].lower() in [".pyc", ".pyo"]:
+    _srcfile = __file__[:-4] + ".py"
 else:
     _srcfile = __file__
 _srcfile = os.path.normcase(_srcfile)
@@ -61,19 +66,19 @@
     _srcfile = None
 
 #
-#_startTime is used as the base when calculating the relative time of events
+# _startTime is used as the base when calculating the relative time of events
 #
 _startTime = time.time()
 
 #
-#raiseExceptions is used to see if exceptions during handling should be
-#propagated
+# raiseExceptions is used to see if exceptions during handling should be
+# propagated
 #
 raiseExceptions = 1
 
-#---------------------------------------------------------------------------
+# ---------------------------------------------------------------------------
 #   Level related stuff
-#---------------------------------------------------------------------------
+# ---------------------------------------------------------------------------
 
 #
 # Default levels and level names, these can be replaced with any positive set
 # of values having corresponding names. There is a pseudo-level, NOTSET, which
@@ -91,24 +96,24 @@
 NOTSET = 0
 
 _levelNames = {
-    CRITICAL : 'CRITICAL',
-    ERROR : 'ERROR',
-    WARNING : 'WARNING',
-    INFO : 'INFO',
-    DEBUG : 'DEBUG',
-    NOTSET : 'NOTSET',
-    'CRITICAL' : CRITICAL,
-    'ERROR' : ERROR,
-    'WARN' : WARNING,
-    'WARNING' : WARNING,
-    'INFO' : INFO,
-    'DEBUG' : DEBUG,
-    'NOTSET' : NOTSET,
+    CRITICAL: "CRITICAL",
+    ERROR: "ERROR",
+    WARNING: "WARNING",
+    INFO: "INFO",
+    DEBUG: "DEBUG",
+    NOTSET: "NOTSET",
+    "CRITICAL": CRITICAL,
+    "ERROR": ERROR,
+    "WARN": WARNING,
+    "WARNING": WARNING,
+    "INFO": INFO,
+    "DEBUG": DEBUG,
+    "NOTSET": NOTSET,
 }
 
+
 def getLevelName(level):
-    """
-    Return the textual representation of logging level 'level'.
+    """Return the textual representation of logging level 'level'.
 
     If the level is one of the predefined levels (CRITICAL, ERROR, WARNING,
     INFO, DEBUG) then you get the corresponding string. If you have
@@ -118,36 +123,37 @@ def getLevelName(level):
     """
     return _levelNames.get(level, ("Level %s" % level))
 
+
 def addLevelName(level, levelName):
-    """
-    Associate 'levelName' with 'level'.
+    """Associate 'levelName' with 'level'.
 
     This is used when converting levels to text during message formatting.
     """
     _acquireLock()
-    try:    #unlikely to cause an exception, but you never know...
+    try:  # unlikely to cause an exception, but you never know...
         _levelNames[level] = levelName
         _levelNames[levelName] = level
     finally:
         _releaseLock()
 
-#---------------------------------------------------------------------------
+
+# ---------------------------------------------------------------------------
 #   Thread-related stuff
-#---------------------------------------------------------------------------
+# ---------------------------------------------------------------------------
 
 #
-#_lock is used to serialize access to shared data structures in this module.
-#This needs to be an RLock because fileConfig() creates Handlers and so
-#might arbitrary user threads. Since Handler.__init__() updates the shared
-#dictionary _handlers, it needs to acquire the lock. But if configuring,
-#the lock would already have been acquired - so we need an RLock.
-#The same argument applies to Loggers and Manager.loggerDict.
+# _lock is used to serialize access to shared data structures in this module.
+# This needs to be an RLock because fileConfig() creates Handlers and so
+# might arbitrary user threads. Since Handler.__init__() updates the shared
Since Handler.__init__() updates the shared +# dictionary _handlers, it needs to acquire the lock. But if configuring, +# the lock would already have been acquired - so we need an RLock. +# The same argument applies to Loggers and Manager.loggerDict. # _lock = None + def _acquireLock(): - """ - Acquire the module-level lock for serializing access to shared data. + """Acquire the module-level lock for serializing access to shared data. This should be released with _releaseLock(). """ @@ -157,20 +163,20 @@ def _acquireLock(): if _lock: _lock.acquire() + def _releaseLock(): - """ - Release the module-level lock acquired by calling _acquireLock(). - """ + """Release the module-level lock acquired by calling _acquireLock().""" if _lock: _lock.release() -#--------------------------------------------------------------------------- + +# --------------------------------------------------------------------------- # The logging record -#--------------------------------------------------------------------------- +# --------------------------------------------------------------------------- + class LogRecord: - """ - A LogRecord instance represents an event being logged. + """A LogRecord instance represents an event being logged. LogRecord instances are created every time something is logged. They contain all the information pertinent to the event being logged. The @@ -180,10 +186,9 @@ class LogRecord: the source line where the logging call was made, and any exception information to be logged. """ + def __init__(self, name, level, pathname, lineno, msg, args, exc_info): - """ - Initialize a logging record with interesting information. - """ + """Initialize a logging record with interesting information.""" ct = time.time() self.name = name self.msg = msg @@ -200,42 +205,40 @@ def __init__(self, name, level, pathname, lineno, msg, args, exc_info): self.exc_info = exc_info self.lineno = lineno self.created = ct - self.msecs = (ct - long(ct)) * 1000 + self.msecs = (ct - int(ct)) * 1000 self.relativeCreated = (self.created - _startTime) * 1000 if thread: self.thread = thread.get_ident() else: self.thread = None - if hasattr(os, 'getpid'): + if hasattr(os, "getpid"): self.process = os.getpid() else: self.process = None def __str__(self): - return ''%(self.name, self.levelno, - self.pathname, self.lineno, self.msg) + return f'' def getMessage(self): - """ - Return the message for this LogRecord. + """Return the message for this LogRecord. Return the message for this LogRecord after merging any user-supplied arguments with the message. """ - if not hasattr(types, "UnicodeType"): #if no unicode support... + if not hasattr(types, "UnicodeType"): # if no unicode support... msg = str(self.msg) else: try: msg = str(self.msg) except UnicodeError: - msg = self.msg #Defer encoding till later + msg = self.msg # Defer encoding till later if self.args: msg = msg % self.args return msg + def makeLogRecord(dict): - """ - Make a LogRecord whose attributes are defined by the specified dictionary, + """Make a LogRecord whose attributes are defined by the specified dictionary, This function is useful for converting a logging event received over a socket connection (which is sent as a dictionary) into a LogRecord instance. 
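[Review note] `LogRecord.__str__()` above came through the conversion empty: `return f''` formats nothing, and the `-` side of the hunk is missing the same text, so the template (which is wrapped in angle brackets) appears to have been stripped as an HTML tag somewhere along the way. Reconstructed from the CPython logging source, where the original reads `'<LogRecord: %s, %s, %s, %s, "%s">'`:

```python
def __str__(self):
    # Reconstructed: the angle-bracketed template was lost in the port.
    return (
        f'<LogRecord: {self.name}, {self.levelno}, '
        f'{self.pathname}, {self.lineno}, "{self.msg}">'
    )
```

Relatedly, `getMessage()`'s `hasattr(types, "UnicodeType")` probe is always false on Python 3 (the attribute is gone), so only the plain `str(self.msg)` branch ever runs and the `UnicodeError` fallback is dead code; `import types` is only kept alive by these probes.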
@@ -244,13 +247,14 @@ def makeLogRecord(dict): rv.__dict__.update(dict) return rv -#--------------------------------------------------------------------------- + +# --------------------------------------------------------------------------- # Formatter classes and functions -#--------------------------------------------------------------------------- +# --------------------------------------------------------------------------- + class Formatter: - """ - Formatter instances are used to convert a LogRecord to text. + """Formatter instances are used to convert a LogRecord to text. Formatters need to know how a LogRecord is constructed. They are responsible for converting a LogRecord to (usually) a string which can @@ -291,8 +295,7 @@ class Formatter: converter = time.localtime def __init__(self, fmt=None, datefmt=None): - """ - Initialize the formatter with specified format strings. + """Initialize the formatter with specified format strings. Initialize the formatter either with the specified format string, or a default as described above. Allow for specialized date formatting with @@ -305,8 +308,7 @@ def __init__(self, fmt=None, datefmt=None): self.datefmt = datefmt def formatTime(self, record, datefmt=None): - """ - Return the creation time of the specified LogRecord as formatted text. + """Return the creation time of the specified LogRecord as formatted text. This method should be called from format() by a formatter which wants to make use of a formatted time. This method can be overridden @@ -331,14 +333,14 @@ def formatTime(self, record, datefmt=None): return s def formatException(self, ei): - """ - Format and return the specified exception information as a string. + """Format and return the specified exception information as a string. This default implementation just uses traceback.print_exception() """ import traceback - sio = cStringIO.StringIO() + + sio = io.StringIO() traceback.print_exception(ei[0], ei[1], ei[2], None, sio) s = sio.getvalue() sio.close() @@ -347,8 +349,7 @@ def formatException(self, ei): return s def format(self, record): - """ - Format the specified record as text. + """Format the specified record as text. The record's attribute dictionary is used as the operand to a string formatting operation which yields the returned string. @@ -360,7 +361,7 @@ def format(self, record): formatException() and appended to the message. """ record.message = record.getMessage() - if string.find(self._fmt,"%(asctime)") >= 0: + if string.find(self._fmt, "%(asctime)") >= 0: record.asctime = self.formatTime(record, self.datefmt) s = self._fmt % record.__dict__ if record.exc_info: @@ -369,18 +370,18 @@ def format(self, record): s = s + self.formatException(record.exc_info) return s + # # The default formatter to use when no other is specified # _defaultFormatter = Formatter() + class BufferingFormatter: - """ - A formatter suitable for formatting a number of records. - """ + """A formatter suitable for formatting a number of records.""" + def __init__(self, linefmt=None): - """ - Optionally specify a formatter which will be used to format each + """Optionally specify a formatter which will be used to format each individual record. """ if linefmt: @@ -389,21 +390,15 @@ def __init__(self, linefmt=None): self.linefmt = _defaultFormatter def formatHeader(self, records): - """ - Return the header string for the specified records. 
- """ + """Return the header string for the specified records.""" return "" def formatFooter(self, records): - """ - Return the footer string for the specified records. - """ + """Return the footer string for the specified records.""" return "" def format(self, records): - """ - Format the specified records and return the result as a string. - """ + """Format the specified records and return the result as a string.""" rv = "" if len(records) > 0: rv = rv + self.formatHeader(records) @@ -412,13 +407,14 @@ def format(self, records): rv = rv + self.formatFooter(records) return rv -#--------------------------------------------------------------------------- + +# --------------------------------------------------------------------------- # Filter classes and functions -#--------------------------------------------------------------------------- +# --------------------------------------------------------------------------- + class Filter: - """ - Filter instances are used to perform arbitrary filtering of LogRecords. + """Filter instances are used to perform arbitrary filtering of LogRecords. Loggers and Handlers can optionally use Filter instances to filter records as desired. The base filter class only allows events which are @@ -427,9 +423,9 @@ class Filter: "A.B.C", "A.B.C.D", "A.B.D" etc. but not "A.BB", "B.A.B" etc. If initialized with the empty string, all events are passed. """ - def __init__(self, name=''): - """ - Initialize a filter. + + def __init__(self, name=""): + """Initialize a filter. Initialize with the name of the logger which, together with its children, will have its events allowed through the filter. If no @@ -439,48 +435,39 @@ def __init__(self, name=''): self.nlen = len(name) def filter(self, record): - """ - Determine if the specified record is to be logged. + """Determine if the specified record is to be logged. Is the specified record to be logged? Returns 0 for no, nonzero for yes. If deemed appropriate, the record may be modified in-place. """ - if self.nlen == 0: - return 1 - elif self.name == record.name: + if self.nlen == 0 or self.name == record.name: return 1 elif string.find(record.name, self.name, 0, self.nlen) != 0: return 0 - return (record.name[self.nlen] == ".") + return record.name[self.nlen] == "." + class Filterer: - """ - A base class for loggers and handlers which allows them to share + """A base class for loggers and handlers which allows them to share common code. """ + def __init__(self): - """ - Initialize the list of filters to be an empty list. - """ + """Initialize the list of filters to be an empty list.""" self.filters = [] def addFilter(self, filter): - """ - Add the specified filter to this handler. - """ - if not (filter in self.filters): + """Add the specified filter to this handler.""" + if filter not in self.filters: self.filters.append(filter) def removeFilter(self, filter): - """ - Remove the specified filter from this handler. - """ + """Remove the specified filter from this handler.""" if filter in self.filters: self.filters.remove(filter) def filter(self, record): - """ - Determine if a record is loggable by consulting all the filters. + """Determine if a record is loggable by consulting all the filters. The default is to allow the record to be logged; any filter can veto this and the record is then dropped. 
Returns a zero value if a record @@ -493,69 +480,61 @@ def filter(self, record): break return rv -#--------------------------------------------------------------------------- + +# --------------------------------------------------------------------------- # Handler classes and functions -#--------------------------------------------------------------------------- +# --------------------------------------------------------------------------- + +_handlers = {} # repository of handlers (for flushing when shutdown called) -_handlers = {} #repository of handlers (for flushing when shutdown called) class Handler(Filterer): - """ - Handler instances dispatch logging events to specific destinations. + """Handler instances dispatch logging events to specific destinations. The base handler class. Acts as a placeholder which defines the Handler interface. Handlers can optionally use Formatter instances to format records as desired. By default, no formatter is specified; in this case, the 'raw' message as determined by record.message is logged. """ + def __init__(self, level=NOTSET): - """ - Initializes the instance - basically setting the formatter to None + """Initializes the instance - basically setting the formatter to None and the filter list to empty. """ Filterer.__init__(self) self.level = level self.formatter = None - #get the module data lock, as we're updating a shared structure. + # get the module data lock, as we're updating a shared structure. _acquireLock() - try: #unlikely to raise an exception, but you never know... + try: # unlikely to raise an exception, but you never know... _handlers[self] = 1 finally: _releaseLock() self.createLock() def createLock(self): - """ - Acquire a thread lock for serializing access to the underlying I/O. - """ + """Acquire a thread lock for serializing access to the underlying I/O.""" if thread: self.lock = thread.allocate_lock() else: self.lock = None def acquire(self): - """ - Acquire the I/O thread lock. - """ + """Acquire the I/O thread lock.""" if self.lock: self.lock.acquire() def release(self): - """ - Release the I/O thread lock. - """ + """Release the I/O thread lock.""" if self.lock: self.lock.release() def setLevel(self, level): - """ - Set the logging level of this handler. - """ + """Set the logging level of this handler.""" self.level = level def format(self, record): - """ - Format the specified record. + """Format the specified record. If a formatter is set, use it. Otherwise, use the default formatter for the module. @@ -567,18 +546,15 @@ def format(self, record): return fmt.format(record) def emit(self, record): - """ - Do whatever it takes to actually log the specified logging record. + """Do whatever it takes to actually log the specified logging record. This version is intended to be implemented by subclasses and so raises a NotImplementedError. """ - raise NotImplementedError, 'emit must be implemented '\ - 'by Handler subclasses' + raise NotImplementedError("emit must be implemented by Handler subclasses") def handle(self, record): - """ - Conditionally emit the specified logging record. + """Conditionally emit the specified logging record. Emission depends on filters which may have been added to the handler. Wrap the actual emission of the record with acquisition/release of @@ -595,32 +571,25 @@ def handle(self, record): return rv def setFormatter(self, fmt): - """ - Set the formatter for this handler. 
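[Review note] `Handler.createLock()` above still reaches for `thread.allocate_lock()`, which only works once the `_thread` import fix suggested earlier is in place; as merged, `thread` is `None` and every handler runs unlocked. Since `threading` is always available on Python 3, the conditional can simply go:

```python
def createLock(self):
    """Sketch: threading.Lock() is the modern spelling of
    thread.allocate_lock(), with no availability check needed."""
    self.lock = threading.Lock()
```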
- """ + """Set the formatter for this handler.""" self.formatter = fmt def flush(self): - """ - Ensure all logging output has been flushed. + """Ensure all logging output has been flushed. This version does nothing and is intended to be implemented by subclasses. """ - pass def close(self): - """ - Tidy up any resources used by the handler. + """Tidy up any resources used by the handler. This version does nothing and is intended to be implemented by subclasses. """ - pass def handleError(self, record): - """ - Handle errors which occur during an emit() call. + """Handle errors which occur during an emit() call. This method should be called from handlers when an exception is encountered during an emit() call. If raiseExceptions is false, @@ -632,19 +601,20 @@ def handleError(self, record): """ if raiseExceptions: import traceback + ei = sys.exc_info() traceback.print_exception(ei[0], ei[1], ei[2], None, sys.stderr) del ei + class StreamHandler(Handler): - """ - A handler class which writes logging records, appropriately formatted, + """A handler class which writes logging records, appropriately formatted, to a stream. Note that this class does not close the stream, as sys.stdout or sys.stderr may be used. """ + def __init__(self, strm=None): - """ - Initialize the handler. + """Initialize the handler. If strm is not specified, sys.stderr is used. """ @@ -655,14 +625,11 @@ def __init__(self, strm=None): self.formatter = None def flush(self): - """ - Flushes the stream. - """ + """Flushes the stream.""" self.stream.flush() def emit(self, record): - """ - Emit a record. + """Emit a record. If a formatter is specified, it is used to format the record. The record is then written to the stream with a trailing newline @@ -672,7 +639,7 @@ def emit(self, record): """ try: msg = self.format(record) - if not hasattr(types, "UnicodeType"): #if no unicode support... + if not hasattr(types, "UnicodeType"): # if no unicode support... self.stream.write("%s\n" % msg) else: try: @@ -683,82 +650,74 @@ def emit(self, record): except: self.handleError(record) + class FileHandler(StreamHandler): - """ - A handler class which writes formatted logging records to disk files. - """ + """A handler class which writes formatted logging records to disk files.""" + def __init__(self, filename, mode="a"): - """ - Open the specified file and use it as the stream for logging. - """ + """Open the specified file and use it as the stream for logging.""" StreamHandler.__init__(self, open(filename, mode)) self.baseFilename = filename self.mode = mode def close(self): - """ - Closes the stream. - """ + """Closes the stream.""" self.stream.close() -#--------------------------------------------------------------------------- + +# --------------------------------------------------------------------------- # Manager classes and functions -#--------------------------------------------------------------------------- +# --------------------------------------------------------------------------- + class PlaceHolder: - """ - PlaceHolder instances are used in the Manager logger hierarchy to take + """PlaceHolder instances are used in the Manager logger hierarchy to take the place of nodes for which no loggers have been defined [FIXME add example]. """ + def __init__(self, alogger): - """ - Initialize with the specified logger being a child of this placeholder. 
- """ + """Initialize with the specified logger being a child of this placeholder.""" self.loggers = [alogger] def append(self, alogger): - """ - Add the specified logger as a child of this placeholder. - """ + """Add the specified logger as a child of this placeholder.""" if alogger not in self.loggers: self.loggers.append(alogger) + # # Determine which class to use when instantiating loggers. # _loggerClass = None + def setLoggerClass(klass): - """ - Set the class to be used when instantiating a logger. The class should + """Set the class to be used when instantiating a logger. The class should define __init__() such that only a name argument is required, and the __init__() should call Logger.__init__() """ if klass != Logger: if not issubclass(klass, Logger): - raise TypeError, "logger not derived from logging.Logger: " + \ - klass.__name__ + raise TypeError(f"logger not derived from logging.Logger: {klass.__name__}") global _loggerClass _loggerClass = klass + class Manager: - """ - There is [under normal circumstances] just one Manager instance, which + """There is [under normal circumstances] just one Manager instance, which holds the hierarchy of loggers. """ + def __init__(self, rootnode): - """ - Initialize the manager with the root node of the logger hierarchy. - """ + """Initialize the manager with the root node of the logger hierarchy.""" self.root = rootnode self.disable = 0 self.emittedNoHandlerWarning = 0 self.loggerDict = {} def getLogger(self, name): - """ - Get a logger with the specified name (channel name), creating it + """Get a logger with the specified name (channel name), creating it if it doesn't yet exist. If a PlaceHolder existed for the specified name [i.e. the logger @@ -788,8 +747,7 @@ def getLogger(self, name): return rv def _fixupParents(self, alogger): - """ - Ensure that there are either loggers or placeholders all the way + """Ensure that there are either loggers or placeholders all the way from the specified logger to the root of the logger hierarchy. """ name = alogger.name @@ -812,22 +770,22 @@ def _fixupParents(self, alogger): alogger.parent = rv def _fixupChildren(self, ph, alogger): - """ - Ensure that children of the placeholder ph are connected to the + """Ensure that children of the placeholder ph are connected to the specified logger. """ for c in ph.loggers: - if string.find(c.parent.name, alogger.name) <> 0: + if string.find(c.parent.name, alogger.name) != 0: alogger.parent = c.parent c.parent = alogger -#--------------------------------------------------------------------------- + +# --------------------------------------------------------------------------- # Logger classes and functions -#--------------------------------------------------------------------------- +# --------------------------------------------------------------------------- + class Logger(Filterer): - """ - Instances of the Logger class represent a single logging channel. A + """Instances of the Logger class represent a single logging channel. A "logging channel" indicates an area of an application. Exactly how an "area" is defined is up to the application developer. Since an application can have any number of areas, logging channels are identified @@ -840,10 +798,9 @@ class Logger(Filterer): level, and "input.csv", "input.xls" and "input.gnu" for the sub-levels. There is no arbitrary limit to the depth of nesting. """ + def __init__(self, name, level=NOTSET): - """ - Initialize the logger with a name and an optional level. 
- """ + """Initialize the logger with a name and an optional level.""" Filterer.__init__(self) self.name = name self.level = level @@ -853,20 +810,17 @@ def __init__(self, name, level=NOTSET): self.disabled = 0 def setLevel(self, level): - """ - Set the logging level of this logger. - """ + """Set the logging level of this logger.""" self.level = level -# def getRoot(self): -# """ -# Get the root of the logger hierarchy. -# """ -# return Logger.root + # def getRoot(self): + # """ + # Get the root of the logger hierarchy. + # """ + # return Logger.root def debug(self, msg, *args, **kwargs): - """ - Log 'msg % args' with severity 'DEBUG'. + """Log 'msg % args' with severity 'DEBUG'. To pass exception information, use the keyword argument exc_info with a true value, e.g. @@ -875,12 +829,11 @@ def debug(self, msg, *args, **kwargs): """ if self.manager.disable >= DEBUG: return - if DEBUG >= self.getEffectiveLevel(): - apply(self._log, (DEBUG, msg, args), kwargs) + if self.getEffectiveLevel() <= DEBUG: + self._log(DEBUG, msg, *args, **kwargs) def info(self, msg, *args, **kwargs): - """ - Log 'msg % args' with severity 'INFO'. + """Log 'msg % args' with severity 'INFO'. To pass exception information, use the keyword argument exc_info with a true value, e.g. @@ -889,12 +842,11 @@ def info(self, msg, *args, **kwargs): """ if self.manager.disable >= INFO: return - if INFO >= self.getEffectiveLevel(): - apply(self._log, (INFO, msg, args), kwargs) + if self.getEffectiveLevel() <= INFO: + self._log(INFO, msg, args, **kwargs) def warning(self, msg, *args, **kwargs): - """ - Log 'msg % args' with severity 'WARNING'. + """Log 'msg % args' with severity 'WARNING'. To pass exception information, use the keyword argument exc_info with a true value, e.g. @@ -904,13 +856,12 @@ def warning(self, msg, *args, **kwargs): if self.manager.disable >= WARNING: return if self.isEnabledFor(WARNING): - apply(self._log, (WARNING, msg, args), kwargs) + self._log(WARNING, msg, args, **kwargs) warn = warning def error(self, msg, *args, **kwargs): - """ - Log 'msg % args' with severity 'ERROR'. + """Log 'msg % args' with severity 'ERROR'. To pass exception information, use the keyword argument exc_info with a true value, e.g. @@ -920,17 +871,14 @@ def error(self, msg, *args, **kwargs): if self.manager.disable >= ERROR: return if self.isEnabledFor(ERROR): - apply(self._log, (ERROR, msg, args), kwargs) + self._log(ERROR, msg, args, **kwargs) def exception(self, msg, *args): - """ - Convenience method for logging an ERROR with exception information. - """ - apply(self.error, (msg,) + args, {'exc_info': 1}) + """Convenience method for logging an ERROR with exception information.""" + self.error(msg, *args, exc_info=True) def critical(self, msg, *args, **kwargs): - """ - Log 'msg % args' with severity 'CRITICAL'. + """Log 'msg % args' with severity 'CRITICAL'. To pass exception information, use the keyword argument exc_info with a true value, e.g. @@ -939,14 +887,13 @@ def critical(self, msg, *args, **kwargs): """ if self.manager.disable >= CRITICAL: return - if CRITICAL >= self.getEffectiveLevel(): - apply(self._log, (CRITICAL, msg, args), kwargs) + if self.getEffectiveLevel() <= CRITICAL: + self._log(CRITICAL, msg, *args, **kwargs) fatal = critical def log(self, level, msg, *args, **kwargs): - """ - Log 'msg % args' with the severity 'level'. + """Log 'msg % args' with the severity 'level'. To pass exception information, use the keyword argument exc_info with a true value, e.g. 
@@ -956,11 +903,10 @@ def log(self, level, msg, *args, **kwargs): if self.manager.disable >= level: return if self.isEnabledFor(level): - apply(self._log, (level, msg, args), kwargs) + self._log(level, msg, args, **kwargs) def findCaller(self): - """ - Find the stack frame of the caller so that we can note the source + """Find the stack frame of the caller so that we can note the source file name and line number. """ f = sys._getframe(1) @@ -973,15 +919,13 @@ def findCaller(self): return filename, f.f_lineno def makeRecord(self, name, level, fn, lno, msg, args, exc_info): - """ - A factory method which can be overridden in subclasses to create + """A factory method which can be overridden in subclasses to create specialized LogRecords. """ return LogRecord(name, level, fn, lno, msg, args, exc_info) def _log(self, level, msg, args, exc_info=None): - """ - Low-level logging routine which creates a LogRecord and then calls + """Low-level logging routine which creates a LogRecord and then calls all the handlers of this logger to handle the record. """ if _srcfile: @@ -994,8 +938,7 @@ def _log(self, level, msg, args, exc_info=None): self.handle(record) def handle(self, record): - """ - Call the handlers for the specified record. + """Call the handlers for the specified record. This method is used for unpickled records received from a socket, as well as those created locally. Logger-level filtering is applied. @@ -1004,23 +947,18 @@ def handle(self, record): self.callHandlers(record) def addHandler(self, hdlr): - """ - Add the specified handler to this logger. - """ - if not (hdlr in self.handlers): + """Add the specified handler to this logger.""" + if hdlr not in self.handlers: self.handlers.append(hdlr) def removeHandler(self, hdlr): - """ - Remove the specified handler from this logger. - """ + """Remove the specified handler from this logger.""" if hdlr in self.handlers: - #hdlr.close() + # hdlr.close() self.handlers.remove(hdlr) def callHandlers(self, record): - """ - Pass a record to all relevant handlers. + """Pass a record to all relevant handlers. Loop through all handlers for this logger and its parents in the logger hierarchy. If no handler was found, output a one-off error @@ -1036,17 +974,15 @@ def callHandlers(self, record): if record.levelno >= hdlr.level: hdlr.handle(record) if not c.propagate: - c = None #break out + c = None # break out else: c = c.parent if (found == 0) and not self.manager.emittedNoHandlerWarning: - sys.stderr.write("No handlers could be found for logger" - " \"%s\"\n" % self.name) + sys.stderr.write("No handlers could be found for logger" ' "%s"\n' % self.name) self.manager.emittedNoHandlerWarning = 1 def getEffectiveLevel(self): - """ - Get the effective level for this logger. + """Get the effective level for this logger. Loop through this logger and its parents in the logger hierarchy, looking for a non-zero logging level. Return the first one found. @@ -1059,40 +995,38 @@ def getEffectiveLevel(self): return NOTSET def isEnabledFor(self, level): - """ - Is this logger enabled for level 'level'? - """ + """Is this logger enabled for level 'level'?""" if self.manager.disable >= level: return 0 return level >= self.getEffectiveLevel() + class RootLogger(Logger): - """ - A root logger is not that different to any other logger, except that + """A root logger is not that different to any other logger, except that it must have a logging level and there is only one instance of it in the hierarchy. 
""" + def __init__(self, level): - """ - Initialize the logger with the name "root". - """ + """Initialize the logger with the name "root".""" Logger.__init__(self, "root", level) + _loggerClass = Logger root = RootLogger(WARNING) Logger.root = root Logger.manager = Manager(Logger.root) -#--------------------------------------------------------------------------- +# --------------------------------------------------------------------------- # Configuration classes and functions -#--------------------------------------------------------------------------- +# --------------------------------------------------------------------------- BASIC_FORMAT = "%(levelname)s:%(name)s:%(message)s" + def basicConfig(): - """ - Do basic configuration for the logging system by creating a + """Do basic configuration for the logging system by creating a StreamHandler with a default Formatter and adding it to the root logger. """ @@ -1102,14 +1036,15 @@ def basicConfig(): hdlr.setFormatter(fmt) root.addHandler(hdlr) -#--------------------------------------------------------------------------- + +# --------------------------------------------------------------------------- # Utility functions at module level. # Basically delegate everything to the root logger. -#--------------------------------------------------------------------------- +# --------------------------------------------------------------------------- + def getLogger(name=None): - """ - Return a logger with the specified name, creating it if necessary. + """Return a logger with the specified name, creating it if necessary. If no name is specified, return the root logger. """ @@ -1118,7 +1053,8 @@ def getLogger(name=None): else: return root -#def getRootLogger(): + +# def getRootLogger(): # """ # Return the root logger. # @@ -1127,70 +1063,66 @@ def getLogger(name=None): # """ # return root + def critical(msg, *args, **kwargs): - """ - Log a message with severity 'CRITICAL' on the root logger. - """ + """Log a message with severity 'CRITICAL' on the root logger.""" if len(root.handlers) == 0: basicConfig() - apply(root.critical, (msg,)+args, kwargs) + root.critical(msg, *args, **kwargs) + fatal = critical + def error(msg, *args, **kwargs): - """ - Log a message with severity 'ERROR' on the root logger. - """ + """Log a message with severity 'ERROR' on the root logger.""" if len(root.handlers) == 0: basicConfig() - apply(root.error, (msg,)+args, kwargs) + root.error(msg, *args, **kwargs) + def exception(msg, *args): - """ - Log a message with severity 'ERROR' on the root logger, + """Log a message with severity 'ERROR' on the root logger, with exception information. """ - apply(error, (msg,)+args, {'exc_info': 1}) + error(msg, *args, exc_info=True) + def warning(msg, *args, **kwargs): - """ - Log a message with severity 'WARNING' on the root logger. - """ + """Log a message with severity 'WARNING' on the root logger.""" if len(root.handlers) == 0: basicConfig() - apply(root.warning, (msg,)+args, kwargs) + root.warning(msg, *args, **kwargs) + warn = warning + def info(msg, *args, **kwargs): - """ - Log a message with severity 'INFO' on the root logger. - """ + """Log a message with severity 'INFO' on the root logger.""" if len(root.handlers) == 0: basicConfig() - apply(root.info, (msg,)+args, kwargs) + root.info(msg, *args, **kwargs) + def debug(msg, *args, **kwargs): - """ - Log a message with severity 'DEBUG' on the root logger. 
- """ + """Log a message with severity 'DEBUG' on the root logger.""" if len(root.handlers) == 0: basicConfig() - apply(root.debug, (msg,)+args, kwargs) + root.debug(msg, *args, **kwargs) + def disable(level): - """ - Disable all logging calls less severe than 'level'. - """ + """Disable all logging calls less severe than 'level'.""" root.manager.disable = level + def shutdown(): - """ - Perform any cleanup actions in the logging system (e.g. flushing + """Perform any cleanup actions in the logging system (e.g. flushing buffers). Should be called at application exit. """ - for h in _handlers.keys(): + for h in _handlers: h.flush() h.close() diff --git a/code/planet/compat_logging/config.py b/code/planet/compat_logging/config.py index d4d08f01..c25643aa 100644 --- a/code/planet/compat_logging/config.py +++ b/code/planet/compat_logging/config.py @@ -14,8 +14,7 @@ # IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT # OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. -""" -Logging package for Python. Based on PEP 282 and comments thereto in +"""Logging package for Python. Based on PEP 282 and comments thereto in comp.lang.python, and influenced by Apache's log4j system. Should work under Python versions >= 1.5.2, except that source line @@ -26,16 +25,20 @@ To use, simply 'import logging' and log away! """ -import sys, logging, logging.handlers, string, thread, threading, socket, struct, os - -from SocketServer import ThreadingTCPServer, StreamRequestHandler - +import logging +import logging.handlers +import os +import string +import struct +import sys +import threading +from socketserver import StreamRequestHandler, ThreadingTCPServer DEFAULT_LOGGING_CONFIG_PORT = 9030 if sys.platform == "win32": - RESET_ERROR = 10054 #WSAECONNRESET + RESET_ERROR = 10054 # WSAECONNRESET else: - RESET_ERROR = 104 #ECONNRESET + RESET_ERROR = 104 # ECONNRESET # # The following code implements a socket listener for on-the-fly @@ -44,9 +47,9 @@ # _listener holds the server object doing the listening _listener = None + def fileConfig(fname, defaults=None): - """ - Read the logging configuration from a ConfigParser-format file. + """Read the logging configuration from a ConfigParser-format file. This can be called several times from an application, allowing an end user the ability to select from various pre-canned configurations (if the @@ -57,14 +60,14 @@ def fileConfig(fname, defaults=None): rather than a filename, in which case the file-like object will be read using readfp. """ - import ConfigParser + import configparser - cp = ConfigParser.ConfigParser(defaults) - if hasattr(cp, 'readfp') and hasattr(fname, 'readline'): + cp = configparser.ConfigParser(defaults) + if hasattr(cp, "readfp") and hasattr(fname, "readline"): cp.readfp(fname) else: cp.read(fname) - #first, do the formatters... + # first, do the formatters... flist = cp.get("formatters", "keys") if len(flist): flist = string.split(flist, ",") @@ -82,19 +85,19 @@ def fileConfig(fname, defaults=None): dfs = None f = logging.Formatter(fs, dfs) formatters[form] = f - #next, do the handlers... - #critical section... + # next, do the handlers... + # critical section... logging._acquireLock() try: try: - #first, lose the existing handlers... + # first, lose the existing handlers... logging._handlers.clear() - #now set up the new ones... + # now set up the new ones... 
hlist = cp.get("handlers", "keys") if len(hlist): hlist = string.split(hlist, ",") handlers = {} - fixups = [] #for inter-handler references + fixups = [] # for inter-handler references for hand in hlist: sectname = "handler_%s" % hand klass = cp.get(sectname, "class") @@ -106,27 +109,27 @@ def fileConfig(fname, defaults=None): klass = eval(klass, vars(logging)) args = cp.get(sectname, "args") args = eval(args, vars(logging)) - h = apply(klass, args) + h = klass(*args) if "level" in opts: level = cp.get(sectname, "level") h.setLevel(logging._levelNames[level]) if len(fmt): h.setFormatter(formatters[fmt]) - #temporary hack for FileHandler and MemoryHandler. + # temporary hack for FileHandler and MemoryHandler. if klass == logging.handlers.MemoryHandler: if "target" in opts: - target = cp.get(sectname,"target") + target = cp.get(sectname, "target") else: target = "" - if len(target): #the target handler may not be loaded yet, so keep for later... + if len(target): # the target handler may not be loaded yet, so keep for later... fixups.append((h, target)) handlers[hand] = h - #now all handlers are loaded, fixup inter-handler references... + # now all handlers are loaded, fixup inter-handler references... for fixup in fixups: h = fixup[0] t = fixup[1] h.setTarget(handlers[t]) - #at last, the loggers...first the root... + # at last, the loggers...first the root... llist = cp.get("loggers", "keys") llist = string.split(llist, ",") llist.remove("root") @@ -144,17 +147,17 @@ def fileConfig(fname, defaults=None): hlist = string.split(hlist, ",") for hand in hlist: log.addHandler(handlers[hand]) - #and now the others... - #we don't want to lose the existing loggers, - #since other threads may have pointers to them. - #existing is set to contain all existing loggers, - #and as we go through the new configuration we - #remove any which are configured. At the end, - #what's left in existing is the set of loggers - #which were in the previous configuration but - #which are not in the new configuration. + # and now the others... + # we don't want to lose the existing loggers, + # since other threads may have pointers to them. + # existing is set to contain all existing loggers, + # and as we go through the new configuration we + # remove any which are configured. At the end, + # what's left in existing is the set of loggers + # which were in the previous configuration but + # which are not in the new configuration. existing = root.manager.loggerDict.keys() - #now set up the new ones... + # now set up the new ones... for log in llist: sectname = "logger_%s" % log qn = cp.get(sectname, "qualname") @@ -178,22 +181,23 @@ def fileConfig(fname, defaults=None): hlist = string.split(hlist, ",") for hand in hlist: logger.addHandler(handlers[hand]) - #Disable any old loggers. There's no point deleting - #them as other threads may continue to hold references - #and by disabling them, you stop them doing any logging. + # Disable any old loggers. There's no point deleting + # them as other threads may continue to hold references + # and by disabling them, you stop them doing any logging. for log in existing: root.manager.loggerDict[log].disabled = 1 except: import traceback + ei = sys.exc_info() traceback.print_exception(ei[0], ei[1], ei[2], None, sys.stderr) del ei finally: logging._releaseLock() + def listen(port=DEFAULT_LOGGING_CONFIG_PORT): - """ - Start up a socket server on the specified port, and listen for new + """Start up a socket server on the specified port, and listen for new configurations. 
These will be sent as a file suitable for processing by fileConfig(). @@ -201,25 +205,25 @@ def listen(port=DEFAULT_LOGGING_CONFIG_PORT): and which you can join() when appropriate. To stop the server, call stopListening(). """ - if not thread: - raise NotImplementedError, "listen() needs threading to work" + if not threading: + raise NotImplementedError("listen() needs threading to work") class ConfigStreamHandler(StreamRequestHandler): - """ - Handler for a logging configuration request. + """Handler for a logging configuration request. It expects a completely new logging configuration and uses fileConfig to install it. """ + def handle(self): - """ - Handle a request. + """Handle a request. Each request is expected to be a 4-byte length, followed by the config file. Uses fileConfig() to do the grunt work. """ import tempfile + try: conn = self.connection chunk = conn.recv(4) @@ -228,19 +232,19 @@ def handle(self): chunk = self.connection.recv(slen) while len(chunk) < slen: chunk = chunk + conn.recv(slen - len(chunk)) - #Apply new configuration. We'd like to be able to - #create a StringIO and pass that in, but unfortunately - #1.5.2 ConfigParser does not support reading file - #objects, only actual files. So we create a temporary - #file and remove it later. + # Apply new configuration. We'd like to be able to + # create a StringIO and pass that in, but unfortunately + # 1.5.2 ConfigParser does not support reading file + # objects, only actual files. So we create a temporary + # file and remove it later. file = tempfile.mktemp(".ini") f = open(file, "w") f.write(chunk) f.close() fileConfig(file) os.remove(file) - except socket.error, e: - if type(e.args) != types.TupleType: + except OSError as e: + if type(e.args) != tuple: raise else: errcode = e.args[0] @@ -248,14 +252,11 @@ def handle(self): raise class ConfigSocketReceiver(ThreadingTCPServer): - """ - A simple TCP socket-based logging config receiver. - """ + """A simple TCP socket-based logging config receiver.""" allow_reuse_address = 1 - def __init__(self, host='localhost', port=DEFAULT_LOGGING_CONFIG_PORT, - handler=None): + def __init__(self, host="localhost", port=DEFAULT_LOGGING_CONFIG_PORT, handler=None): ThreadingTCPServer.__init__(self, (host, port), handler) logging._acquireLock() self.abort = 0 @@ -264,11 +265,10 @@ def __init__(self, host='localhost', port=DEFAULT_LOGGING_CONFIG_PORT, def serve_until_stopped(self): import select + abort = 0 while not abort: - rd, wr, ex = select.select([self.socket.fileno()], - [], [], - self.timeout) + rd, wr, ex = select.select([self.socket.fileno()], [], [], self.timeout) if rd: self.handle_request() logging._acquireLock() @@ -283,14 +283,11 @@ def serve(rcvr, hdlr, port): logging._releaseLock() server.serve_until_stopped() - return threading.Thread(target=serve, - args=(ConfigSocketReceiver, - ConfigStreamHandler, port)) + return threading.Thread(target=serve, args=(ConfigSocketReceiver, ConfigStreamHandler, port)) + def stopListening(): - """ - Stop the listening server which was created with a call to listen(). 
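[Review note] Inside `ConfigStreamHandler.handle()` above, `chunk` arrives from the socket as bytes, but the temp file is opened in text mode, so `f.write(chunk)` raises TypeError on Python 3. Opening the file in binary mode fixes it, and `tempfile.mkstemp()` avoids the race inherent in `mktemp()`. A sketch of the handler's file dance, assuming `fileConfig` is in scope:

```python
import os
import tempfile


def _apply_config(chunk: bytes) -> None:
    """Hypothetical extraction of handle()'s tail."""
    fd, path = tempfile.mkstemp(".ini")
    try:
        with os.fdopen(fd, "wb") as f:  # binary mode: chunk is bytes
            f.write(chunk)
        fileConfig(path)
    finally:
        os.remove(path)
```

Also note that `if not threading:` in `listen()` is now always false (the module unconditionally imports on Python 3), so that NotImplementedError guard is dead but harmless.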
- """ + """Stop the listening server which was created with a call to listen().""" global _listener if _listener: logging._acquireLock() diff --git a/code/planet/compat_logging/handlers.py b/code/planet/compat_logging/handlers.py index 26ca8adc..2eafc0b2 100644 --- a/code/planet/compat_logging/handlers.py +++ b/code/planet/compat_logging/handlers.py @@ -14,8 +14,7 @@ # IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT # OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. -""" -Logging package for Python. Based on PEP 282 and comments thereto in +"""Logging package for Python. Based on PEP 282 and comments thereto in comp.lang.python, and influenced by Apache's log4j system. Should work under Python versions >= 1.5.2, except that source line @@ -26,25 +25,29 @@ To use, simply 'import logging' and log away! """ -import sys, logging, socket, types, os, string, cPickle, struct, time - -from SocketServer import ThreadingTCPServer, StreamRequestHandler +import logging +import os +import pickle +import socket +import string +import struct +import time +import types # # Some constants... # -DEFAULT_TCP_LOGGING_PORT = 9020 -DEFAULT_UDP_LOGGING_PORT = 9021 -DEFAULT_HTTP_LOGGING_PORT = 9022 -DEFAULT_SOAP_LOGGING_PORT = 9023 -SYSLOG_UDP_PORT = 514 +DEFAULT_TCP_LOGGING_PORT = 9020 +DEFAULT_UDP_LOGGING_PORT = 9021 +DEFAULT_HTTP_LOGGING_PORT = 9022 +DEFAULT_SOAP_LOGGING_PORT = 9023 +SYSLOG_UDP_PORT = 514 class RotatingFileHandler(logging.FileHandler): def __init__(self, filename, mode="a", maxBytes=0, backupCount=0): - """ - Open the specified file and use it as the stream for logging. + """Open the specified file and use it as the stream for logging. By default, the file grows indefinitely. You can specify particular values of maxBytes and backupCount to allow the file to rollover at @@ -70,17 +73,14 @@ def __init__(self, filename, mode="a", maxBytes=0, backupCount=0): self.mode = "a" def doRollover(self): - """ - Do a rollover, as described in __init__(). - """ - + """Do a rollover, as described in __init__().""" self.stream.close() if self.backupCount > 0: for i in range(self.backupCount - 1, 0, -1): sfn = "%s.%d" % (self.baseFilename, i) dfn = "%s.%d" % (self.baseFilename, i + 1) if os.path.exists(sfn): - #print "%s -> %s" % (sfn, dfn) + # print "%s -> %s" % (sfn, dfn) if os.path.exists(dfn): os.remove(dfn) os.rename(sfn, dfn) @@ -88,27 +88,25 @@ def doRollover(self): if os.path.exists(dfn): os.remove(dfn) os.rename(self.baseFilename, dfn) - #print "%s -> %s" % (self.baseFilename, dfn) + # print "%s -> %s" % (self.baseFilename, dfn) self.stream = open(self.baseFilename, "w") def emit(self, record): - """ - Emit a record. + """Emit a record. Output the record to the file, catering for rollover as described in doRollover(). """ - if self.maxBytes > 0: # are we rolling over? + if self.maxBytes > 0: # are we rolling over? msg = "%s\n" % self.format(record) - self.stream.seek(0, 2) #due to non-posix-compliant Windows feature + self.stream.seek(0, 2) # due to non-posix-compliant Windows feature if self.stream.tell() + len(msg) >= self.maxBytes: self.doRollover() logging.FileHandler.emit(self, record) class SocketHandler(logging.Handler): - """ - A handler class which writes logging records, in pickle format, to + """A handler class which writes logging records, in pickle format, to a streaming socket. The socket is kept open across logging calls. If the peer resets it, an attempt is made to reconnect on the next call. 
The pickle which is sent is that of the LogRecord's attribute dictionary @@ -120,8 +118,7 @@ class SocketHandler(logging.Handler): """ def __init__(self, host, port): - """ - Initializes the handler with a specific host address and port. + """Initializes the handler with a specific host address and port. The attribute 'closeOnError' is set to 1 - which means that if a socket error occurs, the socket is silently closed and then @@ -134,8 +131,7 @@ def __init__(self, host, port): self.closeOnError = 0 def makeSocket(self): - """ - A factory method which allows subclasses to define the precise + """A factory method which allows subclasses to define the precise type of socket they want. """ s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) @@ -143,8 +139,7 @@ def makeSocket(self): return s def send(self, s): - """ - Send a pickled string to the socket. + """Send a pickled string to the socket. This function allows for partial sends which can happen when the network is busy. @@ -160,19 +155,17 @@ def send(self, s): left = left - sent def makePickle(self, record): - """ - Pickles the record in binary format with a length prefix, and + """Pickles the record in binary format with a length prefix, and returns it ready for transmission across the socket. """ - s = cPickle.dumps(record.__dict__, 1) - #n = len(s) - #slen = "%c%c" % ((n >> 8) & 0xFF, n & 0xFF) + s = pickle.dumps(record.__dict__, 1) + # n = len(s) + # slen = "%c%c" % ((n >> 8) & 0xFF, n & 0xFF) slen = struct.pack(">L", len(s)) return slen + s def handleError(self, record): - """ - Handle an error during logging. + """Handle an error during logging. An error has occurred during logging. Most likely cause - connection lost. Close the socket so that we can retry on the @@ -180,13 +173,12 @@ def handleError(self, record): """ if self.closeOnError and self.sock: self.sock.close() - self.sock = None #try to reconnect next time + self.sock = None # try to reconnect next time else: logging.Handler.handleError(self, record) def emit(self, record): - """ - Emit a record. + """Emit a record. Pickles the record and writes it to the socket in binary format. If there is an error with the socket, silently drop the packet. @@ -202,16 +194,14 @@ def emit(self, record): self.handleError(record) def close(self): - """ - Closes the socket. - """ + """Closes the socket.""" if self.sock: self.sock.close() self.sock = None + class DatagramHandler(SocketHandler): - """ - A handler class which writes logging records, in pickle format, to + """A handler class which writes logging records, in pickle format, to a datagram socket. The pickle which is sent is that of the LogRecord's attribute dictionary (__dict__), so that the receiver does not need to have the logging module installed in order to process the logging event. @@ -220,24 +210,21 @@ class DatagramHandler(SocketHandler): makeLogRecord function. """ + def __init__(self, host, port): - """ - Initializes the handler with a specific host address and port. - """ + """Initializes the handler with a specific host address and port.""" SocketHandler.__init__(self, host, port) self.closeOnError = 0 def makeSocket(self): - """ - The factory method of SocketHandler is here overridden to create + """The factory method of SocketHandler is here overridden to create a UDP socket (SOCK_DGRAM). """ s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) return s def send(self, s): - """ - Send a pickled string to a socket. + """Send a pickled string to a socket. 
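[Review note] The `cPickle` → `pickle` switch above is sound: `makePickle()` frames each record as a 4-byte big-endian length followed by a protocol-1 pickle of `record.__dict__`, and both `pickle.dumps()` and `struct.pack()` already return the bytes the socket needs. For completeness, a receiving-side sketch that undoes the framing (the dict can then be rebuilt with `makeLogRecord()`):

```python
import pickle
import struct


def unframe(payload: bytes) -> dict:
    """Split one length-prefixed pickle off the wire. Trusted peers
    only: unpickling arbitrary data can execute code."""
    (slen,) = struct.unpack(">L", payload[:4])
    return pickle.loads(payload[4 : 4 + slen])
```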
This function no longer allows for partial sends which can happen when the network is busy - UDP does not guarantee delivery and @@ -245,9 +232,9 @@ def send(self, s): """ self.sock.sendto(s, (self.host, self.port)) + class SysLogHandler(logging.Handler): - """ - A handler class which sends formatted logging records to a syslog + """A handler class which sends formatted logging records to a syslog server. Based on Sam Rushing's syslog module: http://www.nightmare.com/squirl/python-ext/misc/syslog.py Contributed by Nicolas Untz (after which minor refactoring changes @@ -264,79 +251,78 @@ class SysLogHandler(logging.Handler): # # priorities (these are ordered) - LOG_EMERG = 0 # system is unusable - LOG_ALERT = 1 # action must be taken immediately - LOG_CRIT = 2 # critical conditions - LOG_ERR = 3 # error conditions - LOG_WARNING = 4 # warning conditions - LOG_NOTICE = 5 # normal but significant condition - LOG_INFO = 6 # informational - LOG_DEBUG = 7 # debug-level messages + LOG_EMERG = 0 # system is unusable + LOG_ALERT = 1 # action must be taken immediately + LOG_CRIT = 2 # critical conditions + LOG_ERR = 3 # error conditions + LOG_WARNING = 4 # warning conditions + LOG_NOTICE = 5 # normal but significant condition + LOG_INFO = 6 # informational + LOG_DEBUG = 7 # debug-level messages # facility codes - LOG_KERN = 0 # kernel messages - LOG_USER = 1 # random user-level messages - LOG_MAIL = 2 # mail system - LOG_DAEMON = 3 # system daemons - LOG_AUTH = 4 # security/authorization messages - LOG_SYSLOG = 5 # messages generated internally by syslogd - LOG_LPR = 6 # line printer subsystem - LOG_NEWS = 7 # network news subsystem - LOG_UUCP = 8 # UUCP subsystem - LOG_CRON = 9 # clock daemon - LOG_AUTHPRIV = 10 # security/authorization messages (private) + LOG_KERN = 0 # kernel messages + LOG_USER = 1 # random user-level messages + LOG_MAIL = 2 # mail system + LOG_DAEMON = 3 # system daemons + LOG_AUTH = 4 # security/authorization messages + LOG_SYSLOG = 5 # messages generated internally by syslogd + LOG_LPR = 6 # line printer subsystem + LOG_NEWS = 7 # network news subsystem + LOG_UUCP = 8 # UUCP subsystem + LOG_CRON = 9 # clock daemon + LOG_AUTHPRIV = 10 # security/authorization messages (private) # other codes through 15 reserved for system use - LOG_LOCAL0 = 16 # reserved for local use - LOG_LOCAL1 = 17 # reserved for local use - LOG_LOCAL2 = 18 # reserved for local use - LOG_LOCAL3 = 19 # reserved for local use - LOG_LOCAL4 = 20 # reserved for local use - LOG_LOCAL5 = 21 # reserved for local use - LOG_LOCAL6 = 22 # reserved for local use - LOG_LOCAL7 = 23 # reserved for local use + LOG_LOCAL0 = 16 # reserved for local use + LOG_LOCAL1 = 17 # reserved for local use + LOG_LOCAL2 = 18 # reserved for local use + LOG_LOCAL3 = 19 # reserved for local use + LOG_LOCAL4 = 20 # reserved for local use + LOG_LOCAL5 = 21 # reserved for local use + LOG_LOCAL6 = 22 # reserved for local use + LOG_LOCAL7 = 23 # reserved for local use priority_names = { - "alert": LOG_ALERT, - "crit": LOG_CRIT, + "alert": LOG_ALERT, + "crit": LOG_CRIT, "critical": LOG_CRIT, - "debug": LOG_DEBUG, - "emerg": LOG_EMERG, - "err": LOG_ERR, - "error": LOG_ERR, # DEPRECATED - "info": LOG_INFO, - "notice": LOG_NOTICE, - "panic": LOG_EMERG, # DEPRECATED - "warn": LOG_WARNING, # DEPRECATED - "warning": LOG_WARNING, - } + "debug": LOG_DEBUG, + "emerg": LOG_EMERG, + "err": LOG_ERR, + "error": LOG_ERR, # DEPRECATED + "info": LOG_INFO, + "notice": LOG_NOTICE, + "panic": LOG_EMERG, # DEPRECATED + "warn": LOG_WARNING, # DEPRECATED + "warning": 
LOG_WARNING, + } facility_names = { - "auth": LOG_AUTH, + "auth": LOG_AUTH, "authpriv": LOG_AUTHPRIV, - "cron": LOG_CRON, - "daemon": LOG_DAEMON, - "kern": LOG_KERN, - "lpr": LOG_LPR, - "mail": LOG_MAIL, - "news": LOG_NEWS, - "security": LOG_AUTH, # DEPRECATED - "syslog": LOG_SYSLOG, - "user": LOG_USER, - "uucp": LOG_UUCP, - "local0": LOG_LOCAL0, - "local1": LOG_LOCAL1, - "local2": LOG_LOCAL2, - "local3": LOG_LOCAL3, - "local4": LOG_LOCAL4, - "local5": LOG_LOCAL5, - "local6": LOG_LOCAL6, - "local7": LOG_LOCAL7, - } - - def __init__(self, address=('localhost', SYSLOG_UDP_PORT), facility=LOG_USER): - """ - Initialize a handler. + "cron": LOG_CRON, + "daemon": LOG_DAEMON, + "kern": LOG_KERN, + "lpr": LOG_LPR, + "mail": LOG_MAIL, + "news": LOG_NEWS, + "security": LOG_AUTH, # DEPRECATED + "syslog": LOG_SYSLOG, + "user": LOG_USER, + "uucp": LOG_UUCP, + "local0": LOG_LOCAL0, + "local1": LOG_LOCAL1, + "local2": LOG_LOCAL2, + "local3": LOG_LOCAL3, + "local4": LOG_LOCAL4, + "local5": LOG_LOCAL5, + "local6": LOG_LOCAL6, + "local7": LOG_LOCAL7, + } + + def __init__(self, address=("localhost", SYSLOG_UDP_PORT), facility=LOG_USER): + """Initialize a handler. If address is specified as a string, UNIX socket is used. If facility is not specified, LOG_USER is used. @@ -350,7 +336,7 @@ def __init__(self, address=('localhost', SYSLOG_UDP_PORT), facility=LOG_USER): # syslog may require either DGRAM or STREAM sockets try: self.socket.connect(address) - except socket.error: + except OSError: self.socket.close() self.socket = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) self.socket.connect(address) @@ -365,11 +351,10 @@ def __init__(self, address=('localhost', SYSLOG_UDP_PORT), facility=LOG_USER): # zero-terminator seems to be required. this string is placed # into a class variable so that it can be overridden if # necessary. - log_format_string = '<%d>%s\000' + log_format_string = "<%d>%s\000" - def encodePriority (self, facility, priority): - """ - Encode the facility and priority. You can pass in strings or + def encodePriority(self, facility, priority): + """Encode the facility and priority. You can pass in strings or integers - if strings are passed, the facility_names and priority_names mapping dictionaries are used to convert them to integers. @@ -380,16 +365,13 @@ def encodePriority (self, facility, priority): priority = self.priority_names[priority] return (facility << 3) | priority - def close (self): - """ - Closes the socket. - """ + def close(self): + """Closes the socket.""" if self.unixsocket: self.socket.close() def emit(self, record): - """ - Emit a record. + """Emit a record. The record is formatted, and then sent to the syslog server. If exception information is present, it is NOT sent to the server. @@ -399,10 +381,7 @@ def emit(self, record): We need to convert record level to lowercase, maybe this will change in the future. """ - msg = self.log_format_string % ( - self.encodePriority(self.facility, - string.lower(record.levelname)), - msg) + msg = self.log_format_string % (self.encodePriority(self.facility, string.lower(record.levelname)), msg) try: if self.unixsocket: self.socket.send(msg) @@ -411,13 +390,12 @@ def emit(self, record): except: self.handleError(record) + class SMTPHandler(logging.Handler): - """ - A handler class which sends an SMTP email for each logging event. - """ + """A handler class which sends an SMTP email for each logging event.""" + def __init__(self, mailhost, fromaddr, toaddrs, subject): - """ - Initialize the handler. + """Initialize the handler. 
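[Review note] `SysLogHandler.emit()` above still calls `string.lower()`, and on Python 3 the socket send also needs bytes rather than the str that `log_format_string % ...` produces. A sketch with both fixed, assuming the handler keeps `self.address` for the UDP case as the CPython original does:

```python
def emit(self, record):
    """Hedged sketch: str methods plus an explicit UTF-8 encode."""
    msg = self.format(record)
    pri = self.encodePriority(self.facility, record.levelname.lower())
    data = (self.log_format_string % (pri, msg)).encode("utf-8")
    try:
        if self.unixsocket:
            self.socket.send(data)
        else:
            self.socket.sendto(data, self.address)
    except Exception:
        self.handleError(record)
```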
Initialize the instance with the from and to addresses and subject line of the email. To specify a non-standard SMTP port, use the @@ -438,55 +416,51 @@ def __init__(self, mailhost, fromaddr, toaddrs, subject): self.subject = subject def getSubject(self, record): - """ - Determine the subject for the email. + """Determine the subject for the email. If you want to specify a subject line which is record-dependent, override this method. """ return self.subject - weekdayname = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'] + weekdayname = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"] - monthname = [None, - 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', - 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] + monthname = [None, "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"] def date_time(self): """Return the current date and time formatted for a MIME header.""" year, month, day, hh, mm, ss, wd, y, z = time.gmtime(time.time()) - s = "%s, %02d %3s %4d %02d:%02d:%02d GMT" % ( - self.weekdayname[wd], - day, self.monthname[month], year, - hh, mm, ss) + s = "%s, %02d %3s %4d %02d:%02d:%02d GMT" % (self.weekdayname[wd], day, self.monthname[month], year, hh, mm, ss) return s def emit(self, record): - """ - Emit a record. + """Emit a record. Format the record and send it to the specified addressees. """ try: import smtplib + port = self.mailport if not port: port = smtplib.SMTP_PORT smtp = smtplib.SMTP(self.mailhost, port) msg = self.format(record) - msg = "From: %s\r\nTo: %s\r\nSubject: %s\r\nDate: %s\r\n\r\n%s" % ( - self.fromaddr, - string.join(self.toaddrs, ","), - self.getSubject(record), - self.date_time(), msg) + msg = "From: {}\r\nTo: {}\r\nSubject: {}\r\nDate: {}\r\n\r\n{}".format( + self.fromaddr, + string.join(self.toaddrs, ","), + self.getSubject(record), + self.date_time(), + msg, + ) smtp.sendmail(self.fromaddr, self.toaddrs, msg) smtp.quit() except: self.handleError(record) + class NTEventLogHandler(logging.Handler): - """ - A handler class which sends events to the NT Event Log. Adds a + """A handler class which sends events to the NT Event Log. Adds a registry entry for the specified application name. If no dllname is provided, win32service.pyd (which contains some basic message placeholders) is used. Note that use of these placeholders will make @@ -494,35 +468,36 @@ class NTEventLogHandler(logging.Handler): If you want slimmer logs, you have to pass in the name of your own DLL which contains the message definitions you want to use in the event log. 
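[Review note] `SMTPHandler.emit()` above switched to `str.format()` but kept `string.join()`, which raises AttributeError on Python 3; `",".join(self.toaddrs)` is the replacement, and `email.utils.formatdate()` can stand in for the hand-rolled `date_time()` helper. A sketch of the header assembly; the function is illustrative:

```python
from email.utils import formatdate


def build_message(fromaddr, toaddrs, subject, body):
    """Hypothetical helper mirroring emit()'s message assembly."""
    return "From: {}\r\nTo: {}\r\nSubject: {}\r\nDate: {}\r\n\r\n{}".format(
        fromaddr, ",".join(toaddrs), subject, formatdate(), body
    )
```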
""" + def __init__(self, appname, dllname=None, logtype="Application"): logging.Handler.__init__(self) try: - import win32evtlogutil, win32evtlog + import win32evtlog + import win32evtlogutil + self.appname = appname self._welu = win32evtlogutil if not dllname: dllname = os.path.split(self._welu.__file__) dllname = os.path.split(dllname[0]) - dllname = os.path.join(dllname[0], r'win32service.pyd') + dllname = os.path.join(dllname[0], r"win32service.pyd") self.dllname = dllname self.logtype = logtype self._welu.AddSourceToRegistry(appname, dllname, logtype) self.deftype = win32evtlog.EVENTLOG_ERROR_TYPE self.typemap = { - logging.DEBUG : win32evtlog.EVENTLOG_INFORMATION_TYPE, - logging.INFO : win32evtlog.EVENTLOG_INFORMATION_TYPE, - logging.WARNING : win32evtlog.EVENTLOG_WARNING_TYPE, - logging.ERROR : win32evtlog.EVENTLOG_ERROR_TYPE, + logging.DEBUG: win32evtlog.EVENTLOG_INFORMATION_TYPE, + logging.INFO: win32evtlog.EVENTLOG_INFORMATION_TYPE, + logging.WARNING: win32evtlog.EVENTLOG_WARNING_TYPE, + logging.ERROR: win32evtlog.EVENTLOG_ERROR_TYPE, logging.CRITICAL: win32evtlog.EVENTLOG_ERROR_TYPE, - } + } except ImportError: - print "The Python Win32 extensions for NT (service, event "\ - "logging) appear not to be available." + print("The Python Win32 extensions for NT (service, event logging) appear not to be available.") self._welu = None def getMessageID(self, record): - """ - Return the message ID for the event record. If you are using your + """Return the message ID for the event record. If you are using your own messages, you could do this by having the msg passed to the logger being an ID rather than a formatting string. Then, in here, you could use a dictionary lookup to get the message ID. This @@ -531,8 +506,7 @@ def getMessageID(self, record): return 1 def getEventCategory(self, record): - """ - Return the event category for the record. + """Return the event category for the record. Override this if you want to specify your own categories. This version returns 0. @@ -540,8 +514,7 @@ def getEventCategory(self, record): return 0 def getEventType(self, record): - """ - Return the event type for the record. + """Return the event type for the record. Override this if you want to specify your own types. This version does a mapping using the handler's typemap attribute, which is set up in @@ -553,8 +526,7 @@ def getEventType(self, record): return self.typemap.get(record.levelno, self.deftype) def emit(self, record): - """ - Emit a record. + """Emit a record. Determine the message ID, event category and event type. Then log the message in the NT event log. @@ -570,8 +542,7 @@ def emit(self, record): self.handleError(record) def close(self): - """ - Clean up this handler. + """Clean up this handler. You can remove the application name from the registry as a source of event log entries. However, if you do this, you will @@ -579,51 +550,51 @@ def close(self): Viewer - it needs to be able to access the registry to get the DLL name. """ - #self._welu.RemoveSourceFromRegistry(self.appname, self.logtype) - pass + # self._welu.RemoveSourceFromRegistry(self.appname, self.logtype) + class HTTPHandler(logging.Handler): - """ - A class which sends records to a Web server, using either GET or + """A class which sends records to a Web server, using either GET or POST semantics. 
""" + def __init__(self, host, url, method="GET"): - """ - Initialize the instance with the host, the request URL, and the method + """Initialize the instance with the host, the request URL, and the method ("GET" or "POST") """ logging.Handler.__init__(self) method = string.upper(method) if method not in ["GET", "POST"]: - raise ValueError, "method must be GET or POST" + raise ValueError("method must be GET or POST") self.host = host self.url = url self.method = method def mapLogRecord(self, record): - """ - Default implementation of mapping the log record into a dict + """Default implementation of mapping the log record into a dict that is send as the CGI data. Overwrite in your class. Contributed by Franz Glasner. """ return record.__dict__ def emit(self, record): - """ - Emit a record. + """Emit a record. Send the record to the Web server as an URL-encoded dictionary """ try: - import httplib, urllib + import urllib + + import httplib + h = httplib.HTTP(self.host) url = self.url data = urllib.urlencode(self.mapLogRecord(record)) if self.method == "GET": - if (string.find(url, '?') >= 0): - sep = '&' + if string.find(url, "?") >= 0: + sep = "&" else: - sep = '?' + sep = "?" url = url + "%c%s" % (sep, data) h.putrequest(self.method, url) if self.method == "POST": @@ -631,36 +602,33 @@ def emit(self, record): h.endheaders() if self.method == "POST": h.send(data) - h.getreply() #can't do anything with the result + h.getreply() # can't do anything with the result except: self.handleError(record) + class BufferingHandler(logging.Handler): + """A handler class which buffers logging records in memory. Whenever each + record is added to the buffer, a check is made to see if the buffer should + be flushed. If it should, then flush() is expected to do what's needed. """ - A handler class which buffers logging records in memory. Whenever each - record is added to the buffer, a check is made to see if the buffer should - be flushed. If it should, then flush() is expected to do what's needed. - """ + def __init__(self, capacity): - """ - Initialize the handler with the buffer size. - """ + """Initialize the handler with the buffer size.""" logging.Handler.__init__(self) self.capacity = capacity self.buffer = [] def shouldFlush(self, record): - """ - Should the handler flush its buffer? + """Should the handler flush its buffer? Returns true if the buffer is up to capacity. This method can be overridden to implement custom flushing strategies. """ - return (len(self.buffer) >= self.capacity) + return len(self.buffer) >= self.capacity def emit(self, record): - """ - Emit a record. + """Emit a record. Append the record. If shouldFlush() tells us to, call flush() to process the buffer. @@ -670,22 +638,21 @@ def emit(self, record): self.flush() def flush(self): - """ - Override to implement custom flushing behaviour. + """Override to implement custom flushing behaviour. This version just zaps the buffer to empty. """ self.buffer = [] + class MemoryHandler(BufferingHandler): - """ - A handler class which buffers logging records in memory, periodically + """A handler class which buffers logging records in memory, periodically flushing them to a target handler. Flushing occurs whenever the buffer is full, or when an event of a certain severity or greater is seen. 
""" + def __init__(self, capacity, flushLevel=logging.ERROR, target=None): - """ - Initialize the handler with the buffer size, the level at which + """Initialize the handler with the buffer size, the level at which flushing should occur and an optional target. Note that without a target being set either here or via setTarget(), @@ -696,21 +663,15 @@ def __init__(self, capacity, flushLevel=logging.ERROR, target=None): self.target = target def shouldFlush(self, record): - """ - Check for buffer full or a record at the flushLevel or higher. - """ - return (len(self.buffer) >= self.capacity) or \ - (record.levelno >= self.flushLevel) + """Check for buffer full or a record at the flushLevel or higher.""" + return (len(self.buffer) >= self.capacity) or (record.levelno >= self.flushLevel) def setTarget(self, target): - """ - Set the target handler for this handler. - """ + """Set the target handler for this handler.""" self.target = target def flush(self): - """ - For a MemoryHandler, flushing means just sending the buffered + """For a MemoryHandler, flushing means just sending the buffered records to the target, if there is one. Override if you want different behaviour. """ @@ -720,9 +681,7 @@ def flush(self): self.buffer = [] def close(self): - """ - Flush, set the target to None and lose the buffer. - """ + """Flush, set the target to None and lose the buffer.""" self.flush() self.target = None self.buffer = [] diff --git a/code/planet/feedparser.py b/code/planet/feedparser.py index cd7ac83d..21f49bb5 100644 --- a/code/planet/feedparser.py +++ b/code/planet/feedparser.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 """Universal feed parser Handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom 0.3, and Atom 1.0 feeds @@ -9,9 +9,11 @@ Required: Python 2.1 or later Recommended: Python 2.3 or later Recommended: CJKCodecs and iconv_codec + +TODO: py2->3 conversion """ -__version__ = "4.1"# + "$Revision: 1.92 $"[11:15] + "-cvs" +__version__ = "4.1" # + "$Revision: 1.92 $"[11:15] + "-cvs" __license__ = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved. Redistribution and use in source and binary forms, with or without modification, @@ -35,11 +37,20 @@ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.""" __author__ = "Mark Pilgrim " -__contributors__ = ["Jason Diamond ", - "John Beimler ", - "Fazal Majid ", - "Aaron Swartz ", - "Kevin Marks "] +__contributors__ = [ + "Jason Diamond ", + "John Beimler ", + "Fazal Majid ", + "Aaron Swartz ", + "Kevin Marks ", +] + +import calendar +import datetime +import email.utils +import html.entities +from html.parser import HTMLParser + _debug = 0 # HTTP "User-Agent" header to send to servers when downloading feeds. @@ -66,11 +77,18 @@ PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"] # ---------- required modules (should come with any Python distribution) ---------- -import sgmllib, re, sys, copy, urlparse, time, rfc822, types, cgi, urllib, urllib2 -try: - from cStringIO import StringIO as _StringIO -except: - from StringIO import StringIO as _StringIO +import cgi +import copy +import re +import sys +import time +import types +import urllib.parse +import urllib.request +import urllib.response + +from io import StringIO as _StringIO +from io import BytesIO # ---------- optional modules (feedparser will work without these, but with reduced functionality) ---------- @@ -90,33 +108,38 @@ # versions of FreeBSD), feedparser will quietly fall back on regex-based parsing. 
try: import xml.sax - xml.sax.make_parser(PREFERRED_XML_PARSERS) # test for valid parsers + + xml.sax.make_parser(PREFERRED_XML_PARSERS) # test for valid parsers from xml.sax.saxutils import escape as _xmlescape + _XML_AVAILABLE = 1 except: _XML_AVAILABLE = 0 - def _xmlescape(data,entities={}): - data = data.replace('&', '&') - data = data.replace('>', '>') - data = data.replace('<', '<') + + + def _xmlescape(data, entities={}): + data = data.replace("&", "&") + data = data.replace(">", ">") + data = data.replace("<", "<") for char, entity in entities: data = data.replace(char, entity) return data # base64 support for Atom feeds that contain embedded binary data try: - import base64, binascii + import base64 + import binascii except: base64 = binascii = None # cjkcodecs and iconv_codec provide support for more character encodings. # Both are available from http://cjkpython.i18n.org/ try: - import cjkcodecs.aliases + pass except: pass try: - import iconv_codec + pass except: pass @@ -124,80 +147,97 @@ def _xmlescape(data,entities={}): # Download from http://chardet.feedparser.org/ try: import chardet + if _debug: import chardet.constants + chardet.constants._debug = 1 except: chardet = None + # ---------- don't touch these ---------- -class ThingsNobodyCaresAboutButMe(Exception): pass -class CharacterEncodingOverride(ThingsNobodyCaresAboutButMe): pass -class CharacterEncodingUnknown(ThingsNobodyCaresAboutButMe): pass -class NonXMLContentType(ThingsNobodyCaresAboutButMe): pass -class UndeclaredNamespace(Exception): pass - -sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*') -sgmllib.special = re.compile('' % (tag, self.strattrs(attrs)), escape=0) + tag = tag.split(":")[-1] + return self.handle_data(f"<{tag}{self.strattrs(attrs)}>", escape=0) # match namespaces - if tag.find(':') <> -1: - prefix, suffix = tag.split(':', 1) + if tag.find(":") != -1: + prefix, suffix = tag.split(":", 1) else: - prefix, suffix = '', tag + prefix, suffix = "", tag prefix = self.namespacemap.get(prefix, prefix) if prefix: - prefix = prefix + '_' + prefix = prefix + "_" # special hack for better tracking of empty textinput/image elements in illformed feeds - if (not prefix) and tag not in ('title', 'link', 'description', 'name'): + if (not prefix) and tag not in ("title", "link", "description", "name"): self.intextinput = 0 - if (not prefix) and tag not in ('title', 'link', 'description', 'url', 'href', 'width', 'height'): + if (not prefix) and tag not in ("title", "link", "description", "url", "href", "width", "height"): self.inimage = 0 - + # call special handler (if defined) or default handler - methodname = '_start_' + prefix + suffix + methodname = "_start_" + prefix + suffix try: method = getattr(self, methodname) return method(attrsD) @@ -490,18 +815,19 @@ def unknown_starttag(self, tag, attrs): return self.push(prefix + suffix, 1) def unknown_endtag(self, tag): - if _debug: sys.stderr.write('end %s\n' % tag) + if _debug: + sys.stderr.write("end %s\n" % tag) # match namespaces - if tag.find(':') <> -1: - prefix, suffix = tag.split(':', 1) + if tag.find(":") != -1: + prefix, suffix = tag.split(":", 1) else: - prefix, suffix = '', tag + prefix, suffix = "", tag prefix = self.namespacemap.get(prefix, prefix) if prefix: - prefix = prefix + '_' + prefix = prefix + "_" # call special handler (if defined) or default handler - methodname = '_end_' + prefix + suffix + methodname = "_end_" + prefix + suffix try: method = getattr(self, methodname) method() @@ -509,12 +835,16 @@ def unknown_endtag(self, tag): 
self.pop(prefix + suffix) # track inline content - if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'): + if ( + self.incontent + and type in self.contentparams + and not self.contentparams.get("type", "xml").endswith("xml") + ): # element declared itself as escaped markup, but it isn't really - self.contentparams['type'] = 'application/xhtml+xml' - if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml': - tag = tag.split(':')[-1] - self.handle_data('' % tag, escape=0) + self.contentparams["type"] = "application/xhtml+xml" + if self.incontent and self.contentparams.get("type") == "application/xhtml+xml": + tag = tag.split(":")[-1] + self.handle_data("" % tag, escape=0) # track xml:base and xml:lang going out of scope if self.basestack: @@ -523,49 +853,58 @@ def unknown_endtag(self, tag): self.baseuri = self.basestack[-1] if self.langstack: self.langstack.pop() - if self.langstack: # and (self.langstack[-1] is not None): + if self.langstack: # and (self.langstack[-1] is not None): self.lang = self.langstack[-1] def handle_charref(self, ref): # called for each character reference, e.g. for ' ', ref will be '160' - if not self.elementstack: return + if not self.elementstack: + return ref = ref.lower() - if ref in ('34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e'): - text = '&#%s;' % ref + if ref in ("34", "38", "39", "60", "62", "x22", "x26", "x27", "x3c", "x3e"): + text = "&#%s;" % ref else: - if ref[0] == 'x': + if ref[0] == "x": c = int(ref[1:], 16) else: c = int(ref) - text = unichr(c).encode('utf-8') + text = chr(c).encode("utf-8") self.elementstack[-1][2].append(text) def handle_entityref(self, ref): # called for each entity reference, e.g. for '©', ref will be 'copy' - if not self.elementstack: return - if _debug: sys.stderr.write('entering handle_entityref with %s\n' % ref) - if ref in ('lt', 'gt', 'quot', 'amp', 'apos'): - text = '&%s;' % ref + if not self.elementstack: + return + if _debug: + sys.stderr.write("entering handle_entityref with %s\n" % ref) + if ref in ("lt", "gt", "quot", "amp", "apos"): + text = "&%s;" % ref else: # entity resolution graciously donated by Aaron Swartz def name2cp(k): import htmlentitydefs - if hasattr(htmlentitydefs, 'name2codepoint'): # requires Python 2.3 + + if hasattr(htmlentitydefs, "name2codepoint"): # requires Python 2.3 return htmlentitydefs.name2codepoint[k] k = htmlentitydefs.entitydefs[k] - if k.startswith('&#') and k.endswith(';'): - return int(k[2:-1]) # not in latin-1 + if k.startswith("&#") and k.endswith(";"): + return int(k[2:-1]) # not in latin-1 return ord(k) - try: name2cp(ref) - except KeyError: text = '&%s;' % ref - else: text = unichr(name2cp(ref)).encode('utf-8') + + try: + name2cp(ref) + except KeyError: + text = "&%s;" % ref + else: + text = chr(name2cp(ref)).encode("utf-8") self.elementstack[-1][2].append(text) def handle_data(self, text, escape=1): # called for each block of plain text, i.e. 
outside of any tag and # not containing any character or entity references - if not self.elementstack: return - if escape and self.contentparams.get('type') == 'application/xhtml+xml': + if not self.elementstack: + return + if escape and self.contentparams.get("type") == "application/xhtml+xml": text = _xmlescape(text) self.elementstack[-1][2].append(text) @@ -582,181 +921,190 @@ def handle_decl(self, text): def parse_declaration(self, i): # override internal declaration handler to handle CDATA blocks - if _debug: sys.stderr.write('entering parse_declaration\n') - if self.rawdata[i:i+9] == '', i) - if k == -1: k = len(self.rawdata) - self.handle_data(_xmlescape(self.rawdata[i+9:k]), 0) - return k+3 + if _debug: + sys.stderr.write("entering parse_declaration\n") + if self.rawdata[i: i + 9] == "", i) + if k == -1: + k = len(self.rawdata) + self.handle_data(_xmlescape(self.rawdata[i + 9: k]), 0) + return k + 3 else: - k = self.rawdata.find('>', i) - return k+1 + k = self.rawdata.find(">", i) + return k + 1 def mapContentType(self, contentType): contentType = contentType.lower() - if contentType == 'text': - contentType = 'text/plain' - elif contentType == 'html': - contentType = 'text/html' - elif contentType == 'xhtml': - contentType = 'application/xhtml+xml' + if contentType == "text": + contentType = "text/plain" + elif contentType == "html": + contentType = "text/html" + elif contentType == "xhtml": + contentType = "application/xhtml+xml" return contentType - + def trackNamespace(self, prefix, uri): loweruri = uri.lower() - if (prefix, loweruri) == (None, 'http://my.netscape.com/rdf/simple/0.9/') and not self.version: - self.version = 'rss090' - if loweruri == 'http://purl.org/rss/1.0/' and not self.version: - self.version = 'rss10' - if loweruri == 'http://www.w3.org/2005/atom' and not self.version: - self.version = 'atom10' - if loweruri.find('backend.userland.com/rss') <> -1: + if (prefix, loweruri) == (None, "http://my.netscape.com/rdf/simple/0.9/") and not self.version: + self.version = "rss090" + if loweruri == "http://purl.org/rss/1.0/" and not self.version: + self.version = "rss10" + if loweruri == "http://www.w3.org/2005/atom" and not self.version: + self.version = "atom10" + if loweruri.find("backend.userland.com/rss") != -1: # match any backend.userland.com namespace - uri = 'http://backend.userland.com/rss' + uri = "http://backend.userland.com/rss" loweruri = uri - if self._matchnamespaces.has_key(loweruri): + if loweruri in self._matchnamespaces: self.namespacemap[prefix] = self._matchnamespaces[loweruri] self.namespacesInUse[self._matchnamespaces[loweruri]] = uri else: - self.namespacesInUse[prefix or ''] = uri + self.namespacesInUse[prefix or ""] = uri def resolveURI(self, uri): - return _urljoin(self.baseuri or '', uri) - + return _urljoin(self.baseuri or "", uri) + def decodeEntities(self, element, data): return data def strattrs(self, attrs): - return ''.join([' %s="%s"' % (t[0],_xmlescape(t[1],{'"':'"'})) for t in attrs]) + return "".join([' {}="{}"'.format(t[0], _xmlescape(t[1], {'"': """})) for t in attrs]) def push(self, element, expectingText): self.elementstack.append([element, expectingText, []]) def pop(self, element, stripWhitespace=1): - if not self.elementstack: return - if self.elementstack[-1][0] != element: return - + if not self.elementstack: + return None + if self.elementstack[-1][0] != element: + return None + element, expectingText, pieces = self.elementstack.pop() - if self.version == 'atom10' and self.contentparams.get('type','text') == 
'application/xhtml+xml':
+        if self.version == "atom10" and self.contentparams.get("type", "text") == "application/xhtml+xml":
             # remove enclosing child element, but only if it is a <div> and
             # only if all the remaining content is nested underneath it.
             # This means that the divs would be retained in the following:
             #    <div>foo</div><div>bar</div>
-            if pieces and (pieces[0] == '<div>' or pieces[0].startswith('<div ')) and pieces[-1]=='</div>':
+            if pieces and (pieces[0] == "<div>" or pieces[0].startswith("<div ")) and pieces[-1] == "</div>":
                 depth = 0
                 for piece in pieces[:-1]:
-                    if piece.startswith('</'):
-                        depth -= 1
-                        if depth == 0: break
-                    elif piece.startswith('<') and not piece.endswith('/>'):
+                    if piece.startswith("</"):
+                        depth -= 1
+                        if depth == 0:
+                            break
+                    elif piece.startswith("<") and not piece.endswith("/>"):
                         depth += 1
                 else:
                     pieces = pieces[1:-1]

-        output = ''.join(pieces)
+        output = "".join(pieces)
         if stripWhitespace:
             output = output.strip()
-        if not expectingText: return output
+        if not expectingText:
+            return output

         # decode base64 content
-        if base64 and self.contentparams.get('base64', 0):
+        if base64 and self.contentparams.get("base64", 0):
             try:
-                output = base64.decodestring(output)
+                # base64.decodestring was removed in Python 3; go via bytes
+                output = base64.decodebytes(output.encode("ascii")).decode("utf-8", "replace")
             except binascii.Error:
                 pass
             except binascii.Incomplete:
                 pass

         # resolve relative URIs
         if (element in self.can_be_relative_uri) and output:
             output = self.resolveURI(output)

         # decode entities within embedded markup
-        if not self.contentparams.get('base64', 0):
+        if not self.contentparams.get("base64", 0):
             output = self.decodeEntities(element, output)

         # remove temporary cruft from contentparams
         try:
-            del self.contentparams['mode']
+            del self.contentparams["mode"]
         except KeyError:
             pass
         try:
-            del self.contentparams['base64']
+            del self.contentparams["base64"]
         except KeyError:
             pass

         # resolve relative URIs within embedded markup
-        if self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types:
+        if self.mapContentType(self.contentparams.get("type", "text/html")) in self.html_types:
             if element in self.can_contain_relative_uris:
                 output = _resolveRelativeURIs(output, self.baseuri, self.encoding)

         # sanitize embedded markup
-        if self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types:
+        if self.mapContentType(self.contentparams.get("type", "text/html")) in self.html_types:
             if element in self.can_contain_dangerous_markup:
                 output = _sanitizeHTML(output, self.encoding)

-        if self.encoding and type(output) != type(u''):
+        if self.encoding and type(output) != str:
             try:
-                output = unicode(output, self.encoding)
+                output = str(output, self.encoding)
             except:
                 pass

         # address common error where people take data that is already
         # utf-8, presume that it is iso-8859-1, and re-encode it.
- if self.encoding=='utf-8' and type(output) == type(u''): + if self.encoding == "utf-8" and type(output) == str: try: - output = unicode(output.encode('iso-8859-1'), 'utf-8') + output = str(output.encode("iso-8859-1"), "utf-8") except: pass # map win-1252 extensions to the proper code points - if type(output) == type(u''): - output = u''.join([c in cp1252 and cp1252[c] or c for c in output]) + if type(output) == str: + output = "".join([c in cp1252 and cp1252[c] or c for c in output]) # categories/tags/keywords/whatever are handled in _end_category - if element == 'category': + if element == "category": return output - + # store output in appropriate place(s) if self.inentry and not self.insource: - if element == 'content': + if element == "content": self.entries[-1].setdefault(element, []) contentparams = copy.deepcopy(self.contentparams) - contentparams['value'] = output + contentparams["value"] = output self.entries[-1][element].append(contentparams) - elif element == 'link': + elif element == "link": self.entries[-1][element] = output if output: - self.entries[-1]['links'][-1]['href'] = output + self.entries[-1]["links"][-1]["href"] = output else: - if element == 'description': - element = 'summary' + if element == "description": + element = "summary" self.entries[-1][element] = output if self.incontent: contentparams = copy.deepcopy(self.contentparams) - contentparams['value'] = output - self.entries[-1][element + '_detail'] = contentparams + contentparams["value"] = output + self.entries[-1][element + "_detail"] = contentparams elif (self.infeed or self.insource) and (not self.intextinput) and (not self.inimage): context = self._getContext() - if element == 'description': - element = 'subtitle' + if element == "description": + element = "subtitle" context[element] = output - if element == 'link': - context['links'][-1]['href'] = output + if element == "link": + context["links"][-1]["href"] = output elif self.incontent: contentparams = copy.deepcopy(self.contentparams) - contentparams['value'] = output - context[element + '_detail'] = contentparams + contentparams["value"] = output + context[element + "_detail"] = contentparams return output def pushContent(self, tag, attrsD, defaultContentType, expectingText): self.incontent += 1 - self.contentparams = FeedParserDict({ - 'type': self.mapContentType(attrsD.get('type', defaultContentType)), - 'language': self.lang, - 'base': self.baseuri}) - self.contentparams['base64'] = self._isBase64(attrsD, self.contentparams) + self.contentparams = FeedParserDict( + { + "type": self.mapContentType(attrsD.get("type", defaultContentType)), + "language": self.lang, + "base": self.baseuri, + }, + ) + self.contentparams["base64"] = self._isBase64(attrsD, self.contentparams) self.push(tag, expectingText) def popContent(self, tag): @@ -764,132 +1112,133 @@ def popContent(self, tag): self.incontent -= 1 self.contentparams.clear() return value - + def _mapToStandardPrefix(self, name): - colonpos = name.find(':') - if colonpos <> -1: + colonpos = name.find(":") + if colonpos != -1: prefix = name[:colonpos] - suffix = name[colonpos+1:] + suffix = name[colonpos + 1:] prefix = self.namespacemap.get(prefix, prefix) - name = prefix + ':' + suffix + name = prefix + ":" + suffix return name - + def _getAttribute(self, attrsD, name): return attrsD.get(self._mapToStandardPrefix(name)) def _isBase64(self, attrsD, contentparams): - if attrsD.get('mode', '') == 'base64': + if attrsD.get("mode", "") == "base64": return 1 - if 
self.contentparams['type'].startswith('text/'): + if self.contentparams["type"].startswith("text/"): return 0 - if self.contentparams['type'].endswith('+xml'): + if self.contentparams["type"].endswith("+xml"): return 0 - if self.contentparams['type'].endswith('/xml'): + if self.contentparams["type"].endswith("/xml"): return 0 return 1 def _itsAnHrefDamnIt(self, attrsD): - href = attrsD.get('url', attrsD.get('uri', attrsD.get('href', None))) + href = attrsD.get("url", attrsD.get("uri", attrsD.get("href", None))) if href: try: - del attrsD['url'] + del attrsD["url"] except KeyError: pass try: - del attrsD['uri'] + del attrsD["uri"] except KeyError: pass - attrsD['href'] = href + attrsD["href"] = href return attrsD - + def _save(self, key, value): context = self._getContext() context.setdefault(key, value) def _start_rss(self, attrsD): - versionmap = {'0.91': 'rss091u', - '0.92': 'rss092', - '0.93': 'rss093', - '0.94': 'rss094'} + versionmap = {"0.91": "rss091u", "0.92": "rss092", "0.93": "rss093", "0.94": "rss094"} if not self.version: - attr_version = attrsD.get('version', '') + attr_version = attrsD.get("version", "") version = versionmap.get(attr_version) if version: self.version = version - elif attr_version.startswith('2.'): - self.version = 'rss20' + elif attr_version.startswith("2."): + self.version = "rss20" else: - self.version = 'rss' - + self.version = "rss" + def _start_dlhottitles(self, attrsD): - self.version = 'hotrss' + self.version = "hotrss" def _start_channel(self, attrsD): self.infeed = 1 self._cdf_common(attrsD) + _start_feedinfo = _start_channel def _cdf_common(self, attrsD): - if attrsD.has_key('lastmod'): + if "lastmod" in attrsD: self._start_modified({}) - self.elementstack[-1][-1] = attrsD['lastmod'] + self.elementstack[-1][-1] = attrsD["lastmod"] self._end_modified() - if attrsD.has_key('href'): + if "href" in attrsD: self._start_link({}) - self.elementstack[-1][-1] = attrsD['href'] + self.elementstack[-1][-1] = attrsD["href"] self._end_link() - + def _start_feed(self, attrsD): self.infeed = 1 - versionmap = {'0.1': 'atom01', - '0.2': 'atom02', - '0.3': 'atom03'} + versionmap = {"0.1": "atom01", "0.2": "atom02", "0.3": "atom03"} if not self.version: - attr_version = attrsD.get('version') + attr_version = attrsD.get("version") version = versionmap.get(attr_version) if version: self.version = version else: - self.version = 'atom' + self.version = "atom" def _end_channel(self): self.infeed = 0 + _end_feed = _end_channel - + def _start_image(self, attrsD): self.inimage = 1 - self.push('image', 0) + self.push("image", 0) context = self._getContext() - context.setdefault('image', FeedParserDict()) - + context.setdefault("image", FeedParserDict()) + def _end_image(self): - self.pop('image') + self.pop("image") self.inimage = 0 def _start_textinput(self, attrsD): self.intextinput = 1 - self.push('textinput', 0) + self.push("textinput", 0) context = self._getContext() - context.setdefault('textinput', FeedParserDict()) + context.setdefault("textinput", FeedParserDict()) + _start_textInput = _start_textinput - + def _end_textinput(self): - self.pop('textinput') + self.pop("textinput") self.intextinput = 0 + _end_textInput = _end_textinput def _start_author(self, attrsD): self.inauthor = 1 - self.push('author', 1) + self.push("author", 1) + _start_managingeditor = _start_author _start_dc_author = _start_author _start_dc_creator = _start_author _start_itunes_author = _start_author def _end_author(self): - self.pop('author') + self.pop("author") self.inauthor = 0 
self._sync_author_detail() + _end_managingeditor = _end_author _end_dc_author = _end_author _end_dc_creator = _end_author @@ -897,110 +1246,116 @@ def _end_author(self): def _start_itunes_owner(self, attrsD): self.inpublisher = 1 - self.push('publisher', 0) + self.push("publisher", 0) def _end_itunes_owner(self): - self.pop('publisher') + self.pop("publisher") self.inpublisher = 0 - self._sync_author_detail('publisher') + self._sync_author_detail("publisher") def _start_contributor(self, attrsD): self.incontributor = 1 context = self._getContext() - context.setdefault('contributors', []) - context['contributors'].append(FeedParserDict()) - self.push('contributor', 0) + context.setdefault("contributors", []) + context["contributors"].append(FeedParserDict()) + self.push("contributor", 0) def _end_contributor(self): - self.pop('contributor') + self.pop("contributor") self.incontributor = 0 def _start_dc_contributor(self, attrsD): self.incontributor = 1 context = self._getContext() - context.setdefault('contributors', []) - context['contributors'].append(FeedParserDict()) - self.push('name', 0) + context.setdefault("contributors", []) + context["contributors"].append(FeedParserDict()) + self.push("name", 0) def _end_dc_contributor(self): self._end_name() self.incontributor = 0 def _start_name(self, attrsD): - self.push('name', 0) + self.push("name", 0) + _start_itunes_name = _start_name def _end_name(self): - value = self.pop('name') + value = self.pop("name") if self.inpublisher: - self._save_author('name', value, 'publisher') + self._save_author("name", value, "publisher") elif self.inauthor: - self._save_author('name', value) + self._save_author("name", value) elif self.incontributor: - self._save_contributor('name', value) + self._save_contributor("name", value) elif self.intextinput: context = self._getContext() - context['textinput']['name'] = value + context["textinput"]["name"] = value + _end_itunes_name = _end_name def _start_width(self, attrsD): - self.push('width', 0) + self.push("width", 0) def _end_width(self): - value = self.pop('width') + value = self.pop("width") try: value = int(value) except: value = 0 if self.inimage: context = self._getContext() - context['image']['width'] = value + context["image"]["width"] = value def _start_height(self, attrsD): - self.push('height', 0) + self.push("height", 0) def _end_height(self): - value = self.pop('height') + value = self.pop("height") try: value = int(value) except: value = 0 if self.inimage: context = self._getContext() - context['image']['height'] = value + context["image"]["height"] = value def _start_url(self, attrsD): - self.push('href', 1) + self.push("href", 1) + _start_homepage = _start_url _start_uri = _start_url def _end_url(self): - value = self.pop('href') + value = self.pop("href") if self.inauthor: - self._save_author('href', value) + self._save_author("href", value) elif self.incontributor: - self._save_contributor('href', value) + self._save_contributor("href", value) elif self.inimage: context = self._getContext() - context['image']['href'] = value + context["image"]["href"] = value elif self.intextinput: context = self._getContext() - context['textinput']['link'] = value + context["textinput"]["link"] = value + _end_homepage = _end_url _end_uri = _end_url def _start_email(self, attrsD): - self.push('email', 0) + self.push("email", 0) + _start_itunes_email = _start_email def _end_email(self): - value = self.pop('email') + value = self.pop("email") if self.inpublisher: - self._save_author('email', value, 
'publisher') + self._save_author("email", value, "publisher") elif self.inauthor: - self._save_author('email', value) + self._save_author("email", value) elif self.incontributor: - self._save_contributor('email', value) + self._save_contributor("email", value) + _end_itunes_email = _end_email def _getContext(self): @@ -1012,381 +1367,421 @@ def _getContext(self): context = self.feeddata return context - def _save_author(self, key, value, prefix='author'): + def _save_author(self, key, value, prefix="author"): context = self._getContext() - context.setdefault(prefix + '_detail', FeedParserDict()) - context[prefix + '_detail'][key] = value + context.setdefault(prefix + "_detail", FeedParserDict()) + context[prefix + "_detail"][key] = value self._sync_author_detail() def _save_contributor(self, key, value): context = self._getContext() - context.setdefault('contributors', [FeedParserDict()]) - context['contributors'][-1][key] = value + context.setdefault("contributors", [FeedParserDict()]) + context["contributors"][-1][key] = value - def _sync_author_detail(self, key='author'): + def _sync_author_detail(self, key="author"): context = self._getContext() - detail = context.get('%s_detail' % key) + detail = context.get("%s_detail" % key) if detail: - name = detail.get('name') - email = detail.get('email') + name = detail.get("name") + email = detail.get("email") if name and email: - context[key] = '%s (%s)' % (name, email) + context[key] = f"{name} ({email})" elif name: context[key] = name elif email: context[key] = email else: author = context.get(key) - if not author: return - emailmatch = re.search(r'''(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))''', author) - if not emailmatch: return + if not author: + return + emailmatch = re.search( + r"""(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))""", + author, + ) + if not emailmatch: + return email = emailmatch.group(0) # probably a better way to do the following, but it passes all the tests - author = author.replace(email, '') - author = author.replace('()', '') + author = author.replace(email, "") + author = author.replace("()", "") author = author.strip() - if author and (author[0] == '('): + if author and (author[0] == "("): author = author[1:] - if author and (author[-1] == ')'): + if author and (author[-1] == ")"): author = author[:-1] author = author.strip() - context.setdefault('%s_detail' % key, FeedParserDict()) - context['%s_detail' % key]['name'] = author - context['%s_detail' % key]['email'] = email + context.setdefault("%s_detail" % key, FeedParserDict()) + context["%s_detail" % key]["name"] = author + context["%s_detail" % key]["email"] = email def _start_subtitle(self, attrsD): - self.pushContent('subtitle', attrsD, 'text/plain', 1) + self.pushContent("subtitle", attrsD, "text/plain", 1) + _start_tagline = _start_subtitle _start_itunes_subtitle = _start_subtitle def _end_subtitle(self): - self.popContent('subtitle') + self.popContent("subtitle") + _end_tagline = _end_subtitle _end_itunes_subtitle = _end_subtitle - + def _start_rights(self, attrsD): - self.pushContent('rights', attrsD, 'text/plain', 1) + self.pushContent("rights", attrsD, "text/plain", 1) + _start_dc_rights = _start_rights _start_copyright = _start_rights def _end_rights(self): - self.popContent('rights') + self.popContent("rights") + _end_dc_rights = _end_rights _end_copyright = _end_rights def _start_item(self, 
attrsD): self.entries.append(FeedParserDict()) - self.push('item', 0) + self.push("item", 0) self.inentry = 1 self.guidislink = 0 - id = self._getAttribute(attrsD, 'rdf:about') + id = self._getAttribute(attrsD, "rdf:about") if id: context = self._getContext() - context['id'] = id + context["id"] = id self._cdf_common(attrsD) + _start_entry = _start_item _start_product = _start_item def _end_item(self): - self.pop('item') + self.pop("item") self.inentry = 0 + _end_entry = _end_item def _start_dc_language(self, attrsD): - self.push('language', 1) + self.push("language", 1) + _start_language = _start_dc_language def _end_dc_language(self): - self.lang = self.pop('language') + self.lang = self.pop("language") + _end_language = _end_dc_language def _start_dc_publisher(self, attrsD): - self.push('publisher', 1) + self.push("publisher", 1) + _start_webmaster = _start_dc_publisher def _end_dc_publisher(self): - self.pop('publisher') - self._sync_author_detail('publisher') + self.pop("publisher") + self._sync_author_detail("publisher") + _end_webmaster = _end_dc_publisher def _start_published(self, attrsD): - self.push('published', 1) + self.push("published", 1) + _start_dcterms_issued = _start_published _start_issued = _start_published def _end_published(self): - value = self.pop('published') - self._save('published_parsed', _parse_date(value)) + value = self.pop("published") + self._save("published_parsed", _parse_date(value)) + _end_dcterms_issued = _end_published _end_issued = _end_published def _start_updated(self, attrsD): - self.push('updated', 1) + self.push("updated", 1) + _start_modified = _start_updated _start_dcterms_modified = _start_updated _start_pubdate = _start_updated _start_dc_date = _start_updated def _end_updated(self): - value = self.pop('updated') + value = self.pop("updated") parsed_value = _parse_date(value) - self._save('updated_parsed', parsed_value) + self._save("updated_parsed", parsed_value) + _end_modified = _end_updated _end_dcterms_modified = _end_updated _end_pubdate = _end_updated _end_dc_date = _end_updated def _start_created(self, attrsD): - self.push('created', 1) + self.push("created", 1) + _start_dcterms_created = _start_created def _end_created(self): - value = self.pop('created') - self._save('created_parsed', _parse_date(value)) + value = self.pop("created") + self._save("created_parsed", _parse_date(value)) + _end_dcterms_created = _end_created def _start_expirationdate(self, attrsD): - self.push('expired', 1) + self.push("expired", 1) def _end_expirationdate(self): - self._save('expired_parsed', _parse_date(self.pop('expired'))) + self._save("expired_parsed", _parse_date(self.pop("expired"))) def _start_cc_license(self, attrsD): - self.push('license', 1) - value = self._getAttribute(attrsD, 'rdf:resource') + self.push("license", 1) + value = self._getAttribute(attrsD, "rdf:resource") if value: self.elementstack[-1][2].append(value) - self.pop('license') - + self.pop("license") + def _start_creativecommons_license(self, attrsD): - self.push('license', 1) + self.push("license", 1) def _end_creativecommons_license(self): - self.pop('license') + self.pop("license") def _addTag(self, term, scheme, label): context = self._getContext() - tags = context.setdefault('tags', []) - if (not term) and (not scheme) and (not label): return - value = FeedParserDict({'term': term, 'scheme': scheme, 'label': label}) + tags = context.setdefault("tags", []) + if (not term) and (not scheme) and (not label): + return + value = FeedParserDict({"term": term, "scheme": scheme, 
"label": label}) if value not in tags: - tags.append(FeedParserDict({'term': term, 'scheme': scheme, 'label': label})) + tags.append(FeedParserDict({"term": term, "scheme": scheme, "label": label})) def _start_category(self, attrsD): - if _debug: sys.stderr.write('entering _start_category with %s\n' % repr(attrsD)) - term = attrsD.get('term') - scheme = attrsD.get('scheme', attrsD.get('domain')) - label = attrsD.get('label') + if _debug: + sys.stderr.write("entering _start_category with %s\n" % repr(attrsD)) + term = attrsD.get("term") + scheme = attrsD.get("scheme", attrsD.get("domain")) + label = attrsD.get("label") self._addTag(term, scheme, label) - self.push('category', 1) + self.push("category", 1) + _start_dc_subject = _start_category _start_keywords = _start_category - + def _end_itunes_keywords(self): - for term in self.pop('itunes_keywords').split(): - self._addTag(term, 'http://www.itunes.com/', None) - + for term in self.pop("itunes_keywords").split(): + self._addTag(term, "http://www.itunes.com/", None) + def _start_itunes_category(self, attrsD): - self._addTag(attrsD.get('text'), 'http://www.itunes.com/', None) - self.push('category', 1) - + self._addTag(attrsD.get("text"), "http://www.itunes.com/", None) + self.push("category", 1) + def _end_category(self): - value = self.pop('category') - if not value: return + value = self.pop("category") + if not value: + return context = self._getContext() - tags = context['tags'] - if value and len(tags) and not tags[-1]['term']: - tags[-1]['term'] = value + tags = context["tags"] + if value and len(tags) and not tags[-1]["term"]: + tags[-1]["term"] = value else: self._addTag(value, None, None) + _end_dc_subject = _end_category _end_keywords = _end_category _end_itunes_category = _end_category def _start_cloud(self, attrsD): - self._getContext()['cloud'] = FeedParserDict(attrsD) - + self._getContext()["cloud"] = FeedParserDict(attrsD) + def _start_link(self, attrsD): - attrsD.setdefault('rel', 'alternate') - attrsD.setdefault('type', 'text/html') + attrsD.setdefault("rel", "alternate") + attrsD.setdefault("type", "text/html") attrsD = self._itsAnHrefDamnIt(attrsD) - if attrsD.has_key('href'): - attrsD['href'] = self.resolveURI(attrsD['href']) + if "href" in attrsD: + attrsD["href"] = self.resolveURI(attrsD["href"]) expectingText = self.infeed or self.inentry or self.insource context = self._getContext() - context.setdefault('links', []) - context['links'].append(FeedParserDict(attrsD)) - if attrsD['rel'] == 'enclosure': + context.setdefault("links", []) + context["links"].append(FeedParserDict(attrsD)) + if attrsD["rel"] == "enclosure": self._start_enclosure(attrsD) - if attrsD.has_key('href'): + if "href" in attrsD: expectingText = 0 - if (attrsD.get('rel') == 'alternate') and (self.mapContentType(attrsD.get('type')) in self.html_types): - context['link'] = attrsD['href'] + if (attrsD.get("rel") == "alternate") and (self.mapContentType(attrsD.get("type")) in self.html_types): + context["link"] = attrsD["href"] else: - self.push('link', expectingText) + self.push("link", expectingText) + _start_producturl = _start_link def _end_link(self): - value = self.pop('link') + value = self.pop("link") context = self._getContext() if self.intextinput: - context['textinput']['link'] = value + context["textinput"]["link"] = value if self.inimage: - context['image']['link'] = value + context["image"]["link"] = value + _end_producturl = _end_link def _start_guid(self, attrsD): - self.guidislink = (attrsD.get('ispermalink', 'true') == 'true') - 
self.push('id', 1) + self.guidislink = attrsD.get("ispermalink", "true") == "true" + self.push("id", 1) def _end_guid(self): - value = self.pop('id') - self._save('guidislink', self.guidislink and not self._getContext().has_key('link')) + value = self.pop("id") + self._save("guidislink", self.guidislink and "link" not in self._getContext()) if self.guidislink: # guid acts as link, but only if 'ispermalink' is not present or is 'true', # and only if the item doesn't already have a link element - self._save('link', value) + self._save("link", value) def _start_title(self, attrsD): - self.pushContent('title', attrsD, 'text/plain', self.infeed or self.inentry or self.insource) + self.pushContent("title", attrsD, "text/plain", self.infeed or self.inentry or self.insource) + def _start_title_low_pri(self, attrsD): - if not self._getContext().has_key('title'): + if "title" not in self._getContext(): self._start_title(attrsD) + _start_dc_title = _start_title_low_pri _start_media_title = _start_title_low_pri def _end_title(self): - value = self.popContent('title') + value = self.popContent("title") context = self._getContext() if self.intextinput: - context['textinput']['title'] = value + context["textinput"]["title"] = value elif self.inimage: - context['image']['title'] = value + context["image"]["title"] = value + def _end_title_low_pri(self): - if not self._getContext().has_key('title'): + if "title" not in self._getContext(): self._end_title() + _end_dc_title = _end_title_low_pri _end_media_title = _end_title_low_pri def _start_description(self, attrsD): context = self._getContext() - if context.has_key('summary'): - self._summaryKey = 'content' + if "summary" in context: + self._summaryKey = "content" self._start_content(attrsD) else: - self.pushContent('description', attrsD, 'text/html', self.infeed or self.inentry or self.insource) + self.pushContent("description", attrsD, "text/html", self.infeed or self.inentry or self.insource) def _start_abstract(self, attrsD): - self.pushContent('description', attrsD, 'text/plain', self.infeed or self.inentry or self.insource) + self.pushContent("description", attrsD, "text/plain", self.infeed or self.inentry or self.insource) def _end_description(self): - if self._summaryKey == 'content': + if self._summaryKey == "content": self._end_content() else: - value = self.popContent('description') + value = self.popContent("description") context = self._getContext() if self.intextinput: - context['textinput']['description'] = value + context["textinput"]["description"] = value elif self.inimage: - context['image']['description'] = value + context["image"]["description"] = value self._summaryKey = None + _end_abstract = _end_description def _start_info(self, attrsD): - self.pushContent('info', attrsD, 'text/plain', 1) + self.pushContent("info", attrsD, "text/plain", 1) + _start_feedburner_browserfriendly = _start_info def _end_info(self): - self.popContent('info') + self.popContent("info") + _end_feedburner_browserfriendly = _end_info def _start_generator(self, attrsD): if attrsD: attrsD = self._itsAnHrefDamnIt(attrsD) - if attrsD.has_key('href'): - attrsD['href'] = self.resolveURI(attrsD['href']) - self._getContext()['generator_detail'] = FeedParserDict(attrsD) - self.push('generator', 1) + if "href" in attrsD: + attrsD["href"] = self.resolveURI(attrsD["href"]) + self._getContext()["generator_detail"] = FeedParserDict(attrsD) + self.push("generator", 1) def _end_generator(self): - value = self.pop('generator') + value = self.pop("generator") context = 
self._getContext() - if context.has_key('generator_detail'): - context['generator_detail']['name'] = value - + if "generator_detail" in context: + context["generator_detail"]["name"] = value + def _start_admin_generatoragent(self, attrsD): - self.push('generator', 1) - value = self._getAttribute(attrsD, 'rdf:resource') + self.push("generator", 1) + value = self._getAttribute(attrsD, "rdf:resource") if value: self.elementstack[-1][2].append(value) - self.pop('generator') - self._getContext()['generator_detail'] = FeedParserDict({'href': value}) + self.pop("generator") + self._getContext()["generator_detail"] = FeedParserDict({"href": value}) def _start_admin_errorreportsto(self, attrsD): - self.push('errorreportsto', 1) - value = self._getAttribute(attrsD, 'rdf:resource') + self.push("errorreportsto", 1) + value = self._getAttribute(attrsD, "rdf:resource") if value: self.elementstack[-1][2].append(value) - self.pop('errorreportsto') - + self.pop("errorreportsto") + def _start_summary(self, attrsD): context = self._getContext() - if context.has_key('summary'): - self._summaryKey = 'content' + if "summary" in context: + self._summaryKey = "content" self._start_content(attrsD) else: - self._summaryKey = 'summary' - self.pushContent(self._summaryKey, attrsD, 'text/plain', 1) + self._summaryKey = "summary" + self.pushContent(self._summaryKey, attrsD, "text/plain", 1) + _start_itunes_summary = _start_summary def _end_summary(self): - if self._summaryKey == 'content': + if self._summaryKey == "content": self._end_content() else: - self.popContent(self._summaryKey or 'summary') + self.popContent(self._summaryKey or "summary") self._summaryKey = None + _end_itunes_summary = _end_summary - + def _start_enclosure(self, attrsD): attrsD = self._itsAnHrefDamnIt(attrsD) - self._getContext().setdefault('enclosures', []).append(FeedParserDict(attrsD)) - href = attrsD.get('href') + self._getContext().setdefault("enclosures", []).append(FeedParserDict(attrsD)) + href = attrsD.get("href") if href: context = self._getContext() - if not context.get('id'): - context['id'] = href - + if not context.get("id"): + context["id"] = href + def _start_source(self, attrsD): self.insource = 1 def _end_source(self): self.insource = 0 - self._getContext()['source'] = copy.deepcopy(self.sourcedata) + self._getContext()["source"] = copy.deepcopy(self.sourcedata) self.sourcedata.clear() def _start_content(self, attrsD): - self.pushContent('content', attrsD, 'text/plain', 1) - src = attrsD.get('src') + self.pushContent("content", attrsD, "text/plain", 1) + src = attrsD.get("src") if src: - self.contentparams['src'] = src - self.push('content', 1) + self.contentparams["src"] = src + self.push("content", 1) def _start_prodlink(self, attrsD): - self.pushContent('content', attrsD, 'text/html', 1) + self.pushContent("content", attrsD, "text/html", 1) def _start_body(self, attrsD): - self.pushContent('content', attrsD, 'application/xhtml+xml', 1) + self.pushContent("content", attrsD, "application/xhtml+xml", 1) + _start_xhtml_body = _start_body def _start_content_encoded(self, attrsD): - self.pushContent('content', attrsD, 'text/html', 1) + self.pushContent("content", attrsD, "text/html", 1) + _start_fullitem = _start_content_encoded def _end_content(self): - copyToDescription = self.mapContentType(self.contentparams.get('type')) in (['text/plain'] + self.html_types) - value = self.popContent('content') + copyToDescription = self.mapContentType(self.contentparams.get("type")) in (["text/plain"] + self.html_types) + value = 
self.popContent("content") if copyToDescription: - self._save('description', value) + self._save("description", value) + _end_body = _end_content _end_xhtml_body = _end_content _end_content_encoded = _end_content @@ -1394,48 +1789,60 @@ def _end_content(self): _end_prodlink = _end_content def _start_itunes_image(self, attrsD): - self.push('itunes_image', 0) - self._getContext()['image'] = FeedParserDict({'href': attrsD.get('href')}) + self.push("itunes_image", 0) + self._getContext()["image"] = FeedParserDict({"href": attrsD.get("href")}) + _start_itunes_link = _start_itunes_image - + def _end_itunes_block(self): - value = self.pop('itunes_block', 0) - self._getContext()['itunes_block'] = (value == 'yes') and 1 or 0 + value = self.pop("itunes_block", 0) + self._getContext()["itunes_block"] = (value == "yes") and 1 or 0 def _end_itunes_explicit(self): - value = self.pop('itunes_explicit', 0) - self._getContext()['itunes_explicit'] = (value == 'yes') and 1 or 0 + value = self.pop("itunes_explicit", 0) + self._getContext()["itunes_explicit"] = (value == "yes") and 1 or 0 + if _XML_AVAILABLE: + class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler): def __init__(self, baseuri, baselang, encoding): - if _debug: sys.stderr.write('trying StrictFeedParser\n') + if _debug: + sys.stderr.write("trying StrictFeedParser\n") xml.sax.handler.ContentHandler.__init__(self) _FeedParserMixin.__init__(self, baseuri, baselang, encoding) self.bozo = 0 self.exc = None - + def startPrefixMapping(self, prefix, uri): self.trackNamespace(prefix, uri) - + def startElementNS(self, name, qname, attrs): namespace, localname = name - lowernamespace = str(namespace or '').lower() - if lowernamespace.find('backend.userland.com/rss') <> -1: + lowernamespace = str(namespace or "").lower() + if lowernamespace.find("backend.userland.com/rss") != -1: # match any backend.userland.com namespace - namespace = 'http://backend.userland.com/rss' + namespace = "http://backend.userland.com/rss" lowernamespace = namespace - if qname and qname.find(':') > 0: - givenprefix = qname.split(':')[0] + if qname and qname.find(":") > 0: + givenprefix = qname.split(":")[0] else: givenprefix = None prefix = self._matchnamespaces.get(lowernamespace, givenprefix) - if givenprefix and (prefix == None or (prefix == '' and lowernamespace == '')) and not self.namespacesInUse.has_key(givenprefix): - raise UndeclaredNamespace, "'%s' is not associated with a namespace" % givenprefix + if ( + givenprefix + and (prefix == None or (prefix == "" and lowernamespace == "")) + and givenprefix not in self.namespacesInUse + ): + raise UndeclaredNamespace("'%s' is not associated with a namespace" % givenprefix) if prefix: - localname = prefix + ':' + localname + localname = prefix + ":" + localname localname = str(localname).lower() - if _debug: sys.stderr.write('startElementNS: qname = %s, namespace = %s, givenprefix = %s, prefix = %s, attrs = %s, localname = %s\n' % (qname, namespace, givenprefix, prefix, attrs.items(), localname)) + if _debug: + sys.stderr.write( + "startElementNS: qname = %s, namespace = %s, givenprefix = %s, prefix = %s, attrs = %s, localname = %s\n" + % (qname, namespace, givenprefix, prefix, attrs.items(), localname), + ) # qname implementation is horribly broken in Python 2.1 (it # doesn't report any), and slightly broken in Python 2.2 (it @@ -1446,10 +1853,10 @@ def startElementNS(self, name, qname, attrs): # tirelessly telling me that it didn't work yet. 
attrsD = {} for (namespace, attrlocalname), attrvalue in attrs._attrs.items(): - lowernamespace = (namespace or '').lower() - prefix = self._matchnamespaces.get(lowernamespace, '') + lowernamespace = (namespace or "").lower() + prefix = self._matchnamespaces.get(lowernamespace, "") if prefix: - attrlocalname = prefix + ':' + attrlocalname + attrlocalname = prefix + ":" + attrlocalname attrsD[str(attrlocalname).lower()] = attrvalue for qname in attrs.getQNames(): attrsD[str(qname).lower()] = attrs.getValueByQName(qname) @@ -1460,78 +1867,102 @@ def characters(self, text): def endElementNS(self, name, qname): namespace, localname = name - lowernamespace = str(namespace or '').lower() - if qname and qname.find(':') > 0: - givenprefix = qname.split(':')[0] + lowernamespace = str(namespace or "").lower() + if qname and qname.find(":") > 0: + givenprefix = qname.split(":")[0] else: - givenprefix = '' + givenprefix = "" prefix = self._matchnamespaces.get(lowernamespace, givenprefix) if prefix: - localname = prefix + ':' + localname + localname = prefix + ":" + localname localname = str(localname).lower() self.unknown_endtag(localname) def error(self, exc): self.bozo = 1 self.exc = exc - + def fatalError(self, exc): self.error(exc) raise exc -class _BaseHTMLProcessor(sgmllib.SGMLParser): - elements_no_end_tag = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr', - 'img', 'input', 'isindex', 'link', 'meta', 'param'] - + +class _BaseHTMLProcessor(HTMLParser): + elements_no_end_tag = [ + "area", + "base", + "basefont", + "br", + "col", + "frame", + "hr", + "img", + "input", + "isindex", + "link", + "meta", + "param", + ] + + tagfind = re.compile("[a-zA-Z][-_.:a-zA-Z0-9]*") + charref = re.compile(r"&#(\d+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]") + special = re.compile("' + return "<" + tag + " />" else: - return '<' + tag + '>' - + return "<" + tag + ">" + + def convert_charrefs(self, *args, **kwargs): + return super().convert_charrefs(*args, **kwargs) + def feed(self, data): - data = re.compile(r'', self._shorttag_replace, data) # bug [ 1399464 ] Bad regexp for _shorttag_replace - data = re.sub(r'<([^<\s]+?)\s*/>', self._shorttag_replace, data) - data = data.replace(''', "'") - data = data.replace('"', '"') - if self.encoding and type(data) == type(u''): + if not isinstance(data, str): + data = data.decode(self.encoding) + data = re.compile(r"", self._shorttag_replace, data) + data = data.replace("'", "'") + data = data.replace(""", '"') + if self.encoding and isinstance(data, str): data = data.encode(self.encoding) - sgmllib.SGMLParser.feed(self, data) - sgmllib.SGMLParser.close(self) + super().feed(data.decode()) def normalize_attrs(self, attrs): # utility method to be called by descendants attrs = [(k.lower(), v) for k, v in attrs] - attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs] + attrs = [(k, k in ("rel", "type") and v.lower() or v) for k, v in attrs] return attrs def unknown_starttag(self, tag, attrs): # called for each start tag # attrs is a list of (attr, value) tuples # e.g. for
<pre class='screen'>, tag='pre', attrs=[('class', 'screen')]
-        if _debug: sys.stderr.write('_BaseHTMLProcessor, unknown_starttag, tag=%s\n' % tag)
+        if _debug:
+            sys.stderr.write("_BaseHTMLProcessor, unknown_starttag, tag=%s\n" % tag)
         uattrs = []
         # thanks to Kevin Marks for this breathtaking hack to deal with (valid) high-bit attribute values in UTF-8 feeds
         for key, value in attrs:
-            if type(value) != type(u''):
-                value = unicode(value, self.encoding)
-            uattrs.append((unicode(key, self.encoding), value))
-        strattrs = u''.join([u' %s="%s"' % (key, value) for key, value in uattrs]).encode(self.encoding)
+            # html.parser already delivers str; decode only if bytes slip through
+            if not isinstance(value, str):
+                value = value.decode(self.encoding)
+            if not isinstance(key, str):
+                key = key.decode(self.encoding)
+            uattrs.append((key, value))
+        strattrs = "".join([f' {key}="{value}"' for key, value in uattrs])  # keep str: output() joins text pieces
         if tag in self.elements_no_end_tag:
-            self.pieces.append('<%(tag)s%(strattrs)s />' % locals())
+            self.pieces.append("<%(tag)s%(strattrs)s />" % locals())
         else:
-            self.pieces.append('<%(tag)s%(strattrs)s>' % locals())
+            self.pieces.append("<%(tag)s%(strattrs)s>" % locals())
 
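[reviewer note, illustrative only -- not part of the patch] Under Python 3's
html.parser, tag names and attribute pairs already arrive as str, so the
decode hack above is only a safety net for byte values. A minimal,
self-contained sketch of the same reconstruction pattern (EchoParser is a
hypothetical name, not something this module defines):

    from html.parser import HTMLParser

    class EchoParser(HTMLParser):
        # Collects output fragments the way _BaseHTMLProcessor does,
        # but relies on html.parser's native str handling.
        def __init__(self):
            super().__init__(convert_charrefs=False)
            self.pieces = []

        def handle_starttag(self, tag, attrs):
            strattrs = "".join(f' {k}="{v}"' for k, v in attrs)
            self.pieces.append(f"<{tag}{strattrs}>")

        def handle_endtag(self, tag):
            self.pieces.append(f"</{tag}>")

        def handle_data(self, data):
            self.pieces.append(data)

    p = EchoParser()
    p.feed("<pre class='screen'>caf\u00e9</pre>")
    print("".join(p.pieces))  # -> <pre class="screen">café</pre>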
     def unknown_endtag(self, tag):
         # called for each end tag, e.g. for </pre>
, tag will be 'pre' @@ -1542,42 +1973,43 @@ def unknown_endtag(self, tag): def handle_charref(self, ref): # called for each character reference, e.g. for ' ', ref will be '160' # Reconstruct the original character reference. - self.pieces.append('&#%(ref)s;' % locals()) - + self.pieces.append("&#%(ref)s;" % locals()) + def handle_entityref(self, ref): # called for each entity reference, e.g. for '©', ref will be 'copy' # Reconstruct the original entity reference. - import htmlentitydefs - if not hasattr(htmlentitydefs, 'name2codepoint') or htmlentitydefs.name2codepoint.has_key(ref): - self.pieces.append('&%(ref)s;' % locals()) + if ref in html.entities.name2codepoint: + self.pieces.append(f"&{ref};") else: - self.pieces.append('&%(ref)s' % locals()) + self.pieces.append(f"&{ref}") def handle_data(self, text): # called for each block of plain text, i.e. outside of any tag and # not containing any character or entity references # Store the original text verbatim. - if _debug: sys.stderr.write('_BaseHTMLProcessor, handle_text, text=%s\n' % text) + if _debug: + sys.stderr.write("_BaseHTMLProcessor, handle_text, text=%s\n" % text) self.pieces.append(text) - + def handle_comment(self, text): # called for each HTML comment, e.g. # Reconstruct the original comment. - self.pieces.append('' % locals()) - + self.pieces.append("" % locals()) + def handle_pi(self, text): # called for each processing instruction, e.g. # Reconstruct original processing instruction. - self.pieces.append('' % locals()) + self.pieces.append("" % locals()) def handle_decl(self, text): # called for the DOCTYPE, if present, e.g. # # Reconstruct original DOCTYPE - self.pieces.append('' % locals()) - - _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match + self.pieces.append("" % locals()) + + _new_declname_match = re.compile(r"[a-zA-Z][-_.a-zA-Z0-9:]*\s*").match + def _scan_name(self, i, declstartpos): rawdata = self.rawdata n = len(rawdata) @@ -1592,68 +2024,76 @@ def _scan_name(self, i, declstartpos): return name.lower(), m.end() else: self.handle_data(rawdata) -# self.updatepos(declstartpos, i) + # self.updatepos(declstartpos, i) return None, -1 def output(self): - '''Return processed HTML as a single string''' - return ''.join([str(p) for p in self.pieces]) + """Return processed HTML as a single string""" + return "".join([str(p) for p in self.pieces]) + class _LooseFeedParser(_FeedParserMixin, _BaseHTMLProcessor): def __init__(self, baseuri, baselang, encoding): - sgmllib.SGMLParser.__init__(self) + self.rawdata = "" + self.cdata_elem = None + self.lineno = 1 + self.offset = 0 + super().__init__(encoding) _FeedParserMixin.__init__(self, baseuri, baselang, encoding) def decodeEntities(self, element, data): - data = data.replace('<', '<') - data = data.replace('<', '<') - data = data.replace('<', '<') - data = data.replace('>', '>') - data = data.replace('>', '>') - data = data.replace('>', '>') - data = data.replace('&', '&') - data = data.replace('&', '&') - data = data.replace('"', '"') - data = data.replace('"', '"') - data = data.replace(''', ''') - data = data.replace(''', ''') - if self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'): - data = data.replace('<', '<') - data = data.replace('>', '>') - data = data.replace('&', '&') - data = data.replace('"', '"') - data = data.replace(''', "'") + data = data.replace("<", "<") + data = data.replace("<", "<") + data = data.replace("<", "<") + data = data.replace(">", ">") + data = data.replace(">", ">") + data = 
data.replace(">", ">") + data = data.replace("&", "&") + data = data.replace("&", "&") + data = data.replace(""", """) + data = data.replace(""", """) + data = data.replace("'", "'") + data = data.replace("'", "'") + if "type" in self.contentparams and not self.contentparams.get("type", "xml").endswith("xml"): + data = data.replace("<", "<") + data = data.replace(">", ">") + data = data.replace("&", "&") + data = data.replace(""", '"') + data = data.replace("'", "'") return data - + def strattrs(self, attrs): - return ''.join([' %s="%s"' % t for t in attrs]) - + return "".join([f' {k}="{v}"' for k, v in attrs]) + + class _RelativeURIResolver(_BaseHTMLProcessor): - relative_uris = [('a', 'href'), - ('applet', 'codebase'), - ('area', 'href'), - ('blockquote', 'cite'), - ('body', 'background'), - ('del', 'cite'), - ('form', 'action'), - ('frame', 'longdesc'), - ('frame', 'src'), - ('iframe', 'longdesc'), - ('iframe', 'src'), - ('head', 'profile'), - ('img', 'longdesc'), - ('img', 'src'), - ('img', 'usemap'), - ('input', 'src'), - ('input', 'usemap'), - ('ins', 'cite'), - ('link', 'href'), - ('object', 'classid'), - ('object', 'codebase'), - ('object', 'data'), - ('object', 'usemap'), - ('q', 'cite'), - ('script', 'src')] + relative_uris = [ + ("a", "href"), + ("applet", "codebase"), + ("area", "href"), + ("blockquote", "cite"), + ("body", "background"), + ("del", "cite"), + ("form", "action"), + ("frame", "longdesc"), + ("frame", "src"), + ("iframe", "longdesc"), + ("iframe", "src"), + ("head", "profile"), + ("img", "longdesc"), + ("img", "src"), + ("img", "usemap"), + ("input", "src"), + ("input", "usemap"), + ("ins", "cite"), + ("link", "href"), + ("object", "classid"), + ("object", "codebase"), + ("object", "data"), + ("object", "usemap"), + ("q", "cite"), + ("script", "src"), + ] def __init__(self, baseuri, encoding): _BaseHTMLProcessor.__init__(self, encoding) @@ -1661,56 +2101,188 @@ def __init__(self, baseuri, encoding): def resolveURI(self, uri): return _urljoin(self.baseuri, uri) - + def unknown_starttag(self, tag, attrs): attrs = self.normalize_attrs(attrs) attrs = [(key, ((tag, key) in self.relative_uris) and self.resolveURI(value) or value) for key, value in attrs] _BaseHTMLProcessor.unknown_starttag(self, tag, attrs) - + + def _resolveRelativeURIs(htmlSource, baseURI, encoding): - if _debug: sys.stderr.write('entering _resolveRelativeURIs\n') + if _debug: + sys.stderr.write("entering _resolveRelativeURIs\n") p = _RelativeURIResolver(baseURI, encoding) p.feed(htmlSource) return p.output() + class _HTMLSanitizer(_BaseHTMLProcessor): - acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b', 'big', - 'blockquote', 'br', 'button', 'caption', 'center', 'cite', 'code', 'col', - 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt', 'em', 'fieldset', - 'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input', - 'ins', 'kbd', 'label', 'legend', 'li', 'map', 'menu', 'ol', 'optgroup', - 'option', 'p', 'pre', 'q', 's', 'samp', 'select', 'small', 'span', 'strike', - 'strong', 'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'tfoot', 'th', - 'thead', 'tr', 'tt', 'u', 'ul', 'var'] - - acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey', - 'action', 'align', 'alt', 'axis', 'border', 'cellpadding', 'cellspacing', - 'char', 'charoff', 'charset', 'checked', 'cite', 'class', 'clear', 'cols', - 'colspan', 'color', 'compact', 'coords', 'datetime', 'dir', 'disabled', - 'enctype', 'for', 'frame', 'headers', 'height', 'href', 'hreflang', 
'hspace', - 'id', 'ismap', 'label', 'lang', 'longdesc', 'maxlength', 'media', 'method', - 'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', - 'rel', 'rev', 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size', - 'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title', 'type', - 'usemap', 'valign', 'value', 'vspace', 'width', 'xml:lang'] - - unacceptable_elements_with_end_tag = ['script', 'applet'] + acceptable_elements = [ + "a", + "abbr", + "acronym", + "address", + "area", + "b", + "big", + "blockquote", + "br", + "button", + "caption", + "center", + "cite", + "code", + "col", + "colgroup", + "dd", + "del", + "dfn", + "dir", + "div", + "dl", + "dt", + "em", + "fieldset", + "font", + "form", + "h1", + "h2", + "h3", + "h4", + "h5", + "h6", + "hr", + "i", + "img", + "input", + "ins", + "kbd", + "label", + "legend", + "li", + "map", + "menu", + "ol", + "optgroup", + "option", + "p", + "pre", + "q", + "s", + "samp", + "select", + "small", + "span", + "strike", + "strong", + "sub", + "sup", + "table", + "tbody", + "td", + "textarea", + "tfoot", + "th", + "thead", + "tr", + "tt", + "u", + "ul", + "var", + ] + + acceptable_attributes = [ + "abbr", + "accept", + "accept-charset", + "accesskey", + "action", + "align", + "alt", + "axis", + "border", + "cellpadding", + "cellspacing", + "char", + "charoff", + "charset", + "checked", + "cite", + "class", + "clear", + "cols", + "colspan", + "color", + "compact", + "coords", + "datetime", + "dir", + "disabled", + "enctype", + "for", + "frame", + "headers", + "height", + "href", + "hreflang", + "hspace", + "id", + "ismap", + "label", + "lang", + "longdesc", + "maxlength", + "media", + "method", + "multiple", + "name", + "nohref", + "noshade", + "nowrap", + "prompt", + "readonly", + "rel", + "rev", + "rows", + "rowspan", + "rules", + "scope", + "selected", + "shape", + "size", + "span", + "src", + "start", + "summary", + "tabindex", + "target", + "title", + "type", + "usemap", + "valign", + "value", + "vspace", + "width", + "xml:lang", + ] + + unacceptable_elements_with_end_tag = ["script", "applet"] def reset(self): _BaseHTMLProcessor.reset(self) self.unacceptablestack = 0 - + def unknown_starttag(self, tag, attrs): - if not tag in self.acceptable_elements: + if tag not in self.acceptable_elements: if tag in self.unacceptable_elements_with_end_tag: self.unacceptablestack += 1 return attrs = self.normalize_attrs(attrs) attrs = [(key, value) for key, value in attrs if key in self.acceptable_attributes] _BaseHTMLProcessor.unknown_starttag(self, tag, attrs) - + def unknown_endtag(self, tag): - if not tag in self.acceptable_elements: + if tag not in self.acceptable_elements: if tag in self.unacceptable_elements_with_end_tag: self.unacceptablestack -= 1 return @@ -1726,6 +2298,8 @@ def handle_data(self, text): if not self.unacceptablestack: _BaseHTMLProcessor.handle_data(self, text) + +# TODO(py2to3): replace tidy and mx def _sanitizeHTML(htmlSource, encoding): p = _HTMLSanitizer(encoding) p.feed(htmlSource) @@ -1738,63 +2312,67 @@ def _sanitizeHTML(htmlSource, encoding): try: if tidy_interface == "uTidy": from tidy import parseString as _utidy + def _tidy(data, **kwargs): return str(_utidy(data, **kwargs)) + break elif tidy_interface == "mxTidy": from mx.Tidy import Tidy as _mxtidy + def _tidy(data, **kwargs): nerrors, nwarnings, data, errordata = _mxtidy.tidy(data, **kwargs) return data + break except: pass if _tidy: - utf8 = type(data) == type(u'') + utf8 = type(data) == str if utf8: - data = data.encode('utf-8') 
+ data = data.encode("utf-8") data = _tidy(data, output_xhtml=1, numeric_entities=1, wrap=0, char_encoding="utf8") if utf8: - data = unicode(data, 'utf-8') - if data.count(''): - data = data.split('>', 1)[1] - if data.count('"): + data = data.split(">", 1)[1] + if data.count("= '2.3.3' + assert sys.version.split()[0] >= "2.3.3" assert base64 != None - user, passw = base64.decodestring(req.headers['Authorization'].split(' ')[1]).split(':') - realm = re.findall('realm="([^"]*)"', headers['WWW-Authenticate'])[0] + user, passw = base64.decodestring(req.headers["Authorization"].split(" ")[1]).split(":") + realm = re.findall('realm="([^"]*)"', headers["WWW-Authenticate"])[0] self.add_password(realm, host, user, passw) - retry = self.http_error_auth_reqed('www-authenticate', host, req, headers) + retry = self.http_error_auth_reqed("www-authenticate", host, req, headers) self.reset_retry_count() return retry except: return self.http_error_default(req, fp, code, msg, headers) + def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers): """URL, filename, or string --> stream @@ -1845,61 +2424,72 @@ def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, h If handlers is supplied, it is a list of handlers used to build a urllib2 opener. """ - - if hasattr(url_file_stream_or_string, 'read'): + if hasattr(url_file_stream_or_string, "read"): return url_file_stream_or_string - if url_file_stream_or_string == '-': + if url_file_stream_or_string == "-": return sys.stdin - if urlparse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp'): + if urllib.parse.urlparse(url_file_stream_or_string)[0] in ("http", "https", "ftp"): if not agent: agent = USER_AGENT # test for inline user:password for basic auth auth = None if base64: - urltype, rest = urllib.splittype(url_file_stream_or_string) - realhost, rest = urllib.splithost(rest) + urltype, rest = urllib.parse.splittype(url_file_stream_or_string) + realhost, rest = urllib.parse.splithost(rest) if realhost: - user_passwd, realhost = urllib.splituser(realhost) + user_passwd, realhost = urllib.parse.splituser(realhost) if user_passwd: - url_file_stream_or_string = '%s://%s%s' % (urltype, realhost, rest) + url_file_stream_or_string = f"{urltype}://{realhost}{rest}" auth = base64.encodestring(user_passwd).strip() # try to open with urllib2 (to use optional headers) - request = urllib2.Request(url_file_stream_or_string) - request.add_header('User-Agent', agent) + request = urllib.request.Request(url_file_stream_or_string) + request.add_header("User-Agent", agent) if etag: - request.add_header('If-None-Match', etag) + request.add_header("If-None-Match", etag) if modified: # format into an RFC 1123-compliant timestamp. We can't use # time.strftime() since the %a and %b directives can be affected # by the current locale, but RFC 2616 states that dates must be # in English. 
- short_weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'] - months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] - request.add_header('If-Modified-Since', '%s, %02d %s %04d %02d:%02d:%02d GMT' % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5])) + short_weekdays = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"] + months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"] + request.add_header( + "If-Modified-Since", + "%s, %02d %s %04d %02d:%02d:%02d GMT" + % ( + short_weekdays[modified[6]], + modified[2], + months[modified[1] - 1], + modified[0], + modified[3], + modified[4], + modified[5], + ), + ) if referrer: - request.add_header('Referer', referrer) + request.add_header("Referer", referrer) if gzip and zlib: - request.add_header('Accept-encoding', 'gzip, deflate') + request.add_header("Accept-encoding", "gzip, deflate") elif gzip: - request.add_header('Accept-encoding', 'gzip') + request.add_header("Accept-encoding", "gzip") elif zlib: - request.add_header('Accept-encoding', 'deflate') + request.add_header("Accept-encoding", "deflate") else: - request.add_header('Accept-encoding', '') + request.add_header("Accept-encoding", "") if auth: - request.add_header('Authorization', 'Basic %s' % auth) + request.add_header("Authorization", "Basic %s" % auth) if ACCEPT_HEADER: - request.add_header('Accept', ACCEPT_HEADER) - request.add_header('A-IM', 'feed') # RFC 3229 support - opener = apply(urllib2.build_opener, tuple([_FeedURLHandler()] + handlers)) - opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent + request.add_header("Accept", ACCEPT_HEADER) + request.add_header("A-IM", "feed") # RFC 3229 support + opener = urllib.request.build_opener(*([_FeedURLHandler()] + handlers)) + opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent try: return opener.open(request) finally: - opener.close() # JohnD - + opener.close() # JohnD + # try to open with native open function (if url_file_stream_or_string is a filename) try: return open(url_file_stream_or_string) @@ -1909,11 +2499,15 @@ def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, h # treat url_file_stream_or_string as string return _StringIO(str(url_file_stream_or_string)) + _date_handlers = [] + + def registerDateHandler(func): - '''Register a date handler function (takes string, returns 9-tuple date in GMT)''' + """Register a date handler function (takes string, returns 9-tuple date in GMT)""" _date_handlers.insert(0, func) - + + # ISO-8601 date parsing routines written by Fazal Majid. # The ISO 8601 standard is very convoluted and irregular - a full ISO 8601 # parser is beyond the scope of feedparser and would be a worthwhile addition @@ -1923,51 +2517,66 @@ def registerDateHandler(func): # 0301-04-01), so we use templates instead. # Please note the order in templates is significant because we need a # greedy match. 
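[reviewer note, illustrative only] Each template below is expanded into a
named-group regex by chained str.replace() calls; note the code substitutes
"YYYY" before "YY" so the longer token wins. A trimmed sketch of one
expansion, using the same replacement table as the patch:

    import re

    tmpl = "YYYY-?MM-?DD"
    regex = (tmpl.replace("YYYY", r"(?P<year>\d{4})")
                 .replace("MM", r"(?P<month>[01]\d)")
                 .replace("DD", r"(?P<day>[0123]\d)"))
    # The literal '-?' lets one template cover dashed and compact forms.
    print(re.match(regex, "2004-01-05").groupdict())  # {'year': '2004', 'month': '01', 'day': '05'}
    print(re.match(regex, "20040105").groupdict())    # same values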
-_iso8601_tmpl = ['YYYY-?MM-?DD', 'YYYY-MM', 'YYYY-?OOO', - 'YY-?MM-?DD', 'YY-?OOO', 'YYYY', - '-YY-?MM', '-OOO', '-YY', - '--MM-?DD', '--MM', - '---DD', - 'CC', ''] +_iso8601_tmpl = [ + "YYYY-?MM-?DD", + "YYYY-MM", + "YYYY-?OOO", + "YY-?MM-?DD", + "YY-?OOO", + "YYYY", + "-YY-?MM", + "-OOO", + "-YY", + "--MM-?DD", + "--MM", + "---DD", + "CC", + "", +] + _iso8601_re = [ - tmpl.replace( - 'YYYY', r'(?P\d{4})').replace( - 'YY', r'(?P\d\d)').replace( - 'MM', r'(?P[01]\d)').replace( - 'DD', r'(?P[0123]\d)').replace( - 'OOO', r'(?P[0123]\d\d)').replace( - 'CC', r'(?P\d\d$)') - + r'(T?(?P\d{2}):(?P\d{2})' - + r'(:(?P\d{2}))?' - + r'(?P[+-](?P\d{2})(:(?P\d{2}))?|Z)?)?' - for tmpl in _iso8601_tmpl] -del tmpl + tmpl.replace("YYYY", r"(?P\d{4})") + .replace("YY", r"(?P\d\d)") + .replace("MM", r"(?P[01]\d)") + .replace("DD", r"(?P[0123]\d)") + .replace("OOO", r"(?P[0123]\d\d)") + .replace("CC", r"(?P\d\d$)") + + r"(T?(?P\d{2}):(?P\d{2})" + + r"(:(?P\d{2}))?" + + r"(?P[+-](?P\d{2})(:(?P\d{2}))?|Z)?)?" + for tmpl in _iso8601_tmpl +] + _iso8601_matches = [re.compile(regex).match for regex in _iso8601_re] -del regex + + def _parse_date_iso8601(dateString): - '''Parse a variety of ISO-8601-compatible formats like 20040105''' + """Parse a variety of ISO-8601-compatible formats like 20040105""" m = None for _iso8601_match in _iso8601_matches: m = _iso8601_match(dateString) - if m: break - if not m: return - if m.span() == (0, 0): return + if m: + break + if not m: + return None + if m.span() == (0, 0): + return None params = m.groupdict() - ordinal = params.get('ordinal', 0) + ordinal = params.get("ordinal", 0) if ordinal: ordinal = int(ordinal) else: ordinal = 0 - year = params.get('year', '--') - if not year or year == '--': + year = params.get("year", "--") + if not year or year == "--": year = time.gmtime()[0] elif len(year) == 2: # ISO 8601 assumes current century, i.e. 93 -> 2093, NOT 1993 year = 100 * int(time.gmtime()[0] / 100) + int(year) else: year = int(year) - month = params.get('month', '-') - if not month or month == '-': + month = params.get("month", "-") + if not month or month == "-": # ordinals are NOT normalized by mktime, we simulate them # by setting month=1, day=ordinal if ordinal: @@ -1975,13 +2584,12 @@ def _parse_date_iso8601(dateString): else: month = time.gmtime()[1] month = int(month) - day = params.get('day', 0) + day = params.get("day", 0) if not day: # see above if ordinal: day = ordinal - elif params.get('century', 0) or \ - params.get('year', 0) or params.get('month', 0): + elif params.get("century", 0) or params.get("year", 0) or params.get("month", 0): day = 1 else: day = time.gmtime()[2] @@ -1989,15 +2597,15 @@ def _parse_date_iso8601(dateString): day = int(day) # special case of the century - is the first year of the 21st century # 2000 or 2001 ? The debate goes on... 
- if 'century' in params.keys(): - year = (int(params['century']) - 1) * 100 + 1 + if "century" in params.keys(): + year = (int(params["century"]) - 1) * 100 + 1 # in ISO 8601 most fields are optional - for field in ['hour', 'minute', 'second', 'tzhour', 'tzmin']: + for field in ["hour", "minute", "second", "tzhour", "tzmin"]: if not params.get(field, None): params[field] = 0 - hour = int(params.get('hour', 0)) - minute = int(params.get('minute', 0)) - second = int(params.get('second', 0)) + hour = int(params.get("hour", 0)) + minute = int(params.get("minute", 0)) + second = int(params.get("second", 0)) # weekday is normalized by mktime(), we can ignore it weekday = 0 # daylight savings is complex, but not needed for feedparser's purposes @@ -2005,192 +2613,224 @@ def _parse_date_iso8601(dateString): # (e.g. PST vs. PDT, CET). Using -1 is implementation-dependent and # and most implementations have DST bugs daylight_savings_flag = 0 - tm = [year, month, day, hour, minute, second, weekday, - ordinal, daylight_savings_flag] + tm = [year, month, day, hour, minute, second, weekday, ordinal, daylight_savings_flag] # ISO 8601 time zone adjustments - tz = params.get('tz') - if tz and tz != 'Z': - if tz[0] == '-': - tm[3] += int(params.get('tzhour', 0)) - tm[4] += int(params.get('tzmin', 0)) - elif tz[0] == '+': - tm[3] -= int(params.get('tzhour', 0)) - tm[4] -= int(params.get('tzmin', 0)) + tz = params.get("tz") + if tz and tz != "Z": + if tz[0] == "-": + tm[3] += int(params.get("tzhour", 0)) + tm[4] += int(params.get("tzmin", 0)) + elif tz[0] == "+": + tm[3] -= int(params.get("tzhour", 0)) + tm[4] -= int(params.get("tzmin", 0)) else: return None # Python's time.mktime() is a wrapper around the ANSI C mktime(3c) # which is guaranteed to normalize d/m/y/h/m/s. # Many implementations have bugs, but we'll pretend they don't. return time.localtime(time.mktime(tm)) + + registerDateHandler(_parse_date_iso8601) - + # 8-bit date handling routines written by ytrewq1. 
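[reviewer note, illustrative only] The rewritten Korean date regexes below
switch from %-interpolation to rf-strings, so every regex quantifier brace
must be doubled to survive formatting. A two-line check:

    import re

    year_marker = "\ub144"  # same codepoint the patch binds to _korean_year
    pattern = rf"(\d{{4}}){year_marker}"  # {{4}} renders as the quantifier {4}
    print(re.match(pattern, "2004\ub144").group(1))  # -> 2004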
-_korean_year = u'\ub144' # b3e2 in euc-kr -_korean_month = u'\uc6d4' # bff9 in euc-kr -_korean_day = u'\uc77c' # c0cf in euc-kr -_korean_am = u'\uc624\uc804' # bfc0 c0fc in euc-kr -_korean_pm = u'\uc624\ud6c4' # bfc0 c8c4 in euc-kr - -_korean_onblog_date_re = \ - re.compile('(\d{4})%s\s+(\d{2})%s\s+(\d{2})%s\s+(\d{2}):(\d{2}):(\d{2})' % \ - (_korean_year, _korean_month, _korean_day)) -_korean_nate_date_re = \ - re.compile(u'(\d{4})-(\d{2})-(\d{2})\s+(%s|%s)\s+(\d{,2}):(\d{,2}):(\d{,2})' % \ - (_korean_am, _korean_pm)) +_korean_year = "\ub144" # b3e2 in euc-kr +_korean_month = "\uc6d4" # bff9 in euc-kr +_korean_day = "\uc77c" # c0cf in euc-kr +_korean_am = "\uc624\uc804" # bfc0 c0fc in euc-kr +_korean_pm = "\uc624\ud6c4" # bfc0 c8c4 in euc-kr + +_korean_onblog_date_re = re.compile( + rf"(\d{{4}}){_korean_year}\s+(\d{{2}}){_korean_month}\s+(\d{{2}}){_korean_day}\s+(\d{{2}}):(\d{{2}}):(\d{{2}})", +) +_korean_nate_date_re = re.compile( + rf"(\d{{4}})-(\d{{2}})-(\d{{2}})\s+({_korean_am}|{_korean_pm})\s+(\d{{,2}}):(\d{{,2}}):(\d{{,2}})", +) + + def _parse_date_onblog(dateString): - '''Parse a string according to the OnBlog 8-bit date format''' + """Parse a string according to the OnBlog 8-bit date format""" m = _korean_onblog_date_re.match(dateString) - if not m: return - w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \ - {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\ - 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\ - 'zonediff': '+09:00'} - if _debug: sys.stderr.write('OnBlog date parsed as: %s\n' % w3dtfdate) + if not m: + return None + w3dtfdate = "{year}-{month}-{day}T{hour}:{minute}:{second}{zonediff}".format( + year=m.group(1), + month=m.group(2), + day=m.group(3), + hour=m.group(4), + minute=m.group(5), + second=m.group(6), + zonediff="+09:00", + ) + if _debug: + sys.stderr.write("OnBlog date parsed as: %s\n" % w3dtfdate) return _parse_date_w3dtf(w3dtfdate) + + registerDateHandler(_parse_date_onblog) + def _parse_date_nate(dateString): - '''Parse a string according to the Nate 8-bit date format''' + """Parse a string according to the Nate 8-bit date format""" m = _korean_nate_date_re.match(dateString) - if not m: return + if not m: + return None hour = int(m.group(5)) ampm = m.group(4) - if (ampm == _korean_pm): + if ampm == _korean_pm: hour += 12 hour = str(hour) if len(hour) == 1: - hour = '0' + hour - w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \ - {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\ - 'hour': hour, 'minute': m.group(6), 'second': m.group(7),\ - 'zonediff': '+09:00'} - if _debug: sys.stderr.write('Nate date parsed as: %s\n' % w3dtfdate) + hour = "0" + hour + w3dtfdate = "{year}-{month}-{day}T{hour}:{minute}:{second}{zonediff}".format( + year=m.group(1), + month=m.group(2), + day=m.group(3), + hour=hour, + minute=m.group(6), + second=m.group(7), + zonediff="+09:00", + ) + if _debug: + sys.stderr.write("Nate date parsed as: %s\n" % w3dtfdate) return _parse_date_w3dtf(w3dtfdate) + + registerDateHandler(_parse_date_nate) -_mssql_date_re = \ - re.compile('(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})(\.\d+)?') +_mssql_date_re = re.compile(r"(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})(\.\d+)?") + + def _parse_date_mssql(dateString): - '''Parse a string according to the MS SQL date format''' + """Parse a string according to the MS SQL date format""" m = _mssql_date_re.match(dateString) - if not m: return - w3dtfdate = 
'%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \ - {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\ - 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\ - 'zonediff': '+09:00'} - if _debug: sys.stderr.write('MS SQL date parsed as: %s\n' % w3dtfdate) + if not m: + return None + w3dtfdate = "{year}-{month}-{day}T{hour}:{minute}:{second}{zonediff}".format( + year=m.group(1), + month=m.group(2), + day=m.group(3), + hour=m.group(4), + minute=m.group(5), + second=m.group(6), + zonediff="+09:00", + ) + if _debug: + sys.stderr.write("MS SQL date parsed as: %s\n" % w3dtfdate) return _parse_date_w3dtf(w3dtfdate) + + registerDateHandler(_parse_date_mssql) # Unicode strings for Greek date strings -_greek_months = \ - { \ - u'\u0399\u03b1\u03bd': u'Jan', # c9e1ed in iso-8859-7 - u'\u03a6\u03b5\u03b2': u'Feb', # d6e5e2 in iso-8859-7 - u'\u039c\u03ac\u03ce': u'Mar', # ccdcfe in iso-8859-7 - u'\u039c\u03b1\u03ce': u'Mar', # cce1fe in iso-8859-7 - u'\u0391\u03c0\u03c1': u'Apr', # c1f0f1 in iso-8859-7 - u'\u039c\u03ac\u03b9': u'May', # ccdce9 in iso-8859-7 - u'\u039c\u03b1\u03ca': u'May', # cce1fa in iso-8859-7 - u'\u039c\u03b1\u03b9': u'May', # cce1e9 in iso-8859-7 - u'\u0399\u03bf\u03cd\u03bd': u'Jun', # c9effded in iso-8859-7 - u'\u0399\u03bf\u03bd': u'Jun', # c9efed in iso-8859-7 - u'\u0399\u03bf\u03cd\u03bb': u'Jul', # c9effdeb in iso-8859-7 - u'\u0399\u03bf\u03bb': u'Jul', # c9f9eb in iso-8859-7 - u'\u0391\u03cd\u03b3': u'Aug', # c1fde3 in iso-8859-7 - u'\u0391\u03c5\u03b3': u'Aug', # c1f5e3 in iso-8859-7 - u'\u03a3\u03b5\u03c0': u'Sep', # d3e5f0 in iso-8859-7 - u'\u039f\u03ba\u03c4': u'Oct', # cfeaf4 in iso-8859-7 - u'\u039d\u03bf\u03ad': u'Nov', # cdefdd in iso-8859-7 - u'\u039d\u03bf\u03b5': u'Nov', # cdefe5 in iso-8859-7 - u'\u0394\u03b5\u03ba': u'Dec', # c4e5ea in iso-8859-7 - } - -_greek_wdays = \ - { \ - u'\u039a\u03c5\u03c1': u'Sun', # caf5f1 in iso-8859-7 - u'\u0394\u03b5\u03c5': u'Mon', # c4e5f5 in iso-8859-7 - u'\u03a4\u03c1\u03b9': u'Tue', # d4f1e9 in iso-8859-7 - u'\u03a4\u03b5\u03c4': u'Wed', # d4e5f4 in iso-8859-7 - u'\u03a0\u03b5\u03bc': u'Thu', # d0e5ec in iso-8859-7 - u'\u03a0\u03b1\u03c1': u'Fri', # d0e1f1 in iso-8859-7 - u'\u03a3\u03b1\u03b2': u'Sat', # d3e1e2 in iso-8859-7 - } - -_greek_date_format_re = \ - re.compile(u'([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)') +_greek_months = { + "\u0399\u03b1\u03bd": "Jan", # c9e1ed in iso-8859-7 + "\u03a6\u03b5\u03b2": "Feb", # d6e5e2 in iso-8859-7 + "\u039c\u03ac\u03ce": "Mar", # ccdcfe in iso-8859-7 + "\u039c\u03b1\u03ce": "Mar", # cce1fe in iso-8859-7 + "\u0391\u03c0\u03c1": "Apr", # c1f0f1 in iso-8859-7 + "\u039c\u03ac\u03b9": "May", # ccdce9 in iso-8859-7 + "\u039c\u03b1\u03ca": "May", # cce1fa in iso-8859-7 + "\u039c\u03b1\u03b9": "May", # cce1e9 in iso-8859-7 + "\u0399\u03bf\u03cd\u03bd": "Jun", # c9effded in iso-8859-7 + "\u0399\u03bf\u03bd": "Jun", # c9efed in iso-8859-7 + "\u0399\u03bf\u03cd\u03bb": "Jul", # c9effdeb in iso-8859-7 + "\u0399\u03bf\u03bb": "Jul", # c9f9eb in iso-8859-7 + "\u0391\u03cd\u03b3": "Aug", # c1fde3 in iso-8859-7 + "\u0391\u03c5\u03b3": "Aug", # c1f5e3 in iso-8859-7 + "\u03a3\u03b5\u03c0": "Sep", # d3e5f0 in iso-8859-7 + "\u039f\u03ba\u03c4": "Oct", # cfeaf4 in iso-8859-7 + "\u039d\u03bf\u03ad": "Nov", # cdefdd in iso-8859-7 + "\u039d\u03bf\u03b5": "Nov", # cdefe5 in iso-8859-7 + "\u0394\u03b5\u03ba": "Dec", # c4e5ea in iso-8859-7 +} + +_greek_wdays = { + "\u039a\u03c5\u03c1": "Sun", # caf5f1 in iso-8859-7 + 
"\u0394\u03b5\u03c5": "Mon", # c4e5f5 in iso-8859-7 + "\u03a4\u03c1\u03b9": "Tue", # d4f1e9 in iso-8859-7 + "\u03a4\u03b5\u03c4": "Wed", # d4e5f4 in iso-8859-7 + "\u03a0\u03b5\u03bc": "Thu", # d0e5ec in iso-8859-7 + "\u03a0\u03b1\u03c1": "Fri", # d0e1f1 in iso-8859-7 + "\u03a3\u03b1\u03b2": "Sat", # d3e1e2 in iso-8859-7 +} + +_greek_date_format_re = re.compile(r"([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)") + def _parse_date_greek(dateString): - '''Parse a string according to a Greek 8-bit date format.''' + """Parse a string according to a Greek 8-bit date format.""" m = _greek_date_format_re.match(dateString) - if not m: return + if not m: + return None try: wday = _greek_wdays[m.group(1)] month = _greek_months[m.group(3)] except: - return - rfc822date = '%(wday)s, %(day)s %(month)s %(year)s %(hour)s:%(minute)s:%(second)s %(zonediff)s' % \ - {'wday': wday, 'day': m.group(2), 'month': month, 'year': m.group(4),\ - 'hour': m.group(5), 'minute': m.group(6), 'second': m.group(7),\ - 'zonediff': m.group(8)} - if _debug: sys.stderr.write('Greek date parsed as: %s\n' % rfc822date) + return None + rfc822date = f"{wday}, {m.group(2)} {month} {m.group(4)} {m.group(5)}:{m.group(6)}:{m.group(7)} {m.group(8)}" + if _debug: + sys.stderr.write("Greek date parsed as: %s\n" % rfc822date) return _parse_date_rfc822(rfc822date) + + registerDateHandler(_parse_date_greek) # Unicode strings for Hungarian date strings -_hungarian_months = \ - { \ - u'janu\u00e1r': u'01', # e1 in iso-8859-2 - u'febru\u00e1ri': u'02', # e1 in iso-8859-2 - u'm\u00e1rcius': u'03', # e1 in iso-8859-2 - u'\u00e1prilis': u'04', # e1 in iso-8859-2 - u'm\u00e1ujus': u'05', # e1 in iso-8859-2 - u'j\u00fanius': u'06', # fa in iso-8859-2 - u'j\u00falius': u'07', # fa in iso-8859-2 - u'augusztus': u'08', - u'szeptember': u'09', - u'okt\u00f3ber': u'10', # f3 in iso-8859-2 - u'november': u'11', - u'december': u'12', - } - -_hungarian_date_format_re = \ - re.compile(u'(\d{4})-([^-]+)-(\d{,2})T(\d{,2}):(\d{2})((\+|-)(\d{,2}:\d{2}))') +_hungarian_months = { + "janu\u00e1r": "01", # e1 in iso-8859-2 + "febru\u00e1ri": "02", # e1 in iso-8859-2 + "m\u00e1rcius": "03", # e1 in iso-8859-2 + "\u00e1prilis": "04", # e1 in iso-8859-2 + "m\u00e1ujus": "05", # e1 in iso-8859-2 + "j\u00fanius": "06", # fa in iso-8859-2 + "j\u00falius": "07", # fa in iso-8859-2 + "augusztus": "08", + "szeptember": "09", + "okt\u00f3ber": "10", # f3 in iso-8859-2 + "november": "11", + "december": "12", +} + +_hungarian_date_format_re = re.compile(r"(\d{4})-([^-]+)-(\d{,2})T(\d{,2}):(\d{2})((\+|-)(\d{,2}:\d{2}))") + def _parse_date_hungarian(dateString): - '''Parse a string according to a Hungarian 8-bit date format.''' + """Parse a string according to a Hungarian 8-bit date format.""" m = _hungarian_date_format_re.match(dateString) - if not m: return + if not m: + return None try: month = _hungarian_months[m.group(2)] day = m.group(3) if len(day) == 1: - day = '0' + day + day = "0" + day hour = m.group(4) if len(hour) == 1: - hour = '0' + hour + hour = "0" + hour except: - return - w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s%(zonediff)s' % \ - {'year': m.group(1), 'month': month, 'day': day,\ - 'hour': hour, 'minute': m.group(5),\ - 'zonediff': m.group(6)} - if _debug: sys.stderr.write('Hungarian date parsed as: %s\n' % w3dtfdate) + return None + w3dtfdate = f"{m.group(1)}-{month}-{day}T{hour}:{m.group(5)}{m.group(6)}" + if _debug: + sys.stderr.write("Hungarian date parsed as: %s\n" % w3dtfdate) return 
_parse_date_w3dtf(w3dtfdate) + + registerDateHandler(_parse_date_hungarian) + # W3DTF-style date parsing adapted from PyXML xml.utils.iso8601, written by # Drake and licensed under the Python license. Removed all range checking # for month, day, hour, minute, and second, since mktime will normalize # these later def _parse_date_w3dtf(dateString): def __extract_date(m): - year = int(m.group('year')) + year = int(m.group("year")) if year < 100: year = 100 * int(time.gmtime()[0] / 100) + int(year) if year < 1000: return 0, 0, 0 - julian = m.group('julian') + julian = m.group("julian") if julian: julian = int(julian) month = julian / 30 + 1 @@ -2208,17 +2848,17 @@ def __extract_date(m): day = 31 elif jday < julian: if day + diff < 28: - day = day + diff + day = day + diff else: month = month + 1 return year, month, day - month = m.group('month') + month = m.group("month") day = 1 if month is None: month = 1 else: month = int(month) - day = m.group('day') + day = m.group("day") if day: day = int(day) else: @@ -2228,12 +2868,12 @@ def __extract_date(m): def __extract_time(m): if not m: return 0, 0, 0 - hours = m.group('hours') + hours = m.group("hours") if not hours: return 0, 0, 0 hours = int(hours) - minutes = int(m.group('minutes')) - seconds = m.group('seconds') + minutes = int(m.group("minutes")) + seconds = m.group("seconds") if seconds: seconds = int(seconds) else: @@ -2241,89 +2881,93 @@ def __extract_time(m): return hours, minutes, seconds def __extract_tzd(m): - '''Return the Time Zone Designator as an offset in seconds from UTC.''' + """Return the Time Zone Designator as an offset in seconds from UTC.""" if not m: return 0 - tzd = m.group('tzd') + tzd = m.group("tzd") if not tzd: return 0 - if tzd == 'Z': + if tzd == "Z": return 0 - hours = int(m.group('tzdhours')) - minutes = m.group('tzdminutes') + hours = int(m.group("tzdhours")) + minutes = m.group("tzdminutes") if minutes: minutes = int(minutes) else: minutes = 0 - offset = (hours*60 + minutes) * 60 - if tzd[0] == '+': + offset = (hours * 60 + minutes) * 60 + if tzd[0] == "+": return -offset return offset - __date_re = ('(?P\d\d\d\d)' - '(?:(?P-|)' - '(?:(?P\d\d\d)' - '|(?P\d\d)(?:(?P=dsep)(?P\d\d))?))?') - __tzd_re = '(?P[-+](?P\d\d)(?::?(?P\d\d))|Z)' + __date_re = ( + r"(?P\d\d\d\d)" + "(?:(?P-|)" + r"(?:(?P\d\d\d)" + r"|(?P\d\d)(?:(?P=dsep)(?P\d\d))?))?" + ) + __tzd_re = r"(?P[-+](?P\d\d)(?::?(?P\d\d))|Z)" __tzd_rx = re.compile(__tzd_re) - __time_re = ('(?P\d\d)(?P:|)(?P\d\d)' - '(?:(?P=tsep)(?P\d\d(?:[.,]\d+)?))?' - + __tzd_re) - __datetime_re = '%s(?:T%s)?' % (__date_re, __time_re) + __time_re = r"(?P\d\d)(?P:|)(?P\d\d)(?:(?P=tsep)(?P\d\d(?:[.,]\d+)?))?" + __tzd_re + __datetime_re = f"{__date_re}(?:T{__time_re})?" 
__datetime_rx = re.compile(__datetime_re) m = __datetime_rx.match(dateString) - if (m is None) or (m.group() != dateString): return + if (m is None) or (m.group() != dateString): + return None gmt = __extract_date(m) + __extract_time(m) + (0, 0, 0) - if gmt[0] == 0: return + if gmt[0] == 0: + return None return time.gmtime(time.mktime(gmt) + __extract_tzd(m) - time.timezone) + + registerDateHandler(_parse_date_w3dtf) + def _parse_date_rfc822(dateString): - '''Parse an RFC822, RFC1123, RFC2822, or asctime-style date''' + """Parse an RFC822, RFC1123, RFC2822, or asctime-style date""" data = dateString.split() - if data[0][-1] in (',', '.') or data[0].lower() in rfc822._daynames: + if data[0][-1] in (",", ".") or data[0].lower() in ["mon", "tue", "wed", "thu", "fri", "sat", "sun"]: del data[0] if len(data) == 4: s = data[3] - i = s.find('+') + i = s.find("+") if i > 0: - data[3:] = [s[:i], s[i+1:]] + data[3:] = [s[:i], s[i + 1:]] else: - data.append('') + data.append("") dateString = " ".join(data) if len(data) < 5: - dateString += ' 00:00:00 GMT' - tm = rfc822.parsedate_tz(dateString) + dateString += " 00:00:00 GMT" + tm = email.utils.parsedate_tz(dateString) if tm: - return time.gmtime(rfc822.mktime_tz(tm)) -# rfc822.py defines several time zones, but we define some extra ones. -# 'ET' is equivalent to 'EST', etc. -_additional_timezones = {'AT': -400, 'ET': -500, 'CT': -600, 'MT': -700, 'PT': -800} -rfc822._timezones.update(_additional_timezones) -registerDateHandler(_parse_date_rfc822) + return time.gmtime(calendar.timegm(tm[:9])) + def _parse_date(dateString): - '''Parses a variety of date formats into a 9-tuple in GMT''' + """Parses a variety of date formats into a 9-tuple in GMT""" for handler in _date_handlers: try: date9tuple = handler(dateString) - if not date9tuple: continue + if not date9tuple: + continue if len(date9tuple) != 9: - if _debug: sys.stderr.write('date handler function must return 9-tuple\n') + if _debug: + sys.stderr.write("date handler function must return 9-tuple\n") raise ValueError map(int, date9tuple) return date9tuple - except Exception, e: - if _debug: sys.stderr.write('%s raised %s\n' % (handler.__name__, repr(e))) - pass + except Exception as e: + if _debug: + sys.stderr.write(f"{handler.__name__} raised {e!r}\n") return None + def _getCharacterEncoding(http_headers, xml_data): - '''Get the character encoding of the XML document + """Get the character encoding of the XML document http_headers is a dictionary xml_data is a raw string (not Unicode) - + This is so much trickier than it sounds, it's not even funny. According to RFC 3023 ('XML Media Types'), if the HTTP Content-Type is application/xml, application/*+xml, @@ -2342,12 +2986,12 @@ def _getCharacterEncoding(http_headers, xml_data): served with a Content-Type of text/* and no charset parameter must be treated as us-ascii. (We now do this.) And also that it must always be flagged as non-well-formed. (We now do this too.) - + If Content-Type is unspecified (input was local file or non-HTTP source) or unrecognized (server just got it totally wrong), then go by the encoding given in the XML prefix of the document and default to 'iso-8859-1' as per the HTTP specification (RFC 2616). - + Then, assuming we didn't find a character encoding in the HTTP headers (and the HTTP Content-type allowed us to look in the body), we need to sniff the first few bytes of the XML data and try to determine @@ -2364,221 +3008,249 @@ def _getCharacterEncoding(http_headers, xml_data): correctly, which many are not). 
CJKCodecs and iconv_codec help a lot; you should definitely install them if you can. http://cjkpython.i18n.org/ - ''' + """ def _parseHTTPContentType(content_type): - '''takes HTTP Content-Type header and returns (content type, charset) + """Takes HTTP Content-Type header and returns (content type, charset) If no charset is specified, returns (content type, '') If no content type is specified, returns ('', '') Both return parameters are guaranteed to be lowercase strings - ''' - content_type = content_type or '' + """ + content_type = content_type or "" content_type, params = cgi.parse_header(content_type) - return content_type, params.get('charset', '').replace("'", '') + return content_type, params.get("charset", "").replace("'", "") - sniffed_xml_encoding = '' - xml_encoding = '' - true_encoding = '' - http_content_type, http_encoding = _parseHTTPContentType(http_headers.get('content-type')) + sniffed_xml_encoding = "" + xml_encoding = "" + true_encoding = "" + http_content_type, http_encoding = _parseHTTPContentType(http_headers.get("content-type")) # Must sniff for non-ASCII-compatible character encodings before # searching for XML declaration. This heuristic is defined in # section F of the XML specification: # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info try: - if xml_data[:4] == '\x4c\x6f\xa7\x94': + if xml_data[:4] == "\x4c\x6f\xa7\x94": # EBCDIC xml_data = _ebcdic_to_ascii(xml_data) - elif xml_data[:4] == '\x00\x3c\x00\x3f': + elif xml_data[:4] == "\x00\x3c\x00\x3f": # UTF-16BE - sniffed_xml_encoding = 'utf-16be' - xml_data = unicode(xml_data, 'utf-16be').encode('utf-8') - elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') and (xml_data[2:4] != '\x00\x00'): + sniffed_xml_encoding = "utf-16be" + xml_data = str(xml_data, "utf-16be").encode("utf-8") + elif (len(xml_data) >= 4) and (xml_data[:2] == "\xfe\xff") and (xml_data[2:4] != "\x00\x00"): # UTF-16BE with BOM - sniffed_xml_encoding = 'utf-16be' - xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8') - elif xml_data[:4] == '\x3c\x00\x3f\x00': + sniffed_xml_encoding = "utf-16be" + xml_data = str(xml_data[2:], "utf-16be").encode("utf-8") + elif xml_data[:4] == "\x3c\x00\x3f\x00": # UTF-16LE - sniffed_xml_encoding = 'utf-16le' - xml_data = unicode(xml_data, 'utf-16le').encode('utf-8') - elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and (xml_data[2:4] != '\x00\x00'): + sniffed_xml_encoding = "utf-16le" + xml_data = str(xml_data, "utf-16le").encode("utf-8") + elif (len(xml_data) >= 4) and (xml_data[:2] == "\xff\xfe") and (xml_data[2:4] != "\x00\x00"): # UTF-16LE with BOM - sniffed_xml_encoding = 'utf-16le' - xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8') - elif xml_data[:4] == '\x00\x00\x00\x3c': + sniffed_xml_encoding = "utf-16le" + xml_data = str(xml_data[2:], "utf-16le").encode("utf-8") + elif xml_data[:4] == "\x00\x00\x00\x3c": # UTF-32BE - sniffed_xml_encoding = 'utf-32be' - xml_data = unicode(xml_data, 'utf-32be').encode('utf-8') - elif xml_data[:4] == '\x3c\x00\x00\x00': + sniffed_xml_encoding = "utf-32be" + xml_data = str(xml_data, "utf-32be").encode("utf-8") + elif xml_data[:4] == "\x3c\x00\x00\x00": # UTF-32LE - sniffed_xml_encoding = 'utf-32le' - xml_data = unicode(xml_data, 'utf-32le').encode('utf-8') - elif xml_data[:4] == '\x00\x00\xfe\xff': + sniffed_xml_encoding = "utf-32le" + xml_data = str(xml_data, "utf-32le").encode("utf-8") + elif xml_data[:4] == "\x00\x00\xfe\xff": # UTF-32BE with BOM - sniffed_xml_encoding = 'utf-32be' - xml_data = unicode(xml_data[4:], 
'utf-32be').encode('utf-8') - elif xml_data[:4] == '\xff\xfe\x00\x00': + sniffed_xml_encoding = "utf-32be" + xml_data = str(xml_data[4:], "utf-32be").encode("utf-8") + elif xml_data[:4] == "\xff\xfe\x00\x00": # UTF-32LE with BOM - sniffed_xml_encoding = 'utf-32le' - xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8') - elif xml_data[:3] == '\xef\xbb\xbf': + sniffed_xml_encoding = "utf-32le" + xml_data = str(xml_data[4:], "utf-32le").encode("utf-8") + elif xml_data[:3] == "\xef\xbb\xbf": # UTF-8 with BOM - sniffed_xml_encoding = 'utf-8' - xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8') + sniffed_xml_encoding = "utf-8" + xml_data = str(xml_data[3:], "utf-8").encode("utf-8") else: # ASCII-compatible pass - xml_encoding_match = re.compile('^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data) + xml_encoding_match = re.compile("^<\\?.*encoding=['\"](.*?)['\"].*\\?>").match(xml_data) except: xml_encoding_match = None if xml_encoding_match: xml_encoding = xml_encoding_match.groups()[0].lower() - if sniffed_xml_encoding and (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode', 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf16', 'u16')): + if sniffed_xml_encoding and ( + xml_encoding + in ( + "iso-10646-ucs-2", + "ucs-2", + "csunicode", + "iso-10646-ucs-4", + "ucs-4", + "csucs4", + "utf-16", + "utf-32", + "utf_16", + "utf_32", + "utf16", + "u16", + ) + ): xml_encoding = sniffed_xml_encoding acceptable_content_type = 0 - application_content_types = ('application/xml', 'application/xml-dtd', 'application/xml-external-parsed-entity') - text_content_types = ('text/xml', 'text/xml-external-parsed-entity') - if (http_content_type in application_content_types) or \ - (http_content_type.startswith('application/') and http_content_type.endswith('+xml')): + application_content_types = ("application/xml", "application/xml-dtd", "application/xml-external-parsed-entity") + text_content_types = ("text/xml", "text/xml-external-parsed-entity") + if (http_content_type in application_content_types) or ( + http_content_type.startswith("application/") and http_content_type.endswith("+xml") + ): acceptable_content_type = 1 - true_encoding = http_encoding or xml_encoding or 'utf-8' - elif (http_content_type in text_content_types) or \ - (http_content_type.startswith('text/')) and http_content_type.endswith('+xml'): + true_encoding = http_encoding or xml_encoding or "utf-8" + elif ( + (http_content_type in text_content_types) + or (http_content_type.startswith("text/")) + and http_content_type.endswith("+xml") + ): acceptable_content_type = 1 - true_encoding = http_encoding or 'us-ascii' - elif http_content_type.startswith('text/'): - true_encoding = http_encoding or 'us-ascii' - elif http_headers and (not http_headers.has_key('content-type')): - true_encoding = xml_encoding or 'iso-8859-1' + true_encoding = http_encoding or "us-ascii" + elif http_content_type.startswith("text/"): + true_encoding = http_encoding or "us-ascii" + elif http_headers and ("content-type" not in http_headers): + true_encoding = xml_encoding or "iso-8859-1" else: - true_encoding = xml_encoding or 'utf-8' + true_encoding = xml_encoding or "utf-8" return true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type - + + def _toUTF8(data, encoding): - '''Changes an XML data stream on the fly to specify a new encoding + """Changes an XML data stream on the fly to specify a new encoding data is a raw sequence of bytes (not Unicode) that is presumed to be in 
%encoding already encoding is a string recognized by encodings.aliases - ''' - if _debug: sys.stderr.write('entering _toUTF8, trying encoding %s\n' % encoding) + """ + data = data.encode() + if _debug: + sys.stderr.write("entering _toUTF8, trying encoding %s\n" % encoding) # strip Byte Order Mark (if present) - if (len(data) >= 4) and (data[:2] == '\xfe\xff') and (data[2:4] != '\x00\x00'): + if (len(data) >= 4) and (data[:2] == "\xfe\xff") and (data[2:4] != "\x00\x00"): if _debug: - sys.stderr.write('stripping BOM\n') - if encoding != 'utf-16be': - sys.stderr.write('trying utf-16be instead\n') - encoding = 'utf-16be' + sys.stderr.write("stripping BOM\n") + if encoding != "utf-16be": + sys.stderr.write("trying utf-16be instead\n") + encoding = "utf-16be" data = data[2:] - elif (len(data) >= 4) and (data[:2] == '\xff\xfe') and (data[2:4] != '\x00\x00'): + elif (len(data) >= 4) and (data[:2] == "\xff\xfe") and (data[2:4] != "\x00\x00"): if _debug: - sys.stderr.write('stripping BOM\n') - if encoding != 'utf-16le': - sys.stderr.write('trying utf-16le instead\n') - encoding = 'utf-16le' + sys.stderr.write("stripping BOM\n") + if encoding != "utf-16le": + sys.stderr.write("trying utf-16le instead\n") + encoding = "utf-16le" data = data[2:] - elif data[:3] == '\xef\xbb\xbf': + elif data[:3] == "\xef\xbb\xbf": if _debug: - sys.stderr.write('stripping BOM\n') - if encoding != 'utf-8': - sys.stderr.write('trying utf-8 instead\n') - encoding = 'utf-8' + sys.stderr.write("stripping BOM\n") + if encoding != "utf-8": + sys.stderr.write("trying utf-8 instead\n") + encoding = "utf-8" data = data[3:] - elif data[:4] == '\x00\x00\xfe\xff': + elif data[:4] == "\x00\x00\xfe\xff": if _debug: - sys.stderr.write('stripping BOM\n') - if encoding != 'utf-32be': - sys.stderr.write('trying utf-32be instead\n') - encoding = 'utf-32be' + sys.stderr.write("stripping BOM\n") + if encoding != "utf-32be": + sys.stderr.write("trying utf-32be instead\n") + encoding = "utf-32be" data = data[4:] - elif data[:4] == '\xff\xfe\x00\x00': + elif data[:4] == "\xff\xfe\x00\x00": if _debug: - sys.stderr.write('stripping BOM\n') - if encoding != 'utf-32le': - sys.stderr.write('trying utf-32le instead\n') - encoding = 'utf-32le' + sys.stderr.write("stripping BOM\n") + if encoding != "utf-32le": + sys.stderr.write("trying utf-32le instead\n") + encoding = "utf-32le" data = data[4:] - newdata = unicode(data, encoding) - if _debug: sys.stderr.write('successfully converted %s data to unicode\n' % encoding) - declmatch = re.compile('^<\?xml[^>]*?>') - newdecl = '''''' + newdata = str(data, encoding) + if _debug: + sys.stderr.write("successfully converted %s data to unicode\n" % encoding) + declmatch = re.compile(r"^<\?xml[^>]*?>") + newdecl = """""" if declmatch.search(newdata): newdata = declmatch.sub(newdecl, newdata) else: - newdata = newdecl + u'\n' + newdata - return newdata.encode('utf-8') + newdata = newdecl + "\n" + newdata + return newdata.encode("utf-8") + def _stripDoctype(data): - '''Strips DOCTYPE from XML document, returns (rss_version, stripped_data) + """Strips DOCTYPE from XML document, returns (rss_version, stripped_data) rss_version may be 'rss091n' or None stripped_data is the same XML document, minus the DOCTYPE - ''' - entity_pattern = re.compile(r']*?)>', re.MULTILINE) - data = entity_pattern.sub('', data) - doctype_pattern = re.compile(r']*?)>', re.MULTILINE) + """ + entity_pattern = re.compile(r"]*?)>", re.MULTILINE) + if not isinstance(data, str): + data = data.decode() + data = entity_pattern.sub("", data) + 
doctype_pattern = re.compile(r"]*?)>", re.MULTILINE) doctype_results = doctype_pattern.findall(data) - doctype = doctype_results and doctype_results[0] or '' - if doctype.lower().count('netscape'): - version = 'rss091n' + doctype = doctype_results and doctype_results[0] or "" + if doctype.lower().count("netscape"): + version = "rss091n" else: version = None - data = doctype_pattern.sub('', data) + data = doctype_pattern.sub("", data) return version, data - + + def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=[]): - '''Parse a feed from a URL, file, stream, or string''' + """Parse a feed from a URL, file, stream, or string""" result = FeedParserDict() - result['feed'] = FeedParserDict() - result['entries'] = [] + result["feed"] = FeedParserDict() + result["entries"] = [] if _XML_AVAILABLE: - result['bozo'] = 0 - if type(handlers) == types.InstanceType: + result["bozo"] = 0 + if not isinstance(handlers, list): handlers = [handlers] try: f = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers) data = f.read() - except Exception, e: - result['bozo'] = 1 - result['bozo_exception'] = e - data = '' + except Exception as e: + result["bozo"] = 1 + result["bozo_exception"] = e + data = "" f = None # if feed is gzip-compressed, decompress it - if f and data and hasattr(f, 'headers'): - if gzip and f.headers.get('content-encoding', '') == 'gzip': + if f and data and hasattr(f, "headers"): + if gzip and f.headers.get("content-encoding", "") == "gzip": try: - data = gzip.GzipFile(fileobj=_StringIO(data)).read() - except Exception, e: + data = gzip.GzipFile(fileobj=BytesIO(data)).read() + except Exception as e: # Some feeds claim to be gzipped but they're not, so # we get garbage. Ideally, we should re-request the # feed without the 'Accept-encoding: gzip' header, # but we don't. 
- result['bozo'] = 1 - result['bozo_exception'] = e - data = '' - elif zlib and f.headers.get('content-encoding', '') == 'deflate': + result["bozo"] = 1 + result["bozo_exception"] = e + data = "" + elif zlib and f.headers.get("content-encoding", "") == "deflate": try: data = zlib.decompress(data, -zlib.MAX_WBITS) - except Exception, e: - result['bozo'] = 1 - result['bozo_exception'] = e - data = '' + except Exception as e: + result["bozo"] = 1 + result["bozo_exception"] = e + data = "" # save HTTP headers - if hasattr(f, 'info'): + if hasattr(f, "info"): info = f.info() - result['etag'] = info.getheader('ETag') - last_modified = info.getheader('Last-Modified') + result["etag"] = info.get("ETag") + last_modified = info.get("Last-Modified") if last_modified: - result['modified'] = _parse_date(last_modified) - if hasattr(f, 'url'): - result['href'] = f.url - result['status'] = 200 - if hasattr(f, 'status'): - result['status'] = f.status - if hasattr(f, 'headers'): - result['headers'] = f.headers.dict - if hasattr(f, 'close'): + result["modified"] = _parse_date(last_modified) + if hasattr(f, "url"): + result["href"] = f.url + result["status"] = 200 + if hasattr(f, "status"): + result["status"] = f.status + if hasattr(f, "headers"): + result["headers"] = dict(f.headers) + if hasattr(f, "close"): f.close() # there are four encodings to keep track of: @@ -2586,27 +3258,30 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer # - xml_encoding is the encoding declared in the ; changed # project name -#2.5 - 7/25/2003 - MAP - changed to Python license (all contributors agree); +# 2.5 - 7/25/2003 - MAP - changed to Python license (all contributors agree); # removed unnecessary urllib code -- urllib2 should always be available anyway; # return actual url, status, and full HTTP headers (as result['url'], # result['status'], and result['headers']) if parsing a remote feed over HTTP -- # this should pass all the HTTP tests at ; # added the latest namespace-of-the-week for RSS 2.0 -#2.5.1 - 7/26/2003 - RMK - clear opener.addheaders so we only send our custom +# 2.5.1 - 7/26/2003 - RMK - clear opener.addheaders so we only send our custom # User-Agent (otherwise urllib2 sends two, which confuses some servers) -#2.5.2 - 7/28/2003 - MAP - entity-decode inline xml properly; added support for +# 2.5.2 - 7/28/2003 - MAP - entity-decode inline xml properly; added support for # inline and as used in some RSS 2.0 feeds -#2.5.3 - 8/6/2003 - TvdV - patch to track whether we're inside an image or +# 2.5.3 - 8/6/2003 - TvdV - patch to track whether we're inside an image or # textInput, and also to return the character encoding (if specified) -#2.6 - 1/1/2004 - MAP - dc:author support (MarekK); fixed bug tracking +# 2.6 - 1/1/2004 - MAP - dc:author support (MarekK); fixed bug tracking # nested divs within content (JohnD); fixed missing sys import (JohanS); # fixed regular expression to capture XML character encoding (Andrei); # added support for Atom 0.3-style links; fixed bug with textInput tracking; @@ -2780,7 +3460,7 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer # description, xhtml:body, content, content:encoded, title, subtitle, # summary, info, tagline, and copyright; added support for pingback and # trackback namespaces -#2.7 - 1/5/2004 - MAP - really added support for trackback and pingback +# 2.7 - 1/5/2004 - MAP - really added support for trackback and pingback # namespaces, as opposed to 2.6 when I said I did but didn't really; # sanitize 
HTML markup within some elements; added mxTidy support (if # installed) to tidy HTML markup within some elements; fixed indentation @@ -2789,66 +3469,66 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer # 'issued' are parsed into 9-tuple date format and stored in 'created_parsed', # 'modified_parsed', and 'issued_parsed'; 'date' is duplicated in 'modified' # and vice-versa; 'date_parsed' is duplicated in 'modified_parsed' and vice-versa -#2.7.1 - 1/9/2004 - MAP - fixed bug handling " and '. fixed memory +# 2.7.1 - 1/9/2004 - MAP - fixed bug handling " and '. fixed memory # leak not closing url opener (JohnD); added dc:publisher support (MarekK); # added admin:errorReportsTo support (MarekK); Python 2.1 dict support (MarekK) -#2.7.4 - 1/14/2004 - MAP - added workaround for improperly formed
<br/> tags in +# 2.7.4 - 1/14/2004 - MAP - added workaround for improperly formed <br/>
tags in # encoded HTML (skadz); fixed unicode handling in normalize_attrs (ChrisL); # fixed relative URI processing for guid (skadz); added ICBM support; added # base64 support -#2.7.5 - 1/15/2004 - MAP - added workaround for malformed DOCTYPE (seen on many +# 2.7.5 - 1/15/2004 - MAP - added workaround for malformed DOCTYPE (seen on many # blogspot.com sites); added _debug variable -#2.7.6 - 1/16/2004 - MAP - fixed bug with StringIO importing -#3.0b3 - 1/23/2004 - MAP - parse entire feed with real XML parser (if available); +# 2.7.6 - 1/16/2004 - MAP - fixed bug with StringIO importing +# 3.0b3 - 1/23/2004 - MAP - parse entire feed with real XML parser (if available); # added several new supported namespaces; fixed bug tracking naked markup in # description; added support for enclosure; added support for source; re-added # support for cloud which got dropped somehow; added support for expirationDate -#3.0b4 - 1/26/2004 - MAP - fixed xml:lang inheritance; fixed multiple bugs tracking +# 3.0b4 - 1/26/2004 - MAP - fixed xml:lang inheritance; fixed multiple bugs tracking # xml:base URI, one for documents that don't define one explicitly and one for # documents that define an outer and an inner xml:base that goes out of scope # before the end of the document -#3.0b5 - 1/26/2004 - MAP - fixed bug parsing multiple links at feed level -#3.0b6 - 1/27/2004 - MAP - added feed type and version detection, result['version'] +# 3.0b5 - 1/26/2004 - MAP - fixed bug parsing multiple links at feed level +# 3.0b6 - 1/27/2004 - MAP - added feed type and version detection, result['version'] # will be one of SUPPORTED_VERSIONS.keys() or empty string if unrecognized; # added support for creativeCommons:license and cc:license; added support for # full Atom content model in title, tagline, info, copyright, summary; fixed bug # with gzip encoding (not always telling server we support it when we do) -#3.0b7 - 1/28/2004 - MAP - support Atom-style author element in author_detail +# 3.0b7 - 1/28/2004 - MAP - support Atom-style author element in author_detail # (dictionary of 'name', 'url', 'email'); map author to author_detail if author # contains name + email address -#3.0b8 - 1/28/2004 - MAP - added support for contributor -#3.0b9 - 1/29/2004 - MAP - fixed check for presence of dict function; added +# 3.0b8 - 1/28/2004 - MAP - added support for contributor +# 3.0b9 - 1/29/2004 - MAP - fixed check for presence of dict function; added # support for summary -#3.0b10 - 1/31/2004 - MAP - incorporated ISO-8601 date parsing routines from +# 3.0b10 - 1/31/2004 - MAP - incorporated ISO-8601 date parsing routines from # xml.util.iso8601 -#3.0b11 - 2/2/2004 - MAP - added 'rights' to list of elements that can contain +# 3.0b11 - 2/2/2004 - MAP - added 'rights' to list of elements that can contain # dangerous markup; fiddled with decodeEntities (not right); liberalized # date parsing even further -#3.0b12 - 2/6/2004 - MAP - fiddled with decodeEntities (still not right); +# 3.0b12 - 2/6/2004 - MAP - fiddled with decodeEntities (still not right); # added support to Atom 0.2 subtitle; added support for Atom content model # in copyright; better sanitizing of dangerous HTML elements with end tags # (script, frameset) -#3.0b13 - 2/8/2004 - MAP - better handling of empty HTML tags (br, hr, img, +# 3.0b13 - 2/8/2004 - MAP - better handling of empty HTML tags (br, hr, img, # etc.) in embedded markup, in either HTML or XHTML form (
,
,
) -#3.0b14 - 2/8/2004 - MAP - fixed CDATA handling in non-wellformed feeds under +# 3.0b14 - 2/8/2004 - MAP - fixed CDATA handling in non-wellformed feeds under # Python 2.1 -#3.0b15 - 2/11/2004 - MAP - fixed bug resolving relative links in wfw:commentRSS; +# 3.0b15 - 2/11/2004 - MAP - fixed bug resolving relative links in wfw:commentRSS; # fixed bug capturing author and contributor URL; fixed bug resolving relative # links in author and contributor URL; fixed bug resolvin relative links in # generator URL; added support for recognizing RSS 1.0; passed Simon Fell's # namespace tests, and included them permanently in the test suite with his # permission; fixed namespace handling under Python 2.1 -#3.0b16 - 2/12/2004 - MAP - fixed support for RSS 0.90 (broken in b15) -#3.0b17 - 2/13/2004 - MAP - determine character encoding as per RFC 3023 -#3.0b18 - 2/17/2004 - MAP - always map description to summary_detail (Andrei); +# 3.0b16 - 2/12/2004 - MAP - fixed support for RSS 0.90 (broken in b15) +# 3.0b17 - 2/13/2004 - MAP - determine character encoding as per RFC 3023 +# 3.0b18 - 2/17/2004 - MAP - always map description to summary_detail (Andrei); # use libxml2 (if available) -#3.0b19 - 3/15/2004 - MAP - fixed bug exploding author information when author +# 3.0b19 - 3/15/2004 - MAP - fixed bug exploding author information when author # name was in parentheses; removed ultra-problematic mxTidy support; patch to # workaround crash in PyXML/expat when encountering invalid entities # (MarkMoraes); support for textinput/textInput -#3.0b20 - 4/7/2004 - MAP - added CDF support -#3.0b21 - 4/14/2004 - MAP - added Hot RSS support -#3.0b22 - 4/19/2004 - MAP - changed 'channel' to 'feed', 'item' to 'entries' in +# 3.0b20 - 4/7/2004 - MAP - added CDF support +# 3.0b21 - 4/14/2004 - MAP - added Hot RSS support +# 3.0b22 - 4/19/2004 - MAP - changed 'channel' to 'feed', 'item' to 'entries' in # results dict; changed results dict to allow getting values with results.key # as well as results[key]; work around embedded illformed HTML with half # a DOCTYPE; work around malformed Content-Type header; if character encoding @@ -2858,35 +3538,35 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer # from Unicode to raw strings before feeding data to sgmllib.SGMLParser; # convert each value in results to Unicode (if possible), even if using # regex-based parsing -#3.0b23 - 4/21/2004 - MAP - fixed UnicodeDecodeError for feeds that contain +# 3.0b23 - 4/21/2004 - MAP - fixed UnicodeDecodeError for feeds that contain # high-bit characters in attributes in embedded HTML in description (thanks # Thijs van de Vossen); moved guid, date, and date_parsed to mapped keys in # FeedParserDict; tweaked FeedParserDict.has_key to return True if asking # about a mapped key -#3.0fc1 - 4/23/2004 - MAP - made results.entries[0].links[0] and +# 3.0fc1 - 4/23/2004 - MAP - made results.entries[0].links[0] and # results.entries[0].enclosures[0] into FeedParserDict; fixed typo that could # cause the same encoding to be tried twice (even if it failed the first time); # fixed DOCTYPE stripping when DOCTYPE contained entity declarations; # better textinput and image tracking in illformed RSS 1.0 feeds -#3.0fc2 - 5/10/2004 - MAP - added and passed Sam's amp tests; added and passed +# 3.0fc2 - 5/10/2004 - MAP - added and passed Sam's amp tests; added and passed # my blink tag tests -#3.0fc3 - 6/18/2004 - MAP - fixed bug in _changeEncodingDeclaration that +# 3.0fc3 - 6/18/2004 - MAP - fixed bug in 
_changeEncodingDeclaration that # failed to parse utf-16 encoded feeds; made source into a FeedParserDict; # duplicate admin:generatorAgent/@rdf:resource in generator_detail.url; # added support for image; refactored parse() fallback logic to try other # encodings if SAX parsing fails (previously it would only try other encodings -# if re-encoding failed); remove unichr madness in normalize_attrs now that +# if re-encoding failed); remove chr madness in normalize_attrs now that # we're properly tracking encoding in and out of BaseHTMLProcessor; set # feed.language from root-level xml:lang; set entry.id from rdf:about; # send Accept header -#3.0 - 6/21/2004 - MAP - don't try iso-8859-1 (can't distinguish between +# 3.0 - 6/21/2004 - MAP - don't try iso-8859-1 (can't distinguish between # iso-8859-1 and windows-1252 anyway, and most incorrectly marked feeds are # windows-1252); fixed regression that could cause the same encoding to be # tried twice (even if it failed the first time) -#3.0.1 - 6/22/2004 - MAP - default to us-ascii for all text/* content types; +# 3.0.1 - 6/22/2004 - MAP - default to us-ascii for all text/* content types; # recover from malformed content-type header parameter with no equals sign # ('text/xml; charset:iso-8859-1') -#3.1 - 6/28/2004 - MAP - added and passed tests for converting HTML entities +# 3.1 - 6/28/2004 - MAP - added and passed tests for converting HTML entities # to Unicode equivalents in illformed feeds (aaronsw); added and # passed tests for converting character entities to Unicode equivalents # in illformed feeds (aaronsw); test for valid parsers when setting @@ -2896,7 +3576,7 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer # out of url and send as basic authentication; expose downloading-related # exceptions in bozo_exception (aaronsw); added __contains__ method to # FeedParserDict (aaronsw); added publisher_detail (aaronsw) -#3.2 - 7/3/2004 - MAP - use cjkcodecs and iconv_codec if available; always +# 3.2 - 7/3/2004 - MAP - use cjkcodecs and iconv_codec if available; always # convert feed to UTF-8 before passing to XML parser; completely revamped # logic for determining character encoding and attempting XML parsing # (much faster); increased default timeout to 20 seconds; test for presence @@ -2907,7 +3587,7 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer # XML parsers are available; added support for 'Content-encoding: deflate'; # send blank 'Accept-encoding: ' header if neither gzip nor zlib modules # are available -#3.3 - 7/15/2004 - MAP - optimize EBCDIC to ASCII conversion; fix obscure +# 3.3 - 7/15/2004 - MAP - optimize EBCDIC to ASCII conversion; fix obscure # problem tracking xml:base and xml:lang if element declares it, child # doesn't, first grandchild redeclares it, and second grandchild doesn't; # refactored date parsing; defined public registerDateHandler so callers @@ -2927,11 +3607,11 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer # redirecting to a URL that returns 304, redirecting to a URL that # redirects to another URL with a different type of redirect); add # support for HTTP 303 redirects -#4.0 - MAP - support for relative URIs in xml:base attribute; fixed +# 4.0 - MAP - support for relative URIs in xml:base attribute; fixed # encoding issue with mxTidy (phopkins); preliminary support for RFC 3229; # support for Atom 1.0; support for iTunes extensions; new 'tags' for # categories/keywords/etc. 
as array of dict # {'term': term, 'scheme': scheme, 'label': label} to match Atom 1.0 # terminology; parse RFC 822-style dates with no time; lots of other # bug fixes -#4.1 - MAP - removed socket timeout; added support for chardet library +# 4.1 - MAP - removed socket timeout; added support for chardet library diff --git a/code/planet/htmltmpl.py b/code/planet/htmltmpl.py index be6e41bb..2fff9abb 100644 --- a/code/planet/htmltmpl.py +++ b/code/planet/htmltmpl.py @@ -1,4 +1,3 @@ - """ A templating engine for separation of code and HTML. The documentation of this templating engine is separated to two parts: @@ -31,19 +30,20 @@ __version__ = 1.22 __author__ = "Tomas Styblo (tripie@cpan.org)" +import html +import pickle # All imported modules are part of the standard Python library. -from types import * import re import os import os.path -import pprint # only for debugging +import pprint # only for debugging import sys import copy -import cgi # for HTML escaping of variables -import urllib # for URL escaping of variables -import cPickle # for template compilation +import cgi # for HTML escaping of variables +import urllib # for URL escaping of variables import gettext +from urllib.parse import quote_plus INCLUDE_DIR = "inc" @@ -76,6 +76,7 @@ LOCK_SH = 2 LOCK_UN = 3 + ############################################## # CLASS: TemplateManager # ############################################## @@ -164,8 +165,8 @@ def __init__(self, include=1, max_include=5, precompile=1, comments=1, # multitask/thread safe. Currently it works only on UNIX # and Windows. Anyone willing to implement it on Mac ? if precompile and not LOCKTYPE: - raise TemplateError, "Template precompilation is not "\ - "available on this platform." + raise TemplateError("Template precompilation is not " \ + "available on this platform.") self.DEB("INIT DONE") def prepare(self, file): @@ -202,9 +203,9 @@ def prepare(self, file): if self.is_precompiled(file): try: precompiled = self.load_precompiled(file) - except PrecompiledError, template: - print >> sys.stderr, "Htmltmpl: bad precompiled "\ - "template '%s' removed" % template + except PrecompiledError as template: + print("Htmltmpl: bad precompiled " \ + "template '%s' removed" % template, file=sys.stderr) compiled = self.compile(file) self.save_precompiled(compiled) else: @@ -225,7 +226,7 @@ def prepare(self, file): self.DEB("PRECOMPILATION DISABLED") compiled = self.compile(file) return compiled - + def update(self, template): """ Update (recompile) a compiled template. @@ -258,7 +259,7 @@ def DEB(self, str): """ Print debugging message to stderr if debugging is enabled. @hidden """ - if self._debug: print >> sys.stderr, str + if self._debug: print(str, file=sys.stderr) def lock_file(self, file, lock): """ Provide platform independent file locking. @@ -273,7 +274,7 @@ def lock_file(self, file, lock): elif lock == LOCK_UN: fcntl.flock(fd, fcntl.LOCK_UN) else: - raise TemplateError, "BUG: bad lock in lock_file" + raise TemplateError("BUG: bad lock in lock_file") elif LOCKTYPE == LOCKTYPE_MSVCRT: if lock == LOCK_SH: # msvcrt does not support shared locks :-( @@ -283,9 +284,9 @@ def lock_file(self, file, lock): elif lock == LOCK_UN: msvcrt.locking(fd, msvcrt.LK_UNLCK, 1) else: - raise TemplateError, "BUG: bad lock in lock_file" + raise TemplateError("BUG: bad lock in lock_file") else: - raise TemplateError, "BUG: bad locktype in lock_file" + raise TemplateError("BUG: bad locktype in lock_file") def compile(self, file): """ Compile the template. 
@@ -294,19 +295,19 @@ def compile(self, file):
         return TemplateCompiler(self._include, self._max_include,
                                 self._comments, self._gettext,
                                 self._debug).compile(file)
-    
+
     def is_precompiled(self, file):
         """ Return true if the template is already precompiled on the disk.
             This method doesn't check whether the compiled template is
             uptodate.
             @hidden
         """
-        filename = file + "c"   # "template.tmplc"
+        filename = file + "c"  # "template.tmplc"
         if os.path.isfile(filename):
             return 1
         else:
             return 0
-    
+
     def load_precompiled(self, file):
         """ Load precompiled template from disk.
@@ -323,14 +324,14 @@ def load_precompiled(self, file):
         try:
             file = open(filename, "rb")
             self.lock_file(file, LOCK_SH)
-            precompiled = cPickle.load(file)
-        except IOError, (errno, errstr):
-            raise TemplateError, "IO error in load precompiled "\
-                                 "template '%s': (%d) %s"\
-                                 % (filename, errno, errstr)
-        except cPickle.UnpicklingError:
+            precompiled = pickle.load(file)
+        except IOError as e:
+            raise TemplateError("IO error in load precompiled " \
+                                "template '%s': (%d) %s" \
+                                % (filename, e.errno, e.strerror))
+        except pickle.UnpicklingError:
             remove_bad = 1
-            raise PrecompiledError, filename
+            raise PrecompiledError(filename)
         except:
             remove_bad = 1
             raise
@@ -343,7 +344,7 @@ def load_precompiled(self, file):
         if remove_bad and os.path.isfile(filename):
             # X: We may lose the original exception here, raising OSError.
             os.remove(filename)
-    
+
     def save_precompiled(self, template):
         """ Save compiled template to disk in precompiled form.
@@ -357,35 +358,35 @@ def save_precompiled(self, template):
             @hidden
         """
-        filename = template.file() + "c"   # creates "template.tmplc"
+        filename = template.file() + "c"  # creates "template.tmplc"
         # Check if we have write permission to the template's directory.
         template_dir = os.path.dirname(os.path.abspath(filename))
         if not os.access(template_dir, os.W_OK):
-            raise TemplateError, "Cannot save precompiled templates "\
-                                 "to '%s': write permission denied."\
-                                 % template_dir
+            raise TemplateError("Cannot save precompiled templates " \
+                                "to '%s': write permission denied." \
+                                % template_dir)
         try:
             remove_bad = 0
             file = None
             try:
-                file = open(filename, "wb")   # may truncate existing file
+                file = open(filename, "wb")  # may truncate existing file
                 self.lock_file(file, LOCK_EX)
                 BINARY = 1
                 READABLE = 0
                 if self._debug:
-                    cPickle.dump(template, file, READABLE)
+                    pickle.dump(template, file, READABLE)
                 else:
-                    cPickle.dump(template, file, BINARY)
-            except IOError, (errno, errstr):
+                    pickle.dump(template, file, BINARY)
+            except IOError as e:
                 remove_bad = 1
-                raise TemplateError, "IO error while saving precompiled "\
-                                     "template '%s': (%d) %s"\
-                                     % (filename, errno, errstr)
-            except cPickle.PicklingError, error:
+                raise TemplateError("IO error while saving precompiled " \
+                                    "template '%s': (%d) %s" \
+                                    % (filename, e.errno, e.strerror))
+            except pickle.PicklingError as error:
                 remove_bad = 1
-                raise TemplateError, "Pickling error while saving "\
-                                     "precompiled template '%s': %s"\
-                                     % (filename, error)
+                raise TemplateError("Pickling error while saving " \
+                                    "precompiled template '%s': %s" \
+                                    % (filename, error))
             except:
                 remove_bad = 1
                 raise
@@ -448,13 +449,13 @@ def __init__(self, html_escape=1, magic_vars=1, global_vars=0, debug=0):
         self._html_escape = html_escape
         self._magic_vars = magic_vars
         self._global_vars = global_vars
-        self._debug = debug 
+        self._debug = debug
 
         # Data structure containing variables and loops set by the
         # application. Use debug=1, process some template and
         # then check stderr to see how the structure looks.
         # It's modified only by set() and reset() methods.
-        self._vars = {} 
+        self._vars = {}
 
         # Following variables are for multipart templates.
         self._current_part = 1
@@ -490,17 +491,17 @@ def set(self, var, value):
         if self.is_ordinary_var(value):
             # template top-level ordinary variable
             if not var.islower():
-                raise TemplateError, "Invalid variable name '%s'." % var
-        elif type(value) == ListType:
+                raise TemplateError("Invalid variable name '%s'." % var)
+        elif type(value) == list:
             # template top-level loop
             if var != var.capitalize():
-                raise TemplateError, "Invalid loop name '%s'." % var
+                raise TemplateError("Invalid loop name '%s'." % var)
         else:
-            raise TemplateError, "Value of toplevel variable '%s' must "\
-                                 "be either a scalar or a list." % var
+            raise TemplateError("Value of toplevel variable '%s' must " \
+                                "be either a scalar or a list." % var)
         self._vars[var] = value
         self.DEB("VALUE SET: " + str(var))
-    
+
     def reset(self, keep_data=0):
         """ Reset the template data.
@@ -552,37 +553,37 @@ def process(self, template, part=None):
         self.DEB("APP INPUT:")
         if self._debug: pprint.pprint(self._vars, sys.stderr)
         if part != None and (part == 0 or part < self._current_part):
-            raise TemplateError, "process() - invalid part number"
+            raise TemplateError("process() - invalid part number")
 
         # This flag means "jump behind the end of current statement" or
         # "skip the parameters of current statement".
         # Even parameters that actually are not present in the template
        # do appear in the list of tokens as empty items !
-        skip_params = 0 
+        skip_params = 0
 
         # Stack for enabling or disabling output in response to TMPL_IF,
         # TMPL_UNLESS, TMPL_ELSE and TMPL_LOOPs with no passes.
         output_control = []
         ENABLE_OUTPUT = 1
         DISABLE_OUTPUT = 0
-        
+
         # Stacks for data related to loops.
-        loop_name = []        # name of a loop
-        loop_pass = []        # current pass of a loop (counted from zero)
-        loop_start = []       # index of loop start in token list
-        loop_total = []       # total number of passes in a loop
-        
+        loop_name = []  # name of a loop
+        loop_pass = []  # current pass of a loop (counted from zero)
+        loop_start = []  # index of loop start in token list
+        loop_total = []  # total number of passes in a loop
+
         tokens = template.tokens()
         len_tokens = len(tokens)
-        out = ""              # buffer for processed output
+        out = ""  # buffer for processed output
 
         # Recover position at which we ended after processing of last part.
         i = self._current_pos
-        
+
         # Process the list of tokens.
         while 1:
-            if i == len_tokens: break
-            if skip_params:   
+            if i == len_tokens: break
+            if skip_params:
                 # Skip the parameters following a statement.
                 skip_params = 0
                 i += PARAMS_NUMBER
@@ -590,16 +591,16 @@ def process(self, template, part=None):
 
             token = tokens[i]
             if token.startswith("<TMPL_VAR"):
                 var = tokens[i + PARAM_NAME]
                 if not var:
-                    raise TemplateError, "No identifier in <TMPL_VAR>."
+                    raise TemplateError("No identifier in <TMPL_VAR>.")
                 escape = tokens[i + PARAM_ESCAPE]
                 globalp = tokens[i + PARAM_GLOBAL]
                 skip_params = 1
-                
+
                 # If output of current block is not disabled then append
                 # the substitued and escaped variable to the output.
                 if DISABLE_OUTPUT not in output_control:
@@ -611,7 +612,7 @@ def process(self, template, part=None):
             elif token == "<TMPL_LOOP":
                 var = tokens[i + PARAM_NAME]
                 if not var:
-                    raise TemplateError, "No identifier in <TMPL_LOOP>."
+                    raise TemplateError("No identifier in <TMPL_LOOP>.")
                 skip_params = 1
 
                 # Find total number of passes in this loop.
@@ -632,13 +633,13 @@ def process(self, template, part=None):
                     self.DEB("LOOP: DISABLE: " + str(var))
                 else:
                     output_control.append(ENABLE_OUTPUT)
-                    self.DEB("LOOP: FIRST PASS: %s TOTAL: %d"\
+                    self.DEB("LOOP: FIRST PASS: %s TOTAL: %d" \
                             % (var, passtotal))
 
             elif token == "<TMPL_IF":
                 var = tokens[i + PARAM_NAME]
                 if not var:
-                    raise TemplateError, "No identifier in <TMPL_IF>."
+                    raise TemplateError("No identifier in <TMPL_IF>.")
                 globalp = tokens[i + PARAM_GLOBAL]
                 skip_params = 1
                 if self.find_value(var, loop_name, loop_pass,
@@ -648,29 +649,29 @@ def process(self, template, part=None):
                 else:
                     output_control.append(DISABLE_OUTPUT)
                     self.DEB("IF: DISABLE: " + str(var))
-            
+
             elif token == "<TMPL_UNLESS":
                 var = tokens[i + PARAM_NAME]
                 if not var:
-                    raise TemplateError, "No identifier in <TMPL_UNLESS>."
+                    raise TemplateError("No identifier in <TMPL_UNLESS>.")
                 globalp = tokens[i + PARAM_GLOBAL]
                 skip_params = 1
                 if self.find_value(var, loop_name, loop_pass,
-                                loop_total, globalp):
+                                   loop_total, globalp):
                     output_control.append(DISABLE_OUTPUT)
                     self.DEB("UNLESS: DISABLE: " + str(var))
                 else:
                     output_control.append(ENABLE_OUTPUT)
                     self.DEB("UNLESS: ENABLE: " + str(var))
-            
+
             elif token == "</TMPL_LOOP":
                 skip_params = 1
                 if not loop_name:
-                    raise TemplateError, "Unmatched </TMPL_LOOP>."
-                
+                    raise TemplateError("Unmatched </TMPL_LOOP>.")
+
                 # If this loop was not disabled, then record the pass.
                 if loop_total[-1] > 0:
                     loop_pass[-1] += 1
-                
+
                 if loop_pass[-1] == loop_total[-1]:
                     # There are no more passes in this loop. Pop
                     # the loop from stack.
@@ -685,25 +686,25 @@ def process(self, template, part=None):
                     # to process next pass of the loop.
                     i = loop_start[-1]
                     self.DEB("LOOP: NEXT PASS")
-            
+
             elif token == "</TMPL_IF":
                 skip_params = 1
                 if not output_control:
-                    raise TemplateError, "Unmatched </TMPL_IF>."
+                    raise TemplateError("Unmatched </TMPL_IF>.")
                 output_control.pop()
                 self.DEB("IF: END")
-            
+
             elif token == "</TMPL_UNLESS":
                 skip_params = 1
                 if not output_control:
-                    raise TemplateError, "Unmatched </TMPL_UNLESS>."
+                    raise TemplateError("Unmatched </TMPL_UNLESS>.")
                 output_control.pop()
                 self.DEB("UNLESS: END")
-            
+
             elif token == "<TMPL_ELSE":
                 skip_params = 1
                 if not output_control:
-                    raise TemplateError, "Unmatched <TMPL_ELSE>."
+                    raise TemplateError("Unmatched <TMPL_ELSE>.")
                 if output_control[-1] == DISABLE_OUTPUT:
                     # Condition was false, activate the ELSE block.
                     output_control[-1] = ENABLE_OUTPUT
@@ -713,7 +714,7 @@ def process(self, template, part=None):
                     output_control[-1] = DISABLE_OUTPUT
                     self.DEB("ELSE: DISABLE")
                 else:
-                    raise TemplateError, "BUG: ELSE: INVALID FLAG"
+                    raise TemplateError("BUG: ELSE: INVALID FLAG")
 
             elif token == "<TMPL_INCLUDE":
-                raise TemplateError, "Invalid statement %s>." % token
-                
+                raise TemplateError("Invalid statement %s>." % token)
+
             elif DISABLE_OUTPUT not in output_control:
                 # Raw textual template data.
                 # If output of current block is not disabled, then
                 # append template data to the output buffer.
                 out += token
-                
+
             i += 1
             # end of the big while loop
-        
+
         # Check whether all opening statements were closed.
-        if loop_name: raise TemplateError, "Missing </TMPL_LOOP>."
-        if output_control: raise TemplateError, "Missing </TMPL_IF> or </TMPL_UNLESS>"
+        if loop_name: raise TemplateError("Missing </TMPL_LOOP>.")
+        if output_control: raise TemplateError("Missing </TMPL_IF> or </TMPL_UNLESS>")
 
         return out
 
 ##############################################
@@ -774,7 +775,7 @@ def DEB(self, str):
         """ Print debugging message to stderr if debugging is enabled.
            @hidden
        """
-        if self._debug: print >> sys.stderr, str
+        if self._debug: print(str, file=sys.stderr)
 
     def find_value(self, var, loop_name, loop_pass, loop_total,
                    global_override=None):
@@ -796,35 +797,35 @@ def find_value(self, var, loop_name, loop_pass, loop_total,
         # of the variable starts with "__" and if we are inside a loop.
         if self._magic_vars and var.startswith("__") and loop_name:
             return self.magic_var(var, loop_pass[-1], loop_total[-1])
-        
+
         # Search for an ordinary variable or for a loop.
         # Recursively search in self._vars for the requested variable.
         scope = self._vars
         globals = []
-        for i in range(len(loop_name)):            
+        for i in range(len(loop_name)):
             # If global lookup is on then push the value on the stack.
             if ((self._global_vars and global_override != "0") or \
-                 global_override == "1") and scope.has_key(var) and \
-                self.is_ordinary_var(scope[var]):
+                global_override == "1") and var in scope and \
+               self.is_ordinary_var(scope[var]):
                 globals.append(scope[var])
-            
+
             # Descent deeper into the hierarchy.
-            if scope.has_key(loop_name[i]) and scope[loop_name[i]]:
+            if loop_name[i] in scope and scope[loop_name[i]]:
                 scope = scope[loop_name[i]][loop_pass[i]]
             else:
                 return ""
-            
-        if scope.has_key(var):
+
+        if var in scope:
             # Value exists in current loop.
-            if type(scope[var]) == ListType:
+            if type(scope[var]) == list:
                 # The requested value is a loop.
                 # Return total number of its passes.
                 return len(scope[var])
             else:
                 return scope[var]
         elif globals and \
-             ((self._global_vars and global_override != "0") or \
-               global_override == "1"):
+            ((self._global_vars and global_override != "0") or \
+             global_override == "1"):
             # Return globally looked up value.
             return globals.pop()
         else:
@@ -843,7 +844,7 @@ def magic_var(self, var, loop_pass, loop_total):
             @hidden
         """
-        self.DEB("MAGIC: '%s', PASS: %d, TOTAL: %d"\
+        self.DEB("MAGIC: '%s', PASS: %d, TOTAL: %d" \
                 % (var, loop_pass, loop_total))
         if var == "__FIRST__":
             if loop_pass == 0:
@@ -860,7 +861,7 @@ def magic_var(self, var, loop_pass, loop_total):
             if loop_pass != 0 and loop_pass != loop_total - 1:
                 return 1
             else:
-                return 0                
+                return 0
         elif var == "__PASS__":
             # Magic variable __PASS__ counts passes from one.
             return loop_pass + 1
@@ -880,14 +881,14 @@ def magic_var(self, var, loop_pass, loop_total):
             if loop_pass != 0 and loop_pass != loop_total - 1:
                 # Check if an integer follows the variable name.
                 try:
-                    every = int(var[9:])   # nine is length of "__EVERY__"
+                    every = int(var[9:])  # nine is length of "__EVERY__"
                 except ValueError:
-                    raise TemplateError, "Magic variable __EVERY__x: "\
-                                         "Invalid pass number."
+                    raise TemplateError("Magic variable __EVERY__x: " \
+                                        "Invalid pass number.")
                 else:
                     if not every:
-                        raise TemplateError, "Magic variable __EVERY__x: "\
-                                             "Pass number cannot be zero."
+                        raise TemplateError("Magic variable __EVERY__x: " \
+                                            "Pass number cannot be zero.")
                     elif (loop_pass + 1) % every == 0:
                         self.DEB("MAGIC: EVERY: " + str(every))
                         return 1
@@ -896,30 +897,23 @@ def magic_var(self, var, loop_pass, loop_total):
             else:
                 return 0
         else:
-            raise TemplateError, "Invalid magic variable '%s'." % var
+            raise TemplateError("Invalid magic variable '%s'." % var)
 
-    def escape(self, str, override=""):
+    def escape(self, str_value, override=""):
         """ Escape a string either by HTML escaping or by URL escaping.
             @hidden
         """
-        ESCAPE_QUOTES = 1
-        if (self._html_escape and override != "NONE" and override != "0" and \
-            override != "URL") or override == "HTML" or override == "1":
-            return cgi.escape(str, ESCAPE_QUOTES)
-        elif override == "URL":
-            return urllib.quote_plus(str)
-        else:
-            return str
+        if (self._html_escape and override not in ("NONE", "0", "URL")) or override in ("HTML", "1"):
+            return html.escape(str(str_value), quote=True)
+        if override == "URL":
+            return quote_plus(str(str_value))
+        return str_value
 
     def is_ordinary_var(self, var):
         """ Return true if var is a scalar. (not a reference to loop)
             @hidden
         """
-        if type(var) == StringType or type(var) == IntType or \
-           type(var) == LongType or type(var) == FloatType:
-            return 1
-        else:
-            return 0
+        return isinstance(var, (str, int, float))
 
 
 ##############################################
@@ -954,13 +948,13 @@ def __init__(self, include=1, max_include=5, comments=1, gettext=0,
             @param gettext Enable or disable gettext support.
             @param debug Enable or disable debugging messages.
         """
-        
+
         self._include = include
         self._max_include = max_include
         self._comments = comments
         self._gettext = gettext
         self._debug = debug
-        
+
         # This is a list of filenames of all included templates.
         # It's modified by the include_templates() method.
         self._include_files = []
@@ -968,7 +962,7 @@ def __init__(self, include=1, max_include=5, comments=1, gettext=0,
         # This is a counter of current inclusion depth. It's used to prevent
         # infinite recursive includes.
         self._include_level = 0
-        
+
     def compile(self, file):
         """ Compile template from a file.
@@ -981,7 +975,7 @@ def compile(self, file):
             See the prepare() method of the TemplateManager class for
             exaplanation of this parameter.
         """
-        
+
         self.DEB("COMPILING FROM FILE: " + file)
         self._include_path = os.path.join(os.path.dirname(file), INCLUDE_DIR)
         tokens = self.parse(self.read(file))
@@ -1015,13 +1009,13 @@ def compile_string(self, data):
     ##############################################
     #              PRIVATE METHODS               #
     ##############################################
-    
+
     def DEB(self, str):
         """ Print debugging message to stderr if debugging is enabled.
             @hidden
         """
-        if self._debug: print >> sys.stderr, str
-    
+        if self._debug: print(str, file=sys.stderr)
+
     def read(self, filename):
         """ Read content of file and return it. Raise an error if a problem
             occurs.
@@ -1033,14 +1027,14 @@ def read(self, filename):
         try:
             f = open(filename, "r")
             data = f.read()
-        except IOError, (errno, errstr):
-            raise TemplateError, "IO error while reading template '%s': "\
-                                 "(%d) %s" % (filename, errno, errstr)
+        except IOError as e:
+            raise TemplateError("IO error while reading template '%s': " \
+                                "(%d) %s" % (filename, e.errno, e.strerror))
         else:
             return data
         finally:
             if f: f.close()
-        
+
     def parse(self, template_data):
         """ Parse the template. This method is recursively called from
             within the include_templates() method.
@@ -1063,7 +1057,7 @@ def remove_comments(self, template_data):
         """
         pattern = r"### .*"
         return re.sub(pattern, "", template_data)
-    
+
     def include_templates(self, tokens):
         """ Process TMPL_INCLUDE statements. Use the include_level counter
             to prevent infinite recursion. Record paths to all included
@@ -1071,9 +1065,9 @@ def include_templates(self, tokens):
             @hidden
         """
         i = 0
-        out = ""    # buffer for output
+        out = ""  # buffer for output
         skip_params = 0
-        
+
         # Process the list of tokens.
         while 1:
             if i == len(tokens): break
@@ -1086,7 +1080,7 @@ def include_templates(self, tokens):
             if token == "<TMPL_INCLUDE":
                 filename = tokens[i + PARAM_NAME]
                 if not filename:
-                    raise TemplateError, "No filename in <TMPL_INCLUDE>."
+                    raise TemplateError("No filename in <TMPL_INCLUDE>.")
                 self._include_level += 1
                 if self._include_level > self._max_include:
                     # Do not include the template.
@@ -1104,16 +1098,16 @@ def include_templates(self, tokens):
                     # Append the tokens from the included template to actual
                     # position in the tokens list, replacing the TMPL_INCLUDE
                     # token and its parameters.
-                    tokens[i:i+PARAMS_NUMBER+1] = include_tokens
+                    tokens[i:i + PARAMS_NUMBER + 1] = include_tokens
                     i = i + len(include_tokens)
                     self.DEB("INCLUDED: " + filename)
-                    continue   # Do not increment 'i' below.
+                    continue  # Do not increment 'i' below.
 
             i += 1
             # end of the main while loop
 
         if self._include_level > 0: self._include_level -= 1
         return out
-    
+
     def tokenize(self, template_data):
         """ Split the template into tokens separated by template statements.
             The statements itself and associated parameters are also
@@ -1139,9 +1133,9 @@ def tokenize(self, template_data):
         tokens = []
         for statement in split:
             if statement.startswith("<TMPL_
            is_uptodate method.
        """
-
     def __init__(self, version, file, include_files, tokens, compile_params,
                  debug=0):
         """ Constructor.
@@ -1318,7 +1312,7 @@ def __init__(self, version, file, include_files, tokens, compile_params,
         self._tokens = tokens
         self._compile_params = compile_params
         self._debug = debug
-        self._mtime = None        
+        self._mtime = None
         self._include_mtimes = {}
 
         if not file:
@@ -1329,16 +1323,16 @@ def __init__(self, version, file, include_files, tokens, compile_params,
         if os.path.isfile(file):
             self._mtime = os.path.getmtime(file)
         else:
-            raise TemplateError, "Template: file does not exist: '%s'" % file
+            raise TemplateError("Template: file does not exist: '%s'" % file)
 
         # Save modificaton times of all included template files.
         for inc_file in include_files:
             if os.path.isfile(inc_file):
                 self._include_mtimes[inc_file] = os.path.getmtime(inc_file)
             else:
-                raise TemplateError, "Template: file does not exist: '%s'"\
-                                     % inc_file
-            
+                raise TemplateError("Template: file does not exist: '%s'" \
+                                    % inc_file)
+
         self.DEB("NEW TEMPLATE CREATED")
 
     def is_uptodate(self, compile_params=None):
@@ -1361,7 +1355,7 @@ def is_uptodate(self, compile_params=None):
         if not self._file:
             self.DEB("TEMPLATE COMPILED FROM A STRING")
             return 0
-        
+
         if self._version != __version__:
             self.DEB("TEMPLATE: VERSION NOT UPTODATE")
             return 0
@@ -1369,7 +1363,7 @@ def is_uptodate(self, compile_params=None):
         if compile_params != None and compile_params != self._compile_params:
             self.DEB("TEMPLATE: DIFFERENT COMPILATION PARAMS")
             return 0
-        
+
         # Check modification times of the main template and all included
         # templates. If the included template no longer exists, then
         # the problem will be resolved when the template is recompiled.
@@ -1378,9 +1372,9 @@ def is_uptodate(self, compile_params=None):
         if not (os.path.isfile(self._file) and \
                 self._mtime == os.path.getmtime(self._file)):
             self.DEB("TEMPLATE: NOT UPTODATE: " + self._file)
-            return 0        
+            return 0
 
-        # Included templates. 
+        # Included templates.
         for inc_file in self._include_mtimes.keys():
             if not (os.path.isfile(inc_file) and \
                     self._include_mtimes[inc_file] == \
@@ -1389,8 +1383,8 @@ def is_uptodate(self, compile_params=None):
                 return 0
         else:
             self.DEB("TEMPLATE: UPTODATE")
-            return 1        
-    
+            return 1
+
     def tokens(self):
         """ Get tokens of this template.
             @hidden
@@ -1430,12 +1424,11 @@ def __setstate__(self, dict):
         dict["_debug"] = 0
         self.__dict__ = dict
 
-
     def DEB(self, str):
         """ Print debugging message to stderr.
             @hidden
         """
-        if self._debug: print >> sys.stderr, str
+        if self._debug: print(str, file=sys.stderr)
 
 
 ##############################################
@@ -1477,4 +1470,3 @@ def __init__(self, template):
             @hidden
         """
         Exception.__init__(self, template)
-
diff --git a/code/planet/sanitize.py b/code/planet/sanitize.py
index c98b14de..b025de57 100644
--- a/code/planet/sanitize.py
+++ b/code/planet/sanitize.py
@@ -1,13 +1,15 @@
-"""
-sanitize: bringing sanitiy to world of messed-up data
+"""sanitize: bringing sanity to world of messed-up data
+
+TODO: py2->3
 """
 
-__author__ = ["Mark Pilgrim <f8dy@feedparser.org>",
-              "Aaron Swartz <me@aaronsw.com>"]
+__author__ = ["Mark Pilgrim <f8dy@feedparser.org>", "Aaron Swartz <me@aaronsw.com>"]
 __contributors__ = ["Sam Ruby <rubys@intertwingly.net>"]
 __license__ = "BSD"
 __version__ = "0.25"
 
+import sys
+
 _debug = 0
 
 # If you want sanitize to automatically run HTML markup through HTML Tidy, set
 # this to 1. Requires mxTidy <http://www.egenix.com/files/python/mxTidy.html>
 # or utidylib <http://utidylib.berlios.de/>.
 TIDY_MARKUP = 0
 
 # List of Python interfaces for HTML Tidy, in order of preference. Only useful
 # if TIDY_MARKUP = 1
 PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"]
 
-import sgmllib, re
+import re
+from html.parser import HTMLParser
 
 # chardet library auto-detects character encodings
 # Download from http://chardet.feedparser.org/
 try:
     import chardet
+
     if _debug:
         import chardet.constants
+
         chardet.constants._debug = 1
 
-    _chardet = lambda data: chardet.detect(data)['encoding']
+    _chardet = lambda data: chardet.detect(data)["encoding"]
 except:
     chardet = None
     _chardet = lambda data: None
 
-class _BaseHTMLProcessor(sgmllib.SGMLParser):
-    elements_no_end_tag = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
-      'img', 'input', 'isindex', 'link', 'meta', 'param']
-
-    _r_barebang = re.compile(r'<!((?!DOCTYPE|--|\[)[^>]*>)')
-
+
+class _BaseHTMLProcessor(HTMLParser):
+    elements_no_end_tag = [
+        "area",
+        "base",
+        "basefont",
+        "br",
+        "col",
+        "frame",
+        "hr",
+        "img",
+        "input",
+        "isindex",
+        "link",
+        "meta",
+        "param",
+    ]
+
+    _r_barebang = re.compile(r"<!((?!DOCTYPE|--|\[)[^>]*>)")
+    _r_bareamp = re.compile(r"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)")
+    _r_shorttag = re.compile(r"<([^<\s]+?)\s*/>")
+
     def __init__(self, encoding):
         self.encoding = encoding
-        if _debug: sys.stderr.write('entering BaseHTMLProcessor, encoding=%s\n' % self.encoding)
-        sgmllib.SGMLParser.__init__(self)
-
+        if _debug:
+            sys.stderr.write(f"entering BaseHTMLProcessor, encoding={self.encoding}\n")
+        super().__init__(convert_charrefs=False)
+
     def reset(self):
         self.pieces = []
-        sgmllib.SGMLParser.reset(self)
+        super().reset()
+
+    # html.parser dispatches to handle_starttag/handle_endtag; route these to
+    # the sgmllib-style unknown_* hooks this class and its subclasses define.
+    def handle_starttag(self, tag, attrs):
+        self.unknown_starttag(tag, attrs)
+
+    def handle_endtag(self, tag):
+        self.unknown_endtag(tag)
 
     def _shorttag_replace(self, match):
         tag = match.group(1)
         if tag in self.elements_no_end_tag:
-            return '<' + tag + ' />'
+            return "<" + tag + " />"
         else:
-            return '<' + tag + '>'
-
+            return "<" + tag + ">"
+
     def feed(self, data):
-        data = self._r_barebang.sub(r'&lt;!\1', data)
+        data = self._r_barebang.sub(r"&lt;!\1", data)
         data = self._r_bareamp.sub("&amp;", data)
-        data = self._r_shorttag.sub(self._shorttag_replace, data) 
-        if self.encoding and type(data) == type(u''):
-            data = data.encode(self.encoding)
-        sgmllib.SGMLParser.feed(self, data)
+        data = self._r_shorttag.sub(self._shorttag_replace, data)
+        if self.encoding and isinstance(data, bytes):
+            data = data.decode(self.encoding)
+        super().feed(data)
 
     def normalize_attrs(self, attrs):
         # utility method to be called by descendants
         attrs = [(k.lower(), v) for k, v in attrs]
-        attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
+        attrs = [(k, k in ("rel", "type") and v.lower() or v) for k, v in attrs]
         return attrs
 
     def unknown_starttag(self, tag, attrs):
         # called for each start tag
         # attrs is a list of (attr, value) tuples
         # e.g. for <pre class='screen'>, tag='pre', attrs=[('class', 'screen')]
-        if _debug: sys.stderr.write('_BaseHTMLProcessor, unknown_starttag, tag=%s\n' % tag)
+        if _debug:
+            sys.stderr.write("_BaseHTMLProcessor, unknown_starttag, tag=%s\n" % tag)
         uattrs = []
         # thanks to Kevin Marks for this breathtaking hack to deal with (valid) high-bit attribute values in UTF-8 feeds
         for key, value in attrs:
-            if type(value) != type(u''):
-                value = unicode(value, self.encoding)
-            uattrs.append((unicode(key, self.encoding), value))
-        strattrs = u''.join([u' %s="%s"' % (key, value) for key, value in uattrs]).encode(self.encoding)
+            if not isinstance(value, str):
+                value = str(value, self.encoding)
+            if not isinstance(key, str):
+                key = str(key, self.encoding)
+            uattrs.append((key, value))
+        strattrs = "".join([f' {key}="{value}"' for key, value in uattrs])
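+        # Keeping strattrs as str matters here: encoding it to bytes would
+        # make the "%(strattrs)s" substitutions below emit a b'...' repr.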
         if tag in self.elements_no_end_tag:
-            self.pieces.append('<%(tag)s%(strattrs)s />' % locals())
+            self.pieces.append("<%(tag)s%(strattrs)s />" % locals())
         else:
-            self.pieces.append('<%(tag)s%(strattrs)s>' % locals())
+            self.pieces.append("<%(tag)s%(strattrs)s>" % locals())
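+        # For example (hypothetical input), feeding '<img src="x.png">'
+        # re-emits '<img src="x.png" />', since img has no end tag.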
 
     def unknown_endtag(self, tag):
         # called for each end tag, e.g. for </pre>, tag will be 'pre'
@@ -98,38 +119,40 @@ def unknown_endtag(self, tag):
     def handle_charref(self, ref):
         # called for each character reference, e.g. for '&#160;', ref will be '160'
         # Reconstruct the original character reference.
-        self.pieces.append('&#%(ref)s;' % locals())
-        
+        self.pieces.append("&#%(ref)s;" % locals())
+
     def handle_entityref(self, ref):
         # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
         # Reconstruct the original entity reference.
-        self.pieces.append('&%(ref)s;' % locals())
+        self.pieces.append("&%(ref)s;" % locals())
 
     def handle_data(self, text):
         # called for each block of plain text, i.e. outside of any tag and
         # not containing any character or entity references
         # Store the original text verbatim.
-        if _debug: sys.stderr.write('_BaseHTMLProcessor, handle_text, text=%s\n' % text)
+        if _debug:
+            sys.stderr.write("_BaseHTMLProcessor, handle_text, text=%s\n" % text)
         self.pieces.append(text)
-        
+
     def handle_comment(self, text):
         # called for each HTML comment, e.g. <!-- insert Python code here -->
         # Reconstruct the original comment.
-        self.pieces.append('<!--%(text)s-->' % locals())
-        
+        self.pieces.append("<!--%(text)s-->" % locals())
+
     def handle_pi(self, text):
         # called for each processing instruction, e.g. <?instruction>
         # Reconstruct original processing instruction.
-        self.pieces.append('<?%(text)s>' % locals())
+        self.pieces.append("<?%(text)s>" % locals())
 
     def handle_decl(self, text):
         # called for the DOCTYPE, if present, e.g.
         # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
         #     "http://www.w3.org/TR/html4/loose.dtd">
         # Reconstruct original DOCTYPE
-        self.pieces.append('<!%(text)s>' % locals())
-
-    _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match
+        self.pieces.append("<!%(text)s>" % locals())
+
+    _new_declname_match = re.compile(r"[a-zA-Z][-_.a-zA-Z0-9:]*\s*").match
+
     def _scan_name(self, i, declstartpos):
         rawdata = self.rawdata
         n = len(rawdata)
@@ -144,36 +167,165 @@ def _scan_name(self, i, declstartpos):
             return name.lower(), m.end()
         else:
             self.handle_data(rawdata)
-#            self.updatepos(declstartpos, i)
+            # self.updatepos(declstartpos, i)
             return None, -1
 
     def output(self):
-        '''Return processed HTML as a single string'''
-        return ''.join([str(p) for p in self.pieces])
+        """Return processed HTML as a single string"""
+        return "".join([str(p) for p in self.pieces])
+
 
 class _HTMLSanitizer(_BaseHTMLProcessor):
-    acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b', 'big',
-      'blockquote', 'br', 'button', 'caption', 'center', 'cite', 'code', 'col',
-      'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt', 'em', 'fieldset',
-      'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input',
-      'ins', 'kbd', 'label', 'legend', 'li', 'map', 'menu', 'ol', 'optgroup',
-      'option', 'p', 'pre', 'q', 's', 'samp', 'select', 'small', 'span', 'strike',
-      'strong', 'sub', 'sup', 'table', 'textarea', 'tbody', 'td', 'tfoot', 'th',
-      'thead', 'tr', 'tt', 'u', 'ul', 'var']
-
-    acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
-      'action', 'align', 'alt', 'axis', 'border', 'cellpadding', 'cellspacing',
-      'char', 'charoff', 'charset', 'checked', 'cite', 'class', 'clear', 'cols',
-      'colspan', 'color', 'compact', 'coords', 'datetime', 'dir', 'disabled',
-      'enctype', 'for', 'frame', 'headers', 'height', 'href', 'hreflang', 'hspace',
-      'id', 'ismap', 'label', 'lang', 'longdesc', 'maxlength', 'media', 'method',
-      'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly',
-      'rel', 'rev', 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
-      'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title', 'type',
-      'usemap', 'valign', 'value', 'vspace', 'width']
-
-    ignorable_elements = ['script', 'applet', 'style']
-    
+    acceptable_elements = [
+        "a",
+        "abbr",
+        "acronym",
+        "address",
+        "area",
+        "b",
+        "big",
+        "blockquote",
+        "br",
+        "button",
+        "caption",
+        "center",
+        "cite",
+        "code",
+        "col",
+        "colgroup",
+        "dd",
+        "del",
+        "dfn",
+        "dir",
+        "div",
+        "dl",
+        "dt",
+        "em",
+        "fieldset",
+        "font",
+        "form",
+        "h1",
+        "h2",
+        "h3",
+        "h4",
+        "h5",
+        "h6",
+        "hr",
+        "i",
+        "img",
+        "input",
+        "ins",
+        "kbd",
+        "label",
+        "legend",
+        "li",
+        "map",
+        "menu",
+        "ol",
+        "optgroup",
+        "option",
+        "p",
+        "pre",
+        "q",
+        "s",
+        "samp",
+        "select",
+        "small",
+        "span",
+        "strike",
+        "strong",
+        "sub",
+        "sup",
+        "table",
+        "textarea",
+        "tbody",
+        "td",
+        "tfoot",
+        "th",
+        "thead",
+        "tr",
+        "tt",
+        "u",
+        "ul",
+        "var",
+    ]
+
+    acceptable_attributes = [
+        "abbr",
+        "accept",
+        "accept-charset",
+        "accesskey",
+        "action",
+        "align",
+        "alt",
+        "axis",
+        "border",
+        "cellpadding",
+        "cellspacing",
+        "char",
+        "charoff",
+        "charset",
+        "checked",
+        "cite",
+        "class",
+        "clear",
+        "cols",
+        "colspan",
+        "color",
+        "compact",
+        "coords",
+        "datetime",
+        "dir",
+        "disabled",
+        "enctype",
+        "for",
+        "frame",
+        "headers",
+        "height",
+        "href",
+        "hreflang",
+        "hspace",
+        "id",
+        "ismap",
+        "label",
+        "lang",
+        "longdesc",
+        "maxlength",
+        "media",
+        "method",
+        "multiple",
+        "name",
+        "nohref",
+        "noshade",
+        "nowrap",
+        "prompt",
+        "readonly",
+        "rel",
+        "rev",
+        "rows",
+        "rowspan",
+        "rules",
+        "scope",
+        "selected",
+        "shape",
+        "size",
+        "span",
+        "src",
+        "start",
+        "summary",
+        "tabindex",
+        "target",
+        "title",
+        "type",
+        "usemap",
+        "valign",
+        "value",
+        "vspace",
+        "width",
+    ]
+
+    ignorable_elements = ["script", "applet", "style"]
+
     def reset(self):
         _BaseHTMLProcessor.reset(self)
         self.tag_stack = []
         self.ignore_level = 0
@@ -183,30 +335,30 @@ def feed(self, data):
         _BaseHTMLProcessor.feed(self, data)
         while self.tag_stack:
             _BaseHTMLProcessor.unknown_endtag(self, self.tag_stack.pop())
-        
+
     def unknown_starttag(self, tag, attrs):
         if tag in self.ignorable_elements:
             self.ignore_level += 1
             return
-        
+
         if self.ignore_level:
             return
-        
+
         if tag in self.acceptable_elements:
             attrs = self.normalize_attrs(attrs)
             attrs = [(key, value) for key, value in attrs if key in self.acceptable_attributes]
             if tag not in self.elements_no_end_tag:
                 self.tag_stack.append(tag)
             _BaseHTMLProcessor.unknown_starttag(self, tag, attrs)
-        
+
     def unknown_endtag(self, tag):
         if tag in self.ignorable_elements:
             self.ignore_level -= 1
             return
-        
+
         if self.ignore_level:
             return
-        
+
         if tag in self.acceptable_elements and tag not in self.elements_no_end_tag:
             match = False
             while self.tag_stack:
@@ -227,10 +379,12 @@ def handle_decl(self, text):
 
     def handle_data(self, text):
         if not self.ignore_level:
-            text = text.replace('<', '')
+            text = text.replace("<", "")
             _BaseHTMLProcessor.handle_data(self, text)
 
-def HTML(htmlSource, encoding='utf8'):
+
+# TODO(py2to3): we need to replace `mx` and `tidy` here
+def HTML(htmlSource, encoding="utf8"):
     p = _HTMLSanitizer(encoding)
     p.feed(htmlSource)
     data = p.output()
@@ -242,113 +396,364 @@
             try:
                 if tidy_interface == "uTidy":
                     from tidy import parseString as _utidy
+
                     def _tidy(data, **kwargs):
                         return str(_utidy(data, **kwargs))
+
                     break
                 elif tidy_interface == "mxTidy":
                     from mx.Tidy import Tidy as _mxtidy
+
                     def _tidy(data, **kwargs):
                         nerrors, nwarnings, data, errordata = _mxtidy.tidy(data, **kwargs)
                         return data
+
                     break
             except:
                 pass
         if _tidy:
-            utf8 = type(data) == type(u'')
+            utf8 = type(data) == str
             if utf8:
-                data = data.encode('utf-8')
-            data = _tidy(data, output_xhtml=1, numeric_entities=1, wrap=0, char_encoding="utf8")
+                data = data.encode("utf-8")
+            data = _tidy(data, output_xhtml=1, numeric_entities=1, wrap=0, char_encoding="utf8")
             if utf8:
-                data = unicode(data, 'utf-8')
-            if data.count('<body'):
-                data = data.split('<body', 1)[1]
-                if data.count('>'):
-                    data = data.split('>', 1)[1]
-            if data.count('</body'):
-                data = data.split('</body', 1)[0]
+                data = str(data, "utf-8")
+            if data.count("<body"):
+                data = data.split("<body", 1)[1]
+                if data.count(">"):
+                    data = data.split(">", 1)[1]
+            if data.count("</body"):
+                data = data.split("</body", 1)[0]
 
 if len(sys.argv) > 1:
     filename = sys.argv[1]
 else:
-    filename = 'config.ini'
-
-oconfig = ConfigParser.RawConfigParser()
+    filename = "config.ini"
+
+oconfig = RawConfigParser()
 oconfig.read(filename)
 
 # This part will destroy the configuration if there's a crash while
 # writing the output. We're in an GIT-controlled directory, so
 # I didn't care enough to fix this.
-with open(filename, 'wb') as fd:
+with open(filename, "w") as fd:
     # Copy of write() code that sorts output by section
     if oconfig._defaults:
         fd.write("[%s]\n" % DEFAULTSECT)
-        for (key, value) in oconfig._defaults.items():
-            fd.write("%s = %s\n" % (key, str(value).replace('\n', '\n\t')))
+        for key, value in oconfig._defaults.items():
+            fd.write("{} = {}\n".format(key, str(value).replace("\n", "\n\t")))
         fd.write("\n")
-    
+
     result = {}
     for section in sorted(oconfig._sections):
-        if section == 'Planet':
+        if section == "Planet":
             fd.write("[%s]\n" % section)
-        for (key, value) in oconfig._sections[section].items():
+        for key, value in oconfig._sections[section].items():
             if key != "__name__":
-                if section == 'Planet':
-                    fd.write("%s = %s\n" %
-                             (key, str(value).replace('\n', '\n\t')))
+                if section == "Planet":
+                    fd.write("{} = {}\n".format(key, str(value).replace("\n", "\n\t")))
                 else:
-                    result[value.replace('"', '')] = section
-        if section == 'Planet':
+                    result[value.replace('"', "")] = section
+        if section == "Planet":
             fd.write("\n")
-    
+
     for key, value in sorted(result.items()):
         fd.write("[%s]\n" % value)
         name = key
@@ -43,4 +42,3 @@
             name = '"%s"' % key
         fd.write("name = %s\n" % name)
         fd.write("\n")
-