From cc6a6dd40fe3fbcb6fcac24705543f67eaa599f7 Mon Sep 17 00:00:00 2001
From: Victor Bogado da Silva Lins
Date: Sat, 25 Jan 2025 17:16:02 -0500
Subject: [PATCH] Make the index processing take much less memory.

By using generators instead of holding too much of the data in memory at
once, the processing gets much lighter: the per-page dict of pre-computed
keyword and alias sets is replaced by an Entry class whose keywords() and
all_aliases() methods yield values on demand.
---
A hypothetical usage sketch of the new Entry API follows the diff.

 cppman/entry.py | 210 ++++++++++++++++++++++++++++++++++++++++++++++++
 cppman/main.py  | 152 ++----------------------------------------------
 2 files changed, 217 insertions(+), 145 deletions(-)
 create mode 100644 cppman/entry.py

diff --git a/cppman/entry.py b/cppman/entry.py
new file mode 100644
index 0000000..3c19a3d
--- /dev/null
+++ b/cppman/entry.py
@@ -0,0 +1,210 @@
+# -*- coding: utf-8 -*-
+#
+# entry.py
+#
+# Copyright (C) 2010 - 2015 Wei-Ning Huang (AZ)
+# Copyright (C) 2025 Victor Bogado da Silva Lins
+# All Rights reserved.
+#
+# This file is part of cppman.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software Foundation,
+# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+#
+
+import re
+import html
+
+from bs4 import BeautifulSoup
+from itertools import takewhile
+
+
+def _parse_expression(expr: str) -> list[str]:
+    """
+    split expression into prefix and expression
+    tested with
+    ```
+    operator==
+    !=
+    std::rel_ops::operator!=
+    std::atomic::operator=
+    std::array::operator[]
+    std::function::operator()
+    std::vector::at
+    std::relational operators
+    std::vector::begin
+    std::abs(float)
+    std::fabs()
+    ```
+    """
+    m = re.match(r'^(.*?(?:::)?(?:operator)?)((?:::[^:]*|[^:]*)?)$', expr)
+    prefix = m.group(1)
+    tail = m.group(2)
+    return [prefix, tail]
+
+
+def _commonprefix(*names: str) -> str:
+    # Walk the zipped character columns, joining while every name agrees.
+    return "".join(chars[0] for chars in takewhile(
+        lambda chars: all(ch == chars[0] for ch in chars), zip(*names))
+    )
+
+
+class Entry(object):
+    def __init__(self, url: str, content: str):
+        self.url = url
+        self.name = self._extract_name(content).replace('\n', '')
+        self._keywords = set()
+        self._parse_title(self.name)
+        self._extract_keywords(content)
+
+    def add_keyword(self, keyword: str):
+        self._keywords.add(keyword)
+
+    def aliases(self, keyword):
+        if keyword.find("std::") != -1:
+            yield (keyword, keyword.replace("std::", ""))
+        for other_keyword in self.keywords():
+            if other_keyword == keyword:
+                continue
+
+            prefix = _commonprefix(keyword, other_keyword)
+            size = len(prefix)
+            if size > 2 and prefix[-2:] == "::":
+                yield (keyword[size:], other_keyword[size:])
+
+    def all_aliases(self):
+        for keyword in self.keywords():
+            for alias in self.aliases(keyword):
+                yield alias
+
+    def keywords(self):
+        for keyword in self._keywords:
+            yield keyword
+            if keyword.find("std::") != -1:
+                yield keyword.replace("std::", "")
+
+    def _extract_keywords(self, text):
+        """
+        extract aliases like std::string, template specializations
+        like std::atomic_bool and helper functions like std::is_same_v
+        """
+        soup = BeautifulSoup(text, "lxml")
+        names = []
+
+        # search for typedef list
+        for x in soup.find_all("table"):
+            # just searching for "Type" is not enough, see std::is_same
+            p = x.find_previous_sibling("h3")
+            if p:
+                if p.get_text().strip() == "Member types":
+                    continue
+
+            typedefTable = False
+            for tr in x.find_all("tr"):
+                tds = tr.find_all("td")
+                if len(tds) == 2:
+                    if re.match(r"\s*Type\s*", tds[0].get_text()):
+                        typedefTable = True
+                    elif typedefTable:
+                        res = re.search(r"^\s*(\S*)\s+.*$", tds[0].get_text())
+                        if res and res.group(1):
+                            names.append(res.group(1))
+                    elif not typedefTable:
+                        break
+            if typedefTable:
+                break
+
+        # search for "Helper variable template" list
+        for x in soup.find_all("h3"):
+            if x.find("span", id="Helper_variable_template"):
+                e = x.find_next_sibling()
+                while e.name == "":
+                    e = e.find_next_sibling()
+                if e.name == "table":
+                    for tr in e.find_all("tr"):
+                        text = re.sub("\n", " ", tr.get_text())
+                        res = re.search(r"^.* (\S+)\s*=.*$", text)
+                        if res:
+                            names.append(res.group(1))
+        # search for "Helper types" list
+        for x in soup.find_all("h3"):
+            if x.find("span", id="Helper_types"):
+                e = x.find_next_sibling()
+                while e.name == "":
+                    e = e.find_next_sibling()
+                if e.name == "table":
+                    for tr in e.find_all("tr"):
+                        text = re.sub("\n", " ", tr.get_text())
+                        res = re.search(r"^.* (\S+)\s*=.*$", text)
+                        if res:
+                            names.append(res.group(1))
+
+        # Register every extracted name as a keyword, unescaping any HTML
+        # entities exactly once here.
+        for name in names:
+            self.add_keyword(html.unescape(name))
+
+    def _extract_name(self, content: str):
+        """Extract man page name from web page."""
+        name = re.search('<[hH]1[^>]*>(.+?)</[hH]1>',
+                         content,
+                         re.DOTALL).group(1)
+        name = re.sub(r'<([^>]+)>', r'', name)
+        name = re.sub(r'&gt;', r'>', name)
+        name = re.sub(r'&lt;', r'<', name)
+        return html.unescape(name)
+
+    def _parse_title(self, title):
+        """
+        split of the last parenthesis operator==,!=,<,<=(std::vector)
+        tested with
+        ```
+        operator==,!=,<,<=,>,>=(std::vector)
+        operator==,!=,<,<=,>,>=(std::vector)
+        operator==,!=,<,<=,>,>=
+        operator==,!=,<,<=,>,>=
+        std::rel_ops::operator!=,>,<=,>=
+        std::atomic::operator=
+        std::array::operator[]
+        std::function::operator()
+        std::vector::at
+        std::relational operators (vector)
+        std::vector::begin, std::vector::cbegin
+        std::abs(float), std::fabs
+        std::unordered_set::begin(size_type),
+        std::unordered_set::cbegin(size_type)
+        ```
+        """
+        """ remove all template stuff """
+        title = re.sub(r" ?<[^>]+>", "", title)
+
+        m = re.match(
+            r"^\s*((?:\(size_type\)|(?:.|\(\))*?)*)((?:\([^)]+\))?)\s*$",
+            title
+        )
+
+        postfix = m.group(2)
+
+        t_names = map(lambda name: name.strip(), m.group(1).split(","))
+
+        prefix = None
+        for n in t_names:
+            r = _parse_expression(n)
+            if prefix is None:
+                prefix = r[0]
+
+            if prefix == r[0]:
+                self.add_keyword(n + postfix)
+            else:
+                self.add_keyword(prefix + r[1] + postfix)
diff --git a/cppman/main.py b/cppman/main.py
index 3a6576e..fd802b6 100644
--- a/cppman/main.py
+++ b/cppman/main.py
@@ -24,7 +24,6 @@
 import collections
 import gzip
-import html
 import importlib
 import os
 import os.path
@@ -34,13 +33,12 @@
 import subprocess
 import sys
 
-from bs4 import BeautifulSoup
 from cppman import environ, util
 from cppman.crawler import Crawler
+from cppman.entry import Entry
 from urllib.parse import urlparse, unquote
 
-
 def _sort_crawl(entry):
     """
     Sorting entries for putting '(1)' indexes behind keyword
@@ -168,13 +166,13 @@ def rebuild_index(self):
             """ 1. insert title """
             self.db_cursor.execute(
                 'INSERT INTO "%s" (title, url) VALUES (?, ?)'
-                % table, (title, results[title]["url"]))
+                % table, (title, results[title].url))
             lastRow = self.db_cursor.execute(
                 'SELECT last_insert_rowid()').fetchall()[0][0]
 
             """ 2. insert all keywords """
insert all keywords """ - for k in results[title]["keywords"]: + for k in results[title].keywords(): self.db_cursor.execute( 'INSERT INTO "%s_keywords" (id, keyword) ' 'VALUES (?, ?)' @@ -182,7 +180,7 @@ def rebuild_index(self): """ 3. add all aliases """ for title in results: - for (k, a) in results[title]["aliases"]: + for (k, a) in results[title].all_aliases(): """ search for combinations of words e.g. std::basic_string::append @@ -249,38 +247,9 @@ def rebuild_index(self): def process_document(self, url, content, depth): """callback to insert index""" print("Indexing '%s' (depth %s)..." % (url, depth)) - name = self._extract_name(content).replace('\n', '') - keywords = self._extract_keywords(content) - - entry = {'url': url, 'keywords': set(), 'aliases': set()} - self.results[name].append(entry) - - for n in self._parse_title(name): - """ add as keyword """ - entry["keywords"].add(n) - - """ add as keyword without std:: """ - if n.find("std::") != -1: - entry["keywords"].add(n.replace('std::', '')) - - """ add with all keywords variations """ - for k in keywords: - """ add std:: to typedef if original type is in std namespace """ - if n.find("std::") != -1 and k.find("std::") == -1: - k = "std::" + k; - - entry["aliases"].add((n, k)) - prefix = _commonprefix(n, k) - if len(prefix) > 2 and prefix[-2:] == "::": - """ Create names and keyword without prefixes """ - new_name = n[len(prefix):] - new_key = k[len(prefix):] - entry["aliases"].add((new_name, new_key)) - - if k.find("std::") != -1: - entry["aliases"].add( - (n, k.replace('std::', ''))) + entry = Entry(url, content) + self.results[entry.name].append(entry) return True @@ -293,7 +262,7 @@ def _results_with_unique_title(self): if len(entries) == 1: results[title] = entries[0] else: - paths = [_removesuffix(urlparse(entry['url'])[2], '/') for entry in entries] + paths = [_removesuffix(urlparse(entry.url)[2], '/') for entry in entries] prefix = os.path.commonpath(paths) if prefix: prefix += '/' @@ -308,14 +277,6 @@ def _results_with_unique_title(self): results["{} ({})".format(title, unquote(path))] = entry return results - def _extract_name(self, data): - """Extract man page name from web page.""" - name = re.search('<[hH]1[^>]*>(.+?)', data, re.DOTALL).group(1) - name = re.sub(r'<([^>]+)>', r'', name) - name = re.sub(r'>', r'>', name) - name = re.sub(r'<', r'<', name) - return html.unescape(name) - def _parse_expression(self, expr): """ split expression into prefix and expression @@ -339,105 +300,6 @@ def _parse_expression(self, expr): tail = m.group(2) return [prefix, tail] - def _parse_title(self, title): - """ - split of the last parenthesis operator==,!=,<,<=(std::vector) - tested with - ``` - operator==,!=,<,<=,>,>=(std::vector) - operator==,!=,<,<=,>,>=(std::vector) - operator==,!=,<,<=,>,>= - operator==,!=,<,<=,>,>= - std::rel_ops::operator!=,>,<=,>= - std::atomic::operator= - std::array::operator[] - std::function::operator() - std::vector::at - std::relational operators (vector) - std::vector::begin, std::vector::cbegin - std::abs(float), std::fabs - std::unordered_set::begin(size_type), std::unordered_set::cbegin(size_type) - ``` - """ - """ remove all template stuff """ - title = re.sub(r" ?<[^>]+>", "", title) - - m = re.match( - r'^\s*((?:\(size_type\)|(?:.|\(\))*?)*)((?:\([^)]+\))?)\s*$', title) - - postfix = m.group(2) - - t_names = m.group(1).split(',') - t_names = [n.strip() for n in t_names] - prefix = self._parse_expression(t_names[0])[0] - names = [] - for n in t_names: - r = self._parse_expression(n) - if prefix 
-                names.append(n + postfix)
-            else:
-                names.append(prefix + r[1] + postfix)
-        return names
-
-    def _extract_keywords(self, text):
-        """
-        extract aliases like std::string, template specializations like std::atomic_bool
-        and helper functions like std::is_same_v
-        """
-        soup = BeautifulSoup(text, "lxml")
-        names = []
-
-        # search for typedef list
-        for x in soup.find_all('table'):
-            # just searching for "Type" is not enough, see std::is_same
-            p = x.find_previous_sibling('h3')
-            if p:
-                if p.get_text().strip() == "Member types":
-                    continue
-
-            typedefTable = False
-            for tr in x.find_all('tr'):
-                tds = tr.find_all('td')
-                if len(tds) == 2:
-                    if re.match(r"\s*Type\s*", tds[0].get_text()):
-                        typedefTable = True
-                    elif typedefTable:
-                        res = re.search(r'^\s*(\S*)\s+.*$', tds[0].get_text())
-                        if res and res.group(1):
-                            names.append(res.group(1))
-                    elif not typedefTable:
-                        break
-            if typedefTable:
-                break
-
-        # search for "Helper variable template" list
-        for x in soup.find_all('h3'):
-            variableTemplateHeader = False
-            if x.find('span', id="Helper_variable_template"):
-                e = x.find_next_sibling()
-                while e.name == "":
-                    e = e.find_next_sibling()
-                if e.name == "table":
-                    for tr in e.find_all('tr'):
-                        text = re.sub('\n', ' ', tr.get_text())
-                        res = re.search(r'^.* (\S+)\s*=.*$', text)
-                        if res:
-                            names.append(res.group(1))
-        # search for "Helper types" list
-        for x in soup.find_all('h3'):
-            variableTemplateHeader = False
-            if x.find('span', id="Helper_types"):
-                e = x.find_next_sibling()
-                while e.name == "":
-                    e = e.find_next_sibling()
-                if e.name == "table":
-                    for tr in e.find_all('tr'):
-                        text = re.sub('\n', ' ', tr.get_text())
-                        res = re.search(r'^.* (\S+)\s*=.*$', text)
-                        if res:
-                            names.append(res.group(1))
-        return [html.unescape(n) for n in names]
-
     def cache_all(self):
         """Cache all available man pages"""
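
# ---------------------------------------------------------------------------
# Hypothetical usage sketch (not part of the patch itself): how the
# generator-based Entry API added above is meant to be consumed. This is a
# minimal illustration, assuming the patched tree and its bs4/lxml
# dependencies are importable; the <h1> snippet and URL below are made-up
# stand-ins for a crawled cppreference page.
from cppman.entry import Entry

page = "<h1>std::vector::begin, std::vector::cbegin</h1><body></body>"
entry = Entry("https://en.cppreference.com/w/cpp/container/vector/begin",
              page)

print(entry.name)  # -> std::vector::begin, std::vector::cbegin

# keywords() and all_aliases() are generators: each value is produced on
# demand and can be written straight into the index database, so no
# per-page set of keywords and aliases has to stay alive the way the old
# dict-based process_document() kept them.
for keyword in entry.keywords():
    print(keyword)            # std::vector::begin, vector::begin, ...
for name, alias in entry.all_aliases():
    print(name, "->", alias)  # e.g. begin -> cbegin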