diff --git a/elixir/data.py b/elixir/data.py index b952943c..74116af1 100755 --- a/elixir/data.py +++ b/elixir/data.py @@ -18,13 +18,21 @@ # You should have received a copy of the GNU Affero General Public License # along with Elixir. If not, see . +from typing import OrderedDict import berkeleydb import re +import time from . import lib +from .lib import autoBytes import os import os.path import errno +# Cache size used by the update script for the largest databases. Tuple of (gigabytes, bytes). +# https://docs.oracle.com/database/bdb181/html/api_reference/C/dbset_cachesize.html +# https://docs.oracle.com/database/bdb181/html/programmer_reference/general_am_conf.html#am_conf_cachesize +CACHESIZE = (2,0) + deflist_regex = re.compile(b'(\d*)(\w)(\d*)(\w),?') deflist_macro_regex = re.compile('\dM\d+(\w)') @@ -59,43 +67,86 @@ class DefList: def __init__(self, data=b'#'): self.data, self.families = data.split(b'#') + self.modified = False + self.entries = None + self.to_append = [] + + def populate_entries(self): + entries_modified = False + if self.entries is None: + self.entries = [ + (int(d[0]), d[1], int(d[2]), d[3]) + for d in deflist_regex.findall(self.data) + ] + entries_modified = True + + if len(self.to_append) != 0: + self.entries += self.to_append + self.to_append = [] + entries_modified = True + + if entries_modified: + self.entries.sort(key=lambda x:int(x[0])) + def iter(self, dummy=False): # Get all element in a list of sublists and sort them - entries = deflist_regex.findall(self.data) - entries.sort(key=lambda x:int(x[0])) - for id, type, line, family in entries: - id = int(id) - type = defTypeR [type.decode()] - line = int(line) - family = family.decode() - yield id, type, line, family + if self.entries is None: + self.populate_entries() + + for id, type, line, family in self.entries: + yield id, defTypeR[type.decode()], int(line), family.decode() if dummy: yield maxId, None, None, None - def append(self, id, type, line, family): + def exists(self, idx: int, line_num: int): + if self.entries is None: + self.populate_entries() + + for id, _, line, _ in self.entries: + if id == idx and int(line) == line_num: + return True + + return False + + def append(self, id: int, type, line: int, family: str): if type not in defTypeD: return - p = str(id) + defTypeD[type] + str(line) + family - if self.data != b'': - p = ',' + p - self.data += p.encode() + + self.modified = True + if self.entries is None: + self.to_append.append((id, defTypeD[type].encode(), line, family.encode())) + else: + self.entries.append((id, defTypeD[type].encode(), line, family.encode())) + self.add_family(family) - def pack(self): - return self.data + b'#' + self.families + def pack(self) -> bytes: + if self.entries is None: + to_append = b",".join([ + str(arg[0]).encode() + arg[1] + str(arg[2]).encode() + arg[3] + for arg in self.to_append + ]) + self.to_append = [] + self.data += to_append + return self.data + b'#' + self.families + else: + self.data = b",".join([ + str(arg[0]).encode() + arg[1] + str(arg[2]).encode() + arg[3] + for arg in self.entries + ]) + return self.data + b'#' + self.families - def add_family(self, family): - family = family.encode() + def add_family(self, family: str): if not family in self.families.split(b','): if self.families != b'': - family = b',' + family - self.families += family + family = ',' + family + self.families += family.encode() def get_families(self): - return self.families.decode().split(',') + return [f.decode() for f in self.families.split(b',')] def get_macros(self): 
- return deflist_macro_regex.findall(self.data.decode()) or '' + return (deflist_macro_regex.findall(self.data.decode()) + [entry[1] for entry in self.to_append]) or '' class PathList: '''Stores associations between a blob ID and a file path. @@ -124,69 +175,205 @@ class RefList: and the corresponding family.''' def __init__(self, data=b''): self.data = data + self.entries = None + self.to_append = [] + self.sorted = False + self.modified = False + + def decode_entry(self, k): + return (int(k[0].decode()), k[1].decode(), k[2].decode()) + + def populate_entries(self): + self.entries = [self.decode_entry(x.split(b':')) for x in self.data.split(b'\n')[:-1]] + self.entries += self.to_append + self.to_append = [] + self.entries.sort(key=lambda x:int(x[0])) def iter(self, dummy=False): - # Split all elements in a list of sublists and sort them - entries = [x.split(b':') for x in self.data.split(b'\n')[:-1]] - entries.sort(key=lambda x:int(x[0])) - for b, c, d in entries: - b = int(b.decode()) - c = c.decode() - d = d.decode() + if self.entries is None: + self.populate_entries() + + for b, c, d in self.entries: yield b, c, d if dummy: yield maxId, None, None def append(self, id, lines, family): - p = str(id) + ':' + lines + ':' + family + '\n' - self.data += p.encode() + self.modified = True + if self.entries is not None: + self.entries.append((id, lines, family)) + else: + self.to_append.append((id, lines, family)) def pack(self): - return self.data + if self.entries is not None: + assert len(self.to_append) == 0 + result = "".join([str(id) + ":" + lines + ":" + family + "\n" for id, lines, family in self.entries]) + return result.encode() + elif len(self.to_append) != 0: + result = "".join([str(id) + ":" + lines + ":" + family + "\n" for id, lines, family in self.to_append]) + self.data += result.encode() + self.to_append = [] + return self.data class BsdDB: - def __init__(self, filename, readonly, contentType, shared=False): + def __init__(self, filename, readonly, contentType, shared=False, cachesize=None): self.filename = filename self.db = berkeleydb.db.DB() - flags = berkeleydb.db.DB_THREAD if shared else 0 + self.flags = berkeleydb.db.DB_THREAD if shared else 0 - if readonly: - flags |= berkeleydb.db.DB_RDONLY - self.db.open(filename, flags=flags) + self.readonly = readonly + if self.readonly: + self.flags |= berkeleydb.db.DB_RDONLY else: - flags |= berkeleydb.db.DB_CREATE - self.db.open(filename, flags=flags, mode=0o644, dbtype=berkeleydb.db.DB_BTREE) + self.flags |= berkeleydb.db.DB_CREATE + + if cachesize is not None: + self.db.set_cachesize(cachesize[0], cachesize[1]) + + self.open() self.ctype = contentType + def open(self): + if self.readonly: + self.db.open(self.filename, flags=self.flags) + else: + self.db.open(self.filename, flags=self.flags, mode=0o644, dbtype=berkeleydb.db.DB_BTREE) + def exists(self, key): - key = lib.autoBytes(key) + key = autoBytes(key) return self.db.exists(key) def get(self, key): - key = lib.autoBytes(key) + key = autoBytes(key) p = self.db.get(key) - return self.ctype(p) if p is not None else None + if p is None: + return None + p = self.ctype(p) + return p def get_keys(self): return self.db.keys() def put(self, key, val, sync=False): - key = lib.autoBytes(key) - val = lib.autoBytes(val) + key = autoBytes(key) + val = autoBytes(val) + if type(val) is not bytes: + val = val.pack() + self.db.put(key, val) + if sync: + self.db.sync() + + def sync(self): + self.db.sync() + + def close(self): + self.db.close() + + def __len__(self): + return 
self.db.stat()["nkeys"] + +class CachedBsdDB: + def __init__(self, filename, readonly, contentType, cachesize): + self.filename = filename + self.db = None + self.readonly = readonly + + self.cachesize = cachesize + self.cache = OrderedDict() + + self.open() + + self.ctype = contentType + + def open(self): + if self.db is None: + self.db = berkeleydb.db.DB() + + flags = 0 + + if self.readonly: + flags |= berkeleydb.db.DB_RDONLY + self.db.open(self.filename, flags=flags) + else: + flags |= berkeleydb.db.DB_CREATE + self.db.open(self.filename, flags=flags, mode=0o644, dbtype=berkeleydb.db.DB_BTREE) + + def exists(self, key): + if key in self.cache: + return True + + return self.db.exists(autoBytes(key)) + + def get(self, key): + if key in self.cache: + self.cache.move_to_end(key) + return self.cache[key] + + p = self.db.get(autoBytes(key)) + if p is None: + return None + p = self.ctype(p) + + self.cache[key] = p + self.cache.move_to_end(key) + if len(self.cache) > self.cachesize: + old_k, old_v = self.cache.popitem(last=False) + if old_v.modified: + self.put_raw(old_k, old_v) + + return p + + def get_keys(self): + self.sync() + return self.db.keys() + + def put(self, key, val): + if self.readonly: + raise Exception("database is readonly") + + self.cache[key] = val + self.cache.move_to_end(key) + if len(self.cache) > self.cachesize: + old_k, old_v = self.cache.popitem(last=False) + if old_v.modified: + self.put_raw(old_k, old_v) + + def put_raw(self, key, val, sync=False): + if self.readonly: + raise Exception("database is readonly") + + key = autoBytes(key) + val = autoBytes(val) if type(val) is not bytes: val = val.pack() self.db.put(key, val) if sync: self.db.sync() + def sync(self): + start = time.time() + flushed = 0 + if not self.readonly: + for k, v in self.cache.items(): + if v.modified: + v.modified = False + self.put_raw(k, v) + flushed += 1 + + print("synced", flushed, "/", len(self.cache), time.time()-start) + self.db.sync() + def close(self): + self.sync() self.db.close() + self.db = None def __len__(self): return self.db.stat()["nkeys"] class DB: - def __init__(self, dir, readonly=True, dtscomp=False, shared=False): + def __init__(self, dir, readonly=True, dtscomp=False, shared=False, update_cache=None): if os.path.isdir(dir): self.dir = dir else: @@ -194,6 +381,11 @@ def __init__(self, dir, readonly=True, dtscomp=False, shared=False): ro = readonly + if update_cache: + db_cls = lambda dir, ro, ctype: CachedBsdDB(dir, ro, ctype, cachesize=update_cache) + else: + db_cls = lambda dir, ro, ctype: BsdDB(dir, ro, ctype, shared=shared) + self.vars = BsdDB(dir + '/variables.db', ro, lambda x: int(x.decode()), shared=shared) # Key-value store of basic information self.blob = BsdDB(dir + '/blobs.db', ro, lambda x: int(x.decode()), shared=shared) @@ -203,7 +395,7 @@ def __init__(self, dir, readonly=True, dtscomp=False, shared=False): self.file = BsdDB(dir + '/filenames.db', ro, lambda x: x.decode(), shared=shared) # Map serial number to filename self.vers = BsdDB(dir + '/versions.db', ro, PathList, shared=shared) - self.defs = BsdDB(dir + '/definitions.db', ro, DefList, shared=shared) + self.defs = db_cls(dir + '/definitions.db', ro, DefList) self.defs_cache = {} NOOP = lambda x: x self.defs_cache['C'] = BsdDB(dir + '/definitions-cache-C.db', ro, NOOP, shared=shared) @@ -211,12 +403,12 @@ def __init__(self, dir, readonly=True, dtscomp=False, shared=False): self.defs_cache['D'] = BsdDB(dir + '/definitions-cache-D.db', ro, NOOP, shared=shared) self.defs_cache['M'] = BsdDB(dir + 
'/definitions-cache-M.db', ro, NOOP, shared=shared) assert sorted(self.defs_cache.keys()) == sorted(lib.CACHED_DEFINITIONS_FAMILIES) - self.refs = BsdDB(dir + '/references.db', ro, RefList, shared=shared) - self.docs = BsdDB(dir + '/doccomments.db', ro, RefList, shared=shared) + self.refs = db_cls(dir + '/references.db', ro, RefList) + self.docs = db_cls(dir + '/doccomments.db', ro, RefList) self.dtscomp = dtscomp if dtscomp: - self.comps = BsdDB(dir + '/compatibledts.db', ro, RefList, shared=shared) - self.comps_docs = BsdDB(dir + '/compatibledts_docs.db', ro, RefList, shared=shared) + self.comps = db_cls(dir + '/compatibledts.db', ro, RefList) + self.comps_docs = db_cls(dir + '/compatibledts_docs.db', ro, RefList) # Use a RefList in case there are multiple doc comments for an identifier def close(self): diff --git a/elixir/lib.py b/elixir/lib.py index 7d7d0757..2442e107 100755 --- a/elixir/lib.py +++ b/elixir/lib.py @@ -21,6 +21,7 @@ import sys import logging import subprocess, os +from typing import List logger = logging.getLogger(__name__) @@ -46,7 +47,7 @@ def run_cmd(*args, env=None): # Invoke ./script.sh with the given arguments # Returns the list of output lines -def scriptLines(*args, env=None): +def scriptLines(*args, env=None) -> List[bytes]: p = script(*args, env=env) p = p.split(b'\n') del p[-1] diff --git a/elixir/update.py b/elixir/update.py new file mode 100644 index 00000000..cd2d2493 --- /dev/null +++ b/elixir/update.py @@ -0,0 +1,441 @@ +import os.path +import logging +import time +import signal +import bisect +import cProfile +from multiprocessing import cpu_count, set_start_method +from multiprocessing.pool import Pool +from typing import Dict, Iterable, List, Optional, Tuple +from collections import OrderedDict + +from find_compatible_dts import FindCompatibleDTS + +from elixir.data import DB, BsdDB, CachedBsdDB, DefList, PathList, RefList +from elixir.lib import ( + compatibleFamily, + compatibleMacro, + getDataDir, + getFileFamily, + isIdent, + script, + scriptLines, +) + +logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s: %(message)s') +logger = logging.getLogger(__name__) + +# File identification - id, hash, filename +FileId = Tuple[int, bytes, str] + +# Definitions parsing output, ident -> list of (file_idx, type, line, family) +DefsDict = Dict[bytes, List[Tuple[int, str, int, str]]] + +# References parsing output, ident -> (file_idx, family) -> list of lines +RefsDict = Dict[bytes, Dict[Tuple[int, str], List[int]]] + +# Generic dictionary of ident -> list of lines +LinesListDict = Dict[str, List[int]] + +# File idx -> (hash, filename, is a new file?) 
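
The CachedBsdDB class introduced above defers writes through an in-memory LRU built on collections.OrderedDict: a value is flushed to Berkeley DB only when it is evicted while still marked as modified, or when sync() walks the whole cache. A minimal, self-contained sketch of that write-back pattern, with illustrative names and a plain dict standing in for the on-disk database:

from collections import OrderedDict

class WriteBackLRU:
    """Illustrative write-back LRU, mirroring CachedBsdDB's caching strategy."""
    def __init__(self, backend, capacity):
        self.backend = backend        # dict-like persistent store (stands in for BsdDB)
        self.capacity = capacity      # maximum number of cached entries
        self.cache = OrderedDict()    # key -> value, least recently used first
        self.dirty = set()            # keys with unflushed modifications

    def get(self, key):
        if key in self.cache:
            self.cache.move_to_end(key)      # most recently used goes to the end
            return self.cache[key]
        value = self.backend.get(key)
        if value is not None:
            self._cache(key, value)
        return value

    def put(self, key, value):
        self.dirty.add(key)                  # write is deferred until eviction or sync
        self._cache(key, value)

    def _cache(self, key, value):
        self.cache[key] = value
        self.cache.move_to_end(key)
        if len(self.cache) > self.capacity:
            old_key, old_value = self.cache.popitem(last=False)   # evict the LRU entry
            if old_key in self.dirty:
                self.backend[old_key] = old_value                 # flush only if modified
                self.dirty.discard(old_key)

    def sync(self):
        for key in list(self.dirty):         # flush everything that is still dirty
            self.backend[key] = self.cache[key]
        self.dirty.clear()

# Example: a plain dict plays the role of the on-disk database.
store = {}
lru = WriteBackLRU(store, capacity=2)
lru.put('a', 1); lru.put('b', 2); lru.put('c', 3)   # 'a' is evicted and flushed
assert store == {'a': 1}
lru.sync()
assert store == {'a': 1, 'b': 2, 'c': 3}
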
+IdxCache = Dict[int, Tuple[bytes, str, bool]] + +class Cache: + def __init__(self, size): + self.cache = OrderedDict() + self.size = size + + def contains(self, key): + return key in self.cache + + def get(self, key): + self.cache.move_to_end(key) + return self.cache[key] + + def put(self, key, val): + self.cache[key] = val + self.cache.move_to_end(key) + if len(self.cache) > self.size: + self.cache.popitem(last=False) + +# Check if definition for ident is visible in current version +def def_in_version(def_ident: DefList, idx_to_hash_and_filename: IdxCache) -> bool: + def_ident.populate_entries() + + prev_idx = None + for def_idx, _, _, _ in reversed(def_ident.entries): + if def_idx == prev_idx: + continue + if def_idx in idx_to_hash_and_filename: + return True + prev_idx = def_idx + return False + +# Add definitions to database +def add_defs(db: DB, defs: DefsDict): + for ident, occ_list in defs.items(): + obj = db.defs.get(ident) + if obj is None: + obj = DefList() + + for (idx, type, line, family) in occ_list: + obj.append(idx, type, line, family) + + db.defs.put(ident, obj) + +# Add references to database +def add_refs(db: DB, in_ver_cache: Cache, idx_to_hash_and_filename: IdxCache, refs: RefsDict): + for ident, idx_to_lines in refs.items(): + deflist = db.defs.get(ident) + if deflist is None: + continue + + if not in_ver_cache.contains(ident): + in_version = def_in_version(deflist, idx_to_hash_and_filename) + if not in_version: + in_ver_cache.put(ident, False) + continue + in_ver_cache.put(ident, True) + elif not in_ver_cache.get(ident): + continue + + obj = db.refs.get(ident) + if obj is None: + obj = RefList() + + for (idx, family), lines_str in idx_to_lines.items(): + obj.append(idx, lines_str, family) + + db.refs.put(ident, obj) + +# Add documentation references to database +def add_docs(db: DB, idx: int, family: str, docs: Dict[str, List[int]]): + add_to_lineslist(db.docs, idx, family, docs) + +# Add compatible references to database +def add_comps(db: DB, idx: int, family: str, comps: Dict[str, List[int]]): + add_to_lineslist(db.comps, idx, family, comps) + +# Add compatible docs to database +def add_comps_docs(db: DB, idx: int, family: str, comps_docs: Dict[str, List[int]]): + comps_result = {} + for ident, v in comps_docs.items(): + if db.comps.exists(ident): + comps_result[ident] = v + + add_to_lineslist(db.comps_docs, idx, family, comps_result) + +# Add data to a database file that uses lines list schema +def add_to_lineslist(db_file: BsdDB, idx: int, family: str, to_add: Dict[str, List[int]]): + for ident, lines in to_add.items(): + obj = db_file.get(ident) + if obj is None: + obj = RefList() + + lines_str = ','.join((str(n) for n in lines)) + obj.append(idx, lines_str, family) + db_file.put(ident, obj) + + +# Adds blob list to database, returns blob id -> (hash, filename) dict +def collect_blobs(db: DB, tag: bytes) -> IdxCache: + idx = db.vars.get('numBlobs') + if idx is None: + idx = 0 + + # Get blob hashes and associated file names (without path) + blobs = scriptLines('list-blobs', '-p', tag) + versionBuf = [] + idx_to_hash_and_filename = {} + + # Collect new blobs, assign database ids to the blobs + for blob in blobs: + hash, path = blob.split(b' ',maxsplit=1) + filename = os.path.basename(path.decode()) + blob_idx = db.blob.get(hash) + + if blob_idx is not None: + versionBuf.append((blob_idx, path)) + if blob_idx not in idx_to_hash_and_filename: + idx_to_hash_and_filename[blob_idx] = (hash, filename, False) + else: + versionBuf.append((idx, path)) + 
idx_to_hash_and_filename[idx] = (hash, filename, True) + db.blob.put(hash, idx) + db.hash.put(idx, hash) + db.file.put(idx, filename) + idx += 1 + + # Update number of blobs in the database + db.vars.put('numBlobs', idx) + + # Add mapping blob id -> path to version database + versionBuf.sort() + obj = PathList() + for i, path in versionBuf: + obj.append(i, path) + db.vers.put(tag, obj, sync=True) + + return idx_to_hash_and_filename + +# Generate definitions cache databases +def generate_defs_caches(db: DB): + for key in db.defs.get_keys(): + value = db.defs.get(key) + for family in ['C', 'K', 'D', 'M']: + if (compatibleFamily(value.get_families(), family) or + compatibleMacro(value.get_macros(), family)): + db.defs_cache[family].put(key, b'') + + +# Collect definitions from ctags for a file +def get_defs(file_id: FileId) -> Optional[DefsDict]: + idx, hash, filename = file_id + defs = {} + family = getFileFamily(filename) + if family in (None, 'M'): + return None + + lines = scriptLines('parse-defs', hash, filename, family) + + for l in lines: + ident, type, line = l.split(b' ') + type = type.decode() + line = int(line.decode()) + if isIdent(ident): + if ident not in defs: + defs[ident] = [] + defs[ident].append((idx, type, line, family)) + + return defs + +def call_get_refs(arg: Tuple[FileId, str]) -> Optional[RefsDict]: + return get_refs(arg[0], CachedBsdDB(arg[1], True, DefList, 1000)) + +# Collect references from the tokenizer for a file +def get_refs(file_id: FileId, defs: CachedBsdDB) -> Optional[RefsDict]: + idx, hash, filename = file_id + refs = {} + family = getFileFamily(filename) + if family is None: + return + + # Kconfig values are saved as CONFIG_ + prefix = b'' if family != 'K' else b'CONFIG_' + + tokens = scriptLines('tokenize-file', '-b', hash, family) + even = True + line_num = 1 + + def deflist_exists(deflist, idx: int, line: int): + deflist.populate_entries() + start = bisect.bisect_left(deflist.entries, idx, key=lambda x: x[0]) + + for def_idx, _, def_line, _ in deflist.entries[start:]: + if def_idx == idx: + if def_line == line: + return True + else: + break + + return False + + for tok in tokens: + even = not even + if even: + tok = prefix + tok + + # We only index CONFIG_??? 
in makefiles + if (family != 'M' or tok.startswith(b'CONFIG_')): + deflist = defs.get(tok) + if not deflist: + continue + + if deflist_exists(deflist, idx, line_num): + continue + + if tok not in refs: + refs[tok] = {} + + if (idx, family) not in refs[tok]: + refs[tok][(idx, family)] = str(line_num) + else: + refs[tok][(idx, family)] += "," + str(line_num) + + else: + line_num += tok.count(b'\1') + + + return refs + +# Collect compatible script output into lineslinst-schema compatible format +def collect_get_blob_output(lines: Iterable[str]) -> LinesListDict: + results = {} + for l in lines: + ident, line = l.split(' ') + line = int(line) + + if ident not in results: + results[ident] = [] + results[ident].append(line) + + return results + +# Collect docs from doc comments script for a single file +def get_docs(file_id: FileId) -> Optional[Tuple[int, str, LinesListDict]]: + idx, hash, filename = file_id + family = getFileFamily(filename) + if family in (None, 'M'): return + + start = time.time() + lines = (line.decode() for line in scriptLines('parse-docs', hash, filename)) + parser_time = time.time()-start + + if parser_time > 10: + print("docs timeout", parser_time, file_id) + + docs = collect_get_blob_output(lines) + + return (idx, family, docs) + +# Collect compatible references for a single file +def get_comps(file_id: FileId) -> Optional[Tuple[int, str, LinesListDict]]: + idx, hash, filename = file_id + family = getFileFamily(filename) + if family in (None, 'K', 'M'): return + + compatibles_parser = FindCompatibleDTS() + + start = time.time() + lines = compatibles_parser.run(scriptLines('get-blob', hash), family) + parser_time = time.time()-start + + if parser_time > 10: + print("comps docs timeout", parser_time, file_id) + + comps = collect_get_blob_output(lines) + + return (idx, family, comps) + +# Collect compatible documentation references for a single file +def get_comps_docs(file_id: FileId) -> Optional[Tuple[int, str, LinesListDict]]: + idx, hash, _ = file_id + family = 'B' + + compatibles_parser = FindCompatibleDTS() + lines = compatibles_parser.run(scriptLines('get-blob', hash), family) + comps_docs = {} + for l in lines: + ident, line = l.split(' ') + + if ident not in comps_docs: + comps_docs[ident] = [] + comps_docs[ident].append(int(line)) + + return (idx, family, comps_docs) + + +# Update a single version - collects data from all the stages and saves it in the database +def update_version(db: DB, tag: bytes, pool: Pool, dts_comp_support: bool): + idx_to_hash_and_filename = collect_blobs(db, tag) + + # Collect blobs to process and split list of blobs into chunks + idxes = [(idx, hash, filename) for (idx, (hash, filename, new)) in idx_to_hash_and_filename.items() if new] + chunksize = int(len(idxes) / cpu_count()) + chunksize = min(max(1, chunksize), 100) + + logger.info("collecting blobs done, new blobs: %d", len(idxes)) + + for result in pool.imap_unordered(get_defs, idxes, chunksize): + if result is not None: + add_defs(db, result) + + logger.info("defs done") + + for result in pool.imap_unordered(get_docs, idxes, chunksize): + if result is not None: + add_docs(db, *result) + + logger.info("docs done") + + if dts_comp_support: + comp_idxes = [idx for idx in idxes if getFileFamily(idx[2]) not in (None, 'K', 'M')] + comp_chunksize = int(len(comp_idxes) / cpu_count()) + comp_chunksize = min(max(1, comp_chunksize), 100) + for result in pool.imap_unordered(get_comps, comp_idxes, comp_chunksize): + if result is not None: + add_comps(db, *result) + + logger.info("dts comps 
done") + + for result in pool.imap_unordered(get_comps_docs, idxes, chunksize): + if result is not None: + add_comps_docs(db, *result) + + logger.info("dts comps docs done") + + + #with cProfile.Profile() as pr: + db.defs.close() + db.defs.readonly = True + db.defs.open() + + in_def_cache = Cache(10000) + ref_idxes = [(idx, db.defs.filename) for idx in idxes] + ref_chunksize = int(len(ref_idxes) / cpu_count()) + ref_chunksize = min(max(1, ref_chunksize), 100) + #pr.dump_stats("5refs"+str(int(time.time()))) + + logger.info("ref blobs: %d", len(ref_idxes)) + + for result in pool.imap_unordered(call_get_refs, ref_idxes, ref_chunksize): + if result is not None: + add_refs(db, in_def_cache, idx_to_hash_and_filename, result) + + db.defs.close() + db.defs.readonly = False + db.defs.open() + + logger.info("refs done") + logger.info("update done") + + +sigint_caught = False + +def sigint_handler(signum, _frame): + global sigint_caught + if not sigint_caught: + logger.info("Caught SIGINT... the script will exit after processing this version") + signal.signal(signum, signal.SIG_IGN) + sigint_caught = True + +signal.signal(signal.SIGINT, sigint_handler) + +def ignore_sigint(): + signal.signal(signal.SIGINT, lambda _,__: None) + +if __name__ == "__main__": + + dts_comp_support = bool(int(script('dts-comp'))) + db = DB(getDataDir(), readonly=False, dtscomp=dts_comp_support, shared=False, update_cache=100000) + + set_start_method('spawn') + with Pool(initializer=ignore_sigint) as pool: + for tag in scriptLines('list-tags'): + #if not tag.startswith(b'v6'): + # continue + + if sigint_caught: + break + + if not db.vers.exists(tag): + logger.info("updating tag %s", tag) + update_version(db, tag, pool, dts_comp_support) + + logger.info("generating def caches") + generate_defs_caches(db) + logger.info("def caches generated") + db.close() + logger.info("database closed") + + diff --git a/find_compatible_dts.py b/find_compatible_dts.py index 8aec94d6..a1a356f1 100755 --- a/find_compatible_dts.py +++ b/find_compatible_dts.py @@ -31,6 +31,8 @@ def __init__(self): self.regex_bindings = re.compile("([\w-]+,?[\w-]+)") def parse_c(self, content): + if "compatible" not in content: + return [] return self.regex_c.findall(content) def parse_dts(self, content): diff --git a/script.sh b/script.sh index 3bbff2a7..656a2633 100755 --- a/script.sh +++ b/script.sh @@ -165,7 +165,7 @@ parse_defs_C() git cat-file blob "$opt1" > "$full_path" # Use ctags to parse most of the defs - ctags -x --kinds-c=+p+x --extras='-{anonymous}' "$full_path" | + ctags -u -x --kinds-c=+p+x --extras='-{anonymous}' "$full_path" | grep -avE -e "^operator " -e "^CONFIG_" | awk '{print $1" "$2" "$3}' @@ -182,7 +182,7 @@ parse_defs_K() tmp=`mktemp -d` full_path=$tmp/$opt2 git cat-file blob "$opt1" > "$full_path" - ctags -x --language-force=kconfig --kinds-kconfig=c --extras-kconfig=-{configPrefixed} "$full_path" | + ctags -u -x --language-force=kconfig --kinds-kconfig=c --extras-kconfig=-{configPrefixed} "$full_path" | awk '{print "CONFIG_"$1" "$2" "$3}' rm "$full_path" rmdir $tmp @@ -193,7 +193,7 @@ parse_defs_D() tmp=`mktemp -d` full_path=$tmp/$opt2 git cat-file blob "$opt1" > "$full_path" - ctags -x --language-force=dts "$full_path" | + ctags -u -x --language-force=dts "$full_path" | awk '{print $1" "$2" "$3}' rm "$full_path" rmdir $tmp @@ -204,7 +204,13 @@ parse_docs() tmpfile=`mktemp` git cat-file blob "$opt1" > "$tmpfile" - "$script_dir/find-file-doc-comments.pl" "$tmpfile" || exit "$?" 
+ + # Shortcut: if '/**' isn't present in the file, it cannot contain a doc. + # This avoids calling find-file-doc-comments.pl on most files, which is an + # expensive operation. + if grep -qF '/**' "$tmpfile"; then + "$script_dir/find-file-doc-comments.pl" "$tmpfile" || exit "$?" + fi rm -rf "$tmpfile" } diff --git a/update.py b/update.py deleted file mode 100755 index 9d84ff31..00000000 --- a/update.py +++ /dev/null @@ -1,638 +0,0 @@ -#!/usr/bin/env python3 - -# This file is part of Elixir, a source code cross-referencer. -# -# Copyright (C) 2017--2020 Mikaƫl Bouillot -# Maxime Chretien -# and contributors -# -# Elixir is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# Elixir is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. -# -# You should have received a copy of the GNU Affero General Public License -# along with Elixir. If not, see . - -# Throughout, an "idx" is the sequential number associated with a blob. -# This is different from that blob's Git hash. - -from sys import argv -from threading import Thread, Lock, Event, Condition - -import elixir.lib as lib -from elixir.lib import script, scriptLines -import elixir.data as data -from elixir.data import PathList -from find_compatible_dts import FindCompatibleDTS - -verbose = False - -dts_comp_support = int(script('dts-comp')) - -compatibles_parser = FindCompatibleDTS() - -db = data.DB(lib.getDataDir(), readonly=False, shared=True, dtscomp=dts_comp_support) - -# Number of cpu threads (+2 for version indexing) -cpu = 10 -threads_list = [] - -hash_file_lock = Lock() # Lock for db.hash and db.file -blobs_lock = Lock() # Lock for db.blobs -defs_lock = Lock() # Lock for db.defs -refs_lock = Lock() # Lock for db.refs -docs_lock = Lock() # Lock for db.docs -comps_lock = Lock() # Lock for db.comps -comps_docs_lock = Lock() # Lock for db.comps_docs -tag_ready = Condition() # Waiting for new tags - -new_idxes = [] # (new idxes, Event idxes ready, Event defs ready, Event comps ready, Event vers ready) -bindings_idxes = [] # DT bindings documentation files -idx_key_mod = 1000000 -defs_idxes = {} # Idents definitions stored with (idx*idx_key_mod + line) as the key. 
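
For contrast with the new code: the deleted updater tracked every definition in this process-wide defs_idxes dict, keyed by idx * idx_key_mod + line, so that a token sitting on its own definition line would not be indexed as a reference. The new get_refs answers the same question from the DefList itself by bisecting its entries, which populate_entries keeps sorted by blob idx. A sketch of that lookup, assuming the (idx, type, line, family) entry layout used above; note that the key= argument of bisect_left requires Python 3.10 or newer:

import bisect

def definition_exists(entries, blob_idx, line):
    """entries: list of (idx, type, line, family) tuples sorted by idx."""
    start = bisect.bisect_left(entries, blob_idx, key=lambda e: e[0])
    for entry_idx, _type, entry_line, _family in entries[start:]:
        if entry_idx != blob_idx:
            break                       # past the last entry for this blob
        if entry_line == line:
            return True                 # the token on this line is the definition itself
    return False

entries = [(3, b't', 10, b'C'), (3, b't', 42, b'C'), (7, b'd', 5, b'C')]
assert definition_exists(entries, 3, 42)
assert not definition_exists(entries, 7, 6)
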
- -tags_done = False # True if all tags have been added to new_idxes - -# Progress variables [tags, finished threads] -tags_defs = [0, 0] -tags_defs_lock = Lock() -tags_refs = [0, 0] -tags_refs_lock = Lock() -tags_docs = [0, 0] -tags_docs_lock = Lock() -tags_comps = [0, 0] -tags_comps_lock = Lock() -tags_comps_docs = [0, 0] -tags_comps_docs_lock = Lock() - -class UpdateIds(Thread): - def __init__(self, tag_buf): - Thread.__init__(self, name="UpdateIdsElixir") - self.tag_buf = tag_buf - - def run(self): - global new_idxes, tags_done, tag_ready - self.index = 0 - - for tag in self.tag_buf: - - new_idxes.append((self.update_blob_ids(tag), Event(), Event(), Event(), Event())) - - progress('ids: ' + tag.decode() + ': ' + str(len(new_idxes[self.index][0])) + - ' new blobs', self.index+1) - - new_idxes[self.index][1].set() # Tell that the tag is ready - - self.index += 1 - - # Wake up waiting threads - with tag_ready: - tag_ready.notify_all() - - tags_done = True - progress('ids: Thread finished', self.index) - - def update_blob_ids(self, tag): - - global hash_file_lock, blobs_lock - - if db.vars.exists('numBlobs'): - idx = db.vars.get('numBlobs') - else: - idx = 0 - - # Get blob hashes and associated file names (without path) - blobs = scriptLines('list-blobs', '-f', tag) - - new_idxes = [] - for blob in blobs: - hash, filename = blob.split(b' ',maxsplit=1) - with blobs_lock: - blob_exist = db.blob.exists(hash) - if not blob_exist: - db.blob.put(hash, idx) - - if not blob_exist: - with hash_file_lock: - db.hash.put(idx, hash) - db.file.put(idx, filename) - - new_idxes.append(idx) - if verbose: - print(f"New blob #{idx} {hash}:{filename}") - idx += 1 - db.vars.put('numBlobs', idx) - return new_idxes - - -class UpdateVersions(Thread): - def __init__(self, tag_buf): - Thread.__init__(self, name="UpdateVersionsElixir") - self.tag_buf = tag_buf - - def run(self): - global new_idxes, tag_ready - - index = 0 - - while index < len(self.tag_buf): - if index >= len(new_idxes): - # Wait for new tags - with tag_ready: - tag_ready.wait() - continue - - tag = self.tag_buf[index] - - new_idxes[index][1].wait() # Make sure the tag is ready - - self.update_versions(tag) - - new_idxes[index][4].set() # Tell that UpdateVersions processed the tag - - progress('vers: ' + tag.decode() + ' done', index+1) - - index += 1 - - progress('vers: Thread finished', index) - - def update_versions(self, tag): - global blobs_lock - - # Get blob hashes and associated file paths - blobs = scriptLines('list-blobs', '-p', tag) - buf = [] - - for blob in blobs: - hash, path = blob.split(b' ', maxsplit=1) - with blobs_lock: - idx = db.blob.get(hash) - buf.append((idx, path)) - - buf = sorted(buf) - obj = PathList() - for idx, path in buf: - obj.append(idx, path) - - # Store DT bindings documentation files to parse them later - if path[:33] == b'Documentation/devicetree/bindings': - bindings_idxes.append(idx) - - if verbose: - print(f"Tag {tag}: adding #{idx} {path}") - db.vers.put(tag, obj, sync=True) - - -def generate_defs_caches(): - for key in db.defs.get_keys(): - value = db.defs.get(key) - for family in ['C', 'K', 'D', 'M']: - if (lib.compatibleFamily(value.get_families(), family) or - lib.compatibleMacro(value.get_macros(), family)): - db.defs_cache[family].put(key, b'') - - -class UpdateDefs(Thread): - def __init__(self, start, inc): - Thread.__init__(self, name="UpdateDefsElixir") - self.index = start - self.inc = inc # Equivalent to the number of defs threads - - def run(self): - global new_idxes, tags_done, tag_ready, 
tags_defs, tags_defs_lock - - while not (tags_done and self.index >= len(new_idxes)): - if self.index >= len(new_idxes): - # Wait for new tags - with tag_ready: - tag_ready.wait() - continue - - new_idxes[self.index][1].wait() # Make sure the tag is ready - - with tags_defs_lock: - tags_defs[0] += 1 - - self.update_definitions(new_idxes[self.index][0]) - - new_idxes[self.index][2].set() # Tell that UpdateDefs processed the tag - - self.index += self.inc - - with tags_defs_lock: - tags_defs[1] += 1 - progress('defs: Thread ' + str(tags_defs[1]) + '/' + str(self.inc) + ' finished', tags_defs[0]) - - - def update_definitions(self, idxes): - global hash_file_lock, defs_lock, tags_defs - - for idx in idxes: - if idx % 1000 == 0: progress('defs: ' + str(idx), tags_defs[0]) - - with hash_file_lock: - hash = db.hash.get(idx) - filename = db.file.get(idx) - - family = lib.getFileFamily(filename) - if family in [None, 'M']: continue - - lines = scriptLines('parse-defs', hash, filename, family) - - with defs_lock: - for l in lines: - ident, type, line = l.split(b' ') - type = type.decode() - line = int(line.decode()) - - defs_idxes[idx*idx_key_mod + line] = ident - - if db.defs.exists(ident): - obj = db.defs.get(ident) - elif lib.isIdent(ident): - obj = data.DefList() - else: - continue - - obj.append(idx, type, line, family) - if verbose: - print(f"def {type} {ident} in #{idx} @ {line}") - db.defs.put(ident, obj) - - generate_defs_caches() - - -class UpdateRefs(Thread): - def __init__(self, start, inc): - Thread.__init__(self, name="UpdateRefsElixir") - self.index = start - self.inc = inc # Equivalent to the number of refs threads - - def run(self): - global new_idxes, tags_done, tags_refs, tags_refs_lock - - while not (tags_done and self.index >= len(new_idxes)): - if self.index >= len(new_idxes): - # Wait for new tags - with tag_ready: - tag_ready.wait() - continue - - new_idxes[self.index][1].wait() # Make sure the tag is ready - new_idxes[self.index][2].wait() # Make sure UpdateDefs processed the tag - - with tags_refs_lock: - tags_refs[0] += 1 - - self.update_references(new_idxes[self.index][0]) - - self.index += self.inc - - with tags_refs_lock: - tags_refs[1] += 1 - progress('refs: Thread ' + str(tags_refs[1]) + '/' + str(self.inc) + ' finished', tags_refs[0]) - - def update_references(self, idxes): - global hash_file_lock, defs_lock, refs_lock, tags_refs - - for idx in idxes: - if idx % 1000 == 0: progress('refs: ' + str(idx), tags_refs[0]) - - with hash_file_lock: - hash = db.hash.get(idx) - filename = db.file.get(idx) - - family = lib.getFileFamily(filename) - if family == None: continue - - prefix = b'' - # Kconfig values are saved as CONFIG_ - if family == 'K': - prefix = b'CONFIG_' - - tokens = scriptLines('tokenize-file', '-b', hash, family) - even = True - line_num = 1 - idents = {} - with defs_lock: - for tok in tokens: - even = not even - if even: - tok = prefix + tok - - if (db.defs.exists(tok) and - not ( (idx*idx_key_mod + line_num) in defs_idxes and - defs_idxes[idx*idx_key_mod + line_num] == tok ) and - (family != 'M' or tok.startswith(b'CONFIG_'))): - # We only index CONFIG_??? 
in makefiles - if tok in idents: - idents[tok] += ',' + str(line_num) - else: - idents[tok] = str(line_num) - - else: - line_num += tok.count(b'\1') - - with refs_lock: - for ident, lines in idents.items(): - if db.refs.exists(ident): - obj = db.refs.get(ident) - else: - obj = data.RefList() - - obj.append(idx, lines, family) - if verbose: - print(f"ref: {ident} in #{idx} @ {lines}") - db.refs.put(ident, obj) - - -class UpdateDocs(Thread): - def __init__(self, start, inc): - Thread.__init__(self, name="UpdateDocsElixir") - self.index = start - self.inc = inc # Equivalent to the number of docs threads - - def run(self): - global new_idxes, tags_done, tags_docs, tags_docs_lock - - while not (tags_done and self.index >= len(new_idxes)): - if self.index >= len(new_idxes): - # Wait for new tags - with tag_ready: - tag_ready.wait() - continue - - new_idxes[self.index][1].wait() # Make sure the tag is ready - - with tags_docs_lock: - tags_docs[0] += 1 - - self.update_doc_comments(new_idxes[self.index][0]) - - self.index += self.inc - - with tags_docs_lock: - tags_docs[1] += 1 - progress('docs: Thread ' + str(tags_docs[1]) + '/' + str(self.inc) + ' finished', tags_docs[0]) - - def update_doc_comments(self, idxes): - global hash_file_lock, docs_lock, tags_docs - - for idx in idxes: - if idx % 1000 == 0: progress('docs: ' + str(idx), tags_docs[0]) - - with hash_file_lock: - hash = db.hash.get(idx) - filename = db.file.get(idx) - - family = lib.getFileFamily(filename) - if family in [None, 'M']: continue - - lines = scriptLines('parse-docs', hash, filename) - with docs_lock: - for l in lines: - ident, line = l.split(b' ') - line = int(line.decode()) - - if db.docs.exists(ident): - obj = db.docs.get(ident) - else: - obj = data.RefList() - - obj.append(idx, str(line), family) - if verbose: - print(f"doc: {ident} in #{idx} @ {line}") - db.docs.put(ident, obj) - - -class UpdateComps(Thread): - def __init__(self, start, inc): - Thread.__init__(self, name="UpdateCompsElixir") - self.index = start - self.inc = inc # Equivalent to the number of comps threads - - def run(self): - global new_idxes, tags_done, tags_comps, tags_comps_lock - - while not (tags_done and self.index >= len(new_idxes)): - if self.index >= len(new_idxes): - # Wait for new tags - with tag_ready: - tag_ready.wait() - continue - - new_idxes[self.index][1].wait() # Make sure the tag is ready - - with tags_comps_lock: - tags_comps[0] += 1 - - self.update_compatibles(new_idxes[self.index][0]) - - new_idxes[self.index][3].set() # Tell that UpdateComps processed the tag - - self.index += self.inc - - with tags_comps_lock: - tags_comps[1] += 1 - progress('comps: Thread ' + str(tags_comps[1]) + '/' + str(self.inc) + ' finished', tags_comps[0]) - - def update_compatibles(self, idxes): - global hash_file_lock, comps_lock, tags_comps - - for idx in idxes: - if idx % 1000 == 0: progress('comps: ' + str(idx), tags_comps[0]) - - with hash_file_lock: - hash = db.hash.get(idx) - filename = db.file.get(idx) - - family = lib.getFileFamily(filename) - if family in [None, 'K', 'M']: continue - - lines = compatibles_parser.run(scriptLines('get-blob', hash), family) - comps = {} - for l in lines: - ident, line = l.split(' ') - - if ident in comps: - comps[ident] += ',' + str(line) - else: - comps[ident] = str(line) - - with comps_lock: - for ident, lines in comps.items(): - if db.comps.exists(ident): - obj = db.comps.get(ident) - else: - obj = data.RefList() - - obj.append(idx, lines, family) - if verbose: - print(f"comps: {ident} in #{idx} @ {line}") - 
db.comps.put(ident, obj) - - -class UpdateCompsDocs(Thread): - def __init__(self, start, inc): - Thread.__init__(self, name="UpdateCompsDocsElixir") - self.index = start - self.inc = inc # Equivalent to the number of comps_docs threads - - def run(self): - global new_idxes, tags_done, tags_comps_docs, tags_comps_docs_lock - - while not (tags_done and self.index >= len(new_idxes)): - if self.index >= len(new_idxes): - # Wait for new tags - with tag_ready: - tag_ready.wait() - continue - - new_idxes[self.index][1].wait() # Make sure the tag is ready - new_idxes[self.index][3].wait() # Make sure UpdateComps processed the tag - new_idxes[self.index][4].wait() # Make sure UpdateVersions processed the tag - - with tags_comps_docs_lock: - tags_comps_docs[0] += 1 - - self.update_compatibles_bindings(new_idxes[self.index][0]) - - self.index += self.inc - - with tags_comps_docs_lock: - tags_comps_docs[1] += 1 - progress('comps_docs: Thread ' + str(tags_comps_docs[1]) + '/' + str(self.inc) + ' finished', tags_comps_docs[0]) - - def update_compatibles_bindings(self, idxes): - global hash_file_lock, comps_lock, comps_docs_lock, tags_comps_docs, bindings_idxes - - for idx in idxes: - if idx % 1000 == 0: progress('comps_docs: ' + str(idx), tags_comps_docs[0]) - - if not idx in bindings_idxes: # Parse only bindings doc files - continue - - with hash_file_lock: - hash = db.hash.get(idx) - - family = 'B' - lines = compatibles_parser.run(scriptLines('get-blob', hash), family) - comps_docs = {} - with comps_lock: - for l in lines: - ident, line = l.split(' ') - - if db.comps.exists(ident): - if ident in comps_docs: - comps_docs[ident] += ',' + str(line) - else: - comps_docs[ident] = str(line) - - with comps_docs_lock: - for ident, lines in comps_docs.items(): - if db.comps_docs.exists(ident): - obj = db.comps_docs.get(ident) - else: - obj = data.RefList() - - obj.append(idx, lines, family) - if verbose: - print(f"comps_docs: {ident} in #{idx} @ {line}") - db.comps_docs.put(ident, obj) - - -def progress(msg, current): - print('{} - {} ({:.1%})'.format(project, msg, current/num_tags)) - - -# Main - -# Check number of threads arg -if len(argv) >= 2 and argv[1].isdigit() : - cpu = int(argv[1]) - - if cpu < 5 : - cpu = 5 - -# Distribute threads among functions using the following rules : -# There are more (or equal) refs threads than others -# There are more (or equal) defs threads than docs or comps threads -# Example : if cpu=6 : defs=1, refs=2, docs=1, comps=1, comps_docs=1 -# if cpu=7 : defs=2, refs=2, docs=1, comps=1, comps_docs=1 -# if cpu=8 : defs=2, refs=3, docs=1, comps=1, comps_docs=1 -# if cpu=11: defs=2, refs=3, docs=2, comps=2, comps_docs=2 -quo, rem = divmod(cpu, 5) -num_th_refs = quo -num_th_defs = quo -num_th_docs = quo - -# If DT bindings support is enabled, use $quo threads for each of the 2 threads -# Otherwise add them to the remaining threads -if dts_comp_support: - num_th_comps = quo - num_th_comps_docs = quo -else : - num_th_comps = 0 - num_th_comps_docs = 0 - rem += 2*quo - -quo, rem = divmod(rem, 2) -num_th_defs += quo -num_th_refs += quo + rem - -tag_buf = [] -for tag in scriptLines('list-tags'): - if not db.vers.exists(tag): - tag_buf.append(tag) - -num_tags = len(tag_buf) -project = lib.currentProject() - -print(project + ' - found ' + str(num_tags) + ' new tags') - -if not num_tags: - # Backward-compatibility: generate defs caches if they are empty. 
- if db.defs_cache['C'].db.stat()['nkeys'] == 0: - generate_defs_caches() - exit(0) - -threads_list.append(UpdateIds(tag_buf)) -threads_list.append(UpdateVersions(tag_buf)) - -# Define defs threads -for i in range(num_th_defs): - threads_list.append(UpdateDefs(i, num_th_defs)) -# Define refs threads -for i in range(num_th_refs): - threads_list.append(UpdateRefs(i, num_th_refs)) -# Define docs threads -for i in range(num_th_docs): - threads_list.append(UpdateDocs(i, num_th_docs)) -# Define comps threads -for i in range(num_th_comps): - threads_list.append(UpdateComps(i, num_th_comps)) -# Define comps_docs threads -for i in range(num_th_comps_docs): - threads_list.append(UpdateCompsDocs(i, num_th_comps_docs)) - - -# Start to process tags -threads_list[0].start() - -# Wait until the first tag is ready -with tag_ready: - tag_ready.wait() - -# Start remaining threads -for i in range(1, len(threads_list)): - threads_list[i].start() - -# Make sure all threads finished -for i in range(len(threads_list)): - threads_list[i].join() diff --git a/utils/index b/utils/index index 6e84a3e7..61250a22 100755 --- a/utils/index +++ b/utils/index @@ -59,14 +59,10 @@ project_fetch() { # $1 is the project path (parent of data/ and repo/). project_index() { - if test -z "$ELIXIR_THREADS"; then - ELIXIR_THREADS="$(nproc)" - fi - elixir_sources="$(dirname "$(dirname "$0")")" LXR_REPO_DIR=$1/repo LXR_DATA_DIR=$1/data \ - python3 "$elixir_sources/update.py" $ELIXIR_THREADS + python3 -m elixir.update } # $1 is the Elixir root data path.
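
One detail worth spelling out from the top of this patch: the CACHESIZE tuple added to elixir/data.py follows the Berkeley DB convention of (gigabytes, bytes), and BsdDB applies it through set_cachesize() between constructing the DB handle and opening it, because the cache size can only be set before open(). A minimal sketch of that call sequence; the database path is hypothetical:

import berkeleydb

CACHESIZE = (2, 0)          # 2 GB + 0 bytes, as in elixir/data.py

db = berkeleydb.db.DB()
db.set_cachesize(CACHESIZE[0], CACHESIZE[1])        # only valid before open()
db.open('/tmp/example.db', flags=berkeleydb.db.DB_CREATE,
        mode=0o644, dbtype=berkeleydb.db.DB_BTREE)
db.put(b'key', b'value')
assert db.get(b'key') == b'value'
db.close()
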