From fd2df021d78a640e6055ec297f0d621502089775 Mon Sep 17 00:00:00 2001 From: "FC (Fay) Stegerman" Date: Sun, 1 Sep 2024 01:48:41 +0200 Subject: [PATCH] port sqlite extension to libpcre2 --- .gitignore | 2 +- Makefile | 2 +- README.md | 4 +- jiten/jmdict.py | 6 +- jiten/kanji.py | 4 +- jiten/misc.py | 2 +- jiten/sql.py | 6 +- jiten/templates/_help.html | 2 +- setup.py | 6 +- sqlite3-pcre.c | 120 ---------------------------------- sqlite3-pcre2.c | 129 +++++++++++++++++++++++++++++++++++++ 11 files changed, 146 insertions(+), 137 deletions(-) delete mode 100644 sqlite3-pcre.c create mode 100644 sqlite3-pcre2.c diff --git a/.gitignore b/.gitignore index 36eb2e4..f23adf6 100644 --- a/.gitignore +++ b/.gitignore @@ -12,6 +12,6 @@ __pycache__ /htmlcov /jiten/.version -/jiten/_sqlite3_pcre.*.so +/jiten/_sqlite3_pcre*.so /tmp /tmp-html diff --git a/Makefile b/Makefile index b912fd8..bb4164b 100644 --- a/Makefile +++ b/Makefile @@ -54,7 +54,7 @@ coverage: tmp-html clean: cleanup rm -f jiten/res/*.sqlite3 - rm -f jiten/_sqlite3_pcre.*.so + rm -f jiten/_sqlite3_pcre*.so rm -fr jiten.egg-info/ $(MAKE) -C jiten/res/jmdict clean $(MAKE) -C jiten/res/sentences clean diff --git a/README.md b/README.md index 5a039ff..ed3956b 100644 --- a/README.md +++ b/README.md @@ -233,7 +233,7 @@ $ make * Python >= 3.8 + Flask + click + [kanjidraw](https://github.com/obfusk/kanjidraw). * To build the SQLite PCRE C extension: a C compiler (e.g. - `gcc`/`clang`) and the `sqlite3` & `pcre` libraries & headers. + `gcc`/`clang`) and the `sqlite3` & `pcre2` libraries & headers. * Basic build tools like `make` and `patch`. * To run the WebView GUI: [pywebview](https://pywebview.flowrl.com) >= 3.3.5. 
@@ -241,7 +241,7 @@ $ make ### Debian/Ubuntu ```bash -$ apt install python3-dev build-essential libsqlite3-dev libpcre3-dev +$ apt install python3-dev build-essential libsqlite3-dev libpcre2-dev $ apt install python3-flask # optional: Flask & click w/o pip ``` diff --git a/jiten/jmdict.py b/jiten/jmdict.py index 407019b..f6b22cf 100644 --- a/jiten/jmdict.py +++ b/jiten/jmdict.py @@ -235,7 +235,7 @@ from . import misc as M from . import pitch as P from .kana import katakana2hiragana -from .sql import sqlite_do, load_pcre_extension +from .sql import sqlite_do, load_pcre2_extension DBVERSION = 14 # NB: update this when data/schema changes SQLITE_FILE = M.resource_path("res/jmdict.sqlite3") @@ -678,7 +678,7 @@ def search(q, langs = [LANGS[0]], max_results = None, # {{{1 """.format(fltr, ordr, limit), dict(q = "%"+q+"%", si = sinfo)) # safe! elif M.q2like(q): - load_pcre_extension(c.connection) + load_pcre2_extension(c.connection) prms = dict(q = M.q2like(q), re = M.q2rx(q), si = sinfo) query = (""" SELECT rank, seq, jlpt FROM ( @@ -697,7 +697,7 @@ def search(q, langs = [LANGS[0]], max_results = None, # {{{1 {} {} {} """.format(fltr, ordr, limit), prms) # safe! else: - load_pcre_extension(c.connection) + load_pcre2_extension(c.connection) query = (""" SELECT rank, seq, jlpt FROM ( SELECT entry FROM kanji WHERE elem REGEXP :re diff --git a/jiten/kanji.py b/jiten/kanji.py index 788c8c6..b93716d 100644 --- a/jiten/kanji.py +++ b/jiten/kanji.py @@ -128,7 +128,7 @@ from . import jmdict as J from . 
import misc as M -from .sql import sqlite_do, load_pcre_extension +from .sql import sqlite_do, load_pcre2_extension SQLITE_FILE = M.resource_path("res/kanji.sqlite3") KANJIDIC_FILE = M.resource_path("res/jmdict/kanjidic2.xml.gz") @@ -407,7 +407,7 @@ def search(q, max_results = None, level = None, jlpt = None, for r in c.execute("SELECT * FROM entry WHERE code = ?", (ord(char),)): yield row2entry(r) # #=1 else: - load_pcre_extension(c.connection) + load_pcre2_extension(c.connection) for r in c.execute(f""" SELECT * FROM entry WHERE ( on_ REGEXP :re OR diff --git a/jiten/misc.py b/jiten/misc.py index a9fc57f..8dacae1 100644 --- a/jiten/misc.py +++ b/jiten/misc.py @@ -261,7 +261,7 @@ def download_file(url, file, sha512 = None, tmp = ".tmp"): # {{{1 ), libffi = dict(url = "https://github.com/libffi/libffi"), - libpcre = dict(url = "https://www.pcre.org"), + libpcre2 = dict(url = "https://www.pcre.org"), openssl = dict(url = "https://www.openssl.org"), pyjnius = dict(url = "https://github.com/kivy/pyjnius"), python3 = dict(url = "https://www.python.org"), diff --git a/jiten/sql.py b/jiten/sql.py index b81abd6..c448fd5 100644 --- a/jiten/sql.py +++ b/jiten/sql.py @@ -44,9 +44,9 @@ def sqlite_do(file, write = False): finally: conn.close() -def load_pcre_extension(conn): - spec = importlib.util.find_spec("jiten._sqlite3_pcre") - if spec is None: raise RuntimeError("jiten._sqlite3_pcre not found") +def load_pcre2_extension(conn): + spec = importlib.util.find_spec("jiten._sqlite3_pcre2") + if spec is None: raise RuntimeError("jiten._sqlite3_pcre2 not found") conn.enable_load_extension(True) if hasattr(conn, "load_extension"): conn.load_extension(spec.origin) diff --git a/jiten/templates/_help.html b/jiten/templates/_help.html index e228067..3bf5278 100644 --- a/jiten/templates/_help.html +++ b/jiten/templates/_help.html @@ -240,7 +240,7 @@
target="_blank" rel="noopener">Python Regex Syntax.
For when the details matter: - PCRE Syntax (what jiten actually uses).
Prefix "Commands"
diff --git a/setup.py b/setup.py index bb8890c..dd5d1ec 100644 --- a/setup.py +++ b/setup.py @@ -10,8 +10,8 @@ clean = "clean" in sys.argv[1:] info = Path(__file__).with_name("README.md").read_text(encoding = "utf8") -pcre = setuptools.Extension("jiten._sqlite3_pcre", ["sqlite3-pcre.c"], - libraries = "pcre sqlite3".split()) +pcre2 = setuptools.Extension("jiten._sqlite3_pcre2", ["sqlite3-pcre2.c"], + libraries = "pcre2-8 sqlite3".split()) data = [ "res/jlpt/N" + l + "-" + x for l in "12345" for x in "kanji vocab-eng vocab-hiragana".split() ] \ @@ -98,5 +98,5 @@ python_requires = ">=3.8", install_requires = ["Flask", "click>=6.0", "kanjidraw>=0.2.1"], extras_require = dict(gui = ["pywebview>=3.3.5"]), - ext_modules = [pcre], + ext_modules = [pcre2], ) diff --git a/sqlite3-pcre.c b/sqlite3-pcre.c deleted file mode 100644 index 35efe64..0000000 --- a/sqlite3-pcre.c +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Written by Alexey Tourbin . - * Modified by FC Stegerman . - * - * The author has dedicated the code to the public domain. Anyone is free - * to copy, modify, publish, use, compile, sell, or distribute the original - * code, either in source code form or as a compiled binary, for any purpose, - * commercial or non-commercial, and by any means. 
- */ -#include -#include -#include -#include -#include -SQLITE_EXTENSION_INIT1 - -typedef struct { - char *s; - pcre *p; - pcre_extra *e; -} cache_entry; - -#ifndef CACHE_SIZE -#define CACHE_SIZE 16 -#endif - -static -void regexp(sqlite3_context *ctx, int argc, sqlite3_value **argv) -{ - const char *re, *str; - pcre *p; - pcre_extra *e; - - assert(argc == 2); - - re = (const char *) sqlite3_value_text(argv[0]); - if (!re) { - sqlite3_result_error(ctx, "[REGEXP] no regexp", -1); - return; - } - - str = (const char *) sqlite3_value_text(argv[1]); - if (!str) { - sqlite3_result_error(ctx, "[REGEXP] no string", -1); - return; - } - - /* simple LRU cache */ - { - int i; - int found = 0; - cache_entry *cache = sqlite3_user_data(ctx); - - assert(cache); - - for (i = 0; i < CACHE_SIZE && cache[i].s; i++) - if (strcmp(re, cache[i].s) == 0) { - found = 1; - break; - } - if (found) { - if (i > 0) { - cache_entry c = cache[i]; - memmove(cache + 1, cache, i * sizeof(cache_entry)); - cache[0] = c; - } - } - else { - cache_entry c; - const char *err; - int pos; - c.p = pcre_compile(re, PCRE_UTF8 | PCRE_UCP, &err, &pos, NULL); - if (!c.p) { - char *e2 = sqlite3_mprintf("[REGEXP] %s: %s (offset %d)", re, err, pos); - sqlite3_result_error(ctx, e2, -1); - sqlite3_free(e2); - return; - } - c.e = pcre_study(c.p, 0, &err); - c.s = strdup(re); - if (!c.s) { - sqlite3_result_error(ctx, "[REGEXP] strdup: ENOMEM", -1); - pcre_free(c.p); - pcre_free(c.e); - return; - } - i = CACHE_SIZE - 1; - if (cache[i].s) { - free(cache[i].s); - assert(cache[i].p); - pcre_free(cache[i].p); - pcre_free(cache[i].e); - } - memmove(cache + 1, cache, i * sizeof(cache_entry)); - cache[0] = c; - } - p = cache[0].p; - e = cache[0].e; - } - - { - int rc; - assert(p); - rc = pcre_exec(p, e, str, strlen(str), 0, 0, NULL, 0); - sqlite3_result_int(ctx, rc >= 0); - return; - } -} - -int sqlite3_extension_init(sqlite3 *db, char **err, const sqlite3_api_routines *api) -{ - SQLITE_EXTENSION_INIT2(api) - cache_entry *cache 
= calloc(CACHE_SIZE, sizeof(cache_entry)); - if (!cache) { - *err = "[REGEXP] calloc: ENOMEM"; - return 1; - } - sqlite3_create_function(db, "REGEXP", 2, SQLITE_UTF8, cache, regexp, NULL, NULL); - return 0; -}
diff --git a/sqlite3-pcre2.c b/sqlite3-pcre2.c
new file mode 100644
index 0000000..178bd06
--- /dev/null
+++ b/sqlite3-pcre2.c
@@ -0,0 +1,129 @@
+/*
+ * Written by Alexey Tourbin .
+ * Modified by FC (Fay) Stegerman .
+ *
+ * The author has dedicated the code to the public domain. Anyone is free
+ * to copy, modify, publish, use, compile, sell, or distribute the original
+ * code, either in source code form or as a compiled binary, for any purpose,
+ * commercial or non-commercial, and by any means.
+ */
+#define PCRE2_CODE_UNIT_WIDTH 8
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+#include <pcre2.h>
+#include <sqlite3ext.h>
+SQLITE_EXTENSION_INIT1
+
+typedef struct {
+    char *str;                     /* strdup'ed pattern text; NULL in an unused slot */
+    pcre2_code *code;              /* compiled form of str */
+    pcre2_match_data *match_data;  /* match block created from code */
+} cache_entry;
+
+#ifndef CACHE_SIZE
+#define CACHE_SIZE 16
+#endif
+
+static void regexp(sqlite3_context *ctx, int argc, sqlite3_value **argv) /* argv[0] = pattern, argv[1] = subject; result is 1 on match, else 0 */
+{
+    const char *re, *str;
+    pcre2_code *code;
+    pcre2_match_data *match_data;
+
+    assert(argc == 2);
+
+    re = (const char *) sqlite3_value_text(argv[0]);
+    if (re == NULL) {
+        sqlite3_result_error(ctx, "[REGEXP] no regexp", -1);
+        return;
+    }
+
+    str = (const char *) sqlite3_value_text(argv[1]);
+    if (str == NULL) {
+        sqlite3_result_error(ctx, "[REGEXP] no string", -1);
+        return;
+    }
+
+    /* simple LRU cache: slot 0 is the most recently used, the last slot is evicted */
+    {
+        int i;
+        int found = 0;
+        cache_entry *cache = sqlite3_user_data(ctx);
+
+        assert(cache != NULL);
+
+        for (i = 0; i < CACHE_SIZE && cache[i].str; i++) {
+            if (strcmp(re, cache[i].str) == 0) {
+                found = 1;
+                break;
+            }
+        }
+        if (found) {
+            if (i > 0) {
+                cache_entry c = cache[i];
+                memmove(cache + 1, cache, i * sizeof(cache_entry)); /* shift slots 0..i-1 down by one */
+                cache[0] = c;
+            }
+        }
+        else {
+            cache_entry c;
+            int err_no;
+            PCRE2_SIZE err_off;
+            c.code = pcre2_compile((PCRE2_SPTR)re, PCRE2_ZERO_TERMINATED,
+                                   PCRE2_UTF | PCRE2_UCP, &err_no, &err_off, NULL);
+            if (c.code == NULL) {
+                PCRE2_UCHAR err[256];
+                pcre2_get_error_message(err_no, err, sizeof(err));
+                char *e = sqlite3_mprintf("[REGEXP] %s: %s (offset %d)", re, err, (int)err_off); /* PCRE2_SIZE is size_t; %d takes an int */
+                sqlite3_result_error(ctx, e, -1);
+                sqlite3_free(e);
+                return;
+            }
+            c.match_data = pcre2_match_data_create_from_pattern(c.code, NULL);
+            if (c.match_data == NULL) {
+                sqlite3_result_error(ctx, "[REGEXP] pcre2_match_data_create_from_pattern: ENOMEM", -1);
+                pcre2_code_free(c.code);
+                return;
+            }
+            c.str = strdup(re);
+            if (c.str == NULL) {
+                sqlite3_result_error(ctx, "[REGEXP] strdup: ENOMEM", -1);
+                pcre2_match_data_free(c.match_data);
+                pcre2_code_free(c.code);
+                return;
+            }
+            i = CACHE_SIZE - 1;
+            if (cache[i].str) { /* cache is full: release the entry in the last slot */
+                free(cache[i].str);
+                assert(cache[i].code != NULL);
+                pcre2_match_data_free(cache[i].match_data);
+                pcre2_code_free(cache[i].code);
+            }
+            memmove(cache + 1, cache, i * sizeof(cache_entry)); /* make room at slot 0 */
+            cache[0] = c;
+        }
+        code = cache[0].code; /* both branches leave the wanted entry in slot 0 */
+        match_data = cache[0].match_data;
+    }
+
+    {
+        int rc;
+        assert(code != NULL);
+        rc = pcre2_match(code, (PCRE2_SPTR)str, PCRE2_ZERO_TERMINATED, 0, 0, match_data, NULL);
+        sqlite3_result_int(ctx, rc >= 0); /* any negative rc (no match or a matching error) yields 0 */
+        return;
+    }
+}
+
+int sqlite3_extension_init(sqlite3 *db, char **err, const sqlite3_api_routines *api) /* registers the two-argument REGEXP function on db */
+{
+    SQLITE_EXTENSION_INIT2(api)
+    cache_entry *cache = calloc(CACHE_SIZE, sizeof(cache_entry)); /* zero-filled, so every slot starts with str == NULL */
+    if (cache == NULL) {
+        *err = "[REGEXP] calloc: ENOMEM";
+        return 1;
+    }
+    sqlite3_create_function(db, "REGEXP", 2, SQLITE_UTF8, cache, regexp, NULL, NULL); /* cache is handed to regexp() via sqlite3_user_data() */
+    return 0;
+}