Skip to content

Commit

Permalink
Merge pull request #6 from javstash/ja-search-support
Browse files Browse the repository at this point in the history
Ja search support
  • Loading branch information
javstash authored Jan 18, 2025
2 parents e25483b + 33a2f44 commit 8492e6e
Show file tree
Hide file tree
Showing 8 changed files with 259 additions and 29 deletions.
4 changes: 1 addition & 3 deletions docker/production/docker-compose.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
version: '3.8'

services:
postgres:
container_name: postgres
Expand All @@ -14,7 +12,7 @@ services:

stash-box:
container_name: stash-box
image: stashapp/stash-box:development
image: javstash/stash-box:development
restart: always
logging:
driver: "json-file"
Expand Down
43 changes: 36 additions & 7 deletions docker/production/postgres/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,12 +1,41 @@
# PostgreSQL image with CJK search support.
#
# Stage "build" compiles the pg_cjk_parser text search parser. The final stage
# copies those artifacts in and builds the pg_bktree and pg_bigm extensions.

# Major PostgreSQL version: selects the server-dev package and the versioned
# lib/extension directories. Keep it in step with POSTGRES_TAG.
ARG POSTGRES_VERSION=17
# Exact base image tag used by BOTH stages, so the parser is compiled against
# the same Debian release and system libraries it is later loaded with.
ARG POSTGRES_TAG=17.2-bookworm

FROM postgres:${POSTGRES_TAG} AS build
# Re-declare to bring the global default into this stage.
ARG POSTGRES_VERSION
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        gcc \
        icu-devtools \
        libicu-dev \
        make \
        postgresql-server-dev-${POSTGRES_VERSION} \
    && rm -rf /var/lib/apt/lists/*

# WORKDIR creates the directory if it is missing; no separate mkdir is needed.
WORKDIR /root/parser
COPY pg_cjk_parser.c \
     pg_cjk_parser.control \
     Makefile \
     pg_cjk_parser--0.0.1.sql \
     zht2zhs.h \
     /root/parser/
RUN make clean && make USE_PGXS=1 install

FROM postgres:${POSTGRES_TAG}

ARG POSTGRES_VERSION
# Trailing slashes make every destination an explicit directory.
COPY --from=build /root/parser/pg_cjk_parser.bc /usr/lib/postgresql/$POSTGRES_VERSION/lib/bitcode/
COPY --from=build /root/parser/pg_cjk_parser.so /usr/lib/postgresql/$POSTGRES_VERSION/lib/
COPY --from=build /root/parser/pg_cjk_parser--0.0.1.sql /usr/share/postgresql/$POSTGRES_VERSION/extension/
COPY --from=build /root/parser/pg_cjk_parser.control /usr/share/postgresql/$POSTGRES_VERSION/extension/

# Build and install pg_bktree and pg_bigm, then remove the toolchain and all
# downloaded sources in the same layer so none of it ships in the image.
#
# shared_preload_libraries is appended to postgresql.conf.sample, the template
# initdb copies into a new data directory. Writing to
# /var/lib/postgresql/data/postgresql.conf at build time does not work: that
# path is the base image's data volume and the file only exists after the
# entrypoint has run initdb at first container start.
RUN buildDeps="build-essential git libicu-dev make postgresql-server-dev-${POSTGRES_VERSION} wget" \
    && apt-get update \
    && apt-get install -y --no-install-recommends --reinstall ca-certificates $buildDeps \
    && git clone https://github.com/evirma/pg_bktree.git /usr/local/src/bktree \
    && make -C /usr/local/src/bktree USE_PGXS=1 \
    && make -C /usr/local/src/bktree USE_PGXS=1 install \
    && wget -O /usr/local/src/pg_bigm.tar.gz https://github.com/pgbigm/pg_bigm/archive/refs/tags/v1.2-20240606.tar.gz \
    && tar -xzf /usr/local/src/pg_bigm.tar.gz -C /usr/local/src \
    && make -C /usr/local/src/pg_bigm-1.2-20240606 USE_PGXS=1 \
    && make -C /usr/local/src/pg_bigm-1.2-20240606 USE_PGXS=1 install \
    && echo "shared_preload_libraries = 'pg_bigm'" >> /usr/share/postgresql/postgresql.conf.sample \
    && rm -rf /usr/local/src/bktree /usr/local/src/pg_bigm-1.2-20240606 /usr/local/src/pg_bigm.tar.gz \
    && apt-get purge -y --auto-remove $buildDeps \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

EXPOSE 5432
# Exec form: the base image's ENTRYPOINT (docker-entrypoint.sh) receives
# "postgres" directly instead of a "/bin/sh -c" wrapper, so stop signals reach
# the server process.
CMD ["postgres"]
106 changes: 106 additions & 0 deletions pkg/database/migrations/postgres-ja/1_ja_parser_postinstallation.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
-- Post-installation setup for pg_cjk_parser: register the parser, create the
-- config_2_gram_cjk text search configuration on top of it, and map every
-- token type to a dictionary.
CREATE EXTENSION pg_cjk_parser;

CREATE TEXT SEARCH PARSER public.pg_cjk_parser (
    START    = prsd2_cjk_start,
    GETTOKEN = prsd2_cjk_nexttoken,
    END      = prsd2_cjk_end,
    LEXTYPES = prsd2_cjk_lextype,
    HEADLINE = prsd2_cjk_headline
);

CREATE TEXT SEARCH CONFIGURATION public.config_2_gram_cjk (
    PARSER = pg_cjk_parser
);

-- Session-level default for the remainder of this migration.
SET default_text_search_config = 'public.config_2_gram_cjk';

-- Plain ASCII words are the only token type that gets English stemming.
ALTER TEXT SEARCH CONFIGURATION public.config_2_gram_cjk
    ADD MAPPING FOR asciiword
    WITH english_stem;

-- Every other token type is indexed as-is through the "simple" dictionary.
ALTER TEXT SEARCH CONFIGURATION public.config_2_gram_cjk
    ADD MAPPING FOR
        asciihword,
        cjk,
        email,
        entity,
        file,
        float,
        host,
        hword,
        hword_asciipart,
        hword_numpart,
        hword_part,
        int,
        numhword,
        numword,
        protocol,
        sfloat,
        tag,
        uint,
        url,
        url_path,
        version,
        word
    WITH simple;
97 changes: 97 additions & 0 deletions pkg/database/migrations/postgres-ja/2_ja_bigm_textsearch.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
-- Install pg_bigm when the migration runs as a superuser; otherwise do
-- nothing here.
DO $$
BEGIN
    IF current_setting('is_superuser') <> 'on' THEN
        RETURN;
    END IF;

    CREATE EXTENSION IF NOT EXISTS pg_bigm;
END
$$;

-- Rebuild the search indexes for CJK-aware matching.
--
-- Every DROP uses IF EXISTS: on any given database only one index of each
-- trgm/bigm pair can be expected to exist, and a plain DROP INDEX on a missing
-- index raises an error that aborts the migration.

-- From #35: scene full-text index, now built on config_2_gram_cjk.
-- NOTE(review): the earlier note on this statement said "use GIN", but the
-- index is created USING gist. Confirm which access method is intended.
DROP INDEX IF EXISTS scene_search_ts_idx;
CREATE INDEX scene_search_ts_idx ON scene_search USING gist (
    (
        to_tsvector('config_2_gram_cjk', COALESCE(scene_date, '')) ||
        to_tsvector('config_2_gram_cjk', studio_name) ||
        to_tsvector('config_2_gram_cjk', COALESCE(performer_names, '')) ||
        to_tsvector('config_2_gram_cjk', scene_title) ||
        to_tsvector('config_2_gram_cjk', COALESCE(scene_code, ''))
    )
);

-- From #2, gin_bigm_ops instead of gin_trgm_ops

DROP INDEX IF EXISTS name_trgm_idx;
DROP INDEX IF EXISTS name_bigm_idx;
CREATE INDEX name_bigm_idx ON "performers" USING GIN ("name" gin_bigm_ops);

-- From #12, gin_bigm_ops instead of gin_trgm_ops

DROP INDEX IF EXISTS disambiguation_trgm_idx;
DROP INDEX IF EXISTS disambiguation_bigm_idx;
CREATE INDEX disambiguation_bigm_idx ON "performers" USING GIN ("disambiguation" gin_bigm_ops);
DROP INDEX IF EXISTS performer_alias_trgm_idx;
DROP INDEX IF EXISTS performer_alias_bigm_idx;
CREATE INDEX performer_alias_bigm_idx ON "performer_aliases" USING GIN ("alias" gin_bigm_ops);

-- From #35 with the regex function around the scene title removed
--
-- Trigger function: copies a scene's title, date, code and studio name
-- (studio name, a space, then the parent studio name if there is one) into
-- its scene_search row whenever one of those inputs changes.
--
-- Change detection uses IS DISTINCT FROM so that a column moving between NULL
-- and a value counts as a change; with "!=" such a comparison yields NULL and
-- the search row would silently keep its stale values.

CREATE OR REPLACE FUNCTION update_scene() RETURNS TRIGGER AS $$
BEGIN
    IF (NEW.title IS DISTINCT FROM OLD.title
        OR NEW.date IS DISTINCT FROM OLD.date
        OR NEW.studio_id IS DISTINCT FROM OLD.studio_id
        -- NULL and '' are deliberately treated as the same code.
        OR COALESCE(NEW.code, '') != COALESCE(OLD.code, '')) THEN
        UPDATE scene_search
        SET
            scene_title = NEW.title,
            scene_date = NEW.date,
            studio_name = SUBQUERY.studio_name,
            scene_code = NEW.code
        FROM (
            SELECT S.id as sid, T.name || ' ' || CASE WHEN TP.name IS NOT NULL THEN (TP.name) ELSE '' END AS studio_name
            FROM scenes S
            JOIN studios T ON S.studio_id = T.id
            LEFT JOIN studios TP ON T.parent_studio_id = TP.id
        ) SUBQUERY
        WHERE scene_id = NEW.id
        AND scene_id = SUBQUERY.sid;
    END IF;
    RETURN NULL;
END;
$$ LANGUAGE plpgsql;

-- From #35 with the regex function around the scene title removed
--
-- Trigger function: adds a scene_search row for a new scene, taking the
-- studio name (plus parent studio name, if any) from the studios table.
-- No row is inserted when no studio matches NEW.studio_id.

CREATE OR REPLACE FUNCTION insert_scene() RETURNS TRIGGER AS $$
BEGIN
    INSERT INTO scene_search (
        scene_id,
        scene_title,
        scene_date,
        studio_name,
        scene_code
    )
    SELECT
        NEW.id,
        NEW.title,
        NEW.date,
        studio.name || ' ' || COALESCE(parent.name, ''),
        NEW.code
    FROM studios studio
    LEFT JOIN studios parent ON studio.parent_studio_id = parent.id
    WHERE studio.id = NEW.studio_id;

    RETURN NULL;
END;
$$ LANGUAGE plpgsql;


-- Rebuild scene_search from scratch.
TRUNCATE TABLE scene_search;

-- From #35 with the regex function around the scene title removed
--
-- One row per scene. performer_names is the space-joined list of performer
-- names followed by their scene_performers "as" values; NULL array entries
-- are rendered as empty strings.

INSERT INTO scene_search
SELECT
    S.id AS scene_id,
    S.title AS scene_title,
    S.date::TEXT AS scene_date,
    T.name || ' ' || COALESCE(TP.name, '') AS studio_name,
    ARRAY_TO_STRING(
        ARRAY_CAT(ARRAY_AGG(P.name), ARRAY_AGG(PS.as)),
        ' ',
        ''
    ) AS performer_names,
    S.code AS scene_code
FROM scenes S
LEFT JOIN scene_performers PS ON PS.scene_id = S.id
LEFT JOIN performers P ON PS.performer_id = P.id
LEFT JOIN studios T ON T.id = S.studio_id
LEFT JOIN studios TP ON T.parent_studio_id = TP.id
GROUP BY S.id, S.title, T.name, TP.name;

12 changes: 6 additions & 6 deletions pkg/sqlx/querybuilder_performer.go
Original file line number Diff line number Diff line change
Expand Up @@ -570,18 +570,18 @@ func (qb *performerQueryBuilder) SearchPerformers(term string, limit int) (model
query := `
SELECT P.* FROM (
SELECT id, SUM(similarity) AS score FROM (
SELECT P.id, similarity(P.name, $1) AS similarity
SELECT P.id, bigm_similarity(P.name, $1) AS similarity
FROM performers P
WHERE P.deleted = FALSE AND P.name % $1 AND similarity(P.name, $1) > 0.5
WHERE P.deleted = FALSE AND P.name =% $1 AND bigm_similarity(P.name, $1) > 0.5
UNION
SELECT P.id, (similarity(COALESCE(PA.alias, ''), $1) * 0.5) AS similarity
SELECT P.id, (bigm_similarity(COALESCE(PA.alias, ''), $1) * 0.5) AS similarity
FROM performers P
LEFT JOIN performer_aliases PA on PA.performer_id = P.id
WHERE P.deleted = FALSE AND PA.alias % $1 AND similarity(COALESCE(PA.alias, ''), $1) > 0.6
WHERE P.deleted = FALSE AND PA.alias =% $1 AND bigm_similarity(COALESCE(PA.alias, ''), $1) > 0.6
UNION
SELECT P.id, (similarity(COALESCE(P.disambiguation, ''), $1) * 0.3) AS similarity
SELECT P.id, (bigm_similarity(COALESCE(P.disambiguation, ''), $1) * 0.3) AS similarity
FROM performers P
WHERE P.deleted = FALSE AND P.disambiguation % $1 AND similarity(COALESCE(P.disambiguation), $1) > 0.7
WHERE P.deleted = FALSE AND P.disambiguation =% $1 AND bigm_similarity(COALESCE(P.disambiguation), $1) > 0.7
) A
GROUP BY id
ORDER BY score DESC
Expand Down
12 changes: 6 additions & 6 deletions pkg/sqlx/querybuilder_scene.go
Original file line number Diff line number Diff line change
Expand Up @@ -804,12 +804,12 @@ func (qb *sceneQueryBuilder) SearchScenes(term string, limit int) ([]*models.Sce
SELECT S.* FROM scenes S
LEFT JOIN scene_search SS ON SS.scene_id = S.id
WHERE (
to_tsvector('english', COALESCE(scene_date, '')) ||
to_tsvector('english', studio_name) ||
to_tsvector('english', COALESCE(performer_names, '')) ||
to_tsvector('english', scene_title) ||
to_tsvector('english', COALESCE(scene_code, ''))
) @@ websearch_to_tsquery('english', ?)
to_tsvector('config_2_gram_cjk', COALESCE(scene_date, '')) ||
to_tsvector('config_2_gram_cjk', studio_name) ||
to_tsvector('config_2_gram_cjk', COALESCE(performer_names, '')) ||
to_tsvector('config_2_gram_cjk', scene_title) ||
to_tsvector('config_2_gram_cjk', COALESCE(scene_code, ''))
) @@ websearch_to_tsquery('config_2_gram_cjk', ?)
AND S.deleted = FALSE
LIMIT ?`
var args []interface{}
Expand Down
8 changes: 4 additions & 4 deletions pkg/sqlx/querybuilder_studio.go
Original file line number Diff line number Diff line change
Expand Up @@ -239,14 +239,14 @@ func (qb *studioQueryBuilder) SearchStudios(term string, limit int) (models.Stud
query := `
SELECT S.* FROM (
SELECT id, SUM(similarity) AS score FROM (
SELECT S.id, similarity(S.name, $1) AS similarity
SELECT S.id, bigm_similarity(S.name, $1) AS similarity
FROM studios S
WHERE S.deleted = FALSE AND S.name % $1 AND similarity(S.name, $1) > 0.5
WHERE S.deleted = FALSE AND S.name =% $1 AND bigm_similarity(S.name, $1) > 0.5
UNION
SELECT S.id, (similarity(COALESCE(SA.alias, ''), $1) * 0.5) AS similarity
SELECT S.id, (bigm_similarity(COALESCE(SA.alias, ''), $1) * 0.5) AS similarity
FROM studios S
LEFT JOIN studio_aliases SA on SA.studio_id = S.id
WHERE S.deleted = FALSE AND SA.alias % $1 AND similarity(COALESCE(SA.alias, ''), $1) > 0.5
WHERE S.deleted = FALSE AND SA.alias =% $1 AND bigm_similarity(COALESCE(SA.alias, ''), $1) > 0.5
) A
GROUP BY id
ORDER BY score DESC
Expand Down
6 changes: 3 additions & 3 deletions pkg/sqlx/querybuilder_tag.go
Original file line number Diff line number Diff line change
Expand Up @@ -267,9 +267,9 @@ func (qb *tagQueryBuilder) SearchTags(term string, limit int) ([]*models.Tag, er
SELECT T.* FROM tags T
LEFT JOIN tag_aliases TA ON TA.tag_id = T.id
WHERE (
to_tsvector('english', T.name) ||
to_tsvector('english', COALESCE(TA.alias, ''))
) @@ plainto_tsquery($1)
to_tsvector('config_2_gram_cjk', T.name) ||
to_tsvector('config_2_gram_cjk', COALESCE(TA.alias, ''))
) @@ plainto_tsquery('config_2_gram_cjk', $1)
AND T.deleted = FALSE
GROUP BY T.id
ORDER BY T.name ASC
Expand Down

0 comments on commit 8492e6e

Please sign in to comment.