-
Notifications
You must be signed in to change notification settings - Fork 491
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #438 from ApsaraDB/POLARDB_11_DEV
merge: 20231016
- Loading branch information
Showing
91 changed files
with
18,129 additions
and
8 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
# Generated subdirectories | ||
/log/ | ||
/results/ | ||
/tmp_check/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
|
||
MODULE_big = smlar | ||
OBJS = smlar.o smlar_gist.o smlar_gin.o smlar_cache.o \ | ||
tsarr.o smlar_guc.o smlar_stat.o | ||
|
||
EXTENSION = smlar | ||
DATA = smlar--1.0.sql smlar--unpackaged--1.0.sql | ||
REGRESS = smlar int2 int4 int8 float4 float8 money oid \ | ||
timestamp timestamptz time timetz date interval \ | ||
macaddr inet cidr \ | ||
text varchar char bytea bit varbit numeric \ | ||
int4g int8g intervalg textg \ | ||
int4i int8i intervali texti \ | ||
composite_int4 composite_text | ||
|
||
ifdef USE_PGXS | ||
PG_CONFIG = pg_config | ||
PGXS := $(shell $(PG_CONFIG) --pgxs) | ||
include $(PGXS) | ||
else | ||
subdir = external/smlar | ||
top_builddir = ../.. | ||
include $(top_builddir)/src/Makefile.global | ||
include $(top_srcdir)/contrib/contrib-global.mk | ||
endif | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,116 @@ | ||
float4 smlar(anyarray, anyarray) | ||
- computes similarity of two arrays. Arrays should be of the same type. | ||
|
||
float4 smlar(anyarray, anyarray, bool useIntersect) | ||
- computes similarity of two arrays of composite types. A composite type looks like: | ||
CREATE TYPE type_name AS (element_name anytype, weight_name FLOAT4); | ||
The useIntersect option specifies that only intersecting elements are used in the denominator; | ||
see examples in sql/composite_int4.sql or sql/composite_text.sql | ||
|
||
float4 smlar( anyarray a, anyarray b, text formula ); | ||
- computes similarity of two arrays by the given formula; arrays should | ||
be of the same type. | ||
Predefined variables in formula: | ||
N.i - number of common elements in both arrays (intersection) | ||
N.a - number of unique elements in the first array | ||
N.b - number of unique elements in the second array | ||
Example: | ||
smlar('{1,4,6}'::int[], '{5,4,6}' ) | ||
smlar('{1,4,6}'::int[], '{5,4,6}', 'N.i / sqrt(N.a * N.b)' ) | ||
These calls are equivalent. | ||
|
||
anyarray % anyarray | ||
- returns true if the similarity of the two arrays is greater than the limit | ||
|
||
float4 show_smlar_limit() - deprecated | ||
- shows the limit for % operation | ||
|
||
float4 set_smlar_limit(float4) - deprecated | ||
- sets the limit for % operation | ||
|
||
Use the GUC variable smlar.threshold (see below) instead of | ||
show_smlar_limit/set_smlar_limit | ||
|
||
|
||
text[] tsvector2textarray(tsvector) | ||
- transforms tsvector type to text array | ||
|
||
anyarray array_unique(anyarray) | ||
- sorts the array and removes duplicate elements | ||
|
||
float4 inarray(anyarray, anyelement) | ||
- returns zero if the second argument is not present in the first one, | ||
and 1.0 otherwise | ||
|
||
float4 inarray(anyarray, anyelement, float4, float4) | ||
- returns the fourth argument if the second argument is not present in | ||
the first one, and the third argument otherwise | ||
|
||
GUC configuration variables: | ||
|
||
smlar.threshold FLOAT | ||
Arrays with similarity lower than the threshold are not considered similar | ||
by the % operation | ||
|
||
smlar.persistent_cache BOOL | ||
Cache of global stat is stored in transaction-independent memory | ||
|
||
smlar.type STRING | ||
Type of similarity formula: cosine(default), tfidf, overlap | ||
|
||
smlar.stattable STRING | ||
Name of the table that stores set-wide statistics. The table should be | ||
defined as | ||
CREATE TABLE table_name ( | ||
value data_type UNIQUE, | ||
ndoc int4 (or bigint) NOT NULL CHECK (ndoc>0) | ||
); | ||
A row with a null value holds the total number of documents. | ||
See examples in sql/*g.sql files | ||
Note: used only for smlar.type = 'tfidf' | ||
|
||
smlar.tf_method STRING | ||
Calculation method for term frequency. Values: | ||
"n" - simple counting of entries (default) | ||
"log" - 1 + log(n) | ||
"const" - TF is equal to 1 | ||
Note: used only for smlar.type = 'tfidf' | ||
|
||
smlar.idf_plus_one BOOL | ||
If false (default), calculate idf as log(d/df), | ||
if true - as log(1+d/df) | ||
Note: used only for smlar.type = 'tfidf' | ||
|
||
The module provides several GUC variables (e.g. smlar.threshold); it is highly | ||
recommended to add the following to postgresql.conf: | ||
custom_variable_classes = 'smlar' # list of custom variable class names | ||
smlar.threshold = 0.6 #or any other value > 0 and < 1 | ||
and other smlar.* variables | ||
|
||
GiST/GIN support for % and && operations for: | ||
Array Type | GIN operator class | GiST operator class | ||
---------------+----------------------+---------------------- | ||
bit[] | _bit_sml_ops | | ||
bytea[] | _bytea_sml_ops | _bytea_sml_ops | ||
char[] | _char_sml_ops | _char_sml_ops | ||
cidr[] | _cidr_sml_ops | _cidr_sml_ops | ||
date[] | _date_sml_ops | _date_sml_ops | ||
float4[] | _float4_sml_ops | _float4_sml_ops | ||
float8[] | _float8_sml_ops | _float8_sml_ops | ||
inet[] | _inet_sml_ops | _inet_sml_ops | ||
int2[] | _int2_sml_ops | _int2_sml_ops | ||
int4[] | _int4_sml_ops | _int4_sml_ops | ||
int8[] | _int8_sml_ops | _int8_sml_ops | ||
interval[] | _interval_sml_ops | _interval_sml_ops | ||
macaddr[] | _macaddr_sml_ops | _macaddr_sml_ops | ||
money[] | _money_sml_ops | | ||
numeric[] | _numeric_sml_ops | _numeric_sml_ops | ||
oid[] | _oid_sml_ops | _oid_sml_ops | ||
text[] | _text_sml_ops | _text_sml_ops | ||
time[] | _time_sml_ops | _time_sml_ops | ||
timestamp[] | _timestamp_sml_ops | _timestamp_sml_ops | ||
timestamptz[] | _timestamptz_sml_ops | _timestamptz_sml_ops | ||
timetz[] | _timetz_sml_ops | _timetz_sml_ops | ||
varbit[] | _varbit_sml_ops | | ||
varchar[] | _varchar_sml_ops | _varchar_sml_ops | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,105 @@ | ||
set extra_float_digits =0; | ||
SELECT set_smlar_limit(0.6); | ||
set_smlar_limit | ||
----------------- | ||
0.6 | ||
(1 row) | ||
|
||
SELECT | ||
t, | ||
ARRAY( | ||
SELECT | ||
v::int4::bit(10) | ||
FROM | ||
generate_series(1, t) as v | ||
) AS v | ||
INTO test_bit | ||
FROM | ||
generate_series(1, 200) as t; | ||
SELECT t, smlar(v, '{10,9,8,7,6,5,4,3,2,1}'::int4[]::bit(10)[]) AS s FROM test_bit WHERE v % '{10,9,8,7,6,5,4,3,2,1}'::int4[]::bit(10)[] ORDER BY s DESC, t; | ||
t | s | ||
----+---------- | ||
10 | 1 | ||
11 | 0.953463 | ||
9 | 0.948683 | ||
12 | 0.912871 | ||
8 | 0.894427 | ||
13 | 0.877058 | ||
14 | 0.845154 | ||
7 | 0.83666 | ||
15 | 0.816497 | ||
16 | 0.790569 | ||
6 | 0.774597 | ||
17 | 0.766965 | ||
18 | 0.745356 | ||
19 | 0.725476 | ||
5 | 0.707107 | ||
20 | 0.707107 | ||
21 | 0.690066 | ||
22 | 0.6742 | ||
23 | 0.65938 | ||
24 | 0.645497 | ||
4 | 0.632456 | ||
25 | 0.632456 | ||
26 | 0.620174 | ||
27 | 0.608581 | ||
(24 rows) | ||
|
||
SELECT t, smlar(v, '{50,49,8,7,6,5,4,33,2,1}'::int4[]::bit(10)[]) AS s FROM test_bit WHERE v % '{50,49,8,7,6,5,4,33,2,1}'::int4[]::bit(10)[] ORDER BY s DESC, t; | ||
t | s | ||
----+---------- | ||
8 | 0.782624 | ||
9 | 0.737865 | ||
7 | 0.717137 | ||
10 | 0.7 | ||
11 | 0.667424 | ||
6 | 0.645497 | ||
12 | 0.63901 | ||
13 | 0.613941 | ||
(8 rows) | ||
|
||
CREATE INDEX idx_test_bit ON test_bit USING gin (v _bit_sml_ops); | ||
SET enable_seqscan=off; | ||
SELECT t, smlar(v, '{10,9,8,7,6,5,4,3,2,1}'::int4[]::bit(10)[]) AS s FROM test_bit WHERE v % '{10,9,8,7,6,5,4,3,2,1}'::int4[]::bit(10)[] ORDER BY s DESC, t; | ||
t | s | ||
----+---------- | ||
10 | 1 | ||
11 | 0.953463 | ||
9 | 0.948683 | ||
12 | 0.912871 | ||
8 | 0.894427 | ||
13 | 0.877058 | ||
14 | 0.845154 | ||
7 | 0.83666 | ||
15 | 0.816497 | ||
16 | 0.790569 | ||
6 | 0.774597 | ||
17 | 0.766965 | ||
18 | 0.745356 | ||
19 | 0.725476 | ||
5 | 0.707107 | ||
20 | 0.707107 | ||
21 | 0.690066 | ||
22 | 0.6742 | ||
23 | 0.65938 | ||
24 | 0.645497 | ||
4 | 0.632456 | ||
25 | 0.632456 | ||
26 | 0.620174 | ||
27 | 0.608581 | ||
(24 rows) | ||
|
||
SELECT t, smlar(v, '{50,49,8,7,6,5,4,33,2,1}'::int4[]::bit(10)[]) AS s FROM test_bit WHERE v % '{50,49,8,7,6,5,4,33,2,1}'::int4[]::bit(10)[] ORDER BY s DESC, t; | ||
t | s | ||
----+---------- | ||
8 | 0.782624 | ||
9 | 0.737865 | ||
7 | 0.717137 | ||
10 | 0.7 | ||
11 | 0.667424 | ||
6 | 0.645497 | ||
12 | 0.63901 | ||
13 | 0.613941 | ||
(8 rows) | ||
|
||
SET enable_seqscan=on; |
Oops, something went wrong.