Skip to content

Commit

Permalink
bug(bq,sf,rs|clustering):ST_CLUSTERKMEANS remove duplicated coords (#491
Browse files Browse the repository at this point in the history
)
  • Loading branch information
vdelacruzb authored Apr 4, 2024
1 parent f2920b7 commit 8e0a749
Show file tree
Hide file tree
Showing 13 changed files with 42 additions and 19 deletions.
4 changes: 4 additions & 0 deletions clouds/bigquery/modules/doc/clustering/ST_CLUSTERKMEANS.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@ Takes a set of points as input and partitions them into clusters using the k-mea
* `geog`: `ARRAY<GEOGRAPHY>` points to be clustered.
* `numberOfClusters`: `INT64`|`NULL` number of clusters that will be generated. If `NULL`, the default value `Math.sqrt(<NUMBER OF POINTS>/2)` is used.

````hint:info
Duplicated points are removed from the input array before clustering, so the resulting geometries are unique.
````

**Return type**

`ARRAY<STRUCT<cluster INT64, geom GEOGRAPHY>>`
Expand Down
9 changes: 6 additions & 3 deletions clouds/bigquery/modules/sql/clustering/ST_CLUSTERKMEANS.sql
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
----------------------------
-- Copyright (C) 2021 CARTO
----------------------------
--------------------------------
-- Copyright (C) 2021-2024 CARTO
--------------------------------

CREATE OR REPLACE FUNCTION `@@BQ_DATASET@@.__CLUSTERKMEANS`
(geojson ARRAY<STRING>, numberOfClusters INT64)
Expand All @@ -15,8 +15,11 @@ AS """
const options = {};
if (numberOfClusters != null) {
options.numberOfClusters = Number(numberOfClusters);
} else {
options.numberOfClusters = parseInt(Math.sqrt(geojson.length/2))
}
options.mutate = true;
geojson = Array.from(new Set(geojson));
const featuresCollection = lib.clustering.featureCollection(geojson.map(x => lib.clustering.feature(JSON.parse(x))));
lib.clustering.clustersKmeans(featuresCollection, options);
const cluster = [];
Expand Down

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

13 changes: 10 additions & 3 deletions clouds/redshift/libraries/python/lib/clustering/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,13 @@ def load_geom(geom):
geom = json.dumps(_geom)
return loads(geom)

def remove_duplicated_coords(arr):
    """Return the rows of ``arr`` with duplicates dropped, keeping first occurrences.

    Rows are compared by shape and by element value, and the relative order of
    the surviving rows is the order in which they first appear in ``arr``.

    :param arr: iterable of coordinate rows (e.g. a 2-D numpy array).
    :return: ``numpy.ndarray`` built from the unique rows; an empty input
        yields an empty array.
    """
    import numpy as np

    # A hash set of row keys makes the membership test O(1) per row instead
    # of re-comparing against every previously kept row.
    seen = set()
    unique_rows = []
    for row in arr:
        row_arr = np.asarray(row)
        # Shape is part of the key so rows of different length never collide.
        key = (row_arr.shape, tuple(row_arr.ravel().tolist()))
        if key not in seen:
            seen.add(key)
            unique_rows.append(row)
    return np.array(unique_rows)

def clusterkmeanstable(geom, k):
from .kmeans import KMeans
Expand Down Expand Up @@ -39,14 +46,14 @@ def clusterkmeans(geom, k):
if geom.type != 'MultiPoint':
raise Exception('Invalid operation: Input points parameter must be MultiPoint.')
else:
coords = np.array(list(geojson.utils.coords(geom)))
coords = remove_duplicated_coords(np.array(list(geojson.utils.coords(geom))))
cluster_idxs, centers, loss = KMeans()(coords, k)
return geojson.dumps(
[
{
'cluster': cluster_idxs[idx],
'geom': {'coordinates': point, 'type': 'Point'},
'geom': {'coordinates': point.tolist(), 'type': 'Point'},
}
for idx, point in enumerate(geom['coordinates'])
for idx, point in enumerate(coords)
]
)
4 changes: 4 additions & 0 deletions clouds/redshift/modules/doc/clustering/ST_CLUSTERKMEANS.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@ Takes a set of points as input and partitions them into clusters using the k-mea
* `geog`: `GEOMETRY` points to be clustered.
* `numberOfClusters` (optional): `INT` number of clusters that will be generated. It defaults to the square root of half the number of points (`sqrt(<NUMBER OF POINTS>/2)`).

````hint:info
Duplicated points are removed from the input multipoint before clustering, so the resulting geometries are unique.
````

**Return type**

`SUPER`: containing objects with `cluster` as the cluster id and `geom` as the geometry in GeoJSON format.
Expand Down
6 changes: 3 additions & 3 deletions clouds/redshift/modules/sql/clustering/ST_CLUSTERKMEANS.sql
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
----------------------------
-- Copyright (C) 2021 CARTO
----------------------------
--------------------------------
-- Copyright (C) 2021-2024 CARTO
--------------------------------

CREATE OR REPLACE FUNCTION @@RS_SCHEMA@@.__CLUSTERKMEANS
(geom VARCHAR(MAX), numberofClusters INT)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
[{"cluster":0,"geom":{"type":"Point","coordinates":[0.0,0.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[0.0,1.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[5.0,0.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[1.0,0.0]}}]
[{"cluster":0,"geom":{"type":"Point","coordinates":[0.0,0.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[0.0,1.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[5.0,0.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[1.0,0.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[0.0,1.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[5.0,0.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[1.0,19.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[12.0,1.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[9.0,2.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[1.0,10.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[-3.0,1.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[5.0,5.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[8.0,6.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[10.0,10.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[-3.0,-5.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[6.0,5.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[-8.0,9.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[1.0,-10.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[2.0,-2.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[0.0,0.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[3.0,10.0]}}]
[{"cluster":0,"geom":{"type":"Point","coordinates":[0.0,0.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[0.0,1.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[5.0,0.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[1.0,0.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[0.0,1.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[5.0,0.0]}},{"cluster":4,"geom":{"type":"Point","coordinates":[1.0,19.0]}},{"cluster":3,"geom":{"type":"Point","coordinates":[12.0,1.0]}},{"cluster":3,"geom":{"type":"Point","coordinates":[9.0,2.0]}},{"cluster":4,"geom":{"type":"Point","coordinates":[1.0,10.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[-3.0,1.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[5.0,5.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[8.0,6.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[10.0,10.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[-3.0,-5.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[6.0,5.0]}},{"cluster":4,"geom":{"type":"Point","coordinates":[-8.0,9.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[1.0,-10.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[2.0,-2.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[0.0,0.0]}},{"cluster":4,"geom":{"type":"Point","coordinates":[3.0,10.0]}}]
[{"cluster":2,"geom":{"type":"Point","coordinates":[0.0,0.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[0.0,1.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[5.0,0.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[1.0,0.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[1.0,19.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[12.0,1.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[9.0,2.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[1.0,10.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[-3.0,1.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[5.0,5.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[8.0,6.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[10.0,10.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[-3.0,-5.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[6.0,5.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[-8.0,9.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[1.0,-10.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[2.0,-2.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[3.0,10.0]}}]
[{"cluster":3,"geom":{"type":"Point","coordinates":[0.0,0.0]}},{"cluster":3,"geom":{"type":"Point","coordinates":[0.0,1.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[5.0,0.0]}},{"cluster":3,"geom":{"type":"Point","coordinates":[1.0,0.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[1.0,19.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[12.0,1.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[9.0,2.0]}},{"cluster":4,"geom":{"type":"Point","coordinates":[1.0,10.0]}},{"cluster":3,"geom":{"type":"Point","coordinates":[-3.0,1.0]}},{"cluster":4,"geom":{"type":"Point","coordinates":[5.0,5.0]}},{"cluster":4,"geom":{"type":"Point","coordinates":[8.0,6.0]}},{"cluster":4,"geom":{"type":"Point","coordinates":[10.0,10.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[-3.0,-5.0]}},{"cluster":4,"geom":{"type":"Point","coordinates":[6.0,5.0]}},{"cluster":3,"geom":{"type":"Point","coordinates":[-8.0,9.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[1.0,-10.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[2.0,-2.0]}},{"cluster":4,"geom":{"type":"Point","coordinates":[3.0,10.0]}}]
4 changes: 4 additions & 0 deletions clouds/snowflake/modules/doc/clustering/ST_CLUSTERKMEANS.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@ Takes a set of points as input and partitions them into clusters using the k-mea
* `geojsons`: `ARRAY` points to be clustered.
* `numberOfClusters` (optional): `INT` number of clusters that will be generated. By default `numberOfClusters` is `Math.sqrt(<NUMBER OF POINTS>/2)`.

````hint:info
Duplicated points are removed from the input array before clustering, so the resulting geometries are unique.
````

**Return type**

`ARRAY`: containing objects with `cluster`, as the cluster id, and `geom`, as the geometry geojson.
Expand Down
7 changes: 4 additions & 3 deletions clouds/snowflake/modules/sql/clustering/ST_CLUSTERKMEANS.sql
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
----------------------------
-- Copyright (C) 2021 CARTO
----------------------------
--------------------------------
-- Copyright (C) 2021-2024 CARTO
--------------------------------

CREATE OR REPLACE FUNCTION @@SF_SCHEMA@@._CLUSTERKMEANS
(geojsons ARRAY, numberOfClusters DOUBLE)
Expand All @@ -17,6 +17,7 @@ AS $$
const options = {};
options.numberOfClusters = Number(NUMBEROFCLUSTERS);
options.mutate = true;
GEOJSONS = Array.from(new Set(GEOJSONS));
const featuresCollection = clusteringLib.featureCollection(GEOJSONS.map(x => clusteringLib.feature(JSON.parse(x))));
clusteringLib.clustersKmeans(featuresCollection, options);
const cluster = [];
Expand Down

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 8e0a749

Please sign in to comment.