From 3da42be58ec01bfe95acb2fef04537321de003ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ciar=C3=A1n=20Sch=C3=BCtte?= Date: Thu, 30 May 2024 17:22:55 -0400 Subject: [PATCH 1/2] Create search.md --- design/search.md | 147 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 147 insertions(+) create mode 100644 design/search.md diff --git a/design/search.md b/design/search.md new file mode 100644 index 000000000..c8491b23f --- /dev/null +++ b/design/search.md @@ -0,0 +1,147 @@ + + + + +![[VPCzJyCm48Pt_ufJftP0x1bGAuIgIhG22XDYEE9hQicnW-qW-FVu8qT2HTlfSkzpxtrONVg0BlIj5bW7ww3yNZnnY3v_YIvYgbOTcW2pbNDe6d8LR56PMOHoS0wwjUQWceoLy1ou9tJrCOCbl0oEninVjjzPIJ1trDf0YrILCqA8lE_LJLu2edkw2N1Tr7C-wiM-WYVwwCa7gFCt79GcKRI_5Ce1yV2h3stuAcpEsOrH (1).svg]] + + + +# Main + +## Prerequisites +- Local data must be configured +- ENV flag passed into process +- "search" config file present + +## Config +- Read "search config" from file +- Config must contain the following for reach node: + +| Name | Type | Note | +| ------------- | ------ | ------------------------------------------------------------------------------------------------------------------------------- | +| endpoint | string | Full url to gql endpoint | +| arrangerField | string | Endpoints can have other data on the root object. Need to get specific Arranger config object. Appears to be "file" by default. | +## Request Nodes +Outline: +- GQL HTTP calls to endpoints +- Specify only returning "arrangerField" in gql query +- Handles network level errors eg. node not found + +Query step: +- Read `aggregations` field which Arranger generates by default (`aggsState` isn't always generated - only for UI?) +- Get field name and type eg. 
`donor_specimen_sample: NumericAggregations` + +Sample query: +```graphql +{ + # This is the root arranger type + RootType: __type(name:"file"){ + name + fields { + name + type { + name + } + } + } + # This is the data we are interested in, just aggregations + # input: typename from above query + Aggregations: __type(name:"fileAggregations"){ + name + fields { + name # field name + type { + name # type name for resolvers + } + } + } +} +``` + +Sample response: +```json +{ + "data": { + "RootType": { + "name": "file", // arranger root field + "fields": [ + { + "name": "aggregations", + "type": { + "name": "fileAggregations" + } + }, + { + "name": "configs", + "type": { + "name": "ConfigsWithState" + } + }, + { + "name": "hits", + "type": { + "name": "fileConnection" + } + }, + { + "name": "mapping", + "type": { + "name": "JSON" + } + } + ] + }, + "Aggregations": { + "name": "fileAggregations", + "fields": [ + { + "name": "analysis__analysis_id", // field name to merge + "type": { + "name": "Aggregations" // type for resolvers + } + }, + ... + { + "name": "analysis__analysis_version", + "type": { + "name": "NumericAggregations" + } + }, +``` + +## Merge Schemas +Creates a union of all schema types into stitched schema + +Introspection response object +```json +{ + "name": "analysis__analysis_version", + "type": { + "name": "NumericAggregations" + } +}, +``` + +GQL schema type: +```gql +{ + analysis__analysis_version: NumericAggregations +} +``` + +Process: +1. Create stitched schema `stitched` from local schema aggregation fields +3. For each *n* remote schema aggregations: + 1. iterate fields + 1. If `name` AND `type.name` exist in `stitched` - return + 2. 
else - add `[name]: [type.name]` to `stitched` + +Example: + +## Generate Resolvers + + + +but it's startup time so we have lots of time dont pre optimise + +name + type + version From 0dec18b3be68265dd0a74b42e355cc027da85ae6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ciar=C3=A1n=20Sch=C3=BCtte?= Date: Fri, 31 May 2024 03:40:40 -0400 Subject: [PATCH 2/2] Update search.md --- design/search.md | 197 +++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 175 insertions(+), 22 deletions(-) diff --git a/design/search.md b/design/search.md index c8491b23f..fc79d8595 100644 --- a/design/search.md +++ b/design/search.md @@ -1,12 +1,7 @@ - - -![[VPCzJyCm48Pt_ufJftP0x1bGAuIgIhG22XDYEE9hQicnW-qW-FVu8qT2HTlfSkzpxtrONVg0BlIj5bW7ww3yNZnnY3v_YIvYgbOTcW2pbNDe6d8LR56PMOHoS0wwjUQWceoLy1ou9tJrCOCbl0oEninVjjzPIJ1trDf0YrILCqA8lE_LJLu2edkw2N1Tr7C-wiM-WYVwwCa7gFCt79GcKRI_5Ce1yV2h3stuAcpEsOrH (1).svg]] - - - -# Main +# Federated Search +![VPCzJyCm48Pt_ufJftP0x1bGAuIgIhG22XDYEE9hQicnW-qW-FVu8qT2HTlfSkzpxtrONVg0BlIj5bW7ww3yNZnnY3v_YIvYgbOTcW2pbNDe6d8LR56PMOHoS0wwjUQWceoLy1ou9tJrCOCbl0oEninVjjzPIJ1trDf0YrILCqA8lE_LJLu2edkw2N1Tr7C-wiM-WYVwwCa7gFCt79GcKRI_5Ce1yV2h3stuAcpEsOrH (1)](https://github.com/overture-stack/arranger/assets/1486054/87bb0689-38c6-49eb-ab65-96696a04d86a) ## Prerequisites - Local data must be configured @@ -15,12 +10,24 @@ ## Config - Read "search config" from file -- Config must contain the following for reach node: +- Export object containing node data for use in gql endpoint + +Example Config provided: -| Name | Type | Note | +| Field | Type | Note | | ------------- | ------ | ------------------------------------------------------------------------------------------------------------------------------- | | endpoint | string | Full url to gql endpoint | | arrangerField | string | Endpoints can have other data on the root object. Need to get specific Arranger config object. Appears to be "file" by default. 
| + +Example Search Node data: + +| Field | Description | +| ------ | --------------------------- | +| url | gql endpoint | +| name | name of node eg. Toronto | +| schema | version of Arranger running | +| status | node status eg. "connected" | + ## Request Nodes Outline: - GQL HTTP calls to endpoints @@ -35,6 +42,7 @@ Sample query: ```graphql { # This is the root arranger type + # "file" is the Arranger field RootType: __type(name:"file"){ name fields { @@ -45,7 +53,7 @@ Sample query: } } # This is the data we are interested in, just aggregations - # input: typename from above query + # input: typename "fileAggregations" retrieved from previous query Aggregations: __type(name:"fileAggregations"){ name fields { @@ -59,7 +67,7 @@ Sample query: ``` Sample response: -```json +```js { "data": { "RootType": { @@ -110,22 +118,22 @@ Sample response: ``` ## Merge Schemas -Creates a union of all schema types into stitched schema +Creates a union of all schema types in a single stitched schema, including a per-search-node bucket breakdown. A configuration method should be provided so that admins can filter out or remap fields to their liking. Introspection response object ```json { - "name": "analysis__analysis_version", - "type": { - "name": "NumericAggregations" - } -}, + "name": "analysis__analysis_version", + "type": { + "name": "NumericAggregations" + } +} ``` GQL schema type: ```gql { - analysis__analysis_version: NumericAggregations + analysis__analysis_version: NumericAggregations } ``` @@ -133,15 +141,160 @@ Process: 1. Create stitched schema `stitched` from local schema aggregation fields 3. For each *n* remote schema aggregations: 1. iterate fields - 1. If `name` AND `type.name` exist in `stitched` - return + 1. If the `name` AND `type.name` pair already exists in `stitched` - return 2. 
else - add `[name]: [type.name]` to `stitched` Example: -## Generate Resolvers +Inputs - GQL responses from introspection query +Node A: +```js +// GQL resp +{ + "name": "fileAggregations", + "fields": [ + { + "name": "donors__gender", // field name to merge + "type": { + "name": "Aggregations" // type + } + } + ] +} +``` +Node B: +```js +{ + "name": "fileAggregations", + "fields": [ + { + "name": "donors__gender", // field name to merge + "type": { + "name": "Aggregations" // type + } + }, + { + "name": "donors__age", + "type": { + "name": "NumericAggregations" + } + } + ] +} +``` +Output - Stitched schema +```graphql +schema { + query { + network { + aggregations { + donors__gender { + search_node_agg: Aggregations + agg: Aggregations + } + donors__age { + search_node_agg: NumericAggregations + agg: NumericAggregations + } + } + } + } +} + +``` + +Sample responses (per node, then the merged response from the stitched schema): + +```js +// Node A response +{ + donor: { + aggregations: { + gender: { + __typename: "Aggregations", + buckets: [ + { + key: "Male", + bucket_count: 123, + }, + { + key: "Female", + bucket_count: 456, + }, + ], + }, + }, + }, +}; + +// Node B response +{ + donor: { + aggregations: { + gender: { + __typename: "Aggregations", + buckets: [ + { + key: "Male", + bucket_count: 789, + }, + { + key: "Female", + bucket_count: 234, + }, + ], + }, + }, + }, +}; + +// Full response +{ + donor: { + network: { + aggregations: { + gender: { + search_node_agg: { + buckets: [ + { + key: "Node A", + bucket_count: 579, // male + female + }, + { + key: "Node B", + bucket_count: 1023, // male + female + }, + ], + }, + agg: { + buckets: [ + { + key: "Male", + bucket_count: 912, // Node A male + Node B male + }, + { + key: "Female", + bucket_count: 690, // Node A female + Node B female + }, + ], + }, + }, + }, + }, + }, +}; + + +``` +## Generate Resolvers +![Untitled-2024-01-24-1548](https://github.com/overture-stack/arranger/assets/1486054/09ff0512-da31-4c51-b2a3-e7a161843edf) -but it's startup time so we have 
lots of time dont pre optimise +New resolvers are needed to aggregate the aggregates for all available aggregation types. +ref: https://github.com/overture-stack/arranger/blob/develop/modules/server/src/schema/Aggregations.js -name + type + version +- Individual search node data is queried (http) +- Apply data transforms based on the `__typename` field eg. `NumericAggregations` +- Add additional data eg. `search_node` breakdown of aggregate +- Return fully resolved request