From 3da42be58ec01bfe95acb2fef04537321de003ca Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ciar=C3=A1n=20Sch=C3=BCtte?= <ciaran.schutte@oicr.on.ca>
Date: Thu, 30 May 2024 17:22:55 -0400
Subject: [PATCH 1/2] Create search.md

---
 design/search.md | 147 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 147 insertions(+)
 create mode 100644 design/search.md

diff --git a/design/search.md b/design/search.md
new file mode 100644
index 000000000..c8491b23f
--- /dev/null
+++ b/design/search.md
@@ -0,0 +1,147 @@
+
+
+
+
+![[VPCzJyCm48Pt_ufJftP0x1bGAuIgIhG22XDYEE9hQicnW-qW-FVu8qT2HTlfSkzpxtrONVg0BlIj5bW7ww3yNZnnY3v_YIvYgbOTcW2pbNDe6d8LR56PMOHoS0wwjUQWceoLy1ou9tJrCOCbl0oEninVjjzPIJ1trDf0YrILCqA8lE_LJLu2edkw2N1Tr7C-wiM-WYVwwCa7gFCt79GcKRI_5Ce1yV2h3stuAcpEsOrH (1).svg]]
+
+
+
+# Main
+
+## Prerequisites
+- Local data must be configured
+- ENV flag passed into process
+- "search" config file present
+  
+## Config
+- Read "search config" from file
+- Config must contain the following for reach node:
+
+| Name          | Type   | Note                                                                                                                            |
+| ------------- | ------ | ------------------------------------------------------------------------------------------------------------------------------- |
+| endpoint      | string | Full url to gql endpoint                                                                                                        |
+| arrangerField | string | Endpoints can have other data on the root object. Need to get specific Arranger config object. Appears to be "file" by default. |
+## Request Nodes
+Outline:
+- GQL HTTP calls to endpoints
+- Specify only returning "arrangerField" in gql query
+- Handles network level errors eg. node not found
+
+Query step:
+- Read `aggregations` field which Arranger generates by default (`aggsState` isn't always generated - only for UI?)
+- Get field name and type eg. `donor_specimen_sample: NumericAggregations`
+
+Sample query:
+```graphql
+{
+  # This is the root arranger type
+  RootType: __type(name:"file"){
+    name
+    fields {
+      name
+      type {
+        name
+      }
+    }
+  }
+  # This is the data we are interested in, just aggregations
+  # input: typename from above query
+  Aggregations: __type(name:"fileAggregations"){
+    name
+    fields {
+      name # field name
+      type {
+        name # type name for resolvers
+      }
+    }
+  }
+}
+```
+
+Sample response:
+```json
+{
+  "data": {
+    "RootType": {
+      "name": "file", // arranger root field
+      "fields": [
+        {
+          "name": "aggregations",
+          "type": {
+            "name": "fileAggregations"
+          }
+        },
+        {
+          "name": "configs",
+          "type": {
+            "name": "ConfigsWithState"
+          }
+        },
+        {
+          "name": "hits",
+          "type": {
+            "name": "fileConnection"
+          }
+        },
+        {
+          "name": "mapping",
+          "type": {
+            "name": "JSON"
+          }
+        }
+      ]
+    },
+    "Aggregations": {
+      "name": "fileAggregations",
+      "fields": [
+        {
+          "name": "analysis__analysis_id", // field name to merge
+          "type": {
+            "name": "Aggregations" // type for resolvers
+          }
+        },
+        ...
+        {
+          "name": "analysis__analysis_version",
+          "type": {
+            "name": "NumericAggregations"
+          }
+        },
+```
+
+## Merge Schemas
+Creates a union of all schema types into stitched schema
+
+Introspection response object
+```json
+{
+	"name": "analysis__analysis_version",
+	"type": {
+		"name": "NumericAggregations"
+	}
+},
+```
+
+GQL schema type:
+```gql
+{
+	analysis__analysis_version: NumericAggregations
+}
+```
+
+Process:
+1. Create stitched schema `stitched` from local schema aggregation fields
+3. For each *n* remote schema aggregations:
+	1. iterate fields
+		1. If `name` AND `type.name` exist in `stitched` - return
+		2. else - add `[name]: [type.name]`  to `stitched` 
+		   
+Example:
+
+## Generate Resolvers
+
+
+
+but it's startup time so we have lots of time dont pre optimise
+
+name + type + version

From 0dec18b3be68265dd0a74b42e355cc027da85ae6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ciar=C3=A1n=20Sch=C3=BCtte?= <ciaran.schutte@oicr.on.ca>
Date: Fri, 31 May 2024 03:40:40 -0400
Subject: [PATCH 2/2] Update search.md

---
 design/search.md | 197 +++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 175 insertions(+), 22 deletions(-)

diff --git a/design/search.md b/design/search.md
index c8491b23f..fc79d8595 100644
--- a/design/search.md
+++ b/design/search.md
@@ -1,12 +1,7 @@
 
 
-
-
-![[VPCzJyCm48Pt_ufJftP0x1bGAuIgIhG22XDYEE9hQicnW-qW-FVu8qT2HTlfSkzpxtrONVg0BlIj5bW7ww3yNZnnY3v_YIvYgbOTcW2pbNDe6d8LR56PMOHoS0wwjUQWceoLy1ou9tJrCOCbl0oEninVjjzPIJ1trDf0YrILCqA8lE_LJLu2edkw2N1Tr7C-wiM-WYVwwCa7gFCt79GcKRI_5Ce1yV2h3stuAcpEsOrH (1).svg]]
-
-
-
-# Main
+# Federated Search
+![VPCzJyCm48Pt_ufJftP0x1bGAuIgIhG22XDYEE9hQicnW-qW-FVu8qT2HTlfSkzpxtrONVg0BlIj5bW7ww3yNZnnY3v_YIvYgbOTcW2pbNDe6d8LR56PMOHoS0wwjUQWceoLy1ou9tJrCOCbl0oEninVjjzPIJ1trDf0YrILCqA8lE_LJLu2edkw2N1Tr7C-wiM-WYVwwCa7gFCt79GcKRI_5Ce1yV2h3stuAcpEsOrH (1)](https://github.com/overture-stack/arranger/assets/1486054/87bb0689-38c6-49eb-ab65-96696a04d86a)
 
 ## Prerequisites
 - Local data must be configured
@@ -15,12 +10,24 @@
   
 ## Config
 - Read "search config" from file
-- Config must contain the following for reach node:
+- Export object containing node data for use in gql endpoint
+
+Example Config provided:
 
-| Name          | Type   | Note                                                                                                                            |
+| Field         | Type   | Note                                                                                                                            |
 | ------------- | ------ | ------------------------------------------------------------------------------------------------------------------------------- |
 | endpoint      | string | Full url to gql endpoint                                                                                                        |
 | arrangerField | string | Endpoints can have other data on the root object. Need to get specific Arranger config object. Appears to be "file" by default. |
+
+Example Search Node data:
+
+| Field  | Description                 |
+| ------ | --------------------------- |
+| url    | gql endpoint                |
+| name   | name of node eg. Toronto    |
+| schema | version of Arranger running |
+| status | node status eg. "connected" |
+
 ## Request Nodes
 Outline:
 - GQL HTTP calls to endpoints
@@ -35,6 +42,7 @@ Sample query:
 ```graphql
 {
   # This is the root arranger type
+  # "file" is the Arranger field
   RootType: __type(name:"file"){
     name
     fields {
@@ -45,7 +53,7 @@ Sample query:
     }
   }
   # This is the data we are interested in, just aggregations
-  # input: typename from above query
+  # input: typename "fileAggregations" retrieved from previous query
   Aggregations: __type(name:"fileAggregations"){
     name
     fields {
@@ -59,7 +67,7 @@ Sample query:
 ```
 
 Sample response:
-```json
+```js
 {
   "data": {
     "RootType": {
@@ -110,22 +118,22 @@ Sample response:
 ```
 
 ## Merge Schemas
-Creates a union of all schema types into stitched schema
+Creates a union of all schema types in a single stitched schema, including a per-search-node bucket breakdown. A configuration method should be provided so that admins can filter out or remap fields to their liking.
 
 Introspection response object
 ```json
 {
-	"name": "analysis__analysis_version",
-	"type": {
-		"name": "NumericAggregations"
-	}
-},
+  "name": "analysis__analysis_version",
+  "type": {
+    "name": "NumericAggregations"
+  }
+}
 ```
 
 GQL schema type:
 ```gql
 {
-	analysis__analysis_version: NumericAggregations
+  analysis__analysis_version: NumericAggregations
 }
 ```
 
@@ -133,15 +141,160 @@ Process:
 1. Create stitched schema `stitched` from local schema aggregation fields
 3. For each *n* remote schema aggregations:
 	1. iterate fields
-		1. If `name` AND `type.name` exist in `stitched` - return
+		1. If the `name` AND `type.name` pair already exists in `stitched` - skip the field
 		2. else - add `[name]: [type.name]`  to `stitched` 
 		   
 Example:
 
-## Generate Resolvers
+Inputs - GQL responses from introspection query
+Node A:
+```js
+// GQL resp
+{
+  "name": "fileAggregations",
+  "fields": [
+    {
+      "name": "donors__gender", // field name to merge
+      "type": {
+        "name": "Aggregations" // type
+      }
+    }
+  ]
+}
 
+```
+Node B:
+```js
+{
+  "name": "fileAggregations",
+  "fields": [
+    {
+      "name": "donors__gender", // field name to merge
+      "type": {
+        "name": "Aggregations" // type
+      }
+    },
+    {
+      "name": "donors__age", 
+      "type": {
+        "name": "NumericAggregations" 
+      }
+    }
+  ]
+}
+```
 
+Output - Stitched schema
+```graphql
+schema {
+  query {
+    network {
+      aggregations {
+        donors__gender {
+          search_node_agg: Aggregations
+          agg: Aggregations
+        }
+        donors__age {
+          search_node_agg: NumericAggregations
+          agg: NumericAggregations
+        }
+      }
+    }
+  }
+}
+
+```
+
+Sample responses for a query to the stitched schema:
+
+```js
+// Node A response
+{
+  donor: {
+    aggregations: {
+      gender: {
+        __typename: "Aggregations",
+        buckets: [
+          {
+            key: "Male",
+            bucket_count: 123,
+          },
+          {
+            key: "Female",
+            bucket_count: 456,
+          },
+        ],
+      },
+    },
+  },
+};
+
+// Node B response
+{
+  donor: {
+    aggregations: {
+      gender: {
+        __typename: "Aggregations",
+        buckets: [
+          {
+            key: "Male",
+            bucket_count: 789,
+          },
+          {
+            key: "Female",
+            bucket_count: 234,
+          },
+        ],
+      },
+    },
+  },
+};
+
+// Full response
+{
+  donor: {
+    network: {
+      aggregations: {
+        gender: {
+          search_node_agg: {
+            buckets: [
+              {
+                key: "Node A",
+                bucket_count: 579, // male + female
+              },
+              {
+                key: "Node B",
+                bucket_count: 1023, // male + female
+              },
+            ],
+          },
+          agg: {
+            buckets: [
+              {
+                key: "Male",
+                bucket_count: 912, // Node A male + Node B male
+              },
+              {
+                key: "Female",
+                bucket_count: 690, // Node A female + Node B female
+              },
+            ],
+          },
+        },
+      },
+    },
+  },
+};
+
+
+```
+## Generate Resolvers
+![Untitled-2024-01-24-1548](https://github.com/overture-stack/arranger/assets/1486054/09ff0512-da31-4c51-b2a3-e7a161843edf)
 
-but it's startup time so we have lots of time dont pre optimise
+New resolvers are needed to aggregate the aggregates for all available aggregation types.
+ref: https://github.com/overture-stack/arranger/blob/develop/modules/server/src/schema/Aggregations.js
 
-name + type + version
+- Individual search node data is queried (http) 
+- Apply data transforms based on the `__typename` field eg. `NumericAggregations`
+- Add additional data eg. `search_node` breakdown of aggregate
+- Return fully resolved request