Upgrade Algolia search to v3 #3560

Merged · 12 commits · Jun 17, 2023
118 changes: 118 additions & 0 deletions .github/scripts/algolia.py
@@ -0,0 +1,118 @@
import os
import sys
import json

from bs4 import BeautifulSoup
from algoliasearch.search_client import SearchClient

url = "docs.dapr.io"
if len(sys.argv) > 1:
    starting_directory = os.path.join(os.getcwd(), str(sys.argv[1]))
else:
    starting_directory = os.getcwd()

ALGOLIA_APP_ID = os.getenv('ALGOLIA_APP_ID')
ALGOLIA_API_KEY = os.getenv('ALGOLIA_API_WRITE_KEY')
ALGOLIA_INDEX_NAME = os.getenv('ALGOLIA_INDEX_NAME')

client = SearchClient.create(ALGOLIA_APP_ID, ALGOLIA_API_KEY)
index = client.init_index(ALGOLIA_INDEX_NAME)

excluded_files = [
    "404.html",
]

excluded_directories = [
    "zh-hans",
]

rankings = {
    "Getting started": 0,
    "Concepts": 100,
    "Developing applications": 200,
    "Operations": 300,
    "Reference": 400,
    "Contributing": 500,
    "Home": 600
}
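
# A page's final rank is its section's base value above plus its
# breadcrumb depth ("subrank", computed in parse_file below), so e.g.
# a page three breadcrumbs deep under "Concepts" ends up with rank 103.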

def scan_directory(directory: str, pages: list):
    """Recursively collect the indexable .html files under directory."""
    if os.path.basename(directory) in excluded_directories:
        print(f'Skipping directory: {directory}')
        return
    for file in os.listdir(directory):
        path = os.path.join(directory, file)
        if os.path.isfile(path):
            if file.endswith(".html") and file not in excluded_files:
                with open(path, encoding="utf8") as f:
                    content = f.read()
                if '<!-- DISABLE_ALGOLIA -->' not in content:
                    print(f'Indexing: {path}')
                    pages.append(path)
                else:
                    print(f'Skipping hidden page: {path}')
        else:
            scan_directory(path, pages)

def parse_file(path: str):
    """Build one Algolia record from a rendered HTML page."""
    data = {}
    data["hierarchy"] = {}
    data["rank"] = 999
    data["subrank"] = 99
    data["type"] = "lvl2"
    data["lvl0"] = ""
    data["lvl1"] = ""
    data["lvl2"] = ""
    data["lvl3"] = ""
    text = ""
    subrank = 0
    with open(path, "r", errors='ignore') as file:
        content = file.read()
    soup = BeautifulSoup(content, "html.parser")
    # Title, description, and URL come from the meta tags Hugo renders.
    for meta in soup.find_all("meta"):
        if meta.get("name") == "description":
            data["lvl2"] = meta.get("content")
            data["hierarchy"]["lvl1"] = meta.get("content")
        elif meta.get("property") == "og:title":
            data["lvl0"] = meta.get("content")
            data["hierarchy"]["lvl0"] = meta.get("content")
            data["hierarchy"]["lvl2"] = meta.get("content")
        elif meta.get("property") == "og:url":
            data["url"] = meta.get("content")
            data["path"] = meta.get("content").split(url)[1]
            data["objectID"] = meta.get("content").split(url)[1]
    # The breadcrumb depth becomes the subrank; the first breadcrumb
    # names the top-level section used to look up the base rank.
    breadcrumbs = soup.find_all("li", class_="breadcrumb-item")
    subrank = len(breadcrumbs)
    data["subrank"] = subrank
    for bc in breadcrumbs:
        section = bc.text.strip()
        data["lvl1"] = section
        data["hierarchy"]["lvl0"] = section
        try:
            data["rank"] = rankings[section] + subrank
        except KeyError:
            print(f"Rank not found for section {section}")
            data["rank"] = 998
        break
    # Concatenate all paragraph text as the full-text search body.
    for p in soup.find_all("p"):
        if p.text != "":
            text = text + p.text
    data["text"] = text
    return data

def index_payload(payload):
    """Replace the index contents with the new records and wait for the task."""
    res = index.replace_all_objects(payload)
    res.wait()


if __name__ == "__main__":
    pages = []
    payload = []
    scan_directory(starting_directory, pages)
    for page in pages:
        data = parse_file(page)
        # Only pages that exposed an og:url (and thus an objectID) get indexed.
        if "objectID" in data:
            payload.append(data)
    index_payload(payload)
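
For reference, each record the script pushes to Algolia looks roughly like the sketch below. The values are hypothetical, but the keys are exactly the ones parse_file builds:

record = {
    "objectID": "/getting-started/",                 # path portion of og:url
    "url": "https://docs.dapr.io/getting-started/",  # og:url
    "path": "/getting-started/",
    "type": "lvl2",
    "lvl0": "Getting started with Dapr",             # og:title
    "lvl1": "Getting started",                       # first breadcrumb section
    "lvl2": "How to get up and running with Dapr",   # meta description
    "lvl3": "",
    "hierarchy": {
        "lvl0": "Getting started",                   # overwritten with the section name
        "lvl1": "How to get up and running with Dapr",
        "lvl2": "Getting started with Dapr",
    },
    "subrank": 1,   # number of breadcrumb items on the page
    "rank": 1,      # rankings["Getting started"] + subrank
    "text": "...all paragraph text from the page, concatenated...",
}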
26 changes: 26 additions & 0 deletions .github/workflows/website-root.yml
@@ -79,3 +79,29 @@ jobs:
        with:
          azure_static_web_apps_api_token: ${{ secrets.AZURE_STATIC_WEB_APPS_API_TOKEN_PROUD_BAY_0E9E0E81E }}
          action: "close"

  algolia_index:
    name: Index site for Algolia
    if: github.event_name == 'push'
    needs: ['build_and_deploy_job']
    runs-on: ubuntu-latest
    env:
      ALGOLIA_APP_ID: ${{ secrets.ALGOLIA_APP_ID }}
      ALGOLIA_API_WRITE_KEY: ${{ secrets.ALGOLIA_API_WRITE_KEY }}
      ALGOLIA_INDEX_NAME: daprdocs
    steps:
      - name: Checkout docs repo
        uses: actions/checkout@v2
        with:
          submodules: false
      - name: Download Hugo artifacts
        uses: actions/download-artifact@v3
        with:
          name: hugo_build
          path: site/
      - name: Install Python packages
        run: |
          pip install --upgrade bs4
          pip install --upgrade 'algoliasearch>=2.0,<3.0'
      - name: Index site
        run: python ./.github/scripts/algolia.py ./site
20 changes: 7 additions & 13 deletions daprdocs/layouts/partials/hooks/body-end.html
@@ -1,19 +1,13 @@
-<script src="/js/copy-code-button.js"></script>
-
 {{ with .Site.Params.algolia_docsearch }}
-<script src="https://cdn.jsdelivr.net/npm/docsearch.js@2.6.3/dist/cdn/docsearch.min.js"></script>
-<script>
+<script src="https://cdn.jsdelivr.net/npm/@docsearch/js@3"></script>
+<script type="text/javascript">
   docsearch({
-    // Your apiKey and indexName will be given to you once
-    // we create your config
-    apiKey: '54ae43aa28ce8f00c54c8d5f544d29b9',
-    indexName: 'crawler_dapr',
+    container: '#docsearch',
+    appId: 'O0QLQGNF38',
-    // Replace inputSelector with a CSS selector
-    // matching your search input
-    inputSelector: '.td-search-input',
-    // Set debug to true to inspect the dropdown
-    debug: false,
+    apiKey: '54ae43aa28ce8f00c54c8d5f544d29b9',
+    indexName: 'daprdocs',
   });
 </script>
 {{ end }}
+
+<script src="/js/copy-code-button.js"></script>
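
The shape of this change follows DocSearch's v2-to-v3 migration: v3 renders its own search button and modal into the element named by the "container" option, so the v2 "inputSelector" option (which attached a dropdown to an existing text input) goes away, and the new search-input.html partial below supplies the <div id="docsearch"></div> mount point in place of the old .td-search-input field.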
2 changes: 1 addition & 1 deletion daprdocs/layouts/partials/hooks/head-end.html
@@ -1,3 +1,3 @@
 {{ with .Site.Params.algolia_docsearch }}
-<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/docsearch.js@2/dist/cdn/docsearch.min.css" />
+<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/@docsearch/css@3" />
 {{ end }}
30 changes: 30 additions & 0 deletions daprdocs/layouts/partials/search-input.html
@@ -0,0 +1,30 @@
{{ if .Site.Params.gcs_engine_id -}}
  <input type="search" class="form-control td-search-input" placeholder="&#xf002; {{ T "ui_search" }}" aria-label="{{ T "ui_search" }}" autocomplete="off">
{{ else if .Site.Params.algolia_docsearch -}}
  <div id="docsearch"></div>
{{ else if .Site.Params.offlineSearch -}}
  {{ $offlineSearchIndex := resources.Get "json/offline-search-index.json" | resources.ExecuteAsTemplate "offline-search-index.json" . -}}
  {{ if hugo.IsProduction -}}
    {{/* Use `md5` as the fingerprint hash function to shorten the file name and avoid `file name too long` errors. */ -}}
    {{ $offlineSearchIndex = $offlineSearchIndex | fingerprint "md5" -}}
  {{ end -}}
  {{ $offlineSearchLink := $offlineSearchIndex.RelPermalink -}}

  <input
    type="search"
    class="form-control td-search-input"
    placeholder="&#xf002; {{ T "ui_search" }}"
    aria-label="{{ T "ui_search" }}"
    autocomplete="off"
    {{/*
      The data attribute name of the JSON file URL must end with `src`, since
      Hugo's absurlreplacer requires a `src`, `href`, `action`, or `srcset` suffix
      on the attribute name. If the absurlreplacer is not applied, the URL will
      start with `/`, which breaks loading of the JSON file when relativeURLs
      is enabled. https://github.com/google/docsy/issues/181
    */}}
    data-offline-search-index-json-src="{{ $offlineSearchLink }}"
    data-offline-search-base-href="/"
    data-offline-search-max-results="{{ .Site.Params.offlineSearchMaxResults | default 10 }}"
  >
{{ end -}}