From 0d0d29ac92cfba8bb1dcc5e526e6f39fc88354a3 Mon Sep 17 00:00:00 2001
From: Aaron Crawfis
Date: Fri, 16 Jun 2023 21:18:24 -0700
Subject: [PATCH] Upgrade Algolia search to v3 (#3560)

* update dapr publish command

Signed-off-by: Hannah Hunter
Signed-off-by: Aaron Crawfis

* Split workflow into two steps

Signed-off-by: Aaron Crawfis

* Update upload path

Signed-off-by: Aaron Crawfis

* Add concurrency check

Signed-off-by: Aaron Crawfis

* Add Algolia workflow script and step

Signed-off-by: Aaron Crawfis

* Update Algolia box to v3

Signed-off-by: Aaron Crawfis

* Fix secret name

Signed-off-by: Aaron Crawfis

* Override default search bar in Docsy v3

Signed-off-by: Aaron Crawfis

* Remove temporary comment

Signed-off-by: Aaron Crawfis

* Consolidate build and deploy

Signed-off-by: Aaron Crawfis

---------

Signed-off-by: Hannah Hunter
Signed-off-by: Aaron Crawfis
Co-authored-by: Hannah Hunter
Co-authored-by: Mark Fussell
---
 .github/scripts/algolia.py                    | 165 ++++++++++++++++++
 .github/workflows/website-root.yml            |  26 ++++
 daprdocs/layouts/partials/hooks/body-end.html |  20 ++-
 daprdocs/layouts/partials/hooks/head-end.html |   2 +-
 daprdocs/layouts/partials/search-input.html   |  30 +++++
 5 files changed, 229 insertions(+), 14 deletions(-)
 create mode 100644 .github/scripts/algolia.py
 create mode 100644 daprdocs/layouts/partials/search-input.html

diff --git a/.github/scripts/algolia.py b/.github/scripts/algolia.py
new file mode 100644
index 00000000000..5071ea58006
--- /dev/null
+++ b/.github/scripts/algolia.py
@@ -0,0 +1,165 @@
+"""Build the Algolia search index for the rendered docs site.
+
+Walks a directory of rendered HTML pages, extracts one search record per
+page with BeautifulSoup, and replaces the contents of the Algolia index
+with those records.
+
+Usage: python algolia.py [site_directory]
+
+site_directory is resolved against the current working directory, which is
+also the default. The Algolia credentials and index name are read from the
+ALGOLIA_APP_ID, ALGOLIA_API_WRITE_KEY and ALGOLIA_INDEX_NAME environment
+variables.
+"""
+import os
+from re import S
+import sys
+import json
+from bs4 import BeautifulSoup
+from algoliasearch.search_client import SearchClient
+
+# Host name used to split a page's og:url into host and site-relative path.
+url = "docs.dapr.io"
+if len(sys.argv) > 1:
+    starting_directory = os.path.join(os.getcwd(), str(sys.argv[1]))
+else:
+    starting_directory = os.getcwd()
+
+ALGOLIA_APP_ID = os.getenv('ALGOLIA_APP_ID')
+ALGOLIA_API_KEY = os.getenv('ALGOLIA_API_WRITE_KEY')
+ALGOLIA_INDEX_NAME = os.getenv('ALGOLIA_INDEX_NAME')
+
+client = SearchClient.create(ALGOLIA_APP_ID, ALGOLIA_API_KEY)
+index = client.init_index(ALGOLIA_INDEX_NAME)
+
+excluded_files = [
+    "404.html",
+]
+
+excluded_directories = [
+    "zh-hans",
+]
+
+# Pages whose HTML contains this marker are left out of the index.
+# NOTE(review): this literal arrived as an empty string, which makes every
+# page count as hidden; an HTML comment was presumably stripped on the way.
+# "<!-- DISABLE_ALGOLIA -->" is assumed here -- confirm against the templates.
+hidden_page_marker = "<!-- DISABLE_ALGOLIA -->"
+
+# Base rank of each top-level docs section, keyed by its breadcrumb text.
+rankings = {
+    "Getting started": 0,
+    "Concepts": 100,
+    "Developing applications": 200,
+    "Operations": 300,
+    "Reference": 400,
+    "Contributing": 500,
+    "Home": 600
+}
+
+def scan_directory(directory: str, pages: list):
+    """Recursively collect the HTML pages under a directory for indexing.
+
+    Args:
+        directory: Directory to walk.
+        pages: List that receives the path of every page to index.
+
+    Directories named in excluded_directories, files named in
+    excluded_files and pages containing hidden_page_marker are skipped.
+    """
+    if os.path.basename(directory) in excluded_directories:
+        print(f'Skipping directory: {directory}')
+        return
+    for file in os.listdir(directory):
+        path = os.path.join(directory, file)
+        if not os.path.isfile(path):
+            scan_directory(path, pages)
+            continue
+        if not file.endswith(".html") or file in excluded_files:
+            continue
+        # Read through a context manager so the handle is closed right away;
+        # undecodable bytes are ignored, as parse_file() also does.
+        with open(path, encoding="utf8", errors="ignore") as page:
+            hidden = hidden_page_marker in page.read()
+        if hidden:
+            print(f'Skipping hidden page: {path}')
+        else:
+            print(f'Indexing: {path}')
+            pages.append(path)
+
+def parse_file(path: str):
+    """Build the Algolia record for one rendered HTML page.
+
+    Args:
+        path: Path of the HTML file to parse.
+
+    Returns:
+        A dict of record fields. "path" and "objectID" are only set when the
+        page has an og:url meta tag whose value contains url; the caller
+        drops records that have no "objectID".
+    """
+    data = {}
+    data["hierarchy"] = {}
+    data["rank"] = 999
+    data["subrank"] = 99
+    data["type"] = "lvl2"
+    data["lvl0"] = ""
+    data["lvl1"] = ""
+    data["lvl2"] = ""
+    data["lvl3"] = ""
+    # Decode as UTF-8 explicitly instead of relying on the platform default.
+    with open(path, "r", encoding="utf8", errors='ignore') as file:
+        soup = BeautifulSoup(file.read(), "html.parser")
+    for meta in soup.find_all("meta"):
+        content = meta.get("content")
+        if meta.get("name") == "description":
+            data["lvl2"] = content
+            data["hierarchy"]["lvl1"] = content
+        elif meta.get("property") == "og:title":
+            data["lvl0"] = content
+            data["hierarchy"]["lvl0"] = content
+            data["hierarchy"]["lvl2"] = content
+        elif meta.get("property") == "og:url":
+            data["url"] = content
+            # Whatever follows the host name is the site-relative path, which
+            # doubles as the objectID. A missing or foreign URL leaves both
+            # unset instead of raising.
+            _, host, page_path = (content or "").partition(url)
+            if host:
+                data["path"] = page_path
+                data["objectID"] = page_path
+    breadcrumbs = soup.find_all("li", class_="breadcrumb-item")
+    subrank = len(breadcrumbs)
+    data["subrank"] = subrank
+    if breadcrumbs:
+        # Only the first breadcrumb (the top-level section) is used.
+        section = breadcrumbs[0].text.strip()
+        data["lvl1"] = section
+        data["hierarchy"]["lvl0"] = section
+        try:
+            data["rank"] = rankings[section] + subrank
+        except KeyError:
+            print(f"Rank not found for section {section}")
+            data["rank"] = 998
+    # Paragraph texts are concatenated without a separator.
+    data["text"] = "".join(p.text for p in soup.find_all("p"))
+    return data
+
+def index_payload(payload):
+    """Replace every record in the Algolia index with payload and wait.
+
+    Args:
+        payload: List of record dicts, each carrying an "objectID".
+    """
+    res = index.replace_all_objects(payload)
+    res.wait()
+
+
+if __name__ == "__main__":
+    pages = []
+    scan_directory(starting_directory, pages)
+    payload = [data for data in map(parse_file, pages) if "objectID" in data]
+    if not payload:
+        # Replacing all objects with an empty list would wipe the live index.
+        sys.exit(f"No indexable pages under {starting_directory}; index left unchanged")
+    index_payload(payload)
diff --git a/.github/workflows/website-root.yml b/.github/workflows/website-root.yml
index 98d2aec1199..ed8c3b491f1 100644
--- a/.github/workflows/website-root.yml
+++ b/.github/workflows/website-root.yml
@@ -79,3 +79,29 @@ jobs:
         with:
           azure_static_web_apps_api_token: ${{ secrets.AZURE_STATIC_WEB_APPS_API_TOKEN_PROUD_BAY_0E9E0E81E }}
           action: "close"
+
+  algolia_index:
+    name: Index site for Algolia
+    if: github.event_name == 'push'
+    needs: ['build_and_deploy_job']
+    runs-on: ubuntu-latest
+    env:
+      ALGOLIA_APP_ID: ${{ secrets.ALGOLIA_APP_ID }}
+      ALGOLIA_API_WRITE_KEY: ${{ secrets.ALGOLIA_API_WRITE_KEY }}
+      ALGOLIA_INDEX_NAME: daprdocs
+    steps:
+      - name: Checkout docs repo
+        uses: actions/checkout@v2
+        with:
+          submodules: false
+      - name: Download Hugo artifacts
+        uses: actions/download-artifact@v3
+        with:
+          name: hugo_build
+          path: site/
+      - name: Install Python packages
+        run: |
+          pip install --upgrade bs4
+          pip install --upgrade 'algoliasearch>=2.0,<3.0'
+      - name: Index site
+        run: python ./.github/scripts/algolia.py ./site
diff --git a/daprdocs/layouts/partials/hooks/body-end.html b/daprdocs/layouts/partials/hooks/body-end.html
index 695cf863809..79cbc117cd9 100644
--- a/daprdocs/layouts/partials/hooks/body-end.html
+++ b/daprdocs/layouts/partials/hooks/body-end.html
@@ -1,19 +1,13 @@ + + {{ with .Site.Params.algolia_docsearch }} - - + {{ end }} - - \ No newline at
end of file diff --git a/daprdocs/layouts/partials/hooks/head-end.html b/daprdocs/layouts/partials/hooks/head-end.html index 804fe38e9ec..03e91efa215 100644 --- a/daprdocs/layouts/partials/hooks/head-end.html +++ b/daprdocs/layouts/partials/hooks/head-end.html @@ -1,3 +1,3 @@ {{ with .Site.Params.algolia_docsearch }} - + {{ end }} \ No newline at end of file diff --git a/daprdocs/layouts/partials/search-input.html b/daprdocs/layouts/partials/search-input.html new file mode 100644 index 00000000000..22e90024773 --- /dev/null +++ b/daprdocs/layouts/partials/search-input.html @@ -0,0 +1,30 @@ +{{ if .Site.Params.gcs_engine_id -}} + +{{ else if .Site.Params.algolia_docsearch -}} +
+{{ else if .Site.Params.offlineSearch -}} +{{ $offlineSearchIndex := resources.Get "json/offline-search-index.json" | resources.ExecuteAsTemplate "offline-search-index.json" . -}} +{{ if hugo.IsProduction -}} +{{/* Use `md5` as finger print hash function to shorten file name to avoid `file name too long` error. */ -}} +{{ $offlineSearchIndex = $offlineSearchIndex | fingerprint "md5" -}} +{{ end -}} +{{ $offlineSearchLink := $offlineSearchIndex.RelPermalink -}} + + +{{ end -}}