Upgrade Algolia search to v3 #3560

Merged · 12 commits · Jun 17, 2023
118 changes: 118 additions & 0 deletions .github/scripts/algolia.py
@@ -0,0 +1,118 @@
import os
import sys
import json

from bs4 import BeautifulSoup
from algoliasearch.search_client import SearchClient

url = "docs.dapr.io"
if len(sys.argv) > 1:
    starting_directory = os.path.join(os.getcwd(), str(sys.argv[1]))
else:
    starting_directory = os.getcwd()

ALGOLIA_APP_ID = os.getenv('ALGOLIA_APP_ID')
ALGOLIA_API_KEY = os.getenv('ALGOLIA_API_WRITE_KEY')
ALGOLIA_INDEX_NAME = os.getenv('ALGOLIA_INDEX_NAME')

client = SearchClient.create(ALGOLIA_APP_ID, ALGOLIA_API_KEY)
index = client.init_index(ALGOLIA_INDEX_NAME)

excluded_files = [
    "404.html",
]

excluded_directories = [
    "zh-hans",
]

rankings = {
    "Getting started": 0,
    "Concepts": 100,
    "Developing applications": 200,
    "Operations": 300,
    "Reference": 400,
    "Contributing": 500,
    "Home": 600
}
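
# A page's final rank is its section's base value above plus its
# breadcrumb depth ("subrank", computed in parse_file below), so e.g.
# a page three breadcrumbs deep under "Concepts" ends up with rank 103.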

def scan_directory(directory: str, pages: list):
    """Recursively collect the indexable .html files under directory."""
    if os.path.basename(directory) in excluded_directories:
        print(f'Skipping directory: {directory}')
        return
    for file in os.listdir(directory):
        path = os.path.join(directory, file)
        if os.path.isfile(path):
            if file.endswith(".html") and file not in excluded_files:
                with open(path, encoding="utf8") as f:
                    content = f.read()
                if '<!-- DISABLE_ALGOLIA -->' not in content:
                    print(f'Indexing: {path}')
                    pages.append(path)
                else:
                    print(f'Skipping hidden page: {path}')
        else:
            scan_directory(path, pages)

def parse_file(path: str):
    """Build one Algolia record from a rendered HTML page."""
    data = {}
    data["hierarchy"] = {}
    data["rank"] = 999
    data["subrank"] = 99
    data["type"] = "lvl2"
    data["lvl0"] = ""
    data["lvl1"] = ""
    data["lvl2"] = ""
    data["lvl3"] = ""
    text = ""
    subrank = 0
    with open(path, "r", errors='ignore') as file:
        content = file.read()
    soup = BeautifulSoup(content, "html.parser")
    # Title, description, and URL come from the meta tags Hugo renders.
    for meta in soup.find_all("meta"):
        if meta.get("name") == "description":
            data["lvl2"] = meta.get("content")
            data["hierarchy"]["lvl1"] = meta.get("content")
        elif meta.get("property") == "og:title":
            data["lvl0"] = meta.get("content")
            data["hierarchy"]["lvl0"] = meta.get("content")
            data["hierarchy"]["lvl2"] = meta.get("content")
        elif meta.get("property") == "og:url":
            data["url"] = meta.get("content")
            data["path"] = meta.get("content").split(url)[1]
            data["objectID"] = meta.get("content").split(url)[1]
    # The breadcrumb depth becomes the subrank; the first breadcrumb
    # names the top-level section used to look up the base rank.
    breadcrumbs = soup.find_all("li", class_="breadcrumb-item")
    subrank = len(breadcrumbs)
    data["subrank"] = subrank
    for bc in breadcrumbs:
        section = bc.text.strip()
        data["lvl1"] = section
        data["hierarchy"]["lvl0"] = section
        try:
            data["rank"] = rankings[section] + subrank
        except KeyError:
            print(f"Rank not found for section {section}")
            data["rank"] = 998
        break
    # Concatenate all paragraph text as the full-text search body.
    for p in soup.find_all("p"):
        if p.text != "":
            text = text + p.text
    data["text"] = text
    return data

def index_payload(payload):
    """Replace the index contents with the new records and wait for the task."""
    res = index.replace_all_objects(payload)
    res.wait()


if __name__ == "__main__":
    pages = []
    payload = []
    scan_directory(starting_directory, pages)
    for page in pages:
        data = parse_file(page)
        # Only pages that exposed an og:url (and thus an objectID) get indexed.
        if "objectID" in data:
            payload.append(data)
    index_payload(payload)
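
For reference, each record the script pushes to Algolia looks roughly like the sketch below. The values are hypothetical, but the keys are exactly the ones parse_file builds:

record = {
    "objectID": "/getting-started/",                 # path portion of og:url
    "url": "https://docs.dapr.io/getting-started/",  # og:url
    "path": "/getting-started/",
    "type": "lvl2",
    "lvl0": "Getting started with Dapr",             # og:title
    "lvl1": "Getting started",                       # first breadcrumb section
    "lvl2": "How to get up and running with Dapr",   # meta description
    "lvl3": "",
    "hierarchy": {
        "lvl0": "Getting started",                   # overwritten with the section name
        "lvl1": "How to get up and running with Dapr",
        "lvl2": "Getting started with Dapr",
    },
    "subrank": 1,   # number of breadcrumb items on the page
    "rank": 1,      # rankings["Getting started"] + subrank
    "text": "...all paragraph text from the page, concatenated...",
}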
26 changes: 26 additions & 0 deletions .github/workflows/website-root.yml
@@ -79,3 +79,29 @@ jobs:
        with:
          azure_static_web_apps_api_token: ${{ secrets.AZURE_STATIC_WEB_APPS_API_TOKEN_PROUD_BAY_0E9E0E81E }}
          action: "close"

  algolia_index:
    name: Index site for Algolia
    if: github.event_name == 'push'
    needs: ['build_and_deploy_job']
    runs-on: ubuntu-latest
    env:
      ALGOLIA_APP_ID: ${{ secrets.ALGOLIA_APP_ID }}
      ALGOLIA_API_WRITE_KEY: ${{ secrets.ALGOLIA_API_WRITE_KEY }}
      ALGOLIA_INDEX_NAME: daprdocs
    steps:
      - name: Checkout docs repo
        uses: actions/checkout@v2
        with:
          submodules: false
      - name: Download Hugo artifacts
        uses: actions/download-artifact@v3
        with:
          name: hugo_build
          path: site/
      - name: Install Python packages
        run: |
          pip install --upgrade bs4
          pip install --upgrade 'algoliasearch>=2.0,<3.0'
      - name: Index site
        run: python ./.github/scripts/algolia.py ./site
20 changes: 7 additions & 13 deletions daprdocs/layouts/partials/hooks/body-end.html
@@ -1,19 +1,13 @@
-<script src="/js/copy-code-button.js"></script>
-
 {{ with .Site.Params.algolia_docsearch }}
-<script src="https://cdn.jsdelivr.net/npm/docsearch.js@2.6.3/dist/cdn/docsearch.min.js"></script>
-<script>
+<script src="https://cdn.jsdelivr.net/npm/@docsearch/js@3"></script>
+<script type="text/javascript">
   docsearch({
-    // Your apiKey and indexName will be given to you once
-    // we create your config
-    apiKey: '54ae43aa28ce8f00c54c8d5f544d29b9',
-    indexName: 'crawler_dapr',
+    container: '#docsearch',
+    appId: 'O0QLQGNF38',
-    // Replace inputSelector with a CSS selector
-    // matching your search input
-    inputSelector: '.td-search-input',
-    // Set debug to true to inspect the dropdown
-    debug: false,
+    apiKey: '54ae43aa28ce8f00c54c8d5f544d29b9',
+    indexName: 'daprdocs',
   });
 </script>
 {{ end }}
+
+<script src="/js/copy-code-button.js"></script>
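
The shape of this change follows DocSearch's v2-to-v3 migration: v3 renders its own search button and modal into the element named by the "container" option, so the v2 "inputSelector" option (which attached a dropdown to an existing text input) goes away, and the new search-input.html partial below supplies the <div id="docsearch"></div> mount point in place of the old .td-search-input field.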
2 changes: 1 addition & 1 deletion daprdocs/layouts/partials/hooks/head-end.html
@@ -1,3 +1,3 @@
 {{ with .Site.Params.algolia_docsearch }}
-<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/docsearch.js@2/dist/cdn/docsearch.min.css" />
+<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/@docsearch/css@3" />
 {{ end }}
30 changes: 30 additions & 0 deletions daprdocs/layouts/partials/search-input.html
@@ -0,0 +1,30 @@
{{ if .Site.Params.gcs_engine_id -}}
  <input type="search" class="form-control td-search-input" placeholder="&#xf002; {{ T "ui_search" }}" aria-label="{{ T "ui_search" }}" autocomplete="off">
{{ else if .Site.Params.algolia_docsearch -}}
  <div id="docsearch"></div>
{{ else if .Site.Params.offlineSearch -}}
  {{ $offlineSearchIndex := resources.Get "json/offline-search-index.json" | resources.ExecuteAsTemplate "offline-search-index.json" . -}}
  {{ if hugo.IsProduction -}}
    {{/* Use `md5` as the fingerprint hash function to shorten the file name and avoid `file name too long` errors. */ -}}
    {{ $offlineSearchIndex = $offlineSearchIndex | fingerprint "md5" -}}
  {{ end -}}
  {{ $offlineSearchLink := $offlineSearchIndex.RelPermalink -}}

  <input
    type="search"
    class="form-control td-search-input"
    placeholder="&#xf002; {{ T "ui_search" }}"
    aria-label="{{ T "ui_search" }}"
    autocomplete="off"
    {{/*
      The data attribute name of the JSON file URL must end with `src`, since
      Hugo's absurlreplacer requires a `src`, `href`, `action`, or `srcset` suffix
      on the attribute name. If the absurlreplacer is not applied, the URL will
      start with `/`, which breaks loading of the JSON file when relativeURLs
      is enabled. https://github.com/google/docsy/issues/181
    */}}
    data-offline-search-index-json-src="{{ $offlineSearchLink }}"
    data-offline-search-base-href="/"
    data-offline-search-max-results="{{ .Site.Params.offlineSearchMaxResults | default 10 }}"
  >
{{ end -}}