Skip to content

Commit

Permalink
Merge pull request #4100 from owid/data-catalog-explorers-algolia
Browse files Browse the repository at this point in the history
🎉 Explorers in the Data Catalog
  • Loading branch information
ikesau authored Nov 8, 2024
2 parents 3d61b31 + c510e3c commit f04ea06
Show file tree
Hide file tree
Showing 25 changed files with 1,410 additions and 626 deletions.
3 changes: 2 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -297,9 +297,10 @@ update.chart-entities: itsJustJavascript
reindex: itsJustJavascript
@echo '==> Reindexing search in Algolia'
node --enable-source-maps itsJustJavascript/baker/algolia/configureAlgolia.js
node --enable-source-maps itsJustJavascript/baker/algolia/indexToAlgolia.js
node --enable-source-maps itsJustJavascript/baker/algolia/indexPagesToAlgolia.js
node --enable-source-maps itsJustJavascript/baker/algolia/indexChartsToAlgolia.js
node --enable-source-maps itsJustJavascript/baker/algolia/indexExplorerViewsToAlgolia.js
node --enable-source-maps itsJustJavascript/baker/algolia/indexExplorerViewsAndChartsToAlgolia.js

delete-algolia-index: itsJustJavascript
@echo '==> Deleting Algolia index'
Expand Down
2 changes: 1 addition & 1 deletion adminSiteServer/apiRouter.ts
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ import { denormalizeLatestCountryData } from "../baker/countryProfiles.js"
import {
indexIndividualGdocPost,
removeIndividualGdocPostFromIndex,
} from "../baker/algolia/algoliaUtils.js"
} from "../baker/algolia/utils/pages.js"
import { References } from "../adminSiteClient/ChartEditor.js"
import { DeployQueueServer } from "../baker/DeployQueueServer.js"
import { FunctionalRouter } from "./FunctionalRouter.js"
Expand Down
35 changes: 35 additions & 0 deletions baker/algolia/configureAlgolia.ts
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,41 @@ export const configureAlgolia = async () => {
],
})

const explorerViewsAndChartsIndex = client.initIndex(
getIndexName(SearchIndexName.ExplorerViewsAndCharts)
)

await explorerViewsAndChartsIndex.setSettings({
...baseSettings,
searchableAttributes: [
"unordered(title)",
"unordered(slug)",
"unordered(variantName)",
"unordered(subtitle)",
"unordered(tags)",
"unordered(availableEntities)",
],
ranking: ["typo", "words", "exact", "attribute", "custom", "proximity"],
customRanking: [
"desc(score)",
// For multiple explorer views with the same title, we want to avoid surfacing duplicates.
// So, rank a result with viewTitleIndexWithinExplorer=0 way more highly than one with 1, 2, etc.
"asc(viewTitleIndexWithinExplorer)",
"asc(titleLength)",
],
attributesToSnippet: ["subtitle:24"],
attributeForDistinct: "id",
optionalWords: ["vs"],

// The settings below essentially demote matches in the `tags`, `subtitle` and `availableEntities` fields:
// If we find a match (only) there, then it doesn't count towards `exact`, and is therefore ranked lower.
// We also disable typo tolerance on `subtitle` and `availableEntities`, and prefix matching on `subtitle`.
disableExactOnAttributes: ["tags", "subtitle", "availableEntities"],
disableTypoToleranceOnAttributes: ["subtitle", "availableEntities"],
disablePrefixOnAttributes: ["subtitle"],
attributesForFaceting: ["tags", "availableEntities"],
})

const synonyms = [
["owid", "our world in data"],
["kids", "children"],
Expand Down
227 changes: 14 additions & 213 deletions baker/algolia/indexChartsToAlgolia.ts
Original file line number Diff line number Diff line change
@@ -1,222 +1,23 @@
import * as db from "../../db/db.js"
import { ALGOLIA_INDEXING } from "../../settings/serverSettings.js"
import { getAlgoliaClient } from "./configureAlgolia.js"
import { isPathRedirectedToExplorer } from "../../explorerAdminServer/ExplorerRedirects.js"
import { ChartRecord, SearchIndexName } from "../../site/search/searchTypes.js"
import {
KeyChartLevel,
OwidGdocLinkType,
excludeNullish,
isNil,
countries,
orderBy,
removeTrailingParenthetical,
uniq,
} from "@ourworldindata/utils"
import { MarkdownTextWrap } from "@ourworldindata/components"
import { getAnalyticsPageviewsByUrlObj } from "../../db/model/Pageview.js"
import { getRelatedArticles } from "../../db/model/Post.js"
ALGOLIA_INDEXING,
BUGSNAG_NODE_API_KEY,
} from "../../settings/serverSettings.js"
import { getAlgoliaClient } from "./configureAlgolia.js"
import { SearchIndexName } from "../../site/search/searchTypes.js"
import { getIndexName } from "../../site/search/searchClient.js"
import { getPublishedLinksTo } from "../../db/model/Link.js"

/**
 * Ranking score of a chart record: every related article counts as much as
 * 500 page views from the last seven days.
 */
const computeScore = (record: Omit<ChartRecord, "score">): number => {
    const articleBonus = record.numRelatedArticles * 500
    return articleBonus + record.views_7d
}

// Names of all countries that have at least one variant name or a short name.
const countriesWithVariantNames = new Set(
    countries.flatMap((country) =>
        country.variantNames?.length || country.shortName
            ? [country.name]
            : []
    )
)

/**
 * Sorts entity names so that the ones Algolia has trouble finding come first.
 *
 * Algolia is a bit weird with synonyms: with a synonym "USA" -> "United States",
 * a search for "USA" only seems to match inside `availableEntities` when the
 * entity sits within the first 100-or-so entries of the array. The same goes
 * for entities containing a hyphen, like "low-income countries".
 * The easy solution is to put countries with variant names first, hyphenated
 * entities second, and sort everything else alphabetically.
 * - original note by @marcelgerber, 2024-03-25
 *
 * Returns an empty array when no entities are given.
 */
const processAvailableEntities = (availableEntities: string[] | null) => {
    if (!availableEntities) return []

    const hasVariantName = (entityName: string) =>
        countriesWithVariantNames.has(removeTrailingParenthetical(entityName))
    const containsHyphen = (entityName: string) => entityName.includes("-")
    const byName = (entityName: string) => entityName

    return orderBy(
        availableEntities,
        [hasVariantName, containsHyphen, byName],
        ["desc", "desc", "asc"]
    )
}

/**
 * A chart row as selected by the indexing query. `entityNames`, `tags` and
 * `keyChartForTags` are still JSON-encoded arrays at this point.
 */
interface RawChartRecordRow {
    id: number
    slug: string
    title: string
    variantName: string
    subtitle: string
    numDimensions: string
    publishedAt: string
    updatedAt: string
    entityNames: string
    tags: string
    keyChartForTags: string
}

/**
 * The same row after its JSON-encoded array columns have been parsed into
 * string arrays; all other columns are carried over unchanged.
 */
interface ParsedChartRecordRow
    extends Omit<
        RawChartRecordRow,
        "entityNames" | "tags" | "keyChartForTags"
    > {
    entityNames: string[]
    tags: string[]
    keyChartForTags: string[]
}

/**
 * Turns a raw database row into a parsed row by decoding its JSON-encoded
 * array columns.
 *
 * - `entityNames`: nullish entries are dropped and the result is ordered via
 *   `processAvailableEntities`. If the serialized value is 12000 characters or
 *   longer, the entities are skipped altogether (see below).
 * - `tags`: parsed as-is.
 * - `keyChartForTags`: parsed, then falsy entries are removed.
 */
const parseAndProcessChartRecords = (
    rawRecord: RawChartRecordRow
): ParsedChartRecordRow => {
    const parseEntityNames = (): string[] => {
        if (rawRecord.entityNames === null) return []

        // This is a very rough way to stay below Algolia's 20KB record size
        // limit, but it's better than having the whole update fail.
        if (rawRecord.entityNames.length >= 12000) {
            console.info(
                `Chart ${rawRecord.id} has too many entities, skipping its entities`
            )
            return []
        }

        const withNulls = JSON.parse(rawRecord.entityNames) as (
            | string
            | null
        )[]
        return excludeNullish(withNulls) as string[]
    }

    const entityNames = processAvailableEntities(parseEntityNames())
    const tags = JSON.parse(rawRecord.tags)
    const keyChartForTags = JSON.parse(rawRecord.keyChartForTags).filter(
        (tagName: string | null) => Boolean(tagName)
    )

    return { ...rawRecord, entityNames, tags, keyChartForTags }
}

/**
 * Builds the Algolia records for all charts that are published, indexable and
 * have at least one tag.
 *
 * For every such chart the record carries its parsed entities and tags (plus
 * the parent tags looked up in the tag graph), a plaintext version of the
 * subtitle, the number of posts/pages and gdocs referencing it, its 7-day
 * page views and the score derived from those via `computeScore`.
 *
 * Charts whose `/grapher/<slug>` path is redirected to an explorer are left out.
 */
const getChartsRecords = async (
    knex: db.KnexReadonlyTransaction
): Promise<ChartRecord[]> => {
    const rawRows = await db.knexRaw<RawChartRecordRow>(
        knex,
        `-- sql
WITH indexable_charts_with_entity_names AS (
SELECT c.id,
cc.slug,
cc.full ->> "$.title" AS title,
cc.full ->> "$.variantName" AS variantName,
cc.full ->> "$.subtitle" AS subtitle,
JSON_LENGTH(cc.full ->> "$.dimensions") AS numDimensions,
c.publishedAt,
c.updatedAt,
JSON_ARRAYAGG(e.name) AS entityNames
FROM charts c
LEFT JOIN chart_configs cc ON c.configId = cc.id
LEFT JOIN charts_x_entities ce ON c.id = ce.chartId
LEFT JOIN entities e ON ce.entityId = e.id
WHERE cc.full ->> "$.isPublished" = 'true'
AND c.isIndexable IS TRUE
GROUP BY c.id
)
SELECT c.id,
c.slug,
c.title,
c.variantName,
c.subtitle,
c.numDimensions,
c.publishedAt,
c.updatedAt,
c.entityNames, -- this array may contain null values, will have to filter these out
JSON_ARRAYAGG(t.name) AS tags,
JSON_ARRAYAGG(IF(ct.keyChartLevel = ${KeyChartLevel.Top}, t.name, NULL)) AS keyChartForTags -- this results in an array that contains null entries, will have to filter them out
FROM indexable_charts_with_entity_names c
LEFT JOIN chart_tags ct ON c.id = ct.chartId
LEFT JOIN tags t on ct.tagId = t.id
GROUP BY c.id
HAVING COUNT(t.id) >= 1
`
    )

    const charts = rawRows.map(parseAndProcessChartRecords)
    const pageviews = await getAnalyticsPageviewsByUrlObj(knex)
    const parentTagsByChildName = await db.getParentTagsByChildName(knex)

    const records: ChartRecord[] = []
    for (const chart of charts) {
        const grapherPath = `/grapher/${chart.slug}`

        // Our search currently cannot render explorers, so don't index them
        // because otherwise they will fail when rendered in the search results
        if (isPathRedirectedToExplorer(grapherPath)) continue

        const relatedArticles =
            (await getRelatedArticles(knex, chart.id)) ?? []
        const linksFromGdocs = await getPublishedLinksTo(
            knex,
            [chart.slug],
            OwidGdocLinkType.Grapher
        )

        const plaintextSubtitle = !isNil(chart.subtitle)
            ? new MarkdownTextWrap({
                  text: chart.subtitle,
                  fontSize: 10, // doesn't matter, but is a mandatory field
              }).plaintext
            : undefined

        // A chart can be tagged with a tag that isn't in the tag graph, in
        // which case it simply contributes no parent tags.
        const parentTags = chart.tags.flatMap(
            (tagName) => parentTagsByChildName[tagName] || []
        )

        const recordWithoutScore = {
            objectID: chart.id.toString(),
            chartId: chart.id,
            slug: chart.slug,
            title: chart.title,
            variantName: chart.variantName,
            subtitle: plaintextSubtitle,
            availableEntities: chart.entityNames,
            numDimensions: parseInt(chart.numDimensions),
            publishedAt: chart.publishedAt,
            updatedAt: chart.updatedAt,
            tags: uniq([...chart.tags, ...parentTags]),
            keyChartForTags: chart.keyChartForTags,
            titleLength: chart.title.length,
            // Number of references to this chart in all our posts and pages
            numRelatedArticles: relatedArticles.length + linksFromGdocs.length,
            views_7d: pageviews[grapherPath]?.views_7d ?? 0,
        }

        records.push({
            ...recordWithoutScore,
            score: computeScore(recordWithoutScore),
        })
    }

    return records
}
import { getChartsRecords } from "./utils/charts.js"
import Bugsnag from "@bugsnag/js"

const indexChartsToAlgolia = async () => {
if (!ALGOLIA_INDEXING) return
if (BUGSNAG_NODE_API_KEY) {
Bugsnag.start({
apiKey: BUGSNAG_NODE_API_KEY,
context: "index-explorer-views-to-algolia",
autoTrackSessions: false,
})
}

const client = getAlgoliaClient()
if (!client) {
Expand Down
81 changes: 81 additions & 0 deletions baker/algolia/indexExplorerViewsAndChartsToAlgolia.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
import Bugsnag from "@bugsnag/js"
import * as db from "../../db/db.js"
import { logErrorAndMaybeSendToBugsnag } from "../../serverUtils/errorLog.js"
import {
ALGOLIA_INDEXING,
BUGSNAG_NODE_API_KEY,
} from "../../settings/serverSettings.js"
import { getAlgoliaClient } from "./configureAlgolia.js"
import {
getExplorerViewRecords,
adaptExplorerViews,
} from "./utils/explorerViews.js"
import { scaleRecordScores } from "./utils/shared.js"
import { getChartsRecords } from "./utils/charts.js"
import { getIndexName } from "../../site/search/searchClient.js"
import { SearchIndexName } from "../../site/search/searchTypes.js"

/**
 * Replaces the contents of the combined "explorer views and charts" Algolia
 * index with freshly built explorer view records and chart records.
 *
 * Does nothing when `ALGOLIA_INDEXING` is off. Failures are logged (and sent
 * to Bugsnag if configured) rather than thrown.
 *
 * We get 200k operations with Algolia's Open Source plan. We've hit 140k in the past so this might push us over.
 * If we standardize the record shape, we could have this be the only index and have a `type` field
 * to use in /search.
 */
const indexExplorerViewsAndChartsToAlgolia = async () => {
    if (!ALGOLIA_INDEXING) return
    if (BUGSNAG_NODE_API_KEY) {
        Bugsnag.start({
            apiKey: BUGSNAG_NODE_API_KEY,
            context: "index-explorer-views-to-algolia",
            autoTrackSessions: false,
        })
    }
    const indexName = getIndexName(SearchIndexName.ExplorerViewsAndCharts)
    console.log(
        `Indexing explorer views and charts to the "${indexName}" index on Algolia`
    )
    const client = getAlgoliaClient()
    if (!client) {
        await logErrorAndMaybeSendToBugsnag(
            `Failed indexing explorer views (Algolia client not initialized)`
        )
        return
    }

    try {
        // Both record sets are read within a single read-only transaction.
        const { explorerViews, grapherViews } =
            await db.knexReadonlyTransaction(async (trx) => {
                return {
                    explorerViews: await getExplorerViewRecords(trx, true),
                    grapherViews: await getChartsRecords(trx),
                }
            }, db.TransactionCloseMode.Close)

        // Scale grapher records and the default explorer views between 1000 and 10000,
        // Scale the remaining explorer views between 0 and 1000.
        // This is because Graphers are generally higher quality than Explorers and we don't want
        // the data catalog to smother Grapher results with hundreds of low-quality Explorer results.
        const scaledGrapherViews = scaleRecordScores(
            grapherViews,
            [1000, 10000]
        )
        const scaledExplorerViews = adaptExplorerViews(explorerViews)

        const records = [...scaledGrapherViews, ...scaledExplorerViews]

        const index = client.initIndex(indexName)
        console.log(`Indexing ${records.length} records`)
        await index.replaceAllObjects(records)
        console.log(`Indexing complete`)
    } catch (error) {
        console.log("Error: ", error)
        await logErrorAndMaybeSendToBugsnag({
            name: `IndexExplorerViewsToAlgoliaError`,
            // `message` has to be a string; the caught value can be anything,
            // so convert it explicitly instead of passing the raw object on.
            message: error instanceof Error ? error.message : String(error),
        })
    }
}

// Any promise rejection that nobody handled is fatal for this script:
// print the reason and terminate with a non-zero exit code.
const exitOnUnhandledRejection = (reason: unknown): void => {
    console.error(reason)
    process.exit(1)
}

process.on("unhandledRejection", exitOnUnhandledRejection)

// Script entry point: start indexing right away. `void` marks the returned
// promise as intentionally not awaited.
void indexExplorerViewsAndChartsToAlgolia()
Loading

0 comments on commit f04ea06

Please sign in to comment.