diff --git a/crowdin.yml b/crowdin.yml index 583a16179730..6c597c315886 100644 --- a/crowdin.yml +++ b/crowdin.yml @@ -1,6 +1,8 @@ files: - source: /content/**/*.md translation: /translations/%locale%/%original_path%/%original_file_name% + # See lib/page-data.js for a matching list of prefix exceptions + # Try to keep these in sync when editing in either location. ignore: - '/content/README.md' - '/content/early-access' diff --git a/lib/create-tree.js b/lib/create-tree.js index e2b02ea26725..aa832436491a 100644 --- a/lib/create-tree.js +++ b/lib/create-tree.js @@ -8,7 +8,7 @@ const __dirname = path.dirname(fileURLToPath(import.meta.url)) const _basePaths = new Map() // Return a full directory based on __dirname from a specific language directory. // This function is memoized with a simple global cache object. -function getBasePath(directory) { +export function getBasePath(directory) { if (!_basePaths.has(directory)) { _basePaths.set(directory, path.posix.join(__dirname, '..', directory, 'content')) } diff --git a/lib/page-data.js b/lib/page-data.js index e5ac95f2ba51..cc4c4ccffff5 100644 --- a/lib/page-data.js +++ b/lib/page-data.js @@ -2,15 +2,29 @@ import { fileURLToPath } from 'url' import path from 'path' import languages from './languages.js' import { allVersions } from './all-versions.js' -import createTree from './create-tree.js' +import createTree, { getBasePath } from './create-tree.js' import renderContent from './render-content/index.js' import loadSiteData from './site-data.js' import nonEnterpriseDefaultVersion from './non-enterprise-default-version.js' +import Page from './page.js' const __dirname = path.dirname(fileURLToPath(import.meta.url)) const versions = Object.keys(allVersions) const enterpriseServerVersions = versions.filter((v) => v.startsWith('enterprise-server@')) const renderOpts = { textOnly: true, encodeEntities: true } +// These are the exceptions to the rule. +// These URI prefixes should match what you'll find in crowdin.yml. +// If a URI starts with one of these prefixes, it basically means we don't +// bother to "backfill" a translation in its spot. +// For example, `/en/github/site-policy-deprecated/foo` works +// only in English and we don't bother making `/ja/github/site-policy-deprecated/foo` +// work too. +const TRANSLATION_DRIFT_EXCEPTIONS = [ + 'github/site-policy-deprecated', + // Early access stuff never has translations. + 'early-access', +] + /** * We only need to initialize pages _once per language_ since pages don't change per version. So we do that * first since it's the most expensive work. This gets us a nested object with pages attached that we can use @@ -152,8 +166,112 @@ export function createMapFromArray(pageList) { } export async function loadPageMap(pageList) { - const pages = pageList || (await loadPageList()) - return createMapFromArray(pages) + const pages = await correctTranslationOrphans(pageList || (await loadPageList())) + const pageMap = createMapFromArray(pages) + return pageMap +} + +// If a translation page exists, that doesn't have an English equivalent, +// remove it. +// If an English page exists, that doesn't have an translation equivalent, +// add it. +// Note, this function is exported purely for the benefit of the unit tests. +export async function correctTranslationOrphans(pageList, basePath = null) { + const englishRelativePaths = new Set() + for (const page of pageList) { + if (page.languageCode === 'en') { + englishRelativePaths.add(page.relativePath) + } + } + + // Prime the Map with an empty set for each language prefix. + // It's important that we do this for *every* language rather than + // just populating `nonEnglish` based on those pages that *are* present. + // Otherwise, we won't have an index of all the languages + // that *might* be missing. + const nonEnglish = new Map() + Object.keys(languages) + .filter((lang) => lang !== 'en') + .forEach((languageCode) => { + nonEnglish.set(languageCode, new Set()) + }) + + // By default, when backfilling, we set the `basePath` to be that of + // English. But for the benefit of being able to do unit tests, + // we make this an optional argument. Then, unit tests can use + // its "tests/fixtures" directory. + const englishBasePath = basePath || getBasePath(languages.en.dir) + + // Filter out all non-English pages that appear to be excess. + // E.g. if an English doc was renamed from `content/foo.md` to + // `content/bar.md` what will happen is that `translations/*/content/foo.md` + // will still linger around and we want to remove that even if it was + // scooped up from disk. + const newPageList = [] + for (const page of pageList) { + if (page.languageCode === 'en') { + // English pages are never considered "excess" + newPageList.push(page) + continue + } + + // If this translation page exists in English, keep it but also + // add it to the set of relative paths that is known. + if (englishRelativePaths.has(page.relativePath)) { + nonEnglish.get(page.languageCode).add(page.relativePath) + newPageList.push(page) + continue + } + + // All else is considered "excess" and should be excluded from + // the new list of pages. So do nothing. + if (process.env.NODE_ENV === 'development') { + console.log( + `For ${page.languageCode}, omit the page ${page.relativePath} even though it's present on disk.` + ) + } + } + + const pageLoadPromises = [] + for (const relativePath of englishRelativePaths) { + for (const [languageCode, relativePaths] of nonEnglish) { + if (!relativePaths.has(relativePath)) { + // At this point, we've found an English `relativePath` that is + // not used by this language. + // But before we decide to "backfill" it from the English equivalent + // we first need to figure out if it should be excluded. + // The reason for doing this check this late is for the benefit + // of optimization. In general, when the translation pipeline has + // done its magic, this should be very rare, so it's unnecessary + // to do this exception check on every single English relativePath. + if (TRANSLATION_DRIFT_EXCEPTIONS.find((exception) => relativePath.startsWith(exception))) { + continue + } + + // The magic right here! + // The trick is that we can't clone instances of class Page. We need + // to create them for this language. But the trick is that we + // use the English relative path so it can have something to read. + // For example, if we have figured out that + // `translations/ja-JP/content/foo.md` doesn't exist, we pretend + // that we can use `foo.md` and the base path of `content/`. + pageLoadPromises.push( + Page.init({ + basePath: englishBasePath, + relativePath: relativePath, + languageCode: languageCode, + }) + ) + if (process.env.NODE_ENV === 'development') { + console.log(`Backfill ${relativePath} for ${languageCode} from English`) + } + } + } + } + const additionalPages = await Promise.all(pageLoadPromises) + newPageList.push(...additionalPages) + + return newPageList } export default { diff --git a/tests/unit/load-translation-orphans.js b/tests/unit/load-translation-orphans.js new file mode 100644 index 000000000000..581c6ee23cc9 --- /dev/null +++ b/tests/unit/load-translation-orphans.js @@ -0,0 +1,60 @@ +import path from 'path' +import { fileURLToPath } from 'url' + +import { expect } from '@jest/globals' + +import languages from '../../lib/languages.js' +import Page from '../../lib/page.js' +import { loadPageMap, correctTranslationOrphans } from '../../lib/page-data.js' +const __dirname = path.dirname(fileURLToPath(import.meta.url)) + +describe('loading page map with translation orphans', () => { + test('inject missing translations from English', async () => { + const basePath = path.join(__dirname, '../fixtures') + const page = await Page.init({ + relativePath: 'page-that-does-not-exist-in-translations-dir.md', + basePath, + languageCode: 'en', + }) + console.assert(page, 'page could not be loaded') + + const pageList = [page] + const pageMap = await loadPageMap(await correctTranslationOrphans(pageList, basePath)) + // It should make a copy of the English into each language + expect(Object.keys(pageMap).length).toBe(Object.keys(languages).length) + + // +1 for the test just above, 2 tests per language. + expect.assertions(1 + Object.keys(languages).length * 2) + + for (const languageCode of Object.keys(languages)) { + for (const permalink of page.permalinks) { + const translationHref = permalink.href.replace('/en', `/${languageCode}`) + const translationPage = pageMap[translationHref] + expect(translationPage).toBeTruthy() + expect(translationPage.languageCode).toBe(languageCode) + } + } + }) + + test('remove translation pages that were not in English', async () => { + const basePath = path.join(__dirname, '../fixtures') + const page = await Page.init({ + relativePath: 'page-that-does-not-exist-in-translations-dir.md', + basePath, + languageCode: 'en', + }) + console.assert(page, 'page could not be loaded') + const orphan = await Page.init({ + relativePath: 'article-with-videos.md', + basePath, + languageCode: 'ja', + }) + console.assert(orphan, 'page could not be loaded') + const orphanPermalinks = orphan.permalinks.map((p) => p.href) + + const pageList = await correctTranslationOrphans([page, orphan], basePath) + const pageMap = await loadPageMap(pageList) + expect(pageMap[orphanPermalinks[0]]).toBeFalsy() + expect(Object.keys(pageMap).length).toBe(Object.keys(languages).length) + }) +}) diff --git a/tests/unit/pages.js b/tests/unit/pages.js index 4a8f0454173f..fe66b9456c25 100644 --- a/tests/unit/pages.js +++ b/tests/unit/pages.js @@ -1,6 +1,6 @@ import { jest } from '@jest/globals' import path from 'path' -import { loadPages, loadPageMap } from '../../lib/page-data.js' +import { loadPages, loadPageMap, correctTranslationOrphans } from '../../lib/page-data.js' import libLanguages from '../../lib/languages.js' import { liquid } from '../../lib/render-content/index.js' import patterns from '../../lib/patterns.js' @@ -13,13 +13,24 @@ import removeFPTFromPath from '../../lib/remove-fpt-from-path.js' const languageCodes = Object.keys(libLanguages) const slugger = new GithubSlugger() +// By default, the tests don't change that each translation has an +// equivalent English page (e.g. `translations/*/content/foo.md` +// expects `content/foo.md`) +// Set the environment variable `TEST_TRANSLATION_MATCHING=true` +// to enable that test. +const testIfRequireTranslationMatching = JSON.parse( + process.env.TEST_TRANSLATION_MATCHING || 'false' +) + ? test + : test.skip + describe('pages module', () => { jest.setTimeout(60 * 1000) let pages beforeAll(async () => { - pages = await loadPages() + pages = await correctTranslationOrphans(await loadPages()) }) describe('loadPages', () => { @@ -76,8 +87,8 @@ describe('pages module', () => { const message = `Found ${duplicates.length} duplicate redirect_from ${ duplicates.length === 1 ? 'path' : 'paths' - }. - Ensure that you don't define the same path more than once in the redirect_from property in a single file and across all English files. + }. + Ensure that you don't define the same path more than once in the redirect_from property in a single file and across all English files. You may also receive this error if you have defined the same children property more than once.\n ${duplicates.join('\n')}` expect(duplicates.length, message).toBe(0) @@ -152,26 +163,29 @@ describe('pages module', () => { expect(liquidErrors.length, failureMessage).toBe(0) }) - test('every non-English page has a matching English page', async () => { - const englishPaths = chain(walk('content', { directories: false })) - .uniq() - .value() - - const nonEnglishPaths = chain(Object.values(libLanguages)) - .filter((language) => language.code !== 'en') - .map((language) => walk(`${language.dir}/content`, { directories: false })) - .flatten() - .uniq() - .value() - - const diff = difference(nonEnglishPaths, englishPaths) - const failureMessage = ` + testIfRequireTranslationMatching( + 'every non-English page has a matching English page', + async () => { + const englishPaths = chain(walk('content', { directories: false })) + .uniq() + .value() + + const nonEnglishPaths = chain(Object.values(libLanguages)) + .filter((language) => language.code !== 'en') + .map((language) => walk(`${language.dir}/content`, { directories: false })) + .flatten() + .uniq() + .value() + + const diff = difference(nonEnglishPaths, englishPaths) + const failureMessage = ` Found ${diff.length} non-English pages without a matching English page:\n - ${diff.join('\n - ')} Remove them with script/i18n/prune-stale-files.js and commit your changes using "git commit --no-verify". ` - expect(diff.length, failureMessage).toBe(0) - }) + expect(diff.length, failureMessage).toBe(0) + } + ) }) describe('loadPageMap', () => {