Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Integration of HtmlToText library #183

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions plugins/search-confluence-backend/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@
"dependencies": {
"@backstage/config": "^1.0.6",
"@backstage/plugin-search-common": "^1.2.1",
"@types/html-to-text": "^9.0.1",
"html-to-text": "^9.0.1",
"node-fetch": "^2.6.7",
"p-limit": "^3.1.0",
"winston": "^3.2.1"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,13 @@ import pLimit from 'p-limit';
import { Readable } from 'stream';
import { Logger } from 'winston';
import { ConfluenceDocument, ConfluenceDocumentList, IndexableAncestorRef, IndexableConfluenceDocument } from './types';
import {compile, compiledFunction, HtmlToTextOptions} from "html-to-text";

type ConfluenceCollatorOptions = {
logger: Logger;

parallelismLimit: number;
html2TextCompile: compiledFunction

wikiUrl: string;
spaces: string[];
Expand All @@ -35,17 +37,21 @@ export class ConfluenceCollatorFactory implements DocumentCollatorFactory {
private spaces: string[];
private auth: {username: string, password: string};

private html2TextCompile;

static fromConfig(
config: Config,
options: {
logger: Logger,
parallelismLimit?: number,
htmlToTextOptions?: HtmlToTextOptions
},
) {
return new ConfluenceCollatorFactory({
logger: options.logger,

parallelismLimit: options.parallelismLimit || 15,
html2TextCompile: compile(options.htmlToTextOptions),

wikiUrl: config.getString('confluence.wikiUrl'),
spaces: config.getStringArray('confluence.spaces'),
Expand All @@ -60,6 +66,7 @@ export class ConfluenceCollatorFactory implements DocumentCollatorFactory {
this.logger = options.logger;

this.parallelismLimit = options.parallelismLimit;
this.html2TextCompile = options.html2TextCompile;
this.wikiUrl = options.wikiUrl;
this.spaces = options.spaces;
this.auth = options.auth;
Expand Down Expand Up @@ -209,6 +216,6 @@ export class ConfluenceCollatorFactory implements DocumentCollatorFactory {
}

/**
 * Converts an HTML fragment to plain text.
 *
 * Delegates to the converter held in `html2TextCompile`, which `fromConfig`
 * creates via `compile(options.htmlToTextOptions)` and the constructor stores
 * on the instance. The earlier regex-based tag removal is intentionally gone:
 * it returned first, which left the converter call unreachable, and it only
 * deleted `<...>` markup.
 *
 * @param input - HTML markup to convert.
 * @returns The text produced by the compiled html-to-text converter.
 */
private stripHtml(input: string): string {
  return this.html2TextCompile(input);
}
}
124 changes: 123 additions & 1 deletion yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -2872,7 +2872,9 @@ __metadata:
"@backstage/cli": ^0.22.3
"@backstage/config": ^1.0.6
"@backstage/plugin-search-common": ^1.2.1
"@types/html-to-text": ^9.0.1
"@types/node-fetch": ^2.6.1
html-to-text: ^9.0.1
node-fetch: ^2.6.7
p-limit: ^3.1.0
winston: ^3.2.1
Expand Down Expand Up @@ -3559,6 +3561,16 @@ __metadata:
languageName: node
linkType: hard

"@selderee/plugin-htmlparser2@npm:^0.11.0":
version: 0.11.0
resolution: "@selderee/plugin-htmlparser2@npm:0.11.0"
dependencies:
domhandler: ^5.0.3
selderee: ^0.11.0
checksum: 6deafedd153e492359f8f0407d20903d82f2ef4950e420f4b2ee6ffbb955753524631aac7d6a5fe61dc7c7893e6928b4d8409e886157ad64a60ab37bc08b17c4
languageName: node
linkType: hard

"@sinclair/typebox@npm:^0.24.1":
version: 0.24.51
resolution: "@sinclair/typebox@npm:0.24.51"
Expand Down Expand Up @@ -4240,6 +4252,13 @@ __metadata:
languageName: node
linkType: hard

"@types/html-to-text@npm:^9.0.1":
version: 9.0.1
resolution: "@types/html-to-text@npm:9.0.1"
checksum: 5efed629a8d667164c4618645e88582153cf4b5f94d57124484c3da0511d6550391c34859ee5bab6f0bfa93e2a56ae6169a6097ce3038400513507727c773150
languageName: node
linkType: hard

"@types/http-proxy@npm:^1.17.8":
version: 1.17.9
resolution: "@types/http-proxy@npm:1.17.9"
Expand Down Expand Up @@ -7268,6 +7287,13 @@ __metadata:
languageName: node
linkType: hard

"deepmerge@npm:^4.3.1":
version: 4.3.1
resolution: "deepmerge@npm:4.3.1"
checksum: 2024c6a980a1b7128084170c4cf56b0fd58a63f2da1660dcfe977415f27b17dbe5888668b59d0b063753f3220719d5e400b7f113609489c90160bb9a5518d052
languageName: node
linkType: hard

"default-gateway@npm:^6.0.3":
version: 6.0.3
resolution: "default-gateway@npm:6.0.3"
Expand Down Expand Up @@ -7508,14 +7534,25 @@ __metadata:
languageName: node
linkType: hard

"dom-serializer@npm:^2.0.0":
version: 2.0.0
resolution: "dom-serializer@npm:2.0.0"
dependencies:
domelementtype: ^2.3.0
domhandler: ^5.0.2
entities: ^4.2.0
checksum: cd1810544fd8cdfbd51fa2c0c1128ec3a13ba92f14e61b7650b5de421b88205fd2e3f0cc6ace82f13334114addb90ed1c2f23074a51770a8e9c1273acbc7f3e6
languageName: node
linkType: hard

"domain-browser@npm:^1.1.1":
version: 1.2.0
resolution: "domain-browser@npm:1.2.0"
checksum: 8f1235c7f49326fb762f4675795246a6295e7dd566b4697abec24afdba2460daa7dfbd1a73d31efbf5606b3b7deadb06ce47cf06f0a476e706153d62a4ff2b90
languageName: node
linkType: hard

"domelementtype@npm:^2.0.1, domelementtype@npm:^2.2.0":
"domelementtype@npm:^2.0.1, domelementtype@npm:^2.2.0, domelementtype@npm:^2.3.0":
version: 2.3.0
resolution: "domelementtype@npm:2.3.0"
checksum: ee837a318ff702622f383409d1f5b25dd1024b692ef64d3096ff702e26339f8e345820f29a68bcdcea8cfee3531776b3382651232fbeae95612d6f0a75efb4f6
Expand All @@ -7540,6 +7577,15 @@ __metadata:
languageName: node
linkType: hard

"domhandler@npm:^5.0.2, domhandler@npm:^5.0.3":
version: 5.0.3
resolution: "domhandler@npm:5.0.3"
dependencies:
domelementtype: ^2.3.0
checksum: 0f58f4a6af63e6f3a4320aa446d28b5790a009018707bce2859dcb1d21144c7876482b5188395a188dfa974238c019e0a1e610d2fc269a12b2c192ea2b0b131c
languageName: node
linkType: hard

"domutils@npm:^2.5.2, domutils@npm:^2.8.0":
version: 2.8.0
resolution: "domutils@npm:2.8.0"
Expand All @@ -7551,6 +7597,17 @@ __metadata:
languageName: node
linkType: hard

"domutils@npm:^3.0.1":
version: 3.1.0
resolution: "domutils@npm:3.1.0"
dependencies:
dom-serializer: ^2.0.0
domelementtype: ^2.3.0
domhandler: ^5.0.3
checksum: e5757456ddd173caa411cfc02c2bb64133c65546d2c4081381a3bafc8a57411a41eed70494551aa58030be9e58574fcc489828bebd673863d39924fb4878f416
languageName: node
linkType: hard

"dot-case@npm:^3.0.4":
version: 3.0.4
resolution: "dot-case@npm:3.0.4"
Expand Down Expand Up @@ -7690,6 +7747,13 @@ __metadata:
languageName: node
linkType: hard

"entities@npm:^4.2.0":
version: 4.5.0
resolution: "entities@npm:4.5.0"
checksum: 853f8ebd5b425d350bffa97dd6958143179a5938352ccae092c62d1267c4e392a039be1bae7d51b6e4ffad25f51f9617531fedf5237f15df302ccfb452cbf2d7
languageName: node
linkType: hard

"entities@npm:^4.4.0":
version: 4.4.0
resolution: "entities@npm:4.4.0"
Expand Down Expand Up @@ -9688,6 +9752,19 @@ __metadata:
languageName: node
linkType: hard

"html-to-text@npm:^9.0.1":
version: 9.0.5
resolution: "html-to-text@npm:9.0.5"
dependencies:
"@selderee/plugin-htmlparser2": ^0.11.0
deepmerge: ^4.3.1
dom-serializer: ^2.0.0
htmlparser2: ^8.0.2
selderee: ^0.11.0
checksum: 205e0faa9b9aa281b369122acdffc5f348848e400f4037fde1fb12d68a6baa11644d2b64c3cc6821a79d3bc7316d89e85cc733d86f7f709858cb5c5b72faac65
languageName: node
linkType: hard

"html-webpack-plugin@npm:^5.3.1":
version: 5.5.0
resolution: "html-webpack-plugin@npm:5.5.0"
Expand Down Expand Up @@ -9715,6 +9792,18 @@ __metadata:
languageName: node
linkType: hard

"htmlparser2@npm:^8.0.2":
version: 8.0.2
resolution: "htmlparser2@npm:8.0.2"
dependencies:
domelementtype: ^2.3.0
domhandler: ^5.0.3
domutils: ^3.0.1
entities: ^4.4.0
checksum: 29167a0f9282f181da8a6d0311b76820c8a59bc9e3c87009e21968264c2987d2723d6fde5a964d4b7b6cba663fca96ffb373c06d8223a85f52a6089ced942700
languageName: node
linkType: hard

"http-cache-semantics@npm:^4.1.0":
version: 4.1.0
resolution: "http-cache-semantics@npm:4.1.0"
Expand Down Expand Up @@ -11533,6 +11622,13 @@ __metadata:
languageName: node
linkType: hard

"leac@npm:^0.6.0":
version: 0.6.0
resolution: "leac@npm:0.6.0"
checksum: a7a722cfc2ddfd6fb2620e5dee3ac8e9b0af4eb04325f3c8286a820de78becba3010a4d7026ff5189bb159eb7a851c3a1ac73e076eb0d54fcee0adaf695291ba
languageName: node
linkType: hard

"leven@npm:^3.1.0":
version: 3.1.0
resolution: "leven@npm:3.1.0"
Expand Down Expand Up @@ -13617,6 +13713,16 @@ __metadata:
languageName: node
linkType: hard

"parseley@npm:^0.12.0":
version: 0.12.1
resolution: "parseley@npm:0.12.1"
dependencies:
leac: ^0.6.0
peberminta: ^0.9.0
checksum: 147760bce6c4a4f8c62af021a84ced262f078f60a1119e6891eba69567a953e06295ad2c70e5e89892ad1d4af0126f0856742d657a19a29ebf58422cf3bfd4f3
languageName: node
linkType: hard

"parseurl@npm:~1.3.2, parseurl@npm:~1.3.3":
version: 1.3.3
resolution: "parseurl@npm:1.3.3"
Expand Down Expand Up @@ -13717,6 +13823,13 @@ __metadata:
languageName: node
linkType: hard

"peberminta@npm:^0.9.0":
version: 0.9.0
resolution: "peberminta@npm:0.9.0"
checksum: b983b68077269ca8a3327520a0a3f027fa930faa9fb3cb53bed1cb3847ebc0ed55db936d70b1745a756149911f5f450e898e87e25ab207f1b8b892bed48fb540
languageName: node
linkType: hard

"picocolors@npm:^1.0.0":
version: 1.0.0
resolution: "picocolors@npm:1.0.0"
Expand Down Expand Up @@ -15631,6 +15744,15 @@ __metadata:
languageName: node
linkType: hard

"selderee@npm:^0.11.0":
version: 0.11.0
resolution: "selderee@npm:0.11.0"
dependencies:
parseley: ^0.12.0
checksum: af8a68c1f4cde858152943b6fc9f2b7164c8fb1a1c9f01b44350dffd1f79783930d77a0ae33548a036816d17c8130eeb9d15f1db65c9262ca368ad3a0d750f66
languageName: node
linkType: hard

"select-hose@npm:^2.0.0":
version: 2.0.0
resolution: "select-hose@npm:2.0.0"
Expand Down