Skip to content

Commit e5ee95a

Browse files
Add hierarchical topic extraction in the website knowledge extraction workflow (#1690)
1 parent cfffaac commit e5ee95a

26 files changed

+4200
-63
lines changed

ts/packages/agents/browser/scripts/buildExtension.mjs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ const sharedScripts = {
5959
"views/pageMacros": "views/pageMacros.ts",
6060
"views/macrosLibrary": "views/macrosLibrary.ts",
6161
"views/entityGraphView": "views/entityGraphView.ts",
62+
"views/topicGraphView": "views/topicGraphView.ts",
6263
"views/pageKnowledge": "views/pageKnowledge.ts",
6364
"views/annotationsLibrary": "views/annotationsLibrary.ts",
6465
"views/knowledgeLibrary": "views/knowledgeLibrary.ts",
@@ -108,6 +109,8 @@ const libraryAssets = [
108109
"views/annotationsLibrary.html",
109110
"views/entityGraphView.css",
110111
"views/entityGraphView.html",
112+
"views/topicGraphView.css",
113+
"views/topicGraphView.html",
111114
"views/knowledgeLibrary.css",
112115
"views/knowledgeLibrary.html",
113116
"views/macrosLibrary.css",

ts/packages/agents/browser/src/agent/browserActionHandler.mts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -635,6 +635,7 @@ async function processBrowserAgentMessage(
635635
case "getEntityNeighborhood":
636636
case "getGlobalImportanceLayer":
637637
case "getImportanceStatistics":
638+
case "getHierarchicalTopics":
638639
case "getViewportBasedNeighborhood": {
639640
const knowledgeResult = await handleKnowledgeAction(
640641
data.method,
@@ -944,6 +945,7 @@ async function resolveWebPage(
944945
entityGraph: "typeagent-browser://views/entityGraphView.html",
945946
knowledgelibrary: "typeagent-browser://views/knowledgeLibrary.html",
946947
macroslibrary: "typeagent-browser://views/macrosLibrary.html",
948+
topicGraph: "typeagent-browser://views/topicGraphView.html",
947949
};
948950

949951
const libraryUrl = libraryPages[site.toLowerCase()];

ts/packages/agents/browser/src/agent/browserConnector.mts

Lines changed: 1 addition & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -75,19 +75,12 @@ export class BrowserConnector {
7575
useTimestampIds?: boolean,
7676
compressionMode?: string,
7777
): Promise<any[]> {
78-
if (this.browserControl) {
79-
return this.browserControl.getHtmlFragments(
80-
useTimestampIds,
81-
compressionMode,
82-
);
83-
}
84-
// Fallback to sending action to browser if browserControl is not available
8578
const result = await this.sendActionToBrowser({
8679
actionName: "getHTML",
8780
parameters: {
8881
fullHTML: compressionMode === "None",
8982
downloadAsFile: false,
90-
extractText: true,
83+
extractText: compressionMode !== "knowledgeExtraction",
9184
useTimestampIds: useTimestampIds,
9285
},
9386
});
@@ -123,20 +116,13 @@ export class BrowserConnector {
123116
}
124117

125118
async clickOn(cssSelector: string): Promise<any> {
126-
if (this.browserControl) {
127-
return this.browserControl.clickOn(cssSelector);
128-
}
129-
130119
return this.sendActionToBrowser({
131120
actionName: "clickOnElement",
132121
parameters: { cssSelector },
133122
});
134123
}
135124

136125
async setDropdown(cssSelector: string, optionLabel: string): Promise<any> {
137-
if (this.browserControl) {
138-
return this.browserControl.setDropdown(cssSelector, optionLabel);
139-
}
140126
return this.sendActionToBrowser({
141127
actionName: "setDropdownValue",
142128
parameters: { cssSelector, optionLabel },
@@ -148,14 +134,6 @@ export class BrowserConnector {
148134
cssSelector?: string,
149135
submitForm?: boolean,
150136
): Promise<any> {
151-
if (this.browserControl) {
152-
return this.browserControl.enterTextIn(
153-
textValue,
154-
cssSelector,
155-
submitForm,
156-
);
157-
}
158-
159137
const actionName = cssSelector
160138
? "enterTextInElement"
161139
: "enterTextOnPage";
@@ -170,10 +148,6 @@ export class BrowserConnector {
170148
}
171149

172150
async awaitPageLoad(timeout?: number): Promise<any> {
173-
if (this.browserControl) {
174-
return this.browserControl.awaitPageLoad(timeout);
175-
}
176-
177151
const actionPromise = this.sendActionToBrowser({
178152
actionName: "awaitPageLoad",
179153
});
@@ -187,10 +161,6 @@ export class BrowserConnector {
187161
}
188162

189163
async awaitPageInteraction(timeout?: number) {
190-
if (this.browserControl) {
191-
return this.browserControl.awaitPageInteraction(timeout);
192-
}
193-
194164
if (!timeout) {
195165
timeout = 400;
196166
}

ts/packages/agents/browser/src/agent/import/importProgressEvents.mts

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@ export interface ImportProgressEvent {
1212
| "fetching"
1313
| "processing"
1414
| "extracting"
15+
| "graph-building"
16+
| "persisting"
1517
| "complete"
1618
| "error";
1719
current: number;
@@ -36,6 +38,18 @@ export interface ImportProgressEvent {
3638
filename?: string;
3739
currentAction?: string;
3840
};
41+
graphBuildingPhase?:
42+
| "entities"
43+
| "relationships"
44+
| "topics"
45+
| "communities";
46+
entitiesProcessed?: number;
47+
relationshipsBuilt?: number;
48+
topicsHierarchized?: number;
49+
lastSavePoint?: number;
50+
nextSavePoint?: number;
51+
dataPersistedToDisk?: boolean;
52+
graphPersistedToDb?: boolean;
3953
}
4054

4155
export class ImportProgressEventEmitter extends EventEmitter {
Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
// Copyright (c) Microsoft Corporation.
2+
// Licensed under the MIT License.
3+
4+
import * as fs from "fs/promises";
5+
import * as path from "path";
6+
import registerDebug from "debug";
7+
const debug = registerDebug("typeagent:browser:import");
8+
9+
/**
 * Snapshot of an import's progress.
 *
 * `ImportStateManager.saveImportState` serializes this object with
 * `JSON.stringify` into `<importId>.json`, and
 * `ImportStateManager.loadImportState` returns the parsed JSON as this type
 * without validating it.
 */
export interface ImportState {
10+
/** Identifier of the import; used as the base name of the state file. */
importId: string;
11+
/** Presumably the number of websites the import is expected to handle — TODO confirm against the caller. */
totalWebsites: number;
12+
/** Presumably how many websites have been handled so far — TODO confirm against the caller. */
processedWebsites: number;
13+
/** NOTE(review): looks like the most recent save point reached (see `calculateSavePoints`) — confirm. */
lastSavePoint: number;
14+
/** URLs recorded as failed; how they are collected is not visible here. */
failedUrls: string[];
15+
/** NOTE(review): numeric timestamp; unit (likely epoch milliseconds) is not established in this file — confirm. */
startTime: number;
16+
/** NOTE(review): numeric timestamp of the latest progress update; unit not established in this file — confirm. */
lastProgressTime: number;
17+
/** Extraction mode name; the set of valid values is defined elsewhere. */
extractionMode: string;
18+
/** Import source descriptor; valid values are defined elsewhere. */
source: string;
19+
/** Import type descriptor; valid values are defined elsewhere. */
type: string;
20+
/** Optional file path; presumably set for file-based imports — verify against the caller. */
filePath?: string;
21+
}
22+
23+
/**
 * File-backed bookkeeping for imports.
 *
 * Import state is stored as `<cwd>/.import-states/<importId>.json`, and
 * collection backups are addressed as
 * `<cwd>/.collection-backups/<importId>_<savePoint>.json`.
 *
 * All members are static. They reference the class by name rather than
 * `this`, so a method keeps working when it is detached and passed around as
 * a callback.
 *
 * NOTE(review): `importId` is interpolated into file names unchanged. If it
 * can ever come from untrusted input, it should be validated by the caller
 * (e.g. rejecting path separators) before reaching this class.
 */
export class ImportStateManager {
    private static readonly STATE_DIR = path.join(
        process.cwd(),
        ".import-states",
    );
    private static readonly COLLECTION_BACKUPS_DIR = path.join(
        process.cwd(),
        ".collection-backups",
    );

    /** Creates the state and backup directories if they do not exist yet. */
    static async ensureDirectories(): Promise<void> {
        await fs.mkdir(ImportStateManager.STATE_DIR, { recursive: true });
        await fs.mkdir(ImportStateManager.COLLECTION_BACKUPS_DIR, {
            recursive: true,
        });
    }

    /**
     * Writes `state` as pretty-printed JSON to `<importId>.json`.
     *
     * @param state - Snapshot to persist; `state.importId` names the file.
     * @throws Propagates any file-system error from `mkdir`/`writeFile`.
     */
    static async saveImportState(state: ImportState): Promise<void> {
        await ImportStateManager.ensureDirectories();
        const statePath = path.join(
            ImportStateManager.STATE_DIR,
            `${state.importId}.json`,
        );
        await fs.writeFile(statePath, JSON.stringify(state, null, 2));
        debug(`Import state saved for ${state.importId} at ${statePath}`);
    }

    /**
     * Reads back the state written by {@link saveImportState}.
     *
     * @param importId - Identifier whose state file should be read.
     * @returns The parsed state, or `null` when the file is missing,
     * unreadable, or not valid JSON. The parsed value is not validated
     * against the `ImportState` shape.
     */
    static async loadImportState(
        importId: string,
    ): Promise<ImportState | null> {
        try {
            const statePath = path.join(
                ImportStateManager.STATE_DIR,
                `${importId}.json`,
            );
            const data = await fs.readFile(statePath, "utf-8");
            return JSON.parse(data) as ImportState;
        } catch (error) {
            debug(`Failed to load import state for ${importId}: ${error}`);
            return null;
        }
    }

    /**
     * Removes the state file for `importId`. Best-effort: failures (including
     * a missing file) are logged and never thrown.
     */
    static async deleteImportState(importId: string): Promise<void> {
        try {
            const statePath = path.join(
                ImportStateManager.STATE_DIR,
                `${importId}.json`,
            );
            await fs.unlink(statePath);
            debug(`Import state deleted for ${importId}`);
        } catch (error) {
            debug(`Failed to delete import state for ${importId}: ${error}`);
        }
    }

    /**
     * Builds the backup path for a given import and save point.
     *
     * @returns `<backups dir>/<importId>_<savePoint>.json`
     */
    static getCollectionBackupPath(
        importId: string,
        savePoint: number,
    ): string {
        return path.join(
            ImportStateManager.COLLECTION_BACKUPS_DIR,
            `${importId}_${savePoint}.json`,
        );
    }

    /**
     * Tells whether `fileName` is a backup produced for exactly `importId`,
     * i.e. has the form `<importId>_<savePoint>.json` where `<savePoint>` is
     * the canonical string form of a number (what a template literal emits).
     *
     * A plain prefix test is not enough: `"imp1"` is a prefix of
     * `"imp10_50.json"`, and an empty id is a prefix of every file.
     */
    static isBackupFileFor(importId: string, fileName: string): boolean {
        const prefix = `${importId}_`;
        const suffix = ".json";
        if (
            importId.length === 0 ||
            !fileName.startsWith(prefix) ||
            !fileName.endsWith(suffix)
        ) {
            return false;
        }
        const savePoint = fileName.slice(
            prefix.length,
            fileName.length - suffix.length,
        );
        // Round-trip through Number so only canonical numeric strings pass;
        // this rejects "", "b_50", "050", " 5" and similar.
        return savePoint.length > 0 && String(Number(savePoint)) === savePoint;
    }

    /**
     * Deletes every backup file that belongs to `importId`. Best-effort:
     * nothing is thrown. A failure on one file is logged and does not stop
     * the remaining files from being removed.
     */
    static async cleanupOldBackups(importId: string): Promise<void> {
        try {
            const files = await fs.readdir(
                ImportStateManager.COLLECTION_BACKUPS_DIR,
            );
            const importFiles = files.filter((f) =>
                ImportStateManager.isBackupFileFor(importId, f),
            );

            let removed = 0;
            for (const file of importFiles) {
                try {
                    await fs.unlink(
                        path.join(
                            ImportStateManager.COLLECTION_BACKUPS_DIR,
                            file,
                        ),
                    );
                    removed++;
                } catch (error) {
                    debug(
                        `Failed to delete backup ${file} for ${importId}: ${error}`,
                    );
                }
            }
            debug(`Cleaned up ${removed} backup files for ${importId}`);
        } catch (error) {
            debug(`Failed to cleanup backups for ${importId}: ${error}`);
        }
    }

    /**
     * Computes the item counts at which progress should be saved.
     *
     * The interval is the smaller of 50 items and 20% of `totalCount`
     * (rounded up). `totalCount` itself is always the last entry.
     *
     * @param totalCount - Total number of items in the import.
     * @returns Ascending save points, ending with `totalCount`.
     */
    static calculateSavePoints(totalCount: number): number[] {
        const fixedInterval = 50;
        const percentageInterval = Math.ceil(totalCount * 0.2);
        const saveInterval = Math.min(fixedInterval, percentageInterval);

        const savePoints: number[] = [];
        for (let i = saveInterval; i < totalCount; i += saveInterval) {
            savePoints.push(i);
        }
        savePoints.push(totalCount); // Always save at end

        return savePoints;
    }
}

ts/packages/agents/browser/src/agent/import/importWebSocketHandler.mts

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,10 @@ export class ImportWebSocketHandler {
2525

2626
private forwardProgressToWebSocket(progress: ImportProgressEvent) {
2727
try {
28-
const client = this.context.agentContext.currentClient;
28+
// Get client from agentWebSocketServer instead of currentClient
29+
const agentServer = this.context.agentContext.agentWebSocketServer;
30+
const client = agentServer?.getActiveClient();
31+
2932
if (client && client.socket.readyState === WebSocket.OPEN) {
3033
const websocketProgress = {
3134
type: "importProgress",

ts/packages/agents/browser/src/agent/knowledge/actions/extractionActions.mts

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1090,12 +1090,15 @@ export async function performKnowledgeExtractionWithNotifications(
10901090

10911091
const { progressState, aggregatedKnowledge } = activeExtraction;
10921092
const entitiesCount = aggregatedKnowledge.entities?.length || 0;
1093-
const topicsCount = aggregatedKnowledge.topics?.length || 0;
1093+
const topicsCount =
1094+
aggregatedKnowledge.topics?.length ||
1095+
(aggregatedKnowledge as any).keyTopics?.length ||
1096+
0;
10941097
const relationshipsCount =
10951098
aggregatedKnowledge.relationships?.length || 0;
10961099

10971100
// Only send notification if knowledge has changed
1098-
const currentState = `${entitiesCount}-${topicsCount}-${relationshipsCount}-${progressState.phase}`;
1101+
const currentState = `${entitiesCount}-${topicsCount}-${relationshipsCount}-${progressState.phase}-${progressState.percentage}`;
10991102
if (
11001103
currentState !== lastNotificationState &&
11011104
(entitiesCount > 0 || topicsCount > 0 || relationshipsCount > 0)
@@ -1166,7 +1169,10 @@ export async function performKnowledgeExtractionWithNotifications(
11661169
updateExtractionTimestamp(url);
11671170

11681171
const entitiesCount = knowledge.entities?.length || 0;
1169-
const topicsCount = knowledge.topics?.length || 0;
1172+
const topicsCount =
1173+
knowledge.topics?.length ||
1174+
knowledge.keyTopics?.length ||
1175+
0;
11701176
const relationshipsCount =
11711177
knowledge.relationships?.length || 0;
11721178

@@ -1202,7 +1208,10 @@ export async function performKnowledgeExtractionWithNotifications(
12021208
indexError,
12031209
);
12041210
const entitiesCount = knowledge.entities?.length || 0;
1205-
const topicsCount = knowledge.topics?.length || 0;
1211+
const topicsCount =
1212+
knowledge.topics?.length ||
1213+
knowledge.keyTopics?.length ||
1214+
0;
12061215
const relationshipsCount =
12071216
knowledge.relationships?.length || 0;
12081217

0 commit comments

Comments
 (0)