diff --git a/README.md b/README.md index bcb4087..ba71983 100644 --- a/README.md +++ b/README.md @@ -6,14 +6,15 @@ ![demo](public/demo.png) ## Overview + GraphRAG Visualizer is an application designed to visualize Microsoft [GraphRAG](https://github.com/microsoft/graphrag) artifacts. By uploading parquet files generated from the GraphRAG indexing pipeline, users can easily view and analyze data without needing additional software or scripts. ## Features + - **Graph Visualization**: View the graph in 2D or 3D in the "Graph Visualization" tab. - **Data Tables**: Display data from the parquet files in the "Data Tables" tab. - **Search Functionality**: Fully supports search, allowing users to focus on specific nodes or relationships. - **Local Processing**: All artifacts are processed locally on your machine, ensuring data security and privacy. - ## Using the Search Functionality @@ -22,27 +23,69 @@ Once the [graphrag-api](https://github.com/noworneverev/graphrag-api) server is ![search](public/search.png) ## Graph Data Model -The logic for creating relationships for text units, documents, communities, and covariates is derived from the [GraphRAG import Neo4j Cypher notebook](https://github.com/microsoft/graphrag/blob/main/examples_notebooks/community_contrib/neo4j/graphrag_import_neo4j_cypher.ipynb). +The logic for creating relationships for text units, documents, communities, and covariates is derived from the [GraphRAG import Neo4j Cypher notebook](https://github.com/microsoft/graphrag/blob/main/examples_notebooks/community_contrib/neo4j/graphrag_import_neo4j_cypher.ipynb). 
### Nodes -| Node | Type | -|-----------|--------------| +| Node | Type | +| --------- | -------------- | | Document | `RAW_DOCUMENT` | | Text Unit | `CHUNK` | | Community | `COMMUNITY` | | Finding | `FINDING` | | Covariate | `COVARIATE` | -| Entity | *Varies* | +| Entity | _Varies_ | ### Relationships -| Source Node | Relationship | Target Node | -|-------------|---------------|-------------| -| Entity | `RELATED` | Entity | -| Text Unit | `PART_OF` | Document | -| Text Unit | `HAS_ENTITY` | Entity | -| Text Unit | `HAS_COVARIATE` | Covariate | -| Community | `HAS_FINDING` | Finding | -| Entity | `IN_COMMUNITY` | Community | +| Source Node | Relationship | Target Node | +| ----------- | --------------- | ----------- | +| Entity | `RELATED` | Entity | +| Text Unit | `PART_OF` | Document | +| Text Unit | `HAS_ENTITY` | Entity | +| Text Unit | `HAS_COVARIATE` | Covariate | +| Community | `HAS_FINDING` | Finding | +| Entity | `IN_COMMUNITY` | Community | + +## Developer Instructions + +### Setting Up the Project + +1. Clone the repository to your local machine: + + ```bash + git clone https://github.com/noworneverev/graphrag-visualizer.git + cd graphrag-visualizer + ``` + +2. Install the necessary dependencies: + + ```bash + npm install + ``` + +3. Run the development server: + + ```bash + npm start + ``` + +4. Open the app in your browser: + ``` + http://localhost:3000 + ``` + +### Loading Parquet Files + +To load `.parquet` files automatically when the application starts in development mode (`npm start`), place your Parquet files in the `public/artifacts` directory. These files will be loaded into the application for visualization and data table display.
The files can be organized as follows: + +- `public/artifacts/create_final_entities.parquet` +- `public/artifacts/create_final_relationships.parquet` +- `public/artifacts/create_final_documents.parquet` +- `public/artifacts/create_final_text_units.parquet` +- `public/artifacts/create_final_communities.parquet` +- `public/artifacts/create_final_community_reports.parquet` +- `public/artifacts/create_final_covariates.parquet` + +If the files are placed in the `public/artifacts` folder, the app will automatically load and display them on startup. diff --git a/public/artifacts/put_parquet_files_here b/public/artifacts/put_parquet_files_here new file mode 100644 index 0000000..e69de29 diff --git a/src/app/components/GraphDataHandler.tsx b/src/app/components/GraphDataHandler.tsx index 4e297b0..3b501aa 100644 --- a/src/app/components/GraphDataHandler.tsx +++ b/src/app/components/GraphDataHandler.tsx @@ -36,6 +36,7 @@ const GraphDataHandler: React.FC = () => { covariates, communityReports, handleFilesRead, + loadDefaultFiles, } = useFileHandler(); const graphData = useGraphData( @@ -57,6 +58,12 @@ const GraphDataHandler: React.FC = () => { const hasCommunities = communities.length > 0; const hasCovariates = covariates.length > 0; + useEffect(() => { + if (process.env.NODE_ENV === "development") { + loadDefaultFiles(); + } + }, []); + useEffect(() => { if (entities.length > 0) { setTabIndex(1); diff --git a/src/app/hooks/useFileHandler.ts b/src/app/hooks/useFileHandler.ts index 5f5cf32..38c3390 100644 --- a/src/app/hooks/useFileHandler.ts +++ b/src/app/hooks/useFileHandler.ts @@ -8,6 +8,17 @@ import { CommunityReport } from "../models/community-report"; import { Covariate } from "../models/covariate"; import { readParquetFile } from "../utils/parquet-utils"; +// Paths to default files in the public folder +const defaultFiles = [ + process.env.PUBLIC_URL + "/artifacts/create_final_entities.parquet", + process.env.PUBLIC_URL + "/artifacts/create_final_relationships.parquet", + 
process.env.PUBLIC_URL + "/artifacts/create_final_documents.parquet", + process.env.PUBLIC_URL + "/artifacts/create_final_text_units.parquet", + process.env.PUBLIC_URL + "/artifacts/create_final_communities.parquet", + process.env.PUBLIC_URL + "/artifacts/create_final_community_reports.parquet", + process.env.PUBLIC_URL + "/artifacts/create_final_covariates.parquet", +]; + const fileSchemas: { [key: string]: string } = { "create_final_entities.parquet": "entity", "create_final_relationships.parquet": "relationship", @@ -25,9 +36,15 @@ const useFileHandler = () => { const [textunits, setTextUnits] = useState([]); const [communities, setCommunities] = useState([]); const [covariates, setCovariates] = useState([]); - const [communityReports, setCommunityReports] = useState([]); + const [communityReports, setCommunityReports] = useState( + [] + ); const handleFilesRead = async (files: File[]) => { + await loadFiles(files); + }; + + const loadFiles = async (files: File[] | string[]) => { const entitiesArray: Entity[][] = []; const relationshipsArray: Relationship[][] = []; const documentsArray: Document[][] = []; @@ -37,8 +54,32 @@ const useFileHandler = () => { const covariatesArray: Covariate[][] = []; for (const file of files) { - const schema = fileSchemas[file.name]; - const data = await readParquetFile(file, schema); + const fileName = + typeof file === "string" ? file.split("/").pop()! 
: file.name; + const schema = fileSchemas[fileName]; + + let data; + if (typeof file === "string") { + // Fetch default file from public folder as binary data + const response = await fetch(file); + if (!response.ok) { + console.error(`Failed to fetch file ${file}: ${response.statusText}`); + continue; + } + + // Convert ArrayBuffer to File object + const buffer = await response.arrayBuffer(); + const blob = new Blob([buffer], { type: "application/x-parquet" }); + const fileBlob = new File([blob], fileName); + + // Use the File object in readParquetFile + data = await readParquetFile(fileBlob, schema); + // console.log(`Successfully loaded ${fileName} from public folder`); + } else { + // Handle drag-and-drop files directly + data = await readParquetFile(file, schema); + // console.log(`Successfully loaded ${file.name} from drag-and-drop`); + } if (schema === "entity") { entitiesArray.push(data); @@ -57,21 +98,61 @@ const useFileHandler = () => { } } - const allEntities = entitiesArray.flat(); - const allRelationships = relationshipsArray.flat(); - const allDocuments = documentsArray.flat(); - const allTextUnits = textUnitsArray.flat(); - const allCommunities = communitiesArray.flat(); - const allCommunityReports = communityReportsArray.flat(); - const allCovariates = covariatesArray.flat(); - - setEntities(allEntities); - setRelationships(allRelationships); - setDocuments(allDocuments); - setTextUnits(allTextUnits); - setCommunities(allCommunities); - setCommunityReports(allCommunityReports); - setCovariates(allCovariates); + setEntities(entitiesArray.flat()); + setRelationships(relationshipsArray.flat()); + setDocuments(documentsArray.flat()); + setTextUnits(textUnitsArray.flat()); + setCommunities(communitiesArray.flat()); + setCommunityReports(communityReportsArray.flat()); + setCovariates(covariatesArray.flat()); + }; + + const checkFileExists = async (filePath: string) => { + try { + const response = await fetch(filePath, { + method: "HEAD", + cache: 
"no-store", + }); + + if (response.ok) { + const contentType = response.headers.get("Content-Type"); + + if (contentType === "application/octet-stream") { + // Updated Content-Type check + console.log(`File exists: ${filePath}`); + return true; + } else { + console.warn( + `File does not exist or incorrect type: ${filePath} (Content-Type: ${contentType})` + ); + return false; + } + } else { + console.warn( + `File does not exist: ${filePath} (status: ${response.status})` + ); + return false; + } + } catch (error) { + console.error(`Error checking file existence for ${filePath}`, error); + return false; + } + }; + + const loadDefaultFiles = async () => { + const filesToLoad = []; + + for (const file of defaultFiles) { + const fileExists = await checkFileExists(file); + if (fileExists) { + filesToLoad.push(file); // Add to load queue if the file exists + } + } + if (filesToLoad.length > 0) { + await loadFiles(filesToLoad); + } else { + console.log("No default files found in the public folder."); + } }; return { @@ -83,6 +164,7 @@ const useFileHandler = () => { covariates, communityReports, handleFilesRead, + loadDefaultFiles, }; }; diff --git a/src/app/utils/parquet-utils.ts b/src/app/utils/parquet-utils.ts index 623eac1..249bdb2 100644 --- a/src/app/utils/parquet-utils.ts +++ b/src/app/utils/parquet-utils.ts @@ -16,23 +16,17 @@ export class AsyncBuffer { } } -// const parseValue = (value: any): any => { -// // Check if value is a string that ends with 'n', indicating it's a BigInt -// if (typeof value === "string" && value.endsWith("n")) { -// // Convert to a regular number -// return Number(value.slice(0, -1)); -// } -// return value; -// }; - -const parseValue = (value: any, type: 'number' | 'bigint'): any => { +const parseValue = (value: any, type: "number" | "bigint"): any => { if (typeof value === "string" && value.endsWith("n")) { return BigInt(value.slice(0, -1)); } - return type === 'bigint' ? BigInt(value) : Number(value); + return type === "bigint" ? 
BigInt(value) : Number(value); }; -export const readParquetFile = async (file: File, schema?: string): Promise => { +export const readParquetFile = async ( + file: File | Blob, + schema?: string +): Promise => { try { const arrayBuffer = await file.arrayBuffer(); const asyncBuffer = new AsyncBuffer(arrayBuffer); @@ -41,100 +35,116 @@ export const readParquetFile = async (file: File, schema?: string): Promise { - if (schema === 'entity') { - resolve(rows.map(row => ({ - id: row[0], - name: row[1], - type: row[2], - description: row[3], - human_readable_id: parseValue(row[4], 'number'), - graph_embedding: row[5], - text_unit_ids: row[6], - description_embedding: row[7], - }))); - } else if (schema === 'relationship') { - resolve(rows.map(row => ({ - source: row[0], - target: row[1], - type: 'RELATED', // Custom field to match neo4j - weight: row[2], - description: row[3], - text_unit_ids: row[4], - id: row[5], - human_readable_id: parseValue(row[6], 'number'), - source_degree: parseValue(row[7], 'number'), - target_degree: parseValue(row[8], 'number'), - rank: parseValue(row[9], 'number'), - }))); - } else if (schema === 'document') { - resolve(rows.map(row => ({ - id: row[0], - text_unit_ids: row[1], - raw_content: row[2], - title: row[3], - }))); - } else if (schema === 'text_unit') { - resolve(rows.map(row => ({ - id: row[0], - text: row[1], - n_tokens: parseValue(row[2], 'number'), - document_ids: row[3], - entity_ids: row[4], - relationship_ids: row[5], - }))); - } else if (schema === 'community') { - resolve(rows.map(row => ({ - id: row[0], - title: row[1], - level: parseValue(row[2], 'number'), - ...(row.length > 5 ? { raw_community: parseValue(row[3], 'number') } : {}), // From graphrag 0.3.X onwards, raw_community is removed - relationship_ids: row[row.length > 5 ? 4 : 3], - text_unit_ids: row[row.length > 5 ? 
5 : 4], - }))); - } else if (schema === 'community_report') { - resolve(rows.map(row => ({ - community: parseValue(row[0], 'number'), - full_content: row[1], - level: parseValue(row[2], 'number'), - rank: parseValue(row[3], 'number'), - title: row[4], - rank_explanation: row[5], - summary: row[6], - findings: row[7].map((finding: any) => ({ - explanation: finding.explanation, - summary: finding.summary, - })), - full_content_json: row[8], - id: row[9], - - }))); - } else if (schema === 'covariate') { - resolve(rows.map(row => ({ - id: row[0], - human_readable_id: parseValue(row[1], 'number'), - covariate_type: row[2], - type: row[3], - description: row[4], - subject_id: row[5], - subject_type: row[6], - object_id: row[7], - object_type: row[8], - status: row[9], - start_date: row[10], - end_date: row[11], - source_text: row[12], - text_unit_id: row[13], - document_ids: row[14], - n_tokens: parseValue(row[15], 'number'), - }))); - } - - else { - resolve(rows.map(row => Object.fromEntries(row.map((value, index) => [index, value])))); + if (schema === "entity") { + resolve( + rows.map((row) => ({ + id: row[0], + name: row[1], + type: row[2], + description: row[3], + human_readable_id: parseValue(row[4], "number"), + graph_embedding: row[5], + text_unit_ids: row[6], + description_embedding: row[7], + })) + ); + } else if (schema === "relationship") { + resolve( + rows.map((row) => ({ + source: row[0], + target: row[1], + type: "RELATED", // Custom field to match neo4j + weight: row[2], + description: row[3], + text_unit_ids: row[4], + id: row[5], + human_readable_id: parseValue(row[6], "number"), + source_degree: parseValue(row[7], "number"), + target_degree: parseValue(row[8], "number"), + rank: parseValue(row[9], "number"), + })) + ); + } else if (schema === "document") { + resolve( + rows.map((row) => ({ + id: row[0], + text_unit_ids: row[1], + raw_content: row[2], + title: row[3], + })) + ); + } else if (schema === "text_unit") { + resolve( + rows.map((row) => ({ + 
id: row[0], + text: row[1], + n_tokens: parseValue(row[2], "number"), + document_ids: row[3], + entity_ids: row[4], + relationship_ids: row[5], + })) + ); + } else if (schema === "community") { + resolve( + rows.map((row) => ({ + id: row[0], + title: row[1], + level: parseValue(row[2], "number"), + ...(row.length > 5 + ? { raw_community: parseValue(row[3], "number") } + : {}), // From graphrag 0.3.X onwards, raw_community is removed + relationship_ids: row[row.length > 5 ? 4 : 3], + text_unit_ids: row[row.length > 5 ? 5 : 4], + })) + ); + } else if (schema === "community_report") { + resolve( + rows.map((row) => ({ + community: parseValue(row[0], "number"), + full_content: row[1], + level: parseValue(row[2], "number"), + rank: parseValue(row[3], "number"), + title: row[4], + rank_explanation: row[5], + summary: row[6], + findings: row[7].map((finding: any) => ({ + explanation: finding.explanation, + summary: finding.summary, + })), + full_content_json: row[8], + id: row[9], + })) + ); + } else if (schema === "covariate") { + resolve( + rows.map((row) => ({ + id: row[0], + human_readable_id: parseValue(row[1], "number"), + covariate_type: row[2], + type: row[3], + description: row[4], + subject_id: row[5], + subject_type: row[6], + object_id: row[7], + object_type: row[8], + status: row[9], + start_date: row[10], + end_date: row[11], + source_text: row[12], + text_unit_id: row[13], + document_ids: row[14], + n_tokens: parseValue(row[15], "number"), + })) + ); + } else { + resolve( + rows.map((row) => + Object.fromEntries(row.map((value, index) => [index, value])) + ) + ); } }, }; - parquetRead(options).catch(reject); }); } catch (err) {