-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Neo4J ChunkProcessor implementation + Multicontainer docker env setup
- Loading branch information
Showing
5 changed files
with
137 additions
and
40 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,31 +1,29 @@ | ||
# Use the official Python image from Docker Hub as the base image
FROM python:3.9-slim

# Set the working directory inside the container
WORKDIR /app

# Copy the pyproject.toml and poetry.lock to the working directory inside the container
COPY pyproject.toml poetry.lock /app/

# Install Poetry (a dependency manager for Python)
RUN pip install poetry

# Install only the runtime dependencies listed in pyproject.toml.
# `--no-dev` is deprecated since Poetry 1.2; `--only main` is the supported
# way to leave development dependencies out of the image.
RUN poetry install --only main

# Copy the rest of the application code to the container
COPY ./src /app/src
COPY ./tests /app/tests
COPY ./example_data /app/example_data
COPY README.md /app/

# Set the working directory to the src directory
WORKDIR /app/src

# Default pipeline parameters; each can be overridden at `docker run` time
# or from the compose file.
ENV directory="../example_data"
ENV mode="size"
ENV chunk_size="300"
ENV overlap_size="20"
# NOTE(review): this value probably reaches the application as the literal
# characters backslash-n rather than as newlines - confirm before relying on
# the separator mode inside the container.
ENV txt_separator="\n\n"

# Neo4j connection settings (each variable is declared exactly once).
# NOTE: values set with ENV are stored in the image metadata; prefer
# supplying the real password at runtime instead of baking it into the image.
ENV NEO4J_URI="bolt://neo4j:7687"
ENV NEO4J_USER="neo4j"
ENV NEO4J_PASSWORD="test1234"

# Define the command that runs your application when the container starts
CMD ["poetry", "run", "python", "run_pipeline.py"]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
from neo4j import GraphDatabase | ||
|
||
class ChunkProcessor:
    """Store text chunks and the files they came from as a Neo4j graph.

    Every file becomes a ``File`` node and every non-blank chunk a ``Chunk``
    node. A file is linked to each of its chunks with ``CONTAINS``, and
    consecutive chunks of the same file are linked with ``NEXT``.

    The instance can be used as a context manager so the driver is closed
    even when processing fails.
    """

    def __init__(self, neo4j_uri, neo4j_user, neo4j_password):
        """Create the Neo4j driver for the given URI and credentials."""
        self.neo4j_uri = neo4j_uri
        self.neo4j_user = neo4j_user
        self.neo4j_password = neo4j_password
        self.driver = GraphDatabase.driver(
            self.neo4j_uri, auth=(self.neo4j_user, self.neo4j_password)
        )

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False  # never swallow the caller's exception

    def close(self):
        """Close the driver if one is set."""
        if self.driver:
            self.driver.close()

    def create_file_node(self, session, file_name):
        """MERGE a ``File`` node identified by ``file_name``."""
        query = """
        MERGE (f:File {name: $file_name})
        RETURN f
        """
        session.run(query, file_name=file_name)

    def create_chunk_node(self, session, chunk, index):
        """MERGE a ``Chunk`` node carrying ``index`` and the chunk text."""
        query = """
        MERGE (c:Chunk {index: $index, text: $text})
        RETURN c
        """
        session.run(query, index=index, text=chunk)

    def create_relationship_between_chunks(self, session, index1, index2):
        """MERGE a ``NEXT`` relationship from chunk ``index1`` to chunk ``index2``."""
        query = """
        MATCH (c1:Chunk {index: $index1}), (c2:Chunk {index: $index2})
        MERGE (c1)-[:NEXT]->(c2)
        """
        session.run(query, index1=index1, index2=index2)

    def create_relationship_file_to_chunk(self, session, index1, index2):
        """MERGE a ``CONTAINS`` relationship from a file to a chunk.

        Despite the parameter names, ``index1`` is the *file name* that
        identifies the ``File`` node and ``index2`` is the chunk index.
        """
        query = """
        MATCH (c1:File {name: $index1}), (c2:Chunk {index: $index2})
        MERGE (c1)-[:CONTAINS]->(c2)
        """
        session.run(query, index1=index1, index2=index2)

    async def process_chunks(self, directory_reader):
        """Write every non-blank chunk produced by ``directory_reader`` to Neo4j.

        ``directory_reader.read_files()`` is consumed as an async iterator of
        ``(file_name, chunk)`` pairs. Chunks are numbered from 1 across all
        files; blank chunks are skipped and do not consume a number. ``NEXT``
        links are only created between consecutive chunks of the same file.

        Note that the queries themselves go through the synchronous session
        API (they are not awaited).
        """
        previous_index = None
        previous_file_name = None  # bound up front instead of inside the loop
        index = 1

        with self.driver.session() as session:
            async for file_name, chunk in directory_reader.read_files():
                if not chunk.strip():
                    continue

                # A chunk starts a new chain when it is the very first one
                # or when the reader has moved on to a different file.
                starts_new_file = (
                    previous_index is None or file_name != previous_file_name
                )

                self.create_chunk_node(session, chunk, index)

                # MERGE is idempotent, so the File node only needs to be
                # (re)issued when the file actually changes.
                if starts_new_file:
                    self.create_file_node(session, file_name)

                self.create_relationship_file_to_chunk(session, file_name, index)

                if not starts_new_file:
                    self.create_relationship_between_chunks(
                        session, previous_index, index
                    )

                previous_index = index
                previous_file_name = file_name
                index += 1

        print("Chunk processing complete.")
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,30 +1,51 @@ | ||
import os | ||
import time | ||
import asyncio | ||
from directory_reader import DirectoryFileReader | ||
from chunk_to_network import neo4j_processor as neo4j_ingestion | ||
|
||
|
||
# This is only here for the local setup; otherwise it's not needed.
def init():
    """
    Populate the process environment with the settings for a local run.

    Every key below is written to os.environ unconditionally, replacing
    any value that was already set.
    """
    local_settings = {
        'directory': "../example_data",
        'mode': "size",
        'chunk_size': "300",
        'overlap_size': "20",
        'txt_separator': "\n\n",
        'NEO4J_URI': "bolt://localhost:7687",
        'NEO4J_USER': "neo4j",
        'NEO4J_PASSWORD': "test1234",
    }
    os.environ.update(local_settings)
|
||
async def main():
    """
    Chunk the files of the configured directory and store the chunks in Neo4j.

    All settings are read from environment variables: ``directory``,
    ``mode``, ``chunk_size``, ``overlap_size``, ``txt_separator``,
    ``NEO4J_URI``, ``NEO4J_USER`` and ``NEO4J_PASSWORD``.

    Raises:
        KeyError: if one of the environment variables is not set.
        ValueError: if ``chunk_size`` or ``overlap_size`` is not an integer.
    """
    ## PARAMETERS
    directory = os.environ['directory']
    mode = os.environ['mode']
    chunk_size = int(os.environ['chunk_size'])
    overlap_size = int(os.environ['overlap_size'])
    txt_separator = os.environ['txt_separator']
    NEO4J_URI = os.environ['NEO4J_URI']
    NEO4J_USER = os.environ['NEO4J_USER']
    NEO4J_PASSWORD = os.environ['NEO4J_PASSWORD']

    # The reader is built once, from the environment values only, so the
    # configuration above is what actually drives the run.
    directory_reader = DirectoryFileReader(
        directory,
        mode=mode,
        chunk_size=chunk_size,
        txt_separator=txt_separator,
        overlap_size=overlap_size,
    )

    chunk_processor = neo4j_ingestion.ChunkProcessor(NEO4J_URI, NEO4J_USER, NEO4J_PASSWORD)
    try:
        await chunk_processor.process_chunks(directory_reader)
    finally:
        # Release the Neo4j driver even when processing fails.
        chunk_processor.close()
|
||
if __name__ == "__main__":
    # init()  # Uncomment to run locally with the built-in default settings.
    # Run the pipeline exactly once, and only when executed as a script.
    asyncio.run(main())
    # NOTE(review): the reason for this pause is not evident from this file -
    # presumably it keeps the container alive briefly after the run; confirm.
    time.sleep(20)