-
Notifications
You must be signed in to change notification settings - Fork 0
/
persistence.py
37 lines (30 loc) · 1.64 KB
/
persistence.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import os
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import Docx2txtLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
class Persistence:
    """Build or reload a Chroma vector store over a folder of PDF/Word files.

    Everything lives at class level and the methods take no ``self``; they
    are called on the class itself, e.g. ``Persistence.get_storage(path)``.
    """

    # Default directory in which the Chroma index is persisted.
    dir = "embeddings"
    # Splitter shared by every build: chunk_size=500, chunk_overlap=0.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
    # Chunks produced by the most recent build_index() call.
    all_splits = []

    # File-name suffixes routed to each loader; shared by get_storage() and
    # build_index() so both agree on what counts as an indexable document.
    _PDF_SUFFIXES = (".pdf",)
    _WORD_SUFFIXES = (".doc", ".docx")

    @staticmethod
    def get_storage(inputs):
        """Return a Chroma store, re-indexing when source documents exist.

        Args:
            inputs: Path of the directory holding the source documents.

        Returns:
            The store from ``build_index`` when ``inputs`` contains at least
            one PDF/Word file, otherwise the store from ``reload_index``
            opened on ``Persistence.dir``.
        """
        # The previous check passed glob patterns to os.path.exists (which
        # does not expand them) behind a constant ``True or``, so the reload
        # branch could never run. List the directory and test suffixes instead.
        try:
            names = os.listdir(inputs)
        except (FileNotFoundError, NotADirectoryError):
            # No usable source folder: fall back to the persisted index.
            names = []

        indexable = Persistence._PDF_SUFFIXES + Persistence._WORD_SUFFIXES
        if any(name.endswith(indexable) for name in names):
            print('Re-indexing')
            return Persistence.build_index(inputs, Persistence.dir)

        print('Accessing existing index')
        return Persistence.reload_index(Persistence.dir)

    @staticmethod
    def build_index(source_path, index_path):
        """Load, split and embed every PDF/Word file found in ``source_path``.

        Args:
            source_path: Directory scanned (non-recursively) for documents.
            index_path: Directory passed to Chroma as ``persist_directory``.

        Returns:
            The Chroma store created from the collected chunks.
        """
        # Collect into a fresh list so repeated builds in one process do not
        # re-embed chunks gathered by earlier calls.
        splits = []
        for filename in os.listdir(source_path):
            file_path = os.path.join(source_path, filename)
            if filename.endswith(Persistence._PDF_SUFFIXES):
                print('Indexing PDF ' + filename)
                loader = PyPDFLoader(file_path)
            elif filename.endswith(Persistence._WORD_SUFFIXES):
                print('Indexing Word document ' + filename)
                loader = Docx2txtLoader(file_path)
            else:
                # Unsupported file: skip it. Without this, ``loader`` was
                # either unbound (NameError) or still pointed at the previous
                # file, which was then indexed a second time.
                continue
            splits += Persistence.text_splitter.split_documents(loader.load())

        # Keep the public attribute in sync with the latest build.
        Persistence.all_splits = splits
        # NOTE(review): if index_path already holds a persisted collection,
        # this may add to it rather than replace it - confirm against the
        # Chroma version in use.
        return Chroma.from_documents(
            documents=splits,
            embedding=OpenAIEmbeddings(),
            persist_directory=index_path,
        )

    @staticmethod
    def reload_index(index_path):
        """Open the Chroma store persisted at ``index_path``.

        Args:
            index_path: Directory passed to Chroma as ``persist_directory``.

        Returns:
            A Chroma store using ``OpenAIEmbeddings`` as embedding function.
        """
        # Honour the argument; it was previously ignored in favour of a
        # hard-coded "embeddings" directory.
        return Chroma(persist_directory=index_path, embedding_function=OpenAIEmbeddings())