-
Notifications
You must be signed in to change notification settings - Fork 95
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[FSTORE-1404] LLM PDF Tutorial (#266)
* LLM PDF Search Tutorial using RAG and Fine-Tuning
- Loading branch information
Showing
14 changed files
with
2,026 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,285 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"id": "82622ee3", | ||
"metadata": {}, | ||
"source": [ | ||
"## <span style=\"color:#ff5f27\">📝 Imports </span>" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "ade7fe1f", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"!pip install -r requirements.txt -q" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "7ab771e2", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import PyPDF2\n", | ||
"import pandas as pd\n", | ||
"from sentence_transformers import SentenceTransformer\n", | ||
"\n", | ||
"from functions.pdf_preprocess import (\n", | ||
" download_files_to_folder, \n", | ||
" process_pdf_file,\n", | ||
")\n", | ||
"from functions.text_preprocess import process_text_data\n", | ||
"import config\n", | ||
"\n", | ||
"import warnings\n", | ||
"warnings.filterwarnings('ignore')" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "7e8f1796", | ||
"metadata": {}, | ||
"source": [ | ||
"## <span style=\"color:#ff5f27\">💾 Download files from Google Drive </span>" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "ea8c756e", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Call the function to download files\n", | ||
"new_files = download_files_to_folder(\n", | ||
" config.FOLDER_ID, \n", | ||
" config.DOWNLOAD_PATH,\n", | ||
")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "f783e27e", | ||
"metadata": {}, | ||
"source": [ | ||
"## <span style=\"color:#ff5f27\">🧬 Text Extraction </span>" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "0b3b6715", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Initialize an empty list\n", | ||
"document_text = []\n", | ||
"\n", | ||
"for file in new_files:\n", | ||
" process_pdf_file(\n", | ||
" file, \n", | ||
" document_text, \n", | ||
" config.DOWNLOAD_PATH,\n", | ||
" )" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "348b723e", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Create a DataFrame\n", | ||
"columns = [\"file_name\", \"file_link\", \"page_number\", \"text\"]\n", | ||
"df_text = pd.DataFrame(\n", | ||
" data=document_text,\n", | ||
" columns=columns,\n", | ||
")\n", | ||
"# Display the DataFrame\n", | ||
"df_text" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "62a70763", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Process text data using the process_text_data function\n", | ||
"df_text_processed = process_text_data(df_text)\n", | ||
"\n", | ||
"# Display the processed DataFrame\n", | ||
"df_text_processed" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "10f9ea36", | ||
"metadata": {}, | ||
"source": [ | ||
"## <span style=\"color:#ff5f27\">⚙️ Embeddings Creation </span>" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "9805c689", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Load the SentenceTransformer model\n", | ||
"model = SentenceTransformer(\n", | ||
" config.MODEL_SENTENCE_TRANSFORMER,\n", | ||
").to(config.DEVICE)\n", | ||
"model.device" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "c1b7a89a", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Generate embeddings for the 'text' column using the SentenceTransformer model\n", | ||
"df_text_processed['embeddings'] = pd.Series(\n", | ||
" model.encode(df_text_processed['text']).tolist(),\n", | ||
")\n", | ||
"\n", | ||
"# Create a new column 'context_id' with values ranging from 0 to the number of rows in the DataFrame\n", | ||
"df_text_processed['context_id'] = [*range(df_text_processed.shape[0])]\n", | ||
"\n", | ||
"# Display the resulting DataFrame with the added 'embeddings' and 'context_id' columns\n", | ||
"df_text_processed" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "d2bced31", | ||
"metadata": {}, | ||
"source": [ | ||
"## <span style=\"color:#ff5f27;\"> 🔮 Connecting to Hopsworks Feature Store </span>" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "7caf764d", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import hopsworks\n", | ||
"\n", | ||
"project = hopsworks.login()\n", | ||
"\n", | ||
"fs = project.get_feature_store() " | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "0ed9ac69", | ||
"metadata": {}, | ||
"source": [ | ||
"## <span style=\"color:#ff5f27;\"> 🪄 Feature Group Creation </span>" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "9f5e486b", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"from hsfs import embedding\n", | ||
"\n", | ||
"# Create the Embedding Index\n", | ||
"emb = embedding.EmbeddingIndex()\n", | ||
"\n", | ||
"emb.add_embedding(\n", | ||
" \"embeddings\", \n", | ||
" model.get_sentence_embedding_dimension(),\n", | ||
")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "6e32b548", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Get or create the 'documents_fg' feature group\n", | ||
"documents_fg = fs.get_or_create_feature_group(\n", | ||
" name=\"documents_fg\",\n", | ||
" embedding_index=emb,\n", | ||
" primary_key=['context_id'],\n", | ||
" version=1,\n", | ||
" description='Information from various files, presenting details like file names, source links, and structured text excerpts from different pages and paragraphs.',\n", | ||
" online_enabled=True,\n", | ||
")\n", | ||
"\n", | ||
"documents_fg.insert(df_text_processed)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "d39a9ed6", | ||
"metadata": {}, | ||
"source": [ | ||
"## <span style=\"color:#ff5f27;\">🪄 Feature View Creation </span>\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "7a7bc2f0", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Get or create the 'documents' feature view\n", | ||
"feature_view = fs.get_or_create_feature_view(\n", | ||
" name=\"documents\",\n", | ||
" version=1,\n", | ||
" description='Chunked context for RAG system',\n", | ||
" query=documents_fg.select([\"file_name\", \"file_link\", \"page_number\", \"paragraph\", \"text\"]),\n", | ||
")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "708b9a5f", | ||
"metadata": {}, | ||
"source": [ | ||
"---" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3 (ipykernel)", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.11.7" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
import PyPDF2 | ||
import pandas as pd | ||
from sentence_transformers import SentenceTransformer | ||
|
||
from functions.pdf_preprocess import download_files_to_folder, process_pdf_file | ||
from functions.text_preprocess import process_text_data | ||
import config | ||
|
||
import hopsworks | ||
|
||
def pipeline(): | ||
# Call the function to download files | ||
new_files = download_files_to_folder( | ||
config.FOLDER_ID, | ||
config.DOWNLOAD_PATH, | ||
) | ||
|
||
if len(new_files) == 0: | ||
print('⛳️ Your folder is up to date!') | ||
return | ||
|
||
# Initialize an empty list | ||
document_text = [] | ||
|
||
for file in new_files: | ||
process_pdf_file( | ||
file, | ||
document_text, | ||
config.DOWNLOAD_PATH, | ||
) | ||
|
||
# Create a DataFrame | ||
columns = ["file_name", "page_number", "text"] | ||
df_text = pd.DataFrame( | ||
data=document_text, | ||
columns=columns, | ||
) | ||
|
||
# Process text data using the process_text_data function | ||
df_text_processed = process_text_data(df_text) | ||
|
||
# Retrieve a SentenceTransformer | ||
model = SentenceTransformer( | ||
config.MODEL_SENTENCE_TRANSFORMER, | ||
).to(config.DEVICE) | ||
|
||
# Generate embeddings for the 'text' column using the SentenceTransformer model | ||
df_text_processed['embeddings'] = pd.Series( | ||
model.encode(df_text_processed['text']).tolist(), | ||
) | ||
|
||
# Create a new column 'context_id' with values ranging from 0 to the number of rows in the DataFrame | ||
df_text_processed['context_id'] = [*range(df_text_processed.shape[0])] | ||
|
||
|
||
project = hopsworks.login() | ||
|
||
fs = project.get_feature_store() | ||
|
||
documents_fg = fs.get_feature_group( | ||
name="documents_fg", | ||
version=1, | ||
) | ||
|
||
documents_fg.insert(df_text_processed) | ||
return | ||
|
||
if __name__ == '__main__': | ||
pipeline() |
Oops, something went wrong.