Merge pull request #386 from rmusser01/dev
Web Scraping pipeline slightly updated
rmusser01 authored Oct 22, 2024
2 parents 243163e + 978d4ec commit 785b12a
Showing 6 changed files with 424 additions and 428 deletions.
4 changes: 2 additions & 2 deletions App_Function_Libraries/Gradio_UI/Chat_ui.py
@@ -261,7 +261,7 @@ def create_chat_interface():
lines=3,
visible=False)
with gr.Column(scale=2):
chatbot = gr.Chatbot(height=600, elem_classes="chatbot-container")
chatbot = gr.Chatbot(height=800, elem_classes="chatbot-container")
msg = gr.Textbox(label="Enter your message")
submit = gr.Button("Submit")
regenerate_button = gr.Button("Regenerate Last Message")
@@ -465,7 +465,7 @@ def create_chat_interface_stacked():
gr.Markdown("Scroll down for the chat window...")
with gr.Row():
with gr.Column(scale=1):
chatbot = gr.Chatbot(height=600, elem_classes="chatbot-container")
chatbot = gr.Chatbot(height=800, elem_classes="chatbot-container")
msg = gr.Textbox(label="Enter your message")
with gr.Row():
with gr.Column():
141 changes: 128 additions & 13 deletions App_Function_Libraries/Gradio_UI/Website_scraping_tab.py
@@ -19,8 +19,8 @@

#
# Local Imports
from App_Function_Libraries.Web_Scraping.Article_Extractor_Lib import scrape_from_sitemap, scrape_by_url_level, scrape_article
from App_Function_Libraries.Web_Scraping.Article_Summarization_Lib import scrape_and_summarize_multiple
from App_Function_Libraries.Web_Scraping.Article_Extractor_Lib import scrape_from_sitemap, scrape_by_url_level, \
scrape_article, collect_bookmarks, scrape_and_summarize_multiple
from App_Function_Libraries.DB.DB_Manager import load_preset_prompts
from App_Function_Libraries.Gradio_UI.Chat_ui import update_user_prompt
from App_Function_Libraries.Summarization.Summarization_General_Lib import summarize
@@ -35,12 +35,12 @@ def get_url_depth(url: str) -> int:
return len(urlparse(url).path.strip('/').split('/'))


def sync_recursive_scrape(url_input, max_pages, max_depth, progress_callback, delay=1.0):
def sync_recursive_scrape(url_input, max_pages, max_depth, progress_callback, delay=1.0, custom_cookies=None):
def run_async_scrape():
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
return loop.run_until_complete(
recursive_scrape(url_input, max_pages, max_depth, progress_callback, delay)
recursive_scrape(url_input, max_pages, max_depth, progress_callback, delay, custom_cookies=custom_cookies)
)

with ThreadPoolExecutor() as executor:
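
Note: the helper above bridges synchronous Gradio callbacks to the async scraper by giving a worker thread its own event loop. A minimal, self-contained sketch of that pattern (the names fetch_page and run_async_in_thread are illustrative, not from this repository):

import asyncio
from concurrent.futures import ThreadPoolExecutor

async def fetch_page(url: str) -> str:
    # Stand-in for the real async scraping work.
    await asyncio.sleep(0.1)
    return f"scraped {url}"

def run_async_in_thread(url: str) -> str:
    def runner():
        # Each call gets a fresh event loop inside the worker thread,
        # so it is safe even when the caller is not running a loop.
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        try:
            return loop.run_until_complete(fetch_page(url))
        finally:
            loop.close()

    with ThreadPoolExecutor(max_workers=1) as executor:
        return executor.submit(runner).result()

print(run_async_in_thread("https://example.com"))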
@@ -55,7 +55,8 @@ async def recursive_scrape(
progress_callback: callable,
delay: float = 1.0,
resume_file: str = 'scrape_progress.json',
user_agent: str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
user_agent: str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
custom_cookies: Optional[List[Dict[str, Any]]] = None
) -> List[Dict]:
async def save_progress():
temp_file = resume_file + ".tmp"
@@ -90,6 +91,10 @@ def is_valid_url(url: str) -> bool:
browser = await p.chromium.launch(headless=True)
context = await browser.new_context(user_agent=user_agent)

# Set custom cookies if provided
if custom_cookies:
await context.add_cookies(custom_cookies)

try:
while to_visit and pages_scraped < max_pages:
current_url, current_depth = to_visit.pop(0)
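
Note: Playwright's context.add_cookies expects a list of cookie dicts, each carrying name and value plus either url or domain and path. A rough sketch of building and applying such a list (placeholder values; assumes Playwright and its Chromium browser are installed):

import asyncio
from playwright.async_api import async_playwright

custom_cookies = [
    {"name": "sessionid", "value": "example-token", "domain": "example.com", "path": "/"},
]

async def open_with_cookies(url: str) -> str:
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context()
        # Cookies must be attached to the context before pages that need them are opened.
        await context.add_cookies(custom_cookies)
        page = await context.new_page()
        await page.goto(url)
        html = await page.content()
        await browser.close()
        return html

print(asyncio.run(open_with_cookies("https://example.com")))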
@@ -293,7 +298,7 @@ def create_website_scraping_tab():
lines=5
)
with gr.Row():
summarize_checkbox = gr.Checkbox(label="Summarize Articles", value=False)
summarize_checkbox = gr.Checkbox(label="Summarize/Analyze Articles", value=False)
custom_prompt_checkbox = gr.Checkbox(label="Use a Custom Prompt", value=False, visible=True)
preset_prompt_checkbox = gr.Checkbox(label="Use a pre-set Prompt", value=False, visible=True)
with gr.Row():
@@ -348,14 +353,35 @@
placeholder="Enter your API key here; Ignore if using Local API or Built-in API",
type="password"
)
custom_cookies_input = gr.Textbox(
label="Custom Cookies (JSON format)",
placeholder="Enter custom cookies in JSON format",
lines=3,
visible=True
)
keywords_input = gr.Textbox(
label="Keywords",
placeholder="Enter keywords here (comma-separated)",
value="default,no_keyword_set",
visible=True
)
# Updated: Added output to display parsed URLs
bookmarks_file_input = gr.File(
label="Upload Bookmarks File",
type="filepath",
file_types=[".json", ".html"],
visible=True
)
parsed_urls_output = gr.Textbox(
label="Parsed URLs from Bookmarks",
placeholder="URLs will be displayed here after uploading a bookmarks file.",
lines=10,
interactive=False,
visible=False # Initially hidden, shown only when URLs are parsed
)

scrape_button = gr.Button("Scrape and Summarize")

with gr.Column():
progress_output = gr.Textbox(label="Progress", lines=3)
result_output = gr.Textbox(label="Result", lines=20)
@@ -396,10 +422,61 @@ def update_prompts(preset_name):

preset_prompt.change(
update_prompts,
inputs=preset_prompt,
inputs=[preset_prompt],
outputs=[website_custom_prompt_input, system_prompt_input]
)

def parse_bookmarks(file_path):
"""
Parses the uploaded bookmarks file and extracts URLs.
Args:
file_path (str): Path to the uploaded bookmarks file.
Returns:
str: Formatted string of extracted URLs or error message.
"""
try:
bookmarks = collect_bookmarks(file_path)
# Extract URLs
urls = []
for value in bookmarks.values():
if isinstance(value, list):
urls.extend(value)
elif isinstance(value, str):
urls.append(value)
if not urls:
return "No URLs found in the bookmarks file."
# Format URLs for display
formatted_urls = "\n".join(urls)
return formatted_urls
except Exception as e:
logging.error(f"Error parsing bookmarks file: {str(e)}")
return f"Error parsing bookmarks file: {str(e)}"
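
Note: parse_bookmarks assumes collect_bookmarks returns a mapping from bookmark names or folders to either a single URL or a list of URLs; the exact shape is defined in Article_Extractor_Lib and is an assumption here. A quick illustration of the flattening step with made-up data:

bookmarks = {
    "News": ["https://example.com/a", "https://example.com/b"],
    "Docs": "https://example.com/manual",
}

urls = []
for value in bookmarks.values():
    if isinstance(value, list):
        urls.extend(value)   # a folder of bookmarks
    elif isinstance(value, str):
        urls.append(value)   # a single bookmark

print("\n".join(urls))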

def show_parsed_urls(bookmarks_file):
"""
Determines whether to show the parsed URLs output.
Args:
bookmarks_file: Uploaded file object.
Returns:
Tuple indicating visibility and content of parsed_urls_output.
"""
if bookmarks_file is None:
return gr.update(visible=False), ""
file_path = bookmarks_file.name
parsed_urls = parse_bookmarks(file_path)
return gr.update(visible=True), parsed_urls

# Connect the parsing function to the file upload event
bookmarks_file_input.change(
fn=show_parsed_urls,
inputs=[bookmarks_file_input],
outputs=[parsed_urls_output, parsed_urls_output]
)
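
Note: show_parsed_urls returns two values that both target parsed_urls_output. An equivalent and arguably simpler wiring, assuming a reasonably recent Gradio version, is a single gr.update that sets visibility and value together:

def show_parsed_urls(bookmarks_file):
    if bookmarks_file is None:
        return gr.update(visible=False, value="")
    return gr.update(visible=True, value=parse_bookmarks(bookmarks_file.name))

bookmarks_file_input.change(
    fn=show_parsed_urls,
    inputs=[bookmarks_file_input],
    outputs=[parsed_urls_output],
)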

async def scrape_and_summarize_wrapper(
scrape_method: str,
url_input: str,
@@ -413,15 +490,46 @@
keywords: str,
custom_titles: Optional[str],
system_prompt: Optional[str],
temperature: float = 0.7,
temperature: float,
custom_cookies: Optional[str],
bookmarks_file,
progress: gr.Progress = gr.Progress()
) -> str:
try:
result: List[Dict[str, Any]] = []

# Handle bookmarks file if provided
if bookmarks_file is not None:
bookmarks = collect_bookmarks(bookmarks_file.name)
# Extract URLs from bookmarks
urls_from_bookmarks = []
for value in bookmarks.values():
if isinstance(value, list):
urls_from_bookmarks.extend(value)
elif isinstance(value, str):
urls_from_bookmarks.append(value)
if scrape_method == "Individual URLs":
url_input = "\n".join(urls_from_bookmarks)
else:
if urls_from_bookmarks:
url_input = urls_from_bookmarks[0]
else:
return convert_json_to_markdown(json.dumps({"error": "No URLs found in the bookmarks file."}))

# Handle custom cookies
custom_cookies_list = None
if custom_cookies:
try:
custom_cookies_list = json.loads(custom_cookies)
if not isinstance(custom_cookies_list, list):
custom_cookies_list = [custom_cookies_list]
except json.JSONDecodeError as e:
return convert_json_to_markdown(json.dumps({"error": f"Invalid JSON format for custom cookies: {e}"}))

if scrape_method == "Individual URLs":
# FIXME modify scrape_and_summarize_multiple to accept custom_cookies
result = await scrape_and_summarize_multiple(url_input, custom_prompt, api_name, api_key, keywords,
custom_titles, system_prompt)
custom_titles, system_prompt, summarize_checkbox, custom_cookies=custom_cookies_list)
elif scrape_method == "Sitemap":
result = await asyncio.to_thread(scrape_from_sitemap, url_input)
elif scrape_method == "URL Level":
@@ -430,7 +538,8 @@ async def scrape_and_summarize_wrapper(
json.dumps({"error": "URL level is required for URL Level scraping."}))
result = await asyncio.to_thread(scrape_by_url_level, url_input, url_level)
elif scrape_method == "Recursive Scraping":
result = await recursive_scrape(url_input, max_pages, max_depth, progress.update, delay=1.0)
result = await recursive_scrape(url_input, max_pages, max_depth, progress.update, delay=1.0,
custom_cookies=custom_cookies_list)
else:
return convert_json_to_markdown(json.dumps({"error": f"Unknown scraping method: {scrape_method}"}))
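
Note: the cookie handling above accepts either a single JSON object or a JSON array pasted into the Custom Cookies box and normalizes both to a list. A small, self-contained illustration of that normalization (sample values only):

import json

def normalize_cookies(raw: str) -> list:
    parsed = json.loads(raw)
    # Wrap a single cookie object so downstream code always receives a list.
    return parsed if isinstance(parsed, list) else [parsed]

print(normalize_cookies('{"name": "sessionid", "value": "abc", "domain": "example.com"}'))
print(normalize_cookies('[{"name": "a", "value": "1"}, {"name": "b", "value": "2"}]'))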

@@ -496,7 +605,8 @@ async def scrape_and_summarize_wrapper(
fn=lambda *args: asyncio.run(scrape_and_summarize_wrapper(*args)),
inputs=[scrape_method, url_input, url_level, max_pages, max_depth, summarize_checkbox,
website_custom_prompt_input, api_name_input, api_key_input, keywords_input,
custom_article_title_input, system_prompt_input, temp_slider],
custom_article_title_input, system_prompt_input, temp_slider,
custom_cookies_input, bookmarks_file_input],
outputs=[result_output]
)
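
Note: the click handler wraps the async wrapper in asyncio.run inside a lambda so Gradio can treat it as a plain synchronous callback. A stripped-down sketch of the same wiring (component names here are illustrative):

import asyncio
import gradio as gr

async def slow_echo(text: str) -> str:
    await asyncio.sleep(1)  # stand-in for scraping and summarization work
    return text.upper()

with gr.Blocks() as demo:
    box = gr.Textbox(label="Input")
    out = gr.Textbox(label="Output")
    btn = gr.Button("Run")
    # asyncio.run gives the synchronous callback its own event loop per click.
    btn.click(fn=lambda text: asyncio.run(slow_echo(text)), inputs=[box], outputs=[out])

demo.launch()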

@@ -527,11 +637,15 @@ def convert_json_to_markdown(json_str: str) -> str:
markdown += f"- **Scrape Method:** {data['scrape_method']}\n"
markdown += f"- **API Used:** {data['api_used']}\n"
markdown += f"- **Keywords:** {data['keywords']}\n"
if data['url_level'] is not None:
if data.get('url_level') is not None:
markdown += f"- **URL Level:** {data['url_level']}\n"
if data.get('max_pages') is not None:
markdown += f"- **Maximum Pages:** {data['max_pages']}\n"
if data.get('max_depth') is not None:
markdown += f"- **Maximum Depth:** {data['max_depth']}\n"
markdown += f"- **Total Articles Scraped:** {data['total_articles_scraped']}\n\n"

# Add URLs scraped
# Add URLs Scraped
markdown += "## URLs Scraped\n\n"
for url in data['urls_scraped']:
markdown += f"- {url}\n"
@@ -549,6 +663,7 @@ def convert_json_to_markdown(json_str: str) -> str:
return f"# Error\n\nMissing key in JSON data: {str(e)}"
except Exception as e:
return f"# Error\n\nAn unexpected error occurred: {str(e)}"
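
Note: convert_json_to_markdown reads keys such as scrape_method, api_used, keywords, url_level, max_pages, max_depth, total_articles_scraped and urls_scraped from the serialized result; the real payload likely carries additional per-article fields not visible in this diff. An illustrative input, with sample values only:

import json

sample = {
    "scrape_method": "Recursive Scraping",
    "api_used": "openai",
    "keywords": "default,no_keyword_set",
    "url_level": None,
    "max_pages": 10,
    "max_depth": 3,
    "total_articles_scraped": 2,
    "urls_scraped": ["https://example.com/a", "https://example.com/b"],
}

markdown_input = json.dumps(sample)  # the string this function would receive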

#
# End of File
########################################################################################################################
@@ -28,7 +28,6 @@
import requests
#
# Import Local
from App_Function_Libraries.Web_Scraping.Article_Summarization_Lib import *
from App_Function_Libraries.Utils.Utils import download_file
#
#######################################################################################################################