Merge pull request #386 from rmusser01/dev
Web Scraping pipeline slightly updated
rmusser01 authored Oct 22, 2024
2 parents 243163e + 978d4ec commit 785b12a
Showing 6 changed files with 424 additions and 428 deletions.
4 changes: 2 additions & 2 deletions App_Function_Libraries/Gradio_UI/Chat_ui.py
@@ -261,7 +261,7 @@ def create_chat_interface():
lines=3,
visible=False)
with gr.Column(scale=2):
chatbot = gr.Chatbot(height=600, elem_classes="chatbot-container")
chatbot = gr.Chatbot(height=800, elem_classes="chatbot-container")
msg = gr.Textbox(label="Enter your message")
submit = gr.Button("Submit")
regenerate_button = gr.Button("Regenerate Last Message")
@@ -465,7 +465,7 @@ def create_chat_interface_stacked():
gr.Markdown("Scroll down for the chat window...")
with gr.Row():
with gr.Column(scale=1):
chatbot = gr.Chatbot(height=600, elem_classes="chatbot-container")
chatbot = gr.Chatbot(height=800, elem_classes="chatbot-container")
msg = gr.Textbox(label="Enter your message")
with gr.Row():
with gr.Column():
141 changes: 128 additions & 13 deletions App_Function_Libraries/Gradio_UI/Website_scraping_tab.py
@@ -19,8 +19,8 @@

#
# Local Imports
from App_Function_Libraries.Web_Scraping.Article_Extractor_Lib import scrape_from_sitemap, scrape_by_url_level, scrape_article
from App_Function_Libraries.Web_Scraping.Article_Summarization_Lib import scrape_and_summarize_multiple
from App_Function_Libraries.Web_Scraping.Article_Extractor_Lib import scrape_from_sitemap, scrape_by_url_level, \
scrape_article, collect_bookmarks, scrape_and_summarize_multiple
from App_Function_Libraries.DB.DB_Manager import load_preset_prompts
from App_Function_Libraries.Gradio_UI.Chat_ui import update_user_prompt
from App_Function_Libraries.Summarization.Summarization_General_Lib import summarize
@@ -35,12 +35,12 @@ def get_url_depth(url: str) -> int:
return len(urlparse(url).path.strip('/').split('/'))


def sync_recursive_scrape(url_input, max_pages, max_depth, progress_callback, delay=1.0):
def sync_recursive_scrape(url_input, max_pages, max_depth, progress_callback, delay=1.0, custom_cookies=None):
def run_async_scrape():
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
return loop.run_until_complete(
recursive_scrape(url_input, max_pages, max_depth, progress_callback, delay)
recursive_scrape(url_input, max_pages, max_depth, progress_callback, delay, custom_cookies=custom_cookies)
)

with ThreadPoolExecutor() as executor:
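
Note: the helper above bridges synchronous Gradio callbacks to the async scraper by giving a worker thread its own event loop. A minimal, self-contained sketch of that pattern (the names fetch_page and run_async_in_thread are illustrative, not from this repository):

import asyncio
from concurrent.futures import ThreadPoolExecutor

async def fetch_page(url: str) -> str:
    # Stand-in for the real async scraping work.
    await asyncio.sleep(0.1)
    return f"scraped {url}"

def run_async_in_thread(url: str) -> str:
    def runner():
        # Each call gets a fresh event loop inside the worker thread,
        # so it is safe even when the caller is not running a loop.
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        try:
            return loop.run_until_complete(fetch_page(url))
        finally:
            loop.close()

    with ThreadPoolExecutor(max_workers=1) as executor:
        return executor.submit(runner).result()

print(run_async_in_thread("https://example.com"))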
@@ -55,7 +55,8 @@ async def recursive_scrape(
progress_callback: callable,
delay: float = 1.0,
resume_file: str = 'scrape_progress.json',
user_agent: str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
user_agent: str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
custom_cookies: Optional[List[Dict[str, Any]]] = None
) -> List[Dict]:
async def save_progress():
temp_file = resume_file + ".tmp"
@@ -90,6 +91,10 @@ def is_valid_url(url: str) -> bool:
browser = await p.chromium.launch(headless=True)
context = await browser.new_context(user_agent=user_agent)

# Set custom cookies if provided
if custom_cookies:
await context.add_cookies(custom_cookies)

try:
while to_visit and pages_scraped < max_pages:
current_url, current_depth = to_visit.pop(0)
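
Note: Playwright's context.add_cookies expects a list of cookie dicts, each carrying name and value plus either url or domain and path. A rough sketch of building and applying such a list (placeholder values; assumes Playwright and its Chromium browser are installed):

import asyncio
from playwright.async_api import async_playwright

custom_cookies = [
    {"name": "sessionid", "value": "example-token", "domain": "example.com", "path": "/"},
]

async def open_with_cookies(url: str) -> str:
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context()
        # Cookies must be attached to the context before pages that need them are opened.
        await context.add_cookies(custom_cookies)
        page = await context.new_page()
        await page.goto(url)
        html = await page.content()
        await browser.close()
        return html

print(asyncio.run(open_with_cookies("https://example.com")))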
@@ -293,7 +298,7 @@ def create_website_scraping_tab():
lines=5
)
with gr.Row():
summarize_checkbox = gr.Checkbox(label="Summarize Articles", value=False)
summarize_checkbox = gr.Checkbox(label="Summarize/Analyze Articles", value=False)
custom_prompt_checkbox = gr.Checkbox(label="Use a Custom Prompt", value=False, visible=True)
preset_prompt_checkbox = gr.Checkbox(label="Use a pre-set Prompt", value=False, visible=True)
with gr.Row():
@@ -348,14 +353,35 @@
placeholder="Enter your API key here; Ignore if using Local API or Built-in API",
type="password"
)
custom_cookies_input = gr.Textbox(
label="Custom Cookies (JSON format)",
placeholder="Enter custom cookies in JSON format",
lines=3,
visible=True
)
keywords_input = gr.Textbox(
label="Keywords",
placeholder="Enter keywords here (comma-separated)",
value="default,no_keyword_set",
visible=True
)
# Updated: Added output to display parsed URLs
bookmarks_file_input = gr.File(
label="Upload Bookmarks File",
type="filepath",
file_types=[".json", ".html"],
visible=True
)
parsed_urls_output = gr.Textbox(
label="Parsed URLs from Bookmarks",
placeholder="URLs will be displayed here after uploading a bookmarks file.",
lines=10,
interactive=False,
visible=False # Initially hidden, shown only when URLs are parsed
)

scrape_button = gr.Button("Scrape and Summarize")

with gr.Column():
progress_output = gr.Textbox(label="Progress", lines=3)
result_output = gr.Textbox(label="Result", lines=20)
@@ -396,10 +422,61 @@ def update_prompts(preset_name):

preset_prompt.change(
update_prompts,
inputs=preset_prompt,
inputs=[preset_prompt],
outputs=[website_custom_prompt_input, system_prompt_input]
)

def parse_bookmarks(file_path):
"""
Parses the uploaded bookmarks file and extracts URLs.
Args:
file_path (str): Path to the uploaded bookmarks file.
Returns:
str: Formatted string of extracted URLs or error message.
"""
try:
bookmarks = collect_bookmarks(file_path)
# Extract URLs
urls = []
for value in bookmarks.values():
if isinstance(value, list):
urls.extend(value)
elif isinstance(value, str):
urls.append(value)
if not urls:
return "No URLs found in the bookmarks file."
# Format URLs for display
formatted_urls = "\n".join(urls)
return formatted_urls
except Exception as e:
logging.error(f"Error parsing bookmarks file: {str(e)}")
return f"Error parsing bookmarks file: {str(e)}"
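
Note: parse_bookmarks assumes collect_bookmarks returns a mapping from bookmark names or folders to either a single URL or a list of URLs; the exact shape is defined in Article_Extractor_Lib and is an assumption here. A quick illustration of the flattening step with made-up data:

bookmarks = {
    "News": ["https://example.com/a", "https://example.com/b"],
    "Docs": "https://example.com/manual",
}

urls = []
for value in bookmarks.values():
    if isinstance(value, list):
        urls.extend(value)   # a folder of bookmarks
    elif isinstance(value, str):
        urls.append(value)   # a single bookmark

print("\n".join(urls))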

def show_parsed_urls(bookmarks_file):
"""
Determines whether to show the parsed URLs output.
Args:
bookmarks_file: Uploaded file object.
Returns:
Tuple indicating visibility and content of parsed_urls_output.
"""
if bookmarks_file is None:
return gr.update(visible=False), ""
file_path = bookmarks_file.name
parsed_urls = parse_bookmarks(file_path)
return gr.update(visible=True), parsed_urls

# Connect the parsing function to the file upload event
bookmarks_file_input.change(
fn=show_parsed_urls,
inputs=[bookmarks_file_input],
outputs=[parsed_urls_output, parsed_urls_output]
)
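
Note: show_parsed_urls returns two values that both target parsed_urls_output. An equivalent and arguably simpler wiring, assuming a reasonably recent Gradio version, is a single gr.update that sets visibility and value together:

def show_parsed_urls(bookmarks_file):
    if bookmarks_file is None:
        return gr.update(visible=False, value="")
    return gr.update(visible=True, value=parse_bookmarks(bookmarks_file.name))

bookmarks_file_input.change(
    fn=show_parsed_urls,
    inputs=[bookmarks_file_input],
    outputs=[parsed_urls_output],
)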

async def scrape_and_summarize_wrapper(
scrape_method: str,
url_input: str,
@@ -413,15 +490,46 @@
keywords: str,
custom_titles: Optional[str],
system_prompt: Optional[str],
temperature: float = 0.7,
temperature: float,
custom_cookies: Optional[str],
bookmarks_file,
progress: gr.Progress = gr.Progress()
) -> str:
try:
result: List[Dict[str, Any]] = []

# Handle bookmarks file if provided
if bookmarks_file is not None:
bookmarks = collect_bookmarks(bookmarks_file.name)
# Extract URLs from bookmarks
urls_from_bookmarks = []
for value in bookmarks.values():
if isinstance(value, list):
urls_from_bookmarks.extend(value)
elif isinstance(value, str):
urls_from_bookmarks.append(value)
if scrape_method == "Individual URLs":
url_input = "\n".join(urls_from_bookmarks)
else:
if urls_from_bookmarks:
url_input = urls_from_bookmarks[0]
else:
return convert_json_to_markdown(json.dumps({"error": "No URLs found in the bookmarks file."}))

# Handle custom cookies
custom_cookies_list = None
if custom_cookies:
try:
custom_cookies_list = json.loads(custom_cookies)
if not isinstance(custom_cookies_list, list):
custom_cookies_list = [custom_cookies_list]
except json.JSONDecodeError as e:
return convert_json_to_markdown(json.dumps({"error": f"Invalid JSON format for custom cookies: {e}"}))

if scrape_method == "Individual URLs":
# FIXME modify scrape_and_summarize_multiple to accept custom_cookies
result = await scrape_and_summarize_multiple(url_input, custom_prompt, api_name, api_key, keywords,
custom_titles, system_prompt)
custom_titles, system_prompt, summarize_checkbox, custom_cookies=custom_cookies_list)
elif scrape_method == "Sitemap":
result = await asyncio.to_thread(scrape_from_sitemap, url_input)
elif scrape_method == "URL Level":
@@ -430,7 +538,8 @@ async def scrape_and_summarize_wrapper(
json.dumps({"error": "URL level is required for URL Level scraping."}))
result = await asyncio.to_thread(scrape_by_url_level, url_input, url_level)
elif scrape_method == "Recursive Scraping":
result = await recursive_scrape(url_input, max_pages, max_depth, progress.update, delay=1.0)
result = await recursive_scrape(url_input, max_pages, max_depth, progress.update, delay=1.0,
custom_cookies=custom_cookies_list)
else:
return convert_json_to_markdown(json.dumps({"error": f"Unknown scraping method: {scrape_method}"}))
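
Note: the cookie handling above accepts either a single JSON object or a JSON array pasted into the Custom Cookies box and normalizes both to a list. A small, self-contained illustration of that normalization (sample values only):

import json

def normalize_cookies(raw: str) -> list:
    parsed = json.loads(raw)
    # Wrap a single cookie object so downstream code always receives a list.
    return parsed if isinstance(parsed, list) else [parsed]

print(normalize_cookies('{"name": "sessionid", "value": "abc", "domain": "example.com"}'))
print(normalize_cookies('[{"name": "a", "value": "1"}, {"name": "b", "value": "2"}]'))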

@@ -496,7 +605,8 @@ async def scrape_and_summarize_wrapper(
fn=lambda *args: asyncio.run(scrape_and_summarize_wrapper(*args)),
inputs=[scrape_method, url_input, url_level, max_pages, max_depth, summarize_checkbox,
website_custom_prompt_input, api_name_input, api_key_input, keywords_input,
custom_article_title_input, system_prompt_input, temp_slider],
custom_article_title_input, system_prompt_input, temp_slider,
custom_cookies_input, bookmarks_file_input],
outputs=[result_output]
)
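
Note: the click handler wraps the async wrapper in asyncio.run inside a lambda so Gradio can treat it as a plain synchronous callback. A stripped-down sketch of the same wiring (component names here are illustrative):

import asyncio
import gradio as gr

async def slow_echo(text: str) -> str:
    await asyncio.sleep(1)  # stand-in for scraping and summarization work
    return text.upper()

with gr.Blocks() as demo:
    box = gr.Textbox(label="Input")
    out = gr.Textbox(label="Output")
    btn = gr.Button("Run")
    # asyncio.run gives the synchronous callback its own event loop per click.
    btn.click(fn=lambda text: asyncio.run(slow_echo(text)), inputs=[box], outputs=[out])

demo.launch()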

@@ -527,11 +637,15 @@ def convert_json_to_markdown(json_str: str) -> str:
markdown += f"- **Scrape Method:** {data['scrape_method']}\n"
markdown += f"- **API Used:** {data['api_used']}\n"
markdown += f"- **Keywords:** {data['keywords']}\n"
if data['url_level'] is not None:
if data.get('url_level') is not None:
markdown += f"- **URL Level:** {data['url_level']}\n"
if data.get('max_pages') is not None:
markdown += f"- **Maximum Pages:** {data['max_pages']}\n"
if data.get('max_depth') is not None:
markdown += f"- **Maximum Depth:** {data['max_depth']}\n"
markdown += f"- **Total Articles Scraped:** {data['total_articles_scraped']}\n\n"

# Add URLs scraped
# Add URLs Scraped
markdown += "## URLs Scraped\n\n"
for url in data['urls_scraped']:
markdown += f"- {url}\n"
@@ -549,6 +663,7 @@ def convert_json_to_markdown(json_str: str) -> str:
return f"# Error\n\nMissing key in JSON data: {str(e)}"
except Exception as e:
return f"# Error\n\nAn unexpected error occurred: {str(e)}"
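
Note: convert_json_to_markdown reads keys such as scrape_method, api_used, keywords, url_level, max_pages, max_depth, total_articles_scraped and urls_scraped from the serialized result; the real payload likely carries additional per-article fields not visible in this diff. An illustrative input, with sample values only:

import json

sample = {
    "scrape_method": "Recursive Scraping",
    "api_used": "openai",
    "keywords": "default,no_keyword_set",
    "url_level": None,
    "max_pages": 10,
    "max_depth": 3,
    "total_articles_scraped": 2,
    "urls_scraped": ["https://example.com/a", "https://example.com/b"],
}

markdown_input = json.dumps(sample)  # the string this function would receive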

#
# End of File
########################################################################################################################
@@ -28,7 +28,6 @@
import requests
#
# Import Local
from App_Function_Libraries.Web_Scraping.Article_Summarization_Lib import *
from App_Function_Libraries.Utils.Utils import download_file
#
#######################################################################################################################