From cb63eb910b9078a19beb468432a86392a244217c Mon Sep 17 00:00:00 2001 From: Robert Date: Fri, 27 Dec 2024 23:08:33 -0800 Subject: [PATCH] search parsing + building the chain & prompts --- .../Web_Scraping/Search_Prompt.md | 616 ++++++++++++++++++ .../Web_Scraping/Search_Prompt.py | 342 ---------- .../Web_Scraping/WebSearch_APIs.py | 580 +++++++++++++++-- Docs/Design/Education.md | 21 + Docs/Design/Researcher.md | 331 ++-------- Docs/Design/TTS_STT.md | 6 + Docs/Design/WebSearch.md | 32 +- Server_API/API_README.md | 134 ++++ Server_API/app/main.py | 17 +- 9 files changed, 1401 insertions(+), 678 deletions(-) create mode 100644 App_Function_Libraries/Web_Scraping/Search_Prompt.md delete mode 100644 App_Function_Libraries/Web_Scraping/Search_Prompt.py create mode 100644 Server_API/API_README.md diff --git a/App_Function_Libraries/Web_Scraping/Search_Prompt.md b/App_Function_Libraries/Web_Scraping/Search_Prompt.md new file mode 100644 index 00000000..b4995f2a --- /dev/null +++ b/App_Function_Libraries/Web_Scraping/Search_Prompt.md @@ -0,0 +1,616 @@ +# Taken from https://github.com/rashadphz/farfalle/blob/main/src/backend/prompts.py + + +######################################################################################################################## +Initial Search Prompts +``` +CHAT_PROMPT = """\ +Generate a comprehensive, well-structured, and informative answer for a given question, +using ONLY the information found in the provided web Search Results (URL, Page Title, Summary). +Use an unbiased, journalistic tone, adapting the level of formality to match the user’s question. + +• Cite your statements using [number] notation, placing citations at the end of the relevant sentence. +• Only cite the most relevant results. If multiple sources support the same point, cite all relevant sources [e.g., 1, 2, 3]. +• If sources conflict, present both perspectives clearly and cite the respective sources. +• If different sources refer to different entities with the same name, provide separate answers. +• Do not add any external or fabricated information. +• Do not include URLs or a reference section; cite inline with [number] format only. +• Do not repeat the question or include unnecessary redundancy. +• Use markdown formatting (e.g., **bold**, bullet points, ## headings) to organize the information. +• If the provided results are insufficient to answer the question, explicitly state what information is missing or unclear. + +Structure your answer like this: +1. **Short introduction**: Briefly summarize the topic (1–2 sentences). +2. **Bulleted points**: Present key details, each with appropriate citations. +3. **Conclusion**: Summarize the findings or restate the core answer (with citations if needed). + +Example: +1. **Short introduction**: This topic explores the impact of climate change on agriculture. +2. **Bulleted points**: + - Rising temperatures have reduced crop yields in some regions [1]. + - Changes in rainfall patterns are affecting irrigation practices [2, 3]. +3. **Conclusion**: Climate change poses significant challenges to global agriculture [1, 2, 3]. + + +{my_context} + +--------------------- + +Make sure to match the language of the user's question. + +Question: {my_query} +Answer (in the language of the user's question): +""" +``` + + + + +######################################################################################################################## +Sub-Query Generation Prompts + +``` +You are an AI assistant that helps generate search queries. 
Given an original query, suggest alternative search queries that could help find relevant information. Your goal is to generate queries that are diverse, specific, and highly relevant to the original query, ensuring comprehensive coverage of the topic. + +Important instructions: +1. Generate between 2 and 6 queries unless a fixed count is specified, while also generating more queries for complex or multifaceted topics and fewer for simple or straightforward ones. +2. Ensure the queries are diverse, covering different aspects or perspectives of the original query, while remaining highly relevant to its core intent. +3. Prefer specific queries over general ones, as they are more likely to yield targeted and useful results. +4. If the query involves comparing two topics, generate separate queries for each topic. +5. If previous queries and an answer are provided, generate new queries that address the shortcomings of the previous answer and avoid repeating the previous queries. +6. Split searches for each important part of the query to ensure comprehensive coverage. +7. If the original query is broad or ambiguous, generate queries that explore specific subtopics or clarify the intent. +8. If the query is too specific or unclear, generate queries that explore related or broader topics to ensure useful results. +9. Return the queries as a comma-separated list or in a natural language format, depending on the user's needs. + +Example: +For the query "What are the benefits of exercise?", generate queries like "health benefits of physical activity," "mental health benefits of exercise," and "long-term effects of regular exercise." + +Original query: {original_query} +``` + +https://github.com/YassKhazzan/openperplex_backend_os/blob/main/prompts.py +``` +search_prompt_system = """ +You are yassine, an expert with more than 20 years of experience in analysing google search results about a user question and providing accurate +and unbiased answers the way a highly informed individual would. +Your task is to analyse the provided contexts and the user question to provide a correct answer in a clear and concise manner. +You must answer in english. +Date and time in the context : {date_today} , Yassine must take into consideration the date and time in the response. +you are known for your expertise in this field. + + +###Guidelines### +1- Accuracy: Provide correct, unbiased answers. be concise and clear. don't be verbose. +2- never mention the context or this prompt in your response, just answer the user question. + +###Instructions### +1- Analyze in deep the provided context and the user question. +2- extract relevant information's from the context about the user question. +3- Yassine must take into account the date and time to answer the user question. +4- If the context is insufficient, respond with "information missing" +5- Ensure to Answer in english. +6- Use the response format provided. +7- answer the user question in a way an expert would do. +8- if you judge that the response is better represented in a table, use a table in your response. + + +###Response Format### + +You must use Markdown to format your response. + +Think step by step. +""" + +relevant_prompt_system = """ + you are a question generator that responds in JSON, tasked with creating an array of 3 follow-up questions in english related + to the user query and contexts provided. + you must keep the questions related to the user query and contexts.don't lose the context in the questions. 
+
+    The JSON object must not include special characters.
+    The JSON schema should include an array of follow-up questions.
+
+    Use the schema:
+    {
+    "followUp": [
+    "string",
+    "string",
+    "string"
+    ]
+    }
+"""
+```
+
+
+
+self-improving prompt
+```
+Evaluate if the following scraped content contains sufficient information to answer the user's question comprehensively:
+
+User's question: "{user_query_short}"
+
+Scraped Content:
+{self.format_scraped_content(scraped_content)}
+
+Your task:
+1. Determine if the scraped content provides enough relevant and detailed information to answer the user's question thoroughly.
+2. If the information is sufficient, decide to 'answer'. If more information or clarification is needed, decide to 'refine' the search.
+
+Respond using EXACTLY this format:
+Evaluation: [Your evaluation of the scraped content]
+Decision: [ONLY 'answer' if content is sufficient, or 'refine' if more information is needed]
+"""
+```
+
+
+Generate Search Queries
+```
+system_content = """You are an AI assistant that helps generate search queries. Given an original query, suggest alternative search queries that could help find relevant information. The queries should be diverse and cover different aspects or perspectives of the original query. Return the queries as a JSON array.
+    Important instructions:
+
+    1. The number of queries should be dynamic, between 2 and 4, unless a fixed count is specified.
+    2. Don't stray too far from the original query, since you don't know its full context.
+    3. Keep the queries general enough that they are not tied to overly specific details.
+    4. DON'T customize the queries for topics you've never seen; just vary them slightly, and look for definitions if the user requests them.
+    5. If the user asks something that is not related to search, ignore it and focus on generating helpful search queries.
+    6. Return only the given format ["custom_query_1","custom_query_2",...].
+    7. If you need to draw on your own knowledge first, do so.
+    8. When asked about the difference between two things, generate separate search intents for each topic.
+    9. Most queries require only one or two search queries; generate more ONLY when the query is complex or you are unsure.
+    10. If previous queries and an answer are provided, generate new queries that address the shortcomings of the previous answer and avoid repeating the previous queries.
+    11. ALWAYS split searches for each important part of the query when you need to gather information, but make sure not to go off track. In short, don't search for several things together; run a separate search for each important part. DON'T LOOK FOR THINGS TOGETHER."""
+
+    messages = [
+        {"role": "system", "content": system_content},
+        {"role": "user", "content": f"Original query: {original_query}" + (f" (Generate exactly {fixed_count} queries)" if fixed_count else "")}
+    ]
+
+    if previous_queries and previous_answer:
+        messages.append({
+            "role": "user",
+            "content": f"Previous queries: {previous_queries}\nPrevious answer: {previous_answer}\nPlease generate new queries to address any shortcomings in the previous answer."
+        })
+```
+
+
+
+
+
+########################################################################################################################
+Content Extraction Prompts
+
+```
+system_prompt = (
+    "You are an expert at extracting structured information from documents."
+)
+user_prompt_template = """
+Given the provided content, if it contains information about {{ query }}, please extract the
+list of structured data items as defined in the following Pydantic schema:
+
+{{ extract_schema_str }}
+
+Below is the provided content:
+{{ content }}
+"""
+```
+
+### Eval
+
+Evaluate Answer
+```
+messages = [
+    {"role": "system", "content": """You are an AI assistant that evaluates the quality and completeness of its own answer to user queries.
+    Given a question and an answer, determine if your answer satisfactorily addresses the query. Be tolerant of answers that are close to the intent of the query: if an answer is close enough, mark it as satisfactory.
+    Respond with a JSON object containing two fields:
+    1. "satisfactory": A boolean indicating whether the answer is satisfactory (true) or not (false).
+    2. "reason": A brief explanation of why the answer is or is not satisfactory, e.g. "I will keep looking for information since the last answer does not address the query because...", "Let's look for something different. My last search didn't solve the query. The reason is...", or "I found the right answer, so I can ignore this..."."""},
+    {"role": "user", "content": f"Query: {query}\nAnswer: {answer}"}
+]
+```
+Eval best answer
+```
+messages = [
+    {"role": "system", "content": """You are an assistant that evaluates multiple answers to a query and selects the best one based on relevance and completeness.
+    Given a query and a list of answers, choose the one that best addresses the query and respond with it directly. Do not mention the word "answers" or use phrases like "the best answer is"; just reply naturally with the chosen answer itself."""},
+    {"role": "user", "content": f"Query: {query}\nAnswers: {json.dumps(cached_answers)}"}
+]
+```
+
+
+Select relevant content
+```
+Given the following search results for the user's question: "{user_query}"
+Select the 2 most relevant results to scrape and analyze. Explain your reasoning for each selection.
+
+Search Results:
+{self.format_results(search_results)}
+
+Instructions:
+1. You MUST select exactly 2 result numbers from the search results.
+2. Choose the results that are most likely to contain comprehensive and relevant information to answer the user's question.
+3. Provide a brief reason for each selection.
+
+You MUST respond using EXACTLY this format and nothing else:
+
+Selected Results: [Two numbers corresponding to the selected results]
+Reasoning: [Your reasoning for the selections]
+"""
+```
+
+
+########################################################################################################################
+Final Answer Generation Prompts
+
+
+Final-Answer-1
+```
+You are an AI assistant. Provide a comprehensive and detailed answer to the following question using ONLY the information provided in the scraped content. Do not include any references or mention any sources unless explicitly instructed. Answer directly and thoroughly, using a clear and professional tone.
+
+Question: "{user_query_short}"
+
+Scraped Content:
+{self.format_scraped_content(scraped_content)}
+
+Important Instructions:
+1. Structure your answer as follows:
+   - **Introduction**: Briefly summarize the topic or main point (1–2 sentences).
+   - **Details**: Provide key information, facts, or insights from the scraped content. Use bullet points or paragraphs for clarity.
+   - **Conclusion**: Summarize the findings or restate the core answer (1–2 sentences).
+2.
Adapt the tone and style of the answer to match the user’s question. Use a formal tone for technical or professional queries and a conversational tone for casual questions. +3. If the scraped content contains conflicting information, present both perspectives clearly and neutrally, noting the discrepancy. +4. Focus on the most relevant and important information in the scraped content, and avoid including minor or tangential details. +5. If the scraped content does not contain enough information to answer the question, say so explicitly and explain what information is missing. +6. Provide as much relevant detail as possible from the scraped content, but avoid redundancy or unnecessary repetition. +7. If the question is ambiguous or overly broad, clarify the intent or focus on specific subtopics to provide a more targeted answer. +8. Avoid generating content that is discriminatory, offensive, or harmful. If the topic is sensitive, provide a neutral and respectful response. +9. If the user specifies a preferred format (e.g., bullet points, paragraphs) or level of detail (e.g., brief, comprehensive), tailor the answer accordingly. +10. If the user requests revisions, adjust the answer based on their feedback while adhering to the above guidelines. + +Examples: +1. Short Answer (3–4 Sentences) + - **Question:** "What is photosynthesis?" + - **Answer:** + - **Introduction:** Photosynthesis is the process by which plants convert sunlight into energy. + - **Details:** + * It occurs in the chloroplasts of plant cells, using chlorophyll to absorb light. + * During photosynthesis, plants take in carbon dioxide and release oxygen as a byproduct. + - **Conclusion:** This process is essential for plant growth and oxygen production, supporting life on Earth. +2. Medium Answer (5–8 Sentences) + - **Question:** "What are the benefits of exercise?" + - **Answer:** + - **Introduction:** Exercise offers numerous physical and mental health benefits. + - **Details:** + * It improves cardiovascular health by strengthening the heart and improving circulation. + * Regular exercise helps maintain a healthy weight and reduces the risk of chronic diseases like diabetes. + * It also enhances mental health by reducing stress, anxiety, and depression through the release of endorphins. + * Exercise can improve sleep quality and boost overall energy levels. + - **Conclusion:** Incorporating regular exercise into your routine is essential for long-term physical and mental well-being. +3. Long Answer (9–12 Sentences) + - **Question**: "What are the causes and effects of climate change?" + - **Answer:** + - **Introduction**: Climate change refers to long-term changes in temperature and weather patterns, primarily caused by human activities. + - **Details:** + * The main cause is the increase in greenhouse gases, such as carbon dioxide and methane, from burning fossil fuels, deforestation, and industrial processes. + * These gases trap heat in the atmosphere, leading to a rise in global temperatures. + * Effects of climate change include more frequent and severe weather events, such as hurricanes, droughts, and heatwaves. + * Melting polar ice caps and glaciers contribute to rising sea levels, threatening coastal communities. + * Changes in precipitation patterns affect agriculture, leading to food and water shortages in some regions. + * Ecosystems are disrupted, causing species extinction and loss of biodiversity. + * Climate change also has economic impacts, such as increased costs for disaster recovery and healthcare. 
+ - **Conclusion:** Addressing climate change requires global cooperation, sustainable practices, and a transition to renewable energy sources. +4. Very Long Answer (13–20 Sentences) + - **Question:** "What are the pros and cons of remote work?" + - **Answer:** + - **Introduction**: Remote work has become increasingly popular, offering both advantages and disadvantages for employees and employers. + - **Details:** + - **Pros for Employees:** + * Increased flexibility allows employees to manage their schedules and achieve better work-life balance. + * Eliminating commuting saves time and reduces transportation costs. + * Remote work can reduce stress by providing a more comfortable and personalized work environment. + * Employees have the opportunity to live in locations with a lower cost of living or closer to family. + - **Pros for Employers:** + * Remote work can reduce overhead costs, such as office space and utilities. + * Employers can access a global talent pool, increasing diversity and expertise. + * Studies show that remote workers are often more productive due to fewer office distractions. + * Offering remote work can improve employee satisfaction and retention. + - **Cons for Employees:** + * Remote work can lead to feelings of isolation and reduced team cohesion. + * Blurred boundaries between work and personal life may result in longer working hours. + * Limited access to office resources and face-to-face collaboration can hinder creativity and problem-solving. + - **Cons for Employers:** + * Managing remote teams requires robust communication tools and strategies. + * Ensuring data security and compliance can be more challenging in a remote setup. + * Remote work may reduce opportunities for spontaneous collaboration and innovation. + - **Conclusion:** While remote work offers significant benefits, it also presents challenges that require careful management to maximize its potential. + +Answer: +``` + + +Final-Answer-2 +``` +system_prompt = ( + "You are an eagle-eyed researcher, skilled at summarizing lengthy documents with precision and clarity. " + "Your task is to create a comprehensive summary of the provided document, capturing the main ideas, key details, and essential arguments presented. " + "Use the provided context to answer the user's question, and always reference your sources clearly." +) + +user_prompt_template = """ +**Task:** +Create a detailed summary of the provided document to answer the following question using {{ language }}: + +{{ query }} + +**Instructions:** + +1. **Source Referencing:** + - Use the format [1], [2], ..., [n] to reference sources in line with the text. For example, "According to the research from Google[3], ...". + - Ensure all claims, data, or examples are backed by a reference from the provided context. + +2. **Structure:** + - Organize the summary logically using clear headings and subheadings for different sections or themes. + - Include the following sections: + - **Introduction:** Briefly introduce the main theme or purpose of the document. + - **Key Points:** Summarize the main arguments, ideas, or findings. + - **Supporting Details:** Include key data, examples, or evidence that strengthen the main points. + - **Conclusion:** Succinctly encapsulate the overarching message or significance of the document. + +3. **Tone:** + - Use an objective, neutral tone, delivering precise and insightful analysis without personal opinions or interpretations. + +4. 
**Length and Depth:** + - Adjust the length of the summary based on the complexity and depth of the document. + - Ensure the summary is comprehensive without omitting crucial information. + +5. **Contextual Awareness:** + - If the context does not contain information relevant to the query, state: "No related information found in the context." using {{ language }}. + +**Example Output:** + +**Introduction to Main Theme** +The document begins by discussing [main idea], outlining [initial point] with supporting data like [example][1]. + +**Key Points** +The text presents several main arguments, such as [supporting detail][2]. Notably, [data or statistic][3] is used to reinforce the main concept. + +**Supporting Details** +Additional evidence includes [example][4], which highlights [key point][5]. + +**Conclusion** +In summary, [document's conclusion statement][6], highlighting the broader implications like [significance][7]. + +**Context:** +{{ context }} +""" +``` + + + + +Perplexity Samples +``` +Example 1: Technology + +- **Query:** What are TPUs? +- **Answer:** + * TPUs (Tensor Processing Units) are specialized hardware accelerators developed by Google for machine learning (ML) and artificial intelligence (AI) workloads. They are designed to optimize the performance of tensor operations, which are fundamental to neural network computations. + - **Key Features: + * **Architecture:** TPUs are built on Google’s custom ASIC (Application-Specific Integrated Circuit) technology, which is optimized for high-speed matrix multiplications and other tensor operations. + * **Performance:** TPUs offer significantly faster processing speeds for ML tasks compared to traditional CPUs and GPUs. For example, a single TPU v4 pod can deliver over 1 exaflop of computing power. + * **Scalability:** TPUs are designed to work in clusters, allowing for scalable performance across large datasets and complex models. + * **Energy Efficiency:** TPUs are highly energy-efficient, reducing the power consumption required for large-scale ML training and inference. + - **Applications:** + * Training large-scale neural networks, such as those used in natural language processing (NLP) and computer vision. + * Accelerating inference tasks for real-time applications like image recognition and language translation. + * Supporting research and development in AI and ML by providing high-performance computing resources. + - By offering unparalleled speed and efficiency for tensor operations, TPUs have become a critical tool for advancing AI and ML technologies. + + +Example 2: Comparative Analysis + +- **Query:** How does the TPU v4 compare to the TPU v3 in terms of performance? +- **Answer:** + - The TPU v4 represents a significant performance improvement over the TPU v3, with advancements in speed, scalability, and efficiency. + - **Performance Improvements:** + - **Compute Power: + The TPU v4 offers up to 2.7 times the peak compute performance of the TPU v3, achieving over 275 teraflops per chip. + For large-scale ML models, the TPU v4 demonstrates up to 3x faster training times compared to the TPU v3. + - **Memory and Bandwidth: + The TPU v4 features 32GB of high-bandwidth memory (HBM2), providing 1.5 times the memory capacity of the TPU v3. + Memory bandwidth has also increased to 1.2 TB/s, enabling faster data access and processing. + - **Scalability: + TPU v4 pods can scale up to 4,096 chips, delivering over 1 exaflop of computing power. This is a significant improvement over the TPU v3’s maximum pod size of 1,024 chips. 
+ - **Energy Efficiency: + The TPU v4 is designed to be more energy-efficient, reducing power consumption per operation by up to 20% compared to the TPU v3. + - **Architectural Advancements: + - **Interconnect Technology: + * The TPU v4 uses Google’s next-generation interconnect technology, which provides higher bandwidth and lower latency for communication between chips. + - **Software Support: + * The TPU v4 is fully compatible with TensorFlow and JAX, with optimizations for newer ML frameworks and algorithms. + - **Applications: + * Training state-of-the-art AI models, such as GPT-4 and other large language models. + * Accelerating inference for real-time applications like autonomous driving and recommendation systems. + * Supporting scientific research in areas like climate modeling and drug discovery. + - **The TPU v4’s advancements in performance, scalability, and efficiency make it a powerful upgrade over the TPU v3 for demanding AI and ML workloads. + + +Example 3: Consumer Electronics +- **Query:** What are the key features of the iPhone 15 Pro? +- **Answer:** + * The iPhone 15 Pro is Apple’s latest flagship smartphone, offering a range of advanced features and improvements over its predecessors. + - **Key Features:** + - **Design:** + * The iPhone 15 Pro features a titanium frame, making it lighter and more durable than previous models. + * It retains the Ceramic Shield front cover for enhanced drop protection. + - **Display:** + * The 6.1-inch Super Retina XDR display supports ProMotion technology with a 120Hz refresh rate for smoother scrolling and responsiveness. + * It also features Always-On Display functionality, allowing users to view key information without waking the phone. + - **Performance:** + * Powered by the A17 Bionic chip, the iPhone 15 Pro delivers up to 20% faster CPU and GPU performance compared to the A16 Bionic. + * The new 6-core GPU supports hardware-accelerated ray tracing for enhanced gaming and graphics. + - **Camera System:** + * The triple-camera system includes a 48MP main sensor, a 12MP ultra-wide lens, and a 12MP telephoto lens with 5x optical zoom. + * New computational photography features, such as Photonic Engine and Smart HDR 5, improve low-light performance and dynamic range. + **Battery Life:** + * The iPhone 15 Pro offers up to 22 hours of video playback, thanks to its larger battery and energy-efficient A17 Bionic chip. + **Connectivity:** + * It supports Wi-Fi 6E and Bluetooth 5.3 for faster wireless connectivity. + * The USB-C port replaces Lightning, offering faster data transfer speeds and compatibility with a wider range of accessories. + - **Additional Features:** + * Action Button: A customizable button replaces the traditional mute switch, allowing users to assign shortcuts for quick access to frequently used functions. + * iOS 17: The iPhone 15 Pro comes with the latest version of iOS, featuring new customization options, enhanced privacy controls, and improved messaging capabilities. + - **The iPhone 15 Pro combines cutting-edge technology with a sleek design, making it a top choice for consumers seeking a premium smartphone experience.** + + +Example 4: Comparative Analysis +- **Query:** How does the iPhone 15 Pro compare to the iPhone 14 Pro in terms of performance? +- **Answer:** + * The iPhone 15 Pro offers several performance improvements over the iPhone 14 Pro, making it a significant upgrade for users. 
+ - **Performance Improvements:** + + - **Processor:** + * The iPhone 15 Pro is powered by the A17 Bionic chip, which delivers up to 20% faster CPU and GPU performance compared to the A16 Bionic in the iPhone 14 Pro. + * The A17 Bionic also features a 6-core GPU with hardware-accelerated ray tracing, enhancing gaming and graphics performance. + - **Memory and Storage:** + * The iPhone 15 Pro starts with 8GB of RAM, compared to 6GB in the iPhone 14 Pro, allowing for better multitasking and app performance. + * Storage options remain the same, ranging from 128GB to 1TB. + - **Battery Life:** + * The iPhone 15 Pro offers up to 22 hours of video playback, a slight improvement over the iPhone 14 Pro’s 20 hours. + * The A17 Bionic’s energy efficiency contributes to longer battery life under heavy usage. + - **Design and Features:** + - Build Material: + * The iPhone 15 Pro features a titanium frame, making it lighter and more durable than the stainless steel frame of the iPhone 14 Pro. + - Display: + * Both models feature a 6.1-inch Super Retina XDR display with ProMotion, but the iPhone 15 Pro introduces Always-On Display functionality. + - Camera System: + * The iPhone 15 Pro’s 48MP main sensor and 5x optical zoom offer improved photography capabilities compared to the iPhone 14 Pro’s 3x optical zoom. + - Connectivity: + * The iPhone 15 Pro replaces the Lightning port with USB-C, offering faster data transfer speeds and broader compatibility. + - **Software:** + * Both models run iOS 17, but the iPhone 15 Pro’s hardware optimizations ensure smoother performance and better support for new features. + - **The iPhone 15 Pro’s advancements in processing power, design, and camera technology make it a compelling upgrade over the iPhone 14 Pro.** +``` + + + + + +######################################################################################################################## +Failed Search Query Prompt +``` +After multiple search attempts, we couldn't find a fully satisfactory answer to the user's question: "{user_query}" + +Please provide the best possible answer you can, acknowledging any limitations or uncertainties. +If appropriate, suggest ways the user might refine their question or where they might find more information. + +Respond in a clear, concise, and informative manner. +""" +``` + + + + +######################################################################################################################## +Related Questions Prompts + +``` +RELATED_QUESTION_PROMPT = """\ +Given a user's question and the context from search results, generate exactly 3 concise and relevant follow-up questions the user might ask. + +**Instructions:** +1. **Relevance:** Ensure the questions are directly related to the original question and context. +2. **Conciseness:** Keep the questions short and simple. +3. **Language Match:** Use the same language as the user's original question. +4. **Depth:** Include questions that explore different aspects of the topic (e.g., clarification, deeper exploration, or related subtopics). + +**Original Question:** {query} + +**Context:** + +{context} + + +**Output Format:** +related_questions: A list of EXACTLY three concise, simple, and relevant follow-up questions. + +**Example:** +related_questions: [ + "What are the benefits of renewable energy?", + "How does solar energy reduce greenhouse gas emissions?", + "What are the costs of implementing wind energy?" 
+] +""" +``` + +``` +HISTORY_QUERY_REPHRASE = """\ +Given a conversation history and a follow-up input, rephrase the follow-up into a SHORT, standalone query that captures relevant context from previous messages. + +**Instructions:** +1. **Conciseness:** Make the query as short and compressed as possible. +2. **Relevance:** Include only information relevant to the retrieval task. +3. **Topic Change:** If there is a clear change in topic, disregard the previous messages and focus only on the follow-up input. +4. **Language Match:** Use the same language as the user's original question. + +**Chat History:** +{chat_history} + +**Follow-Up Input:** +{question} + +**Output Format:** +Respond with ONLY the short, standalone query. + +**Example:** +Follow-Up Input: "What about the costs?" +Standalone Query: "Costs of implementing renewable energy" +""" +``` + + +Query Plan Prompt +``` +QUERY_PLAN_PROMPT = """\ +You are an expert at creating search task lists to answer queries. Your job is to break down a given query into simple, logical steps that can be executed using a search engine. + +Rules: +1. Use up to 4 steps maximum, but use fewer if possible. +2. Keep steps simple, concise, and easy to understand. +3. Ensure proper use of dependencies between steps. +4. Always include a final step to summarize/combine/compare information from previous steps. + +Instructions for creating the Query Plan: +1. Break down the query into logical search steps. +2. For each step, specify an "id" (starting from 0) and a "step" description. +3. List dependencies for each step as an array of previous step ids. +4. The first step should always have an empty dependencies array. +5. Subsequent steps should list all step ids they depend on. + +Example Query: +Given the query "Compare Perplexity and You.com in terms of revenue, number of employees, and valuation" + +Example Query Plan: +[ + {{ + "id": 0, + "step": "Research Perplexity's revenue, employee count, and valuation", + "dependencies": [] + }}, + {{ + "id": 1, + "step": "Research You.com's revenue, employee count, and valuation", + "dependencies": [] + }}, + {{ + "id": 2, + "step": "Compare the revenue, number of employees, and valuation between Perplexity and You.com", + "dependencies": [0, 1] + }} +] + +Query: {query} +Query Plan (with a final summarize/combine/compare step): +""" +``` + + diff --git a/App_Function_Libraries/Web_Scraping/Search_Prompt.py b/App_Function_Libraries/Web_Scraping/Search_Prompt.py deleted file mode 100644 index 92ca6f16..00000000 --- a/App_Function_Libraries/Web_Scraping/Search_Prompt.py +++ /dev/null @@ -1,342 +0,0 @@ -# Taken from https://github.com/rashadphz/farfalle/blob/main/src/backend/prompts.py -CHAT_PROMPT = """\ -Generate a comprehensive and informative answer for a given question solely based on the provided web Search Results (URL, Page Title, Summary). You must only use information from the provided search results. Use an unbiased and journalistic tone. - -You must cite the answer using [number] notation. You must cite sentences with their relevant citation number. Cite every part of the answer. -Place citations at the end of the sentence. You can do multiple citations in a row with the format [number1][number2]. - -Only cite the most relevant results that answer the question accurately. If different results refer to different entities with the same name, write separate answers for each entity. - -ONLY cite inline. -DO NOT include a reference section, DO NOT include URLs. -DO NOT repeat the question. 
- - -You can use markdown formatting. You should include bullets to list the information in your answer. - - -{my_context} - ---------------------- - -Make sure to match the language of the user's question. - -Question: {my_query} -Answer (in the language of the user's question): \ -""" - -RELATED_QUESTION_PROMPT = """\ -Given a question and search result context, generate 3 follow-up questions the user might ask. Use the original question and context. - -Instructions: -- Generate exactly 3 questions. -- These questions should be concise, and simple. -- Ensure the follow-up questions are relevant to the original question and context. -Make sure to match the language of the user's question. - -Original Question: {query} - -{context} - - -Output: -related_questions: A list of EXACTLY three concise, simple follow-up questions -""" - -HISTORY_QUERY_REPHRASE = """ -Given the following conversation and a follow up input, rephrase the follow up into a SHORT, \ -standalone query (which captures any relevant context from previous messages). -IMPORTANT: EDIT THE QUERY TO BE CONCISE. Respond with a short, compressed phrase. \ -If there is a clear change in topic, disregard the previous messages. -Strip out any information that is not relevant for the retrieval task. - -Chat History: -{chat_history} - -Make sure to match the language of the user's question. - -Follow Up Input: {question} -Standalone question (Respond with only the short combined query): -""".strip() - - -QUERY_PLAN_PROMPT = """\ -You are an expert at creating search task lists to answer queries. Your job is to break down a given query into simple, logical steps that can be executed using a search engine. - -Rules: -1. Use up to 4 steps maximum, but use fewer if possible. -2. Keep steps simple, concise, and easy to understand. -3. Ensure proper use of dependencies between steps. -4. Always include a final step to summarize/combine/compare information from previous steps. - -Instructions for creating the Query Plan: -1. Break down the query into logical search steps. -2. For each step, specify an "id" (starting from 0) and a "step" description. -3. List dependencies for each step as an array of previous step ids. -4. The first step should always have an empty dependencies array. -5. Subsequent steps should list all step ids they depend on. - -Example Query: -Given the query "Compare Perplexity and You.com in terms of revenue, number of employees, and valuation" - -Example Query Plan: -[ - {{ - "id": 0, - "step": "Research Perplexity's revenue, employee count, and valuation", - "dependencies": [] - }}, - {{ - "id": 1, - "step": "Research You.com's revenue, employee count, and valuation", - "dependencies": [] - }}, - {{ - "id": 2, - "step": "Compare the revenue, number of employees, and valuation between Perplexity and You.com", - "dependencies": [0, 1] - }} -] - -Query: {query} -Query Plan (with a final summarize/combine/compare step): -""" - -SEARCH_QUERY_PROMPT = """\ -Generate a concise list of search queries to gather information for executing the given step. - -You will be provided with: -1. A specific step to execute -2. The user's original query -3. Context from previous steps (if available) - -Use this information to create targeted search queries that will help complete the current step effectively. Aim for the minimum number of queries necessary while ensuring they cover all aspects of the step. - -IMPORTANT: Always incorporate relevant information from previous steps into your queries. 
This ensures continuity and builds upon already gathered information. - -Input: ---- -User's original query: {user_query} ---- -Context from previous steps: -{prev_steps_context} - -Your task: -1. Analyze the current step and its requirements -2. Consider the user's original query and any relevant previous context -3. Consider the user's original query -4. Generate a list of specific, focused search queries that: - - Incorporate relevant information from previous steps - - Address the requirements of the current step - - Build upon the information already gathered ---- -Current step to execute: {current_step} ---- - -Your search queries based: -""" - - -# License Reproduction: -# Apache License -# Version 2.0, January 2004 -# http://www.apache.org/licenses/ -# -# TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION -# -# 1. Definitions. -# -# "License" shall mean the terms and conditions for use, reproduction, -# and distribution as defined by Sections 1 through 9 of this document. -# -# "Licensor" shall mean the copyright owner or entity authorized by -# the copyright owner that is granting the License. -# -# "Legal Entity" shall mean the union of the acting entity and all -# other entities that control, are controlled by, or are under common -# control with that entity. For the purposes of this definition, -# "control" means (i) the power, direct or indirect, to cause the -# direction or management of such entity, whether by contract or -# otherwise, or (ii) ownership of fifty percent (50%) or more of the -# outstanding shares, or (iii) beneficial ownership of such entity. -# -# "You" (or "Your") shall mean an individual or Legal Entity -# exercising permissions granted by this License. -# -# "Source" form shall mean the preferred form for making modifications, -# including but not limited to software source code, documentation -# source, and configuration files. -# -# "Object" form shall mean any form resulting from mechanical -# transformation or translation of a Source form, including but -# not limited to compiled object code, generated documentation, -# and conversions to other media types. -# -# "Work" shall mean the work of authorship, whether in Source or -# Object form, made available under the License, as indicated by a -# copyright notice that is included in or attached to the work -# (an example is provided in the Appendix below). -# -# "Derivative Works" shall mean any work, whether in Source or Object -# form, that is based on (or derived from) the Work and for which the -# editorial revisions, annotations, elaborations, or other modifications -# represent, as a whole, an original work of authorship. For the purposes -# of this License, Derivative Works shall not include works that remain -# separable from, or merely link (or bind by name) to the interfaces of, -# the Work and Derivative Works thereof. -# -# "Contribution" shall mean any work of authorship, including -# the original version of the Work and any modifications or additions -# to that Work or Derivative Works thereof, that is intentionally -# submitted to Licensor for inclusion in the Work by the copyright owner -# or by an individual or Legal Entity authorized to submit on behalf of -# the copyright owner. 
For the purposes of this definition, "submitted" -# means any form of electronic, verbal, or written communication sent -# to the Licensor or its representatives, including but not limited to -# communication on electronic mailing lists, source code control systems, -# and issue tracking systems that are managed by, or on behalf of, the -# Licensor for the purpose of discussing and improving the Work, but -# excluding communication that is conspicuously marked or otherwise -# designated in writing by the copyright owner as "Not a Contribution." -# -# "Contributor" shall mean Licensor and any individual or Legal Entity -# on behalf of whom a Contribution has been received by Licensor and -# subsequently incorporated within the Work. -# -# 2. Grant of Copyright License. Subject to the terms and conditions of -# this License, each Contributor hereby grants to You a perpetual, -# worldwide, non-exclusive, no-charge, royalty-free, irrevocable -# copyright license to reproduce, prepare Derivative Works of, -# publicly display, publicly perform, sublicense, and distribute the -# Work and such Derivative Works in Source or Object form. -# -# 3. Grant of Patent License. Subject to the terms and conditions of -# this License, each Contributor hereby grants to You a perpetual, -# worldwide, non-exclusive, no-charge, royalty-free, irrevocable -# (except as stated in this section) patent license to make, have made, -# use, offer to sell, sell, import, and otherwise transfer the Work, -# where such license applies only to those patent claims licensable -# by such Contributor that are necessarily infringed by their -# Contribution(s) alone or by combination of their Contribution(s) -# with the Work to which such Contribution(s) was submitted. If You -# institute patent litigation against any entity (including a -# cross-claim or counterclaim in a lawsuit) alleging that the Work -# or a Contribution incorporated within the Work constitutes direct -# or contributory patent infringement, then any patent licenses -# granted to You under this License for that Work shall terminate -# as of the date such litigation is filed. -# -# 4. Redistribution. You may reproduce and distribute copies of the -# Work or Derivative Works thereof in any medium, with or without -# modifications, and in Source or Object form, provided that You -# meet the following conditions: -# -# (a) You must give any other recipients of the Work or -# Derivative Works a copy of this License; and -# -# (b) You must cause any modified files to carry prominent notices -# stating that You changed the files; and -# -# (c) You must retain, in the Source form of any Derivative Works -# that You distribute, all copyright, patent, trademark, and -# attribution notices from the Source form of the Work, -# excluding those notices that do not pertain to any part of -# the Derivative Works; and -# -# (d) If the Work includes a "NOTICE" text file as part of its -# distribution, then any Derivative Works that You distribute must -# include a readable copy of the attribution notices contained -# within such NOTICE file, excluding those notices that do not -# pertain to any part of the Derivative Works, in at least one -# of the following places: within a NOTICE text file distributed -# as part of the Derivative Works; within the Source form or -# documentation, if provided along with the Derivative Works; or, -# within a display generated by the Derivative Works, if and -# wherever such third-party notices normally appear. 
The contents -# of the NOTICE file are for informational purposes only and -# do not modify the License. You may add Your own attribution -# notices within Derivative Works that You distribute, alongside -# or as an addendum to the NOTICE text from the Work, provided -# that such additional attribution notices cannot be construed -# as modifying the License. -# -# You may add Your own copyright statement to Your modifications and -# may provide additional or different license terms and conditions -# for use, reproduction, or distribution of Your modifications, or -# for any such Derivative Works as a whole, provided Your use, -# reproduction, and distribution of the Work otherwise complies with -# the conditions stated in this License. -# -# 5. Submission of Contributions. Unless You explicitly state otherwise, -# any Contribution intentionally submitted for inclusion in the Work -# by You to the Licensor shall be under the terms and conditions of -# this License, without any additional terms or conditions. -# Notwithstanding the above, nothing herein shall supersede or modify -# the terms of any separate license agreement you may have executed -# with Licensor regarding such Contributions. -# -# 6. Trademarks. This License does not grant permission to use the trade -# names, trademarks, service marks, or product names of the Licensor, -# except as required for reasonable and customary use in describing the -# origin of the Work and reproducing the content of the NOTICE file. -# -# 7. Disclaimer of Warranty. Unless required by applicable law or -# agreed to in writing, Licensor provides the Work (and each -# Contributor provides its Contributions) on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -# implied, including, without limitation, any warranties or conditions -# of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A -# PARTICULAR PURPOSE. You are solely responsible for determining the -# appropriateness of using or redistributing the Work and assume any -# risks associated with Your exercise of permissions under this License. -# -# 8. Limitation of Liability. In no event and under no legal theory, -# whether in tort (including negligence), contract, or otherwise, -# unless required by applicable law (such as deliberate and grossly -# negligent acts) or agreed to in writing, shall any Contributor be -# liable to You for damages, including any direct, indirect, special, -# incidental, or consequential damages of any character arising as a -# result of this License or out of the use or inability to use the -# Work (including but not limited to damages for loss of goodwill, -# work stoppage, computer failure or malfunction, or any and all -# other commercial damages or losses), even if such Contributor -# has been advised of the possibility of such damages. -# -# 9. Accepting Warranty or Additional Liability. While redistributing -# the Work or Derivative Works thereof, You may choose to offer, -# and charge a fee for, acceptance of support, warranty, indemnity, -# or other liability obligations and/or rights consistent with this -# License. However, in accepting such obligations, You may act only -# on Your own behalf and on Your sole responsibility, not on behalf -# of any other Contributor, and only if You agree to indemnify, -# defend, and hold each Contributor harmless for any liability -# incurred by, or claims asserted against, such Contributor by reason -# of your accepting any such warranty or additional liability. 
-# -# END OF TERMS AND CONDITIONS -# -# APPENDIX: How to apply the Apache License to your work. -# -# To apply the Apache License to your work, attach the following -# boilerplate notice, with the fields enclosed by brackets "[]" -# replaced with your own identifying information. (Don't include -# the brackets!) The text should be enclosed in the appropriate -# comment syntax for the file format. We also recommend that a -# file or class name and description of purpose be included on the -# same "printed page" as the copyright notice for easier -# identification within third-party archives. -# -# Copyright [yyyy] [name of copyright owner] -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - diff --git a/App_Function_Libraries/Web_Scraping/WebSearch_APIs.py b/App_Function_Libraries/Web_Scraping/WebSearch_APIs.py index 5c36a6a5..a463e923 100644 --- a/App_Function_Libraries/Web_Scraping/WebSearch_APIs.py +++ b/App_Function_Libraries/Web_Scraping/WebSearch_APIs.py @@ -46,34 +46,157 @@ def perform_websearch(search_engine, search_query, content_country, search_lang, output_lang, result_count, date_range=None, safesearch=None, site_blacklist=None, exactTerms=None, excludeTerms=None, filter=None, geolocation=None, search_result_language=None, sort_results_by=None): if search_engine.lower() == "baidu": - return search_web_baidu() + web_search_results = search_web_baidu(search_query, None, None) + process_results = process_web_search_results(web_search_results, "baidu") + elif search_engine.lower() == "bing": - return search_web_bing(search_query, search_lang, content_country, date_range, result_count) + web_search_results = search_web_bing(search_query, search_lang, content_country, date_range, result_count) + process_results = process_web_search_results(web_search_results, "bing") + elif search_engine.lower() == "brave": - return search_web_brave(search_query, content_country, search_lang, output_lang, result_count, safesearch, + web_search_results = search_web_brave(search_query, content_country, search_lang, output_lang, result_count, safesearch, site_blacklist, date_range) + process_results = process_web_search_results(web_search_results, "brave") + elif search_engine.lower() == "duckduckgo": - return search_web_duckduckgo(search_query, content_country, date_range, result_count) + web_search_results = search_web_duckduckgo(search_query, content_country, date_range, result_count) + process_results = process_web_search_results(web_search_results, "ddg") + elif search_engine.lower() == "google": - return search_web_google(search_query, result_count, content_country, date_range, exactTerms, + web_search_results = search_web_google(search_query, result_count, content_country, date_range, exactTerms, excludeTerms, filter, geolocation, output_lang, search_result_language, safesearch, site_blacklist, sort_results_by) + process_results = process_web_search_results(web_search_results, "google") + elif search_engine.lower() == "kagi": - return search_web_kagi(search_query, content_country, search_lang, 
output_lang, result_count, safesearch, date_range,
+                                         site_blacklist)
+        process_results = process_web_search_results(web_search_results, "kagi")
+
     elif search_engine.lower() == "serper":
-        return search_web_serper()
+        web_search_results = search_web_serper()
+        process_results = process_web_search_results(web_search_results, "serper")
+
     elif search_engine.lower() == "tavily":
-        return search_web_tavily(search_query, result_count, site_blacklist)
+        web_search_results = search_web_tavily(search_query, result_count, site_blacklist)
+        process_results = process_web_search_results(web_search_results, "tavily")
+
     elif search_engine.lower() == "searx":
-        return search_web_searx(search_query, language='auto', time_range='', safesearch=0, pageno=1, categories='general')
+        web_search_results = search_web_searx(search_query, language='auto', time_range='', safesearch=0, pageno=1, categories='general')
+        process_results = process_web_search_results(web_search_results, "searx")
+
     elif search_engine.lower() == "yandex":
-        return search_web_yandex()
+        web_search_results = search_web_yandex()
+        process_results = process_web_search_results(web_search_results, "yandex")
+
     else:
         return f"Error: Invalid Search Engine Name {search_engine}"
+    return process_results
-######################### Search Results Parsing #########################
+#
+######################### Search Result Parsing ##################################################################
+#
+
+def process_web_search_results(search_results: Dict, search_engine: str) -> Dict:
+    """
+    Processes search results from a search engine and formats them into a standardized dictionary structure.
+
+    Args:
+        search_results (Dict): The raw search results from the search engine.
+        search_engine (str): The name of the search engine (e.g., "Google", "Bing").
+
+    Returns:
+        Dict: A dictionary containing the processed search results in the specified structure.
+ """ + # Initialize the output dictionary with default values + web_search_results_dict = { + "search_engine": search_engine, + "search_query": search_results.get("search_query", ""), + "content_country": search_results.get("content_country", ""), + "search_lang": search_results.get("search_lang", ""), + "output_lang": search_results.get("output_lang", ""), + "result_count": search_results.get("result_count", 0), + "date_range": search_results.get("date_range", None), + "safesearch": search_results.get("safesearch", None), + "site_blacklist": search_results.get("site_blacklist", None), + "exactTerms": search_results.get("exactTerms", None), + "excludeTerms": search_results.get("excludeTerms", None), + "filter": search_results.get("filter", None), + "geolocation": search_results.get("geolocation", None), + "search_result_language": search_results.get("search_result_language", None), + "sort_results_by": search_results.get("sort_results_by", None), + "results": [], + "total_results_found": search_results.get("total_results_found", 0), + "search_time": search_results.get("search_time", 0.0), + "error": search_results.get("error", None), + "processing_error": None + } + try: + if search_engine.lower() == "baidu": + pass + elif search_engine.lower() == "bing": + parse_bing_results(search_results, web_search_results_dict) + pass + + elif search_engine.lower() == "brave": + parse_brave_results(search_results, web_search_results_dict) + pass + + elif search_engine.lower() == "duckduckgo": + parse_duckduckgo_results(search_results, web_search_results_dict) + pass + + elif search_engine.lower() == "google": + parse_google_results(search_results, web_search_results_dict) + pass + + elif search_engine.lower() == "kagi": + parse_kagi_results(search_results, web_search_results_dict) + pass + + elif search_engine.lower() == "serper": + parse_serper_results(search_results, web_search_results_dict) + pass + + elif search_engine.lower() == "tavily": + parse_tavily_results(search_results, web_search_results_dict) + pass + + elif search_engine.lower() == "searx": + parse_searx_results(search_results, web_search_results_dict) + pass + + elif search_engine.lower() == "yandex": + parse_yandex_results(search_results, web_search_results_dict) + pass + + else: + web_search_results_dict["processing_error"] = f"Error: Invalid Search Engine Name {search_engine}" + raise ValueError(f"Error: Invalid Search Engine Name {search_engine}") + except Exception as e: + web_search_results_dict["processing_error"] = f"Error processing search results: {str(e)}" + raise + + # Process individual search results + for result in search_results.get("results", []): + processed_result = { + "title": result.get("title", ""), + "url": result.get("url", ""), + "content": result.get("content", ""), + "metadata": { + "date_published": result.get("metadata", {}).get("date_published", None), + "author": result.get("metadata", {}).get("author", None), + "source": result.get("metadata", {}).get("source", None), + "language": result.get("metadata", {}).get("language", None), + "relevance_score": result.get("metadata", {}).get("relevance_score", None), + "snippet": result.get("metadata", {}).get("snippet", None) + } + } + web_search_results_dict["results"].append(processed_result) + + return web_search_results_dict + + def parse_html_search_results_generic(soup): results = [] for result in soup.find_all('div', class_='result'): @@ -96,7 +219,7 @@ def parse_html_search_results_generic(soup): # # https://cloud.baidu.com/doc/APIGUIDE/s/Xk1myz05f # 
https://oxylabs.io/blog/how-to-scrape-baidu-search-results -def search_web_baidu(): +def search_web_baidu(arg1, arg2, arg3): pass @@ -104,6 +227,9 @@ def test_baidu_search(arg1, arg2, arg3): result = search_web_baidu(arg1, arg2, arg3) return result +def search_parse_baidu_results(): + pass + ######################### Bing Search ######################### # @@ -157,11 +283,11 @@ def search_web_bing(search_query, bing_lang, bing_country, result_count=None, bi response = requests.get(search_url, headers=headers, params=params) response.raise_for_status() - print("Headers: ") - print(response.headers) + logging.debug("Headers: ") + logging.debug(response.headers) - print("JSON Response: ") - print(response.json()) + logging.debug("JSON Response: ") + logging.debug(response.json()) bing_search_results = response.json() return bing_search_results except Exception as ex: @@ -169,16 +295,85 @@ def search_web_bing(search_query, bing_lang, bing_country, result_count=None, bi def test_search_web_bing(): - search_query = "How can I bake a cherry cake" + search_query = "How can I get started learning machine learning?" bing_lang = "en" bing_country = "US" result_count = 10 bing_api_key = None date_range = None result = search_web_bing(search_query, bing_lang, bing_country, result_count, bing_api_key, date_range) + print("Bing Search Results:") print(result) +# FIXME - untested +def parse_bing_results(raw_results: Dict, output_dict: Dict) -> None: + """ + Parse Bing search results and update the output dictionary + + Args: + raw_results (Dict): Raw Bing API response + output_dict (Dict): Dictionary to store processed results + """ + try: + # Extract web pages results + if "webPages" in raw_results: + web_pages = raw_results["webPages"] + output_dict["total_results_found"] = web_pages.get("totalEstimatedMatches", 0) + + for result in web_pages.get("value", []): + processed_result = { + "title": result.get("name", ""), + "url": result.get("url", ""), + "content": result.get("snippet", ""), + "metadata": { + "date_published": None, # Bing doesn't typically provide this + "author": None, # Bing doesn't typically provide this + "source": result.get("displayUrl", None), + "language": None, # Could be extracted from result.get("language") if available + "relevance_score": None, # Could be calculated from result.get("rank") if available + "snippet": result.get("snippet", None) + } + } + output_dict["results"].append(processed_result) + + # Optionally process other result types + if "news" in raw_results: + for news_item in raw_results["news"].get("value", []): + processed_result = { + "title": news_item.get("name", ""), + "url": news_item.get("url", ""), + "content": news_item.get("description", ""), + "metadata": { + "date_published": news_item.get("datePublished", None), + "author": news_item.get("provider", [{}])[0].get("name", None), + "source": news_item.get("provider", [{}])[0].get("name", None), + "language": None, + "relevance_score": None, + "snippet": news_item.get("description", None) + } + } + output_dict["results"].append(processed_result) + + # Add spell suggestions if available + if "spellSuggestion" in raw_results: + output_dict["spell_suggestions"] = raw_results["spellSuggestion"] + + # Add related searches if available + if "relatedSearches" in raw_results: + output_dict["related_searches"] = [ + item.get("text", "") + for item in raw_results["relatedSearches"].get("value", []) + ] + + except Exception as e: + output_dict["processing_error"] = f"Error processing Bing results: {str(e)}" + + 
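+# Illustrative-only sketch of how parse_bing_results() is expected to slot into the
+# processing chain. The payload below is an assumed, trimmed shape of a Bing Web Search v7
+# response (not a captured one), and the helper name is hypothetical.
+def _example_parse_bing_results():
+    sample_response = {
+        "webPages": {
+            "totalEstimatedMatches": 1,
+            "value": [
+                {
+                    "name": "Intro to Machine Learning",
+                    "url": "https://example.com/ml",
+                    "snippet": "A beginner-friendly overview of machine learning.",
+                    "displayUrl": "example.com/ml",
+                }
+            ],
+        }
+    }
+    # Minimal output skeleton; the real caller builds the full standardized dict.
+    output_dict = {"results": [], "total_results_found": 0, "processing_error": None}
+    parse_bing_results(sample_response, output_dict)
+    assert output_dict["total_results_found"] == 1
+    assert output_dict["results"][0]["url"] == "https://example.com/ml"
+
+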
+def test_parse_bing_results(): + pass + + ######################### Brave Search ######################### # # https://brave.com/search/api/ @@ -238,6 +433,65 @@ def test_search_brave(): return result +# FIXME - untested +def parse_brave_results(raw_results: Dict, output_dict: Dict) -> None: + """ + Parse Brave search results and update the output dictionary + + Args: + raw_results (Dict): Raw Brave API response + output_dict (Dict): Dictionary to store processed results + """ + try: + # Extract query information + if "query" in raw_results: + query_info = raw_results["query"] + output_dict.update({ + "search_query": query_info.get("original", ""), + "content_country": query_info.get("country", ""), + "city": query_info.get("city", ""), + "state": query_info.get("state", ""), + "more_results_available": query_info.get("more_results_available", False) + }) + + # Process web results + if "web" in raw_results and "results" in raw_results["web"]: + for result in raw_results["web"]["results"]: + processed_result = { + "title": result.get("title", ""), + "url": result.get("url", ""), + "content": result.get("description", ""), + "metadata": { + "date_published": result.get("page_age", None), + "author": None, + "source": result.get("profile", {}).get("name", None), + "language": result.get("language", None), + "relevance_score": None, + "snippet": result.get("description", None), + "family_friendly": result.get("family_friendly", None), + "type": result.get("type", None), + "subtype": result.get("subtype", None), + "thumbnail": result.get("thumbnail", {}).get("src", None) + } + } + output_dict["results"].append(processed_result) + + # Update total results count + if "mixed" in raw_results: + output_dict["total_results_found"] = len(raw_results["mixed"].get("main", [])) + + # Set family friendly status + if "mixed" in raw_results: + output_dict["family_friendly"] = raw_results.get("family_friendly", True) + + except Exception as e: + output_dict["processing_error"] = f"Error processing Brave results: {str(e)}" + raise + +def test_parse_brave_results(): + pass + + ######################### DuckDuckGo Search ######################### # # https://github.com/deedy5/duckduckgo_search @@ -351,6 +605,81 @@ def test_search_duckduckgo(): print(f"Request error: {str(e)}") +def parse_duckduckgo_results(raw_results: Dict, output_dict: Dict) -> None: + """ + Parse DuckDuckGo search results and update the output dictionary + + Args: + raw_results (Dict): Raw DuckDuckGo response + output_dict (Dict): Dictionary to store processed results + """ + try: + # DuckDuckGo results appear to be in a simple list format + # Each result is separated by "---" + results = raw_results.get("results", []) + + for result in results: + # Extract information using the consistent format in results + title = "" + url = "" + snippet = "" + + # Parse the result text + lines = result.split('\n') + for line in lines: + if line.startswith("Title: "): + title = line.replace("Title: ", "").strip() + elif line.startswith("URL: "): + url = line.replace("URL: ", "").strip() + elif line.startswith("Snippet: "): + snippet = line.replace("Snippet: ", "").strip() + + processed_result = { + "title": title, + "url": url, + "content": snippet, + "metadata": { + "date_published": None, # DuckDuckGo doesn't typically provide this + "author": None, # DuckDuckGo doesn't typically provide this + "source": extract_domain(url) if url else None, + "language": None, # DuckDuckGo doesn't typically provide this + "relevance_score": None, # DuckDuckGo doesn't 
typically provide this + "snippet": snippet + } + } + + output_dict["results"].append(processed_result) + + # Update total results count + output_dict["total_results_found"] = len(output_dict["results"]) + + except Exception as e: + output_dict["processing_error"] = f"Error processing DuckDuckGo results: {str(e)}" + + +def extract_domain(url: str) -> str: + """ + Extract domain name from URL + + Args: + url (str): Full URL + + Returns: + str: Domain name + """ + try: + from urllib.parse import urlparse + parsed_uri = urlparse(url) + domain = parsed_uri.netloc + return domain.replace('www.', '') + except: + return url + +def test_parse_duckduckgo_results(): + pass + + + ######################### Google Search ######################### # # https://developers.google.com/custom-search/v1/reference/rest/v1/cse/list @@ -503,47 +832,184 @@ def test_search_google(): print(result) +# FIXME - untested +def parse_google_results(raw_results: Dict, output_dict: Dict) -> None: + """ + Parse Google Custom Search API results and update the output dictionary + + Args: + raw_results (Dict): Raw Google API response + output_dict (Dict): Dictionary to store processed results + """ + try: + # Extract search information + if "searchInformation" in raw_results: + search_info = raw_results["searchInformation"] + output_dict["total_results_found"] = int(search_info.get("totalResults", "0")) + output_dict["search_time"] = search_info.get("searchTime", 0.0) + + # Extract spelling suggestions + if "spelling" in raw_results: + output_dict["spell_suggestions"] = raw_results["spelling"].get("correctedQuery") + + # Extract search parameters from queries + if "queries" in raw_results and "request" in raw_results["queries"]: + request = raw_results["queries"]["request"][0] + output_dict.update({ + "search_query": request.get("searchTerms", ""), + "search_lang": request.get("language", ""), + "result_count": request.get("count", 0), + "safesearch": request.get("safe", None), + "exactTerms": request.get("exactTerms", None), + "excludeTerms": request.get("excludeTerms", None), + "filter": request.get("filter", None), + "geolocation": request.get("gl", None), + "search_result_language": request.get("hl", None), + "sort_results_by": request.get("sort", None) + }) + + # Process search results + if "items" in raw_results: + for item in raw_results["items"]: + processed_result = { + "title": item.get("title", ""), + "url": item.get("link", ""), + "content": item.get("snippet", ""), + "metadata": { + "date_published": item.get("pagemap", {}).get("metatags", [{}])[0].get( + "article:published_time"), + "author": item.get("pagemap", {}).get("metatags", [{}])[0].get("article:author"), + "source": item.get("displayLink", None), + "language": item.get("language", None), + "relevance_score": None, # Google doesn't provide this directly + "snippet": item.get("snippet", None), + "file_format": item.get("fileFormat", None), + "mime_type": item.get("mime", None), + "cache_url": item.get("cacheId", None) + } + } + + # Extract additional metadata if available + if "pagemap" in item: + pagemap = item["pagemap"] + if "metatags" in pagemap and pagemap["metatags"]: + metatags = pagemap["metatags"][0] + processed_result["metadata"].update({ + "description": metatags.get("og:description", + metatags.get("description")), + "keywords": metatags.get("keywords"), + "site_name": metatags.get("og:site_name") + }) + + output_dict["results"].append(processed_result) + + # Add pagination information + output_dict["pagination"] = { + "has_next": "nextPage" in 
raw_results.get("queries", {}), + "has_previous": "previousPage" in raw_results.get("queries", {}), + "current_page": raw_results.get("queries", {}).get("request", [{}])[0].get("startIndex", 1) + } + + except Exception as e: + output_dict["processing_error"] = f"Error processing Google results: {str(e)}" + +def test_parse_google_results(): + pass + + + ######################### Kagi Search ######################### # # https://help.kagi.com/kagi/api/search.html -def search_web_kagi(search_term, country, search_lang, ui_lang, result_count, safesearch="moderate", date_range=None, - result_filter=None, kagi_api_key=None): - search_url = "https://api.search.brave.com/res/v1/web/search" +def search_web_kagi(query: str, limit: int = 10) -> Dict: + search_url = "https://kagi.com/api/v0/search" + # load key from config file + kagi_api_key = loaded_config_data['search_engines']['kagi_search_api_key'] if not kagi_api_key: - # load key from config file - if not kagi_api_key: - raise ValueError("Please provide a valid Kagi Search API subscription key") - if not country: - country = "US" - if not search_lang: - search_lang = "en" - if not ui_lang: - ui_lang = "en" - if not result_count: - result_count = 10 - # if not date_range: - # date_range = "month" - if not result_filter: - result_filter = "webpages" + raise ValueError("Please provide a valid Kagi Search API subscription key") - headers = {"Authorization: Bot " + kagi_api_key} + """ + Queries the Kagi Search API with the given query and limit. + """ + if kagi_api_key is None: + raise ValueError("API key is required.") - # https://api.search.brave.com/app/documentation/web-search/query#WebSearchAPIQueryParameters - params = {"q": search_term, "textDecorations": True, "textFormat": "HTML", "count": result_count, - "freshness": date_range, "promote": "webpages", "safeSearch": "Moderate"} + headers = {"Authorization": f"Bot {kagi_api_key}"} + endpoint = f"{search_url}/search" + params = {"q": query, "limit": limit} - response = requests.get(search_url, headers=headers, params=params) + response = requests.get(endpoint, headers=headers, params=params) response.raise_for_status() - # Response: https://api.search.brave.com/app/documentation/web-search/responses#WebSearchApiResponse - kagi_search_results = response.json() - return kagi_search_results + print(response.json()) + return response.json() def test_search_kagi(): + search_term = "How can I bake a cherry cake" + result_count = 10 + result = search_web_kagi(search_term, result_count) + print(result) + + +def parse_kagi_results(raw_results: Dict, output_dict: Dict) -> None: + """ + Parse Kagi search results and update the output dictionary + + Args: + raw_results (Dict): Raw Kagi API response + output_dict (Dict): Dictionary to store processed results + """ + try: + # Extract metadata + if "meta" in raw_results: + meta = raw_results["meta"] + output_dict["search_time"] = meta.get("ms", 0) / 1000.0 # Convert to seconds + output_dict["api_balance"] = meta.get("api_balance") + output_dict["search_id"] = meta.get("id") + output_dict["node"] = meta.get("node") + + # Process search results + if "data" in raw_results: + for item in raw_results["data"]: + # Skip related searches (type 1) + if item.get("t") == 1: + output_dict["related_searches"] = item.get("list", []) + continue + + # Process regular search results (type 0) + if item.get("t") == 0: + processed_result = { + "title": item.get("title", ""), + "url": item.get("url", ""), + "content": item.get("snippet", ""), + "metadata": { + "date_published": 
item.get("published"), + "author": None, # Kagi doesn't typically provide this + "source": None, # Could be extracted from URL if needed + "language": None, # Kagi doesn't typically provide this + "relevance_score": None, + "snippet": item.get("snippet"), + "thumbnail": item.get("thumbnail", {}).get("url") if "thumbnail" in item else None + } + } + output_dict["results"].append(processed_result) + + # Update total results count + output_dict["total_results_found"] = len([ + item for item in raw_results["data"] + if item.get("t") == 0 + ]) + + except Exception as e: + output_dict["processing_error"] = f"Error processing Kagi results: {str(e)}" + + +def test_parse_kagi_results(): pass + ######################### SearX Search ######################### # # https://searx.space @@ -612,6 +1078,14 @@ def test_search_searx(): print(result) pass +def parse_searx_results(searx_search_results, web_search_results_dict): + pass + +def test_parse_searx_results(): + pass + + + ######################### Serper.dev Search ######################### # @@ -623,6 +1097,11 @@ def search_web_serper(): def test_search_serper(): pass +def parse_serper_results(serper_search_results, web_search_results_dict): + pass + + + ######################### Tavily Search ######################### # @@ -664,6 +1143,17 @@ def test_search_tavily(): result = search_web_tavily("How can I bake a cherry cake?") print(result) + +def parse_tavily_results(tavily_search_results, web_search_results_dict): + pass + + +def test_parse_tavily_results(): + pass + + + + ######################### Yandex Search ######################### # # https://yandex.cloud/en/docs/search-api/operations/web-search @@ -677,6 +1167,10 @@ def search_web_yandex(): def test_search_yandex(): pass +def parse_yandex_results(yandex_search_results, web_search_results_dict): + pass + + # # End of WebSearch_APIs.py ####################################################################################################################### diff --git a/Docs/Design/Education.md b/Docs/Design/Education.md index a07a32ad..7def7431 100644 --- a/Docs/Design/Education.md +++ b/Docs/Design/Education.md @@ -9,3 +9,24 @@ https://arxiv.org/abs/2411.07407 https://arxiv.org/abs/2412.16429 https://huggingface.co/papers/2412.15443 + + + + + +one2manny + — +Today at 12:43 AM +A great way to make studying more efficient and convenient is to take a digital PDF textbook, split it into separate files for each chapter, and organize them individually. +I then create a dedicated notebook for each chapter, treating it as a focused single source. +From there, I convert each chapter into an audio format, like a podcast. +This approach makes it easy to study while commuting, relaxing in bed with your eyes closed, or at any time when reading isn’t practical. + +I also recommend creating a study guide for each chapter, fully breaking down key concepts and definitions. +For more complex topics, the “explain like I’m 5” method works wonders—it simplifies challenging ideas into digestible explanations. + +To take this further, incorporate a Personal Knowledge Management (PKM) system into your routine. +Apps like Obsidian are perfect for this, with their flexible folder structures and Markdown formatting. +I optimize my AI outputs for Markdown so I can copy, paste, and organize them into clean, structured notes. +This ensures your materials are not only well-organized but also easy to access and build on later. +A solid PKM system is invaluable for managing knowledge and staying on top of your studies! 
\ No newline at end of file diff --git a/Docs/Design/Researcher.md b/Docs/Design/Researcher.md index c1d01cc0..1ccbb7fa 100644 --- a/Docs/Design/Researcher.md +++ b/Docs/Design/Researcher.md @@ -82,6 +82,14 @@ search_blacklist_URLs = "URL1,URL2,URL3" ``` +Perplexica + https://github.com/ItzCrazyKns/Perplexica/blob/master/src/search/metaSearchAgent.ts + https://github.com/ItzCrazyKns/Perplexica/blob/master/src/chains/suggestionGeneratorAgent.ts + https://github.com/ItzCrazyKns/Perplexica/blob/master/src/chains/imageSearchAgent.ts + https://github.com/ItzCrazyKns/Perplexica/blob/master/src/search/metaSearchAgent.ts + +Falle + https://github.com/rashadphz/farfalle/blob/main/src/backend/agent_search.py ### Link Dump: @@ -91,6 +99,9 @@ Articles https://docs.gptr.dev/docs/gpt-researcher/context/tailored-research# https://docs.gptr.dev/docs/gpt-researcher/gptr/pip-package +Standford STORM + https://arxiv.org/abs/2402.14207# + https://storm.genie.stanford.edu/ https://github.com/assafelovic/gpt-researcher https://arxiv.org/abs/2411.15114 @@ -123,7 +134,6 @@ https://github.com/faraz18001/Sales-Llama https://github.com/memgraph/memgraph https://github.com/rashadphz/farfalle/tree/main/src/backend https://github.com/TheBlewish/Automated-AI-Web-Researcher-Ollama/blob/main/Self_Improving_Search.py -https://storm.genie.stanford.edu/ https://github.com/TheBlewish/Automated-AI-Web-Researcher-Ollama @@ -133,305 +143,44 @@ https://github.com/TheBlewish/Automated-AI-Web-Researcher-Ollama ### Researcher Prompts +https://github.com/cbuccella/perplexity_research_prompt +https://github.com/rashadphz/farfalle/blob/main/src/backend/prompts.py - - - +https://github.com/ItzCrazyKns/Perplexica/tree/master/src/prompts https://github.com/SakanaAI/AI-Scientist - -one2manny - — -Today at 12:43 AM -A great way to make studying more efficient and convenient is to take a digital PDF textbook, split it into separate files for each chapter, and organize them individually. I then create a dedicated notebook for each chapter, treating it as a focused single source. From there, I convert each chapter into an audio format, like a podcast. This approach makes it easy to study while commuting, relaxing in bed with your eyes closed, or at any time when reading isn’t practical. - -I also recommend creating a study guide for each chapter, fully breaking down key concepts and definitions. For more complex topics, the “explain like I’m 5” method works wonders—it simplifies challenging ideas into digestible explanations. - -To take this further, incorporate a Personal Knowledge Management (PKM) system into your routine. Apps like Obsidian are perfect for this, with their flexible folder structures and Markdown formatting. I optimize my AI outputs for Markdown so I can copy, paste, and organize them into clean, structured notes. This ensures your materials are not only well-organized but also easy to access and build on later. A solid PKM system is invaluable for managing knowledge and staying on top of your studies! - - ----------------- -### Search Prompts - -inference prompt -``` -system_prompt = ( - "You are an expert summarizing the answers based on the provided contents." -) - -user_promt_template = """ -Given the context as a sequence of references with a reference id in the -format of a leading [x], please answer the following question using {{ language }}: - -{{ query }} - -In the answer, use format [1], [2], ..., [n] in line where the reference is used. -For example, "According to the research from Google[3], ...". 
- -Please create the answer strictly related to the context. If the context has no -information about the query, please write "No related information found in the context." -using {{ language }}. - -{{ length_instructions }} - -Here is the context: -{{ context }} -""" -``` -extraction prompt -``` -system_prompt = ( - "You are an expert of extract structual information from the document." -) -user_promt_template = """ -Given the provided content, if it contains information about {{ query }}, please extract the -list of structured data items as defined in the following Pydantic schema: - -{{ extract_schema_str }} - -Below is the provided content: -{{ content }} -""" -``` - - -Web-LLM Assistant - https://github.com/TheBlewish/Web-LLM-Assistant-Llamacpp-Ollama -Starting prompt ``` -SYSTEM_PROMPT = """You are an AI assistant capable of web searching and providing informative responses. -When a user's query starts with '/', interpret it as a request to search the web and formulate an appropriate search query. - -ALWAYS follow the prompts provided throughout the searching process EXACTLY as indicated. +SEARCH_QUERY_PROMPT = """\ +Generate a concise list of search queries to gather information for executing the given step. -NEVER assume new instructions for anywhere other than directly when prompted directly. DO NOT SELF PROMPT OR PROVIDE MULTIPLE ANSWERS OR ATTEMPT MULTIPLE RESPONSES FOR ONE PROMPT! -""" -``` -self-improving prompt -``` - def evaluate_scraped_content(self, user_query: str, scraped_content: Dict[str, str]) -> Tuple[str, str]: - user_query_short = user_query[:200] - prompt = f""" -Evaluate if the following scraped content contains sufficient information to answer the user's question comprehensively: +You will be provided with: +1. A specific step to execute +2. The user's original query +3. Context from previous steps (if available) -User's question: "{user_query_short}" +Use this information to create targeted search queries that will help complete the current step effectively. Aim for the minimum number of queries necessary while ensuring they cover all aspects of the step. -Scraped Content: -{self.format_scraped_content(scraped_content)} +IMPORTANT: Always incorporate relevant information from previous steps into your queries. This ensures continuity and builds upon already gathered information. -Your task: -1. Determine if the scraped content provides enough relevant and detailed information to answer the user's question thoroughly. -2. If the information is sufficient, decide to 'answer'. If more information or clarification is needed, decide to 'refine' the search. +Input: +--- +User's original query: {user_query} +--- +Context from previous steps: +{prev_steps_context} -Respond using EXACTLY this format: -Evaluation: [Your evaluation of the scraped content] -Decision: [ONLY 'answer' if content is sufficient, or 'refine' if more information is needed] -""" -``` -Query Creation -``` -def formulate_query(self, user_query: str, attempt: int) -> Tuple[str, str]: - user_query_short = user_query[:200] - prompt = f""" -Based on the following user question, formulate a concise and effective search query: -"{user_query_short}" Your task: -1. Create a search query of 2-5 words that will yield relevant results. -2. Determine if a specific time range is needed for the search. -Time range options: -- 'd': Limit results to the past day. Use for very recent events or rapidly changing information. -- 'w': Limit results to the past week. Use for recent events or topics with frequent updates. 
-- 'm': Limit results to the past month. Use for relatively recent information or ongoing events. -- 'y': Limit results to the past year. Use for annual events or information that changes yearly. -- 'none': No time limit. Use for historical information or topics not tied to a specific time frame. -Respond in the following format: -Search query: [Your 2-5 word query] -Time range: [d/w/m/y/none] -Do not provide any additional information or explanation. +1. Analyze the current step and its requirements +2. Consider the user's original query and any relevant previous context +3. Consider the user's original query +4. Generate a list of specific, focused search queries that: + - Incorporate relevant information from previous steps + - Address the requirements of the current step + - Build upon the information already gathered +--- +Current step to execute: {current_step} +--- + +Your search queries based: """ -``` -Select relevant content -``` -def select_relevant_pages(self, search_results: List[Dict], user_query: str) -> List[str]: - prompt = f""" -Given the following search results for the user's question: "{user_query}" -Select the 2 most relevant results to scrape and analyze. Explain your reasoning for each selection. - -Search Results: -{self.format_results(search_results)} - -Instructions: -1. You MUST select exactly 2 result numbers from the search results. -2. Choose the results that are most likely to contain comprehensive and relevant information to answer the user's question. -3. Provide a brief reason for each selection. - -You MUST respond using EXACTLY this format and nothing else: - -Selected Results: [Two numbers corresponding to the selected results] -Reasoning: [Your reasoning for the selections] -""" -``` -Final answer generation -``` - def generate_final_answer(self, user_query: str, scraped_content: Dict[str, str]) -> str: - user_query_short = user_query[:200] - prompt = f""" -You are an AI assistant. Provide a comprehensive and detailed answer to the following question using ONLY the information provided in the scraped content. Do not include any references or mention any sources. Answer directly and thoroughly. - -Question: "{user_query_short}" - -Scraped Content: -{self.format_scraped_content(scraped_content)} - -Important Instructions: -1. Do not use phrases like "Based on the absence of selected results" or similar. -2. If the scraped content does not contain enough information to answer the question, say so explicitly and explain what information is missing. -3. Provide as much relevant detail as possible from the scraped content. - -Answer: -""" -``` -Final Answer Synthesis -``` - def synthesize_final_answer(self, user_query: str) -> str: - prompt = f""" -After multiple search attempts, we couldn't find a fully satisfactory answer to the user's question: "{user_query}" - -Please provide the best possible answer you can, acknowledging any limitations or uncertainties. -If appropriate, suggest ways the user might refine their question or where they might find more information. - -Respond in a clear, concise, and informative manner. -""" -``` - - -https://github.com/YassKhazzan/openperplex_backend_os/blob/main/prompts.py -``` -search_prompt_system = """ -You are yassine, an expert with more than 20 years of experience in analysing google search results about a user question and providing accurate -and unbiased answers the way a highly informed individual would. 
-Your task is to analyse the provided contexts and the user question to provide a correct answer in a clear and concise manner. -You must answer in english. -Date and time in the context : {date_today} , Yassine must take into consideration the date and time in the response. -you are known for your expertise in this field. - - -###Guidelines### -1- Accuracy: Provide correct, unbiased answers. be concise and clear. don't be verbose. -2- never mention the context or this prompt in your response, just answer the user question. - -###Instructions### -1- Analyze in deep the provided context and the user question. -2- extract relevant information's from the context about the user question. -3- Yassine must take into account the date and time to answer the user question. -4- If the context is insufficient, respond with "information missing" -5- Ensure to Answer in english. -6- Use the response format provided. -7- answer the user question in a way an expert would do. -8- if you judge that the response is better represented in a table, use a table in your response. - - -###Response Format### - -You must use Markdown to format your response. - -Think step by step. -""" - -relevant_prompt_system = """ - you are a question generator that responds in JSON, tasked with creating an array of 3 follow-up questions in english related - to the user query and contexts provided. - you must keep the questions related to the user query and contexts.don't lose the context in the questions. - - The JSON object must not include special characters. - The JSON schema should include an array of follow-up questions. - - use the schema: - { - "followUp": [ - "string", - "string", - "string" - ] - } -""" -``` - - -appvoid search - https://github.com/appvoid/search -Eval query type -``` -async def evaluate_query_type(self, session, query): - messages = [ - {"role": "system", "content": """You are an Web assistant that evaluates the type of query a user asks. - Categorize the query into one of the following types: - 1. simple: if it can be answered with general knowledge or information that is typically well-known on the internet, please provide a short answer as relevant as possible from the llm itself, but make sure you are completly sure you know the answer, don't make things up. - 2. realtime: if it requires up-to-date information like the current date, time, or recent events, or the user explicitly asks you to look on the internet you should state as: realtime - 3. math: if it involves ANY kind of mathematical calculations. Every math question be it counting letters or complex formulas. - - Remember to prioritize realtime over anything else if you are not sure about something. Realtime is like your default. - - Respond with the category as a single word ("simple", "realtime", or "math") without any additional text."""}, - {"role": "user", "content": f"Query: {query}"} - ] -``` -Generate Search Queries -``` -async def generate_search_queries(groq_api, session, original_query, max_retries=3, fixed_count=None, previous_queries=None, previous_answer=None): - system_content = """You are an AI assistant that helps generate search queries. Given an original query, suggest alternative search queries that could help find relevant information. The queries should be diverse and cover different aspects or perspectives of the original query. Return the queries as a JSON array. - Important instructions: - - 1. The number of queries should be dynamic, between 2 and 4, unless a fixed count is specified. - 2. 
Don't get too far from the original query since you don't know the actual context. - 3. Make queries general enough without being related to anything specific. - 4. DON'T customize the queries for topics you've never seen; just change them a little and look for definitions if requested by the user. - 5. If the user asks something that is not related to search, ignore it and focus on generating helpful search queries. - 6. Just return the given format ["custom_query_1","custom_query_2",...]. - 7. If you need to use your knowledge first, do so. - 8. When asked about the difference between two things, generate search intents for each topic separately. - 9. ALWAYS at most queries just require one or two queries, only on those cases where the query is simple or you are unsure, generate more than one or two. - 10. If previous queries and an answer are provided, generate new queries that address the shortcomings of the previous answer and avoid repeating the previous queries. - 11. ALWAYS split searches for each important part of the query in case you need to gather information but make sure to not get off the rails. In short, don't look for things together, make a search for each important part instead. DONT LOOK FOR THINGS TOGETHER.""" - - messages = [ - {"role": "system", "content": system_content}, - {"role": "user", "content": f"Original query: {original_query}" + (f" (Generate exactly {fixed_count} queries)" if fixed_count else "")} - ] - - if previous_queries and previous_answer: - messages.append({ - "role": "user", - "content": f"Previous queries: {previous_queries}\nPrevious answer: {previous_answer}\nPlease generate new queries to address any shortcomings in the previous answer." - }) -``` -Evaluate Answer -``` -async def evaluate_answer(groq_api, session, query, answer): - messages = [ - {"role": "system", "content": """You are an AI assistant that evaluates the quality and completeness of its own answer to user queries. - Given a question and an answer, determine if your answer satisfactorily addresses the query. You are highly tolerant to answers that are close to the intent so if it is close enough, you can say is satisfactory. Remember, if it's close enough, mark it as satisfactory. - Respond with a JSON object containing two fields: - 1. "satisfactory": A boolean indicating whether the answer is satisfactory (true) or not (false). - 2. "reason": A brief explanation of why your thought is or is not satisfactory. Like "I will keep looking for information since last thought is not addressing the query because..." or "Let look for something different. My last search didn't solve the query. The reason is..." or "I found the right answer so I can ignore this..."."""}, - {"role": "user", "content": f"Query: {query}\nAnswer: {answer}"} - ] -``` -Eval best answer -``` -async def evaluate_best_answer(groq_api, session, query, cached_answers): - print('Answers pool > ', cached_answers) - messages = [ - {"role": "system", "content": """You are an assistant that evaluates multiple answers to a query and selects the best one based on relevance and completeness. - Given a query and a list of answers, choose the answer that best addresses the query. Respond with the best answer. Don't need to mention the word answers at all just be natural. Don't "the best answer" or things like that. 
Just provide the best one."""}, - {"role": "user", "content": f"Query: {query}\nAnswers: {json.dumps(cached_answers)}"} - ] -``` -Summarization -``` -messages = [ - {"role": "system", "content": """You are a web assistant that helps users find information from web search results. -Given a question and a set of search results, provide a concise response based on the information -available in the search results. If the information is not available in the search results, -state that you don't have enough information to answer the question. You MUST not comment on anything, just follow the instruction. Don't add additional details about anything."""}, - {"role": "user", "content": f"Question: {query}\nSearch Results: {json.dumps(all_results)}"} -] ``` \ No newline at end of file diff --git a/Docs/Design/TTS_STT.md b/Docs/Design/TTS_STT.md index 64a5f473..ff3b554a 100644 --- a/Docs/Design/TTS_STT.md +++ b/Docs/Design/TTS_STT.md @@ -50,8 +50,14 @@ https://github.com/microsoft/SpeechT5 https://github.com/smellslikeml/dolla_llama +Coqui TTS + https://github.com/idiap/coqui-ai-TTS +Cartesia + https://docs.cartesia.ai/get-started/make-an-api-request +F5 TTS + https://github.com/SWivid/F5-TTS Podcastfy diff --git a/Docs/Design/WebSearch.md b/Docs/Design/WebSearch.md index f817eda0..e432cf7e 100644 --- a/Docs/Design/WebSearch.md +++ b/Docs/Design/WebSearch.md @@ -3,10 +3,40 @@ ## Introduction This page serves as documentation regarding the web search functionality within tldw and provides context/justification for the decisions made within the module. + +Pipeline: +1. User posts question + - Gradio/UI/API +2. Question is analyzed + - Question is analyzed to identify most likely purpose/goal of question, and Sub-questions are generated to support this + - User has option of seeing/modifying prompt used for Analysis/sub-question creation +3. Search(es) is/are performed - User toggled + - Search is performed using the user's question and sub-questions +4. Results are collected, stored, and analyzed + - Results are collected, stored in a temp 'search_results' dict, and analyzed for relevance, based on initial snippet(? or full page?) + - User has the option of seeing all results, or only relevant results + - User has the option to select which results are 'relevant', + - User also has the option to select which 'relevant' results are used to answer the question +5. Relevant results are added to result dictionary + - Results determined to be relevant are then stored in a 'relevant_results' dictionary, and the process is repeated until all results are analyzed/limit is hit. +6. Once all results are collected, they are then used to answer the user's question/sub-questions + - The relevant results are then used to answer the user's question/sub-questions + - Each result is first abstract summarized, FIXME +7. The final answer/'briefing' is then presented to the user +8. User has the option to save the results to the DB +9. User has the option to ask follow-up questions / see potential other questions + + + + + ---------------- ### Setting the Stage +- The goal of this module is to provide a simple, easy-to-use interface for searching the web and retrieving results. - All the web searches are simple HTTP requests to an API or to the direct endpoint and then scraping the results. -- Parsing results is TODO. +- Results are then reviewed for relevancy, if relevant, the full page is fetched and analyzed. 
+- The results are then stored in a dictionary, and the process is repeated until all results are analyzed/limit is hit. +- Once all results are collected, they are then operated on, being used to create whatever final product is desired by the user. - The goal is to provide a simple, easy-to-use interface for searching the web and retrieving results. - Other modules are responsible for anything else, this module just performs the search, and delivers the results. - **Main Function:** diff --git a/Server_API/API_README.md b/Server_API/API_README.md new file mode 100644 index 00000000..7861c896 --- /dev/null +++ b/Server_API/API_README.md @@ -0,0 +1,134 @@ +# API Documentation + +## Overview + +API uses FastAPI to provide a RESTful interface to the backend services. The API is designed to be simple and easy to use, with a focus on providing a clean interface for the frontend to interact with. + +- **URLs** + - Main page: http://127.0.0.1:8000 + - API Documentation page: http://127.0.0.1:8000/docs + + + +## Endpoints + + + +``` +Here’s the important part. We’ll create: + + A global asyncio.Queue of “write tasks.” + A WriteTask class that holds the SQL, parameters, and an asyncio.Future to signal completion. + A background worker (writer_worker) that pops tasks from the queue, executes them, and sets the result in the Future. + Endpoints that push a WriteTask onto the queue, then await the Future before returning. + +# main.py +import asyncio +from fastapi import FastAPI, HTTPException +from pydantic import BaseModel +from typing import Any, Tuple, Union + +from database import get_db_connection + +app = FastAPI() + +# ----------------------------- +# 1) A global queue + task class +# ----------------------------- +class WriteTask: + """Holds SQL, parameters, and a Future to let the enqueuing code wait for completion.""" + def __init__(self, sql: str, params: tuple[Any, ...]): + self.sql = sql + self.params = params + self.future: asyncio.Future = asyncio.get_event_loop().create_future() + +write_queue: asyncio.Queue[WriteTask] = asyncio.Queue() + + +# ----------------------------- +# 2) The background worker +# ----------------------------- +async def writer_worker(): + """Continuously processes write tasks from the queue, one at a time.""" + while True: + task: WriteTask = await write_queue.get() + try: + # Perform the write + with get_db_connection() as conn: + conn.execute(task.sql, task.params) + conn.commit() + + # If success, set the result of the Future + task.future.set_result(True) + except Exception as e: + # If failure, set the exception so the caller can handle it + task.future.set_exception(e) + finally: + write_queue.task_done() + + +# ----------------------------- +# 3) Start the worker on startup +# ----------------------------- +@app.on_event("startup") +async def startup_event(): + # Launch the writer worker as a background task + asyncio.create_task(writer_worker()) + + +# ----------------------------- +# 4) Pydantic model for input +# ----------------------------- +class ItemCreate(BaseModel): + name: str + + +# ----------------------------- +# 5) Write endpoint (POST) +# ----------------------------- +@app.post("/items") +async def create_item(item: ItemCreate): + """Queue a write to the database, then wait for its completion.""" + sql = "INSERT INTO items (name) VALUES (?)" + params = (item.name,) + + # Create a WriteTask + write_task = WriteTask(sql, params) + + # Put the task in the queue + await write_queue.put(write_task) + + # Wait for the task to complete + try: + 
result = await write_task.future # This will be True if successful + return {"status": "success", "name": item.name} + except Exception as exc: + # If the DB write failed for some reason, raise a 500 + raise HTTPException(status_code=500, detail=str(exc)) + + +# ----------------------------- +# 6) Read endpoint (GET) +# ----------------------------- +@app.get("/items") +def read_items(): + """Simple read operation that does not need the queue.""" + with get_db_connection() as conn: + cursor = conn.cursor() + cursor.execute("SELECT id, name FROM items") + rows = cursor.fetchall() + return [{"id": row[0], "name": row[1]} for row in rows] + +Explanation + + WriteTask stores (sql, params, future). The future is how we pass success/failure back to the original request. + When a request hits POST /items, we: + Construct a WriteTask. + put() it on the write_queue. + Immediately await write_task.future. We don’t return until the DB operation is done. + The writer_worker loop picks tasks in FIFO order and executes them one-by-one, guaranteeing no concurrency for writes (thus avoiding locks). + On success, task.future.set_result(True) is called. On failure, task.future.set_exception(e). + The awaiting endpoint sees either a success (and returns HTTP 200) or an exception (and returns HTTP 500). + + This pattern means each request is effectively serialized for writes, but the user still gets a definitive success/failure response in the same request/response cycle. +``` \ No newline at end of file diff --git a/Server_API/app/main.py b/Server_API/app/main.py index 7b8a6591..07a90a5a 100644 --- a/Server_API/app/main.py +++ b/Server_API/app/main.py @@ -1,7 +1,22 @@ +# main.py +# Description: This file contains the main FastAPI application, which serves as the primary API for the tldw application. +# +# Imports +# +# 3rd-party Libraries from fastapi import FastAPI +# +# Local Imports +# +######################################################################################################################## +# +# Functions: -app = FastAPI(title="TLDW API", version="1.0.0") +# Usage: uvicorn main:app --reload +app = FastAPI(title="tldw API", version="1.0.0") @app.get("/") async def root(): return {"message": "Welcome to the tldw API"} + +
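+
+# A minimal convenience sketch (suggested addition, not part of the original change): lets the
+# API be started with `python main.py` as an alternative to the `uvicorn main:app --reload`
+# command noted above. Assumes uvicorn is installed alongside FastAPI.
+if __name__ == "__main__":
+    import uvicorn
+    uvicorn.run("main:app", host="127.0.0.1", port=8000, reload=True)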