Commit 11814d1: search APIs
rmusser01 committed Dec 27, 2024 (1 parent: 30b9a4c)
Showing 13 changed files with 116 additions and 73 deletions.

@@ -0,0 +1,13 @@
pipeline:
params:
clustering: AgglomerativeClustering
embedding: /FULL/PATH/TO/SCRIPT/tldw/App_Function_Libraries/models/pyannote_model_wespeaker-voxceleb-resnet34-LM.bin #models/pyannote_model_wespeaker-voxceleb-resnet34-LM.bin
segmentation: /FULL/PATH/TO/SCRIPT/tldw/App_Function_Libraries/models/pyannote_model_segmentation-3.0.bin #models/pyannote_model_segmentation-3.0.bin

params:
segmentation:
min_duration_off: 0.0
clustering:
method: centroid
min_cluster_size: 12
threshold: 0.7045654963945799
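
This YAML is in pyannote.audio's pipeline-config format, pointing the segmentation and embedding stages at local model files so diarization can run offline. As a rough sketch (not part of the commit) of how such a config is typically loaded and run, assuming pyannote.audio 3.x, a hypothetical config filename, and a placeholder audio path:

# Sketch: load a pyannote pipeline from a local YAML config (filename and audio path are placeholders)
from pyannote.audio import Pipeline

pipeline = Pipeline.from_pretrained("App_Function_Libraries/models/pyannote_diarization_config.yaml")
diarization = pipeline("example_audio.wav")

# itertracks() yields (segment, track_id, speaker_label) triples
for turn, _, speaker in diarization.itertracks(yield_label=True):
    print(f"{turn.start:.1f}s to {turn.end:.1f}s: {speaker}")
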
5 changes: 4 additions & 1 deletion App_Function_Libraries/Utils/Utils.py
@@ -362,7 +362,8 @@ def load_and_log_configs():
bing_country_code = config.get('Search-Engines', 'search_engine_country_code_bing', fallback='us')
bing_search_api_url = config.get('Search-Engines', 'search_engine_api_url_bing', fallback='')
# Brave Search Settings
brave_search_api_key = config.get('Search-Engines', 'search_engine_api_key_brave', fallback='')
brave_search_api_key = config.get('Search-Engines', 'search_engine_api_key_brave_regular', fallback='')
brave_search_ai_api_key = config.get('Search-Engines', 'search_engine_api_key_brave_ai', fallback='')
brave_country_code = config.get('Search-Engines', 'search_engine_country_code_brave', fallback='us')
# DuckDuckGo Search Settings
duckduckgo_search_api_key = config.get('Search-Engines', 'search_engine_api_key_duckduckgo', fallback='')
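
For reference, the [Search-Engines] section these calls read from would look roughly like this in the project's config file; the section and key names are taken from the config.get calls above, and the values are placeholders:

[Search-Engines]
search_engine_api_key_brave_regular = <brave-web-search-key>
search_engine_api_key_brave_ai = <brave-ai-search-key>
search_engine_country_code_brave = us
search_engine_api_key_duckduckgo = <duckduckgo-key-if-any>
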
@@ -510,8 +511,10 @@ def load_and_log_configs():
'bing_country_code': bing_country_code,
'bing_search_api_url': bing_search_api_url,
'brave_search_api_key': brave_search_api_key,
'brave_search_ai_api_key': brave_search_ai_api_key,
'brave_country_code': brave_country_code,
'duckduckgo_search_api_key': duckduckgo_search_api_key,
'google_search_api_url': google_search_api_url,
'google_search_api_key': google_search_api_key,
'google_search_engine_id': google_search_engine_id,
'google_simp_trad_chinese': google_simp_trad_chinese,
128 changes: 64 additions & 64 deletions App_Function_Libraries/Web_Scraping/WebSearch_APIs.py
@@ -23,6 +23,8 @@
#######################################################################################################################
#



def perform_websearch(search_engine, search_query, country, search_lang, output_lang, result_count, date_range,
safesearch, site_blacklist):
if search_engine.lower() == "baidu":
@@ -33,7 +35,7 @@ def perform_websearch(search_engine, search_query, country, search_lang, output_
return search_web_brave(search_query, country, search_lang, output_lang, result_count, safesearch,
site_blacklist, date_range)
elif search_engine.lower() == "duckduckgo":
return search_web_ddg()
return search_web_duckduckgo(arg1, arg2)
elif search_engine.lower() == "google":
return search_web_google(search_query, result_count, results_origin_country, date_range, exactTerms,
excludeTerms, filter, geolocation, output_lang,
@@ -53,6 +55,25 @@ def perform_websearch(search_engine, search_query, country, search_lang, output_
return f"Error: Invalid Search Engine Name {search_engine}"


######################### Search Results Parsing #########################
def parse_html_search_results_generic(soup):
results = []
for result in soup.find_all('div', class_='result'):
title = result.find('h3').text if result.find('h3') else ''
url = result.find('a', class_='url')['href'] if result.find('a', class_='url') else ''
content = result.find('p', class_='content').text if result.find('p', class_='content') else ''
published_date = result.find('span', class_='published_date').text if result.find('span',
class_='published_date') else ''

results.append({
'title': title,
'url': url,
'content': content,
'publishedDate': published_date
})
return results
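
For a quick sense of what the generic parser above expects and returns, here is a small usage example with made-up HTML shaped like its selectors (div.result, h3, a.url, p.content, span.published_date):

# Example usage of parse_html_search_results_generic with hypothetical markup
from bs4 import BeautifulSoup

html = ('<div class="result"><h3>Cherry cake</h3>'
        '<a class="url" href="https://example.com/cake">link</a>'
        '<p class="content">A simple recipe.</p>'
        '<span class="published_date">2024-12-27</span></div>')
print(parse_html_search_results_generic(BeautifulSoup(html, 'html.parser')))
# -> [{'title': 'Cherry cake', 'url': 'https://example.com/cake',
#      'content': 'A simple recipe.', 'publishedDate': '2024-12-27'}]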


######################### Baidu Search #########################
#
# https://cloud.baidu.com/doc/APIGUIDE/s/Xk1myz05f
@@ -145,7 +166,7 @@ def test_search_web_bing():
# https://brave.com/search/api/
# https://github.com/run-llama/llama_index/blob/main/llama-index-integrations/tools/llama-index-tools-brave-search/README.md
def search_web_brave(search_term, country, search_lang, ui_lang, result_count, safesearch="moderate",
brave_api_key=None, result_filter=None, date_range=None, ):
brave_api_key=None, result_filter=None, search_type="ai", date_range=None):
search_url = "https://api.search.brave.com/res/v1/web/search"
if not brave_api_key:
# load key from config file
@@ -166,6 +187,10 @@ def search_web_brave(search_term, country, search_lang, ui_lang, result_count, s
# date_range = "month"
if not result_filter:
result_filter = "webpages"
if search_type == "ai":
# FIXME - Option for switching between AI/Regular search
pass


headers = {"Accept": "application/json", "Accept-Encoding": "gzip", "X-Subscription-Token": brave_api_key}

@@ -180,48 +205,27 @@ def search_web_brave(search_term, country, search_lang, ui_lang, result_count, s
return brave_search_results
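
One way the search_type FIXME above could be resolved (a sketch only, not the committed behavior) is to pick the subscription token from the two keys that Utils.py now loads, with the exact lookup path assumed to mirror the one used for SearX below:

# Sketch: choose the Brave key by search type before building the request headers
if search_type == "ai":
    brave_api_key = loaded_config_data['search_engines']['brave_search_ai_api_key']
else:
    brave_api_key = loaded_config_data['search_engines']['brave_search_api_key']
headers = {"Accept": "application/json", "Accept-Encoding": "gzip", "X-Subscription-Token": brave_api_key}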


def test_search_brave(search_term, country, search_lang, ui_lang, result_count, safesearch="moderate", date_range=None,
result_filter=None, brave_api_key=None):
def test_search_brave():
search_term = "How can I bake a cherry cake"
country = "US"
search_lang = "en"
ui_lang = "en"
result_count = 10
safesearch = "moderate"
date_range = None
result_filter = None
result = search_web_brave(search_term, country, search_lang, ui_lang, result_count, safesearch, date_range,
result_filter, brave_api_key)
result_filter)
print(result)
return result


######################### DuckDuckGo Search #########################
#
# https://github.com/deedy5/duckduckgo_search/blob/main/duckduckgo_search/duckduckgo_search.py
# FIXME - 1shot gen with sonnet 3.5, untested.
def search_web_ddg():
# fuck it https://github.com/deedy5/duckduckgo_search/tree/main?tab=readme-ov-file
#return results
pass
# https://github.com/deedy5/duckduckgo_search
def search_web_duckduckgo(arg1, arg2):
    pass
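
A possible body for this stub, using the duckduckgo_search package linked above; the wrapper name and parameters are placeholders, since the committed signature (arg1, arg2) is not yet pinned down, and the DDGS API shown is the one documented for recent versions of that package:

# Sketch using duckduckgo_search (https://github.com/deedy5/duckduckgo_search); names are placeholders
from duckduckgo_search import DDGS

def search_web_duckduckgo_sketch(keywords, max_results=10, region="wt-wt", safesearch="moderate"):
    # DDGS().text() returns a list of dicts with 'title', 'href' and 'body' keys
    with DDGS() as ddgs:
        return ddgs.text(keywords, region=region, safesearch=safesearch, max_results=max_results)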


def test_search_web_ddg():
"""Example usage of the DuckDuckGo search function"""
try:
# Basic search
results = search_web_ddg("How to bake a cherry cake")
print(f"Found {len(results)} results for 'How to bake a cherry cake'")

# Print first 3 results
for i, result in enumerate(results[:3], 1):
print(f"\nResult {i}:")
print(f"Title: {result['title']}")
print(f"URL: {result['href']}")
print(f"Description: {result['body'][:150]}...")

# Search with different parameters
limited_results = search_web_ddg(
keywords="artificial intelligence news",
region="us-en",
safesearch="on",
max_results=5
)
print(f"\nFound {len(limited_results)} limited results")

except Exception as e:
print(f"Search failed: {e}")


######################### Google Search #########################
@@ -376,17 +380,6 @@ def test_search_google():
print(result)


######################### Jina.ai Search #########################
#
# https://jina.ai/reader/
def search_web_jina(arg1, arg2, arg3):
pass

def test_search_jina(arg1, arg2, arg3):
result = search_web_jina(arg1, arg2, arg3)
return result


######################### Kagi Search #########################
#
# https://help.kagi.com/kagi/api/search.html
@@ -420,13 +413,8 @@ def search_web_kagi(search_term, country, search_lang, ui_lang, result_count, sa
response = requests.get(search_url, headers=headers, params=params)
response.raise_for_status()
# Response format: https://help.kagi.com/kagi/api/search.html
brave_search_results = response.json()
return brave_search_results
# curl - v \
# - H
# "Authorization: Bot $TOKEN" \
# https: // kagi.com / api / v0 / search\?q = steve + jobs
pass
kagi_search_results = response.json()
return kagi_search_results


def test_search_kagi():
@@ -437,18 +425,23 @@ def test_search_kagi():
#
# https://searx.space
# https://searx.github.io/searx/dev/search_api.html
def search_web_searx(search_query):
def search_web_searx(search_query, language='auto', time_range='', safesearch=0, pageno=1, categories='general'):

# Check if API URL is configured
searx_url = loaded_config_data['search_engines']['searx_search_api_url']
if not searx_url:
return "Search is disabled and no content was found. This functionality is disabled because the user has not set it up yet."
return "SearX Search is disabled and no content was found. This functionality is disabled because the user has not set it up yet."

# Validate and construct URL
try:
parsed_url = urlparse(searx_url)
params = {
'q': search_query,
'format': 'json',
'language': language,
'time_range': time_range,
'safesearch': safesearch,
'pageno': pageno,
'categories': categories
}
search_url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}?{urlencode(params)}"
except Exception as e:
@@ -457,17 +450,25 @@ def search_web_searx(search_query):
# Perform the search request
try:
headers = {
'Content-Type': 'application/json',
'User-Agent': 'anything-llm'
}

response = requests.get(search_url, headers=headers)
response.raise_for_status()
search_data = response.json()

# Check if the response is JSON
content_type = response.headers.get('Content-Type', '')
if 'application/json' in content_type:
search_data = response.json()
else:
# If not JSON, assume it's HTML and parse it
from bs4 import BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')
search_data = parse_html_search_results_generic(soup)

# Process results
data = []
for result in search_data.get('results', []):
for result in search_data:
data.append({
'title': result.get('title'),
'link': result.get('url'),
@@ -485,7 +486,8 @@


def test_search_searx():
search_web_searx("How can I bake a cherry cake?")
result = search_web_searx("How can I bake a cherry cake?")
print(result)
pass


Expand All @@ -502,16 +504,14 @@ def test_search_serper():

######################### Tavily Search #########################
#
# https://docs.tavily.com/docs/rest-api/api-reference
# https://github.com/YassKhazzan/openperplex_backend_os/blob/main/sources_searcher.py
def search_web_tavily():
tavily_url = "https://api.tavily.com/search"
pass
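
A sketch of how this stub might be filled in, following the Tavily REST API reference linked above; the JSON field names and response shape are assumptions to be checked against those docs:

# Sketch: POST to the Tavily search endpoint (field names unverified, taken from the API reference above)
import requests

def search_web_tavily_sketch(query, api_key, max_results=5):
    tavily_url = "https://api.tavily.com/search"
    payload = {
        "api_key": api_key,        # assumption: the key is sent in the request body
        "query": query,
        "search_depth": "basic",
        "max_results": max_results,
    }
    response = requests.post(tavily_url, json=payload)
    response.raise_for_status()
    return response.json()         # expected to contain a 'results' list of title/url/content entries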


def test_search_tavily():
pass


######################### Yandex Search #########################
#
# https://yandex.cloud/en/docs/search-api/operations/web-search
3 changes: 2 additions & 1 deletion Docs/Design/Education.md
@@ -6,5 +6,6 @@ https://arxiv.org/abs/2412.02035
https://github.com/andreamust/NEON-GPT
https://excalidraw.com/
https://arxiv.org/abs/2411.07407

https://arxiv.org/abs/2412.16429
https://huggingface.co/papers/2412.15443

2 changes: 1 addition & 1 deletion Docs/Design/RSS_Ranking.md
@@ -15,7 +15,7 @@ https://blog.det.life/from-scrolls-to-similarity-search-building-a-movie-recomme

https://www.dogesec.com/blog/full_text_rss_atom_blog_feeds/
https://arxiv.org/abs/2411.19352

https://arxiv.org/abs/2412.18082


https://blog.badsectorlabs.com/files/blogs.txt
6 changes: 6 additions & 0 deletions Docs/Design/Researcher.md
@@ -27,6 +27,12 @@ https://github.com/neuml/paperai
https://github.com/neuml/paperetl
https://github.com/ai-christianson/RA.Aid
https://github.com/Future-House/paper-qa
https://openreview.net/
https://www.researchrabbit.ai/
https://github.com/faraz18001/Sales-Llama
https://github.com/memgraph/memgraph




### Ideas
6 changes: 5 additions & 1 deletion Docs/Design/Search.md
@@ -2,4 +2,8 @@


### Link Dump
https://pub.towardsai.net/hnsw-small-world-yes-but-how-in-the-world-is-it-navigable-77701ed37e20
https://pub.towardsai.net/hnsw-small-world-yes-but-how-in-the-world-is-it-navigable-77701ed37e20
https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1



4 changes: 2 additions & 2 deletions Docs/Design/TTS_STT.md
@@ -43,7 +43,7 @@ https://github.com/FanaHOVA/smol-podcaster
https://docs.inferless.com/cookbook/serverless-customer-service-bot
https://github.com/aedocw/epub2tts
https://github.com/microsoft/SpeechT5

https://github.com/smellslikeml/dolla_llama



@@ -183,7 +183,7 @@ https://github.com/shagunmistry/NotebookLM_Alternative/tree/main/ai_helper
https://docs.cartesia.ai/get-started/make-an-api-request
https://github.com/JarodMica/open-neruosama
https://github.com/flatmax/speech-to-text

https://arxiv.org/abs/2412.18566

Google
https://github.com/google-gemini/cookbook/tree/main/gemini-2
12 changes: 12 additions & 0 deletions Docs/Design/WebSearch.md
@@ -17,6 +17,18 @@ The web search functionality is a core component of the tldw system, allowing us
- Perform searches using Bing!


#### Brave Search
Brave exposes two APIs: one for 'AI' search and one for 'regular' web search; the config file accordingly carries separate search_engine_api_key_brave_ai and search_engine_api_key_brave_regular keys.



#### DuckDuckGo Search
Searches are performed against the DuckDuckGo website itself, via the duckduckgo_search library.

#### Google Search
A custom search engine (Programmable Search Engine) has to be created first; then grab its engine ID (cx) and an API key, as in the example request below.
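
For illustration, once the engine ID (cx) and API key exist, a query is a single GET against the Custom Search JSON API; the values below are placeholders:

    https://www.googleapis.com/customsearch/v1?key=YOUR_API_KEY&cx=YOUR_ENGINE_ID&q=how+to+bake+a+cherry+cake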




https://github.com/scrapinghub/article-extraction-benchmark
3 changes: 2 additions & 1 deletion Docs/Handy_Dandy_Papers.md
@@ -48,7 +48,8 @@ https://huggingface.co/spaces/HuggingFaceH4/blogpost-scaling-test-time-compute
- https://arxiv.org/abs/2408.03314
- https://github.com/huggingface/search-and-learn
- https://huggingface.co/spaces/HuggingFaceH4/blogpost-scaling-test-time-compute

https://arxiv.org/abs/2412.18319
- https://github.com/hkust-nlp/B-STaR

### Personalization
https://arxiv.org/abs/2411.16034
1 change: 1 addition & 0 deletions Docs/Issues/Citations_and_Confabulations.md
@@ -14,6 +14,7 @@ https://huggingface.co/PleIAs/Pleias-Nano
https://deepmind.google/discover/blog/facts-grounding-a-new-benchmark-for-evaluating-the-factuality-of-large-language-models/
https://arxiv.org/abs/2412.14860
https://arxiv.org/abs/2412.14686
https://arxiv.org/abs/2412.18069



2 changes: 2 additions & 0 deletions Docs/Issues/Evaluation_Plans.md
@@ -43,6 +43,8 @@ https://github.com/chigkim/Ollama-MMLU-Pro
https://huggingface.co/ymcki/Llama-3_1-Nemotron-51B-Instruct-GGUF
https://pub.towardsai.net/streamline-your-llm-evaluation-a-step-by-step-guide-to-rag-metrics-with-streamlit-38ed9efbdc9a
https://huggingface.co/QuantFactory/granite-3.1-8b-instruct-GGUF
https://huggingface.co/CohereForAI/c4ai-command-r7b-12-2024
https://arxiv.org/abs/2412.17758


Have LLMs play Social deception games