-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'main' into @pawelcp/input
- Loading branch information
Showing
58 changed files
with
1,289 additions
and
629 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
name: Python Black | ||
|
||
on: [pull_request] | ||
|
||
jobs: | ||
python-black: | ||
name: Python Black | ||
runs-on: ubuntu-latest | ||
steps: | ||
- uses: actions/checkout@v3 | ||
- name: Python Black (core) | ||
uses: cytopia/[email protected] | ||
with: | ||
path: 'core/' | ||
- name: Python Black (main) | ||
uses: cytopia/[email protected] | ||
with: | ||
path: 'main.py' | ||
- name: Python Black (crawler) | ||
uses: cytopia/[email protected] | ||
with: | ||
path: 'workers/crawler.py' | ||
- name: Python Black (embedder) | ||
uses: cytopia/[email protected] | ||
with: | ||
path: 'workers/embedder.py' | ||
- name: Python Black (scheduler) | ||
uses: cytopia/[email protected] | ||
with: | ||
path: 'workers/scheduler.py' |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
## Guide for UI developers, no matter the platform. | ||
To interact with the rest of the system, exclusively utilize functions from `core/database/db_xxx` | ||
|
||
For a simple application which only requests summaries and reads the results,<br> | ||
you have to use: | ||
* `db_add_completion_task` to create a task | ||
* `db_get_completions_by_page` to get the results | ||
|
||
<br> | ||
Here is an example of what that would look like with `FastAPI`: | ||
|
||
```py | ||
@app.post("/add_completion_task") | ||
def add_completion_task(prompt): | ||
db_add_completion_task(prompt) | ||
return { | ||
"status": "OK" | ||
} | ||
|
||
|
||
@app.get("/get_completions") | ||
def get_completions(page: int): | ||
completions = db_get_completions_by_page(page) | ||
return { | ||
"completions": completions | ||
} | ||
``` | ||
|
||
For a more complicated web page, which can also schedule crawls, | ||
you'll also make use of: | ||
* `db_add_crawl_task` to schedule a new crawl | ||
* `db_get_crawl_history_by_page` to see the crawls you scheduled, and their status | ||
|
||
#### Important notes | ||
* Currently, there is no system present which would automatically populate | ||
the embeddings database after scheduling a completion task. | ||
This means that the UI has to ensure all the databases are appropriately populated. | ||
As a result, before requesting a summary, it's necessary to perform crawls | ||
to give our summaries enough context to work with. | ||
|
||
* All db calls return lists of entire objects, unless it's specified otherwise. | ||
This is the default since we're prioritizing speed and minimal latency over | ||
security. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
import argparse | ||
|
||
# Every model configuration name accepted by -M / --pick-model.
_MODEL_CHOICES = [
    "ollama_medium",
    "ollama_small",
    "ollama_large",
    "hugging_face_medium",
    "hugging_face_small",
    "hugging_face_large",
]

parser = argparse.ArgumentParser()

# Boolean switch: True when the flag is given, False otherwise.
parser.add_argument(
    "-H",
    "--use-hugging-face",
    action="store_true",
    dest="use_hugging_face",
    help="Use Hugging Face as the model provider",
)

# Name of the model configuration; restricted to the list above.
parser.add_argument(
    "-M",
    "--pick-model",
    dest="model_choice",
    type=str,
    choices=_MODEL_CHOICES,
    default="ollama_medium",
    help="Select model configuration",
)

# NOTE: parsing runs at module level, so the command line is read (and
# rejected on unknown arguments) as soon as this module is imported.
args = parser.parse_args()

# Module-level constants exposing the parsed options.
USE_HUGGING_FACE, MODEL_CHOICE = args.use_hugging_face, args.model_choice
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,17 +1,97 @@ | ||
import datetime | ||
|
||
from core.tools.scraper import web_query_google_lookup | ||
from core.classes.query import WebQuery | ||
from langchain_core.prompts import ChatPromptTemplate | ||
|
||
|
||
def web_news_lookup(prompt_text: str):
    """Run a Google lookup for ``prompt_text`` wrapped in a "news" ``WebQuery``.

    Returns whatever ``web_query_google_lookup`` returns for that query.
    """
    return web_query_google_lookup(WebQuery("news", prompt_core=prompt_text))
|
||
|
||
def web_wiki_lookup(prompt_text: str):
    """Run a Google lookup for ``prompt_text`` wrapped in a "wiki" ``WebQuery``.

    Returns whatever ``web_query_google_lookup`` returns for that query.
    """
    return web_query_google_lookup(WebQuery("wiki", prompt_core=prompt_text))
|
||
|
||
def web_docs_lookup(prompt_text: str):
    """Run a Google lookup for ``prompt_text`` wrapped in a "docs" ``WebQuery``.

    Returns whatever ``web_query_google_lookup`` returns for that query.
    """
    return web_query_google_lookup(WebQuery("docs", prompt_core=prompt_text))
|
||
|
||
def web_docs_lookup_prompt():
    """Build the chat prompt that turns search results into a detailed instruction.

    Returns:
        A ``ChatPromptTemplate`` made of a system message and a user message.
        The user message contains the ``{search_data}`` and ``{user_request}``
        placeholders, which are filled in when the template is formatted.
    """
    # Computed on every call so the prompt always states the current month.
    current_month = datetime.date.today().strftime("%B %Y")
    return ChatPromptTemplate.from_messages(
        [
            (
                "system",
                # The fragments are joined by implicit concatenation, so each
                # sentence must end with a space to avoid running into the next
                # one (the first fragment previously lacked it).
                "You are a search results interpreter. "
                "Your job is to write an detailed instruction based on the provided context. "
                "Your job is to convert all the search results you were given into a long, "
                "comprehensive and clean output. "
                "Use context data to explain "
                "the topic of user request to the best of your ability. "
                "You don't have a knowledge cutoff. "
                "It is currently " + current_month,
            ),
            (
                "user",
                "Search results data: "
                "```"
                "{search_data}"
                "```"
                'User request: "Write an article on: {user_request}"',
            ),
        ]
    )
|
||
|
||
def web_wiki_lookup_prompt():
    """Build the chat prompt that turns search results into an article.

    Returns a ``ChatPromptTemplate`` with a system message and a user message;
    the user message carries the ``{search_data}`` and ``{user_request}``
    placeholders.
    """
    # Evaluated per call, so the stated month is always the current one.
    current_month = datetime.date.today().strftime("%B %Y")

    system_text = (
        "You are a search results interpreter. "
        "Your job is to write an article based on the provided context. "
        "Your job is to convert all the search results you were given into a long, "
        "comprehensive and clean output. "
        "Use context data to answer "
        "the user request to the best of your ability. "
        "You don't have a knowledge cutoff. "
        "It is currently " + current_month
    )
    user_text = (
        "Search results data: "
        "```"
        "{search_data}"
        "```"
        'User request: "Write an article on: {user_request}"'
    )

    messages = [("system", system_text), ("user", user_text)]
    return ChatPromptTemplate.from_messages(messages)
|
||
|
||
def web_news_lookup_prompt():
    """Build the chat prompt that turns search results into an article.

    Returns a ``ChatPromptTemplate`` with a system message and a user message;
    the user message carries the ``{search_data}`` and ``{user_request}``
    placeholders.
    """
    # Evaluated per call, so the stated month is always the current one.
    current_month = datetime.date.today().strftime("%B %Y")

    system_text = (
        "You are a search results interpreter. "
        "Your job is to write an article based on the provided context. "
        "Your job is to convert all the search results you were given into a long, "
        "comprehensive and clean output. "
        "Use provided context to answer the user request to the best of your ability. "
        "You don't have a knowledge cutoff. "
        "It is currently " + current_month
    )
    user_text = (
        "Search results data: "
        "```"
        "{search_data}"
        "```"
        'User request: "Write an article on: {user_request}"'
    )

    messages = [("system", system_text), ("user", user_text)]
    return ChatPromptTemplate.from_messages(messages)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
from __future__ import annotations | ||
|
||
from typing import Literal | ||
|
||
# this is a serializable singleton global configuration file | ||
|
||
|
||
class RuntimeConfig: | ||
|
||
worker_type: Literal["embedder", "crawler", "summarizer", "scheduler"] | ||
|
||
llm_configuration: str | None | ||
embed_configuration: str | None |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,62 +1,70 @@ | ||
from __future__ import annotations | ||
|
||
import datetime | ||
from typing import Literal, Union | ||
from typing import Literal | ||
|
||
from core.tools import utils | ||
|
||
|
||
class WebQuery: | ||
"""class for bundling all data required for embedding and search operations""" | ||
# Small chunks make it impossible to deduct full context in presence of millions of other unrelated texts | ||
|
||
# Small chunks make it impossible to deduct full context | ||
# in presence of millions of other unrelated texts | ||
# Small chunks are meaningful only when talking about a single topic | ||
_DEFAULT_INFO_CHUNK_LENGTH = 800 | ||
_DEFAULT_STORY_CHUNK_LENGTH = 1200 | ||
_DEFAULT_PRIORITY = 1 | ||
|
||
query_type: str | ||
|
||
prompt_core: str = '' | ||
prompt_core: str = "" | ||
|
||
web_query: str = '' | ||
web_extra_params: Union[dict, None] = None | ||
web_query: str = "" | ||
|
||
web_extra_params: dict[str, str | int] | None = None | ||
web_tbs = 0 | ||
|
||
db_search_query: str = '' # query to search by | ||
db_embedding_prefix: str = '' # prefixed to each article saved to faiss db | ||
db_embedding_postfix: str = '' # postfixed -||- | ||
db_save_file_extension: str = '' # most types will have dedicated db for them | ||
db_search_query: str = "" # query to search by | ||
db_embedding_prefix: str = "" # prefixed to each article saved to faiss db | ||
db_embedding_postfix: str = "" # postfixed -||- | ||
db_save_file_extension: str = "" # most types will have dedicated db for them | ||
db_chunk_size: int = 600 # legacy default | ||
|
||
def __init__(self, | ||
query_type: Literal['basic', 'wiki', 'news', 'docs'], | ||
prompt_core: str, | ||
priority: int = _DEFAULT_PRIORITY): | ||
def __init__( | ||
self, | ||
query_type: Literal["basic", "wiki", "news", "docs"], | ||
prompt_core: str, | ||
priority: int = _DEFAULT_PRIORITY, | ||
): | ||
|
||
self.query_type = query_type | ||
self.prompt_core = prompt_core | ||
self.db_embed_query = prompt_core # query to search by | ||
self.priority = priority | ||
|
||
if query_type == 'basic': | ||
if query_type == "basic": | ||
self.web_query = prompt_core | ||
self.db_chunk_size = 800 | ||
|
||
elif query_type == 'wiki': | ||
elif query_type == "wiki": | ||
# deprecated, use 'basic' | ||
self.web_query = 'wikipedia ' + prompt_core | ||
self.db_save_file_extension = '_facts' | ||
self.web_query = "wikipedia " + prompt_core | ||
self.db_save_file_extension = "_facts" | ||
self.db_chunk_size = 600 | ||
|
||
elif query_type == 'news': | ||
elif query_type == "news": | ||
# this prompt works well for Google News searches | ||
self.web_query = f"{prompt_core} news comprehensive overview " | ||
self.web_extra_params = { | ||
'tbm': 'nws', # news only | ||
"tbm": "nws", # news only | ||
} | ||
self.web_tbs = 'qdr:m' # last month only | ||
self.web_tbs = "qdr:m" # last month only | ||
self.db_search_query = f"{prompt_core} news and innovations" | ||
self.db_save_file_extension = f"_news_{datetime.date.today().strftime('%Y_%m_%d').lower()}" | ||
self.db_save_file_extension = f"_news_{utils.gen_unix_time()}" | ||
self.db_chunk_size = 1200 | ||
|
||
elif query_type == 'docs': | ||
self.web_query = 'documentation for ' + prompt_core | ||
elif query_type == "docs": | ||
self.web_query = "documentation for " + prompt_core | ||
self.db_save_file_extension = "_docs" | ||
self.db_chunk_size = 600 | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
## What's this? | ||
This directory contains all the getters and setters for all 5 global databases, | ||
with more potentially coming in later. | ||
|
||
## Now vs Future | ||
Currently, these simple getters and setters utilize TinyDB, | ||
and effectively behave as singletons for all the detached, separate workers. | ||
|
||
In the future, we'll want these functions to optionally call | ||
remote database providers, and act as a wrapper for these databases. | ||
|
||
## Important notes | ||
All db calls return lists of entire objects, unless it's specified otherwise. | ||
This is the default since we're prioritizing speed and minimal latency over | ||
security or cleanliness, and these systems are not intended to be run publicly, | ||
but as a closed network. |
Oops, something went wrong.