Commit

refactor: format code
jhehemann committed Jul 24, 2024
1 parent 7443916 commit f200434
Showing 3 changed files with 2 additions and 31 deletions.
2 changes: 1 addition & 1 deletion packages/jhehemann/customs/research/component.yaml
@@ -7,7 +7,7 @@ license: Apache-2.0
 aea_version: '>=1.0.0, <2.0.0'
 fingerprint:
   __init__.py: bafybeidpcd7b3qvijj5ucmjcxvya4o5hbgetndu3siny7itumlfeekkkam
-  research.py: bafybeieuoe2jet5t6qf7arlcn4e23762f6wxtf4g5b6avregsqcy5eljta
+  research.py: bafybeia443k5mbkjdaib2tgrpbmnw2q3kpwdgvyvadsgiobuwktixqfaja
 fingerprint_ignore_patterns: []
 entry_point: research.py
 callable: run
29 changes: 0 additions & 29 deletions packages/jhehemann/customs/research/research.py
@@ -337,7 +337,6 @@ def __init__(self, url, html=None, title=None, description=None, publication_dat
         self.chunks_final = []
         self.extract_attribute_names = ["title", "description", "publication_date", "publisher"]
 
-
     def get_title(self, soup, scripts):
         try:
             title = soup.title
@@ -354,14 +353,12 @@ def get_title(self, soup, scripts):
         # If no title was found return "n/a".
         return "n/a"
 
-
     def get_description(self, soup, scripts):
         description = soup.find("meta", attrs={"name": "description"}) or soup.find("meta", attrs={"property": "description"})
         if description and description.get("content"):
             return description["content"].strip()
         return "n/a"
 
-
     def get_publisher(self, soup, scripts):
         for script in scripts:
             try:
@@ -382,7 +379,6 @@ def get_publisher(self, soup, scripts):
         else:
             return "n/a"
 
-
     def get_date(self, soup, scripts):
         for script in scripts:
             try:
@@ -405,7 +401,6 @@ def get_date(self, soup, scripts):
                 return format_date(meta_tag["content"])
         return "n/a"
 
-
     def extract_page_attributes(
         self,
     ) -> object:
@@ -429,7 +424,6 @@ def extract_page_attributes(
 
         return self
 
-
     def to_prompt(self):
         """
         Function to convert article attributes into a structured format for LLM prompts.
@@ -443,7 +437,6 @@ def to_prompt(self):
 
         return page_info
 
-
     def _find_publisher(self, data):
         def extract_names(item, key):
             """Helper function to extract names from a field that could be a list or a single object."""
@@ -496,7 +489,6 @@ def trim_json_formatting(output_string):
     # Return the original string if no match is found
     return output_string
 
-
 def trim_chunks_string(
     chunks_string: str,
     enc: tiktoken.Encoding,
@@ -508,14 +500,12 @@ def trim_chunks_string(
     encoding = encoding[:max_tokens]
     return enc.decode(encoding)
 
-
 def find_release_date_in_data(data):
     for name in RELEASE_DATE_NAMES:
         if name in data:
             return data[name]
     return None
 
-
 def format_date(date_string) -> str:
     # Desired format "February 16, 2024, 3:30 PM"
     format_str = "%B %d, %Y"
@@ -534,7 +524,6 @@ def format_date(date_string) -> str:
         # If there's an error during parsing, return the original string
         return date_string
 
-
 def extract_question(text:str) -> str:
     # Look for a quoted question
     match = re.search(r'["“](.*?\?)["”]', text)
@@ -544,7 +533,6 @@ def extract_question(text:str) -> str:
     # Return prompt if ending with a question mark
     return text if text.strip().endswith('?') else ""
 
-
 def parse_date_str(date_str: str) -> datetime:
     # Desired format "February 16, 2024, 3:30 PM"
     datetime_format = "%B %d, %Y"
@@ -559,28 +547,24 @@ def remove_date_from_query(query: str) -> str:
     new_query = re.sub(date_pattern, "", query)
     return new_query
 
-
 def recursive_character_text_splitter(text, max_tokens, overlap):
     if len(text) <= max_tokens:
         return [text]
     else:
         return [text[i:i+max_tokens] for i in range(0, len(text), max_tokens - overlap)]
 
-
 def count_tokens(text: str, model: str) -> int:
     """Count the number of tokens in a text."""
     enc = encoding_for_model(model)
     return len(enc.encode(text))
 
-
 def get_first_dict_from_list(data):
     """Returns the first item if data is a list of dictionaries"""
    if isinstance(data, list) and len(data) > 0 and isinstance(data[0], dict):
         return data[0]
     else:
         return data # or raise an appropriate exception
 
-
 def format_additional_information(web_pages: List[WebPage]) -> str:
     """Format the additional information from the web pages"""
     formatted_information = ""
@@ -590,7 +574,6 @@ def format_additional_information(web_pages: List[WebPage]) -> str:
         formatted_information += f"{web_page.final_output}\n\n"
     return formatted_information
 
-
 def search_google(query: str, api_key: str, engine: str, num: int) -> List[str]:
     """Search Google using a custom search engine."""
     service = build("customsearch", "v1", developerKey=api_key)
@@ -605,7 +588,6 @@ def search_google(query: str, api_key: str, engine: str, num: int) -> List[str]:
     )
     return [result["link"] for result in search.get("items", [])]
 
-
 def process_in_batches(
     web_pages: List[WebPage],
     batch_size: int = 15,
@@ -646,7 +628,6 @@ def process_in_batches(
 
             yield get_futures
 
-
 def embed_batch(client: OpenAI, batch):
     """
     Helper function to process a single batch of texts and return the embeddings.
@@ -663,7 +644,6 @@ def embed_batch(client: OpenAI, batch):
     # Return the embeddings
     return [data.embedding for data in response.data]
 
-
 def sort_text_chunks(
     client: OpenAI, query: str, text_chunks_embedded: List[TextChunk]
 ) -> List[TextChunk]:
@@ -684,7 +664,6 @@ def sort_text_chunks(
 
     return [text_chunks_embedded[i] for i in I[0]]
 
-
 def get_embeddings(client: OpenAI, text_chunks: List[TextChunk], enc: tiktoken.Encoding) -> List[TextChunk]:
     """Get embeddings for the text chunks."""
     # Batch the text chunks that the sum of tokens is less than MAX_EMBEDDING_TOKEN_INPUT
@@ -726,7 +705,6 @@ def get_embeddings(client: OpenAI, text_chunks: List[TextChunk], enc: tiktoken.E
 
     return text_chunks
 
-
 def get_chunks(web_pages: List[WebPage]) -> List[WebPage]:
     """Create chunks from the text of all web pages"""
     text_chunks = []
@@ -737,7 +715,6 @@ def get_chunks(web_pages: List[WebPage]) -> List[WebPage]:
 
     return text_chunks
 
-
 def scrape_web_pages(web_pages: List[WebPage], week_interval, max_num_char: int = 10000) -> List[WebPage]:
     """Scrape text from web pages"""
     filtered_web_pages = []
@@ -778,7 +755,6 @@ def scrape_web_pages(web_pages: List[WebPage], week_interval, max_num_char: int
 
     return filtered_web_pages
 
-
 def extract_html_texts(
     web_pages: List[WebPage],
 ) -> List[WebPage]:
@@ -813,7 +789,6 @@ def extract_html_texts(
 
     return parsed_web_pages
 
-
 def get_urls_from_queries(
     queries: List[str],
     api_key: str,
@@ -847,7 +822,6 @@ def get_urls_from_queries(
 
     return list(results)
 
-
 def fetch_queries(
     input_query: str,
     engine="gpt-3.5-turbo",
@@ -919,7 +893,6 @@ def fetch_queries(
     print("Maximum attempts reached, returning an empty string.")
     return [], counter_callback
 
-
 def summarize_relevant_chunks(
     web_pages: List[WebPage],
     input_query: str,
@@ -981,7 +954,6 @@ def summarize_for_web_page(web_page: WebPage) -> None:
     web_pages = [web_page for web_page in web_pages if "Error" not in web_page.relevant_chunks_summary]
     return web_pages, counter_callback
 
-
 def summarize_over_summarized_chunks(
     web_pages: List[WebPage],
     input_query: str,
@@ -1059,7 +1031,6 @@ def summarize_over_summarized_chunks(
 
     return modified_web_pages, counter_callback
 
-
 def run(**kwargs) -> Tuple[str, Optional[str], Optional[Dict[str, Any]], Any]:
     """Run the task"""
     with OpenAIClientManager(kwargs["api_keys"]["openai"]):
2 changes: 1 addition & 1 deletion packages/packages.json
@@ -27,7 +27,7 @@
         "custom/napthaai/prediction_request_reasoning_lite/0.1.0": "bafybeigxs5tq4w7ouamwlvv7vjw3z3jeercynsuka4mcpveshopvej4cyu",
         "custom/valory/prediction_langchain/0.1.0": "bafybeihd3hv2zafscrlr25chtqmsh5weiolv2y7tc75urix665jf2a7zdu",
         "custom/victorpolisetty/gemini_request/0.1.0": "bafybeig5x6b5jtanet2q5sk7er7fdzpippbvh4q5p7uxmxpriq66omjnaq",
-        "custom/jhehemann/research/0.1.0": "bafybeibtqesy65bk4gbda6m44ck46rxrk5lrovhqt72sg3lbdjsismpkri",
+        "custom/jhehemann/research/0.1.0": "bafybeifjcadlvh5yqhltry6i5ntimfcnx5k6psscokvyxgpfy6wyvmdfsy",
         "custom/jhehemann/prediction_with_rules_and_report/0.1.0": "bafybeidfmb45ab336fzoq2vbglo6owpncpuiwzfihx3sm4b4a3asuhpwpm",
         "custom/jhehemann/infer_market_rules/0.1.0": "bafybeibkp6ywtfpgdphuscldtshp3y7oorhkfi7fu6goiwjvjymbbohwfy",
         "custom/gnosis/omen_tools/0.1.0": "bafybeibnjcgvy4l2libl34qz3aqfietjrevhxsbtisttuyfnelp3rfjlge",
