From 94ad7bdfb6ed0c8be19d99330dbb477a9a219c2c Mon Sep 17 00:00:00 2001 From: Pratik Prakash Date: Fri, 28 Jul 2023 17:48:21 -0700 Subject: [PATCH 1/3] Adding function to read discord messages and return them into an array --- .gitignore | 3 + ingestion/discord/discord_fetch_tech_notes.md | 40 ++++++ ingestion/discord/read_discord.py | 131 +++++++++++++----- 3 files changed, 136 insertions(+), 38 deletions(-) create mode 100644 ingestion/discord/discord_fetch_tech_notes.md diff --git a/.gitignore b/.gitignore index a7d2d83..4a44b1b 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,6 @@ package-lock.json**/.DS_Store .DS_Store .vscode __pycache__ + +ingestion/discord/config.yaml +ingestion/discord/*.json \ No newline at end of file diff --git a/ingestion/discord/discord_fetch_tech_notes.md b/ingestion/discord/discord_fetch_tech_notes.md new file mode 100644 index 0000000..0326fe7 --- /dev/null +++ b/ingestion/discord/discord_fetch_tech_notes.md @@ -0,0 +1,40 @@ +Read new Discord messages + + + +# Implementation overview +We will use discord.py to read the messages from Discord for a given channel id. We will fetch all the messages posted after timestamp1, which is stored in a local JSON file. If timestamp1 doesn't exist, we create and save it, then fetch all available messages. + +1. Set up the get_messages Discord function, which takes in channelId as the parameter +2. Set up the Discord bot and its on_ready event +3. Read the timestamp file (if it exists) for the channel id +4. If the timestamp file does not exist, create a new one and name it [channelId].json +5. Read the Discord messages and save them to the array +6. Return the message array as part of the function. 
+ +It may also be worthwhile to implement this as a class. +# Implementation details + +## How to get messages after a timestamp +Discord's `channel.history` has an `after` parameter that accepts a `datetime.datetime`. + +``` +channel.history(after=after_date, limit=None) +``` + +## What happens when fetching messages fails? +Ignore this for now and assume the fetch succeeds. We still need to look at how discord.py handles errors while reading messages; see the [Channel.history documentation](https://discordpy.readthedocs.io/en/latest/api.html#discord.TextChannel.history). + +Some ideas: retry from the timestamp of the last successfully retrieved message, or just retry the entire fetch. + +## Where to store the timestamp +Write the timestamp to a local JSON file for now. Each fetch is for a specific channel id, so the channel id can be used as the file name: `[channelId].json` +``` +{ + "lastFetch": "2019-11-14T00:55:31.820Z" // ISO 8601 standard +} +``` + +## How to pass the messages for the transformation step +The get_messages function will just return the array of messages received, which is then handed to the transformation step. 
\ No newline at end of file diff --git a/ingestion/discord/read_discord.py b/ingestion/discord/read_discord.py index 8e9cb94..c4ef1be 100644 --- a/ingestion/discord/read_discord.py +++ b/ingestion/discord/read_discord.py @@ -1,44 +1,99 @@ import json +import yaml import discord from typing import List from discord.ext import commands +from datetime import datetime, timezone +import pytz +class DiscordConfig: + def __init__(self) -> None: + config = self._fetch_discord_config() -def get_discord_messages(): - with open("../get-discord-messages/messages3.json") as f: - messages = json.load(f) - - message_docs = messages - # for message in messages: - # message_docs.append(message['content']) - # print(message['content']) - # print("\n\n") - return message_docs - - -# self._create_full_tmp_dir_path() -# if self.config.verbose: - # logger.debug(f"fetching {self} - PID: {os.getpid()}") -messages: List[discord.Message] = [] -intents = discord.Intents.default() -intents.message_content = True -bot = commands.Bot(command_prefix=">", intents=intents) - -@bot.event -async def on_ready(): - try: - after_date = None - - channel = bot.get_channel(970731414494535703) - async for msg in channel.history(after=after_date, limit=None): # type: ignore - messages.append(msg) - - print(messages[0]) - print(len(messages)) - await bot.close() - except Exception as e: - # logger.error(f"Error fetching messages: {e}") - await bot.close() - raise e - -bot.run("MTExNzE1NzgyOTI5MjMzNTExNQ.GAm_WX.KCcahPTN-yiRWCkpONBNZJ4wwUr5N28jBX-Cac") \ No newline at end of file + try: + self.token = config['token'] + except KeyError: + raise KeyError("Missing token from Discord configuration") + + try: + self.channelId = config['channelId'] + except KeyError: + raise KeyError("Missing field channelId from Discord configuration") + + def _fetch_discord_config(self): + try: + with open("config.yaml", "r") as file: + return yaml.safe_load(file) + except FileNotFoundError: + print("No config.yaml file found 
containing discord configuration") + + +class DiscordConnector: + def __init__(self) -> None: + self.config = DiscordConfig() + self.messages: List[discord.Message] = [] + + intents = discord.Intents.default() + intents.message_content = True + self.bot = commands.Bot(command_prefix=">", intents=intents) + + def _get_current_timestamp(self): + return datetime.now(timezone.utc).isoformat() + + def _get_timestamp_from_file(self): + timestamp_file = f"{self.config.channelId}.json" + try: + with open(timestamp_file, "r") as file: + dict = json.load(file) + iso_timestamp_str = dict["lastFetch"] + return datetime.fromisoformat(iso_timestamp_str) + + except FileNotFoundError: + # Create a new file instead with the same timestamp + self._rewrite_timestamp_file() + return None + + def _delete_timestamp_file(self): + pass + + def _rewrite_timestamp_file(self): + timestamp_file = f"{self.config.channelId}.json" + data = {"lastFetch": self._get_current_timestamp()} + json_data = json.dumps(data, indent=4) + + with open(timestamp_file, "w") as file: + file.write(json_data) + + def get_messages(self): + @self.bot.event + async def on_ready(): + try: + timestamp = self._get_timestamp_from_file() + after_date = timestamp + + print(f"Fetching all messages after {after_date}") + + channel = self.bot.get_channel(self.config.channelId) + async for msg in channel.history(after=after_date, limit=5): + self.messages.append(msg.content) + + # Update the timestamp json to now when we succeed so we can fetch the new messages we haven't yet read + self._rewrite_timestamp_file() + + # Exit this function by closing the bot + await self.bot.close() + + except Exception as e: + print("Received error: ", e) + await self.bot.close() + # If we fail, delete the timestamp file so we retry it for next time. 
+ self._delete_timestamp_file() + raise e + + self.bot.run(self.config.token) + return self.messages + + + +Discord = DiscordConnector() +print(Discord.get_messages()) \ No newline at end of file From a6b71b8a639ac7508a802545f6af86efe32defb3 Mon Sep 17 00:00:00 2001 From: Pratik Prakash Date: Fri, 28 Jul 2023 17:56:39 -0700 Subject: [PATCH 2/3] Adding usage to comments --- src/sherlock/ingestion/discord/read_discord.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/sherlock/ingestion/discord/read_discord.py b/src/sherlock/ingestion/discord/read_discord.py index c4ef1be..63f6dfa 100644 --- a/src/sherlock/ingestion/discord/read_discord.py +++ b/src/sherlock/ingestion/discord/read_discord.py @@ -29,6 +29,13 @@ def _fetch_discord_config(self): class DiscordConnector: + """ + Example usage: + + Discord = DiscordConnector() + print(Discord.get_messages()) + """ + def __init__(self) -> None: self.config = DiscordConfig() self.messages: List[discord.Message] = [] @@ -91,9 +98,4 @@ async def on_ready(): raise e self.bot.run(self.config.token) - return self.messages - - - -Discord = DiscordConnector() -print(Discord.get_messages()) \ No newline at end of file + return self.messages \ No newline at end of file From c146eabd16f6d1fe91ea78125f0e38b80a358d8b Mon Sep 17 00:00:00 2001 From: Pratik Prakash Date: Fri, 28 Jul 2023 17:59:49 -0700 Subject: [PATCH 3/3] Updating limit=5 to None so it fetches all the messages it can --- src/sherlock/ingestion/discord/read_discord.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sherlock/ingestion/discord/read_discord.py b/src/sherlock/ingestion/discord/read_discord.py index 63f6dfa..c1b0023 100644 --- a/src/sherlock/ingestion/discord/read_discord.py +++ b/src/sherlock/ingestion/discord/read_discord.py @@ -81,7 +81,7 @@ async def on_ready(): print(f"Fetching all messages after {after_date}") channel = self.bot.get_channel(self.config.channelId) - async for msg in 
channel.history(after=after_date, limit=5): + async for msg in channel.history(after=after_date, limit=None): self.messages.append(msg.content) # Update the timestamp json to now when we succeed so we can fetch the new messages we haven't yet read