Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Simplify parsers to only preserve user messages with text-based body #126

Merged
merged 21 commits into from
Dec 5, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
103 changes: 36 additions & 67 deletions chatminer/chatparsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,13 +196,12 @@ def _parse_message(self, mess: str):

if ": " in author_and_body:
author, body = [x.strip() for x in author_and_body.split(": ", 1)]
return ParsedMessage(time, author, body)
elif ":." in author_and_body:
author = [x.strip() for x in author_and_body.split(":.", 1)][0]
body = "<Disappearing Message>"
self._logger.info(f"Ignoring self-destroying message on {time}.")
else:
author = "System"
body = author_and_body.strip()
return ParsedMessage(time, author, body)
self._logger.info(f"Ignoring sytem message on {time}: {body}.")


class FacebookMessengerParser(Parser):
Expand All @@ -211,17 +210,11 @@ def _read_raw_messages_from_file(self):
self._raw_messages: List[Dict[str, Any]] = json.load(f)["messages"]

def _parse_message(self, mess: Dict[str, Any]):
body: str
if "type" in mess and mess["type"] == "Share":
body = mess["share"]["link"]
elif "sticker" in mess:
body = mess["sticker"]["uri"]
elif "content" in mess:
body = mess["content"]
else:
self._logger.warning("Skipped message with unknown format: %s", mess)
if "content" not in mess:
return None

body = mess["content"]

time = dt.datetime.utcfromtimestamp(mess["timestamp_ms"] / 1000)
author = mess["sender_name"].encode("latin-1").decode("utf-8")
body = body.encode("latin-1").decode("utf-8")
Expand All @@ -234,36 +227,23 @@ def _read_raw_messages_from_file(self):
self._raw_messages: List[Dict[str, Any]] = json.load(f)["messages"]

def _parse_message(self, mess: Dict[str, Any]):
if "share" in mess:
body = "sentshare"
elif "photos" in mess:
body = "sentphoto"
elif "videos" in mess:
body = "sentvideo"
elif "audio_files" in mess:
body = "sentaudio"
elif "content" in mess:
if any(
flag in mess["content"]
for flag in (
" to your message",
" in the poll.",
" created a poll: ",
" liked a message",
"This poll is no longer available.",
"'s poll has multiple updates.",
)
):
return None
body = mess["content"]
elif all(key in ("sender_name", "timestamp_ms", "reactions") for key in mess):
body = "disappearingmessage"
elif any(key == "is_unsent" for key in mess):
if "content" not in mess:
return None
else:
self._logger.warning("Skipped message with unknown format: %s", mess)

system_messages = [
"to your message",
"in the poll.",
"created a poll: ",
"liked a message",
"This poll is no longer available.",
"'s poll has multiple updates.",
]

if any(flag in mess["content"] for flag in system_messages):
return None

body = mess["content"]

time = dt.datetime.utcfromtimestamp(mess["timestamp_ms"] / 1000)
author = mess["sender_name"].encode("latin-1").decode("utf-8")
body = body.encode("latin-1").decode("utf-8")
Expand All @@ -280,44 +260,33 @@ def _read_raw_messages_from_file(self):
json_objects = json.load(f)

if "messages" in json_objects:
self._logger.info("Detected single chat export.")
self._raw_messages = json_objects["messages"]
else:
self._logger.info("Detected batch export.")
if self.chat_name:
self._logger.info("Searching for chat %s...", self.chat_name)
for chat in json_objects["chats"]["list"]:
if "name" in chat and chat["name"] == self.chat_name:
self._raw_messages = chat["messages"]
break
else:
self._logger.info(
'No chat name was specified, searching for chat "Saved Messages"...'
)
for chat in json_objects["chats"]["list"]:
if chat["type"] == "saved_messages":
self._raw_messages = chat["messages"]
break
if not self._raw_messages:
self._logger.error(
"Chat %s was not found.",
self.chat_name if self.chat_name else "Saved Messages",
)
raise ValueError(f"{self.chat_name} not found in {self._file}")

def _parse_message(self, mess: Dict[str, Any]):
if "from" in mess and "text" in mess:
if isinstance(mess["text"], str):
body = mess["text"]
elif isinstance(mess["text"], list):
text_elements = [
m["text"] if isinstance(m, dict) else m for m in mess["text"]
]
body = " ".join(text_elements)
else:
raise ValueError(f"Unable to parse type {type(mess['text'])} in {mess}")
if "from" not in mess or "text" not in mess:
return None

time = dt.datetime.utcfromtimestamp(int(mess["date_unixtime"]))
author = mess["from"]
return ParsedMessage(time, author, body)
return None
if isinstance(mess["text"], str):
body = mess["text"]
elif isinstance(mess["text"], list):
text_elements = [
m["text"] if isinstance(m, dict) else m for m in mess["text"]
]
body = " ".join(text_elements)

time = dt.datetime.utcfromtimestamp(int(mess["date_unixtime"]))
author = mess["from"]
return ParsedMessage(time, author, body)


class WhatsAppDateFormat:
Expand Down
15 changes: 0 additions & 15 deletions test/whatsapp/target.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,6 @@
"author": "John Doe 🤓",
"message": "Lorem ipsum 🤓"
},
{
"timestamp": "2020-06-30T09:10:00",
"author": "System",
"message": "You were added"
},
{
"timestamp": "2020-06-20T00:08:00",
"author": "System",
"message": "+12 345 578 created group \"Groupname\""
},
{
"timestamp": "2020-06-10T15:55:00",
"author": "John-John Doe",
Expand All @@ -34,11 +24,6 @@
"author": "Jahn Doe",
"message": "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Duis libero."
},
{
"timestamp": "2019-01-20T11:23:00",
"author": "John Doe",
"message": "<Disappearing Message>"
},
{
"timestamp": "2019-01-01T11:25:00",
"author": "John Doe",
Expand Down