Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

tons of fixes #2

Open
wants to merge 17 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
183 changes: 107 additions & 76 deletions HangoutJsonParser.py
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,83 +1,114 @@
import json

# Location of the Hangouts JSON file obtained from Google Takeout.
# NOTE(review): the path below is a placeholder and this load runs at module
# import time -- confirm it is still wanted alongside the INPUT_JSON_PATH
# argument handled in the __main__ block.
with open('/path/to/JSON/data/file.json', 'r', encoding='utf-8') as f:
    jsonData = json.load(f)

# Simplified conversations accumulate here; parseData() appends to this list.
simpleJson = []

#!/usr/bin/python3 -Bubb
import argparse, datetime, hashlib, json, os

def parseData():
for i in range(0, len(jsonData['conversations'])):
conversation = {}
conversation['chatName'] = ""
conversation['participants'] = getParticipants(i)
conversation['messages'] = []

for j in range(0, len(jsonData['conversations'][i]['events'])):
message = {}
message['sender'] = {}
message['sender']['name'] = getName(
jsonData['conversations'][i]['events'][j]['sender_id']['gaia_id'], conversation['participants'])
message['sender']['id'] = jsonData['conversations'][i]['events'][j]['sender_id']['gaia_id']
message['unixtime'] = (int(jsonData['conversations'][i]
['events'][j]['timestamp']))/1000000

if 'chat_message' in jsonData['conversations'][i]['events'][j]:
# if it's a message(normal hangouts, image...)
if 'segment' in jsonData['conversations'][i]['events'][j]['chat_message']['message_content']:
# if it's a normal hangouts message
content = ""
for k in range(0, len(jsonData['conversations'][i]['events'][j]['chat_message']['message_content']['segment'])):
if jsonData['conversations'][i]['events'][j]['chat_message']['message_content']['segment'][k]['type'] == "TEXT":
content = content + \
jsonData['conversations'][i]['events'][j]['chat_message']['message_content']['segment'][k]['text']
elif jsonData['conversations'][i]['events'][j]['chat_message']['message_content']['segment'][k]['type'] == "LINK":
content = content + \
jsonData['conversations'][i]['events'][j]['chat_message']['message_content']['segment'][k]['text']
message['content'] = content

conversation['messages'].append(message)

global gaia_to_name
# First pass just to collect names, to deal with issue in the Hangouts.json, where some conversations lack a name for a user, when others show the name correctly.
gaia_to_name = {}
for orig_conv in jsonData['conversations']:
for participant in orig_conv['conversation']['conversation']['participant_data']:
assert participant['id']['gaia_id'] == participant['id']['chat_id']
if 'fallback_name' in participant:
gaia_to_name[participant['id']['gaia_id']] = participant['fallback_name']

for orig_conv in jsonData['conversations']:
conversation = {
'chatName': '',
'participants': [
{
'id': participant['id']['gaia_id'],
'name': participant.get('fallback_name')
}
for participant in
orig_conv['conversation']['conversation']['participant_data']
],
'messages': []
}

for event in orig_conv['events']:
def get_readable_content(event):
match event['event_type']:
case 'REGULAR_CHAT_MESSAGE':
content = ''
# if it's a message(normal hangouts, image...)
for msg_k,msg_v in event['chat_message']['message_content'].items():
match msg_k:
case 'segment': # if it's a normal hangouts message
for segment in msg_v:
match segment['type']:
case 'TEXT'|'LINK':
content += segment['text']
case 'LINE_BREAK':
content += '\n'
case _: raise segment
case 'attachment':
for att in msg_v:
match att['embed_item']['type']:
case ['PLUS_PHOTO']:
content += ' [attachment '+att['embed_item']['plus_photo']['url']+' ]'
case ['PLACE_V2', 'THING_V2', 'THING']:
content += ' [place '+', '.join(k+' '+repr(v) for k,v in att['embed_item']['place_v2'].items())+']'
case ['DYNAMITE_MESSAGE_METADATA']:
content += ' [upload_metadata '+repr(att['embed_item']['dynamite_message_metadata'])+']'
case _: raise Exception('unhandled attachment '+json.dumps(att,indent=4))
case _: raise msg_k
return content
case 'HANGOUT_EVENT':
return event['event_type']+' '+event['hangout_event']['event_type']
case 'ADD_USER'|'REMOVE_USER':
ret = (
event['event_type']+' '+event['membership_change'].pop('type')+' '+event['membership_change'].pop('leave_reason')+' '+
' '.join(repr(getName(i['gaia_id'],conversation)) for i in event['membership_change'].pop('participant_id'))
)
assert event['membership_change'] == {}
return ret
case 'GROUP_LINK_SHARING_MODIFICATION':
return event['event_type']+' '+repr( event['group_link_sharing_modification'])
case 'RENAME_CONVERSATION':
return event['event_type']+' '+repr(event['conversation_rename'])
case _: raise Exception('unhandled event type '+event['event_type'])

conversation['messages'].append({
'sender': {
'name': getName(event['sender_id']['gaia_id'],conversation),
'id': event['sender_id']['gaia_id']
},
'unixtime': int(event['timestamp'])/1000000,
'content': get_readable_content(event)
})

conversation['chatName'] = chatName(orig_conv, conversation['participants'])
simpleJson.append(conversation)
simpleJson[i]['chatName'] = chatName(i)


def getParticipants(index):
    """Return the participants of conversation number *index* in ``jsonData``.

    Args:
        index: position of the conversation in ``jsonData['conversations']``.

    Returns:
        A list of ``{'id': gaia_id, 'name': name}`` dicts, one per entry in the
        conversation's ``participant_data``.  ``name`` is the entry's
        ``fallback_name`` when that key is present, otherwise the gaia id.
    """
    participant_data = jsonData['conversations'][index]['conversation']['conversation']['participant_data']
    return [
        {
            'id': entry['id']['gaia_id'],
            # No fallback_name in the export: use the gaia id as the name.
            'name': entry.get('fallback_name', entry['id']['gaia_id']),
        }
        for entry in participant_data
    ]


def getName(id, participants):
    """Return the name of the participant whose ``'id'`` equals *id*.

    Args:
        id: the id to look up.  The parameter name shadows the builtin but is
            kept so existing keyword callers keep working.
        participants: list of dicts with ``'id'`` and ``'name'`` keys.

    Returns:
        The ``'name'`` of the first matching participant, or *id* itself when
        no participant matches.
    """
    return next(
        (person['name'] for person in participants if id == person['id']),
        id,
    )


def chatName(i):
    """Return a display name for conversation number *i*.

    Reads the module-level ``jsonData`` (raw export) and ``simpleJson``
    (already-simplified conversations) at the same index.

    Returns:
        The conversation's own ``name`` when it is present and non-empty.
        Otherwise the name of the participant whose id equals the gaia id
        under ``self_conversation_state``; if no participant matches, the
        first participant's name; and ``""`` when there are no participants.
    """
    conv = jsonData['conversations'][i]['conversation']['conversation']
    if conv.get('name', "") != "":
        return conv['name']

    participants = simpleJson[i]['participants']
    if not participants:
        # Nothing to derive a name from; avoids indexing an empty list.
        return ""

    self_id = conv['self_conversation_state']['self_read_state']['participant_id']['gaia_id']
    for person in participants:
        if person['id'] == self_id:
            return person['name']
    # No participant matched the self id: fall back to the first one.
    return participants[0]['name']

def getName(user_id, conversation):
    """Return the best available display name for *user_id*.

    Args:
        user_id: gaia id to resolve.
        conversation: dict with a ``'participants'`` list of
            ``{'id', 'name'}`` dicts.

    Returns:
        The name recorded for the user in this conversation when it is not
        ``None``; otherwise the entry in the module-level ``gaia_to_name``
        map; otherwise *user_id* itself.
    """
    # First use the locally defined name if available.
    for person in conversation['participants']:
        if user_id == person['id'] and person['name'] is not None:
            return person['name']
    # Fall back to the global name, then to just the id.
    return gaia_to_name.get(user_id, user_id)

def chatName(orig_conv, participants):
    """Return a display name for a conversation; always a string.

    Args:
        orig_conv: one raw conversation entry from the export.
        participants: list of ``{'id', 'name'}`` dicts for that conversation.

    Returns:
        The conversation's own ``name`` when it is present and non-empty.
        Otherwise the name of the participant whose id equals the gaia id
        under ``self_conversation_state``.  Returns ``''`` when that
        participant is missing or has no name, instead of ``None``.
    """
    conv = orig_conv['conversation']['conversation']
    if conv.get('name', "") != "":
        return conv['name']
    if not participants:
        return ''

    # The self id does not change per participant, so look it up once.
    self_id = conv['self_conversation_state']['self_read_state']['participant_id']['gaia_id']
    for participant in participants:
        if participant['id'] == self_id:
            name = participant['name']
            return name if name is not None else ''
    return ''

if __name__ == '__main__':
    # Command-line entry point: load the Takeout export, simplify it with
    # parseData(), then write one JSON file plus one text log per chat into
    # OUTPUT_DIRECTORY.
    parser = argparse.ArgumentParser()
    parser.add_argument('INPUT_JSON_PATH', help='Path location of Hangouts.json file obtained from Google Takeout.')
    parser.add_argument('OUTPUT_DIRECTORY', help='Path to write output files.')
    args = parser.parse_args()

    # jsonData and simpleJson are module-level names that parseData() reads
    # and appends to, so they must be assigned here rather than inside a function.
    with open(args.INPUT_JSON_PATH, 'r', encoding='utf-8') as in_file:
        jsonData = json.load(in_file)
    simpleJson = []
    parseData()

    json_path = os.path.join(args.OUTPUT_DIRECTORY, 'clean_hangoutsData.json')
    with open(json_path, 'w', encoding='utf-8') as write_file:
        json.dump(simpleJson, write_file, indent=4)

    # The name-length limit is a property of the output filesystem, so query it once.
    # NOTE: os.statvfs is only available on Unix.
    name_max = os.statvfs(args.OUTPUT_DIRECTORY).f_namemax
    for chat in simpleJson:
        filename = ', '.join(getName(i['id'], chat) for i in chat['participants']) + '.txt'
        # A path separator inside a participant name would point outside the directory.
        filename = filename.replace(os.sep, '_')
        # f_namemax is a byte limit, so measure the encoded name; hash names
        # that are too long.  UTF-8 keeps non-ASCII names from raising.
        if len(filename.encode('utf-8')) > name_max:
            filename = hashlib.sha256(filename.encode('utf-8')).hexdigest() + '.txt'
        # NOTE: chats with identical participant names map to the same file;
        # mode 'w' means the later chat overwrites the earlier one.
        with open(os.path.join(args.OUTPUT_DIRECTORY, filename), 'w', encoding='utf-8') as outtext:
            for msg in chat['messages']:
                # fromtimestamp() without tz renders the time in local time.
                stamp = datetime.datetime.fromtimestamp(msg['unixtime']).strftime('%Y-%m-%d %H:%M:%S')
                outtext.write(stamp + ' ' + msg['sender']['name'] + ': ' + msg['content'] + '\n')