diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..bcf5897 --- /dev/null +++ b/.gitignore @@ -0,0 +1,344 @@ +test_data/*.BAK +test_data/calls-gvoice.xml +test_data/sms-gvoice.xml +test_data/sms-vm-gvoice.xml +test_data/sms-chat.xml +test_data/contacts.json + +venv + +# Created by https://www.toptal.com/developers/gitignore/api/emacs +# Edit at https://www.toptal.com/developers/gitignore?templates=emacs + +### Emacs ### +# -*- mode: gitignore; -*- +*~ +\#*\# +/.emacs.desktop +/.emacs.desktop.lock +*.elc +auto-save-list +tramp +.\#* + +# Org-mode +.org-id-locations +*_archive + +# flymake-mode +*_flymake.* + +# eshell files +/eshell/history +/eshell/lastdir + +# elpa packages +/elpa/ + +# reftex files +*.rel + +# AUCTeX auto folder +/auto/ + +# cask packages +.cask/ +dist/ + +# Flycheck +flycheck_*.el + +# server auth directory +/server/ + +# projectiles files +.projectile + +# directory configuration +.dir-locals.el + +# network security +/network-security.data + + +# End of https://www.toptal.com/developers/gitignore/api/emacs + +# Created by https://www.toptal.com/developers/gitignore/api/python +# Edit at https://www.toptal.com/developers/gitignore?templates=python + +### Python ### +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ + +### Python Patch ### +# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration +poetry.toml + +# ruff +.ruff_cache/ + +# LSP config files +pyrightconfig.json + +# End of https://www.toptal.com/developers/gitignore/api/python + +# Created by https://www.toptal.com/developers/gitignore/api/gradle +# Edit at https://www.toptal.com/developers/gitignore?templates=gradle + +### Gradle ### +.gradle +**/build/ +!src/**/build/ + +# Ignore Gradle GUI config +gradle-app.setting + +# Avoid ignoring Gradle wrapper jar file (.jar files are usually ignored) +!gradle-wrapper.jar + +# Avoid ignore Gradle wrappper properties +!gradle-wrapper.properties + +# Cache of project +.gradletasknamecache + +# Eclipse Gradle plugin generated files +# Eclipse Core +.project +# JDT-specific (Eclipse Java Development Tools) +.classpath + +### Gradle Patch ### +# Java heap dump +*.hprof + +# End of https://www.toptal.com/developers/gitignore/api/gradle + +# Created by https://www.toptal.com/developers/gitignore/api/eclipse +# Edit at 
https://www.toptal.com/developers/gitignore?templates=eclipse + +### Eclipse ### +.metadata +bin/ +tmp/ +*.tmp +*.bak +*.swp +*~.nib +local.properties +.settings/ +.loadpath +.recommenders + +# External tool builders +.externalToolBuilders/ + +# Locally stored "Eclipse launch configurations" +*.launch + +# PyDev specific (Python IDE for Eclipse) +*.pydevproject + +# CDT-specific (C/C++ Development Tooling) +.cproject + +# CDT- autotools +.autotools + +# Java annotation processor (APT) +.factorypath + +# PDT-specific (PHP Development Tools) +.buildpath + +# sbteclipse plugin +.target + +# Tern plugin +.tern-project + +# TeXlipse plugin +.texlipse + +# STS (Spring Tool Suite) +.springBeans + +# Code Recommenders +.recommenders/ + +# Annotation Processing +.apt_generated/ +.apt_generated_test/ + +# Scala IDE specific (Scala & Java development for Eclipse) +.cache-main +.scala_dependencies +.worksheet + +# Uncomment this line if you wish to ignore the project description file. +# Typically, this file would be tracked if it contains build/dependency configurations: +#.project + +### Eclipse Patch ### +# Spring Boot Tooling +.sts4-cache/ + +# End of https://www.toptal.com/developers/gitignore/api/eclipse diff --git a/.project b/.project new file mode 100644 index 0000000..2665a63 --- /dev/null +++ b/.project @@ -0,0 +1,17 @@ + + + gvoice-sms-takeout-xml + + + + + + org.python.pydev.PyDevBuilder + + + + + + org.python.pydev.pythonNature + + diff --git a/.pydevproject b/.pydevproject new file mode 100644 index 0000000..2b04565 --- /dev/null +++ b/.pydevproject @@ -0,0 +1,5 @@ + + + Default + python interpreter + diff --git a/DATA_EXAMPLES.md b/DATA_EXAMPLES.md new file mode 100644 index 0000000..f111f6c --- /dev/null +++ b/DATA_EXAMPLES.md @@ -0,0 +1,496 @@ +# Data examples +These snippets of data are slightly trimmed down and "pretty formatted" examples +from either the Google Takeout HTML files or the SMS Backup and Restore back up files. 
+You don't need to look at any of this to use the script. +This is mostly put here for my own reference as I worked through cases. +It is not a complete set of all the possible variants. + +## A single text SMS (from Takeout) +``` + + +Susie Glow + + +
+
+ Sep 1, 2023, 7:29:20 PM Pacific Time: + + + Susie Glow + + : + What is your favorite color? +
+ +
+ Sep 1, 2023, 8:27:36 PM Pacific Time: + + + Me + + : + Oh, I cannot decide +
+
+ +
Labels: + , + +
+
User Deleted: False
+ + +``` +## A group message with an image attachment, one user not in contacts (from Takeout) +``` + + +Group Conversation + + +
+
Group conversation with: + + + Joe Blow + + , + + + +17735559876 + + + + + Susie Glow + + +
+ +
+ Oct 1, 2023, 8:30:41 AM Pacific Time: + + + Susie Glow + + : + Do you like my hat? +
+ +
+ Oct 1, 2023, 11:44:12 AM Pacific Time: + + + Me + + : + I do. I do like your hat. Here's a picture. +
+ Image MMS Attachment +
+
+
+ +
Labels: + , + +
+
User Deleted: False
+ + +``` +## A vcard attachment (from Takeout) +``` +
+ Jul 13, 2021, 2:27:27 PM Pacific Time: + + + Susie Glow + + : + That person I told you about. +
+ Contact card attachment +
+
+``` +## A vcard attachment (from a backup file) +``` + +``` +## A received SMS (from a backup file) +``` + +``` +## A sent SMS (from a backup file) +``` + +``` +## A sent group MMS with an image attachment (from a backup file) +``` + + + + + + + + + + + + +``` +## The decoded SMIL text for the above MMS +``` + + + + + + + + + + + + + + + + + +``` +## A voicemail (from Takeout) +``` + + +Voicemail from Susie Glow + + + +
+ Call Log for + Voicemail from Susie Glow +
Voicemail from + + Susie Glow + +
+ May 16, 2023, 7:54:27 PM Pacific Time +Transcript: ... +
+ + (00:00:22) + +
Labels: + , + +
+
User Deleted: False
+
+ + + + + +Recorded call with Susie Glow + + +
+ Call Log for + Recorded call with Susie Glow +
Recorded call with + + Susie Glow + +
+ May 22, 2019, 11:18:56 AM Pacific Time + + + (00:29:46) + +
Labels: + , + , + +
+
User Deleted: False
+
+ + +``` +## An incoming call (from Takeout) +``` + + +Received call from + + +
+ Call Log for + Received call from +
Received call from + + + +
+ Nov 22, 2016, 2:27:58 PM Pacific Time + (00:00:04) + +
Labels: + +
+
User Deleted: False
+
+ + + + + +Placed call to Joe Blow + + +
+ Call Log for + Placed call to Joe Blow +
Placed call to + + Joe Blow + +
+ Jun 7, 2021, 2:10:27 PM Pacific Time + (00:02:27) + +
Labels: + +
+
User Deleted: False
+
+ + +``` +## An incoming call (from a backup file) +``` + +``` +## An outgoing call (from a backup file) +``` + +``` +## A group_info file (from Google Takeout for Google Chat) +``` +{ + "members": [ + { + "name": "Alan A Milne", + "email": "aamilne@authors.example.com", + "user_type": "Human" + }, + { + "name": "Maria S Curie", + "email": "mssc@science.example.org", + "user_type": "Human" + } + ] +} +``` +## A text message (from messages.json in Google Takeout for Google Chat) +``` + { + "creator": { + "name": "Maria S Curie", + "email": "mssc@science.example.org", + "user_type": "Human" + }, + "created_date": "Wednesday, April 15, 2015 at 12:51:41 AM UTC", + "text": "I\u0027ve heard of polymaths, but I think you might be the first demi-math", + "topic_id": "beJ6uN-Eb_A", + "message_id": "1pRI-QAAAAE/beJ6uN-Eb_A/beJ6uN-Eb_A" + }, +``` +## A JPEG attachment message (from messages.json in Google Takeout for Google Chat) +``` + { + "creator": { + "name": "Alan A Milne", + "email": "aamilne@authors.example.com", + "user_type": "Human" + }, + "created_date": "Monday, April 27, 2015 at 8:29:01 PM UTC", + "attached_files": [ + { + "original_name": "2015-04-27.jpg", + "export_name": "File-2015-04-27.jpg" + } + ], + "topic_id": "N-9_2Qw6DvE", + "message_id": "1pRI-QAAAAE/N-9_2Qw6DvE/N-9_2Qw6DvE" + }, +``` \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..7d02d38 --- /dev/null +++ b/LICENSE @@ -0,0 +1,24 @@ +BSD 2-Clause License + +Copyright (c) 2023, WJCarpenter + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +2. 
Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/README.md b/README.md index 98c0673..5b4e32f 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,399 @@ # gvoice-sms-takeout-xml -Convert Google Voice SMS data from Takeout to .xml suitable for use with SMS Backup and Restore +Convert Google Voice data and Google Chat data from Google Takeout to XML files suitable for use with SMS Backup and Restore. +Find this code repository at . +For a more detailed and user-friendly explanation of how to use this, see +. -This is a personal project from a few years back when Google switched Voice to Hangouts and I wanted to grab my old messages and get them into a usable format. It worked at the time; I don't know if it works as-is, but I'm planning on some testing in the near term to get it functional. +Google Takeout, +, +is a tool provided by Google for downloading various kinds of data associated with your Google account. +In this case, it's data from Google Voice or Google Chat or both. 
+It's exported as a ZIP file containing several individual HTML files and some other file types. +Although the HTML files, like any HTML files, exhibit a certain structure, +the actual format used by Google Takeout is not documented. +(At least, I have not been able to find any documentation. +I'd be glad to be proven wrong, so if you know of some documentation, do please let me know.) +The structure is oriented toward viewing the information in a browser. +Consequently, pulling information out of those HTML files is just reverse engineering. +There are many special cases. +The script deals with all of the special cases that I know about, +but there could easily be more special cases that don't happen to show up in the data that I have to work with. -Input data is a folder of SMS .html files from Google Takeout. +SMS Backup and Restore, +, +is a popular Android app that can back up your phone's text messages and call history. +The backups are in XML format, +and the app gives you several choices for where to keep them. +The XML format is mostly -- but not completely -- documented. + +The script reads the files from Google Takeout and produces files in the XML format for SMS Backup and Restore. +The idea is that you then use those XML files to do a `restore` with the app. +That transfers your Google Voice and Google Chat history into your phone's native history. + +## This fork +This is a fork of , +which is itself a fork of . +Although I have made a massive number of changes, +so that it does not look much like the originals any more (perhaps it's not even recognizable), +I have kept the same repo name and script name for the sake of being easily found. +I'm grateful to those earlier authors for giving me a starting point, +but there's not much of their code left. + +This fork corrects several problems I ran into when using the original scripts. +I also added some significant additional features. + +## Apologia +Reverse engineering is a hazardous business. 
+There are already many special cases and oddities dealt with by the script. +There are undoubtedly more that I either didn't happen to encounter, +or that I didn't notice. +Google could at any time change the format of the Google Takeout files, +or (less likely) SMS Backup and Restore could change the requirements for backup files. +I welcome you to bring additional things like that to my attention, +though fixing them up is the usual freebie "best effort" sort of thing. +Undoubtedly, after some months or years, +I'll myself become a little hazy on the workings of the script, +and that might add some time. + +When asking questions or reporting issues, +the best evidence to give is the original data file that provokes the issue. +The script usually names a specific file that gives it a headache. +Sometimes the script will not know that it's misbehaving, +in which case you have a little detective work to do. +In the XML outputs, input file names are included as XML comments. +For the case of being unable to find referenced attachments, +it's probably some new quirk of the trial and error way the script has of figuring it out. +(There are some bugs/mistakes in the Google Takeout attachment file names +so that they also can't be found in the browser view of the collection.) +I don't need to see the actual attachment file (MP3 or JPEG or whatever), +but I do need to know what its exact filename is. + +You have these choices for reporting things: + +- Open a pull request with a code change. +If you do this, please limit the PR to a single thing to make it easy for me to review it. +(If you are an experienced python programmer, +you will probably be tempted to "fix up" my clumsy style. +That's OK with me, +but I'd rather those sorts of things came as their own PRs rather than intermingled with more substantive stuff.) +- Open an issue describing the problem. +See the GitHub repository link above. +Don't worry if you don't know exactly what's going on. 
+I just need enough information to figure it out. +- You can also post in the repository's discussion area. +That might be the best way to go if you are not sure you are really seeing a new problem. + +## How to use this script +You want to use Python 3 to run this, +and you may have to `pip install` some of the imported modules if you don't happen to already have them. +Even better would be to set up a python virtual environment, +and install the dependencies with `pip install -r requirements.txt`. +If you don't know what some of that means, +contact the nearest smart alecky kid and get them to help you. + +- Save sms.py in some convenient location. Let's call that location `/some/bin/sms.py`. +It is a python script that requires Python 3. +- Use Google Takeout to download Google Voice or Google Chat messages or both. +That will give you a file named `takeout-`_something-something_`.zip`. +- Unpack that ZIP file in some convenient location. Let's call that location `/someplace/t/`. +- The Google Voice files will be in a directory `Takeout/Voice/Calls/`, aka `/someplace/t/Takeout/Voice/Calls/`. +- The Google Chat files will be in a directory `Takeout/Google Chat/Groups/`, aka `/someplace/t/Takeout/Google Chat/Groups/`. +- In a terminal window, go to directory `/someplace/t/Takeout/`. +- Run the python script, for example, `python /some/bin/sms.py` or `python3 /some/bin/sms.py`. +- If you get python errors, it is most likely because you are missing some of the imported modules. +Use `pip` to install them until python stops complaining. +- When the script starts running correctly, it will announce the locations of inputs and outputs and other helpful information. +- It can also emit warnings or TODO items. +Generally, any output lines prefixed with `>>` are just informational, +but pay attention to any output lines without that prefix. +- If the script sees problems in the information, it will report them to you. +See the information below about missing contacts. 
+Don't use the resulting output files until you are satisfied you have dealt with any reported problems. + +### Running a test +If you would like to try this with some test data to get comfortable with things, +head down to the `test_data` subdirectory. +There are instructions there for how to use that test data with your own phone. + +### Output files +The script produces four separate output files. + +- an "sms" file containing a combination of SMS and MMS messages based on Google Voice +(MMS messages are used for group conversations and for messages with attachments) +- a "calls" file containing call history records +- an "sms vm" file containing MMS messages for voicemails +(The voicemail recording is included as an attachment. +If there is a transcript, it is included as a text part of the MMS message. +A voicemail also creates a "missed call" record in the "calls" file, without the recording or transcript.) +- an "sms chat" file containing a combination of SMS and MMS messages based on Google Chat + +Why is there a separate file for voicemail MMS messages? +It's done that way in case you don't want to include those with the other SMS and MMS messages when you do the restore operation. +In fact you can pick and choose among any of the output files, depending on what you want to do. +SMS Backup and Restore will let you choose which files you want to use for `restore`. + +### Command line options + +The easiest way to use this script is as described above, +but there are optional command line arguments for changing various locations and files. +You can get the latest information about command line arguments by running the script with the single argument `-h` or `--help`. 
+``` +usage: sms.py [-h] [-d VOICE_DIRECTORY] [-e CHAT_DIRECTORY] + [-s SMS_BACKUP_FILENAME] [-v VM_BACKUP_FILENAME] + [-c CALL_BACKUP_FILENAME] [-t CHAT_BACKUP_FILENAME] + [-j CONTACTS_FILENAME] [-p {asis,configured,newest}] [-n] [-z] + +Convert Google Takeout HTML and Google Chat JSON files to SMS Backup and +Restore XML files. (Version 2023-12-02 16:20) + +options: + -h, --help show this help message and exit + -d VOICE_DIRECTORY, --voice_directory VOICE_DIRECTORY + The voice_directory containing the HTML files from + Google Voice. Defaults to "Voice/Calls". + -e CHAT_DIRECTORY, --chat_directory CHAT_DIRECTORY + The chat_directory containing the JSON files from + Google Chat. Defaults to "Google Chat/Groups". + -s SMS_BACKUP_FILENAME, --sms_backup_filename SMS_BACKUP_FILENAME + File to receive SMS/MMS messages from Google Voice. + Defaults to "../sms-gvoice.xml". + -v VM_BACKUP_FILENAME, --vm_backup_filename VM_BACKUP_FILENAME + File to receive voicemail MMS messages from Google + Voice. Defaults to "../sms-vm-gvoice.xml". + -c CALL_BACKUP_FILENAME, --call_backup_filename CALL_BACKUP_FILENAME + File to receive call history records from Google + Voice. Defaults to "../calls-gvoice.xml". + -t CHAT_BACKUP_FILENAME, --chat_backup_filename CHAT_BACKUP_FILENAME + File to receive SMS/MMS messages from Google Chat. + Defaults to "../sms-chat.xml". + -j CONTACTS_FILENAME, --contacts_filename CONTACTS_FILENAME + JSON formatted file of definitive contact name/number + pairs. Defaults to "../contacts.json". + -p {asis,configured,newest}, --number_policy {asis,configured,newest} + Policy for choosing the "best" number for a contact. + Defaults to "asis". + -n, --nanp_numbers Heuristically treat some partial numbers as North + American numbers. + -z, --dump_data Dump some internal tables at the end of the run, which + might help with sorting out some thing. + +All command line arguments are optional and have reasonable defaults when the +script is run from within "Takeout/". 
The contacts file is optional. Output +files should be named "sms-SOMETHING.xml" or "calls-SOMETHING.xml". See the +README at https://github.com/wjcarpenter/gvoice-sms-takeout-xml for more +information. +``` +When the script is printing a message for you and mentioning a file, +it gives the absolute path to the file. +That makes it a little more convenient if you want to go have a look at the file. +On the other hand, when the script is mentioning a file in an XML comment in an output file, +it might print an absolute or relative path, +depending on the value you supply (or the default) for the `--voice_directory` and `--chat_directory` arguments. +If you don't know why you'd care about the distinction, +then you probably don't care. +Relative paths in the output files are very slightly more privacy-preserving +(but only slightly). + +### Missing contacts +In the Google Takeout data, +there are some edge cases where it's impossible to figure out the contact phone number for a particular input file. +It's not too important for you to understand those edge cases, +but the script works hard to deal with them. + +Two main techniques are used. +- First, the script notices name-to-number mappings as it encounters them in HTML files, +so it might be able to figure it out automatically. +- Second, if the script can't figure it out automatically, +it emits a "TODO" message asking you to add an entry to a JSON file and re-run. +If you don't see any TODO messages (most people will not), then the script figured everything out. + +JSON files from Google Chat don't contain phone numbers, +so any discovery of phone numbers comes from the HTML files from Google Voice. +The Google Voice files are processed first, +so any discovered contact numbers can be used when processing the Google Chat files. +Google Chat files contain a contact name and a contact email address. 
+A phone number for either is sufficient, +whether configured in the JSON contacts file or discovered in the Google Voice HTML files. + +If the script didn't deal with the edge cases, +it would be possible to see things either mapped to the number "0000000000" or without any number at all +(which will show up as something like "Unknown caller") instead of being mapped to the correct contact. + +SMS Backup and Restore is pretty good at duplicate detection during restore operations, +but if you make a mistake and have things ending up in "0000000000" or "Unknown caller" or other strange places, +delete those entire SMS/MMS conversations from your phone, +fix up your run of this script, +and restore again. +Otherwise, a lot of MMS attachments will be detected as duplicates and will never restore properly. +(I had 300-400 of those out of 25,000 SMS/MMS messages, and it was a big puzzle to figure it out.) + +Here are some examples of the kinds of TODO messages you might see: +``` +Unfortunately, we can't figure out your own phone number. +TODO: Missing +phonenumber for contact: "Me": "+", + +TODO: Missing or disallowed +phonenumber for contact: "Agatha M Christie": "+", + due to File: "/home/wjc/git/gvoice-sms-takeout-xml/test_data/Takeout/Voice/Calls/Agatha M Christie - Text - 2023-10-22T17_28_34Z.html" + +TODO: Missing contact phone number in HTML file. Using '0000000000'. + due to File: "/home/wjc/git/gvoice-sms-takeout-xml/test_data/Takeout/Voice/Calls/ - Placed - 2013-07-29T20_56_11Z.html" + +TODO: Missing or disallowed +phonenumber for contact: "F Scott Fitzgerald": "+", +TODO: and Missing or disallowed +phonenumber for contact: "fskf@authors.example.com": "+", + due to File: "/home/wjc/git/gvoice-sms-takeout-xml/test_data/Takeout/Google Chat/Groups/Space AAAAdvGdRgs/group_info.json" +``` +The TODO for `Me` is a special case. 
+The script couldn't figure out your phone number (which it usually can do), +so you have to provide it via that fake entry in the JSON file. +The TODO for the 3rd item above, +where "0000000000" is used instead, +usually indicates some kind of glitch in the indicated input file. +Have a look at it. +If it's not too important to you, probably the simplest course is to delete that input file. +Because Google Chat contacts can be resolved from either the email address or the contact name, +the last item above is an example where there is not a number for either one. +Adding a configured number to the JSON contacts file for either the email address or the name will take care of it. + +If you get any of those other messages, add entries for those contacts into the JSON file. +Obviously, create that file if you haven't done so earlier. +You can probably copy and paste the end of the TODO line and just supply the missing phone number. +That will give you something like: +``` +{ + "Me": "+17323210011", + "Agatha M Christie": "+17323211111", + "fskf@authors.example.com": "+17323215555" +} +``` +Add the contact name exactly as shown in the TODO message. +Contact names, including `Me`, are case-sensitive. +Don't forget to include the `+` and the country code with the phone number +(and no other punctuation ... just the `+` and digits). +The order of items in that file doesn't matter, but the python JSON parser requires a comma after each item except the final one. +It also insists on the use of double quotes (not single quotes) for all of the items. +Rerun the script until you get no TODO reports about missing contact phone numbers and no other errors. + +You can now use the resulting output files as backup files to be restored with the SMS Backup and Restore app. + +### Aliases and preferred numbers +The optional JSON contacts file has a simplistic mechanism for aliases for both contact names and contact numbers. 
+In addition to providing entries as seen in the previous section to get from a name to a number, +you can also provide an alias for a contact name with an entry like this: +``` +{ + "Pelé": "Edson Arantes do Nascimento" +} +``` +Likewise, you can provide an alias for a contact number with an entry like this: +``` +{ + "+12123214444": "+15703214444" +} +``` +The script distinguishes these from the name-to-number mappings by recognizing numbers by pattern +(all digits with an optional leading `+`). + +Finally, you can configure multiple phone numbers for a contact name by using an entry like this: +``` +{ + "Edson Arantes do Nascimento": ["+17323214444", "+15703214444"] +} +``` +Depending on the number policy (described below), +the first number in the list can be considered "preferred". +### Conflicting contact names and numbers +You might also see some informational notices about conflicting numbers for contacts. +This can happen if one of your contacts (or you) has multiple phone numbers, +including having changed phone numbers over time. +The phone number is the thing that matters in the backup files, +so you probably don't have to do anything about these. +If you wanted to go to a lot of trouble, +you could edit the HTML files to change the conflicting number to the one you prefer for that contact. +If you have all of the conflicting numbers in your phone contact records, +things will work out without your needing to do anything. +If you don't have one of those numbers for the contact, +then the record will show up on your phone as just the phone number and no contact name. + +Why can't we just take care of this? +Well, the way things are stored on your phone is with separate +databases for contacts, messages, and calls. +We're only updating the messages and calls. +We're not touching the contacts, +so we can't add numbers to them. +It's the phone numbers in the messages and calls that tie things together. 
+ +Here is an example of this kind of informational message: +``` +>> Info: conflicting information about "Edson Arantes do Nascimento": ['+17323214444'] '+15703214444' +>> due to File: "/home/wjc/git/gvoice-sms-takeout-xml/test_data/Takeout/Voice/Calls/Edson Arantes do Nascimento - Voicemail - 2016-05-01T00_16_43Z.html" +>> Info: conflicting information about "Edson Arantes do Nascimento": ['+17323214444', '+15703214444'] '+12123214444' +>> due to File: "/home/wjc/git/gvoice-sms-takeout-xml/test_data/Takeout/Voice/Calls/Edson Arantes do Nascimento - Voicemail - 2014-05-17T02_54_27Z.html" +``` +To keep the noise down, +there will be at most one such message for any newly discovered conflicting numbers for any given contact. +In other words, for a contact with N different phone numbers in the HTML files, +you would expect to see N-1 informational messages about conflicts. +The number outside the `[brackets]` is the most recently seen number, +and the file named on the next line is where that number was first seen. + +### Contact number replacement policies +If a given contact name has a single contact number, +either configured in the JSON contacts file or discovered in the HTML input files, +there is no ambiguity. +That unique number will be used for that contact throughout the output files. + +In cases where there are multiple numbers for the same contact name, +you can specify what you want to do about it, +which can affect how conversations appear when you restore the backup files. +This is called the number replacement policy or simply the number policy. +Here are the possibilities: +- `asis`: (Default) When a given number is found along with that contact name, +then that number is used in the output for that specific case. +If the contact name is found separately from the contact number, +then the `newest` contact number will be used in the output. 
+- `newest`: All contact numbers for the contact name are replaced with the newest contact number, +where "newest" means appearing in an HTML file with the most recent message timestamp. +Any contact numbers mentioned in the JSON contacts file are considered to be "newer" +than any numbers discovered in HTML files. +If the contact name in the JSON file has multiple numbers, +they are assumed to be listed in reverse chronological order +(so the first one is the "newest" and will be used). +- `configured`: Similar to `asis`, +except that only contact numbers from the JSON configuration file will be used. +Contact numbers discovered in HTML files will not be used and will generate `TODO` outputs. + +Of the above policies, `asis` is the simplest to use. +`configured` is the most strict, but -- along with contact number aliases -- gives the finest control. +Here is an example of a message you might see if you use the `configured` policy but have no entry for a given contact. +The number was found in an HTML file, but it's disallowed by policy. +``` +TODO: Missing or disallowed +phonenumber for contact: "Søren Aabye Kierkegaard": "+17323211414", + due to File: "/home/wjc/git/gvoice-sms-takeout-xml/test_data/Takeout/Voice/Calls/Group Conversation - 2023-10-01T15_30_41Z.html" +``` +### NANP heuristics for phone numbers +Over the years, various applications and phones have been pretty lenient with me in how my contact phone numbers are formatted. +I'm in the US, and most of my contact phone numbers are fully formed with a leading `+1` before the area code. +A few, however, have only the `+` (without the `1`), +and a few don't have the `+1` at all. +It can be tedious to create aliases for all those combinations in the contacts JSON file, +so there is a command line flag to apply "NANP heuristics" to phone numbers. +(NANP is North American Numbering Plan, which is the system used by many telephone systems in North America.) 
+- If there is a `+` and exactly 10 digits, the `+` is changed to `+1`. +This will be incorrect for some number of non-US phone numbers that properly include a country code other than `1`. +- If there is a `1` and exactly 10 additional digits, the `1` is changed to `+1`. +I have mixed feelings about providing this US-centric (actually, North America centric) feature. +An alternative to this would be fixing up your Google Contacts to have phone numbers with fully formatted country codes before exporting data with Google Takeout. +If you are in the middle of moving things with this script, +you could fix up your contacts and use the heuristic when converting the data. + +### Dumping runtime data +There is a command line option, `-z`, +to have the script dump out some internal tables at the end of the run. +This can be helpful in sorting out data problems. +It can otherwise be tedious to look through various input files to try to figure out where things have gone sideways. +This info is not dumped by default because most people will not need it. +It's there as an additional concise source of information if you need it. + +NOTE: The dumped data tends to use single quote marks. +If you copy and paste that into the JSON contacts file, be sure to switch them to double quote marks. +(It's not my fault.) 
diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..e9cb397 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,11 @@ +appdirs==1.4.4 +beautifulsoup4==4.12.2 +bs4==0.0.1 +dateutils==0.6.12 +fs==2.4.16 +isodate==0.6.1 +phonenumbers==8.13.25 +python-dateutil==2.8.2 +pytz==2023.3.post1 +six==1.16.0 +soupsieve==2.5 diff --git a/sms.py b/sms.py index a2e36e8..61664fa 100644 --- a/sms.py +++ b/sms.py @@ -1,243 +1,1446 @@ -from bs4 import BeautifulSoup +from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning +import warnings +warnings.filterwarnings('ignore', category=MarkupResemblesLocatorWarning) +from bs4.element import Comment import re import os +from fs.path import basename import phonenumbers import dateutil.parser -import time, datetime +import datetime from calendar import timegm -import warnings import base64 -from io import open # adds emoji support +from io import open +import json +import isodate +import argparse +from operator import itemgetter +import pprint + +__updated__ = "2023-12-06 14:19" + +sms_backup_file = None +call_backup_file = None +vm_backup_file = None +chat_backup_file = None +takeout_voice_directory = os.path.join('Voice', 'Calls') +takeout_chat_directory = os.path.join('Google Chat', 'Groups') + +# contacts dict by name +# each value is a list +# each item in the list is a timestamped_number: (phone number, timestamp) +# for contacts read from the file, the timestamp is artificially sometime in the future +# if the read contact already has a list of numbers, they are assumed to be in ascending order of preference +# for discovered contacts, it's the timestamp of the message where we disovered the contact +# if we discover the same contact again, we update the timestamp if it's later then the one we know +# the head of the list is kept pointing at the timestamped_number with the latest timestamp +# +# phone number replacement policies: +# as-is: take whatever number is in the file +# latest: swap with the 
chronologically latest number we know +# selective: swap as configured +# acceptable: listed numbers as-is, all others swapped with preferred + + +contacts_oracle = None + +# this is for some internal bookkeeping; you don't need to do anything with it. +missing_contacts = set() +conflicting_contacts = dict() + +# some global counters for a stats summary at the end +counters = { + 'number_of_voice_sms_output': 0, + 'number_of_chat_sms_output': 0, + 'number_of_calls_output': 0, + "number_of_vms_output": 0, + "conflict_warnings": 0, + "todo_errors": 0, + "number_of_discovered_contacts": 0, + } + +# I really don't like globals, but there are just too many things to tote around in all these function calls. +phone_number_from_filename = None +contact_name_from_filename = None +phone_number_from_html_title = None +contact_name_from_html_title = None +html_elt = None -me = '+11111111111' # enter phone number +# This number is used a couple of places where we can't figure out the real number. +# If you want to manually fix things up, you should be able to easily search for it in +# either the inputs or the outputs. +BOGUS_NUMBER = "0000000000" -sms_backup_filename = "./gvoice-all.xml" -print('New file will be saved to ' + sms_backup_filename) +ATTACHMENT_TYPE_IMAGE = "image" +ATTACHMENT_TYPE_AUDIO = "audio" +ATTACHMENT_TYPE_VCARD = "vcard" + +POLICY_ASIS = "asis" +POLICY_NEWEST = "newest" +POLICY_CONFIGURED = "configured" +# My convention is to use a relative filename when emitting into the XML +# and an absolute filename when printing a message for the person running the script. def main(): - print('Checking directory for *.html files') - num_sms = 0 - root_dir = '.' + global sms_backup_file, vm_backup_file, call_backup_file, chat_backup_file + global contacts_oracle + global html_elt + + # This file is *optional* unless you get an error message asking you to add entries to it. 
+ contacts_filename = os.path.join('..', 'contacts.json') + # SMS Backup and Restore likes to notice filenames that start with "sms-" or "calls-". + # Save them to the parent directory so they are not lost if the Tajkeout file is + # redone. The parent directory is the one that contains "Takeout". The script + # expects to be run from within the "Takeout" directory by default. + sms_backup_filename = os.path.join('..', 'sms-gvoice.xml') + call_backup_filename = os.path.join('..', 'calls-gvoice.xml') + vm_backup_filename = os.path.join('..', 'sms-vm-gvoice.xml') + chat_backup_filename = os.path.join('..', 'sms-chat.xml') + number_policy = POLICY_ASIS + nanp_heuritstics = False + dump_data = False - for subdir, dirs, files in os.walk(root_dir): - for file in files: - sms_filename = os.path.join(subdir, file) + description = f'Convert Google Takeout HTML and Google Chat JSON files to SMS Backup and Restore XML files. (Version {__updated__})' + epilog = ('All command line arguments are optional and have reasonable defaults when the script is run from within "Takeout/". ' + 'The contacts file is optional. ' + 'Output files should be named "sms-SOMETHING.xml" or "calls-SOMETHING.xml". ' + "See the README at https://github.com/wjcarpenter/gvoice-sms-takeout-xml for more information.") + argparser = argparse.ArgumentParser(description=description, epilog=epilog) - try: - sms_file = open(sms_filename, 'r', encoding="cp850") - except FileNotFoundError: - continue + argparser.add_argument('-d', '--voice_directory', + default="Voice/Calls", + help=f"The voice_directory containing the HTML files from Google Voice. Defaults to \"{takeout_voice_directory}\".") + argparser.add_argument('-e', '--chat_directory', + default="Google Chat/Groups", + help=f"The chat_directory containing the JSON files from Google Chat. 
Defaults to \"{takeout_chat_directory}\".") + + argparser.add_argument('-s', '--sms_backup_filename', + default=sms_backup_filename, + help=f"File to receive SMS/MMS messages from Google Voice. Defaults to \"{sms_backup_filename}\".") + argparser.add_argument('-v', '--vm_backup_filename', + default=vm_backup_filename, + help=f"File to receive voicemail MMS messages from Google Voice. Defaults to \"{vm_backup_filename}\".") + argparser.add_argument('-c', '--call_backup_filename', + default=call_backup_filename, + help=f"File to receive call history records from Google Voice. Defaults to \"{call_backup_filename}\".") + argparser.add_argument('-t', '--chat_backup_filename', + default=chat_backup_filename, + help=f"File to receive SMS/MMS messages from Google Chat. Defaults to \"{chat_backup_filename}\".") + + argparser.add_argument('-j', '--contacts_filename', + default=contacts_filename, + help=f"JSON formatted file of definitive contact name/number pairs. Defaults to \"{contacts_filename}\".") + argparser.add_argument('-p', '--number_policy', + default=number_policy, + choices=(POLICY_ASIS, POLICY_CONFIGURED, POLICY_NEWEST), + help=f"Policy for choosing the \"best\" number for a contact. 
Defaults to \"{number_policy}\".") + argparser.add_argument('-n', '--nanp_numbers', + action='store_true', + help=f"Heuristically treat some partial numbers as North American numbers.") + argparser.add_argument('-z', '--dump_data', + action='store_true', + help=f"Dump some internal tables at the end of the run, which might help with sorting out some thing.") + + args = vars(argparser.parse_args()) - if(os.path.splitext(sms_filename)[1] != '.html'): - # print(sms_filename,"- skipped") - continue + sms_backup_filename = args['sms_backup_filename'] + vm_backup_filename = args['vm_backup_filename'] + call_backup_filename = args['call_backup_filename'] + voice_directory = args['voice_directory'] + chat_directory = args['chat_directory'] + contacts_filename = args['contacts_filename'] + number_policy = args['number_policy'] + nanp_heuritstics = args['nanp_numbers'] + dump_data = args['dump_data'] - print('Processing ' + sms_filename) + contacts_oracle = ContactsOracle(contacts_filename, number_policy, nanp_heuritstics) + prep_output_files(sms_backup_filename, vm_backup_filename, call_backup_filename, chat_backup_filename) + + print('>> 1st pass reading *.html files under', get_aka_path(voice_directory)) + # We make two passes over the HTML files. The first pass is merely to gather contact info so that + # we have the complete picture before starting the second ("real") pass. That's so that we can + # correctly apply phone number replacement policies for all of the HTML files. It's true that + # some of the policies do not require this complete picture, and we could greatly reduce the files + # processed in the second pass (maybe even not needing a second pass), but that clutters up the + # logic quite a bit. Since this is a one-time migration, efficiency is not that critical and we + # accept the inefficiency for some number_policy cases. 
+ for subdirectory, __, files in os.walk(voice_directory): + for html_basename in files: + html_target = (subdirectory, html_basename) + process_one_voice_file(True, html_target) - is_group_conversation = re.match(r'(^Group Conversation)', file) + with (open(sms_backup_filename, 'w') as sms_backup_file, + open(vm_backup_filename, 'w') as vm_backup_file, + open(call_backup_filename, 'w') as call_backup_file, + open(chat_backup_filename, 'w') as chat_backup_file): + + write_dummy_headers() + + me_contact_number = contacts_oracle.get_number_by_name('Me', None) + if not me_contact_number: + print() + print("Unfortunately, we can't figure out your own phone number.") + print('TODO: Missing or disallowed +phonenumber for contact: "Me": "+",') + counters['todo_errors'] += 1 + missing_contacts.add('Me') + else: + print(f">> Your 'Me' phone number is {me_contact_number}") - soup = BeautifulSoup(sms_file, 'html.parser') + print('>> 2nd pass reading *.html files under', get_aka_path(voice_directory)) + # second pass over GV files + for subdirectory, __, files in os.walk(voice_directory): + for html_basename in files: + html_target = (subdirectory, html_basename) + process_one_voice_file(False, html_target) - messages_raw = soup.find_all(class_='message') + print('>> Reading chat files under', get_aka_path(chat_directory)) + for subdirectory, __, __ in os.walk(chat_directory): + process_one_chat_directory(me_contact_number, subdirectory) - num_sms += len(messages_raw) + write_trailers() + + # we have to reopen the files with a different mode for this + write_real_headers(sms_backup_filename, vm_backup_filename, call_backup_filename, chat_backup_filename) + print_counters(contacts_filename, sms_backup_filename, vm_backup_filename, call_backup_filename, chat_backup_filename) + if dump_data: + contacts_oracle.dump() - if is_group_conversation: - participants_raw = soup.find_all(class_='participants') - write_mms_messages(participants_raw, subdir, messages_raw) - else: - 
write_sms_messages(file, subdir, messages_raw) - - - sms_backup_file = open(sms_backup_filename, 'a') - sms_backup_file.write(u'') - sms_backup_file.close() - - write_header(sms_backup_filename, num_sms) - -def write_sms_messages(file, subdir, messages_raw): - fallback_number = 0 - title_has_number = re.search(r"(^\+*[0-9]+)", file) - if title_has_number: - fallback_number = title_has_number.group() - - sms_values = {'participants' : get_first_phone_number(messages_raw, fallback_number)} - - sms_backup_file = open(sms_backup_filename, 'a') - for i in range(len(messages_raw)): - sms_values['type'] = get_message_type(messages_raw[i]) - sms_values['message'] = get_message_text(messages_raw[i]) - sms_values['time'] = get_time_unix(messages_raw[i]) - sms_text = (' \n' % sms_values) - sms_backup_file.write(sms_text) - write_img_attachment(messages_raw[i],subdir,sms_backup_file,sms_values) - - sms_backup_file.close() - -def write_img_attachment(message,subdir,sms_backup_file,mms_values,participants=None): - - #img = re.search(r" \n' - ' \n' - ' \n' - ' \n' - ' \n' - '%(participants_xml)s' - ' \n' - ' \n' % mms_values) - - img_file.close() - sms_backup_file.write(mms_text) - return True +def process_one_chat_directory(me_contact_number, subdirectory): + participants = process_chat_group_info(me_contact_number, subdirectory) + process_chat_messages(me_contact_number, subdirectory, participants) + +def process_chat_group_info(me_contact_number, subdirectory): + group_info_basename = "group_info.json" + group_info_filename = os.path.join(subdirectory, group_info_basename) + json_target = (subdirectory, group_info_basename) + if not os.path.exists(group_info_filename): + return None + + with open(group_info_filename, "r") as fp: + parsed_group_info = json.load(fp) + + me_in_participants = False + participants = list() + for member in parsed_group_info["members"]: + email = member["email"] + email_number = contacts_oracle.get_number_by_name(email, None) + name = member["name"] 
+ name_number = contacts_oracle.get_number_by_name(name, None) + if not email_number and not name_number: + if email not in missing_contacts or name not in missing_contacts: + print() + print(f'TODO: Missing or disallowed +phonenumber for contact: "{name}": "+",') + print(f'TODO: and Missing or disallowed +phonenumber for contact: "{email}": "+",') + print(f' due to File: "{get_abs_path(json_target)}"') + counters['todo_errors'] += 1 + missing_contacts.add(name) + missing_contacts.add(email) else: - return False + if email_number and name_number and email_number != name_number: + print(f'>> Info: conflicting information for email {email}: {email_number} versus name {name}: {name_number}. Using {name_number}.') + print(f'>> due to File: "{get_abs_path(json_target)}"') + else: + if name_number: + if contacts_oracle.is_me_number(name_number): + me_in_participants = True + else: + participants.append(name_number) + else: + if contacts_oracle.is_me_number(email_number): + me_in_participants = True + else: + participants.append(email_number) + + if not me_in_participants: + print(f'>> Info: Chat participants list does not include the "Me" phone number {me_contact_number}: {participants}') + print(f'>> due to File: "{get_abs_path(json_target)}"') + + return participants + +def process_chat_messages(me_contact_number, subdirectory, participants): + messages_basename = "messages.json" + messages_filename = os.path.join(subdirectory, messages_basename) + json_target = (subdirectory, messages_basename) + if not os.path.exists(messages_filename): + return None + + with open(messages_filename, "r") as fp: + parsed_messages = json.load(fp) + + attachment_collisions = dict() + message_list = parsed_messages['messages'] + for message in message_list: + creator = message['creator'] + name = creator['name'] + sender_number = contacts_oracle.get_number_by_name(name, None) + if not sender_number: + email = creator['email'] + sender_number = contacts_oracle.get_number_by_name(email, 
None) + created_date = message['created_date'] + text = message.get('text', None) + attachment_list = list() # a list of tuples + attachments = message.get('attached_files', None) + if attachments: + for attachment in attachments: + original_name = attachment['original_name'] + export_name = attachment['export_name'] + # undocumented fact, probably due to windows, not sure if all of these are treated this way + export_name = re.sub(r'[%?<>"|*/\\:]', '_', export_name) + export_name_count = attachment_collisions.get(export_name, None) + root, ext = os.path.splitext(export_name) + root = root[:47] # another undocumented fact + if export_name_count is not None: + export_name_count += 1 + export_name_revised = f'{root}({int(export_name_count)}){ext}' + else: + export_name_count = 0 + export_name_revised = f'{root}{ext}' + attachment_collisions[export_name] = export_name_count + export_path_revised = os.path.join(subdirectory, export_name_revised) + attachment_list.append((original_name, export_path_revised)) + write_message_for_chat(json_target, me_contact_number, sender_number, participants, created_date, text, attachment_list) + +def write_message_for_chat(json_target, me_contact_number, sender_number, participants, created_date, the_text, attachment_list): + name_list = contacts_oracle.get_names_by_number(sender_number) + sent_by_me = (me_contact_number == sender_number) + if sent_by_me: + message_type = '2' else: - return False + message_type = '1' + timestamp = unix_time_ms_from_datetime(datetime_from_string(created_date)) + #attachment_elts = get_attachment_elts(message_elt) + parent_elt = BeautifulSoup() + parent_elt.append(bs4_get_file_comment(json_target)) + # if it was just an attachment with no text, there is no point in creating an empty SMS to go with it + if the_text and not attachment_list and len(participants) == 1: + for other_party_number in participants: + if other_party_number != me_contact_number: + break + bs4_append_sms_elt(parent_elt, 
other_party_number, timestamp, the_text, message_type) + else: + msgbox_type = message_type + bs4_append_mms_elt_with_parts_for_chat(parent_elt, json_target, attachment_list, the_text, sender_number, sent_by_me, timestamp, msgbox_type, participants) + chat_backup_file.write(parent_elt.prettify()) + chat_backup_file.write('\n') + counters['number_of_chat_sms_output'] += 1 +def process_one_voice_file(is_first_pass, html_target): + global html_elt + __, html_basename = html_target + if not html_basename.endswith('.html'): return -def write_mms_messages(participants_raw, subdir, messages_raw): - sms_backup_file = open(sms_backup_filename, 'a') + get_name_or_number_from_filename(html_basename) + with open(get_rel_path(html_target), 'r', encoding="utf-8") as html_file: + html_elt = BeautifulSoup(html_file, 'html.parser') + get_name_or_number_from_title() - participants = get_participant_phone_numbers(participants_raw) - mms_values = {'participants' : '~'.join(participants)} + if is_first_pass: + scan_vcards_for_contacts(html_target, html_elt.body) + return - participants.append(me) + # Need to be firm about mapping contact names to numbers! The contact_name_to_number() function will complain. 
+ if contact_name_from_html_title and not contact_name_to_number(html_target, contact_name_from_html_title): + return + if contact_name_from_filename and not contact_name_to_number(html_target, contact_name_from_filename): + return - for i in range(len(messages_raw)): - sender = get_mms_sender(messages_raw[i]) - sent_by_me = sender not in participants + tags_div = html_elt.body.find(class_='tags') + tag_elts = tags_div.find_all(rel='tag') + tag_values = set() + for tag_elt in tag_elts: + tag_value = tag_elt.get_text() + tag_values.add(tag_value) - mms_values['type'] = get_message_type(messages_raw[i]) - mms_values['message'] = get_message_text(messages_raw[i]) - mms_values['time'] = get_time_unix(messages_raw[i]) - mms_values['participants_xml'] = get_participants_xml(participants,sender,sent_by_me) - mms_values['msg_box'] = 2 if sent_by_me else 1 - mms_values['m_type'] = 128 if sent_by_me else 132 - - mms_text = (' \n' - ' \n' - ' \n' - ' \n' - ' \n' - '%(participants_xml)s' - ' \n' - ' \n' % mms_values) - - sms_backup_file.write(mms_text) - write_img_attachment(messages_raw[i],subdir,sms_backup_file,mms_values,participants) - - sms_backup_file.close() - -def get_participants_xml(participants,sender,sent_by_me): - participants_xml = u'' - for participant in participants: - participant_is_sender = participant == sender or (sent_by_me and participant == me) - participant_values = {'number': participant, 'code': 137 if participant_is_sender else 151} - participants_xml += (' \n' % participant_values) - return participants_xml - -def get_message_type(message): # author_raw = messages_raw[i].cite - author_raw = message.cite - if ( not author_raw.span ): - return 2 + if "Text" in tag_values: process_Text_from_html_file(html_target) + elif "Received" in tag_values: process_call_from_html_file(html_target, 1) + elif "Placed" in tag_values: process_call_from_html_file(html_target, 2) + elif "Missed" in tag_values: process_call_from_html_file(html_target, 3) + elif 
"Voicemail" in tag_values: process_Voicemail_from_html_file(html_target) + elif "Recorded" in tag_values: process_Voicemail_from_html_file(html_target) else: - return 1 + print(f"Unrecognized tag_value situation '{tag_values}'; silently ignoring file '{get_abs_path(html_target)}'") + +def process_Text_from_html_file(html_target): + # A single HTML file can contain arbitrarily many SMS or MMS messages. I don't *think* + # a single HTML file can have a mix of SMS and MMS since an HTML for MMS has a global + # "participants" list. + # MMS can be either with or without attachments. + message_elts = html_elt.find_all(class_='message') + participants_elt = html_elt.find(class_='participants') + + if participants_elt: + write_mms_messages(html_target, participants_elt, message_elts) + else: + write_sms_messages(html_target, message_elts) - return 0 +def process_Voicemail_from_html_file(html_target): + # For a voicemail, we write a call record and also an MMS record with the recording attached. + # The app doesn't like type 4 (voicemail) in a call record, so we emit type 3 (missed call), + # which is kinda sorta correct. 
+ process_call_from_html_file(html_target, 3) + write_mms_message_for_vm(html_target) -def get_message_text(message): - return BeautifulSoup(message.find('q').text,'html.parser').prettify(formatter='html').strip().replace('"',"'") +def process_call_from_html_file(html_target, call_type): + contributor_elt = html_elt.body.find(class_="contributor") + tel_elt = contributor_elt.find(class_="tel") + telephone_number_full = tel_elt.attrs['href'] + telephone_number_suffix = telephone_number_full[4:] + if not telephone_number_suffix: + presentation = '2' + else: + presentation = '1' + telephone_number = format_number(html_target, telephone_number_suffix) -def get_mms_sender(message): - return format_number(phonenumbers.parse(message.cite.a['href'][4:], None)) + published_elt = html_elt.body.find(class_="published") + readable_date = published_elt.get_text().replace("\r"," ").replace("\n"," ") + iso_date = published_elt.attrs['title'] + timestamp = get_time_unix_ms(html_elt.body) + duration_elt = html_elt.find(class_="duration") + if not duration_elt: + duration = 0 + else: + iso_duration = duration_elt.attrs['title'] + duration = isodate.parse_duration(iso_duration) + duration = round(datetime.timedelta.total_seconds(duration)) + write_call_message(html_target, telephone_number, presentation, duration, timestamp, call_type, readable_date) -def get_first_phone_number(messages, fallback_number): - # handle group messages - for author_raw in messages: - if (not author_raw.span): - continue +def contact_name_to_number(html_target, contact_name): + if not contact_name: + print(f"TODO: We can't figure out the contact name or number from an HTML file. 
Using '{BOGUS_NUMBER}'.") + print(f' due to File: "{get_abs_path(html_target)}"') + return BOGUS_NUMBER + contact_number = contacts_oracle.get_number_by_name(contact_name, None) + if not contact_number and not contact_name in missing_contacts: + print() + print(f'TODO: Missing or disallowed +phonenumber for contact: "{contact_name}": "+",') + print(f' due to File: "{get_abs_path(html_target)}"') + counters['todo_errors'] += 1 + # we add this fake entry to a dictionary so we don't keep complaining about the same thing + missing_contacts.add(contact_name) + return contact_number + +def get_sender_number_from_title_or_filename(html_target): + if phone_number_from_html_title: + sender = phone_number_from_html_title + elif contact_name_from_html_title: + sender = contact_name_to_number(html_target, contact_name_from_html_title) + elif phone_number_from_filename: + sender = phone_number_from_filename + elif contact_name_from_filename: + sender = contact_name_to_number(html_target, contact_name_from_filename) + else: + sender = None + return sender - sender_data = author_raw.cite +def write_call_message(html_target, telephone_number, presentation, duration, timestamp, call_type, readable_date): + parent_elt = BeautifulSoup() + parent_elt.append(bs4_get_file_comment(html_target)) + bs4_append_call_elt(parent_elt, telephone_number, duration, timestamp, presentation, readable_date, call_type) + call_backup_file.write(parent_elt.prettify()) + call_backup_file.write('\n') + counters['number_of_calls_output'] += 1 - try: - phone_number = phonenumbers.parse(sender_data.a['href'][4:], None) - except phonenumbers.phonenumberutil.NumberParseException: - return sender_data.a['href'][4:] +def write_sms_messages(html_target, message_elts): + other_party_number = None + # Since the "address" element of an SMS is always the other end, scan the + # message elements until we find a number this not "Me". Use that as the + # address value for all of the SMS files in this HTML. 
+ for message_elt in message_elts: + if other_party_number: + break + other_party_number = scan_vcards_for_contacts(html_target, message_elt) - return format_number(phone_number) + # This will be the case if the HTML file contains only a single SMS + # that was sent by "Me". Use fallbacks. + if not other_party_number: + other_party_number = get_sender_number_from_title_or_filename(html_target) - # fallback case, use number from filename - if (fallback_number == 0 or len(fallback_number) < 7): - return fallback_number + for message_elt in message_elts: + the_text = get_message_text(message_elt) + message_type = get_message_type(message_elt) + sent_by_me = (message_type == 2) + timestamp = get_time_unix_ms(message_elt) + attachment_elts = get_attachment_elts(message_elt) + parent_elt = BeautifulSoup() + parent_elt.append(bs4_get_file_comment(html_target)) + # if it was just an attachment with no text, there is no point in creating an empty SMS to go with it + if the_text and the_text != "MMS Sent" and not attachment_elts: + bs4_append_sms_elt(parent_elt, other_party_number, timestamp, the_text, message_type) + else: + msgbox_type = message_type + bs4_append_mms_elt_with_parts_for_voice(parent_elt, html_target, attachment_elts, the_text, other_party_number, sent_by_me, timestamp, msgbox_type, [other_party_number]) + sms_backup_file.write(parent_elt.prettify()) + sms_backup_file.write('\n') + counters['number_of_voice_sms_output'] += 1 + +def write_mms_message_for_vm(html_target): + # We want to end up with an MMS messages, just like any other, but the HTML input file is + # significantly different, so we have this bit of voodoo where we fake up some of the stuff. 
+ sender = None + sender_name = None + body_elt = html_elt.find('body') + contributor_elt = body_elt.find(class_='contributor') + this_number, this_name = get_number_and_name_from_tel_elt_parent(contributor_elt) + if this_number: + sender = contacts_oracle.get_best_number(this_number) + sender_name = this_name + if not sender: + sender = get_sender_number_from_title_or_filename(html_target) + if not sender_name: + names = contacts_oracle.get_names_by_number(sender) + if names: + for sender_name in names: + break + + participants = [sender] if sender else [BOGUS_NUMBER] + timestamp = get_time_unix_ms(body_elt) + vm_from = (sender_name if sender_name else sender if sender else "Unknown") + transcript = get_vm_transcript(body_elt) + if transcript: + the_text = "Voicemail/Recording from: " + vm_from + ";\nTranscript: " + transcript else: - return format_number(phonenumbers.parse(fallback_number, None)) + the_text = "Voicemail/Recording from: " + vm_from + attachment_elts = get_attachment_elts(body_elt) + msgbox_type = '1' # 1 = Received, 2 = Sent + sent_by_me = False + parent_elt = BeautifulSoup() + parent_elt.append(bs4_get_file_comment(html_target)) + bs4_append_mms_elt_with_parts_for_voice(parent_elt, html_target, attachment_elts, the_text, sender, sent_by_me, timestamp, msgbox_type, participants) + vm_backup_file.write(parent_elt.prettify()) + vm_backup_file.write('\n') + counters['number_of_vms_output'] += 1 -def get_participant_phone_numbers(participants_raw): - #participants = [me] # May require adding a contact for "Me" to your phone, with your current number +def write_mms_messages(html_target, participants_elt, message_elts): + participants = get_mms_participant_phone_numbers(html_target, participants_elt) - participants = [] + for message_elt in message_elts: + # TODO who is sender? 
+ not_me_vcard_number = scan_vcards_for_contacts(html_target, message_elt) + sender = not_me_vcard_number + sent_by_me = sender not in participants + the_text = get_message_text(message_elt) + message_type = get_message_type(message_elt) + timestamp = get_time_unix_ms(message_elt) + attachment_elts = get_attachment_elts(message_elt) - for participant_set in participants_raw: - for participant in participant_set: - if (not hasattr(participant, 'a')): - continue + parent_elt = BeautifulSoup() + parent_elt.append(bs4_get_file_comment(html_target)) + bs4_append_mms_elt_with_parts_for_voice(parent_elt, html_target, attachment_elts, the_text, sender, sent_by_me, timestamp, None, participants) + sms_backup_file.write(parent_elt.prettify()) + sms_backup_file.write('\n') + counters['number_of_voice_sms_output'] += 1 - try: - phone_number = phonenumbers.parse(participant.a['href'][4:], None) - except phonenumbers.phonenumberutil.NumberParseException: - participants.push(participant.a['href'][4:]) +def get_attachment_elts(message_elt): + attachment_elts = [] + div_elts = message_elt.find_all('div') + for div_elt in div_elts: + img_elt = div_elt.find('img') + if img_elt: + attachment_elts.append(img_elt) + audio_elt = div_elt.find('audio') + if audio_elt: + attachment_elts.append(audio_elt) + vcard_elt = div_elt.find(class_='vcard') + # distinguish between a vCard that is attached vs a vcard element that is just info from Takeout + if vcard_elt and vcard_elt.name == "a": + attachment_elts.append(vcard_elt) + return attachment_elts + +def bs4_append_sms_elt(parent_elt, sender, timestamp, the_text, message_type): + sms_elt = html_elt.new_tag('sms') + parent_elt.append(sms_elt) + + # protocol - Protocol used by the message, its mostly 0 in case of SMS messages. + sms_elt['protocol'] = '0' + # address - The phone number of the sender/recipient. 
+ sms_elt['address'] = sender + # date - The Java date representation (including millisecond) of the time when the message was sent/received. + sms_elt['date'] = timestamp + # type - 1 = Received, 2 = Sent, 3 = Draft, 4 = Outbox, 5 = Failed, 6 = Queued + sms_elt['type'] = message_type + # subject - Subject of the message, its always null in case of SMS messages. + sms_elt['subject'] = 'null' + # body - The content of the message. + sms_elt['body'] = the_text + # toa - n/a, defaults to null. + sms_elt['toa'] = 'null' + # sc_toa - n/a, defaults to null. + sms_elt['sc_toa'] = 'null' + # service_center - The service center for the received message, null in case of sent messages. + sms_elt['service_center'] = 'null' + # read - Read Message = 1, Unread Message = 0. + sms_elt['read'] = '1' + # status - None = -1, Complete = 0, Pending = 32, Failed = 64. + sms_elt['status'] = '1' + sms_elt['locked'] = '0' + + # sub_id - Optional field that has the id of the phone subscription (SIM). + # readable_date - Optional field that has the date in a human readable format. + # contact_name - Optional field that has the name of the contact. 
+ +def bs4_append_mms_elt_with_parts_for_voice(parent_elt, html_target, attachment_elts, the_text, other_party_number, sent_by_me, timestamp, msgbox_type, participants): + m_type = 128 if sent_by_me else 132 + bs4_append_mms_elt(parent_elt, participants, timestamp, m_type, msgbox_type, other_party_number, sent_by_me, the_text) + mms_elt = parent_elt.mms + + if attachment_elts: + parts_elt = mms_elt.parts + for sequence_number, attachment_elt in enumerate(attachment_elts): + if attachment_elt.name == 'img': + attachment_file_ref = attachment_elt['src'] + bs4_append_part_elt(parts_elt, ATTACHMENT_TYPE_IMAGE, sequence_number, html_target, attachment_file_ref) + elif attachment_elt.name == 'audio': + attachment_file_ref = attachment_elt.a['href'] + bs4_append_part_elt(parts_elt, ATTACHMENT_TYPE_AUDIO, sequence_number, html_target, attachment_file_ref) + elif attachment_elt.name == 'a' and 'vcard' in attachment_elt['class']: + attachment_file_ref = attachment_elt['href'] + bs4_append_part_elt(parts_elt, ATTACHMENT_TYPE_VCARD, sequence_number, html_target, attachment_file_ref) + else: + print(f'>> Unrecognized MMS attachment in HTML file (skipped):\n>> {attachment_elt}') + print(f'>> due to File: "{get_abs_path(html_target)}"') + +def bs4_append_mms_elt(parent_elt, participants, timestamp, m_type, msgbox_type, other_party_number, sent_by_me, the_text): + mms_elt = html_elt.new_tag('mms') + parent_elt.append(mms_elt) + + bs4_append_addrs_elt(mms_elt, participants, other_party_number, sent_by_me) + + parts_elt = html_elt.new_tag('parts') + mms_elt.append(parts_elt) + bs4_append_text_part_elt(parts_elt, the_text) + + if participants: + participants_tilde = '~'.join(participants) + else: + participants_tilde = BOGUS_NUMBER + # address - The phone number of the sender/recipient. 
+ mms_elt['address'] = participants_tilde + # ct_t - The Content-Type of the message, usually "application/vnd.wap.multipart.related" + mms_elt['ct_t'] = 'application/vnd.wap.multipart.related' + # date - The Java date representation (including millisecond) of the time when the message was sent/received. + mms_elt['date'] = timestamp + # m_type - The type of the message defined by MMS spec. + mms_elt['m_type'] = m_type + # msg_box - The type of message, 1 = Received, 2 = Sent, 3 = Draft, 4 = Outbox + mms_elt['msg_box'] = msgbox_type + # read - Has the message been read + mms_elt['read'] = '1' + # rr - The read-report of the message. + mms_elt['rr'] = '129' + mms_elt['seen'] = '1' + mms_elt['sub_id'] = '-1' + mms_elt['text_only'] = '0' + + # sub - The subject of the message, if present. + # m_id - The Message-ID of the message + # m_size - The size of the message. + # sim_slot - The sim card slot. + # readable_date - Optional field that has the date in a human readable format. + # contact_name - Optional field that has the name of the contact. + + return parent_elt + +def bs4_append_text_part_elt(elt_parent, the_text): + if not the_text or the_text == "MMS Sent": + return # don't bother with this trivial text part + + text_part_elt = html_elt.new_tag('part') + + # seq - The order of the part. + text_part_elt['seq'] = '-1' + # ct - The content type of the part. + text_part_elt['ct'] = 'text/plain' + # name - The name of the part. + text_part_elt['name'] = 'null' + # chset - The charset of the part. + text_part_elt['chset'] = '106' + text_part_elt['cd'] = 'null' + text_part_elt['fn'] = 'null' + text_part_elt['cid'] = '' + # cl - The content location of the part. + text_part_elt['cl'] = 'text000001' + text_part_elt['ctt_s'] = 'null' + text_part_elt['ctt_t'] = 'null' + # text - The text content of the part. 
+ text_part_elt['text'] = the_text + elt_parent.append(text_part_elt) - participants.append(format_number(phone_number)) + # data - The base64 encoded binary content of the part. + +def bs4_append_part_elt(parent_elt, attachment_type, sequence_number, html_target, attachment_file_ref): + attachment_filename, content_type = figure_out_attachment_filename_and_type(attachment_type, html_target, attachment_file_ref) + subdirectory, __ = html_target + if attachment_filename: + export_path_revised = get_rel_path((subdirectory, attachment_filename)) + with open(export_path_revised, 'rb') as attachment_file: + attachment_data = base64.b64encode(attachment_file.read()).decode() + parent_elt.append(bs4_get_file_comment((subdirectory, attachment_filename))) + part_elt = html_elt.new_tag('part') + parent_elt.append(part_elt) + + # seq - The order of the part. + part_elt['seq'] = sequence_number + # ct - The content type of the part. + part_elt['ct'] = content_type + # name - The name of the part. + part_elt['name'] = attachment_filename + # chset - The charset of the part. + part_elt['chset'] = 'null' + part_elt['cd'] = 'null' + part_elt['fn'] = 'null' + part_elt['cid'] = '<0>' + part_elt['ctt_s'] = 'null' + part_elt['ctt_t'] = 'null' + # text - The text content of the part. + part_elt['text'] = 'null' + part_elt['sef_type'] = '0' + # cl - The content location of the part. + part_elt['cl'] = attachment_filename + # data - The base64 encoded binary content of the part. 
+ part_elt['data'] = attachment_data + +# a somewhat arbitrary collection of content types; I did not encounter all of these +ext_to_content_type = { + ".jpg": "image/jpeg", + ".jpeg": "image/jpeg", + ".png": "image/png", + ".gif": "image/gif", + ".webp": "image/webp", + ".svg": "image/svg+xml", + ".wav": "audio/wav", + ".ogg": "audio/ogg", + ".mp3": "audio/mp3", + ".mov": "video/quicktime", + ".mpg": "video/mpeg", + ".mp4": "video/mpeg4", + ".vcf": "text/x-vCard" + } + +def bs4_append_mms_elt_with_parts_for_chat(parent_elt, json_target, attachment_list, the_text, sender_number, sent_by_me, timestamp, msgbox_type, participants): + m_type = 128 if sent_by_me else 132 + bs4_append_mms_elt(parent_elt, participants, timestamp, m_type, msgbox_type, sender_number, sent_by_me, the_text) + mms_elt = parent_elt.mms + if attachment_list: + parts_elt = mms_elt.parts + sequence_number = 0 + for original_name, export_path_revised in attachment_list: + sequence_number += 1 + __, ext = os.path.splitext(original_name) + content_type = ext_to_content_type.get(ext, 'application/octet-stream') + # bs4_append_part_elt(parts_elt, ATTACHMENT_TYPE_IMAGE, sequence_number, html_target, attachment_file_ref) + with open(export_path_revised, 'rb') as attachment_file: + attachment_data = base64.b64encode(attachment_file.read()).decode() + subdirectory, export_name = os.path.split(export_path_revised) + parts_elt.append(bs4_get_file_comment((subdirectory, export_name))) + part_elt = html_elt.new_tag('part') + parts_elt.append(part_elt) + + # seq - The order of the part. + part_elt['seq'] = sequence_number + # ct - The content type of the part. + part_elt['ct'] = content_type + # name - The name of the part. + part_elt['name'] = original_name + # chset - The charset of the part. + part_elt['chset'] = 'null' + part_elt['cd'] = 'null' + part_elt['fn'] = 'null' + part_elt['cid'] = '<0>' + part_elt['ctt_s'] = 'null' + part_elt['ctt_t'] = 'null' + # text - The text content of the part. 
+ part_elt['text'] = 'null' + part_elt['sef_type'] = '0' + # cl - The content location of the part. + part_elt['cl'] = export_name + # data - The base64 encoded binary content of the part. + part_elt['data'] = attachment_data + +def bs4_append_addrs_elt(elt_parent, participants, other_party_number, sent_by_me): + addrs_elt = html_elt.new_tag('addrs') + elt_parent.append(addrs_elt) + me_contact = contacts_oracle.get_number_by_name('Me', None) + for participant in participants + [me_contact]: + if sent_by_me and participant == me_contact: + participant_is_sender = True + elif not sent_by_me and participant == other_party_number: + participant_is_sender = True + else: + participant_is_sender = False + addr_elt = html_elt.new_tag('addr') + + # address - The phone number of the sender/recipient. + addr_elt['address'] = participant + # charset - Character set of this entry + addr_elt['charset'] = '106' + # type - The type of address, 129 = BCC, 130 = CC, 151 = To, 137 = From + addr_elt['type'] = 137 if participant_is_sender else 151 + + addrs_elt.append(addr_elt) + +def bs4_append_call_elt(parent_elt, telephone_number, duration, timestamp, presentation, readable_date, call_type): + call_elt = html_elt.new_tag('call') + # number - The phone number of the call. + call_elt['number'] = telephone_number + # duration - The duration of the call in seconds. + call_elt['duration'] = duration + # date - The Java date representation (including millisecond) of the time of the call + call_elt['date'] = timestamp + # presentation - caller id presentation info. 1 = Allowed, 2 = Restricted, 3 = Unknown, 4 = Payphone. + call_elt['presentation'] = presentation + # readable_date - Optional field that has the date in a human readable format. + call_elt['readable_date'] = readable_date + # call_type - 1 = Incoming, 2 = Outgoing, 3 = Missed, 4 = Voicemail, 5 = Rejected, 6 = Refused List. 
def figure_out_attachment_filename_and_type(attachment_type, html_target, attachment_file_ref):
    """Locate the on-disk attachment file that an HTML reference points at.

    ``html_target`` is a ``(subdirectory, html_basename)`` pair. Candidate
    base names are probed in this order, stopping at the first hit reported
    by ``consider_this_attachment_file_candidate``:

      1. the reference with its extension removed,
      2. that same base cut to 50 characters,
      3. the HTML file's own base name,
      4. that base cut to 50 characters.

    Returns ``(attachment_filename, content_type)``, or ``(None, None)``
    after printing a diagnostic when no candidate exists.
    """
    # Why don't we try the filename with the extension first? We only know how
    # to handle specific types of attachments, and we'll find those through
    # trial and error pasting various extensions back onto the basename, so
    # trying the existing extension first doesn't get us anything except weird
    # special cases that we can't handle.
    subdirectory, html_basename = html_target
    # We assume all attachment references are relative to the directory of the HTML file.
    ref_base, __ = os.path.splitext(attachment_file_ref)
    html_base, __ = os.path.splitext(html_basename)
    # The 50-character cuts are odd; probably bugs in Takeout or at least weird choices.
    candidate_bases = (ref_base, ref_base[:50], html_base, html_base[:50])

    for candidate_base in candidate_bases:
        found_name, found_type = consider_this_attachment_file_candidate(subdirectory, candidate_base, attachment_type)
        if found_name:
            return found_name, found_type

    print(f'>> {attachment_type} attachment referenced in HTML file but not found (skipped); partial name: "{get_abs_path((subdirectory, attachment_file_ref))}"')
    print(f'>> src="{attachment_file_ref}"')
    print(f'>> due to File: "{get_abs_path(html_target)}"')
    return None, None
def get_mms_participant_phone_numbers(html_target, participants_elt):
    """Return the formatted phone numbers of the participants of an MMS.

    Every ``<a>`` element of class ``tel`` under ``participants_elt``
    contributes one number: the raw number from its ``href`` is passed to
    ``contacts_oracle.get_best_number`` and the result is run through
    ``format_number``. When no usable number results, a TODO message is
    printed, ``counters['todo_errors']`` is incremented and ``BOGUS_NUMBER``
    is used instead.

    When no participant is found at all, a single fallback entry is taken
    from the module-level values captured from the HTML title.

    Returns a list of phone-number strings.
    """
    participants = []
    for tel_elt in participants_elt.find_all(class_='tel'):
        if tel_elt.name != 'a':
            continue
        href = tel_elt['href']
        # Strip the URI scheme only when it is really there, as
        # get_number_and_name_from_tel_elt_parent does.
        raw_number = href[4:] if href.startswith('tel:') else href
        if not raw_number:
            # I don't know if this can ever happen
            raw_number = contact_name_to_number(get_sender_name_from_title_or_filename(html_target))

        phone_number = contacts_oracle.get_best_number(raw_number)
        if not phone_number:
            contact_names = contacts_oracle.get_names_by_number(raw_number)
            if contact_names:
                # any one of the names will do for the message
                contact_name = next(iter(contact_names))
            else:
                contact_name = get_sender_name_from_title_or_filename(html_target)
            print()
            print(f'TODO: Missing or disallowed +phonenumber for contact: "{contact_name}": "{raw_number}",')
            print(f' due to File: "{get_abs_path(html_target)}"')
            counters['todo_errors'] += 1
            phone_number = BOGUS_NUMBER

        # Format the number chosen above exactly once. Re-deriving it from
        # raw_number here would throw away the oracle's choice (and the
        # BOGUS_NUMBER substitution).
        participants.append(format_number(html_target, phone_number))

    if not participants:
        # The filename for an MMS is just "Group Conversation", which is worthless for here.
        # Read the module-level title values into a local: assigning to
        # phone_number_from_html_title inside this function would make that
        # name local and raise UnboundLocalError on the test below.
        fallback_number = phone_number_from_html_title
        if fallback_number is None:
            fallback_number = contact_name_to_number(contact_name_from_html_title)
        participants.append(fallback_number)
    return participants
def get_aka_path(path):
    """Return ``path`` for display, annotated with its absolute form.

    An absolute ``path`` is returned unchanged. A relative one is returned
    followed by ``", aka "`` and the absolute path it resolves to.
    """
    if os.path.isabs(path):
        return path
    resolved = os.path.abspath(path)
    return path + f', aka {resolved}'
+ sms_backup_file.write(XML_HEADER) + sms_backup_file.write('' + ' \n') + sms_backup_file.write("\n") + + ################ + vm_backup_file.write(XML_HEADER) + vm_backup_file.write('' + ' \n') + vm_backup_file.write("\n") + + ################ + call_backup_file.write(XML_HEADER) + call_backup_file.write('' + ' \n') + call_backup_file.write("\n") + + ################ + chat_backup_file.write(XML_HEADER) + chat_backup_file.write('' + ' \n') + chat_backup_file.write("\n") + +def print_counters(contacts_filename, sms_backup_filename, vm_backup_filename, call_backup_filename, chat_backup_filename): + pp = pprint.PrettyPrinter(indent=2, width=132) + print(">> Counters:") + print(f">> {counters['number_of_voice_sms_output']:6} SMS/MMS records from Google Voice written to {get_aka_path(sms_backup_filename)}") + print(f">> {counters['number_of_vms_output']:6} Voicemail records from Google Voice written to {get_aka_path(vm_backup_filename)}") + print(f">> {counters['number_of_calls_output']:6} Call records from Google Voice written to {get_aka_path(call_backup_filename)}") + print(f">> {counters['number_of_chat_sms_output']:6} SMS/MMS records from Google Chat written to {get_aka_path(chat_backup_filename)}") + print(f">> {counters['number_of_discovered_contacts']:6} Contacts discovered in HTML files") + print(f">> {counters['conflict_warnings']:6} Conflict info warnings given") + print(f">> {counters['todo_errors']:6} TODO errors given") + if counters['conflict_warnings'] > 0: + print(">> Recap of conflict info warnings:") + for name, numbers in conflicting_contacts.items(): + if len(numbers) > 1: + print(f">> {name}: {numbers}") + if missing_contacts: + print(">> Recap of missing or unresolved contacts (not including disallowed numbers):") + print(f">> {missing_contacts}") + +def write_real_headers(sms_backup_filename, vm_backup_filename, call_backup_filename, chat_backup_filename): + print() + + with open(sms_backup_filename, 'r+') as backup_file: + 
backup_file.write(XML_HEADER) + backup_file.write(f'\n') + + ################ + with open(vm_backup_filename, 'r+') as backup_file: + backup_file.write(XML_HEADER) + backup_file.write(f'\n') + + ################ + with open(call_backup_filename, 'r+') as backup_file: + backup_file.write(XML_HEADER) + backup_file.write(f'\n') + + ################ + with open(chat_backup_filename, 'r+') as backup_file: + backup_file.write(XML_HEADER) + backup_file.write(f'\n') + + +def write_trailers(): + sms_backup_file.write('\n') + vm_backup_file.write('\n') + call_backup_file.write('\n') + chat_backup_file.write('\n') + +def prep_output_files(sms_backup_filename, vm_backup_filename, call_backup_filename, chat_backup_filename): + sms_backup_filename_BAK = sms_backup_filename + '.BAK' + if os.path.exists(sms_backup_filename): + if os.path.exists(sms_backup_filename_BAK): + print('>> Removing', os.path.abspath(sms_backup_filename_BAK)) + os.remove(sms_backup_filename_BAK) + print('>> Renaming existing SMS/MMS output file to', os.path.abspath(sms_backup_filename_BAK)) + os.rename(sms_backup_filename, sms_backup_filename_BAK) + + print('>> SMS/MMS from Google Voice will be written to', get_aka_path(sms_backup_filename)) + print(">>") + + call_backup_filename_BAK = call_backup_filename + '.BAK' + if os.path.exists(call_backup_filename): + if os.path.exists(call_backup_filename_BAK): + print('>> Removing', os.path.abspath(call_backup_filename_BAK)) + os.remove(call_backup_filename_BAK) + print('>> Renaming existing Calls output file to', os.path.abspath(call_backup_filename_BAK)) + os.rename(call_backup_filename, call_backup_filename_BAK) + + print('>> Call history from Google Voice will be written to', get_aka_path(call_backup_filename)) + print(">>") + + vm_backup_filename_BAK = vm_backup_filename + '.BAK' + if os.path.exists(vm_backup_filename): + if os.path.exists(vm_backup_filename_BAK): + print('>> Removing', os.path.abspath(vm_backup_filename_BAK)) + 
os.remove(vm_backup_filename_BAK) + print('>> Renaming existing Voicemail output file to', os.path.abspath(vm_backup_filename_BAK)) + os.rename(vm_backup_filename, vm_backup_filename_BAK) + + print('>> Voicemail MMS from Google Voice will be written to', get_aka_path(vm_backup_filename)) + print(">>") + + chat_backup_filename_BAK = chat_backup_filename + '.BAK' + if os.path.exists(chat_backup_filename): + if os.path.exists(chat_backup_filename_BAK): + print('>> Removing', os.path.abspath(chat_backup_filename_BAK)) + os.remove(chat_backup_filename_BAK) + print('>> Renaming existing SMS/MMS output file to', os.path.abspath(chat_backup_filename_BAK)) + os.rename(chat_backup_filename, chat_backup_filename_BAK) + + print('>> SMS/MMS from Google Chat will be written to', get_aka_path(chat_backup_filename)) + print(">>") + + print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>") + +# In some extreme cases, we have to pick our the correspondent from the name +# of the file. It can be a phone number or a contact name, or it can be completely missing. 
def get_name_or_number_from_filename(html_basename):
    """Extract the correspondent from the name of an HTML file.

    Sets the module-level ``phone_number_from_filename`` and
    ``contact_name_from_filename`` and returns them as
    ``(contact_name, phone_number)``. Either or both may be ``None``; the
    name "Group Conversation" is treated as no name.
    """
    global phone_number_from_filename, contact_name_from_filename
    contact_name_from_filename = None
    phone_number_from_filename = is_phone_number(html_basename)
    if not phone_number_from_filename:
        # sometimes a single " - ", sometimes two of them
        match_name = re.match(r'([^ ].*) - .+ - ', html_basename)
        if not match_name:
            match_name = re.match(r'([^ ].*) - ', html_basename)
        if match_name:
            contact_name_from_filename = match_name.group(1)
            if contact_name_from_filename == "Group Conversation":
                contact_name_from_filename = None
    return (contact_name_from_filename, phone_number_from_filename)


def get_name_or_number_from_title():
    """Extract the correspondent from the ``<title>`` of the current HTML file.

    Reads the module-level ``html_elt``. Sets the module-level
    ``phone_number_from_html_title`` and ``contact_name_from_html_title``
    and returns them as ``(contact_name, phone_number)``. Both are ``None``
    when the document has no title, the title's last line is blank, or the
    name is "Group Conversation".
    """
    global phone_number_from_html_title, contact_name_from_html_title
    phone_number_from_html_title = None
    contact_name_from_html_title = None

    head_elt = html_elt.find('head')
    title_elt = head_elt.find('title') if head_elt else None
    if not title_elt:
        # No usable title; report "nothing found" rather than crash on None.
        return (None, None)

    # Takeout puts a newline in the middle of the title
    correspondent = title_elt.get_text().split("\n")[-1].strip()
    if not correspondent:
        return (None, None)

    # is_phone_number() returns the matched text itself (or None), not a
    # regex match object, so the value is used directly.
    matched_number = is_phone_number(correspondent)
    if matched_number:
        # I think this doesn't actually happen
        phone_number_from_html_title = matched_number
    else:
        contact_name_from_html_title = correspondent
        if contact_name_from_html_title == "Group Conversation":
            contact_name_from_html_title = None
    return (contact_name_from_html_title, phone_number_from_html_title)

# Iterate all of the vcards in the HTML body to speculatively populate the
# contacts list. Also make a note of a contact which is "not me" for
# use as the address in an SMS record (it's always "the other end"). The
# same logic does not apply to MMS, which has a different scheme for address.
+def scan_vcards_for_contacts(html_target, parent_elt): + global me + not_me_vcard_number = None + # We make the simplifying assumption that the timestamps in any given HTML file + # are close (enough) together and it doesn't matter much which one we use for + # the contact timestamp. We also ignore milliseconds. + timestamp = get_time_unix_ms(parent_elt) / 1000 + + vcard_elts = parent_elt.find_all(class_="vcard") + for vcard_elt in vcard_elts: + this_number, this_name = get_number_and_name_from_tel_elt_parent(vcard_elt) + if this_number: + if this_name != "Me": + not_me_vcard_number = this_number + if this_name: + number_is_known = contacts_oracle.is_already_known_pair(this_name, this_number) + if contacts_oracle.add_discovered_contact(this_name, this_number, timestamp): + counters['number_of_discovered_contacts'] += 1 + if not number_is_known: + conflict_list = conflicting_contacts.get(this_name, None) + if not conflict_list: + conflict_list = list() + conflict_list.append(this_number) + if not this_number in conflict_list: + print(f'>> Info: conflicting information about "{this_name}":', conflict_list, f"'{this_number}'") + print(f'>> due to File: "{get_abs_path(html_target)}"') + counters['conflict_warnings'] += 1 + conflict_list.append(this_number) + conflicting_contacts[this_name] = conflict_list + return not_me_vcard_number + +def get_number_and_name_from_tel_elt_parent(parent_elt): + this_name = None + this_number = None + tel_elt = parent_elt.find(class_='tel') + if not tel_elt: + return None, None + href_attr = tel_elt['href'] + if href_attr: + if href_attr.startswith("tel:"): + href_attr = href_attr[4:] + if not href_attr: + return None, None # this shouldn't happen + this_number = href_attr + fn_elt = parent_elt.find(class_="fn") + if not fn_elt: + return this_number, None + this_name = fn_elt.get_text() + # Sometimes the "name" ends up being a repeat of the phone number, which is useless for us + if not this_name or is_phone_number(this_name): + return 
class ContactsOracle:
    """Resolve contact names and phone numbers against each other.

    Knowledge comes from two places: the (optional) JSON contacts file read
    at construction time ("configured" entries) and contacts discovered
    while scanning HTML files ("discovered" entries).

    The contacts file can have these types of entries:
      1. some name: some other name (a simple aliasing scheme for contact names)
      2. some name: some number (a degenerate case that is turned into a list)
      3. some name: [some list of numbers] (all of these numbers are
         acceptable for this contact name; leftmost is preferred)
      4. some number: some other number (if some number is seen, some other
         number will be used in output)

    Numbers for a name are kept as ``(number, iso_timestamp, isconfigured)``
    tuples sorted newest first. Configured entries get artificial
    far-future timestamps so they sort ahead of discovered ones.
    """

    def __init__(self, contacts_filename, policy, nanp_heuritics):
        """Load the JSON contacts file, if it exists, and print a summary.

        ``policy`` is one of the POLICY_* constants and selects how
        ``get_number_by_name`` / ``get_best_number`` choose a number.
        ``nanp_heuritics`` (sic; the spelling is part of the signature)
        turns the rewriting done by ``apply_nanp_heuristics`` on or off.
        """
        self._contacts_filename = contacts_filename
        self._name_to_name = dict()
        self._number_to_number = dict()
        self._name_to_numbers = dict()
        self._number_to_names = dict()
        self._policy = policy
        self._nanp_heuristics = nanp_heuritics

        if not os.path.exists(self._contacts_filename):
            print('>> No (optional) JSON contacts file', os.path.abspath(self._contacts_filename))
            return

        print('>> Reading contacts from JSON contacts file', os.path.abspath(self._contacts_filename))
        # JSON text is UTF-8; do not depend on the platform default encoding,
        # which would mangle non-ASCII contact names on some systems.
        with open(self._contacts_filename, encoding='utf-8') as fp:
            parsed_file = json.load(fp)
        for key, value in parsed_file.items():
            if is_phone_number(key):
                self._do_number_entry(key, value)
            else:
                self._do_name_entry(key, value)

        print(">> JSON contact configuration counts:")
        print(f'>> {len(self._name_to_numbers):6} Name-to-number(s) entries')
        print(f'>> {len(self._name_to_name):6} Name-to-name entries')
        print(f'>> {len(self._number_to_number):6} Number-to-number entries')
        print(f'>> {len(self._number_to_names):6} Number-to-names entries (computed)')
        print(f">> Contact phone number replacement policy is '{self._policy}'")

    def _do_name_entry(self, name, value):
        """Record one configured ``name: value`` entry (entry types 1-3).

        Raises ``Exception`` when the value is neither a string nor a list,
        or when a list item is not a phone number.
        """
        if isinstance(value, str):
            if not is_phone_number(value):
                # not mapping to a number, so must be an alias
                self._name_to_name[name] = value
                return
            raw_values = [value]  # simple scalar; make it a list
        elif isinstance(value, list):
            raw_values = value
        else:
            raise Exception(f'"{name}" entry value of type {type(value)} is not a string or a list: {value}\n in {get_aka_path(self._contacts_filename)}')

        far_future = 2_000_000_000  # a pseudo-Unix timestamp, in seconds, in the distant future
        timestamped = []
        for position, raw_value in enumerate(raw_values):
            # Check the type first: is_phone_number() runs a regex and would
            # fail with an unrelated TypeError on a non-string item.
            if not isinstance(raw_value, str) or not is_phone_number(raw_value):
                raise Exception(f'"{name}" entry value of type {type(raw_value)} is not a phone number: {raw_value}\n in {get_aka_path(self._contacts_filename)}')
            number = self.apply_nanp_heuristics(raw_value)
            # Earlier (leftmost) entries get later timestamps, so they win.
            far_future_iso = unix_to_iso_time(far_future - position)
            # (value, timestamp, isconfigured)
            timestamped.append((number, far_future_iso, True))
            self._add_number_to_name_item(name, number)
        # these are already reverse sorted; just belt and suspenders
        timestamped.sort(key=itemgetter(1), reverse=True)
        self._name_to_numbers[name] = timestamped

    def _add_number_to_name_item(self, name, number):
        """Add ``name`` to the reverse (number-to-names) mapping for ``number``."""
        number = self.apply_nanp_heuristics(number)
        # it's a set, so we don't care if it's duplicate
        self._number_to_names.setdefault(number, set()).add(name)

    def _do_number_entry(self, number, name):
        """Record one configured ``number: other number`` entry (entry type 4).

        ``name`` is the replacement number (the parameter name is
        historical). Raises ``Exception`` when it is not a phone-number
        string.
        """
        if not isinstance(name, str) or not is_phone_number(name):
            raise Exception(f'"{number}" entry value of type {type(name)} is not a phone number: {name}\n in {get_aka_path(self._contacts_filename)}')
        # Key the alias by the number that will be *seen*, because that is
        # how get_names_by_number() looks it up; the value is the replacement.
        seen_number = self.apply_nanp_heuristics(number)
        self._number_to_number[seen_number] = self.apply_nanp_heuristics(name)

    def is_already_known_pair(self, name, number):
        """Return True when ``number`` is already recorded for ``name`` or a name it aliases to."""
        if not name or not number:
            return False
        number = self.apply_nanp_heuristics(number)
        existing_list = self._name_to_numbers.get(name, None)
        if not existing_list:
            # follow the alias, if any; a None alias ends the recursion
            alias_to = self._name_to_name.get(name, None)
            return self.is_already_known_pair(alias_to, number)
        return any(list_item[0] == number for list_item in existing_list)

    def is_me_number(self, number):
        """Return True when ``number`` belongs to the contact named 'Me'."""
        # Normalize the same way the mapping keys were normalized on insert.
        number = self.apply_nanp_heuristics(number)
        names = self._number_to_names.get(number, None)
        if not names:
            return False
        return 'Me' in names

    # This is really inefficient, but we're banking on the set of contacts being manageable
    def add_discovered_contact(self, name, number, timestamp):
        """Record a contact seen in an HTML file.

        ``timestamp`` is a Unix time in seconds. Returns True when
        ``number`` was not previously recorded for ``name``, False otherwise
        (including when ``name`` is itself a phone number).
        """
        # We could ignore any discovered contacts for policy "configured", but we want to
        # do proper counting and give messages to the user, etc.
        if is_phone_number(name):
            # it's a number that pairs with itself instead of a name, so ignore it.
            return False
        number = self.apply_nanp_heuristics(number)
        existing_list = self._name_to_numbers.setdefault(name, [])
        # (value, timestamp, isconfigured)
        iso_timestamp = unix_to_iso_time(timestamp)
        new_tuple = (number, iso_timestamp, False)

        found_it = False
        for index, (this_number, this_timestamp, _) in enumerate(existing_list):
            if this_number != number:
                continue
            found_it = True
            if iso_timestamp > this_timestamp:
                # it's a newer discovery
                # we only want to update discovered items, but the timestamps of the configured items
                # will already deal with that because configured timestamps are artificially far future
                existing_list[index] = new_tuple
            break

        if not found_it:
            existing_list.append(new_tuple)
            self._add_number_to_name_item(name, number)

        existing_list.sort(key=itemgetter(1), reverse=True)
        return not found_it

    # The strategy for this method and the next is to first do a lookup by the
    # passed in key. If that doesn't yield a result, see if the key is an
    # alias and try again with the pointed-to key. We'll eventually get a hit
    # or reach the end of the chain.
    # The argument "number" is typically None, but if it does have a value
    # we'll see if we can do better, where "better" is according to policy.
    def get_number_by_name(self, name, number):
        """Return the number to use for ``name`` under the current policy.

        Raises ``Exception`` when the policy is not a recognized value.
        """
        if not name:
            return number  # only happens by recursion
        number = self.apply_nanp_heuristics(number)
        if self._policy == POLICY_ASIS:
            return self._policy_asis(name, number)
        elif self._policy == POLICY_CONFIGURED:
            return self._policy_configured(name, number)
        elif self._policy == POLICY_NEWEST:
            return self._policy_newest(name)
        else:
            raise Exception(f"We don't recognize this number policy: \"{self._policy}\". It's probably a bug in the script.")

    def _policy_asis(self, name, number):
        """Keep ``number`` when there is one; otherwise use the newest number for ``name``."""
        if number:
            return number
        # No state juggling is needed: alias recursion out of _policy_newest()
        # re-enters here with no number and lands on _policy_newest() again.
        return self._policy_newest(name)

    def _policy_newest(self, name):
        """Return the first (preferred) number for ``name``, following name aliases."""
        candidate_list = self._name_to_numbers.get(name, None)
        if candidate_list:
            # candidate_list will be a list with the first item the preferred number
            return candidate_list[0][0]
        aliased_to = self._name_to_name.get(name, None)
        return self.get_number_by_name(aliased_to, None)

    def _policy_configured(self, name, number):
        """Return a configured number for ``name``, following name aliases.

        Without ``number``, the first listed number is returned if it is a
        configured one. With ``number``, it is returned only when it matches
        a configured number for ``name``.
        """
        value = None
        candidate_list = self._name_to_numbers.get(name, None)
        if candidate_list:
            if not number:
                # if no candidate number was passed in, return the best configured number
                this_number, _, this_isconfigured = candidate_list[0]
                if this_isconfigured:
                    value = this_number
            else:
                # a number was passed in, so vet it
                for this_number, _, this_isconfigured in candidate_list:
                    if this_isconfigured and number == this_number:
                        value = this_number
                        break

        if value:
            return value
        aliased_to = self._name_to_name.get(name, None)
        return self.get_number_by_name(aliased_to, None)

    def get_names_by_number(self, number):
        """Return the set of names known for ``number``, following number aliases.

        Returns ``None`` when ``number`` is empty or no name is found.
        """
        visited = set()
        while number:
            number = self.apply_nanp_heuristics(number)
            if number in visited:
                # alias cycle in the configuration; give up rather than loop forever
                return None
            visited.add(number)
            names = self._number_to_names.get(number, None)
            if names:
                return names
            number = self._number_to_number.get(number, None)
        return None

    def get_best_number(self, number):
        """Return the number to emit in place of ``number`` under the current policy.

        Under POLICY_ASIS the (normalized) number is returned unchanged.
        Otherwise the preferred number with the latest timestamp among all
        names that own ``number`` is returned, falling back to ``number``.
        """
        number = self.apply_nanp_heuristics(number)
        if self._policy == POLICY_ASIS:
            return number
        best_timestamp = '0000'
        best_number = None
        # iterate over all the names, choosing the latest timestamp from among all of them
        for name in self.get_names_by_number(number) or ():
            tuples = self._name_to_numbers.get(name, None)
            if not tuples:
                continue
            this_number, this_timestamp, this_isconfigured = tuples[0]
            if self._policy == POLICY_CONFIGURED and not this_isconfigured:
                continue
            if this_timestamp > best_timestamp:
                best_timestamp = this_timestamp
                best_number = this_number
        return best_number if best_number else number

    def apply_nanp_heuristics(self, number):
        """Rewrite bare 10/11-digit numbers with a leading ``+`` / ``+1``.

        ``number`` is returned unchanged when the heuristics are disabled,
        when it is empty, or when it matches neither pattern.
        """
        if not self._nanp_heuristics or not number:
            return number

        if len(number) == 10 and not number.startswith('1'):
            return '+1' + number
        elif len(number) == 11 and number.startswith('1'):
            return '+' + number
        else:
            # This is unlikely to work out
            return number

    def dump(self):
        """Pretty-print all four internal mappings."""
        pp = pprint.PrettyPrinter(indent=2, width=132)
        print()
        print("Mappings of names-to-numbers (configured True, discovered False):")
        pp.pprint(self._name_to_numbers)
        print()
        print("Mappings of numbers-to-names (computed reverse mappings)")
        pp.pprint(self._number_to_names)
        print()
        print("Mappings of names-to-names (configured name aliases):")
        pp.pprint(self._name_to_name)
        print()
        print("Mappings of numbers-to-numbers (configured number aliases):")
        pp.pprint(self._number_to_number)
+The directory structure represents an unpacked Google Takeout archive, +so the interesting stuff is under `Takeout/Voice/Calls/` and `Takeout/Google Chat/Groups/`. +If you run the script from the `Takeout/` directory and use the default options, +the output files will end up here (the same directory as this file, one level above `Takeout/`). + +The data came mostly from actual Google Takeout files, +but the contact names and numbers have been faked for privacy. +Likewise, the attachment files have been munged, +which also makes them a lot smaller and easier to inspect in the output files. +All images and sound files have had the same replacement, +so don't let that bother you. +The important thing for the test data is that the attachments show up where expected; +their content is unimportant. + +## Contacts +Here are the possible contact names and numbers. +You don't need to add any of these to `contacts.json` +(in this directory, one level above `Takeout/`) +unless instructed to do so by the script. +If you are so inclined, +you can copy `contacts.json-EXAMPLE` to `contacts.json` to save yourself a little bit of typing. +Some of the contacts have multiple phone numbers, +which happens for legitimate reasons in real data. + +For this test data, the Google Voice account belongs to user `Maria S Curie` +(so that phone number is used for `Me`).
+ +| Name | Numbers | +|------|---------| +|Agatha M Christie|+17323211111| +|Alan A Milne|+17323212222| +|Albert Einstein|+17323213333| +|Edson Arantes do Nascimento|+17323214444,+12123214444,+15703214444| +|F Scott Fitzgerald|+17323215555| +|George H Ruth Jr|+17323216666| +|Hans Christian Andersen|+17323217777| +|Maria S Curie|+17323210011| +|Søren Aabye Kierkegaard|+17323211414| +|William Shakespeare|+17323211515| +|Wilma Glodean Rudolph|+17323211717| +|Debbie One|+12125550001| +|Missy Two|+12125550002| +|Trish Three|+12125550003| +|Mary Four|+12125550004| +|Laura Five|+12125550005| + +## Testing SMS Backup and Restore +Before committing your own precious message and call history to the `restore` process, +you might like to make a practice run with this test data. +How can you do that? + +- In the Google Contacts for the account you use with your phone, +add the names and phone numbers from the above list. +You might like to add some distinctive label to those entries to make them easy to find or delete later. +- Run the `sms.py` script against this test data. +- In the output files, +replace Curie's number, `+17323210011`, with your own number. +If you are on a Unix-like system, +you can do that with `sed` like so: +`sed -i 's/17323210011/19991111234/g' *.xml` +- Use those modified output files to do a `restore` to your phone with SMS Backup and Restore. +- In your phone's dialer app, +you should be able to see call history for several of those fake contacts. +- In your phone's text messaging app, +you should be able to see conversation history for several of those fake contacts. +That includes a few messages with voicemail recordings attached +and a group conversation with you and 5 other participants. +Some of the messages include attached images or vCard files. +- You might like to `restore` again with the same files to see that SMS Backup and Restore detects the duplicates.
+- When you are done looking around, +your dialer and text messaging apps should let you delete the history for all the restored items. +Your contacts app should let you delete the contacts themselves. + +After you have done all of the above, your phone's contents should be back where you started, +with only real calls and conversations and none of the fake ones from this test data. + +## Console output +Your paths will be different, but my output looks like this (I ran it with the `-z` option to dump the internal tables): +``` +>> No (optional) JSON contacts file /home/wjc/git/gvoice-sms-takeout-xml/test_data/contacts.json +>> Removing /home/wjc/git/gvoice-sms-takeout-xml/test_data/sms-gvoice.xml.BAK +>> Renaming existing SMS/MMS output file to /home/wjc/git/gvoice-sms-takeout-xml/test_data/sms-gvoice.xml.BAK +>> SMS/MMS from Google Voice will be written to ../sms-gvoice.xml, aka /home/wjc/git/gvoice-sms-takeout-xml/test_data/sms-gvoice.xml +>> +>> Removing /home/wjc/git/gvoice-sms-takeout-xml/test_data/calls-gvoice.xml.BAK +>> Renaming existing Calls output file to /home/wjc/git/gvoice-sms-takeout-xml/test_data/calls-gvoice.xml.BAK +>> Call history from Google Voice will be written to ../calls-gvoice.xml, aka /home/wjc/git/gvoice-sms-takeout-xml/test_data/calls-gvoice.xml +>> +>> Removing /home/wjc/git/gvoice-sms-takeout-xml/test_data/sms-vm-gvoice.xml.BAK +>> Renaming existing Voicemail output file to /home/wjc/git/gvoice-sms-takeout-xml/test_data/sms-vm-gvoice.xml.BAK +>> Voicemail MMS from Google Voice will be written to ../sms-vm-gvoice.xml, aka /home/wjc/git/gvoice-sms-takeout-xml/test_data/sms-vm-gvoice.xml +>> +>> Removing /home/wjc/git/gvoice-sms-takeout-xml/test_data/sms-chat.xml.BAK +>> Renaming existing SMS/MMS output file to /home/wjc/git/gvoice-sms-takeout-xml/test_data/sms-chat.xml.BAK +>> SMS/MMS from Google Chat will be written to ../sms-chat.xml, aka /home/wjc/git/gvoice-sms-takeout-xml/test_data/sms-chat.xml +>> 
+>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> +>> 1st pass reading *.html files under Voice/Calls, aka /home/wjc/git/gvoice-sms-takeout-xml/test_data/Takeout/Voice/Calls +>> Info: conflicting information about "Edson Arantes do Nascimento": ['+17323214444'] '+15703214444' +>> due to File: "/home/wjc/git/gvoice-sms-takeout-xml/test_data/Takeout/Voice/Calls/Edson Arantes do Nascimento - Voicemail - 2016-05-01T00_16_43Z.html" +>> Info: conflicting information about "Edson Arantes do Nascimento": ['+17323214444', '+15703214444'] '+12123214444' +>> due to File: "/home/wjc/git/gvoice-sms-takeout-xml/test_data/Takeout/Voice/Calls/Edson Arantes do Nascimento - Voicemail - 2014-05-17T02_54_27Z.html" +>> Your 'Me' phone number is +17323210011 +>> 2nd pass reading *.html files under Voice/Calls, aka /home/wjc/git/gvoice-sms-takeout-xml/test_data/Takeout/Voice/Calls + +TODO: Missing or disallowed +phonenumber for contact: "Agatha M Christie": "+", + due to File: "/home/wjc/git/gvoice-sms-takeout-xml/test_data/Takeout/Voice/Calls/Agatha M Christie - Text - 2023-10-22T17_28_34Z.html" + +TODO: Missing contact phone number in HTML file. Using '0000000000'. 
+ due to File: "/home/wjc/git/gvoice-sms-takeout-xml/test_data/Takeout/Voice/Calls/ - Placed - 2013-07-29T20_56_11Z.html" +>> Reading chat files under Voice/Calls, aka /home/wjc/git/gvoice-sms-takeout-xml/test_data/Takeout/Voice/Calls + +>> Counters: +>> 62 SMS/MMS records from Google Voice written to ../sms-gvoice.xml, aka /home/wjc/git/gvoice-sms-takeout-xml/test_data/sms-gvoice.xml +>> 3 Voicemail records from Google Voice written to ../sms-vm-gvoice.xml, aka /home/wjc/git/gvoice-sms-takeout-xml/test_data/sms-vm-gvoice.xml +>> 10 Call records from Google Voice written to ../calls-gvoice.xml, aka /home/wjc/git/gvoice-sms-takeout-xml/test_data/calls-gvoice.xml +>> 0 SMS/MMS records from Google Chat written to ../sms-chat.xml, aka /home/wjc/git/gvoice-sms-takeout-xml/test_data/sms-chat.xml +>> 14 Contacts discovered in HTML files +>> 2 Conflict info warnings given +>> 2 TODO errors given +>> Recap of conflict info warnings: +>> Edson Arantes do Nascimento: ['+17323214444', '+15703214444', '+12123214444'] +>> Recap of missing or unresolved contacts (not including disallowed numbers): +>> {'Agatha M Christie'} + +Mappings of names-to-numbers (configured and discovered): +{ 'Alan A Milne': [('+17323212222', 1696201093000, False)], + 'Albert Einstein': [('+17323213333', 1558549136000, False)], + 'Debbie One': [('+12125550001', 1696191235000, False)], + 'Edson Arantes do Nascimento': [ ('+17323214444', 1696194960000, False), + ('+15703214444', 1462061803000, False), + ('+12123214444', 1400295267000, False)], + 'Laura Five': [('+12125550005', 1696220436000, False)], + 'Mary Four': [('+12125550004', 1696208230000, False)], + 'Me': [('+17323210011', 1697995714000, False)], + 'Missy Two': [('+12125550002', 1696190939000, False)], + 'Rosalind E Franklin': [('+17323211313', 1626211647000, False)], + 'Søren Aabye Kierkegaard': [('+17323211414', 1696198699000, False)], + 'Trish Three': [('+12125550003', 1696208303000, False)], + 'William Shakespeare': [('+17323211515', 
1640039728000, False)]} + +Mappings of numbers-to-names (computed reverse mappings) +{ '+12123214444': 'Edson Arantes do Nascimento', + '+12125550001': 'Debbie One', + '+12125550002': 'Missy Two', + '+12125550003': 'Trish Three', + '+12125550004': 'Mary Four', + '+12125550005': 'Laura Five', + '+15703214444': 'Edson Arantes do Nascimento', + '+17323210011': 'Me', + '+17323211313': 'Rosalind E Franklin', + '+17323211414': 'Søren Aabye Kierkegaard', + '+17323211515': 'William Shakespeare', + '+17323212222': 'Alan A Milne', + '+17323213333': 'Albert Einstein', + '+17323214444': 'Edson Arantes do Nascimento'} + +Mappings of names-to-names (configured name aliases): +{} + +Mappings of numbers-to-numbers (configured number aliases): +{} +``` diff --git a/test_data/Takeout/Google Chat/Groups/DM 1pRI-QAAAAE/File-2015-04-19.jpg b/test_data/Takeout/Google Chat/Groups/DM 1pRI-QAAAAE/File-2015-04-19.jpg new file mode 100644 index 0000000..9e293f1 Binary files /dev/null and b/test_data/Takeout/Google Chat/Groups/DM 1pRI-QAAAAE/File-2015-04-19.jpg differ diff --git a/test_data/Takeout/Google Chat/Groups/DM 1pRI-QAAAAE/File-2015-04-27(1).jpg b/test_data/Takeout/Google Chat/Groups/DM 1pRI-QAAAAE/File-2015-04-27(1).jpg new file mode 100644 index 0000000..9e293f1 Binary files /dev/null and b/test_data/Takeout/Google Chat/Groups/DM 1pRI-QAAAAE/File-2015-04-27(1).jpg differ diff --git a/test_data/Takeout/Google Chat/Groups/DM 1pRI-QAAAAE/File-2015-04-27.jpg b/test_data/Takeout/Google Chat/Groups/DM 1pRI-QAAAAE/File-2015-04-27.jpg new file mode 100644 index 0000000..9e293f1 Binary files /dev/null and b/test_data/Takeout/Google Chat/Groups/DM 1pRI-QAAAAE/File-2015-04-27.jpg differ diff --git a/test_data/Takeout/Google Chat/Groups/DM 1pRI-QAAAAE/File-2015-06-08.jpg b/test_data/Takeout/Google Chat/Groups/DM 1pRI-QAAAAE/File-2015-06-08.jpg new file mode 100644 index 0000000..9e293f1 Binary files /dev/null and b/test_data/Takeout/Google Chat/Groups/DM 1pRI-QAAAAE/File-2015-06-08.jpg differ 
diff --git a/test_data/Takeout/Google Chat/Groups/DM 1pRI-QAAAAE/File-2015-08-18.png b/test_data/Takeout/Google Chat/Groups/DM 1pRI-QAAAAE/File-2015-08-18.png new file mode 100644 index 0000000..13c8b65 Binary files /dev/null and b/test_data/Takeout/Google Chat/Groups/DM 1pRI-QAAAAE/File-2015-08-18.png differ diff --git a/test_data/Takeout/Google Chat/Groups/DM 1pRI-QAAAAE/File-2016-01-15.png b/test_data/Takeout/Google Chat/Groups/DM 1pRI-QAAAAE/File-2016-01-15.png new file mode 100644 index 0000000..13c8b65 Binary files /dev/null and b/test_data/Takeout/Google Chat/Groups/DM 1pRI-QAAAAE/File-2016-01-15.png differ diff --git a/test_data/Takeout/Google Chat/Groups/DM 1pRI-QAAAAE/File-Callouts_Internet_LMAO(1).png b/test_data/Takeout/Google Chat/Groups/DM 1pRI-QAAAAE/File-Callouts_Internet_LMAO(1).png new file mode 100644 index 0000000..13c8b65 Binary files /dev/null and b/test_data/Takeout/Google Chat/Groups/DM 1pRI-QAAAAE/File-Callouts_Internet_LMAO(1).png differ diff --git a/test_data/Takeout/Google Chat/Groups/DM 1pRI-QAAAAE/File-Callouts_Internet_LMAO(2).png b/test_data/Takeout/Google Chat/Groups/DM 1pRI-QAAAAE/File-Callouts_Internet_LMAO(2).png new file mode 100644 index 0000000..13c8b65 Binary files /dev/null and b/test_data/Takeout/Google Chat/Groups/DM 1pRI-QAAAAE/File-Callouts_Internet_LMAO(2).png differ diff --git a/test_data/Takeout/Google Chat/Groups/DM 1pRI-QAAAAE/File-Callouts_Internet_LMAO.png b/test_data/Takeout/Google Chat/Groups/DM 1pRI-QAAAAE/File-Callouts_Internet_LMAO.png new file mode 100644 index 0000000..13c8b65 Binary files /dev/null and b/test_data/Takeout/Google Chat/Groups/DM 1pRI-QAAAAE/File-Callouts_Internet_LMAO.png differ diff --git a/test_data/Takeout/Google Chat/Groups/DM 1pRI-QAAAAE/File-Selection_003.png b/test_data/Takeout/Google Chat/Groups/DM 1pRI-QAAAAE/File-Selection_003.png new file mode 100644 index 0000000..13c8b65 Binary files /dev/null and b/test_data/Takeout/Google Chat/Groups/DM 1pRI-QAAAAE/File-Selection_003.png 
differ diff --git a/test_data/Takeout/Google Chat/Groups/DM 1pRI-QAAAAE/File-Selection_004(1).png b/test_data/Takeout/Google Chat/Groups/DM 1pRI-QAAAAE/File-Selection_004(1).png new file mode 100644 index 0000000..13c8b65 Binary files /dev/null and b/test_data/Takeout/Google Chat/Groups/DM 1pRI-QAAAAE/File-Selection_004(1).png differ diff --git a/test_data/Takeout/Google Chat/Groups/DM 1pRI-QAAAAE/File-Selection_004.png b/test_data/Takeout/Google Chat/Groups/DM 1pRI-QAAAAE/File-Selection_004.png new file mode 100644 index 0000000..13c8b65 Binary files /dev/null and b/test_data/Takeout/Google Chat/Groups/DM 1pRI-QAAAAE/File-Selection_004.png differ diff --git a/test_data/Takeout/Google Chat/Groups/DM 1pRI-QAAAAE/File-Selection_005.png b/test_data/Takeout/Google Chat/Groups/DM 1pRI-QAAAAE/File-Selection_005.png new file mode 100644 index 0000000..13c8b65 Binary files /dev/null and b/test_data/Takeout/Google Chat/Groups/DM 1pRI-QAAAAE/File-Selection_005.png differ diff --git a/test_data/Takeout/Google Chat/Groups/DM 1pRI-QAAAAE/File-Selection_007.png b/test_data/Takeout/Google Chat/Groups/DM 1pRI-QAAAAE/File-Selection_007.png new file mode 100644 index 0000000..13c8b65 Binary files /dev/null and b/test_data/Takeout/Google Chat/Groups/DM 1pRI-QAAAAE/File-Selection_007.png differ diff --git a/test_data/Takeout/Google Chat/Groups/DM 1pRI-QAAAAE/File-diapers.jpg b/test_data/Takeout/Google Chat/Groups/DM 1pRI-QAAAAE/File-diapers.jpg new file mode 100644 index 0000000..9e293f1 Binary files /dev/null and b/test_data/Takeout/Google Chat/Groups/DM 1pRI-QAAAAE/File-diapers.jpg differ diff --git a/test_data/Takeout/Google Chat/Groups/DM 1pRI-QAAAAE/group_info.json b/test_data/Takeout/Google Chat/Groups/DM 1pRI-QAAAAE/group_info.json new file mode 100644 index 0000000..3304cf0 --- /dev/null +++ b/test_data/Takeout/Google Chat/Groups/DM 1pRI-QAAAAE/group_info.json @@ -0,0 +1,14 @@ +{ + "members": [ + { + "name": "Alan A Milne", + "email": "aamilne@authors.example.com", + 
"user_type": "Human" + }, + { + "name": "Maria S Curie", + "email": "mssc@science.example.org", + "user_type": "Human" + } + ] +} \ No newline at end of file diff --git a/test_data/Takeout/Google Chat/Groups/DM 1pRI-QAAAAE/messages.json b/test_data/Takeout/Google Chat/Groups/DM 1pRI-QAAAAE/messages.json new file mode 100644 index 0000000..e392f84 --- /dev/null +++ b/test_data/Takeout/Google Chat/Groups/DM 1pRI-QAAAAE/messages.json @@ -0,0 +1,694 @@ +{ + "messages": [ + { + "creator": { + "name": "Alan A Milne", + "email": "aamilne@authors.example.com", + "user_type": "Human" + }, + "created_date": "Wednesday, April 15, 2015 at 12:51:08 AM UTC", + "text": "$95.00", + "topic_id": "Zz55N1gqWfo", + "message_id": "1pRI-QAAAAE/Zz55N1gqWfo/Zz55N1gqWfo" + }, + { + "creator": { + "name": "Alan A Milne", + "email": "aamilne@authors.example.com", + "user_type": "Human" + }, + "created_date": "Wednesday, April 15, 2015 at 12:51:25 AM UTC", + "text": "Per $1000", + "topic_id": "tnu2Fw2cpGg", + "message_id": "1pRI-QAAAAE/tnu2Fw2cpGg/tnu2Fw2cpGg" + }, + { + "creator": { + "name": "Maria S Curie", + "email": "mssc@science.example.org", + "user_type": "Human" + }, + "created_date": "Wednesday, April 15, 2015 at 12:51:41 AM UTC", + "text": "I\u0027ve heard of polymaths, but I think you might be the first demi-math", + "topic_id": "beJ6uN-Eb_A", + "message_id": "1pRI-QAAAAE/beJ6uN-Eb_A/beJ6uN-Eb_A" + }, + { + "creator": { + "name": "Alan A Milne", + "email": "aamilne@authors.example.com", + "user_type": "Human" + }, + "created_date": "Wednesday, April 15, 2015 at 12:54:38 AM UTC", + "text": "Demimath is a subset of polymath. So, by defn, less than a polymath. Demi Moore may be a full polymath. 
", + "topic_id": "GwQIco6BxCw", + "message_id": "1pRI-QAAAAE/GwQIco6BxCw/GwQIco6BxCw" + }, + { + "creator": { + "name": "Maria S Curie", + "email": "mssc@science.example.org", + "user_type": "Human" + }, + "created_date": "Wednesday, April 15, 2015 at 12:55:13 AM UTC", + "text": "that adds up", + "topic_id": "LS3BdKinZm8", + "message_id": "1pRI-QAAAAE/LS3BdKinZm8/LS3BdKinZm8" + }, + { + "creator": { + "name": "Maria S Curie", + "email": "mssc@science.example.org", + "user_type": "Human" + }, + "created_date": "Sunday, April 19, 2015 at 11:11:30 PM UTC", + "attached_files": [ + { + "original_name": "2015-04-19.jpg", + "export_name": "File-2015-04-19.jpg" + } + ], + "topic_id": "KOkx9Bb7sXI", + "message_id": "1pRI-QAAAAE/KOkx9Bb7sXI/KOkx9Bb7sXI" + }, + { + "creator": { + "name": "Maria S Curie", + "email": "mssc@science.example.org", + "user_type": "Human" + }, + "created_date": "Monday, April 27, 2015 at 5:48:51 PM UTC", + "text": "I was in Iceland for the weekend.", + "topic_id": "w3XV6_eJp50", + "message_id": "1pRI-QAAAAE/w3XV6_eJp50/w3XV6_eJp50" + }, + { + "creator": { + "name": "Alan A Milne", + "email": "aamilne@authors.example.com", + "user_type": "Human" + }, + "created_date": "Monday, April 27, 2015 at 5:49:30 PM UTC", + "text": "Nice. Always wanted to go, but fear the volcanoes. ", + "topic_id": "ZAX8yofdX_w", + "message_id": "1pRI-QAAAAE/ZAX8yofdX_w/ZAX8yofdX_w" + }, + { + "creator": { + "name": "Maria S Curie", + "email": "mssc@science.example.org", + "user_type": "Human" + }, + "created_date": "Monday, April 27, 2015 at 5:52:06 PM UTC", + "text": "fyi, easy to go to iceland for a short visit if you are going to europe for some other reason (like I did). Icelandair organizes things so you can lay over there for up to a week withtout affecting your airfare. 
Icelandair is already pretty cheap to use for Europe.", + "topic_id": "ulenbtsa7Os", + "message_id": "1pRI-QAAAAE/ulenbtsa7Os/ulenbtsa7Os" + }, + { + "creator": { + "name": "Alan A Milne", + "email": "aamilne@authors.example.com", + "user_type": "Human" + }, + "created_date": "Monday, April 27, 2015 at 8:16:14 PM UTC", + "text": "Maybe next time. ", + "topic_id": "Mfb9lX-uNkc", + "message_id": "1pRI-QAAAAE/Mfb9lX-uNkc/Mfb9lX-uNkc" + }, + { + "creator": { + "name": "Alan A Milne", + "email": "aamilne@authors.example.com", + "user_type": "Human" + }, + "created_date": "Monday, April 27, 2015 at 8:29:01 PM UTC", + "attached_files": [ + { + "original_name": "2015-04-27.jpg", + "export_name": "File-2015-04-27.jpg" + } + ], + "topic_id": "N-9_2Qw6DvE", + "message_id": "1pRI-QAAAAE/N-9_2Qw6DvE/N-9_2Qw6DvE" + }, + { + "creator": { + "name": "Alan A Milne", + "email": "aamilne@authors.example.com", + "user_type": "Human" + }, + "created_date": "Monday, April 27, 2015 at 8:29:04 PM UTC", + "text": "Today", + "topic_id": "ASeivfGueSE", + "message_id": "1pRI-QAAAAE/ASeivfGueSE/ASeivfGueSE" + }, + { + "creator": { + "name": "Maria S Curie", + "email": "mssc@science.example.org", + "user_type": "Human" + }, + "created_date": "Monday, April 27, 2015 at 8:29:18 PM UTC", + "text": "ok, you\u0027re good", + "topic_id": "bFfiHwc63Uw", + "message_id": "1pRI-QAAAAE/bFfiHwc63Uw/bFfiHwc63Uw" + }, + { + "creator": { + "name": "Alan A Milne", + "email": "aamilne@authors.example.com", + "user_type": "Human" + }, + "created_date": "Monday, April 27, 2015 at 8:31:13 PM UTC", + "attached_files": [ + { + "original_name": "2015-04-27.jpg", + "export_name": "File-2015-04-27.jpg" + } + ], + "topic_id": "z01hmbuEe8E", + "message_id": "1pRI-QAAAAE/z01hmbuEe8E/z01hmbuEe8E" + }, + { + "creator": { + "name": "Maria S Curie", + "email": "mssc@science.example.org", + "user_type": "Human" + }, + "created_date": "Monday, April 27, 2015 at 8:31:37 PM UTC", + "text": "the good news is that haven\u0027t yet 
gone metric", + "topic_id": "TmvMM-l0g8s", + "message_id": "1pRI-QAAAAE/TmvMM-l0g8s/TmvMM-l0g8s" + }, + { + "creator": { + "name": "Alan A Milne", + "email": "aamilne@authors.example.com", + "user_type": "Human" + }, + "created_date": "Monday, April 27, 2015 at 8:32:01 PM UTC", + "text": "Oh. Well then, I take it all back. ", + "topic_id": "E2YnT706fYk", + "message_id": "1pRI-QAAAAE/E2YnT706fYk/E2YnT706fYk" + }, + { + "creator": { + "name": "Alan A Milne", + "email": "aamilne@authors.example.com", + "user_type": "Human" + }, + "created_date": "Tuesday, April 28, 2015 at 1:21:56 AM UTC", + "text": "That\u0027s how it appears. HTTPS://joker.com/faq/8/77/en/how-to-change-the-owner-of-a-domain.html. Seems they distinguish between \"owner\" and \"Whois\" registration. ", + "annotations": [ + { + "start_index": 83, + "length": 70, + "url_metadata": { + "title": "", + "snippet": "", + "image_url": "", + "url": { + "private_do_not_access_or_else_safe_url_wrapped_value": "HTTPS://joker.com/faq/8/77/en/how-to-change-the-owner-of-a-domain.html" + } + } + } + ], + "topic_id": "pEj_nVyjZwY", + "message_id": "1pRI-QAAAAE/pEj_nVyjZwY/pEj_nVyjZwY" + }, + { + "creator": { + "name": "Alan A Milne", + "email": "aamilne@authors.example.com", + "user_type": "Human" + }, + "created_date": "Tuesday, April 28, 2015 at 1:22:57 AM UTC", + "text": "https://joker.com/faq/content/8/77/en/how-to-change-the-owner-of-a-domain.html", + "annotations": [ + { + "start_index": 0, + "length": 78, + "url_metadata": { + "title": "Joker.com FAQ - How to Change the Owner of a Domain?", + "snippet": "", + "image_url": "", + "url": { + "private_do_not_access_or_else_safe_url_wrapped_value": "https://joker.com/faq/content/8/77/en/how-to-change-the-owner-of-a-domain.html" + } + } + } + ], + "topic_id": "S2sWvVLTXrA", + "message_id": "1pRI-QAAAAE/S2sWvVLTXrA/S2sWvVLTXrA" + }, + { + "creator": { + "name": "Maria S Curie", + "email": "mssc@science.example.org", + "user_type": "Human" + }, + "created_date": 
"Tuesday, April 28, 2015 at 1:25:35 AM UTC", + "text": "yeah, seems so", + "topic_id": "DErtPnswy2A", + "message_id": "1pRI-QAAAAE/DErtPnswy2A/DErtPnswy2A" + }, + { + "creator": { + "name": "Maria S Curie", + "email": "mssc@science.example.org", + "user_type": "Human" + }, + "created_date": "Tuesday, April 28, 2015 at 1:27:14 AM UTC", + "attached_files": [ + { + "original_name": "Selection_003.png", + "export_name": "File-Selection_003.png" + } + ], + "topic_id": "9IJVDNcmWeg", + "message_id": "1pRI-QAAAAE/9IJVDNcmWeg/9IJVDNcmWeg" + }, + { + "creator": { + "name": "Maria S Curie", + "email": "mssc@science.example.org", + "user_type": "Human" + }, + "created_date": "Tuesday, April 28, 2015 at 1:32:55 AM UTC", + "text": "I hope it\u0027s not someone in Baltimore", + "topic_id": "RNaE4c6AKak", + "message_id": "1pRI-QAAAAE/RNaE4c6AKak/RNaE4c6AKak" + }, + { + "creator": { + "name": "Maria S Curie", + "email": "mssc@science.example.org", + "user_type": "Human" + }, + "created_date": "Tuesday, April 28, 2015 at 3:01:39 AM UTC", + "text": "you da man:", + "topic_id": "OgtUTNLkcWc", + "message_id": "1pRI-QAAAAE/OgtUTNLkcWc/OgtUTNLkcWc" + }, + { + "creator": { + "name": "Maria S Curie", + "email": "mssc@science.example.org", + "user_type": "Human" + }, + "created_date": "Tuesday, April 28, 2015 at 3:01:49 AM UTC", + "attached_files": [ + { + "original_name": "Selection_004.png", + "export_name": "File-Selection_004.png" + } + ], + "topic_id": "mt6tlae-UkI", + "message_id": "1pRI-QAAAAE/mt6tlae-UkI/mt6tlae-UkI" + }, + { + "creator": { + "name": "Maria S Curie", + "email": "mssc@science.example.org", + "user_type": "Human" + }, + "created_date": "Thursday, May 7, 2015 at 4:26:12 PM UTC", + "text": "are you still out of town or back?", + "topic_id": "MT1u5rH5MEU", + "message_id": "1pRI-QAAAAE/MT1u5rH5MEU/MT1u5rH5MEU" + }, + { + "creator": { + "name": "Maria S Curie", + "email": "mssc@science.example.org", + "user_type": "Human" + }, + "created_date": "Thursday, May 7, 2015 at 
4:26:19 PM UTC", + "text": "your back, not mine", + "topic_id": "iVfg2udUnW0", + "message_id": "1pRI-QAAAAE/iVfg2udUnW0/iVfg2udUnW0" + }, + { + "creator": { + "name": "Maria S Curie", + "email": "mssc@science.example.org", + "user_type": "Human" + }, + "created_date": "Thursday, May 7, 2015 at 4:28:03 PM UTC", + "text": "I have not had any reports of problems", + "topic_id": "ePSeSyVKdck", + "message_id": "1pRI-QAAAAE/ePSeSyVKdck/ePSeSyVKdck" + }, + { + "creator": { + "name": "Maria S Curie", + "email": "mssc@science.example.org", + "user_type": "Human" + }, + "created_date": "Wednesday, May 13, 2015 at 1:12:52 AM UTC", + "text": "I was on the phone when you called. Frickin day job. Who needs it.", + "topic_id": "al0sU4hCG54", + "message_id": "1pRI-QAAAAE/al0sU4hCG54/al0sU4hCG54" + }, + { + "creator": { + "name": "Maria S Curie", + "email": "mssc@science.example.org", + "user_type": "Human" + }, + "created_date": "Wednesday, May 13, 2015 at 1:19:04 AM UTC", + "text": "did you try \"password1\"?", + "topic_id": "oVUZJnyof1o", + "message_id": "1pRI-QAAAAE/oVUZJnyof1o/oVUZJnyof1o" + }, + { + "creator": { + "name": "Maria S Curie", + "email": "mssc@science.example.org", + "user_type": "Human" + }, + "created_date": "Wednesday, May 13, 2015 at 1:31:09 AM UTC", + "text": "found it", + "topic_id": "veICUzxEkRE", + "message_id": "1pRI-QAAAAE/veICUzxEkRE/veICUzxEkRE" + }, + { + "creator": { + "name": "Maria S Curie", + "email": "mssc@science.example.org", + "user_type": "Human" + }, + "created_date": "Wednesday, May 13, 2015 at 2:24:43 AM UTC", + "text": "GENERAL ESCROW INSTRUCTIONS\nFlyByNight Escrow Services, Inc., a California corporation (\"FBNES\") is the direct provider of escrow services, and is licensed by the Department of Business Oversight, State of California, License Number 000 0000, the Arizona Department of Financial Institutions EA 0000000, and the Idaho Department of Finance, License Number ESC-0000. 
The following provisions shall be referred to as the \"General Escrow Instructions.\" The parties hereto employ, authorize and instruct FBNES to act as Escrow Holder and Escrow Agent in connection with the Transaction under the terms and conditions on the Transaction Detail Screens, these General Escrow Instructions, Terms of Use and any supplemental Escrow Instructions as hereinafter defined and all collectively referred to as the \"Transaction Escrow Instructions.\" FBNES and FBNES are hereinafter collectively referred to as \"FBNES\" and intermittently referred to as \"we\" or \"us.\"\nGeneral Provisions\nTransactions performed by a Buyer, Seller and Broker on the FBNES site, shall be governed by the Uniform Electronic Transactions Act (Cal. Civil Code Section 1633.1 et. seq.) and the California Financial Code governing Escrow Regulations.\nA person or entity offering personal property for sale, and desiring to use the FBNES site in order to close such a sale, shall hereinafter be referred to as \"Seller\". \nA person or entity desiring to purchase personal property from a Seller, by use of the Escrow.com site for completing the purchase shall be referred to as \"Buyer\". 
", + "topic_id": "oQHISnN5O6k", + "message_id": "1pRI-QAAAAE/oQHISnN5O6k/oQHISnN5O6k" + }, + { + "creator": { + "name": "Alan A Milne", + "email": "aamilne@authors.example.com", + "user_type": "Human" + }, + "created_date": "Wednesday, May 13, 2015 at 2:24:53 AM UTC", + "text": "Clearly, they never visited us", + "topic_id": "AGLKb-oi7l4", + "message_id": "1pRI-QAAAAE/AGLKb-oi7l4/AGLKb-oi7l4" + }, + { + "creator": { + "name": "Alan A Milne", + "email": "aamilne@authors.example.com", + "user_type": "Human" + }, + "created_date": "Wednesday, May 13, 2015 at 2:25:48 AM UTC", + "text": "So escrow services are legal in California?", + "topic_id": "HwEjl5cTN2w", + "message_id": "1pRI-QAAAAE/HwEjl5cTN2w/HwEjl5cTN2w" + }, + { + "creator": { + "name": "Maria S Curie", + "email": "mssc@science.example.org", + "user_type": "Human" + }, + "created_date": "Wednesday, May 13, 2015 at 2:26:02 AM UTC", + "text": "it\u0027s not a euphemism", + "topic_id": "KtLfWoY1X8s", + "message_id": "1pRI-QAAAAE/KtLfWoY1X8s/KtLfWoY1X8s" + }, + { + "creator": { + "name": "Alan A Milne", + "email": "aamilne@authors.example.com", + "user_type": "Human" + }, + "created_date": "Wednesday, May 13, 2015 at 2:27:41 AM UTC", + "text": "Good. I want the terms to be clear. ", + "topic_id": "i9GtYVhewec", + "message_id": "1pRI-QAAAAE/i9GtYVhewec/i9GtYVhewec" + }, + { + "creator": { + "name": "Maria S Curie", + "email": "mssc@science.example.org", + "user_type": "Human" + }, + "created_date": "Wednesday, May 13, 2015 at 2:28:01 AM UTC", + "text": "it\u0027s their turn now ... 
we\u0027re in it to win it", + "topic_id": "jCFDHIT9RsI", + "message_id": "1pRI-QAAAAE/jCFDHIT9RsI/jCFDHIT9RsI" + }, + { + "creator": { + "name": "Maria S Curie", + "email": "mssc@science.example.org", + "user_type": "Human" + }, + "created_date": "Wednesday, May 13, 2015 at 2:28:18 AM UTC", + "text": "May 12 2015 7:26PM PDT\nBoth parties have accepted the offer, awaiting buyer payment.", + "topic_id": "Xaf9UFtv_pA", + "message_id": "1pRI-QAAAAE/Xaf9UFtv_pA/Xaf9UFtv_pA" + }, + { + "creator": { + "name": "Alan A Milne", + "email": "aamilne@authors.example.com", + "user_type": "Human" + }, + "created_date": "Wednesday, May 13, 2015 at 2:28:47 AM UTC", + "text": "Awesome. Should we take out the copper fixtures?", + "topic_id": "m_KdI_dYOrQ", + "message_id": "1pRI-QAAAAE/m_KdI_dYOrQ/m_KdI_dYOrQ" + }, + { + "creator": { + "name": "Maria S Curie", + "email": "mssc@science.example.org", + "user_type": "Human" + }, + "created_date": "Wednesday, May 13, 2015 at 2:29:08 AM UTC", + "text": "I\u0027ve already drained out all the gas", + "topic_id": "I25fBXynGs8", + "message_id": "1pRI-QAAAAE/I25fBXynGs8/I25fBXynGs8" + }, + { + "creator": { + "name": "Alan A Milne", + "email": "aamilne@authors.example.com", + "user_type": "Human" + }, + "created_date": "Wednesday, May 13, 2015 at 2:29:22 AM UTC", + "text": "Obviously, it\u0027s past my bedtime. ", + "topic_id": "g1MHv-UfFZ0", + "message_id": "1pRI-QAAAAE/g1MHv-UfFZ0/g1MHv-UfFZ0" + }, + { + "creator": { + "name": "Alan A Milne", + "email": "aamilne@authors.example.com", + "user_type": "Human" + }, + "created_date": "Wednesday, May 13, 2015 at 2:29:35 AM UTC", + "text": "You have no such excuse. 
", + "topic_id": "jiNfO0dRjT0", + "message_id": "1pRI-QAAAAE/jiNfO0dRjT0/jiNfO0dRjT0" + }, + { + "creator": { + "name": "Maria S Curie", + "email": "mssc@science.example.org", + "user_type": "Human" + }, + "created_date": "Wednesday, May 27, 2015 at 7:29:39 PM UTC", + "text": "figured out what to do with my spare money: http://boingboing.net/2015/05/27/son-of-chinas-richest-man-th.html", + "annotations": [ + { + "start_index": 50, + "length": 66, + "url_metadata": { + "title": "Son of China\u0027s richest man thinks two gold Apple Watches is enough for his dog | Boing Boing", + "snippet": "27-year-old Wang Sicong is the son of Wang Jianlin, worth about $34 billion dollars.", + "image_url": "https://lh6.googleusercontent.com/proxy/FPaachWZKCfTIA3Oe6qpnQ008RYuy5BGbfMTH8lnGNNPz843zZ8rM_VsZvzkGwDNnF5PKsiKzaGtqWBRctiIkSZ1Fzje7OX_9efCa6dKUtFrjAJrDicECKQXUzz2ILsmqaLyGHE5dczffLm15H-t8DOi_OxYvSTPDKnRESPw9b_JlA", + "url": { + "private_do_not_access_or_else_safe_url_wrapped_value": "http://boingboing.net/2015/05/27/son-of-chinas-richest-man-th.html" + } + } + } + ], + "topic_id": "BpRtOkyUyNI", + "message_id": "1pRI-QAAAAE/BpRtOkyUyNI/BpRtOkyUyNI" + }, + { + "creator": { + "name": "Maria S Curie", + "email": "mssc@science.example.org", + "user_type": "Human" + }, + "created_date": "Thursday, May 28, 2015 at 2:01:17 AM UTC", + "attached_files": [ + { + "original_name": "Selection_007.png", + "export_name": "File-Selection_007.png" + } + ], + "topic_id": "uYshDgKfruE", + "message_id": "1pRI-QAAAAE/uYshDgKfruE/uYshDgKfruE" + }, + { + "creator": { + "name": "Alan A Milne", + "email": "aamilne@authors.example.com", + "user_type": "Human" + }, + "created_date": "Tuesday, June 30, 2015 at 7:25:36 PM UTC", + "attached_files": [ + { + "original_name": "Callouts_Internet_LMAO.png", + "export_name": "File-Callouts_Internet_LMAO.png" + } + ], + "topic_id": "hoDe9zXAY3A", + "message_id": "1pRI-QAAAAE/hoDe9zXAY3A/hoDe9zXAY3A" + }, + { + "creator": { + "name": "Alan A Milne", + "email": 
"aamilne@authors.example.com", + "user_type": "Human" + }, + "created_date": "Tuesday, June 30, 2015 at 7:25:41 PM UTC", + "attached_files": [ + { + "original_name": "Callouts_Internet_LMAO.png", + "export_name": "File-Callouts_Internet_LMAO.png" + } + ], + "topic_id": "qKPfSD3XQYA", + "message_id": "1pRI-QAAAAE/qKPfSD3XQYA/qKPfSD3XQYA" + }, + { + "creator": { + "name": "Maria S Curie", + "email": "mssc@science.example.org", + "user_type": "Human" + }, + "created_date": "Monday, July 13, 2015 at 7:11:16 PM UTC", + "text": "http://boingboing.net/2015/07/13/the-perfect-emacs-setup.html", + "annotations": [ + { + "start_index": 0, + "length": 61, + "url_metadata": { + "title": "The perfect Emacs setup | Boing Boing", + "snippet": "Everything you don’t need to know about the legendary expandable text editor, courtesy of @ieure.", + "image_url": "https://lh4.googleusercontent.com/proxy/AEGsGueXleesLABN2nv2DgqDkQe9PtTJXzGqZMemo94MXPMBFbLmHrHMoDgojvKh6iIStxSrWN5oqEHJwramPrC5NexBoZIa1dpFcr0pHRMP_O7NTFCKaJICWhpsl4VEj0bOsqi59xz2oSQKPfLI-zq_P7NzsiM", + "url": { + "private_do_not_access_or_else_safe_url_wrapped_value": "http://boingboing.net/2015/07/13/the-perfect-emacs-setup.html" + } + } + } + ], + "topic_id": "TRPlduCQln0", + "message_id": "1pRI-QAAAAE/TRPlduCQln0/TRPlduCQln0" + }, + { + "creator": { + "name": "Maria S Curie", + "email": "mssc@science.example.org", + "user_type": "Human" + }, + "created_date": "Tuesday, August 18, 2015 at 8:39:19 PM UTC", + "attached_files": [ + { + "original_name": "2015-08-18.png", + "export_name": "File-2015-08-18.png" + } + ], + "topic_id": "JD-LAr8or3A", + "message_id": "1pRI-QAAAAE/JD-LAr8or3A/JD-LAr8or3A" + }, + { + "creator": { + "name": "Maria S Curie", + "email": "mssc@science.example.org", + "user_type": "Human" + }, + "created_date": "Tuesday, January 12, 2016 at 11:29:28 PM UTC", + "text": "http://boingboing.net/features/northkorea/?traitor\u003dAA+Milne", + "annotations": [ + { + "start_index": 0, + "length": 61, + 
"url_metadata": { + "title": "Denunciation of Imperialist Lackey!", + "snippet": "Official news from the Central News Agency of the Democratic People\u0027s Republic of Korea", + "image_url": "https://lh4.googleusercontent.com/proxy/4g1QfpdDCIky3O_LveJQ3kvw6ms4hy7plbBxpgMg7NVih8cEyd0M1F5tpwnsc1qEJaAyx2OkMikKS6BNyofKg8Erjg", + "url": { + "private_do_not_access_or_else_safe_url_wrapped_value": "http://boingboing.net/features/northkorea/?traitor\u003dAndy+Heard" + } + } + } + ], + "topic_id": "D46pjyf9sUE", + "message_id": "1pRI-QAAAAE/D46pjyf9sUE/D46pjyf9sUE" + }, + { + "creator": { + "name": "Alan A Milne", + "email": "aamilne@authors.example.com", + "user_type": "Human" + }, + "created_date": "Friday, January 15, 2016 at 4:07:30 PM UTC", + "text": "Hey -- I just saw this.  I must say I\u0027m honored.", + "topic_id": "B_G1sAL6gNs", + "message_id": "1pRI-QAAAAE/B_G1sAL6gNs/B_G1sAL6gNs" + }, + { + "creator": { + "name": "Maria S Curie", + "email": "mssc@science.example.org", + "user_type": "Human" + }, + "created_date": "Monday, February 1, 2016 at 4:27:30 PM UTC", + "text": "https://twitter.com/devops_borat", + "annotations": [ + { + "start_index": 0, + "length": 32, + "url_metadata": { + "title": "DevOps Borat (@DEVOPS_BORAT) | Twitter", + "snippet": "", + "image_url": "https://lh6.googleusercontent.com/proxy/oZeMfrcU7KtXOYoPsBmcgF-oVSeBOY9Kzzwcw4XXv1BtymDa-aiL76fyGh55qMqs7qBKUi1iiFtZCeMjXN1mgsXJ6o2XY_LxxXAgKzZ7kTv2PY0nrjwTV-f7g7ebjnqAMdOeNAeyLohH_0RlymUMYw", + "url": { + "private_do_not_access_or_else_safe_url_wrapped_value": "https://twitter.com/devops_borat" + } + } + } + ], + "topic_id": "qcsTpBvw4s8", + "message_id": "1pRI-QAAAAE/qcsTpBvw4s8/qcsTpBvw4s8" + }, + { + "creator": { + "name": "Alan A Milne", + "email": "aamilne@authors.example.com", + "user_type": "Human" + }, + "created_date": "Saturday, November 14, 2020 at 4:06:38 PM UTC", + "text": "Hey! 
Where’ve you been?", + "topic_id": "2O5vHAXbXw0", + "message_id": "1pRI-QAAAAE/2O5vHAXbXw0/2O5vHAXbXw0" + }, + { + "creator": { + "name": "Maria S Curie", + "email": "mssc@science.example.org", + "user_type": "Human" + }, + "created_date": "Saturday, November 14, 2020 at 4:07:51 PM UTC", + "text": "I\u0027ve been off doing a research project. The aim is to figure out Google messaging. It\u0027s a tough slog.", + "topic_id": "kzw5c6uAz0E", + "message_id": "1pRI-QAAAAE/kzw5c6uAz0E/kzw5c6uAz0E" + } + ] +} diff --git a/test_data/Takeout/Google Chat/Groups/DM 2k64eoAAAAE/group_info.json b/test_data/Takeout/Google Chat/Groups/DM 2k64eoAAAAE/group_info.json new file mode 100644 index 0000000..acfe3b4 --- /dev/null +++ b/test_data/Takeout/Google Chat/Groups/DM 2k64eoAAAAE/group_info.json @@ -0,0 +1,14 @@ +{ + "members": [ + { + "name": "Albert Einstein", + "email": "emc2@science.example.org", + "user_type": "Human" + }, + { + "name": "Maria S Curie", + "email": "mssc@science.example.org", + "user_type": "Human" + } + ] +} \ No newline at end of file diff --git a/test_data/Takeout/Google Chat/Groups/DM 2k64eoAAAAE/messages.json b/test_data/Takeout/Google Chat/Groups/DM 2k64eoAAAAE/messages.json new file mode 100644 index 0000000..8772b33 --- /dev/null +++ b/test_data/Takeout/Google Chat/Groups/DM 2k64eoAAAAE/messages.json @@ -0,0 +1,26 @@ +{ + "messages": [ + { + "creator": { + "name": "Maria S Curie", + "email": "mssc@science.example.org", + "user_type": "Human" + }, + "created_date": "Monday, August 25, 2014 at 5:29:21 PM UTC", + "text": "you are really on top of this social networking stuff; have you had training?", + "topic_id": "AHCQlT0wH1c", + "message_id": "2k64eoAAAAE/AHCQlT0wH1c/AHCQlT0wH1c" + }, + { + "creator": { + "name": "Albert Einstein", + "email": "emc2@science.example.org", + "user_type": "Human" + }, + "created_date": "Wednesday, August 27, 2014 at 5:55:31 AM UTC", + "text": "I was too busy blogging and missed this.  Yes - I\u0027m really on top of it.  
Make sure you follow me on twitter.", + "topic_id": "QArPou6qfww", + "message_id": "2k64eoAAAAE/QArPou6qfww/QArPou6qfww" + } + ] +} \ No newline at end of file diff --git a/test_data/Takeout/Google Chat/Groups/Space AAAATeQdkhI/File-2018-11-18(1).jpg b/test_data/Takeout/Google Chat/Groups/Space AAAATeQdkhI/File-2018-11-18(1).jpg new file mode 100644 index 0000000..9e293f1 Binary files /dev/null and b/test_data/Takeout/Google Chat/Groups/Space AAAATeQdkhI/File-2018-11-18(1).jpg differ diff --git a/test_data/Takeout/Google Chat/Groups/Space AAAATeQdkhI/File-2018-11-18.jpg b/test_data/Takeout/Google Chat/Groups/Space AAAATeQdkhI/File-2018-11-18.jpg new file mode 100644 index 0000000..9e293f1 Binary files /dev/null and b/test_data/Takeout/Google Chat/Groups/Space AAAATeQdkhI/File-2018-11-18.jpg differ diff --git a/test_data/Takeout/Google Chat/Groups/Space AAAATeQdkhI/File-390445992115543672_account_id=0.jpg b/test_data/Takeout/Google Chat/Groups/Space AAAATeQdkhI/File-390445992115543672_account_id=0.jpg new file mode 100644 index 0000000..9e293f1 Binary files /dev/null and b/test_data/Takeout/Google Chat/Groups/Space AAAATeQdkhI/File-390445992115543672_account_id=0.jpg differ diff --git a/test_data/Takeout/Google Chat/Groups/Space AAAATeQdkhI/group_info.json b/test_data/Takeout/Google Chat/Groups/Space AAAATeQdkhI/group_info.json new file mode 100644 index 0000000..c6c5bcf --- /dev/null +++ b/test_data/Takeout/Google Chat/Groups/Space AAAATeQdkhI/group_info.json @@ -0,0 +1,15 @@ +{ + "name": "Group Chat", + "members": [ + { + "name": "Søren Aabye Kierkegaard", + "email": "søren@authors.example.com", + "user_type": "Human" + }, + { + "name": "Maria S Curie", + "email": "mssc@science.example.org", + "user_type": "Human" + } + ] +} \ No newline at end of file diff --git a/test_data/Takeout/Google Chat/Groups/Space AAAATeQdkhI/messages.json b/test_data/Takeout/Google Chat/Groups/Space AAAATeQdkhI/messages.json new file mode 100644 index 0000000..553c2e6 --- /dev/null +++ 
b/test_data/Takeout/Google Chat/Groups/Space AAAATeQdkhI/messages.json @@ -0,0 +1,169 @@ +{ + "messages": [ + { + "creator": { + "name": "Maria S Curie", + "email": "mssc@science.example.org", + "user_type": "Human" + }, + "created_date": "Sunday, November 18, 2018 at 6:02:21 PM UTC", + "attached_files": [ + { + "original_name": "390445992115543672?account_id\u003d0.jpg", + "export_name": "File-390445992115543672?account_id\u003d0.jpg" + } + ], + "topic_id": "-23g6XClfTE", + "message_id": "AAAATeQdkhI/-23g6XClfTE/-23g6XClfTE" + }, + { + "creator": { + "name": "Maria S Curie", + "email": "mssc@science.example.org", + "user_type": "Human" + }, + "created_date": "Sunday, November 18, 2018 at 8:28:28 PM UTC", + "attached_files": [ + { + "original_name": "2018-11-18.jpg", + "export_name": "File-2018-11-18.jpg" + } + ], + "topic_id": "hkl-RJqhSB0", + "message_id": "AAAATeQdkhI/hkl-RJqhSB0/hkl-RJqhSB0" + }, + { + "creator": { + "name": "Maria S Curie", + "email": "mssc@science.example.org", + "user_type": "Human" + }, + "created_date": "Sunday, November 18, 2018 at 8:28:39 PM UTC", + "text": "https://www.instructables.com/id/The-Dragon-Halloween-2018", + "annotations": [ + { + "start_index": 0, + "length": 58, + "url_metadata": { + "title": "", + "snippet": "", + "image_url": "", + "url": { + "private_do_not_access_or_else_safe_url_wrapped_value": "https://www.instructables.com/id/The-Dragon-Halloween-2018" + } + } + } + ], + "topic_id": "q2kQZOE7dO8", + "message_id": "AAAATeQdkhI/q2kQZOE7dO8/q2kQZOE7dO8" + }, + { + "creator": { + "name": "Søren Aabye Kierkegaard", + "email": "søren@authors.example.com", + "user_type": "Human" + }, + "created_date": "Sunday, November 18, 2018 at 10:59:36 PM UTC", + "attached_files": [ + { + "original_name": "2018-11-18.jpg", + "export_name": "File-2018-11-18.jpg" + } + ], + "topic_id": "PLKncOBs7CQ", + "message_id": "AAAATeQdkhI/PLKncOBs7CQ/PLKncOBs7CQ" + }, + { + "creator": { + "name": "Maria S Curie", + "email": 
"mssc@science.example.org", + "user_type": "Human" + }, + "created_date": "Monday, November 19, 2018 at 9:40:37 PM UTC", + "text": "https://twitter.com/NTarnopolsky/status/1064186032327966722", + "annotations": [ + { + "start_index": 0, + "length": 59, + "url_metadata": { + "title": "", + "snippet": "", + "image_url": "", + "url": { + "private_do_not_access_or_else_safe_url_wrapped_value": "https://twitter.com/NTarnopolsky/status/1064186032327966722" + } + } + } + ], + "topic_id": "LFdWDDru50c", + "message_id": "AAAATeQdkhI/LFdWDDru50c/LFdWDDru50c" + }, + { + "creator": { + "name": "Søren Aabye Kierkegaard", + "email": "søren@authors.example.com", + "user_type": "Human" + }, + "created_date": "Monday, November 19, 2018 at 11:10:41 PM UTC", + "text": "\"\n\n This Tweet from @NTarnopolsky has been withheld in response to a report from the copyright holder.\"", + "annotations": [ + { + "start_index": 21, + "length": 12, + "format_metadata": { + "format_type": "BOLD" + } + } + ], + "topic_id": "WY9aY7r3IVg", + "message_id": "AAAATeQdkhI/WY9aY7r3IVg/WY9aY7r3IVg" + }, + { + "creator": { + "name": "Maria S Curie", + "email": "mssc@science.example.org", + "user_type": "Human" + }, + "created_date": "Monday, November 19, 2018 at 11:11:00 PM UTC", + "text": "waaaah waaaah", + "topic_id": "ul-MzeIn-y8", + "message_id": "AAAATeQdkhI/ul-MzeIn-y8/ul-MzeIn-y8" + }, + { + "creator": { + "name": "Maria S Curie", + "email": "mssc@science.example.org", + "user_type": "Human" + }, + "created_date": "Monday, November 19, 2018 at 11:12:07 PM UTC", + "text": "you can see it near the bottom of this story: https://www.sfgate.com/california-wildfires/article/Cat-loves-firefighter-rescuer-Paradise-Camp-Fire-13405593.php#photo-16529277", + "annotations": [ + { + "start_index": 46, + "length": 128, + "url_metadata": { + "title": "", + "snippet": "", + "image_url": "", + "url": { + "private_do_not_access_or_else_safe_url_wrapped_value": 
"https://www.sfgate.com/california-wildfires/article/Cat-loves-firefighter-rescuer-Paradise-Camp-Fire-13405593.php#photo-16529277" + } + } + } + ], + "topic_id": "60DCtRDH6nI", + "message_id": "AAAATeQdkhI/60DCtRDH6nI/60DCtRDH6nI" + }, + { + "creator": { + "name": "Søren Aabye Kierkegaard", + "email": "søren@authors.example.com", + "user_type": "Human" + }, + "created_date": "Monday, November 19, 2018 at 11:13:26 PM UTC", + "text": "cute", + "topic_id": "O-OUep7XC_M", + "message_id": "AAAATeQdkhI/O-OUep7XC_M/O-OUep7XC_M" + } + ] +} \ No newline at end of file diff --git a/test_data/Takeout/Google Chat/Groups/Space AAAAdvGdRgs/File-Cat_Hilarious.png b/test_data/Takeout/Google Chat/Groups/Space AAAAdvGdRgs/File-Cat_Hilarious.png new file mode 100644 index 0000000..13c8b65 Binary files /dev/null and b/test_data/Takeout/Google Chat/Groups/Space AAAAdvGdRgs/File-Cat_Hilarious.png differ diff --git a/test_data/Takeout/Google Chat/Groups/Space AAAAdvGdRgs/group_info.json b/test_data/Takeout/Google Chat/Groups/Space AAAAdvGdRgs/group_info.json new file mode 100644 index 0000000..506511a --- /dev/null +++ b/test_data/Takeout/Google Chat/Groups/Space AAAAdvGdRgs/group_info.json @@ -0,0 +1,20 @@ +{ + "name": "Group Chat", + "members": [ + { + "name": "F Scott Fitzgerald", + "email": "fskf@authors.example.com", + "user_type": "Human" + }, + { + "name": "Edson Arantes do Nascimento", + "email": "pele@athletes.example.org", + "user_type": "Human" + }, + { + "name": "Maria S Curie", + "email": "mssc@science.example.org", + "user_type": "Human" + } + ] +} \ No newline at end of file diff --git a/test_data/Takeout/Google Chat/Groups/Space AAAAdvGdRgs/messages.json b/test_data/Takeout/Google Chat/Groups/Space AAAAdvGdRgs/messages.json new file mode 100644 index 0000000..71d2bd1 --- /dev/null +++ b/test_data/Takeout/Google Chat/Groups/Space AAAAdvGdRgs/messages.json @@ -0,0 +1,64 @@ +{ + "messages": [ + { + "creator": { + "name": "Maria S Curie", + "email": 
"mssc@science.example.org", + "user_type": "Human" + }, + "created_date": "Monday, January 7, 2019 at 7:25:08 PM UTC", + "text": "word is that TSA lines are way backed up these days due to sick-outs from the shutdown; best to check to see if you need to get there earlier than usual on Saturday", + "topic_id": "HHyWm9TBetY", + "message_id": "AAAAdvGdRgs/HHyWm9TBetY/HHyWm9TBetY" + }, + { + "creator": { + "name": "Maria S Curie", + "email": "mssc@science.example.org", + "user_type": "Human" + }, + "created_date": "Thursday, January 10, 2019 at 7:31:49 PM UTC", + "text": "new Doc Martin tonight at 10pm", + "topic_id": "QHn2rgKTrW0", + "message_id": "AAAAdvGdRgs/QHn2rgKTrW0/QHn2rgKTrW0" + }, + { + "creator": { + "name": "F Scott Fitzgerald", + "email": "fskf@authors.example.com", + "user_type": "Human" + }, + "created_date": "Thursday, January 10, 2019 at 8:24:29 PM UTC", + "text": "Also new Brooklyn 99", + "topic_id": "-7gYaG8ge_8", + "message_id": "AAAAdvGdRgs/-7gYaG8ge_8/-7gYaG8ge_8" + }, + { + "creator": { + "name": "Maria S Curie", + "email": "mssc@science.example.org", + "user_type": "Human" + }, + "created_date": "Thursday, January 10, 2019 at 8:25:02 PM UTC", + "text": "when will it leave beta and be Brooklyn 1.00?", + "topic_id": "bNpfCFBjkqc", + "message_id": "AAAAdvGdRgs/bNpfCFBjkqc/bNpfCFBjkqc" + }, + { + "creator": { + "name": "F Scott Fitzgerald", + "email": "fskf@authors.example.com", + "user_type": "Human" + }, + "created_date": "Thursday, January 10, 2019 at 8:25:48 PM UTC", + "attached_files": [ + { + "original_name": "Cat_Hilarious.png", + "export_name": "File-Cat_Hilarious.png" + } + ], + "topic_id": "Xkm-RRFJCHg", + "message_id": "AAAAdvGdRgs/Xkm-RRFJCHg/Xkm-RRFJCHg" + } + ] +} \ No newline at end of file diff --git a/test_data/Takeout/Voice/Calls/ - Placed - 2013-07-29T20_56_11Z.html b/test_data/Takeout/Voice/Calls/ - Placed - 2013-07-29T20_56_11Z.html new file mode 100644 index 0000000..0cd9467 --- /dev/null +++ b/test_data/Takeout/Voice/Calls/ - 
Placed - 2013-07-29T20_56_11Z.html @@ -0,0 +1,103 @@ + + +Placed call to + + +
Call Log for + +Placed call to + +
Placed call to +
+Jul 29, 2013, 1:56:11 PM +Pacific Time + + + + +
+(00:00:02) + +
Labels: +
+
User Deleted: +False
\ No newline at end of file diff --git a/test_data/Takeout/Voice/Calls/+17323215555 - Missed - 2016-11-22T17_57_49Z.html b/test_data/Takeout/Voice/Calls/+17323215555 - Missed - 2016-11-22T17_57_49Z.html new file mode 100644 index 0000000..7198257 --- /dev/null +++ b/test_data/Takeout/Voice/Calls/+17323215555 - Missed - 2016-11-22T17_57_49Z.html @@ -0,0 +1,102 @@ + + +Missed call from + + +
Call Log for + +Missed call from + +
Missed call from +
+Nov 22, 2016, 9:57:49 AM +Pacific Time + + + + + + +
Labels: +,
+
User Deleted: +False
\ No newline at end of file diff --git a/test_data/Takeout/Voice/Calls/+17323215555 - Received - 2016-11-22T22_27_58Z.html b/test_data/Takeout/Voice/Calls/+17323215555 - Received - 2016-11-22T22_27_58Z.html new file mode 100644 index 0000000..81cfb36 --- /dev/null +++ b/test_data/Takeout/Voice/Calls/+17323215555 - Received - 2016-11-22T22_27_58Z.html @@ -0,0 +1,103 @@ + + +Received call from + + +
Call Log for + +Received call from + +
Received call from +
+Nov 22, 2016, 2:27:58 PM +Pacific Time + + + + +
+(00:00:04) + +
Labels: +
+
User Deleted: +False
\ No newline at end of file diff --git a/test_data/Takeout/Voice/Calls/+17323216666 - Placed - 2011-09-03T20_12_00Z.html b/test_data/Takeout/Voice/Calls/+17323216666 - Placed - 2011-09-03T20_12_00Z.html new file mode 100644 index 0000000..ec30ef9 --- /dev/null +++ b/test_data/Takeout/Voice/Calls/+17323216666 - Placed - 2011-09-03T20_12_00Z.html @@ -0,0 +1,103 @@ + + +Placed call to + + +
Call Log for + +Placed call to + +
Placed call to +
+Sep 3, 2011, 1:12:00 PM +Pacific Time + + + + +
+(00:00:00) + +
Labels: +
+
User Deleted: +False
\ No newline at end of file diff --git a/test_data/Takeout/Voice/Calls/+17323217777 - Text - 2021-01-22T01_33_43Z-5-1.vcf b/test_data/Takeout/Voice/Calls/+17323217777 - Text - 2021-01-22T01_33_43Z-5-1.vcf new file mode 100644 index 0000000..ae50f90 --- /dev/null +++ b/test_data/Takeout/Voice/Calls/+17323217777 - Text - 2021-01-22T01_33_43Z-5-1.vcf @@ -0,0 +1,6 @@ +BEGIN:VCARD +VERSION:3.0 +N:Andersen;Hans;Christian;; +FN:Hans Christian Andersen +TEL;TYPE=CELL:+17323217777 +END:VCARD \ No newline at end of file diff --git a/test_data/Takeout/Voice/Calls/+17323217777 - Text - 2021-01-22T01_33_43Z.html b/test_data/Takeout/Voice/Calls/+17323217777 - Text - 2021-01-22T01_33_43Z.html new file mode 100644 index 0000000..b015f85 --- /dev/null +++ b/test_data/Takeout/Voice/Calls/+17323217777 - Text - 2021-01-22T01_33_43Z.html @@ -0,0 +1,88 @@ + + +Me to + + +
+
Jan 21, 2021, 5:33:43 PM +Pacific Time: +Me: +Hans' weekly updates, please .. tx +
Jan 21, 2021, 5:33:44 PM +Pacific Time: +: +https://youtu.be/dQw4w9WgXcQ?si=t3H6Q-kXkE4OSY9B +
Jan 21, 2021, 5:33:46 PM +Pacific Time: +: +Hey it's Hans Christian Andersen! This is to let you know I got your text, everything else will be from me personally! Make sure u click the link & add yourself to my phone so I can respond directly to u. Excited to talk about anything else. +
Jan 21, 2021, 5:34:52 PM +Pacific Time: +: +Got your info saved, thx! I will text with fun stuff soon. +
Jan 21, 2021, 5:35:01 PM +Pacific Time: +: +MMS Received +
+ +
Labels: +,
+
User Deleted: +False
diff --git a/test_data/Takeout/Voice/Calls/+17323218888 - 2015-03-20T03_59_29Z.html b/test_data/Takeout/Voice/Calls/+17323218888 - 2015-03-20T03_59_29Z.html new file mode 100644 index 0000000..cd90014 --- /dev/null +++ b/test_data/Takeout/Voice/Calls/+17323218888 - 2015-03-20T03_59_29Z.html @@ -0,0 +1,103 @@ + + +Placed call to + + +
Call Log for + +Placed call to + +
Placed call to +
+Mar 19, 2015, 8:59:29 PM +Pacific Time + + + + +
+(00:01:00) + +
Labels: +
+
User Deleted: +False
\ No newline at end of file diff --git a/test_data/Takeout/Voice/Calls/+17323218888 - 2015-03-23T00_35_56Z.html b/test_data/Takeout/Voice/Calls/+17323218888 - 2015-03-23T00_35_56Z.html new file mode 100644 index 0000000..6acf148 --- /dev/null +++ b/test_data/Takeout/Voice/Calls/+17323218888 - 2015-03-23T00_35_56Z.html @@ -0,0 +1,103 @@ + + +Placed call to + + +
Call Log for + +Placed call to + +
Placed call to +
+Mar 22, 2015, 5:35:56 PM +Pacific Time + + + + +
+(00:01:00) + +
Labels: +
+
User Deleted: +False
\ No newline at end of file diff --git a/test_data/Takeout/Voice/Calls/17323211010 - Received - 2018-07-30T22_31_33Z.html b/test_data/Takeout/Voice/Calls/17323211010 - Received - 2018-07-30T22_31_33Z.html new file mode 100644 index 0000000..50a4729 --- /dev/null +++ b/test_data/Takeout/Voice/Calls/17323211010 - Received - 2018-07-30T22_31_33Z.html @@ -0,0 +1,103 @@ + + +Received call from + + +
Call Log for + +Received call from + +
Received call from +
+Jul 30, 2018, 3:31:33 PM +Pacific Time + + + + +
+(00:12:51) + +
Labels: +
+
User Deleted: +False
\ No newline at end of file diff --git a/test_data/Takeout/Voice/Calls/Agatha M Christie - Text - 2023-10-22T17_28_34Z-1-1.jpg b/test_data/Takeout/Voice/Calls/Agatha M Christie - Text - 2023-10-22T17_28_34Z-1-1.jpg new file mode 100644 index 0000000..9e293f1 Binary files /dev/null and b/test_data/Takeout/Voice/Calls/Agatha M Christie - Text - 2023-10-22T17_28_34Z-1-1.jpg differ diff --git a/test_data/Takeout/Voice/Calls/Agatha M Christie - Text - 2023-10-22T17_28_34Z-2-1.jpg b/test_data/Takeout/Voice/Calls/Agatha M Christie - Text - 2023-10-22T17_28_34Z-2-1.jpg new file mode 100644 index 0000000..9e293f1 Binary files /dev/null and b/test_data/Takeout/Voice/Calls/Agatha M Christie - Text - 2023-10-22T17_28_34Z-2-1.jpg differ diff --git a/test_data/Takeout/Voice/Calls/Agatha M Christie - Text - 2023-10-22T17_28_34Z.html b/test_data/Takeout/Voice/Calls/Agatha M Christie - Text - 2023-10-22T17_28_34Z.html new file mode 100644 index 0000000..942f5b5 --- /dev/null +++ b/test_data/Takeout/Voice/Calls/Agatha M Christie - Text - 2023-10-22T17_28_34Z.html @@ -0,0 +1,84 @@ + + +Me to +Agatha M Christie + +
+
Oct 22, 2023, 10:28:34 AM +Pacific Time: +Me: +MMS Sent +
Image MMS Attachment
Oct 22, 2023, 10:28:36 AM +Pacific Time: +Me: +MMS Sent +
Image MMS Attachment
Oct 22, 2023, 10:28:38 AM +Pacific Time: +Me: +this is the caption +
Oct 22, 2023, 10:28:57 AM +Pacific Time: +Me: +that was a test; ingore +
+ +
Labels: +
+
User Deleted: +False
\ No newline at end of file diff --git a/test_data/Takeout/Voice/Calls/Alan A Milne - Text - 2023-09-29T23_40_59Z-1-1.jpg b/test_data/Takeout/Voice/Calls/Alan A Milne - Text - 2023-09-29T23_40_59Z-1-1.jpg new file mode 100644 index 0000000..9e293f1 Binary files /dev/null and b/test_data/Takeout/Voice/Calls/Alan A Milne - Text - 2023-09-29T23_40_59Z-1-1.jpg differ diff --git a/test_data/Takeout/Voice/Calls/Alan A Milne - Text - 2023-09-29T23_40_59Z-3-1.jpg b/test_data/Takeout/Voice/Calls/Alan A Milne - Text - 2023-09-29T23_40_59Z-3-1.jpg new file mode 100644 index 0000000..9e293f1 Binary files /dev/null and b/test_data/Takeout/Voice/Calls/Alan A Milne - Text - 2023-09-29T23_40_59Z-3-1.jpg differ diff --git a/test_data/Takeout/Voice/Calls/Alan A Milne - Text - 2023-09-29T23_40_59Z.html b/test_data/Takeout/Voice/Calls/Alan A Milne - Text - 2023-09-29T23_40_59Z.html new file mode 100644 index 0000000..e0b2ade --- /dev/null +++ b/test_data/Takeout/Voice/Calls/Alan A Milne - Text - 2023-09-29T23_40_59Z.html @@ -0,0 +1,80 @@ + + +Me to +Alan A Milne + +
+
Sep 29, 2023, 4:40:59 PM +Pacific Time: +Me: +MMS Sent +
Image MMS Attachment
Sep 29, 2023, 4:41:01 PM +Pacific Time: +Me: +A couple of photos +
Sep 29, 2023, 4:41:47 PM +Pacific Time: +Me: +MMS Sent +
Image MMS Attachment
+ +
Labels: +
+
User Deleted: +False
diff --git a/test_data/Takeout/Voice/Calls/Albert Einstein - Recorded - 2019-05-22T18_18_56Z.html b/test_data/Takeout/Voice/Calls/Albert Einstein - Recorded - 2019-05-22T18_18_56Z.html new file mode 100644 index 0000000..eac8ceb --- /dev/null +++ b/test_data/Takeout/Voice/Calls/Albert Einstein - Recorded - 2019-05-22T18_18_56Z.html @@ -0,0 +1,104 @@ + + +Recorded call with +Albert Einstein + +
Call Log for + +Recorded call with +Albert Einstein +
Recorded call with +Albert Einstein
+May 22, 2019, 11:18:56 AM +Pacific Time + + +
+ + + +(00:29:46) + +
Labels: +, ,
+
User Deleted: +False
diff --git a/test_data/Takeout/Voice/Calls/Albert Einstein - Recorded - 2019-05-22T18_18_56Z.mp3 b/test_data/Takeout/Voice/Calls/Albert Einstein - Recorded - 2019-05-22T18_18_56Z.mp3 new file mode 100644 index 0000000..97c635b Binary files /dev/null and b/test_data/Takeout/Voice/Calls/Albert Einstein - Recorded - 2019-05-22T18_18_56Z.mp3 differ diff --git a/test_data/Takeout/Voice/Calls/Edson Arantes do Nascimento - Text - 2023-09-20T00_54_54Z.html b/test_data/Takeout/Voice/Calls/Edson Arantes do Nascimento - Text - 2023-09-20T00_54_54Z.html new file mode 100644 index 0000000..d61a1f1 --- /dev/null +++ b/test_data/Takeout/Voice/Calls/Edson Arantes do Nascimento - Text - 2023-09-20T00_54_54Z.html @@ -0,0 +1,76 @@ + + +Me to +Edson Arantes do Nascimento + +
+
Sep 19, 2023, 5:54:54 PM +Pacific Time: +Me: +I left you some radio frequencies in the microwave +
Sep 19, 2023, 6:19:24 PM +Pacific Time: +Edson Arantes do Nascimento: +Thanks +
+ +
Labels: +,
+
User Deleted: +False
diff --git a/test_data/Takeout/Voice/Calls/Edson Arantes do Nascimento - Text - 2023-09-20T14_41_11Z-17-1.jpg b/test_data/Takeout/Voice/Calls/Edson Arantes do Nascimento - Text - 2023-09-20T14_41_11Z-17-1.jpg new file mode 100644 index 0000000..9e293f1 Binary files /dev/null and b/test_data/Takeout/Voice/Calls/Edson Arantes do Nascimento - Text - 2023-09-20T14_41_11Z-17-1.jpg differ diff --git a/test_data/Takeout/Voice/Calls/Edson Arantes do Nascimento - Text - 2023-09-20T14_41_11Z.html b/test_data/Takeout/Voice/Calls/Edson Arantes do Nascimento - Text - 2023-09-20T14_41_11Z.html new file mode 100644 index 0000000..343c6db --- /dev/null +++ b/test_data/Takeout/Voice/Calls/Edson Arantes do Nascimento - Text - 2023-09-20T14_41_11Z.html @@ -0,0 +1,139 @@ + + +Edson Arantes do Nascimento + +
+
Sep 20, 2023, 7:41:11 AM +Pacific Time: +Edson Arantes do Nascimento: +I have exciting good news I will call and tell you about soon +
Sep 20, 2023, 8:38:51 AM +Pacific Time: +Me: +I am available until 9:30 +
Sep 20, 2023, 9:00:55 AM +Pacific Time: +Me: +Looks like somebody is having some issues today. I'm not sure if that's what's behind the outage. +
Sep 20, 2023, 9:01:18 AM +Pacific Time: +Me: +https://youtu.be/dQw4w9WgXcQ?si=t3H6Q-kXkE4OSY9B +
Sep 20, 2023, 9:01:50 AM +Pacific Time: +Edson Arantes do Nascimento: +Lol +
Sep 20, 2023, 9:02:11 AM +Pacific Time: +Edson Arantes do Nascimento: +They should try something +
Sep 20, 2023, 9:02:45 AM +Pacific Time: +Me: +that would be my bet +
Sep 20, 2023, 9:03:08 AM +Pacific Time: +Edson Arantes do Nascimento: +I would go for something +
Sep 20, 2023, 9:03:53 AM +Pacific Time: +Me: +one thing we know ... bubble sort is not the way to go +
Sep 20, 2023, 9:04:09 AM +Pacific Time: +Edson Arantes do Nascimento: +True +
Sep 20, 2023, 9:54:30 AM +Pacific Time: +Me: +Looks like that thing was a red herring. +
Sep 20, 2023, 9:54:48 AM +Pacific Time: +Edson Arantes do Nascimento: +Wow! +
Sep 20, 2023, 9:56:15 AM +Pacific Time: +Me: +but that thing is probably back online, though sometimes it takes a little while for all the odds and bobs to individually recover. +
Sep 20, 2023, 9:56:43 AM +Pacific Time: +Edson Arantes do Nascimento: +Lol +
Sep 20, 2023, 10:22:02 AM +Pacific Time: +Me: +It reminded me of this AT&T outage that happened a while back. You might find the code interesting. https://users.csc.calpoly.edu/~jdalbey/SWE/Papers/att_collapse +
Sep 20, 2023, 4:40:28 PM +Pacific Time: +Me: +https://youtu.be/dQw4w9WgXcQ?si=t3H6Q-kXkE4OSY9B +
Sep 20, 2023, 6:39:15 PM +Pacific Time: +Me: +sure, right +
Image MMS Attachment
Sep 20, 2023, 8:59:44 PM +Pacific Time: +Edson Arantes do Nascimento: +Lol +
+ +
Labels: +,
+
User Deleted: +False
diff --git a/test_data/Takeout/Voice/Calls/Edson Arantes do Nascimento - Text - 2023-09-21T17_04_39Z.html b/test_data/Takeout/Voice/Calls/Edson Arantes do Nascimento - Text - 2023-09-21T17_04_39Z.html new file mode 100644 index 0000000..d56e9ad --- /dev/null +++ b/test_data/Takeout/Voice/Calls/Edson Arantes do Nascimento - Text - 2023-09-21T17_04_39Z.html @@ -0,0 +1,75 @@ + + +Edson Arantes do Nascimento + +
+
Sep 21, 2023, 10:04:39 AM +Pacific Time: +Edson Arantes do Nascimento: +The minute I was joining a meeting, I got a wrong-number business-sounding call looking for someone named John. What a coincidence! +
Sep 21, 2023, 10:14:27 AM +Pacific Time: +Me: +Not really. John is a very common name. +
+ +
Labels: +,
+
User Deleted: +False
diff --git a/test_data/Takeout/Voice/Calls/Edson Arantes do Nascimento - Voicemail - 2014-05-17T02_54_27Z.html b/test_data/Takeout/Voice/Calls/Edson Arantes do Nascimento - Voicemail - 2014-05-17T02_54_27Z.html new file mode 100644 index 0000000..3b4f226 --- /dev/null +++ b/test_data/Takeout/Voice/Calls/Edson Arantes do Nascimento - Voicemail - 2014-05-17T02_54_27Z.html @@ -0,0 +1,143 @@ + + +Voicemail from +Edson Arantes do Nascimento + +
Call Log for + +Voicemail from +Edson Arantes do Nascimento + +May 16, 2014, 7:54:27 PM +Pacific Time +Transcript: +Hi, it's me. I'm just calling to say that I'm coming home. +Hi, +1.1 +1.559 +0.9426087 it's +1.559 +1.84 +0.95586413 me. +1.84 +2.3500001 +0.9431536 I'm +2.3500001 +2.8200002 +0.31141254 just +2.8200002 +3.0800002 +0.5176859 calling +3.0800002 +3.729 +0.89909077 to +3.729 +4.07 +0.9643641 say +4.07 +4.4890003 +0.8662085 that +4.4890003 +4.67 +0.7913236 I'm +4.67 +4.78 +0.9510233 coming +4.78 +5.0990005 +0.95244765 home. +5.0990005 +5.42 +0.9234381 +0.8991872 + +
+ + + +(00:00:22) + +
Labels: +,
+
User Deleted: +False
diff --git a/test_data/Takeout/Voice/Calls/Edson Arantes do Nascimento - Voicemail - 2014-05-17T02_54_27Z.mp3 b/test_data/Takeout/Voice/Calls/Edson Arantes do Nascimento - Voicemail - 2014-05-17T02_54_27Z.mp3 new file mode 100644 index 0000000..97c635b Binary files /dev/null and b/test_data/Takeout/Voice/Calls/Edson Arantes do Nascimento - Voicemail - 2014-05-17T02_54_27Z.mp3 differ diff --git a/test_data/Takeout/Voice/Calls/Edson Arantes do Nascimento - Voicemail - 2016-05-01T00_16_43Z.html b/test_data/Takeout/Voice/Calls/Edson Arantes do Nascimento - Voicemail - 2016-05-01T00_16_43Z.html new file mode 100644 index 0000000..751455d --- /dev/null +++ b/test_data/Takeout/Voice/Calls/Edson Arantes do Nascimento - Voicemail - 2016-05-01T00_16_43Z.html @@ -0,0 +1,140 @@ + + +Voicemail from +Edson Arantes do Nascimento + +
Call Log for + +Voicemail from +Edson Arantes do Nascimento + +Apr 30, 2016, 5:16:43 PM +Pacific Time +Transcript: +Hi, Dad. It's me Edson. I'm currently sitting out in front. +Hi, +0.0 +0.0 +0.85 Dad. +0.0 +0.0 +0.85 It's +0.0 +0.0 +0.85 me +0.0 +0.0 +0.85 Edson. +0.0 +0.0 +0.85 I'm +0.0 +0.0 +0.85 currently +0.0 +0.0 +0.85 sitting +0.0 +0.0 +0.85 out +0.0 +0.0 +0.85 in +0.0 +0.0 +0.85 front. +0.0 +0.0 +0.85 + + +
+ + + +(00:00:28) + +
Labels: +,
+
User Deleted: +False
diff --git a/test_data/Takeout/Voice/Calls/Edson Arantes do Nascimento - Voicemail - 2016-05-01T00_16_43Z.mp3 b/test_data/Takeout/Voice/Calls/Edson Arantes do Nascimento - Voicemail - 2016-05-01T00_16_43Z.mp3 new file mode 100644 index 0000000..97c635b Binary files /dev/null and b/test_data/Takeout/Voice/Calls/Edson Arantes do Nascimento - Voicemail - 2016-05-01T00_16_43Z.mp3 differ diff --git a/test_data/Takeout/Voice/Calls/Group Conversation - 2021-07-13T21_25_26Z-3-1.vcf b/test_data/Takeout/Voice/Calls/Group Conversation - 2021-07-13T21_25_26Z-3-1.vcf new file mode 100644 index 0000000..93293d9 --- /dev/null +++ b/test_data/Takeout/Voice/Calls/Group Conversation - 2021-07-13T21_25_26Z-3-1.vcf @@ -0,0 +1,7 @@ +BEGIN:VCARD +VERSION:2.1 +N:Kierkegaard;Søren;Aabye;; +FN:Søren Aabye Kierkegaard +TEL;CELL:+1 425-406-0541 +X-GROUP: +END:VCARD diff --git a/test_data/Takeout/Voice/Calls/Group Conversation - 2021-07-13T21_25_26Z.html b/test_data/Takeout/Voice/Calls/Group Conversation - 2021-07-13T21_25_26Z.html new file mode 100644 index 0000000..a5df747 --- /dev/null +++ b/test_data/Takeout/Voice/Calls/Group Conversation - 2021-07-13T21_25_26Z.html @@ -0,0 +1,80 @@ + + +Group Conversation + +
Group conversation with: +Rosalind E Franklin, Alan A Milne
+
Jul 13, 2021, 2:25:26 PM +Pacific Time: +Rosalind E Franklin: +Hi guys, I got this message from Søren.

Hi Rosalind quick question.
I got his number we just wanted to double check if this is his number (732) 321-0011.

If the issue is you did not recognize Søren's number I will send that to you.
+
Jul 13, 2021, 2:26:21 PM +Pacific Time: +Me: +we're all good ... got the text yesterday; just slow in responding (there's a pandemic on) +
Jul 13, 2021, 2:27:27 PM +Pacific Time: +Rosalind E Franklin: +science guys's helper +
+ +
Labels: +,
+
User Deleted: +False
diff --git a/test_data/Takeout/Voice/Calls/Group Conversation - 2021-12-20T22_35_28Z-1-1.jpg b/test_data/Takeout/Voice/Calls/Group Conversation - 2021-12-20T22_35_28Z-1-1.jpg new file mode 100644 index 0000000..9e293f1 Binary files /dev/null and b/test_data/Takeout/Voice/Calls/Group Conversation - 2021-12-20T22_35_28Z-1-1.jpg differ diff --git a/test_data/Takeout/Voice/Calls/Group Conversation - 2021-12-20T22_35_28Z-1-2.jpg b/test_data/Takeout/Voice/Calls/Group Conversation - 2021-12-20T22_35_28Z-1-2.jpg new file mode 100644 index 0000000..9e293f1 Binary files /dev/null and b/test_data/Takeout/Voice/Calls/Group Conversation - 2021-12-20T22_35_28Z-1-2.jpg differ diff --git a/test_data/Takeout/Voice/Calls/Group Conversation - 2021-12-20T22_35_28Z.html b/test_data/Takeout/Voice/Calls/Group Conversation - 2021-12-20T22_35_28Z.html new file mode 100644 index 0000000..48cd8d1 --- /dev/null +++ b/test_data/Takeout/Voice/Calls/Group Conversation - 2021-12-20T22_35_28Z.html @@ -0,0 +1,72 @@ + + +Group Conversation + +
Group conversation with: +William Shakespeare, +17323211717
+
Dec 20, 2021, 2:35:28 PM +Pacific Time: +William Shakespeare: + +
Image MMS Attachment
Image MMS Attachment
+ +
Labels: +,
+
User Deleted: +False
\ No newline at end of file diff --git a/test_data/Takeout/Voice/Calls/Group Conversation - 2023-10-01T15_30_41Z-2-1.jpg b/test_data/Takeout/Voice/Calls/Group Conversation - 2023-10-01T15_30_41Z-2-1.jpg new file mode 100644 index 0000000..9e293f1 Binary files /dev/null and b/test_data/Takeout/Voice/Calls/Group Conversation - 2023-10-01T15_30_41Z-2-1.jpg differ diff --git a/test_data/Takeout/Voice/Calls/Group Conversation - 2023-10-01T15_30_41Z.html b/test_data/Takeout/Voice/Calls/Group Conversation - 2023-10-01T15_30_41Z.html new file mode 100644 index 0000000..f388458 --- /dev/null +++ b/test_data/Takeout/Voice/Calls/Group Conversation - 2023-10-01T15_30_41Z.html @@ -0,0 +1,100 @@ + + +Group Conversation + +
+
Oct 1, 2023, 8:30:41 AM +Pacific Time: +Alan A Milne: +You were the best one! +
Oct 1, 2023, 11:44:12 AM +Pacific Time: +Me: +Happy birthday Edson. For your birthday, I went on a riverboat cruise for you. Hope you enjoyed it. +
Image MMS Attachment
Oct 1, 2023, 11:45:28 AM +Pacific Time: +Edson Arantes do Nascimento: +Lol! Thanks :) +
Oct 1, 2023, 11:56:12 AM +Pacific Time: +Alan A Milne: +If you're lucky I'll be home in time to take you to dinner! +
Oct 1, 2023, 2:13:08 PM +Pacific Time: +Alan A Milne: +Leaving the place now. The map says it's 3.5 hrs til I get home. Probably 4 with a coffee stop! +
Oct 1, 2023, 2:16:00 PM +Pacific Time: +Edson Arantes do Nascimento: +See you later! +
Oct 1, 2023, 3:18:19 PM +Pacific Time: +Søren Aabye Kierkegaard: +ah, right on time at 1 pm, i see +
Oct 1, 2023, 3:58:13 PM +Pacific Time: +Alan A Milne: +Leaving coffee stop current time 2 more hours! +
+ +
Labels: +,
+
User Deleted: +False
diff --git a/test_data/Takeout/Voice/Calls/Group Conversation - 2023-10-01T19_34_50Z-16-1.jpg b/test_data/Takeout/Voice/Calls/Group Conversation - 2023-10-01T19_34_50Z-16-1.jpg new file mode 100644 index 0000000..9e293f1 Binary files /dev/null and b/test_data/Takeout/Voice/Calls/Group Conversation - 2023-10-01T19_34_50Z-16-1.jpg differ diff --git a/test_data/Takeout/Voice/Calls/Group Conversation - 2023-10-01T19_34_50Z-2-1.jpg b/test_data/Takeout/Voice/Calls/Group Conversation - 2023-10-01T19_34_50Z-2-1.jpg new file mode 100644 index 0000000..9e293f1 Binary files /dev/null and b/test_data/Takeout/Voice/Calls/Group Conversation - 2023-10-01T19_34_50Z-2-1.jpg differ diff --git a/test_data/Takeout/Voice/Calls/Group Conversation - 2023-10-01T19_34_50Z.html b/test_data/Takeout/Voice/Calls/Group Conversation - 2023-10-01T19_34_50Z.html new file mode 100644 index 0000000..5a1aea3 --- /dev/null +++ b/test_data/Takeout/Voice/Calls/Group Conversation - 2023-10-01T19_34_50Z.html @@ -0,0 +1,148 @@ + + +Group Conversation + +
Group conversation with: +Debbie One, Missy Two, Mary Four, Trish Three, Laura Five
+
Oct 1, 2023, 12:34:50 PM +Pacific Time: +Trish Three: +Such a beautiful weekend - so great to see you all! Safe travels, everyone....
❤️
+
Oct 1, 2023, 12:35:05 PM +Pacific Time: +Trish Three: + +
Image MMS Attachment
Oct 1, 2023, 12:40:08 PM +Pacific Time: +Me: +Thanks for forwarding the picture, Trish. (For anyone wondering about an unrecognized phone number, 732-321-0011 is Maria Curie.) +
Oct 1, 2023, 12:40:27 PM +Pacific Time: +Debbie One: +Was so fun - especially the times when my eyes were actually open!!!
+
Oct 1, 2023, 12:41:21 PM +Pacific Time: +Trish Three: +Laughed at “Was so fun - especially the times when my eyes wer…” +
Oct 1, 2023, 12:41:51 PM +Pacific Time: +Debbie One: +Maybe could everyone identify their numbers?
+
Oct 1, 2023, 1:07:47 PM +Pacific Time: +Missy Two: +Laughed at “Was so fun - especially the times when my eyes wer…” +
Oct 1, 2023, 1:08:14 PM +Pacific Time: +Missy Two: +Missy- 585-750-1086
+
Oct 1, 2023, 1:08:59 PM +Pacific Time: +Missy Two: +It was a great weekend!
+
Oct 1, 2023, 1:13:55 PM +Pacific Time: +Debbie One: +Debbie- 212-555-0001
+
Oct 1, 2023, 1:19:55 PM +Pacific Time: +Me: +Weird that Trish isn't in that photo. I was sure she was there at that table. +
Oct 1, 2023, 1:22:29 PM +Pacific Time: +Trish Three: +Laughed at “Weird that Trish isn't in that photo. I was sure s…” +
Oct 1, 2023, 4:40:55 PM +Pacific Time: +Mary Four: +Mary is 212-555-0004 +
Oct 1, 2023, 4:45:11 PM +Pacific Time: +Mary Four: +Laughed at “Weird that Trish isn't in that photo. I was sure s…” +
Oct 1, 2023, 4:46:13 PM +Pacific Time: +Mary Four: +Loved “Such a beautiful weekend - so great to see you all…” +
Oct 1, 2023, 5:55:47 PM +Pacific Time: +Me: +I found Trish! (This was taken by somebody else) +
Image MMS Attachment
Oct 1, 2023, 5:57:10 PM +Pacific Time: +Mary Four: +Liked “I found Trish! (This was taken by somebody else)” +
Oct 1, 2023, 5:57:22 PM +Pacific Time: +Trish Three: +Thanks, Maria! And they are CLEARLY a much better photographer than I - thanks!
❣️
+
Oct 1, 2023, 5:58:23 PM +Pacific Time: +Trish Three: +Trish - 212-555-0003 +
Oct 1, 2023, 9:20:36 PM +Pacific Time: +Laura Five: +Laura 212 555 0005
What a great weekend! Until next time...
+
+ +
Labels: +,
+
User Deleted: +False
diff --git a/test_data/contacts.json-EXAMPLE b/test_data/contacts.json-EXAMPLE new file mode 100644 index 0000000..0818d50 --- /dev/null +++ b/test_data/contacts.json-EXAMPLE @@ -0,0 +1,6 @@ +{ + "Me": "+17323210011", + "Maria S Curie": "Me", + "Agatha M Christie": "+17323211111", + "fskf@authors.example.com": "+17323215555" +}