Merge pull request #38 from XPRIZE/#22-Team-Chimple-Extract-Storybook…

…-Events #22 team chimple extract storybook events
XPRIZE · Oct 29, 2019 · 34c80e0 · 34c80e0
2 parents 4138646 + b3a93fd
commit 34c80e0
Show file tree

Hide file tree

Showing 63 changed files with 27,033 additions and 3 deletions.
diff --git a/team-CHIMPLE/storybook-events/extract_storybook_events_from_csv.py b/team-CHIMPLE/storybook-events/extract_storybook_events_from_csv.py
@@ -77,7 +77,9 @@ def extract_from_week(directory_containing_weekly_data):
                     continue
 
                 # Extract tablet serial number from the parent folder's name
-                # E.g. "2019-02-08/29/REMOTE/5A27001390/userlog.1548797314282.csv" --> "5A27001390"
+                # E.g. "2019-02-08/29/REMOTE/5A27001390/userlog.1548797314282.csv"
+                #  --> "2019-02-08/29/REMOTE/5A27001390"
+                #  --> "5A27001390"
                 tablet_serial = file_path.replace("/" + basename, "")
                 tablet_serial = tablet_serial[len(tablet_serial)-10:len(tablet_serial)]
                 print(os.path.basename(__file__), "tablet_serial: \"{}\"".format(tablet_serial))
@@ -86,11 +88,25 @@ def extract_from_week(directory_containing_weekly_data):
                 is_valid_tablet_serial_number = serial_number_util.is_valid(tablet_serial)
                 print(os.path.basename(__file__), "is_valid_tablet_serial_number: {}".format(is_valid_tablet_serial_number))
                 if not is_valid_tablet_serial_number:
-                    raise ValueError("Invalid tablet_serial: \"{}\"".format(tablet_serial))
+                    # Invalid serial number. Skip file.
+                    warnings.warn("The parent folder does not represent a 10-character serial number: \"{}\"".format(tablet_serial))
+                    continue
 
                 # Extract storybook events from CSV
                 with open(file_path) as csv_file:
                     csv_data = csv.reader(csv_file)
+
+                    # Skip if corrupt file content
+                    try:
+                        for storybook_event_row in csv_data:
+                            # Only check that the first row can be read without raising csv.Error, then stop. NOTE: csv_data is an iterator, so this consumes the first row and the loop below resumes from the second row.
+                            break
+                    except csv.Error:
+                        # Handle "_csv.Error: line contains NULL byte"
+                        # Example: 2018-09-07/35/REMOTE/6111001892/userlog.1532930088249.csv contains "^@^@^@^@^@^@^@^@^@^@"
+                        warnings.warn("Skipping file which contains NULL byte")
+                        continue
+
                     for storybook_event_row in csv_data:
                         print(os.path.basename(__file__), "storybook_event_row: {}".format(storybook_event_row))
 
@@ -123,7 +139,13 @@ def extract_from_week(directory_containing_weekly_data):
                                 storybook_start_time = arrow.get(userlog_logged_at, "ddd MMM DD HH:mm:ss ZZZZZ YYYY").timestamp
                             elif len(userlog_logged_at) == 28:
                                 # E.g. "Wed Jan 09 09:55:25 PST 2019"
-                                storybook_start_time = arrow.get(userlog_logged_at, "ddd MMM DD HH:mm:ss ZZZ YYYY").timestamp
+                                try:
+                                    storybook_start_time = arrow.get(userlog_logged_at, "ddd MMM DD HH:mm:ss ZZZ YYYY").timestamp
+                                except arrow.parser.ParserError:
+                                    # Handle "arrow.parser.ParserError: Could not parse timezone expression "AST"".
+                                    # E.g. "Sun Jan 09 21:51:30 AST 2000" or "Thu Aug 29 15:25:29 EDT 2019"
+                                    warnings.warn("Skipping invalid timezone expression")
+                                    continue
                             print(os.path.basename(__file__), "storybook_start_time: {}".format(storybook_start_time))
 
                             # Storybook end time is not stored, so set to None

diff --git a/team-CHIMPLE/storybook-events/extract_storybook_events_from_multiple_weeks.py b/team-CHIMPLE/storybook-events/extract_storybook_events_from_multiple_weeks.py
@@ -0,0 +1,71 @@
+# Collects storybook events from multiple weeks of data and combines them into one file.
+#
+# Example usage:
+#     cd storybook-events
+#     python3 extract_storybook_events_from_multiple_weeks.py ../tablet-usage-data
+#
+# The combined data is written to `storybook-events-CHIMPLE.csv` in the current working directory.
+
+import sys
+import os
+
+import extract_storybook_events_from_csv
+
+# A directory containing multiple subdirectories in the format "2017-12-22", "2017-12-29", etc. The first command-line argument, if given, overrides this default.
+BASE_PATH = "../tablet-usage-data"
+if len(sys.argv) > 1:
+    BASE_PATH = sys.argv[1]
+print(os.path.basename(__file__), "BASE_PATH: {}".format(BASE_PATH))
+
+data_collection_week_end_dates = [
+    '2017-12-22', '2017-12-29',
+    '2018-01-05', '2018-01-12', '2018-01-19', '2018-01-26',
+    '2018-02-02', '2018-02-09', '2018-02-16', '2018-02-23',
+    '2018-03-02', '2018-03-09', '2018-03-16', '2018-03-23', '2018-03-30',
+    '2018-04-06', '2018-04-13', '2018-04-20', '2018-04-27',
+    '2018-05-04', '2018-05-11', '2018-05-18', '2018-05-25',
+    '2018-06-01', '2018-06-08', '2018-06-15', '2018-06-22', '2018-06-29',
+    '2018-07-06', '2018-07-13', '2018-07-20', '2018-07-27',
+    '2018-08-03', '2018-08-10', '2018-08-17', '2018-08-24', '2018-08-31',
+    '2018-09-07', '2018-09-14', '2018-09-21', '2018-09-28',
+    '2018-10-05', '2018-10-12', '2018-10-19', '2018-10-26',
+    '2018-11-02', '2018-11-09', '2018-11-16', '2018-11-23', '2018-11-30',
+    '2018-12-07', '2018-12-14', '2018-12-21', '2018-12-28',
+    '2019-01-04', '2019-01-11', '2019-01-18', '2019-01-25',
+    '2019-02-01', '2019-02-08', '2019-02-22'  # NOTE(review): '2019-02-15' is the only week missing from this otherwise weekly list - confirm this is intentional
+]
+print(os.path.basename(__file__), "len(data_collection_week_end_dates): {}".format(len(data_collection_week_end_dates)))
+
+# Extract storybook events for each week of data. Presumably extract_from_week writes one `storybook-events-CHIMPLE_<date>.csv` per week (its body is not shown here); the combine step below reads files with that name.
+for week_end_date in data_collection_week_end_dates:
+    directory_containing_weekly_data = BASE_PATH + os.sep + week_end_date
+    print(os.path.basename(__file__), "directory_containing_weekly_data: \"{}\"".format(directory_containing_weekly_data))
+    extract_storybook_events_from_csv.extract_from_week(directory_containing_weekly_data)
+
+# Combine the weekly CSV files into one file, in the order of data_collection_week_end_dates
+print(os.path.basename(__file__), "Writing data to \"storybook-events-CHIMPLE.csv\"...")
+with open('storybook-events-CHIMPLE.csv', 'w') as outfile:
+    infile_count = 0  # Number of weekly files fully processed so far
+    for week_end_date in data_collection_week_end_dates:
+        print(os.path.basename(__file__), "\n\n"
+                                          "**********\n")
+        csv_filename_weekly = "storybook-events-CHIMPLE_" + week_end_date + ".csv"
+        print(os.path.basename(__file__), "csv_filename: \"{}\"".format(csv_filename_weekly))
+        with open(csv_filename_weekly) as infile:  # Raises if the weekly file does not exist in the current working directory
+            infile_row_count = 0  # Zero-based index of the current line within this weekly file
+            for line in infile:
+                print(os.path.basename(__file__), "line: {}".format(line))
+                print(os.path.basename(__file__), "infile_row_count: {}".format(infile_row_count))
+
+                # Column headers are included in each weekly file (the first line of each file is treated as the header).
+                # Only include them once (from the first weekly file), and skip them for each subsequent file.
+                is_column_header_row = (infile_row_count == 0)
+                is_first_infile = (infile_count == 0)
+                skip_row = (is_column_header_row and not is_first_infile)
+
+                if not skip_row:
+                    outfile.write(line)
+
+                infile_row_count += 1
+        infile_count += 1
+print(os.path.basename(__file__), "Writing data to \"storybook-events-CHIMPLE.csv\" complete!")