
Commit

updating changelog
kylevillegas93 committed Sep 9, 2024
1 parent 8b12144 commit 29f6659
Showing 3 changed files with 20 additions and 10 deletions.
2 changes: 1 addition & 1 deletion .gitignore
@@ -15,4 +15,4 @@ tags
 *.swp
 /*venv*
 
-analytics/upress_reporting/log_files
\ No newline at end of file
+analytics/upress_reporting/log_files
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -32,6 +32,7 @@
 - Updated fulfill process to check rights status before updating manifest
 - Deleted fulfill script due to no longer being necessary
 - Updating report book ID
+- Implemented script to aggregate access logs
 
 
 ## Fixed
27 changes: 18 additions & 9 deletions analytics/upress_reporting/aggregate_logs.py
@@ -1,7 +1,8 @@
 import argparse
 import boto3
-from datetime import datetime, date, timedelta
+from datetime import datetime, timedelta
 import os
 import re
+import shutil
 import sys
 
 from main import load_env_file
@@ -16,25 +17,31 @@ def parse_args(args):
     return parser.parse_args(args)
 
 
-def aggregate_logs(bucket_name: str, file_path: str, folder_name: str):
+def aggregate_logs(bucket_name: str, file_path: str, folder_name: str, file_id_regex: str):
     download_folder = f'analytics/upress_reporting/log_files/{bucket_name}/{folder_name}'
     os.system(f'aws s3 cp --recursive s3://{bucket_name}/{file_path}{folder_name} {download_folder}')
 
     folder_directory = os.fsencode(download_folder)
     aggregated_log_file = f'analytics/upress_reporting/log_files/{bucket_name}/{folder_name}/aggregated_log'
 
     for file in os.listdir(folder_directory):
-        if file == aggregated_log_file: continue
+        if file == aggregated_log_file:
+            continue
 
         filename = os.fsdecode(file)
 
         with open(aggregated_log_file, 'a') as aggregated_log:
             with open(f'{download_folder}/{filename}', 'r') as log_file:
                 for line in log_file:
-                    # TODO: filter lines that don't matter
-                    aggregated_log.write(line)
+                    match_file_id = re.search(file_id_regex, line)
+                    match_referrer = re.search(os.getenv('REFERRER_URL'), line)
+
+                    if not match_file_id or not match_referrer or "403 AccessDenied" in line:
+                        continue
+
+                    aggregated_log.write(line)
 
-    # TODO: upload back to S3 and delete files
+        os.remove(f'{download_folder}/{filename}')
 
 
 def main():
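
For context on the new filtering above: each line of the downloaded S3 access log is kept only when it matches both the file-ID pattern passed in from main() and the referrer pattern from the REFERRER_URL environment variable, and is not a 403 AccessDenied request. A minimal standalone sketch of that check, using an abbreviated, made-up log line and an assumed referrer value:

import os
import re

# Illustrative values: the real regex is passed in from main(), and
# REFERRER_URL comes from the loaded env file - both are assumptions here.
os.environ['REFERRER_URL'] = 'drb-qa.nypl.org'
file_id_regex = r"REST.GET.OBJECT manifests/(.*?json)\s"

# Abbreviated, made-up S3 server access log entry.
line = ('example-bucket [09/Sep/2024:12:00:00 +0000] REST.GET.OBJECT '
        'manifests/10.12345_example.json 200 "https://drb-qa.nypl.org/read/123"')

match_file_id = re.search(file_id_regex, line)
match_referrer = re.search(os.getenv('REFERRER_URL'), line)

# Mirrors the new filter: keep the line only if both patterns match
# and the request was not a 403 AccessDenied.
if match_file_id and match_referrer and '403 AccessDenied' not in line:
    print(match_file_id.group(1))  # -> 10.12345_example.json

Note that REFERRER_URL is treated as a regular expression here, exactly as re.search does in the diff.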
@@ -50,11 +57,13 @@ def main():
     start_date = datetime.strptime(parsed_args.start, '%Y-%m-%d').date()
     end_date = datetime.strptime(parsed_args.end, '%Y-%m-%d').date()
 
+    shutil.rmtree('analytics/upress_reporting/log_files', ignore_errors=True)
+
     while start_date <= end_date:
         folder_name = start_date.strftime('%Y/%m/%d')
 
-        aggregate_logs(view_bucket, view_log_path, folder_name)
-        aggregate_logs(download_bucket, download_bucket_path, folder_name)
+        aggregate_logs(view_bucket, view_log_path, folder_name, r"REST.GET.OBJECT manifests/(.*?json)\s")
+        aggregate_logs(download_bucket, download_bucket_path, folder_name, r"REST.GET.OBJECT (.+pdf\s)")
 
         start_date += timedelta(days=1)
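As a usage note, main() now clears the local log_files directory up front (shutil.rmtree with ignore_errors=True) before walking the date range one day at a time; each day maps to a YYYY/MM/DD key prefix in the S3 buckets, with view traffic matched by the manifest-JSON regex and download traffic by the PDF regex. A small sketch of the folder names a run would visit, with illustrative dates:

from datetime import datetime, timedelta

# Hypothetical --start/--end values; the script parses these from argv.
start_date = datetime.strptime('2024-09-01', '%Y-%m-%d').date()
end_date = datetime.strptime('2024-09-03', '%Y-%m-%d').date()

while start_date <= end_date:
    # 2024/09/01, 2024/09/02, 2024/09/03 - each appended to the
    # bucket's file_path prefix when building the aws s3 cp source.
    print(start_date.strftime('%Y/%m/%d'))
    start_date += timedelta(days=1)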
