sbahnmuc03b_TransferDB.py
import boto3
import configparser
import pandas as pd
import os
import s3fs
import json
import pytictoc
import datetime
import logging
from DeArchive import dearchive
t = pytictoc.TicToc()
t.tic()
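
# Write this run's log to a timestamped file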
logpath = "/home/ubuntu/sbmd/logs/"
normlogfilename = "sb03blog_" \
    + datetime.datetime.now().strftime("%Y-%m-%d_%H-%M") + ".log"
logging.basicConfig(filename=logpath+normlogfilename, level=logging.DEBUG)
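
# Read RDS and AWS credentials from the project config file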
config = configparser.ConfigParser()
config.read("/home/ubuntu/sbmd/dwh.cfg")
rdsid = config['RDS']['ID1']
rdspw = config["RDS"]["PW"]
os.environ['AWS_ACCESS_KEY_ID'] = config['AWS']['KEY']
os.environ['AWS_SECRET_ACCESS_KEY'] = config['AWS']['SECRET']
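
# S3 bucket that holds the raw JSON files (keys starting with "DB_2")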
s3r = boto3.resource("s3")
BUCKET = "sbmd1db2"
bucket = s3r.Bucket(BUCKET)
objsr_all = bucket.objects.all()
client = boto3.client("s3")
logging.info("Starting gather S3 files...")
try:
    # Collect the keys of all raw JSON files under the "DB_2" prefix
    s3r_files = []
    for o in objsr_all.filter(Prefix="DB_2"):
        s3r_files.append(o.key)
    logging.info("Finished gathering S3 files.")

    # Load the first file and flatten the nested JSON into a dataframe
    basefile = s3r_files[0]
    result = client.get_object(Bucket=BUCKET, Key=basefile)
    text = json.loads(result["Body"].read().decode())
    base_df = pd.io.json.json_normalize(text, sep="_")
    logging.info("Finished DF")

    FILE_TO_READ = s3r_files[0]
    client = boto3.client("s3")

    # Create a dated archive "folder" in the bucket, then move the first
    # file into it (copy, then delete the original key)
    archivfoldername = str(datetime.date.today()) + "-ArchivDB/"
    response = client.put_object(
        Bucket=BUCKET,
        Body="",
        Key=archivfoldername)

    s3res = boto3.resource("s3")
    copy_source = {"Bucket": BUCKET, "Key": s3r_files[0]}
    dest = s3res.Object(BUCKET, archivfoldername + copy_source["Key"])
    dest.copy(CopySource=copy_source)
    response = s3res.Object(BUCKET, s3r_files[0]).delete()
    logging.info("Finished first line of df")

    # Append every remaining file to the dataframe and archive it as well
    for file in s3r_files[1:]:
        result = client.get_object(Bucket=BUCKET, Key=file)
        text = json.loads(result["Body"].read().decode())
        df = pd.io.json.json_normalize(text, sep="_")
        base_df = pd.concat([base_df, df], axis=0, ignore_index=True)
        # archiving: copy the file into the archive folder, then delete it
        copy_source = {"Bucket": BUCKET, "Key": file}
        dest = s3res.Object(BUCKET, archivfoldername + copy_source["Key"])
        dest.copy(CopySource=copy_source)
        response = s3res.Object(BUCKET, file).delete()
    logging.info("Finished whole DF and file")

    # Normalize column names to lowercase
    coln = list(base_df.columns)
    coln = [x.lower() for x in coln]
    base_df.columns = coln

    # Write the combined dataframe to a local CSV, upload it under the
    # "CSV/" prefix of the bucket and remove the local copy again
    base_df_filename = str(datetime.datetime.now()) + "_DB_DF.csv"
    base_df.to_csv("/home/ubuntu/sbmd/" + base_df_filename, index=False)
    s3object = s3res.Object(BUCKET, "CSV/" + base_df_filename)
    s3object.upload_file("/home/ubuntu/sbmd/" + base_df_filename)
    os.remove("/home/ubuntu/sbmd/" + base_df_filename)
    logging.info("Finished Copy, starting data quality checks")

    # Data quality check: the number of processed files should equal the
    # number of rows in the dataframe
    if len(s3r_files) != base_df.shape[0]:
        logging.error("Data Quality check failed! Files and DF length not "
                      "identical!")

except Exception as e:
    if len(s3r_files) == 0:
        logging.info("Currently no files to load")
    else:
        # Restore the already archived files so the next run can retry
        logging.error("Something went wrong...starting de-archiving!")
        logging.error(e)
        dearchive(BUCKET, [archivfoldername], 20)
        logging.info("Successfully dearchived!")
t.toc()