-- s3_to_exasol.sql
CREATE SCHEMA IF NOT EXISTS DATABASE_MIGRATION;
--/
CREATE OR REPLACE PYTHON3 SCALAR SCRIPT DATABASE_MIGRATION."S3_GET_FILENAMES"
(
"connection_name" VARCHAR(1024),
"folder_name" VARCHAR(1024) UTF8,
"generate_urls" BOOLEAN,
"filter_string" VARCHAR(1024)
)
EMITS ("BUCKET_NAME" VARCHAR(1024) UTF8, "URL" VARCHAR(4096) UTF8, "FILE_LAST_MODIFIED" TIMESTAMP) AS
import fnmatch
import boto3
from botocore.client import Config
#####################################################################################
def s3_connection_helper(connection_name):
    # expects a connection whose address looks like 'https://<bucket>.s3.<region>.amazonaws.com'
    con = exa.get_connection(connection_name)
    aws_key = con.user
    aws_secret = con.password
    address = con.address
    bucket_pos = address.find('://') + 3
    host_pos = address.find('.s3.') + 1
    bucket_name = address[bucket_pos:host_pos - 1]
    host_address = address[host_pos:]
    # if the connection has no user, fall back to boto3's default credential chain
    if aws_key == '':
        s3conn = boto3.client('s3', config=Config(signature_version='s3v4'))
    else:
        s3conn = boto3.client('s3',
                              aws_access_key_id=aws_key,
                              aws_secret_access_key=aws_secret,
                              config=Config(signature_version='s3v4'))
    return s3conn, bucket_name
#####################################################################################
def run(ctx):
    s3conn, bucket_name = s3_connection_helper(ctx.connection_name)
    # if folder_name is empty, use a star so the fnmatch filter still works
    folder_name = ctx.folder_name if ctx.folder_name else '*'
    continuation_token = None
    while True:
        if continuation_token:
            listresponse = s3conn.list_objects_v2(Bucket=bucket_name, ContinuationToken=continuation_token)
        else:
            listresponse = s3conn.list_objects_v2(Bucket=bucket_name)
        # an empty bucket has no 'Contents' key
        rs = listresponse.get('Contents', [])
        for key in rs:
            if not ctx.filter_string or fnmatch.fnmatch(key['Key'], folder_name + ctx.filter_string):
                # skip pure directory keys, see http://stackoverflow.com/questions/9954521/s3-boto-list-keys-sometimes-returns-directory-key
                if not key['Key'].endswith('/'):
                    if ctx.generate_urls:
                        # generate_presigned_url returns a signed URL; only the part after the bucket host is emitted
                        protocol, filepath = s3conn.generate_presigned_url('get_object', Params={'Bucket': bucket_name, 'Key': key['Key']}).split('://', 1)
                        s3_bucket, localpath = filepath.split('/', 1)
                    else:
                        localpath = key['Key']
                    ctx.emit(bucket_name, localpath, key['LastModified'].replace(tzinfo=None))
        continuation_token = listresponse.get('NextContinuationToken', None)
        if not continuation_token:
            break
/
--select DATABASE_MIGRATION.s3_get_filenames(
--  'S3_IMPORT_BOTO'                   -- connection_name
--, '<some folder>/<some_date>/dump-'  -- folder_name
--, false                              -- generate_urls
--, '*abc*123*'                        -- filter_string
--);
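-- A minimal sketch (commented out like the example above; folder and filter values are placeholders):
-- since the script EMITS (BUCKET_NAME, URL, FILE_LAST_MODIFIED), the call can be wrapped in a
-- subselect to filter on the emitted columns, e.g. to list only files changed within the last day.
--select * from (
--  select DATABASE_MIGRATION.s3_get_filenames('S3_IMPORT_BOTO', '<some folder>/', false, '*.csv')
--) where FILE_LAST_MODIFIED > add_days(current_timestamp, -1);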
------------------------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------------------------
--/
CREATE OR REPLACE LUA SCALAR SCRIPT DATABASE_MIGRATION.GET_CONNECTION_NAME(connection_name VARCHAR(2000))
RETURNS VARCHAR(20000) AS
function run(ctx)
    local url = exa.get_connection(ctx.connection_name).address
    return url
end
/
--select DATABASE_MIGRATION.GET_CONNECTION_NAME(
--'S3_IMPORT_BOTO' -- connection_name
--);
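-- For the connection created at the bottom of this file, this returns the stored address,
-- e.g. 'https://<bucketname>.s3.<my_region>.amazonaws.com'.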
------------------------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------------------------
-- # parallel_connections: number of files imported in parallel within one IMPORT statement
-- # file_opts: search EXASolution_User_Manual for 'file_opts' to see all possible options
--/
CREATE OR REPLACE LUA SCRIPT DATABASE_MIGRATION.S3_PARALLEL_READ
(
execute_statements,
force_reload,
logging_schema,
schema_name,
table_name,
number_cols,
connection_name,
folder_name,
filter_string,
parallel_connections,
file_opts)
RETURNS TABLE AS
------------------------------------------------------------------------------------------------------------------------
-- returns the string between the two defined strings
-- example: get_string_between('abcdef', 'ab', 'f') --> 'cde'
function get_string_between(str, str_before, str_after)
    local _, start_pos = string.find(str, str_before, 1, true)
    local end_pos = string.find(str, str_after, 1, true)
    return string.sub(str, start_pos + 1, end_pos - 1)
end
------------------------------------------------------------------------------------------------------------------------
-- function for debugging, prints table
function debug(tbl)
    if tbl == nil or tbl[1] == nil or tbl[1][1] == nil then
        return
    end
    output(string.rep('-', 50))
    for i = 1, #tbl do
        local tmp = ''
        for j = 1, #tbl[i] do
            tmp = tmp .. tbl[i][j] .. ' '
        end
        output(tmp)
    end
    output(string.rep('-', 50))
end
------------------------------------------------------------------------------------------------------------------------
-- type: 'table' or 'schema'
-- name: name of the table/schema you want to check
-- returns true if object exists, false if it doesn't exist, second return parameter is error message you can output
function check_exists(type, name)
    -- schema
    if type == 'schema' or type == 's' then
        res = query([[select current_schema]])
        curr_schem = res[1][1]
        suc_schem, res = pquery([[open schema ::s]], {s=name})
        -- restore the previously opened schema
        if curr_schem == null then
            query([[close schema]])
        else
            query([[open schema ::cs]], {cs=curr_schem})
        end
        -- check whether a schema named 'name' existed
        if suc_schem then
            return true, 'Schema '..name..' already exists'
        else
            return false, 'Schema '..name..' does not exist'
        end
    end
    -- table: for table names you should include the schema,
    -- like this: check_exists('table', 'my_schema.my_table')
    if type == 'table' or type == 't' then
        suc, res = pquery([[desc ::t]], {t=name})
        if suc then
            return true, 'Table '..name..' already exists'
        else
            return false, 'Table '..name..' does not exist'
        end
    end
end
------------------------------------------------------------------------------------------------------------------------
-- additional parameters that normally do not need to be modified
generate_urls = false
script_schema = quote(exa.meta.script_schema)
-- quote everything to handle case sensitivity
logging_schema = quote(logging_schema)
schema_name = quote(schema_name)
table_name = quote(table_name)
ex, msg = check_exists('schema', logging_schema)
if not ex then error(msg) end
ex, msg = check_exists('table', schema_name..'.'..table_name)
if not ex then error(msg) end
status_done = 'done'
waiting_for_update = 'waiting for update'
waiting_for_insertion = 'waiting for insertion'
local log_tbl = {}
local inserted_total = 0
-- acquire a write lock on the table to prevent transaction conflicts
query([[DELETE FROM ::s.::t WHERE FALSE]], {s=schema_name, t=table_name})
res = query([[select ::ss.GET_CONNECTION_NAME(:c)]], {ss=script_schema, c=connection_name})
url = res[1][1]
bucket_name = get_string_between(url, '://', '.s3.')
-- create a regular name by removing special characters
logging_table = [["LOG_]].. string.gsub(schema_name, "[^a-zA-Z0-9]", "") .. [[_]] .. string.gsub(table_name, "[^a-zA-Z0-9]", "")..[["]]
query([[CREATE TABLE IF NOT EXISTS ::s.::t (bucket_name varchar(20000), file_name varchar(20000), FILE_LAST_MODIFIED timestamp, status varchar(20000), FILE_LAST_TRIED timestamp)]],
{s=logging_schema, t=logging_table})
if (force_reload) then
    trun = query([[TRUNCATE TABLE ::ls.::lt]], {ls=logging_schema, lt=logging_table})
    table.insert(log_tbl, {'Truncated '..logging_schema..'.'..logging_table, trun.rows_affected, trun.statement_text, ''})
    trun = query([[TRUNCATE TABLE ::s.::t]], {s=schema_name, t=table_name})
    table.insert(log_tbl, {'Truncated '..schema_name..'.'..table_name, trun.rows_affected, trun.statement_text, ''})
end
-- update logging table: for all new files, add an entry
-- for all existing files, update FILE_LAST_MODIFIED column and status column
query([[
merge into ::ls.::lt as l using
( select ::ss.s3_get_filenames(:c,:fn, :gu, :fi) order by 1) as p
on p.url = l.file_name and p.bucket_name = l.bucket_name
WHEN MATCHED THEN UPDATE SET l.status = :wu, l.FILE_LAST_MODIFIED = p.FILE_LAST_MODIFIED where p.FILE_LAST_MODIFIED > l.FILE_LAST_MODIFIED or l.status <> :sd
WHEN NOT MATCHED THEN INSERT (bucket_name, file_name, FILE_LAST_MODIFIED, status) VALUES (p.bucket_name, p.url, p.FILE_LAST_MODIFIED, :wi);
]], {ss=script_schema, c=connection_name, fn=folder_name, fi=filter_string, gu=generate_urls, ls=logging_schema, lt=logging_table, sd=status_done, wu=waiting_for_update, wi=waiting_for_insertion})
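-- Resulting status lifecycle in the logging table (as implemented by the merge above and the
-- import loop below):
--   'waiting for insertion' : file is new and has not been imported yet
--   'waiting for update'    : file is known but was modified on S3 since the last import,
--                             or its previous attempt did not end in 'done'
--   'done'                  : file was imported successfully
--   'Error: ...'            : last attempt failed; it will be picked up again on the next run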
-- get the bucket name and the file names of the files that should be modified
local res = query([[
select * from ::ls.::lt where status like 'waiting%'
]], {ls=logging_schema, lt=logging_table})
if (#res == 0) then
    exit({{'No queries generated, either the bucket is empty or all files have already been imported'}}, "message varchar(2000000)")
end
-- generate query text for parallel import
local queries = {}
local stmt = ''
local s3_keys = ''
local pre = "IMPORT INTO ".. schema_name.. ".".. table_name .. " FROM CSV AT "..connection_name
for i = 1, #res do
    local curr_bucket_name = res[i][1]
    local curr_file_name = "'"..res[i][2].."'"
    local curr_FILE_LAST_MODIFIED = res[i][3]
    -- start a new IMPORT statement for the first file of each batch
    if math.fmod(i, parallel_connections) == 1 or parallel_connections == 1 then
        stmt = pre
    end
    if number_cols == NULL then
        file_range = ' '
    elseif (number_cols == 1) then
        file_range = ' (1)'
    else
        file_range = ' (1..'..number_cols..')'
    end
    stmt = stmt .. "\n\tFILE " .. curr_file_name .. file_range
    s3_keys = s3_keys .. curr_file_name .. ", "
    -- the batch is complete (or this is the last file): finish the statement
    if (math.fmod(i, parallel_connections) == 0 or i == #res) then
        stmt = stmt .. "\n\t" .. file_opts .. ";"
        -- remove the trailing comma and space from s3_keys
        s3_keys = string.sub(s3_keys, 1, #s3_keys - 2)
        table.insert(queries, {stmt, bucket_name, s3_keys})
        s3_keys = ''
    end
end
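-- Illustrative sketch of what one generated statement looks like (assuming schema MY_SCHEMA,
-- table MY_TABLE, parallel_connections = 2, number_cols = 3 and the file_opts from the example
-- call at the bottom of this file; all of these names are placeholders):
--   IMPORT INTO "MY_SCHEMA"."MY_TABLE" FROM CSV AT S3_IMPORT_BOTO
--       FILE '<folder>/part_01.csv' (1..3)
--       FILE '<folder>/part_02.csv' (1..3)
--       ENCODING='UTF-8' SKIP=0 ROW SEPARATOR = 'LF' COLUMN SEPARATOR = ',';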
-- if the statements should only be generated, not executed, end the script here
if not execute_statements then
    exit(queries, "queries varchar(2000000)")
end
for i = 1, #queries do
    -- execute the generated IMPORT statement and log the result per file
    curr_query = queries[i][1]
    curr_bucket = queries[i][2]
    curr_files = queries[i][3]
    suc, res = pquery(curr_query)
    if not suc then
        output(res.statement_text)
        status_error = 'Error: '..res.error_message
        query([[update ::ls.::lt set status = :se, FILE_LAST_TRIED = systimestamp
                where file_name in (]]..curr_files..[[) and bucket_name = :b
            ]], {ls=logging_schema, lt=logging_table, se=status_error, b=curr_bucket})
        table.insert(log_tbl, {'Error while inserting: '..res.error_message, 0, curr_query, curr_files})
    else
        query([[update ::ls.::lt set status = :sd, FILE_LAST_TRIED = systimestamp
                where file_name in (]]..curr_files..[[) and bucket_name = :b
            ]], {ls=logging_schema, lt=logging_table, sd=status_done, b=curr_bucket})
        inserted_total = inserted_total + res.rows_inserted
        table.insert(log_tbl, {'Inserted', res.rows_inserted, curr_query, curr_files})
    end
end
table.insert(log_tbl, {'Summary, total inserts:', inserted_total, '', ''})
exit(log_tbl, "status varchar(20000), affected_rows decimal(18,0) ,executed_queries varchar(2000000),files varchar(2000000)")
/
create or replace connection S3_IMPORT_BOTO
TO 'https://<bucketname>.s3.<my_region>.amazonaws.com' -- <my_region> could e.g. be eu-west-1
USER '<my_access_key>' -- optional, if you don't need user and password, just delete these two lines
IDENTIFIED BY '<my_secret_key>';
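-- Sketch of a variant without stored credentials (an assumption, not something the script requires):
-- if the cluster can authenticate via boto3's default AWS credential chain (e.g. an EC2 instance
-- role) and the bucket is also readable for the plain IMPORT ... FILE access, the USER/IDENTIFIED BY
-- lines can simply be left out, as the comment above already suggests:
--create or replace connection S3_IMPORT_BOTO
--TO 'https://<bucketname>.s3.<my_region>.amazonaws.com';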
create schema S3_LOADER_LOGGING;
-- the example below uses the connection S3_IMPORT_BOTO created above
execute script DATABASE_MIGRATION.s3_parallel_read(
true -- execute_statements: if true, statements are executed immediately, if false only statements are generated
, false -- force reload: if true, target table and logging table will be truncated, all files in the bucket will be loaded again
, 'S3_LOADER_LOGGING' -- schema you want to use for the logging tables
, '<schema>' -- name of the schema that holds the table you want to import into
, '<table>' -- name of the table you want to import into
, NULL -- number_cols: NULL imports all columns of the file; if set to a number n, only the first n columns are imported
, 'S3_IMPORT_BOTO' -- connection name (see statement above)
, '<folder>/' -- folder name (no regex!); leave blank to import everything
, '*.csv' -- filter for file names; to include all files, use an empty string (example filter: '*_2018.csv')
, 4 -- number of parallel connections you want to use
, 'ENCODING=''UTF-8'' SKIP=0 ROW SEPARATOR = ''LF'' COLUMN SEPARATOR = '','''  -- file options, see the manual, section 'IMPORT', for further details; set SKIP=1 if your files have a header row
)
;
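-- Two follow-up sketches (both assume the placeholder names used above):
-- 1) Dry run: with execute_statements = false the script only returns the generated IMPORT
--    statements so they can be reviewed before anything is loaded.
--execute script DATABASE_MIGRATION.s3_parallel_read(
--  false, false, 'S3_LOADER_LOGGING', '<schema>', '<table>', NULL,
--  'S3_IMPORT_BOTO', '<folder>/', '*.csv', 4,
--  'ENCODING=''UTF-8'' SKIP=0 ROW SEPARATOR = ''LF'' COLUMN SEPARATOR = '',''');
-- 2) Check the per-file status afterwards. The logging table is created by the script as
--    LOG_<schema>_<table> with all non-alphanumeric characters stripped from the schema and
--    table parts, e.g. "LOG_MYSCHEMA_MYTABLE" for schema MY_SCHEMA and table MY_TABLE:
--select * from S3_LOADER_LOGGING."LOG_MYSCHEMA_MYTABLE" order by FILE_LAST_TRIED desc nulls last;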