# meta_data_read (1).py
from datetime import datetime, timedelta

from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql.functions import (
    unix_timestamp, when, lit, from_unixtime, substring,
    dense_rank, row_number, concat, regexp_replace,
)
from pyspark.sql.types import (
    StructType, StructField, ArrayType, DateType, ShortType, StringType,
    IntegerType, TimestampType, LongType, BooleanType, DoubleType,
)
from pyspark.sql.window import Window

import os
import configparser
import logging
#read conf file
myconfig = configparser.ConfigParser()
myconfig.read('/home/vizuamatix/vx_freedom/analytics_scripts/cfi_summary.conf')
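# Expected layout of cfi_summary.conf (a sketch only; the section and key names
# come from the get() calls in this script, the values are deployment-specific):
#
#   [log_files]
#   metadata_read = <path to this script's log file>
#
#   [db_conf]
#   user = <database user>
#   password = <database password>
#   url = jdbc:postgresql://<host>:<port>/<database>
#   driver = org.postgresql.Driver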
#log file
logging.basicConfig(
    filename=myconfig.get('log_files', 'metadata_read'),
    format='%(asctime)s - %(message)s',
    level=logging.INFO,
)
logging.info('Process started')
if __name__ == "__main__":
    spark = (
        SparkSession.builder
        .appName("meta data reading process")
        .master("local[*]")
        .getOrCreate()
    )
    sc = spark.sparkContext

    # HTTP headers/params left over from an earlier version of the script;
    # they are not used anywhere below.
    headers = {
        'Content-Type': 'application/json',
    }
    params = (
        ('pretty', ''),
    )
    # Database connection properties, read from the db_conf section of the
    # configuration file (previously hard-coded to the local vx_freedom database).
    db_properties = {
        'username': myconfig.get('db_conf', 'user'),
        'password': myconfig.get('db_conf', 'password'),
        'url': myconfig.get('db_conf', 'url'),
        'driver': myconfig.get('db_conf', 'driver'),
    }
    logging.info('Metadata read process started')

    # Raw IPDR files currently present on disk.
    base_path = '/data/raw_ipdr'
    all_paths = set(os.listdir(base_path))
    print(all_paths)
    # Time window: from the same time yesterday up to now.
    no_of_days = 1
    yesterday = datetime.now() - timedelta(days=no_of_days)
    DAY_START = yesterday
    DAY_END = datetime.now()
    today = datetime.now().strftime('%Y%m%d')
    yest = yesterday.strftime('%Y%m%d')
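    # For example, if the script runs on 2021-06-15, today == '20210615' and
    # yest == '20210614'; only files stamped with one of these dates are picked up below.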
    # Files already registered in public.ipdr_base within the time window.
    sql = "SELECT * FROM public.ipdr_base WHERE file_date_time BETWEEN '{}' AND '{}'".format(DAY_START, DAY_END)
    #sql = "SELECT * FROM public.ipdr_base"
    df_load = spark.read.format('jdbc').options(
        url=db_properties['url'],
        driver=db_properties['driver'],
        user=db_properties['username'],
        password=db_properties['password'],
        query=sql,
    ).load()
    # Strip the directory prefix so stored names can be compared with os.listdir() output.
    df_load = df_load.withColumn('file_name', regexp_replace('file_name', '/data/raw_ipdr/', ''))
    exist_paths = set(str(row.file_name) for row in df_load.select('file_name').collect())
    new_paths = all_paths - exist_paths
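    # The raw files are assumed to be named like '<prefix>__<yyyyMMdd...>...csv.gz':
    # the eight characters after the '__' separator are taken as the file date, and
    # the substring/unix_timestamp conversion further below expects a yyyyMMdd_HHmm
    # timestamp starting at character 20 of the bare file name.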
    correct_new_paths = []
    for i in new_paths:
        if '.csv.gz' in i:
            # Date portion (yyyyMMdd) taken from the part of the name after '__'.
            i_part = str(i.split('__')[1])[0:8]
            print(i_part)
            if i_part == today or i_part == yest:
                # Count the records in the file so the count can be stored alongside it.
                rec_count = spark.read.csv(path=('/data/raw_ipdr/' + i), sep=';', header=False).count()
                correct_new_paths.append((i, rec_count, 0, None, None, None, '', 0, 0))
    print('new paths..........................')
    #print(new_paths)
    #print(correct_new_paths)
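    # Each tuple built above follows the column order of the schema below, so the
    # rows can be turned into a DataFrame and appended to ipdr_base.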
    schema = StructType([
        StructField('file_name', StringType(), True),
        StructField('record_count', LongType(), True),
        StructField('processed_count', LongType(), True),
        StructField('status', IntegerType(), True),
        StructField('file_date_time', TimestampType(), True),
        StructField('processed_time', TimestampType(), True),
        StructField('process_file_path', StringType(), True),
        StructField('api_call_status', ShortType(), True),
        StructField('api_selected_status', ShortType(), True),
    ])
    df = spark.createDataFrame(correct_new_paths, schema)
    # Parse the yyyyMMdd_HHmm timestamp embedded in the file name (characters 20-32
    # of the bare name) into a proper timestamp column.
    df = df.withColumn('file_date_time', substring(df.file_name, 20, 13))
    df = df.withColumn('file_date_time', from_unixtime(unix_timestamp(df.file_date_time, "yyyyMMdd'_'HHmm"), "yyyy-MM-dd' 'HH:mm:00"))
    df = df.withColumn('file_date_time', df.file_date_time.cast(TimestampType()))
    df = df.withColumn('status', lit(1))
    # Store the full path, and mark the file as not yet sent to the API.
    df = df.withColumn('file_name', concat(lit('/data/raw_ipdr/'), df.file_name))
    df = df.withColumn('api_call_status', lit(0))
    df = df.withColumn('api_selected_status', lit(0))
    #df.show()
    # Register the newly discovered files in the ipdr_base table.
    df.write.format("jdbc").mode('append') \
        .option("url", db_properties['url']) \
        .option("driver", db_properties['driver']) \
        .option("dbtable", "ipdr_base") \
        .option("user", db_properties['username']) \
        .option("password", db_properties['password']) \
        .save()
    print('..............done............')
    sc.stop()
    exit()