-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathAQI_scrape.py
47 lines (35 loc) · 1.08 KB
/
AQI_scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
# Import necessary packages
import pandas as pd
import camelot
from datetime import datetime
import cv2
import ghostscript
# Date stamp (YYYYMMDD) for today's bulletin; it is reused later as the value
# of the 'Date' column.
today = datetime.today()
date_str = f"{today:%Y%m%d}"

# Path of the bulletin PDF for today.
# To process a specific bulletin instead, point `file` at it,
# e.g. 'data/AQI_Bulletin_20231217.pdf'.
file = f'data/AQI_Bulletin_{date_str}.pdf'

# Extract the tables from every page of the PDF. Newlines inside cells are
# stripped. NOTE(review): flag_size is camelot's font-size flagging option
# (super/subscript detection) - confirm it is still wanted here.
tables = camelot.read_pdf(
    file,
    pages='all',
    strip_text='\n',
    flag_size=True,
)
# Helper that assigns the column headers and resets the row index
def filter_df(df):
    """Assign the five bulletin column names and renumber rows from zero.

    The frame is modified in place and the very same object is returned,
    which lets the function be used with ``DataFrame.pipe``.

    Args:
        df: DataFrame with exactly five columns.

    Returns:
        The same ``df`` object, relabelled and with a fresh 0..n-1 index.
    """
    header_names = [
        'City',
        'Air Quality',
        'Index Value',
        'Prominent Pollutant',
        'No. of Monitoring Stations',
    ]
    df.columns = header_names
    # drop=True discards the old index rather than keeping it as a column.
    df.reset_index(inplace=True, drop=True)
    return df
# Collect every extracted table except the last one. From each table, drop
# row 0 and column 0 (presumably the header row and the serial-number
# column - verify against a bulletin PDF).
# list(...) is used so the slice does not depend on how camelot's TableList
# implements slicing.
table_list = [
    table.df.drop(0).drop(0, axis=1)
    for table in list(tables)[:-1]
]

# pd.concat raises an opaque "No objects to concatenate" on an empty list;
# fail with the same exception type but say which file was the problem.
if not table_list:
    raise ValueError(f"No data tables found in {file}")

# Concatenate all dataframes
df = pd.concat(table_list)

# Add headers (also resets the index left over from the concatenation)
df = df.pipe(filter_df)

# Add date
df['Date'] = date_str

# Append the new rows to the existing CSV; no header row is written.
# newline='' is required when handing an open file object to to_csv,
# otherwise rows get doubled line endings on Windows.
with open('AQI.csv', 'a', newline='') as f:
    df.to_csv(f, header=False, index=False)

print("Successfully completed...")