generated from streamlit/blank-app-template
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Pdf-to-csv.py
104 lines (90 loc) · 2.94 KB
/
Pdf-to-csv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import pandas as pd
import streamlit as st
import pypdf as pdf
import pdfplumber
import io
import re
def _collect_text_fields(reader):
    """Scan each page's text for "(Da)" markers and pull the fields around them.

    The page text is split on whitespace and every token equal to "(Da)" is
    used as an anchor at index ``j``:

    * sample position -> token ``j - 8``
    * observed mass   -> token ``j + 10``; when the token at ``j + 15`` is
      exactly 9 characters long it is kept too and the pair is stored as a
      two-item list
    * FLP UV % area   -> token ``j + 12``

    Args:
        reader: An object exposing ``pages``, each with ``extract_text()``.

    Returns:
        ``(sample_positions, observed_masses, flp_areas, skipped)``. The three
        lists are aligned; ``skipped`` counts anchors that were ignored
        because a required offset fell outside the page's tokens.
    """
    sample_positions = []
    observed_masses = []
    flp_areas = []
    skipped = 0

    for page in reader.pages:
        tokens = (page.extract_text() or "").split()
        last = len(tokens) - 1

        for j, token in enumerate(tokens):
            if token != "(Da)":
                continue

            pos_idx = j - 8
            mass_idx = j + 10
            flp_idx = j + 12
            second_mass_idx = mass_idx + 5

            # A negative index would silently wrap to the end of the page and
            # an index past the end would raise, so skip anchors whose fields
            # are not all on this page.
            if pos_idx < 0 or flp_idx > last:
                skipped += 1
                continue

            has_second_mass = (
                second_mass_idx <= last and len(tokens[second_mass_idx]) == 9
            )
            if has_second_mass:
                observed_masses.append(
                    [tokens[mass_idx], tokens[second_mass_idx]]
                )
            else:
                observed_masses.append(tokens[mass_idx])

            sample_positions.append(tokens[pos_idx])
            flp_areas.append(tokens[flp_idx])

    return sample_positions, observed_masses, flp_areas, skipped


def _collect_table_positions(pdf_bytes):
    """Collect sample positions from the per-page tables found by pdfplumber.

    For each page, the extracted table is used only when the third cell of
    its first row equals 'Sample type'; the second-to-last cell of the second
    row is then recorded. Pages without a usable table are skipped.

    Args:
        pdf_bytes: The complete PDF content as ``bytes``.

    Returns:
        List of the recorded cells, in page order.
    """
    positions = []
    # The context manager closes the pdfplumber document when done.
    with pdfplumber.open(io.BytesIO(pdf_bytes)) as plumbed_doc:
        for page in plumbed_doc.pages:
            try:
                table = page.extract_table()
                if table[0][2] == 'Sample type':
                    positions.append(table[1][-2])
            except (IndexError, TypeError):
                # No table on the page (None) or a table too small to index.
                continue
    return positions


st.title("ETL App")
st.write("Import your file")
file = st.file_uploader("Choose a PDF File")

# Nothing below can run without an upload. st.stop() ends this script run
# here, which lets the rest of the pipeline stay at top level.
if file is None:
    st.stop()

# Take the whole upload once. getvalue() ignores the current stream position,
# so each library can get its own fresh stream; a read() issued after pypdf
# has used the upload would only return what lies past the current position.
pdf_bytes = file.getvalue()

# First pass with pypdf: extract fields from the raw page text.
data = pdf.PdfReader(io.BytesIO(pdf_bytes))
sam_pos, Obs_mass, flp, skipped_markers = _collect_text_fields(data)
if skipped_markers:
    st.warning(
        f"Skipped {skipped_markers} '(Da)' marker(s) whose surrounding "
        "fields ran past the page text."
    )

# Creating DataFrame. The last character of each text-derived position is
# dropped (presumably a trailing delimiter -- confirm against a sample PDF)
# so the values can match the table-derived ones in the merge below.
df = pd.DataFrame({
    "Sample Position": [pos[:-1] for pos in sam_pos],
    "Observed Mass (Da)": Obs_mass,
    "FLP UV % Area": flp,
})

# Second pass with pdfplumber: sample positions listed in the page tables.
position = _collect_table_positions(pdf_bytes)

# Creating DataFrame for sample positions
sample_df = pd.DataFrame({"Sample Position": position})

# Merging both DataFrames. The outer join keeps positions found by only one
# of the two passes; their missing columns come out as NaN.
df_new = pd.merge(df, sample_df, on="Sample Position", how='outer')
# Matches positions shaped like "<digits> : <word> , <digits>" (spaces optional).
_SAMPLE_POSITION_RE = re.compile(r'(\d+)\s*:\s*(\w+)\s*,\s*(\d+)')


def custom_sort_logic(value):
    """Build a sort key for a sample-position value.

    A string that starts with ``<digits>:<word>,<digits>`` yields
    ``(first_number, last_number, word)``, so values order by the first
    number, then the trailing number, then the word in between.

    Anything else gets a key whose first element is infinity, which places it
    after every parsed value. That includes strings that do not match and
    non-string cells such as ``None`` or NaN, which would make ``re.match``
    raise ``TypeError``.

    Args:
        value: The cell value to derive a key from.

    Returns:
        A 3-tuple usable as a sort key.
    """
    unparsed_key = (float('inf'), '', float('inf'))

    if not isinstance(value, str):
        return unparsed_key

    match = _SAMPLE_POSITION_RE.match(value)
    if match is None:
        return unparsed_key

    # Note the order: the trailing number (group 3) outranks the word (group 2).
    return (int(match.group(1)), int(match.group(3)), match.group(2))
def sort_dataframe(df, column_name):
    """Return ``df`` sorted by ``column_name`` using ``custom_sort_logic`` keys.

    Args:
        df: The DataFrame to sort; it is not modified.
        column_name: Name of the column whose values drive the ordering.

    Returns:
        A new DataFrame with the rows reordered.
    """

    def _to_sort_keys(column):
        # sort_values passes the whole column; map each cell to its key.
        return column.apply(custom_sort_logic)

    return df.sort_values(by=column_name, key=_to_sort_keys)
# Order the merged rows by sample position ahead of the export.
sorted_df = sort_dataframe(df_new, 'Sample Position')

# The download is named after the uploaded file: a fixed prefix, the upload's
# full name (extension included), then ".csv".
csv_filename = f"Updated_{file.name}.csv"
st.write(f"Exporting the file into {csv_filename}")

# Serialise without the index column and offer the text as a download.
csv_data = sorted_df.to_csv(index=False)
st.download_button(
    "Download CSV",
    data=csv_data,
    file_name=csv_filename,
    mime="text/csv",
)