-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathXMLToCSV.py
136 lines (110 loc) · 3.96 KB
/
XMLToCSV.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
import boto3
import xml.etree.ElementTree as ET
import pandas
import config
import re
from helper import *
def XMLStringToXML_ET(xmlString: str) -> ET.Element:
    """Parse XML text into an ElementTree element.

    Args:
        xmlString (str): XML document content as a string.

    Raises:
        The object built by ``createErrorStr`` (a helper defined outside
        this file) whenever parsing fails for any reason.

    Returns:
        ET.Element: Root element of the parsed document.
    """
    try:
        root = ET.fromstring(xmlString)
    except Exception as err:
        # Wrap every parsing failure in the project's custom error.
        raise createErrorStr("converting XML string to XML Element", err)
    return root
def XMLElementToRows(xmlElement: ET.Element) -> list[dict]:
    """Convert the children of an XML element into a list of row dicts.

    Each direct child of ``xmlElement`` becomes one row. The row holds the
    child's ``id`` attribute plus the text of its ``author``, ``title``,
    ``genre``, ``price``, ``publish_date`` and ``description`` sub-elements.
    Runs of two or more whitespace characters inside the description are
    collapsed to a single space.

    Args:
        xmlElement (ET.Element): XML element whose children are converted.

    Raises:
        The object built by ``createErrorStr`` (a helper defined outside
        this file) when a child lacks the ``id`` attribute or one of the
        expected sub-elements, or on any other failure.

    Returns:
        list[dict]: One dict per child element, in document order.
    """
    # Raw string so "\s" is a regex class, not an invalid string escape;
    # compiled once because it is applied to every row.
    multi_space = re.compile(r"\s\s+")
    text_fields = ("author", "title", "genre", "price", "publish_date")
    try:
        rows = []
        # Iterate over all sub-elements of the root and build one row each.
        for element in xmlElement:
            row = {"id": element.attrib["id"]}
            for field in text_fields:
                # A missing sub-element makes find() return None, so the
                # attribute access fails and is reported by the handler below.
                row[field] = element.find(field).text
            description = element.find("description").text
            # An empty <description/> has text None; keep it as None (like
            # the other fields) instead of crashing inside the regex.
            if description is not None:
                description = multi_space.sub(" ", description)
            row["description"] = description
            rows.append(row)
        return rows
    except Exception as e:
        raise createErrorStr("converting XML Element to Rows", e) from e
def rowsToDF(rows: list[dict], columnHeader: list[str]) -> pandas.DataFrame:
    """Build a pandas DataFrame from a list of row dicts.

    Args:
        rows (list[dict]): Rows to load, one dict per row.
        columnHeader (list[str]): Column labels passed to the DataFrame
            constructor as ``columns``.

    Raises:
        The object built by ``createErrorStr`` (a helper defined outside
        this file) when the DataFrame cannot be constructed.

    Returns:
        pandas.DataFrame: DataFrame holding the given rows.
    """
    try:
        frame = pandas.DataFrame(data=rows, columns=columnHeader)
    except Exception as err:
        # Wrap every construction failure in the project's custom error.
        raise createErrorStr("converting Rows to pandas DataFrame", err)
    return frame
def DFToCSV(DF: pandas.DataFrame) -> str:
    """Serialize a pandas DataFrame to CSV text without the index column.

    Args:
        DF (pandas.DataFrame): DataFrame to serialize.

    Raises:
        The object built by ``createErrorStr`` (a helper defined outside
        this file) when serialization fails.

    Returns:
        str: CSV text, header row first.
    """
    try:
        # No path is given, so to_csv returns the CSV as a string.
        csv_text = DF.to_csv(index=False)
    except Exception as err:
        raise createErrorStr("converting pandas DataFrame to CSV", err)
    return csv_text
def XMLToCSV(bucket: str, XMLfileKey: str) -> None:
    """Entry point of the XML to CSV conversion.

    Reads the XML object at ``XMLfileKey`` from ``bucket``, converts it to
    CSV text, writes the CSV under a key derived from the XML key (extension
    replaced by ``.csv`` and ``XML/`` replaced by ``CSV/``), and finally
    deletes the source XML object.

    Args:
        bucket (str): Name of the S3 bucket holding the XML file.
        XMLfileKey (str): Key of the XML file inside the bucket.

    Raises:
        The object built by ``createErrorStr`` (a helper defined outside
        this file) when any step of the conversion fails.
    """
    # Local import so this block does not depend on the file's import list.
    # posixpath is used because the keys here are '/'-separated.
    import posixpath

    try:
        s3 = boto3.resource('s3')
        # Read XML file to string.
        # NOTE(review): if readFileContentFromS3 returns bytes, str() yields
        # the "b'...'" repr rather than decoded text; confirm against helper.
        xmlStr = str(readFileContentFromS3(bucket, XMLfileKey, s3))
        # XML file content to XML element
        xmlElement = XMLStringToXML_ET(xmlStr)
        # List of rows
        rows = XMLElementToRows(xmlElement)
        # pandas DataFrame
        df = rowsToDF(rows, config.tableColumns)
        # DataFrame to CSV string
        csvStr = DFToCSV(df)
        # Collapse blank lines to a single newline. Replacing the run with ""
        # would glue the neighbouring CSV rows together on one line.
        csvStr = re.sub(r'\n\n+', "\n", csvStr)
        # Swap the extension for ".csv" whatever its length; the old
        # "[:-3] + 'csv'" slice was only correct for 3-character extensions.
        csvFileKey: str = posixpath.splitext(XMLfileKey)[0] + '.csv'
        # Change folder. This replaces every 'XML/' occurrence in the key.
        csvFileKey = csvFileKey.replace('XML/', 'CSV/')
        # Save CSV file
        writeFileContentToS3(csvStr, bucket, csvFileKey, s3)
        # Delete the source XML file only after the CSV has been written.
        deleteFile(bucket, XMLfileKey, s3)
    except Exception as e:
        raise createErrorStr("converting XML to CSV", e) from e