load_imdb.py
"""Load IMDb TSV files
1 Download the files if they're not present
2 Load the TSV files a chunk at time
3 Clean the chunk
4 Write the chunk to either a CSV file or SQLite database table
"""
import os
import argparse
import sqlite3 as sqlite
import pandas as pd
from tqdm import tqdm
# from imdb_data import (
# IMDbData,
# TitleBasicsData,
# TitleCrewData,
# )
from name_basics_data import NameBasicsData
from title_basics_data import TitleBasicsData
from title_crew_data import TitleCrewData
from title_ratings_data import TitleRatingsData
class IMDbLoader:
"""Load the IMDb data sets either into CSV files or a SQLite database"""
def __init__(self, tsv_file):
self.tsv_file = tsv_file
def tsv_load(self):
"""Load the data from the TSV file"""
return pd.read_csv(self.tsv_file, sep="\t", low_memory=False)
def df_to_csv(self, data_frame, csv_path):
"""Take rows from an IMDb TSV file, clean the rows and output a CSV file"""
with open(csv_path, "w", encoding="utf-8", newline="") as csv_file:
            # Write the cleaned data frame to the CSV file without the index column
            data_frame.to_csv(csv_file, index=False)
def df_to_sqlite(self, data_frame, db_path, db_table):
"""Take rows from an IMDb TSV file, clean the rows and output to a SQLite database"""
db_conn = sqlite.connect(db_path)
db_cursor = db_conn.cursor()
# Store journal in memory, temp store in memory, and turn off synchronous writes
db_cursor.execute("PRAGMA journal_mode = MEMORY")
db_cursor.execute("PRAGMA temp_store = MEMORY")
db_cursor.execute("PRAGMA synchronous = OFF")
with db_conn:
            # Write the data frame to a SQLite table, replacing any existing table
            data_frame.to_sql(
                db_table,
                db_conn,
                if_exists="replace",
            )
        db_conn.close()
def main(input_dir, output_dir, output_format, db_file):
"""By default, read TSV files from the `import` directory, clean them up, convert them
to CSV files and write them to the `export` directory
Option to write to a SQLite database instead."""
cleaner_classes = {
"title.basics.tsv": TitleBasicsData,
"title.ratings.tsv": TitleRatingsData,
"title.crew.tsv": TitleCrewData,
"name.basics.tsv": NameBasicsData,
# "title.principals.tsv": IMDbData,
}
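    # Each cleaner class wraps the raw data frame and exposes a `data_frames`
    # dict mapping output names to cleaned data frames (used in the loop below)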
    # Create the output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)
# Check command-line arguments
if output_format == "sqlite":
if not db_file:
raise ValueError(
"SQLite database file path must be provided when output format is 'sqlite'"
)
elif output_format != "csv":
raise ValueError("Invalid output format: ", output_format)
# Format for progress bar
bar_format = "Progress: {l_bar}{bar} | Completed: {n_fmt} | Time: [{elapsed}]"
# Iterate through all TSV files with a progress bar
with tqdm(
total=len(cleaner_classes),
bar_format=bar_format,
desc="Processing files",
) as files_progress:
for tsv_name, cleaner_class in cleaner_classes.items():
tsv_path = os.path.join(input_dir, tsv_name)
            # Load the whole TSV file into a data frame and pass it to the cleaner class
loader = IMDbLoader(tsv_file=tsv_path)
cleaner = cleaner_class(loader.tsv_load())
for df_name, clean_df in cleaner.data_frames.items():
if output_format == "csv":
                    # If output format is csv, write each cleaned data frame to
                    # a CSV file in the output directory
                    csv_path = os.path.join(output_dir, f"{df_name}.csv")
# Write the data frames as a CSV file
loader.df_to_csv(data_frame=clean_df, csv_path=csv_path)
elif output_format == "sqlite":
# If output format is SQLite create a SQLite database
db_path = os.path.join(output_dir, db_file)
loader.df_to_sqlite(
data_frame=clean_df,
db_path=db_path,
db_table=df_name,
)
files_progress.update(1)
print("Conversion complete!")
if __name__ == "__main__":
# Parse command-line arguments
parser = argparse.ArgumentParser(
prog="load_imdb",
description="Process TSV files from IMDb and save to CSV or SQLite.",
epilog="You can download files using download_imdb",
)
parser.add_argument(
"input_dir",
type=str,
default="input",
help="Directory that contains the input TSV files (default: input)",
)
parser.add_argument(
"-f",
"--output_format",
type=str,
choices=["csv", "sqlite"],
default="csv",
help="Output format: csv or sqlite (default: csv)",
)
parser.add_argument(
"-d",
"--db_file",
type=str,
default="imdb.sqlite",
help="Path to SQLite database file - required if output is sqlite (default:imdb.sqlite)",
)
parser.add_argument(
"-o",
"--output_dir",
type=str,
default="data",
help="Path to the output CSV file - required if output is csv (default:data)",
)
args = parser.parse_args()
main(
input_dir=args.input_dir,
output_dir=args.output_dir,
output_format=args.output_format,
db_file=args.db_file,
)
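# Example usage (assuming the IMDb TSV files have already been downloaded to ./input):
#   python load_imdb.py input             # write CSV files to ./data
#   python load_imdb.py input -f sqlite   # write ./data/imdb.sqlite instead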