-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
161 lines (123 loc) · 6.45 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
@author: kneehit
"""
# Load Libraries
import numpy as np
import pandas as pd
import cv2
import os
import googletrans
import glob
import random
import matplotlib.pyplot as plt
import pprint
import string
# Load a sample of every CSV file: the first 10 rows of the period tables
# and the first 100 rows of the ad tables.
periods_train, periods_test = (
    pd.read_csv(csv_name, nrows=10)
    for csv_name in ('periods_train.csv', 'periods_test.csv')
)
train, test, train_active, test_active = (
    pd.read_csv(csv_name, nrows=100)
    for csv_name in ('train.csv', 'test.csv',
                     'train_active.csv', 'test_active.csv')
)

# Image directories, and the list of .jpg files found directly inside each
train_images_path = 'train/data/competition_files/train_jpg/'
test_images_path = 'test/data/competition_files/test_jpg/'
train_images = glob.glob('{}*.jpg'.format(train_images_path))
test_images = glob.glob('{}*.jpg'.format(test_images_path))
# Function to visualize ad image and the related information
translator = googletrans.Translator()


def visualize_translated(num):
    """Print one training ad translated to English and display its image.

    Parameters
    ----------
    num : int
        Positional row index into the module-level ``train`` DataFrame.

    The text fields are read from ``train`` by column position, passed
    through ``translator.translate`` and pretty-printed as a dict. A field
    whose value is NaN is reported as ``'NA'`` instead of being translated.
    The image whose file stem is stored in column 15 is then shown in an
    OpenCV window until a key is pressed; if the stem is NaN or the file
    cannot be read, a message is printed instead.
    """
    # Output key -> positional column index in ``train``
    text_columns = (
        ('region', 2),
        ('city', 3),
        ('parent_cat', 4),
        ('category', 5),
        ('param_1', 6),
        ('param_2', 7),
        ('param_3', 8),
        ('title', 9),
        ('desc', 10),
    )

    # Translate relevant columns from Russian to English
    item_translated = {}
    for key, col in text_columns:
        value = train.iloc[num, col]
        if pd.isna(value):
            item_translated[key] = 'NA'
        else:
            item_translated[key] = translator.translate(value).text

    # pprint so that it is formatted appropriately in the output
    pprint.pprint(item_translated)

    # Display Image
    image_id = train.iloc[num, 15]
    if pd.isna(image_id):
        print('\nImage Missing')
        return

    image_path = train_images_path + image_id + '.jpg'
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread signals an unreadable/missing file by returning None;
        # passing that on to cv2.imshow would raise.
        print('\nImage could not be read: {}'.format(image_path))
        return

    cv2.imshow('Item {}'.format(num), img)
    cv2.waitKey(0)
    cv2.destroyAllWindows()
# Display 4 images and their translated information
for _ in range(4):
    # randrange excludes the upper bound. randint(0, n) can return n itself,
    # which is one past the last valid positional index for iloc.
    num = random.randrange(train.shape[0])
    visualize_translated(num)
# Map every distinct city name in the training sample to its translation
city_counts = train['city'].value_counts()
unique_cities = list(city_counts.index)
cities_translated = {
    city_name: translator.translate(city_name).text
    for city_name in unique_cities
}
# Histogram of column 17 of train over ten equal-width bins spanning [0, 1]
# NOTE(review): column 17 is presumably the target probability - confirm.
bin_edges = np.arange(0.0, 1.1, 0.1)
plt.hist(train.iloc[:, 17], bin_edges, edgecolor='black', linewidth=1.2)
plt.xticks(bin_edges)
# Finish this figure so the bar charts below are not drawn on the same axes
plt.show()

# Parent category plots: number of ads per parent category, English labels
cats_and_counts = train['parent_category_name'].value_counts()
translated_cats = [
    translator.translate(cat).text for cat in cats_and_counts.index
]
# kind must be passed by keyword; Series.plot rejects positional arguments
cats_and_counts.plot(kind='bar').set_xticklabels(translated_cats)
plt.show()

# Make plots: one bar chart of sub-category counts per parent category
for parent_cat, parent_label in zip(cats_and_counts.index, translated_cats):
    subset = train[train['parent_category_name'] == parent_cat]
    subset_cats = subset['category_name'].value_counts()
    translated_subcats = [
        translator.translate(sub_cat).text for sub_cat in subset_cats.index
    ]
    subset_cats.plot(kind='bar').set_xticklabels(translated_subcats)
    plt.title('Parent Category: ' + parent_label, fontsize=20)
    plt.show()
# Read the city population scraped from Wikipedia
pop = pd.read_csv('Population Clean.csv')
pop.columns = ['city', 'popu_count']

# Replace every NaN in the table by the rounded average population count
mean_population = np.round(np.mean(pop['popu_count']))
pop = pop.fillna(mean_population)

# Insert a population column in train and test by looking each ad's city up
# in the population table
city_to_population = pop.set_index('city')['popu_count']
for frame in (train, test):
    frame['population'] = frame['city'].map(city_to_population)
# Percent of missing values in columns
# (bare expression: the result is only displayed in an interactive session)
train.isna().sum() * 100 / train.shape[0]
# Param_2 and Param_3 have 43% and 57% missing values respectively.

# Replace NaNs with empty strings
train.loc[:, ['param_1', 'param_2', 'param_3']] = (
    train.loc[:, ['param_1', 'param_2', 'param_3']].fillna('')
)
train.loc[:, 'description'] = train.loc[:, 'description'].fillna('')

# Replace NaNs in the price column by the average price of the corresponding
# category. Selecting 'price' from the groupby makes transform return a
# Series aligned with train's index.
train['price'] = train.groupby('category_name')['price'].transform(
    lambda prices: prices.fillna(prices.mean())
)

# Combine params since param_2 and param_3 have about 50% empty strings
train['param'] = (
    train['param_1'] + ' ' + train['param_2'] + ' ' + train['param_3']
)
# Remove white spaces from start and end
train['param'] = train['param'].str.strip()
# Collapse runs of two or more spaces (left behind by empty params) into a
# single space. The previous single-space-to-single-space replace did nothing.
train['param'] = train['param'].str.replace(r' {2,}', ' ', regex=True)

# Look at one random description. randrange excludes the upper bound, so the
# position is always valid; iloc makes the lookup positional, not by label.
train['description'].iloc[random.randrange(train.shape[0])]
# After going through many (200+) descriptions, following characters appear in the dataset
# These should be removed/treated separately.
# Multi-character entries come first so they are removed as a unit before
# their individual characters ('/', '\n') are stripped.
chars_to_replace = ['/\n','№','Б/у','\n','☎','✔','✘','☛','☚','•','«','»']
# List of punctuation marks
chars_to_replace.extend(list(string.punctuation))
# List of commonly occurring superscripts and fractions
chars_to_replace.extend(['¹', '²', '³', '½', '⅓', '¼', '⅕', '⅙', '⅐', '⅛', '⅑', '⅒', '⅔', '⅖', '¾', '⅗', '⅜', '⅘', '⅚', '⅝', '⅞'])
# Numbers can occur in various forms in the item description.
chars_to_replace.extend(['⓵','⓶','⓷','⓸','⓹','⓺','⓻','⓼','⓽','⓾'])
chars_to_replace.extend(['Ⓞ','①','②','③','④','⑤','⑥','⑦','⑧','⑨','⑩'])
chars_to_replace.extend(['⓪', '➀', '➁', '➂', '➃', '➄', '➅', '➆', '➇', '➈', '➉'])
chars_to_replace.extend(['⓿', '❶', '❷', '❸', '❹', '❺', '❻', '❼', '❽', '❾', '❿'])
chars_to_replace.extend(['➊', '➋', '➌', '➍', '➎', '➏', '➐', '➑', '➒', '➓' ])
# Symbols like ⒈ and ⑴ are single characters and hence will be replaced altogether.
chars_to_replace.extend(['⒈','⒉', '⒊', '⒋', '⒌', '⒍', '⒎', '⒏', '⒐','⒑'])
chars_to_replace.extend(['⑴', '⑵', '⑶', '⑷', '⑸', '⑹', '⑺', '⑻', '⑼', '⑽'])

# Remove the above characters in description column.
# regex=False makes every entry a literal string. Treated as patterns, '.'
# would match (and delete) every character, and '(', '[', '*', '+', '?' and
# '\\' are not valid regular expressions on their own.
for char in chars_to_replace:
    train['description'] = train['description'].str.replace(char, '', regex=False)