-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathmongoTesting.py
116 lines (93 loc) · 5.51 KB
/
mongoTesting.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import json
import io
import os, fnmatch
import bz2
import pymongo
import re
import nltk
import nltk.data
from nltk.tokenize import RegexpTokenizer
#---------Saving JSON to MongoDB----------------------------------
# Module-level MongoDB connection; save_to_mongo() below looks databases up on it.
client1 = pymongo.MongoClient(host='localhost', port=27017)
def save_to_mongo(data, mongo_db, mongo_db_coll):
    """Insert ``data`` into collection ``mongo_db_coll`` of database ``mongo_db``.

    The database is looked up on the module-level ``client1`` connection.

    Returns:
        Whatever ``Collection.insert`` returns for ``data``.
    """
    target_collection = client1[mongo_db][mongo_db_coll]
    insert_result = target_collection.insert(data)
    return insert_result
#-----------------Finding files in the directory-----------------
def find_files(directory, pattern):
    """Yield the path of every file below ``directory`` whose name matches ``pattern``.

    ``directory`` is walked recursively with ``os.walk``. ``pattern`` is a
    shell-style wildcard (``fnmatch`` syntax) compared against the file's
    base name only, never against the directory part of the path.
    """
    for parent, _subdirs, names in os.walk(directory):
        for matched_name in fnmatch.filter(names, pattern):
            yield os.path.join(parent, matched_name)
#-------------------------------Reading each file and saving from Json to Mongo------------------
# Characters kept when cleaning text: the 52 ASCII letters, space and apostrophe.
# Every letter must be present -- a letter missing from this literal is
# silently stripped from all stored text.
_VALID_LETTERS = frozenset("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ ' ")

# Keys a decoded tweet and its embedded "user" object must carry to be stored.
_REQUIRED_TWEET_KEYS = ('user', 'lang', 'source', 'text')
_REQUIRED_USER_KEYS = ('id', 'name', 'description', 'screen_name', 'created_at',
                       'friends_count', 'followers_count', 'favourites_count',
                       'time_zone')


def _iter_json_objects(lines):
    """Yield each JSON value decoded from ``lines``.

    A line that does not parse on its own is joined with the following
    line(s) until the accumulated text parses. If the input ends while a
    fragment is still incomplete, the fragment is dropped instead of letting
    ``StopIteration`` escape and abort the caller.
    """
    lines = iter(lines)
    for line in lines:
        while True:
            try:
                decoded = json.loads(line)
            except ValueError:
                # Not a complete JSON value yet: glue the next line on.
                try:
                    line += next(lines)
                except StopIteration:
                    return
            else:
                yield decoded
                break


def _clean_last_sentence(sent_tokenizer, text):
    """Return the last sentence of ``text`` reduced to ``_VALID_LETTERS``.

    Returns ``''`` when ``text`` is ``None``/empty or the tokenizer finds no
    sentence, so a value can never carry over from a previously processed
    tweet.
    """
    if not text:
        return ''
    sentences = sent_tokenizer.tokenize(text)
    if not sentences:
        return ''
    # NOTE(review): only the LAST sentence is kept, as in the original loop,
    # which overwrote its result on every iteration -- confirm this is
    # intended rather than joining all sentences.
    return ''.join(char for char in sentences[-1] if char in _VALID_LETTERS)


def _build_record(jfile, sent_tokenizer):
    """Return the document to store for one decoded tweet, or ``None`` to skip it.

    A tweet is skipped when it is not a JSON object, lacks any required
    tweet/user key, is not marked ``lang == "en"``, or its cleaned
    screen name is two characters or shorter.
    """
    if not isinstance(jfile, dict):
        return None
    user = jfile.get('user')
    if not isinstance(user, dict):
        return None
    if not all(key in jfile for key in _REQUIRED_TWEET_KEYS):
        return None
    if not all(key in user for key in _REQUIRED_USER_KEYS):
        return None
    # Cheap language test first, so non-English tweets are never tokenized.
    if jfile["lang"] != "en":
        return None

    screen_name = _clean_last_sentence(sent_tokenizer, user["screen_name"])
    if len(screen_name) <= 2:
        return None

    # "entities" may be absent; store an empty hashtag list instead of crashing.
    entities = jfile.get("entities") or {}
    return {
        'tweet': _clean_last_sentence(sent_tokenizer, jfile["text"]),
        'source': jfile["source"],
        'user_id': user["id"],
        'orig_name': user["name"],
        'name': _clean_last_sentence(sent_tokenizer, user["name"]),
        'orig_screen_name': user["screen_name"],
        'screen_name': screen_name,
        'description': _clean_last_sentence(sent_tokenizer, user["description"]),
        'followers': user["followers_count"],
        'friends': user["friends_count"],
        'created_at': user["created_at"],
        'favourites_count': user["favourites_count"],
        'time_zone': user["time_zone"],
        'hashtags': entities.get("hashtags", []),
        'language': jfile["lang"],
    }


# Load the sentence tokenizer once; it does not change between input lines.
sent_tokenizer = nltk.data.load('nltk:tokenizers/punkt/english.pickle')

# Alternative input directory kept from the original script:
#for filename in find_files('/Users/shrutisarin/Desktop/twitter_datacollect/mongodb-osx-x86_64-2.4.9/bin/TestJson', '*.bz2'):
for filename in find_files('/Users/shivinkapur/Desktop/246Code/TestJson', '*.bz2'):
    # The with-block closes the archive even if processing raises.
    with bz2.BZ2File(filename, "r") as f:
        print(filename)
        for jfile in _iter_json_objects(f):
            json_to_store_into_mongo = _build_record(jfile, sent_tokenizer)
            if json_to_store_into_mongo is None:
                continue
            # Alternative target kept from the original script:
            #save_to_mongo(json_to_store_into_mongo, '246ProjData1', 'twittercoll2')
            save_to_mongo(json_to_store_into_mongo, 'followers_jsonfiles1', 'follower_ids')