-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathwordcloud.py
43 lines (34 loc) · 1.6 KB
/
wordcloud.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
#!/usr/bin/python
import sys
import re
from itertools import groupby
from operator import itemgetter
from collections import Counter
import pandas as pd
import nltk
import json
import string
def main(csv_path):
    """Build word-frequency data from an IEEE-style export CSV.

    Reads the CSV at *csv_path* (must contain 'IEEEKeyword' and 'Abstract'
    columns) and writes two files in the current directory:

    - keywordcount.txt: all author keywords, space-separated, for word-cloud input.
    - abstractcount.txt: lemmatized, stopword-free abstract words, space-separated.

    Also assembles a ``wordcount`` dict with per-keyword and per-abstract-word
    counts (kept for downstream use, e.g. JSON export).
    """
    df = pd.read_csv(csv_path, encoding='utf-8')

    # Keywords within a cell are delimited by ';' or ','; join all rows first.
    # Strip whitespace and drop empties BEFORE counting, so ' data mining' and
    # 'data mining' collapse into a single Counter key (the original counted
    # them separately and only stripped at display time).
    raw = re.split(r'[;,]', df['IEEEKeyword'].str.cat(sep=';'))
    tokens = [t.strip() for t in raw if t.strip()]

    with open('keywordcount.txt', 'w') as outfile:
        outfile.write(' '.join(tokens))

    keyword_counts = Counter(tokens)
    wordcount = {}
    wordcount['keyword'] = [{'text': t, 'count': v} for t, v in keyword_counts.items()]

    # Abstract processing: lowercase, tokenize, drop stopwords/punctuation,
    # keep alphabetic tokens only, then lemmatize.
    stopwords = set(nltk.corpus.stopwords.words('english'))
    punctuations = set(string.punctuation)
    text = df['Abstract'].str.cat(sep=' ').lower()
    tokens = nltk.regexp_tokenize(text, pattern=r'\w+|\$[\d\.]+|\S+')
    tokens = [i for i in tokens
              if i not in stopwords and i not in punctuations and i.isalpha()]
    lemmatizer = nltk.stem.WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(s) for s in tokens]

    # Keep only words appearing more than 10 times, to limit cloud noise.
    wordcount['abstract'] = [item for item in Counter(lemmas).items() if item[1] > 10]

    with open('abstractcount.txt', 'w') as outfile:
        outfile.write(' '.join(lemmas))


if __name__ == "__main__":
    main(sys.argv[1])