Skip to content
This repository has been archived by the owner on Aug 21, 2020. It is now read-only.

Commit

Permalink
stash base
Browse files Browse the repository at this point in the history
  • Loading branch information
CaribouW committed Mar 24, 2020
1 parent 6a23fa2 commit 3ae41e5
Show file tree
Hide file tree
Showing 11 changed files with 17,148 additions and 18 deletions.
2 changes: 1 addition & 1 deletion makefile
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ APP_DOCKER_COMPOSE = docker-compose-app.yml
DATA_DOCKER_COMPOSE = docker-compose-data.yml
MONGO_FLAGS = -u root -p mongo -d se
BACK_FILES = author paper authorCitation
EXTEND_DOCS = affiliation field conference field
EXTEND_DOCS = affiliation conference field
COUNTER_DOCS = counterBase

local-set:
Expand Down
6,994 changes: 6,994 additions & 0 deletions oasis-data/affiliation_post.txt

Large diffs are not rendered by default.

6,994 changes: 6,994 additions & 0 deletions oasis-data/affiliations.txt

Large diffs are not rendered by default.

Empty file.
22 changes: 22 additions & 0 deletions oasis-data/analysers/model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Paths of the raw input lists read by the handlers in this script.
file_names = [
    '../affiliations.txt',
    '../conferences.txt',
]


# Affiliation handling
def affiliation_handler(file):
    """Print a cleaned-up version of every line in ``file``.

    Each line is stripped of surrounding whitespace, the span from the first
    ``(`` to the last ``)`` on the line is removed, and any remaining ``(``,
    ``)``, ``.`` or ``*`` characters are dropped. The cleaned text is printed
    to stdout, one entry per input line.

    Args:
        file: Path of the text file to read.
    """
    # Imported once per call instead of once per line.
    import re

    with open(file) as f:
        for line in f:
            # Remove the parenthesised part first, then stray punctuation.
            without_parens = re.sub(r'\(.*\)', "", line.strip()).strip()
            t = re.sub(r'[().*]', "", without_parens)
            print(t.strip())


# Conference handling
def conference_handler(file):
    """Print a cleaned-up version of every line in ``file``.

    Each line is stripped of surrounding whitespace, the span from the first
    ``(`` to the last ``)`` on the line is removed, and any remaining ``(``,
    ``)``, ``.`` or ``*`` characters are dropped. The cleaned text is printed
    to stdout, one entry per input line.

    Previously the cleaned value was computed and then discarded, and the raw
    line was printed instead; the cleaned value is now what gets printed,
    matching ``affiliation_handler``.

    Args:
        file: Path of the text file to read.
    """
    # Imported once per call instead of once per line.
    import re

    with open(file) as f:
        for line in f:
            # Remove the parenthesised part first, then stray punctuation.
            without_parens = re.sub(r'\(.*\)', "", line.strip()).strip()
            sub = re.sub(r'[().*]', "", without_parens)
            print(sub.strip())


# Script entry: run the affiliation clean-up on the first configured file.
affiliation_handler(file_names[0])
121 changes: 121 additions & 0 deletions oasis-data/conference_analyser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
import re


# English ordinal words, 'First' through 'Twentieth'. Built once at import
# time rather than on every call.
_ORDINAL_WORDS = frozenset({
    'First',
    'Second',
    'Third',
    'Fourth',
    'Fifth',
    'Sixth',
    'Seventh',
    'Eighth',
    'Ninth',
    'Tenth',
    'Eleventh',
    'Twelfth',
    'Thirteenth',
    'Fourteenth',
    'Fifteenth',
    'Sixteenth',
    'Seventeenth',
    'Eighteenth',
    'Nineteenth',
    'Twentieth',
})


def is_times(str):
    """Return True when ``str`` is NOT one of the known ordinal words.

    Despite the name, this is a "keep" predicate: it is shaped for use with
    ``filter`` so that ordinal words are dropped. The comparison is exact and
    case-sensitive.

    Args:
        str: The word to test. The name shadows the builtin but is kept so
            existing keyword callers continue to work.

    Returns:
        False if ``str`` is an ordinal word from 'First' to 'Twentieth',
        True otherwise.
    """
    return str not in _ORDINAL_WORDS


def inc_Num(word):
    """Return True when ``word`` has no digit on its first line.

    This is a "keep" predicate shaped for ``filter``: words containing a
    digit yield False so they get dropped.
    """
    return not re.match('.*[0-9]+.*', word)


def subHigh(line):
    """Replace delimited runs of 3-100 capitals/digits/``/``/``'`` with a space.

    A single space is prepended to ``line`` so that a run at the very start
    has a leading delimiter to match against. The run must be preceded by a
    space, ``|`` or ``(`` and followed by a space, ``|``, ``)`` or newline;
    the delimiters are consumed along with the run. Note that the result
    keeps the prepended space when nothing at the start matches.
    """
    padded = ' ' + line
    return re.sub('[ |(][0-9A-Z/\']{3,100}[ |)|\n]', ' ', padded)


def filter_times(line):
    """Drop every space-separated token of ``line`` that ``is_times`` rejects.

    The line is split on single spaces and the surviving tokens are joined
    back together with single spaces.
    """
    kept = [word for word in line.split(' ') if is_times(word)]
    return ' '.join(kept)


def filter_num(line):
    """Drop every space-separated token of ``line`` that ``inc_Num`` rejects.

    The line is split on single spaces and the surviving tokens are joined
    back together with single spaces.
    """
    kept = [word for word in line.split(' ') if inc_Num(word)]
    return ' '.join(kept)


def sub_proceedings(line):
    """Remove "Proceedings" boilerplate from ``line``.

    A first pass deletes the longer phrasings ("Proceedings of the Annual ",
    "Proceedings of the", ", Proceedings", "[Proceedings]", ...); alternatives
    are tried in the order listed, so the more specific phrases win. A second
    pass deletes any remaining bare "Proceedings" together with the spaces
    that follow it.

    Args:
        line: The text to clean.

    Returns:
        ``line`` with the matched phrases removed.
    """
    # Raw strings: '\.' and '\[' are invalid escapes in ordinary literals
    # and trigger a SyntaxWarning on recent Python versions.
    res0 = re.sub(
        r'Proceedings\.|'
        r'Proceedings of the Annual |'
        r'Proceedings of the|'
        r'Proceedings of|'
        r', Proceedings|'
        r'\[*Proceedings]',
        '', line)
    return re.sub(r'Proceedings *', '', res0)


def sub_kuohao(line):
    """Remove parenthesised and square-bracketed text from ``line``.

    Everything from the first ``(`` up to the end of that line (a trailing
    newline is kept) is removed, whether or not the parenthesis is closed.
    After that, the span from the first ``[`` to the last ``]`` is removed.

    Args:
        line: The text to clean.

    Returns:
        ``line`` with the bracketed portions removed.
    """
    # Raw strings: '\(' and '\[' are invalid escapes in ordinary literals
    # and trigger a SyntaxWarning on recent Python versions.
    res = re.sub(r'\(.*[)]*', '', line)
    return re.sub(r'\[.*\]', '', res)


def sub_end_High(line):
    """Replace a trailing capitalised suffix or ", Part..." tail with a space.

    Applied in order at the end of ``line`` (before a trailing newline, if
    any):

    1. ``". "`` followed by zero or more capital letters
    2. ``", "`` followed by zero or more capital letters
    3. ``"- "`` followed by zero or more capital letters
    4. ``", Part"`` and everything after it

    Args:
        line: The text to clean.

    Returns:
        ``line`` with each matched tail replaced by a single space.
    """
    # Raw strings: '\.' is an invalid escape in an ordinary literal and
    # triggers a SyntaxWarning on recent Python versions.
    res = re.sub(r'\. [A-Z]*$', ' ', line)
    res = re.sub(r', [A-Z]*$', ' ', res)
    res = re.sub(r'- [A-Z]*$', ' ', res)
    res = re.sub(r', Part.*$', ' ', res)
    return res


def sub_end(line):
    """Trim the tail of ``line`` and terminate it with exactly one newline.

    Trailing whitespace is removed first, then any trailing run of spaces,
    commas, dots and hyphens.
    """
    without_whitespace = line.rstrip()
    trimmed = without_whitespace.rstrip(' ,.-')
    return trimmed + '\n'


def sub_start(line):
    """Return ``line`` without any leading spaces, dots or hyphens."""
    leading_junk = ' .-'
    return line.lstrip(leading_junk)


# files[0] is the raw input list, files[-1] the cleaned output.
files = ['conferences.txt', 'conferences_post.txt']

# Run every input line through the clean-up pipeline and write the result.
# The ``with`` block guarantees both files are closed (and the output
# flushed) even if a step raises.
with open(files[0]) as f, open(files[-1], 'w') as w:
    for line in f:
        res0 = filter_times(line)       # drop ordinal words
        res1 = filter_num(res0)         # drop tokens containing digits
        res2 = sub_proceedings(res1)    # drop "Proceedings ..." phrases
        res3 = sub_end_High(res2)       # drop trailing capitalised suffixes
        # Remove bracketed text, then trim both ends; sub_end re-adds the
        # newline so each entry stays on its own line.
        res = sub_start(sub_end(sub_kuohao(res3)))
        # Finally strip any leftover parentheses, asterisks and dots.
        t = re.sub(r'[()*.]', "", res)
        w.write(t)
1,492 changes: 1,492 additions & 0 deletions oasis-data/conferences.txt

Large diffs are not rendered by default.

Loading

0 comments on commit 3ae41e5

Please sign in to comment.