stash base

NJU-SE3 · Mar 24, 2020 · 3ae41e5 · 3ae41e5
1 parent 6a23fa2
commit 3ae41e5
Show file tree

Hide file tree

Showing 11 changed files with 17,148 additions and 18 deletions.
diff --git a/makefile b/makefile
@@ -12,7 +12,7 @@ APP_DOCKER_COMPOSE = docker-compose-app.yml
 DATA_DOCKER_COMPOSE = docker-compose-data.yml
 MONGO_FLAGS = -u root -p mongo -d se
 BACK_FILES = author paper authorCitation
-EXTEND_DOCS = affiliation field conference field
+EXTEND_DOCS = affiliation conference field
 COUNTER_DOCS = counterBase
 
 local-set:

diff --git a/oasis-data/affiliation_post.txt b/oasis-data/affiliation_post.txt
diff --git a/oasis-data/affiliations.txt b/oasis-data/affiliations.txt
diff --git a/oasis-data/analysers/__init__.py b/oasis-data/analysers/__init__.py
diff --git a/oasis-data/analysers/model.py b/oasis-data/analysers/model.py
@@ -0,0 +1,22 @@
+file_names = ['../affiliations.txt', '../conferences.txt']  # relative paths: resolved against the current working directory
+
+
+# Affiliation processing: print a cleaned copy of every line in the file.
+def affiliation_handler(file):
+    """Print each line of `file` with parenthesised text and the characters ( ) . * removed."""
+    import re
+    with open(file) as f:
+        for raw in f:
+            print(re.sub(r'[().*]', "", re.sub(r'\(.*\)', "", raw.strip()).strip()).strip())
+
+
+# Conference processing: print a cleaned copy of every line in the file.
+def conference_handler(file):
+    """Print each line of `file` with parenthesised text and the characters ( ) . * removed."""
+    import re
+    with open(file) as f:
+        for line in f:
+            print(re.sub(r'[().*]', "", re.sub(r'(\(.*\))', "", line.strip()).strip()).strip())
+
+
+affiliation_handler(file_names[0])  # runs whenever the module is executed or imported: prints the cleaned affiliations
diff --git a/oasis-data/conference_analyser.py b/oasis-data/conference_analyser.py
@@ -0,0 +1,121 @@
+import re
+
+
+# Ordinal words ("First" .. "Twentieth") recognised by is_times().
+_ORDINAL_WORDS = frozenset({
+    'First',
+    'Second',
+    'Third',
+    'Fourth',
+    'Fifth',
+    'Sixth',
+    'Seventh',
+    'Eighth',
+    'Ninth',
+    'Tenth',
+    'Eleventh',
+    'Twelfth',
+    'Thirteenth',
+    'Fourteenth',
+    'Fifteenth',
+    'Sixteenth',
+    'Seventeenth',
+    'Eighteenth',
+    'Nineteenth',
+    'Twentieth',
+})
+
+
+def is_times(str):
+    """Return True when `str` is NOT an ordinal word (First..Twentieth).
+
+    The inverted, case-sensitive test makes this usable as a `filter()`
+    predicate that drops ordinals; `str` shadows the builtin but is kept.
+    """
+    return str not in _ORDINAL_WORDS
+
+
+def inc_Num(word):
+    """Return True when `word` has no ASCII digit before its first newline, else False.
+
+    Despite the name, the result is True for digit-free words, so the function
+    works as a `filter()` predicate that drops tokens containing numbers.
+    """
+    return re.match('.*[0-9]', word) is None
+
+
+def subHigh(line):
+    """Blank out 3-100 char runs of digits/capitals/slash/apostrophe delimited by space, '|' or brackets."""
+    # A space is prepended so that a run at the very start of the line has a leading delimiter.
+    return re.sub('[ |(][0-9A-Z/\']{3,100}[ |)|\n]', ' ', ' ' + line)
+
+
+def filter_times(line):
+    """Drop the space-separated words of `line` that `is_times` rejects and re-join the rest."""
+    kept_words = [word for word in line.split(' ') if is_times(word)]
+    return ' '.join(kept_words)
+
+
+def filter_num(line):
+    """Drop the space-separated words of `line` that `inc_Num` rejects and re-join the rest."""
+    kept_words = [word for word in line.split(' ') if inc_Num(word)]
+    return ' '.join(kept_words)
+
+
+def sub_proceedings(line):
+    """Remove "Proceedings" boilerplate (e.g. "Proceedings of the") from `line`.
+
+    Longer alternatives come first so the regex prefers them; any bare
+    "Proceedings" left over is then removed together with trailing spaces.
+    """
+    # Raw strings: '\.' and '\[' are invalid escapes in a normal literal.
+    phrases = (r'Proceedings\.|Proceedings of the Annual |Proceedings of the|'
+               r'Proceedings of|, Proceedings|\[*Proceedings]')
+    return re.sub('Proceedings *', '', re.sub(phrases, '', line))
+
+
+def sub_kuohao(line):
+    """Strip bracketed text: everything from the first "(" to the end of the line, then any "[...]" span."""
+    # `.*` is greedy, so `[)]*` never consumes anything: text after the ")" is dropped as well (newline kept).
+    return re.sub(r'\[.*\]', '', re.sub(r'\(.*[)]*', '', line))
+
+
+def sub_end_High(line):
+    """Replace a trailing ". ABC", ", ABC", "- ABC" (capitals optional) or ", Part..." suffix with a space."""
+    # Order matters: each substitution runs on the result of the previous one.
+    for suffix in (r'\. [A-Z]*$', ', [A-Z]*$', '- [A-Z]*$', ', Part.*$'):
+        line = re.sub(suffix, ' ', line)
+    return line
+
+
+def sub_end(line):
+    """Trim trailing whitespace, then trailing spaces/commas/periods/hyphens, and end with one newline."""
+    return line.rstrip().rstrip(' ,.-') + '\n'
+
+
+def sub_start(line):
+    """Return `line` without any leading run of spaces, periods and hyphens."""
+    return line.lstrip(' .-')
+
+
+# Input file of raw conference names and output file for the cleaned names.
+files = ['conferences.txt', 'conferences_post.txt']
+
+# Load every raw line first; the context manager closes the input file.
+with open(files[0]) as f:
+    raw_data = f.readlines()
+
+# Run each name through the cleaning pipeline and write the result. Opening
+# the output in a `with` block guarantees it is flushed and closed.
+with open(files[-1], 'w') as w:
+    for line in raw_data:
+        res0 = filter_times(line)     # drop ordinal words (First, Second, ...)
+        res1 = filter_num(res0)       # drop tokens that contain digits
+        res2 = sub_proceedings(res1)  # drop "Proceedings ..." boilerplate
+        res3 = sub_end_High(res2)     # drop trailing capitals / ", Part ..."
+        # Cut bracketed text, trim the right edge (re-adding the newline),
+        # then trim leading spaces, periods and hyphens.
+        res = sub_start(sub_end(sub_kuohao(res3)))
+        # Remove any parentheses, asterisks and periods that are left.
+        t = re.sub(r'[()*.]', "", res)
+        w.write(t)
diff --git a/oasis-data/conferences.txt b/oasis-data/conferences.txt