-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdblp.py
90 lines (79 loc) · 3.07 KB
/
dblp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
"""DBLP web scraper.
COPIED UNDER MIT LICENSE
https://github.com/sebastianGehrmann/dblp-pub
Slightly modified to support the 'Id' property.
"""
from bs4 import BeautifulSoup
import pandas as pd
import requests
# options
# Default search keywords used when callers do not pass their own query.
STRINGS_FOR_TEST = ["Collaborative Writing"]
# Base URL of the DBLP mirror; all request URLs are derived from it.
DBLP_BASE_URL = 'http://dblp.uni-trier.de/'
# Publication-search endpoint (as opposed to author/venue search).
PUB_SEARCH_URL = DBLP_BASE_URL + "search/publ/"
def query_db(pub_string=STRINGS_FOR_TEST, timeout=30):
    '''
    Returns the BeautifulSoup object of a search query to DBLP.

    :param pub_string: A list of strings of keywords to search for.
    :param timeout: Seconds to wait for the HTTP response; the original
        call had no timeout and could hang indefinitely.
    :return: BeautifulSoup: the parsed HTML of the DBLP result page.
    :raises requests.HTTPError: if DBLP answers with a 4xx/5xx status
        (previously the error page was silently parsed as results).
    :raises requests.Timeout: if the server does not respond in time.
    '''
    resp = requests.get(PUB_SEARCH_URL, params={'q': pub_string}, timeout=timeout)
    # Fail loudly on HTTP errors instead of scraping an error page.
    resp.raise_for_status()
    return BeautifulSoup(resp.content, features="lxml")
def get_pub_data(pub):
    '''
    Extracts the information about a publication from a BeautifulSoup object.

    :param pub: A BeautifulSoup Tag: either a year separator
        (class contains 'year') or a publication <li> entry.
    :return: int (the year) for a year separator, otherwise a dict with
        keys 'Type', 'Link', 'Authors', 'Title', 'Where', 'Id'. Fields
        that cannot be scraped keep the placeholder 'nothing'.
    '''
    ptype = 'nothing'
    link = 'nothing'
    authors = []
    title = 'nothing'
    where = 'nothing'
    if 'year' in pub.get('class'):
        # year is not always scrapable, except for this case. Might be done more elegantly
        return int(pub.contents[0])
    else:
        ptype = pub.attrs.get('class')[1]
        for content_item in pub.contents:
            # NavigableString children (e.g. whitespace) have no .attrs;
            # the original crashed on them with AttributeError.
            class_of_content_item = getattr(content_item, 'attrs', {}).get('class', [0])
            if 'data' in class_of_content_item:
                for author in content_item.findAll('span', attrs={"itemprop": "author"}):
                    authors.append(author.text)
                # Entries without a title span crashed the original
                # (.find(...) returns None); keep the placeholder instead.
                title_tag = content_item.find('span', attrs={"class": "title"})
                if title_tag is not None:
                    title = title_tag.text
                for where_data in content_item.findAll('span', attrs={"itemprop": "isPartOf"}):
                    found_where = where_data.find('span', attrs={"itemprop": "name"})
                    if found_where:
                        where = found_where.text
            elif 'publ' in class_of_content_item:
                link = content_item.contents[0].find('a').attrs.get('href', "nothing")
        return {'Type': ptype,
                'Link': link,
                'Authors': authors,
                'Title': title,
                'Where': where,
                'Id': pub.attrs.get('id'),}
def search(search_string=STRINGS_FOR_TEST):
    '''
    Returns the information found in a search query to dblp as a pandas dataframe.
    Shows the following information:
        - Authors
        - Link to Publication
        - Title
        - Type (Article, Proceedings etc.)
        - Where it was published
        - Year of publication
    :param search_string: A List of Strings of Keywords, that should be searched for
    :return: pd.DataFrame: A Dataframe with all data; empty if the query
        matched nothing (the original crashed with AttributeError then).
    '''
    soup = query_db(search_string)
    pub_list_raw = soup.find("ul", attrs={"class": "publ-list"})
    if pub_list_raw is None:
        # No publ-list on the page means zero results for this query.
        return pd.DataFrame()
    pub_list_data = []
    curr_year = 0
    for child in pub_list_raw.children:
        pub_data = get_pub_data(child)
        if isinstance(pub_data, int):
            # Year separators apply to every following entry until the next one.
            curr_year = pub_data
        else:
            pub_data['Year'] = curr_year
            pub_list_data.append(pub_data)
    return pd.DataFrame(pub_list_data)