-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdblp.py
90 lines (79 loc) · 3.07 KB
/
dblp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
"""DBLP web scraper.
COPIED UNDER MIT LICENSE
https://github.com/sebastianGehrmann/dblp-pub
Slightly modified to support the 'Id' property.
"""
from bs4 import BeautifulSoup
import pandas as pd
import requests
# options
# Default search keywords used when callers do not pass their own query.
STRINGS_FOR_TEST = ["Collaborative Writing"]
# Base URL of the DBLP mirror; all request URLs are derived from it.
DBLP_BASE_URL = 'http://dblp.uni-trier.de/'
# Publication-search endpoint (as opposed to author/venue search).
PUB_SEARCH_URL = DBLP_BASE_URL + "search/publ/"
def query_db(pub_string=STRINGS_FOR_TEST, timeout=30):
    '''
    Returns the BeautifulSoup object of a search query to DBLP.

    :param pub_string: A list of strings of keywords to search for.
    :param timeout: Seconds to wait for the HTTP response; the original
        call had no timeout and could hang indefinitely.
    :return: BeautifulSoup: the parsed HTML of the DBLP result page.
    :raises requests.HTTPError: if DBLP answers with a 4xx/5xx status
        (previously the error page was silently parsed as results).
    :raises requests.Timeout: if the server does not respond in time.
    '''
    resp = requests.get(PUB_SEARCH_URL, params={'q': pub_string}, timeout=timeout)
    # Fail loudly on HTTP errors instead of scraping an error page.
    resp.raise_for_status()
    return BeautifulSoup(resp.content, features="lxml")
def get_pub_data(pub):
    '''
    Extracts the information about a publication from a BeautifulSoup object.

    :param pub: A BeautifulSoup Tag: either a year separator
        (class contains 'year') or a publication <li> entry.
    :return: int (the year) for a year separator, otherwise a dict with
        keys 'Type', 'Link', 'Authors', 'Title', 'Where', 'Id'. Fields
        that cannot be scraped keep the placeholder 'nothing'.
    '''
    ptype = 'nothing'
    link = 'nothing'
    authors = []
    title = 'nothing'
    where = 'nothing'
    if 'year' in pub.get('class'):
        # year is not always scrapable, except for this case. Might be done more elegantly
        return int(pub.contents[0])
    else:
        ptype = pub.attrs.get('class')[1]
        for content_item in pub.contents:
            # NavigableString children (e.g. whitespace) have no .attrs;
            # the original crashed on them with AttributeError.
            class_of_content_item = getattr(content_item, 'attrs', {}).get('class', [0])
            if 'data' in class_of_content_item:
                for author in content_item.findAll('span', attrs={"itemprop": "author"}):
                    authors.append(author.text)
                # Entries without a title span crashed the original
                # (.find(...) returns None); keep the placeholder instead.
                title_tag = content_item.find('span', attrs={"class": "title"})
                if title_tag is not None:
                    title = title_tag.text
                for where_data in content_item.findAll('span', attrs={"itemprop": "isPartOf"}):
                    found_where = where_data.find('span', attrs={"itemprop": "name"})
                    if found_where:
                        where = found_where.text
            elif 'publ' in class_of_content_item:
                link = content_item.contents[0].find('a').attrs.get('href', "nothing")
        return {'Type': ptype,
                'Link': link,
                'Authors': authors,
                'Title': title,
                'Where': where,
                'Id': pub.attrs.get('id'),}
def search(search_string=STRINGS_FOR_TEST):
    '''
    Returns the information found in a search query to dblp as a pandas dataframe.
    Shows the following information:
        - Authors
        - Link to Publication
        - Title
        - Type (Article, Proceedings etc.)
        - Where it was published
        - Year of publication
    :param search_string: A List of Strings of Keywords, that should be searched for
    :return: pd.DataFrame: A Dataframe with all data; empty if the query
        matched nothing (the original crashed with AttributeError then).
    '''
    soup = query_db(search_string)
    pub_list_raw = soup.find("ul", attrs={"class": "publ-list"})
    if pub_list_raw is None:
        # No publ-list on the page means zero results for this query.
        return pd.DataFrame()
    pub_list_data = []
    curr_year = 0
    for child in pub_list_raw.children:
        pub_data = get_pub_data(child)
        if isinstance(pub_data, int):
            # Year separators apply to every following entry until the next one.
            curr_year = pub_data
        else:
            pub_data['Year'] = curr_year
            pub_list_data.append(pub_data)
    return pd.DataFrame(pub_list_data)