Skip to content

Commit

Permalink
Merge branch 'master' of https://github.com/jcklie/wikimapper
Browse files Browse the repository at this point in the history
  • Loading branch information
jcklie committed Apr 21, 2022
2 parents 72e8142 + fadf266 commit 750aa2f
Show file tree
Hide file tree
Showing 13 changed files with 51 additions and 54 deletions.
7 changes: 0 additions & 7 deletions .coveragerc

This file was deleted.

29 changes: 29 additions & 0 deletions .github/workflows/run_tests.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# GitHub Actions CI workflow: run the pytest suite for wikimapper.
name: Run Tests

# Trigger on direct pushes to master and on pull requests targeting master.
on:
  push:
    branches: [ master ]
  pull_request:
    branches: [ master ]

jobs:
  build:

    # Fan out over the OS / Python-version matrix below (2 OSes x 3 versions = 6 jobs).
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os: [ubuntu-latest, windows-latest]
        python-version: [3.7, 3.8, 3.9]

    steps:
      # NOTE(review): actions/checkout@v2 and actions/setup-python@v2 run on a
      # deprecated Node.js runtime; consider upgrading to @v4 — confirm first.
      - uses: actions/checkout@v2
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install dependencies
        run: |
          # Editable install with the [test] extra (pytest etc., per setup.py).
          pip install --upgrade -e .[test]
      - name: Run tests
        run: |
          pytest
14 changes: 0 additions & 14 deletions .travis.yml

This file was deleted.

5 changes: 5 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,10 @@ black:
black -l 100 wikimapper/
black -l 100 tests/

isort:
isort --profile black wikimapper/ tests/

format: black isort

html:
cd docs && make html
4 changes: 1 addition & 3 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,14 +22,12 @@
install_requires=[]

test_dependencies = [
"tox",
"pytest",
"codecov",
"pytest-cov",
]

dev_dependencies = [
"black",
"isort",
"twine",
"pygments",
"wheel"
Expand Down
6 changes: 2 additions & 4 deletions tests/fixtures.py → tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,15 @@
from collections import namedtuple
import os

import pytest

from wikimapper import download_wikidumps, create_index, WikiMapper

from wikimapper import WikiMapper, create_index, download_wikidumps

Wiki = namedtuple("Wiki", ["dumpname", "path"])


@pytest.fixture(scope="package")
def bavarian_wiki_dump(tmpdir_factory) -> Wiki:
""" We download the Bavarian Wiki, as it is quite small. """
"""We download the Bavarian Wiki, as it is quite small."""

dumpname = "barwiki-latest"
path = tmpdir_factory.mktemp("dumps").strpath
Expand Down
4 changes: 1 addition & 3 deletions tests/test_mapper.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
import pytest

from tests.fixtures import *

BAVARIAN_PARAMS = [
pytest.param("Stoaboog", "Q168327"),
pytest.param("Wechslkrod", "Q243242"),
Expand All @@ -12,7 +10,7 @@
pytest.param("Quadrátkilometa", "Q25343", id="Has redirect"),
pytest.param("D'_boarische_Woocha", "Q20616808", id="Has special character"),
pytest.param("I am not in the Wiki", None, id="Title not in the wiki"),
pytest.param("Pergentinus_und_Laurentinus", None, id="In the index, but not mapped"),
pytest.param("tungsten", None, id="In the index, but not mapped"),
]


Expand Down
2 changes: 0 additions & 2 deletions tests/test_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@

from wikimapper import create_index

from tests.fixtures import *


def test_create_index(tmpdir, bavarian_wiki_dump):
path_to_db = tmpdir.mkdir("processor").join("index_test.db").strpath
Expand Down
9 changes: 0 additions & 9 deletions tox.ini

This file was deleted.

6 changes: 3 additions & 3 deletions wikimapper/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
import logging
import os

from wikimapper import WikiMapper, create_index, download_wikidumps
from wikimapper.__version__ import __version__
from wikimapper import download_wikidumps, create_index, WikiMapper


def main():
Expand Down Expand Up @@ -126,15 +126,15 @@ def main():


def _dir_path(path) -> str:
""" Checks whether `path` is a valid path to a directory. """
"""Checks whether `path` is a valid path to a directory."""
if os.path.isdir(path):
return path
else:
raise argparse.ArgumentTypeError(f"readable_dir:{path} is not a valid path to a directory!")


def _dump_name(name) -> str:
""" Checks whether `name` is a valid Wikipedia dump name. """
"""Checks whether `name` is a valid Wikipedia dump name."""
parts = name.split("-")
err = lambda: argparse.ArgumentTypeError(f"dumpname: [{name}] is not a valid dump name")

Expand Down
4 changes: 2 additions & 2 deletions wikimapper/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ def _report_hook(count: int, block_size: int, total_size: int):


def _download_file(url: str, target: str, overwrite: bool):
""" Downloads the content identified by `url` and saves it in `target`."""
"""Downloads the content identified by `url` and saves it in `target`."""
if os.path.exists(target) and not overwrite:
_logger.info("[%s] already exists, skipping downloading [%s]!", target, url)
return
Expand All @@ -29,7 +29,7 @@ def _download_file(url: str, target: str, overwrite: bool):
def download_wikidumps(
dumpname: str, path: str, mirror: str = "https://dumps.wikimedia.org/", overwrite: bool = False
):
""" Downloads pages, page props and redirect SQL dumps for the dump
"""Downloads pages, page props and redirect SQL dumps for the dump
specified by `dumpname` to the folder `path`. If `overwrite` is true,
then it is downloaded again even if the files already exist.
Expand Down
8 changes: 4 additions & 4 deletions wikimapper/mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,13 @@


class WikiMapper:
""" Uses a precomputed database created by `create_wikipedia_wikidata_mapping_db`. """
"""Uses a precomputed database created by `create_wikipedia_wikidata_mapping_db`."""

def __init__(self, path_to_db: str):
self._path_to_db = path_to_db

def title_to_id(self, page_title: str) -> Optional[str]:
""" Given a Wikipedia page title, returns the corresponding Wikidata ID.
"""Given a Wikipedia page title, returns the corresponding Wikidata ID.
The page title is the last part of a Wikipedia url **unescaped** and spaces
replaced by underscores , e.g. for `https://en.wikipedia.org/wiki/Fermat%27s_Last_Theorem`,
Expand All @@ -35,7 +35,7 @@ def title_to_id(self, page_title: str) -> Optional[str]:
return None

def url_to_id(self, wiki_url: str) -> Optional[str]:
""" Given an URL to a Wikipedia page, returns the corresponding Wikidata ID.
"""Given an URL to a Wikipedia page, returns the corresponding Wikidata ID.
This is just a convenience function. It is not checked whether the index and
URL are from the same dump.
Expand All @@ -53,7 +53,7 @@ def url_to_id(self, wiki_url: str) -> Optional[str]:
return self.title_to_id(title)

def id_to_titles(self, wikidata_id: str) -> List[str]:
""" Given a Wikidata ID, return a list of corresponding pages that are linked to it.
"""Given a Wikidata ID, return a list of corresponding pages that are linked to it.
Due to redirects, the mapping from Wikidata ID to Wikipedia title is not unique.
Expand Down
7 changes: 4 additions & 3 deletions wikimapper/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,11 @@
"""

import csv
import ctypes as ct
import gzip
import logging
import os
import sqlite3
import sys

_logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -91,7 +91,7 @@ def _parse_values(values):


def create_index(dumpname: str, path_to_dumps: str, path_to_db: str = None) -> str:
""" Creates an index mapping Wikipedia page titles to Wikidata IDs and vice versa.
"""Creates an index mapping Wikipedia page titles to Wikidata IDs and vice versa.
This requires a previously downloaded dump `dumpname` in `path_to_dumps`.
Args:
Expand All @@ -114,7 +114,8 @@ def create_index(dumpname: str, path_to_dumps: str, path_to_db: str = None) -> s
page_props_dump = os.path.join(path_to_dumps, dumpname + "-page_props.sql.gz")
redirects_dump = os.path.join(path_to_dumps, dumpname + "-redirect.sql.gz")

csv.field_size_limit(sys.maxsize)
# https://stackoverflow.com/a/54517228
csv.field_size_limit(int(ct.c_ulong(-1).value // 2))

# (Re)Create the database file
try:
Expand Down

0 comments on commit 750aa2f

Please sign in to comment.