From e79e9804f1d6f901ff3855a4639f402eef3fde07 Mon Sep 17 00:00:00 2001 From: Freddy Heppell Date: Tue, 9 Jul 2024 21:29:32 +0100 Subject: [PATCH] rename main package --- docs/advanced/library.md | 14 ++++++------ docs/advanced/multilingual.md | 8 +++---- docs/api/downloader.md | 6 ++--- docs/api/extractor.md | 10 ++++----- docs/usage/extract.md | 2 +- mkdocs.yml | 2 +- pyproject.toml | 4 ++-- src/{extractor => wpextract}/__init__.py | 0 src/{extractor => wpextract}/cli/__init__.py | 0 src/{extractor => wpextract}/cli/_dl.py | 6 ++--- src/{extractor => wpextract}/cli/_extract.py | 6 ++--- src/{extractor => wpextract}/cli/_shared.py | 0 src/{extractor => wpextract}/cli/cli.py | 4 ++-- .../dl/ORIGINAL_LICENSE | 0 src/{extractor => wpextract}/dl/__init__.py | 0 src/{extractor => wpextract}/dl/downloader.py | 8 +++---- src/{extractor => wpextract}/dl/exceptions.py | 0 src/{extractor => wpextract}/dl/exporter.py | 2 +- .../dl/requestsession.py | 0 src/{extractor => wpextract}/dl/utils.py | 0 src/{extractor => wpextract}/dl/wpapi.py | 6 ++--- src/{extractor => wpextract}/extract.py | 22 +++++++++---------- .../extractors/__init__.py | 0 .../extractors/categories.py | 6 ++--- .../extractors/data/__init__.py | 0 .../extractors/data/images.py | 2 +- .../extractors/data/link_resolver.py | 4 ++-- .../extractors/data/links.py | 0 src/{extractor => wpextract}/extractors/io.py | 0 .../extractors/media.py | 6 ++--- .../extractors/pages.py | 10 ++++----- .../extractors/posts.py | 20 ++++++++--------- .../extractors/tags.py | 6 ++--- .../extractors/users.py | 2 +- .../parse/__init__.py | 0 src/{extractor => wpextract}/parse/content.py | 8 +++---- src/{extractor => wpextract}/parse/html.py | 2 +- .../parse/translations/__init__.py | 0 .../parse/translations/_extractor.py | 2 +- .../parse/translations/_pickers.py | 12 +++++----- .../parse/translations/_resolver.py | 2 +- .../scrape/__init__.py | 0 .../scrape/crawler.py | 2 +- .../scrape/processor.py | 0 src/{extractor => wpextract}/scrape/scrape.py | 0 src/{extractor => wpextract}/util/__init__.py | 0 src/{extractor => wpextract}/util/args.py | 0 src/{extractor => wpextract}/util/file.py | 0 src/{extractor => wpextract}/util/locale.py | 2 +- src/{extractor => wpextract}/util/str.py | 0 tests/extract/test_extract.py | 2 +- tests/extractors/data/test_images.py | 4 ++-- tests/extractors/data/test_link_resolver.py | 4 ++-- tests/extractors/data/test_links.py | 2 +- tests/extractors/test_categories.py | 4 ++-- tests/extractors/test_io.py | 2 +- tests/extractors/test_media.py | 4 ++-- tests/extractors/test_pages.py | 4 ++-- tests/extractors/test_posts.py | 12 +++++----- tests/extractors/test_tags.py | 4 ++-- tests/extractors/test_users.py | 2 +- tests/parse/test_content.py | 6 ++--- tests/parse/translations/test_pickers.py | 4 ++-- tests/util/test_args.py | 2 +- tests/util/test_file.py | 2 +- tests/util/test_locale.py | 2 +- tests/util/test_str.py | 2 +- 67 files changed, 124 insertions(+), 124 deletions(-) rename src/{extractor => wpextract}/__init__.py (100%) rename src/{extractor => wpextract}/cli/__init__.py (100%) rename src/{extractor => wpextract}/cli/_dl.py (96%) rename src/{extractor => wpextract}/cli/_extract.py (88%) rename src/{extractor => wpextract}/cli/_shared.py (100%) rename src/{extractor => wpextract}/cli/cli.py (92%) rename src/{extractor => wpextract}/dl/ORIGINAL_LICENSE (100%) rename src/{extractor => wpextract}/dl/__init__.py (100%) rename src/{extractor => wpextract}/dl/downloader.py (96%) rename src/{extractor => wpextract}/dl/exceptions.py (100%) rename src/{extractor => wpextract}/dl/exporter.py (99%) rename src/{extractor => wpextract}/dl/requestsession.py (100%) rename src/{extractor => wpextract}/dl/utils.py (100%) rename src/{extractor => wpextract}/dl/wpapi.py (99%) rename src/{extractor => wpextract}/extract.py (90%) rename src/{extractor => wpextract}/extractors/__init__.py (100%) rename src/{extractor => wpextract}/extractors/categories.py (87%) rename src/{extractor => wpextract}/extractors/data/__init__.py (100%) rename src/{extractor => wpextract}/extractors/data/images.py (96%) rename src/{extractor => wpextract}/extractors/data/link_resolver.py (95%) rename src/{extractor => wpextract}/extractors/data/links.py (100%) rename src/{extractor => wpextract}/extractors/io.py (100%) rename src/{extractor => wpextract}/extractors/media.py (95%) rename src/{extractor => wpextract}/extractors/pages.py (89%) rename src/{extractor => wpextract}/extractors/posts.py (92%) rename src/{extractor => wpextract}/extractors/tags.py (85%) rename src/{extractor => wpextract}/extractors/users.py (94%) rename src/{extractor => wpextract}/parse/__init__.py (100%) rename src/{extractor => wpextract}/parse/content.py (94%) rename src/{extractor => wpextract}/parse/html.py (94%) rename src/{extractor => wpextract}/parse/translations/__init__.py (100%) rename src/{extractor => wpextract}/parse/translations/_extractor.py (95%) rename src/{extractor => wpextract}/parse/translations/_pickers.py (92%) rename src/{extractor => wpextract}/parse/translations/_resolver.py (88%) rename src/{extractor => wpextract}/scrape/__init__.py (100%) rename src/{extractor => wpextract}/scrape/crawler.py (98%) rename src/{extractor => wpextract}/scrape/processor.py (100%) rename src/{extractor => wpextract}/scrape/scrape.py (100%) rename src/{extractor => wpextract}/util/__init__.py (100%) rename src/{extractor => wpextract}/util/args.py (100%) rename src/{extractor => wpextract}/util/file.py (100%) rename src/{extractor => wpextract}/util/locale.py (96%) rename src/{extractor => wpextract}/util/str.py (100%) diff --git a/docs/advanced/library.md b/docs/advanced/library.md index 734d4f3..591f31b 100644 --- a/docs/advanced/library.md +++ b/docs/advanced/library.md @@ -4,24 +4,24 @@ The extractor can also be used as a library instead of on the command line. Typically, you would: -- instantiate a [`WPDownloader`][extractor.WPDownloader] instance and call its [`download`][extractor.WPDownloader.download] method. -- instantiate a [`WPExtractor`][extractor.WPExtractor] instance and call its `extract` method. The dataframes can then be accessed as class attributes or exported with the `export` method. +- instantiate a [`WPDownloader`][wpextract.WPDownloader] instance and call its [`download`][wpextract.WPDownloader.download] method. +- instantiate a [`WPExtractor`][wpextract.WPExtractor] instance and call its `extract` method. The dataframes can then be accessed as class attributes or exported with the `export` method. -Examples of usage are available in the CLI scripts in the `extractor.cli` module. +Examples of usage are available in the CLI scripts in the `wpextract.cli` module. ## Downloader -Use the [`extractor.WPDownloader`][extractor.WPDownloader] class. +Use the [`wpextract.WPDownloader`][wpextract.WPDownloader] class. Possible customisations include: -- Implement highly custom request behaviour by subclassing [`RequestSession`][extractor.dl.RequestSession] and passing to the `session` parameter. +- Implement highly custom request behaviour by subclassing [`RequestSession`][wpextract.dl.RequestSession] and passing to the `session` parameter. ## Extractor -Use the [`extractor.WPExtractor`][extractor.WPExtractor] class. +Use the [`wpextract.WPExtractor`][wpextract.WPExtractor] class. -When using this approach, it's possible to use [customised translation pickers](../advanced/multilingual.md#adding-support) by passing subclasses of [`LanguagePicker`][extractor.parse.translations.LangPicker] to the +When using this approach, it's possible to use [customised translation pickers](../advanced/multilingual.md#adding-support) by passing subclasses of [`LanguagePicker`][wpextract.parse.translations.LangPicker] to the diff --git a/docs/advanced/multilingual.md b/docs/advanced/multilingual.md index b29a209..09346e0 100644 --- a/docs/advanced/multilingual.md +++ b/docs/advanced/multilingual.md @@ -58,14 +58,14 @@ Currently the following plugins are supported: !!! info "See also" [Using WPextract as a library](library.md) for information on how to run wpextract as a library using additional pickers. -Support can be added by creating a new picker definition inheriting from [`LangPicker`][extractor.parse.translations.LangPicker]. +Support can be added by creating a new picker definition inheriting from [`LangPicker`][wpextract.parse.translations.LangPicker]. This parent class defines two abstract methods which must be implemented: -- [`LangPicker.get_root`][extractor.parse.translations.LangPicker.get_root] - returns the root element of the picker -- [`LangPicker.extract`][extractor.parse.translations.LangPicker.extract] - find the languages, call [`LangPicker.set_current_lang`][extractor.parse.translations.LangPicker.set_current_lang] and call [`LangPicker.add_translation`][extractor.parse.translations.LangPicker.add_translation] for each +- [`LangPicker.get_root`][wpextract.parse.translations.LangPicker.get_root] - returns the root element of the picker +- [`LangPicker.extract`][wpextract.parse.translations.LangPicker.extract] - find the languages, call [`LangPicker.set_current_lang`][wpextract.parse.translations.LangPicker.set_current_lang] and call [`LangPicker.add_translation`][wpextract.parse.translations.LangPicker.add_translation] for each -More complicted pickers may need to override additional methods of the class, but should still ultimately populate the [`LangPicker.translations`][extractor.parse.translations.LangPicker.translations] and [`LangPicker.current_language`][extractor.parse.translations.LangPicker.current_language] attributes as the parent class does. +More complicted pickers may need to override additional methods of the class, but should still ultimately populate the [`LangPicker.translations`][wpextract.parse.translations.LangPicker.translations] and [`LangPicker.current_language`][wpextract.parse.translations.LangPicker.current_language] attributes as the parent class does. This section will show implementing a new picker with the following simplified markup: diff --git a/docs/api/downloader.md b/docs/api/downloader.md index 502fc41..440c652 100644 --- a/docs/api/downloader.md +++ b/docs/api/downloader.md @@ -2,12 +2,12 @@ ## Downloading -::: extractor.WPDownloader +::: wpextract.WPDownloader ## Configuring Request Behaviour -::: extractor.dl.RequestSession +::: wpextract.dl.RequestSession options: members: false -::: extractor.dl.requestsession.AuthorizationType +::: wpextract.dl.requestsession.AuthorizationType diff --git a/docs/api/extractor.md b/docs/api/extractor.md index 59f81fa..b2f3181 100644 --- a/docs/api/extractor.md +++ b/docs/api/extractor.md @@ -1,23 +1,23 @@ # Extractor API ## Extraction -::: extractor.WPExtractor +::: wpextract.WPExtractor ## Extraction Data -::: extractor.extractors.data.links +::: wpextract.extractors.data.links options: show_root_heading: false show_root_toc_entry: false ## Multilingual Extraction -::: extractor.parse.translations.LangPicker +::: wpextract.parse.translations.LangPicker -::: extractor.parse.translations.PickerListType +::: wpextract.parse.translations.PickerListType -::: extractor.parse.translations.TranslationLink +::: wpextract.parse.translations.TranslationLink options: inherited_members: - destination diff --git a/docs/usage/extract.md b/docs/usage/extract.md index a46bcb4..45da793 100644 --- a/docs/usage/extract.md +++ b/docs/usage/extract.md @@ -59,7 +59,7 @@ The extraction process is applied to all posts simultaneously in the following o 2. Parse the HTML content from the API response input. 3. Parse the HTML content from the scrape file, if it was found for the link during the crawl 4. Extract the post's language and translations from the scrape file - * Translations are detected using the translation pickers (implementing [`LangPicker`][extractor.parse.translations.LangPicker]) + * Translations are detected using the translation pickers (implementing [`LangPicker`][wpextract.parse.translations.LangPicker]) * Custom pickers can be added if using this tool as a library * Any extracted translations are stored as unresolved links 5. Add the post's link to the link registry diff --git a/mkdocs.yml b/mkdocs.yml index 2f2469c..8a89ed5 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -66,7 +66,7 @@ plugins: ignore_init_summary: true docstring_section_style: spacy # filters: ["!^_"] - preload_modules: ["extractor"] + preload_modules: ["wpextract"] heading_level: 3 inherited_members: true merge_init_into_class: true diff --git a/pyproject.toml b/pyproject.toml index 223cf59..b7a1c9b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ version="1.0.0a0" description="Create a dataset from the WordPress API" authors=["Freddy Heppell "] packages=[ - { include = "extractor", from = "src"} + { include = "wpextract", from = "src"} ] homepage="https://gatenlp.github.io/wordpress-site-extractor/" repository="https://github.com/GateNLP/wordpress-site-extractor" @@ -12,7 +12,7 @@ license="Apache-2.0" readme = "README.md" [tool.poetry.scripts] -wpextract = "extractor.cli.cli:main" +wpextract = "wpextract.cli.cli:main" # Workaround for https://github.com/python-poetry/poetry/issues/9293 [[tool.poetry.source]] diff --git a/src/extractor/__init__.py b/src/wpextract/__init__.py similarity index 100% rename from src/extractor/__init__.py rename to src/wpextract/__init__.py diff --git a/src/extractor/cli/__init__.py b/src/wpextract/cli/__init__.py similarity index 100% rename from src/extractor/cli/__init__.py rename to src/wpextract/cli/__init__.py diff --git a/src/extractor/cli/_dl.py b/src/wpextract/cli/_dl.py similarity index 96% rename from src/extractor/cli/_dl.py rename to src/wpextract/cli/_dl.py index 46753d9..5faeb0a 100644 --- a/src/extractor/cli/_dl.py +++ b/src/wpextract/cli/_dl.py @@ -1,8 +1,8 @@ from argparse import Namespace -from extractor.cli._shared import _register_shared -from extractor.dl.downloader import WPDownloader -from extractor.dl.requestsession import RequestSession +from wpextract.cli._shared import _register_shared +from wpextract.dl.downloader import WPDownloader +from wpextract.dl.requestsession import RequestSession dl_types = ["categories", "media", "pages", "posts", "tags", "users"] diff --git a/src/extractor/cli/_extract.py b/src/wpextract/cli/_extract.py similarity index 88% rename from src/extractor/cli/_extract.py rename to src/wpextract/cli/_extract.py index fa69bb7..3b26456 100644 --- a/src/extractor/cli/_extract.py +++ b/src/wpextract/cli/_extract.py @@ -1,6 +1,6 @@ -from extractor.cli._shared import _register_shared -from extractor.extract import WPExtractor -from extractor.util.args import directory, empty_directory +from wpextract.cli._shared import _register_shared +from wpextract.extract import WPExtractor +from wpextract.util.args import directory, empty_directory def register_extract_parser(subparsers): diff --git a/src/extractor/cli/_shared.py b/src/wpextract/cli/_shared.py similarity index 100% rename from src/extractor/cli/_shared.py rename to src/wpextract/cli/_shared.py diff --git a/src/extractor/cli/cli.py b/src/wpextract/cli/cli.py similarity index 92% rename from src/extractor/cli/cli.py rename to src/wpextract/cli/cli.py index fe02188..cebf4b9 100644 --- a/src/extractor/cli/cli.py +++ b/src/wpextract/cli/cli.py @@ -5,8 +5,8 @@ from tqdm.auto import tqdm from tqdm.contrib.logging import logging_redirect_tqdm -from extractor.cli._dl import do_dl, register_dl_parser -from extractor.cli._extract import do_extract, register_extract_parser +from wpextract.cli._dl import do_dl, register_dl_parser +from wpextract.cli._extract import do_extract, register_extract_parser def _exec_command(parser, args): diff --git a/src/extractor/dl/ORIGINAL_LICENSE b/src/wpextract/dl/ORIGINAL_LICENSE similarity index 100% rename from src/extractor/dl/ORIGINAL_LICENSE rename to src/wpextract/dl/ORIGINAL_LICENSE diff --git a/src/extractor/dl/__init__.py b/src/wpextract/dl/__init__.py similarity index 100% rename from src/extractor/dl/__init__.py rename to src/wpextract/dl/__init__.py diff --git a/src/extractor/dl/downloader.py b/src/wpextract/dl/downloader.py similarity index 96% rename from src/extractor/dl/downloader.py rename to src/wpextract/dl/downloader.py index 242650c..206d6e5 100644 --- a/src/extractor/dl/downloader.py +++ b/src/wpextract/dl/downloader.py @@ -2,10 +2,10 @@ from pathlib import Path from typing import List, Optional -from extractor.dl.exceptions import WordPressApiNotV2 -from extractor.dl.exporter import Exporter -from extractor.dl.requestsession import RequestSession -from extractor.dl.wpapi import WPApi +from wpextract.dl.exceptions import WordPressApiNotV2 +from wpextract.dl.exporter import Exporter +from wpextract.dl.requestsession import RequestSession +from wpextract.dl.wpapi import WPApi class WPDownloader: diff --git a/src/extractor/dl/exceptions.py b/src/wpextract/dl/exceptions.py similarity index 100% rename from src/extractor/dl/exceptions.py rename to src/wpextract/dl/exceptions.py diff --git a/src/extractor/dl/exporter.py b/src/wpextract/dl/exporter.py similarity index 99% rename from src/extractor/dl/exporter.py rename to src/wpextract/dl/exporter.py index b56384c..7a44dbf 100644 --- a/src/extractor/dl/exporter.py +++ b/src/wpextract/dl/exporter.py @@ -28,7 +28,7 @@ from tqdm.auto import tqdm -from extractor.dl.requestsession import RequestSession +from wpextract.dl.requestsession import RequestSession class Exporter: diff --git a/src/extractor/dl/requestsession.py b/src/wpextract/dl/requestsession.py similarity index 100% rename from src/extractor/dl/requestsession.py rename to src/wpextract/dl/requestsession.py diff --git a/src/extractor/dl/utils.py b/src/wpextract/dl/utils.py similarity index 100% rename from src/extractor/dl/utils.py rename to src/wpextract/dl/utils.py diff --git a/src/extractor/dl/wpapi.py b/src/wpextract/dl/wpapi.py similarity index 99% rename from src/extractor/dl/wpapi.py rename to src/wpextract/dl/wpapi.py index 8cbd184..a519b0b 100644 --- a/src/extractor/dl/wpapi.py +++ b/src/wpextract/dl/wpapi.py @@ -26,17 +26,17 @@ from tqdm.auto import tqdm -from extractor.dl.exceptions import ( +from wpextract.dl.exceptions import ( NoWordpressApi, NSNotFoundException, WordPressApiNotV2, ) -from extractor.dl.requestsession import ( +from wpextract.dl.requestsession import ( HTTPError404, HTTPErrorInvalidPage, RequestSession, ) -from extractor.dl.utils import ( +from wpextract.dl.utils import ( get_by_id, get_content_as_json, url_path_join, diff --git a/src/extractor/extract.py b/src/wpextract/extract.py similarity index 90% rename from src/extractor/extract.py rename to src/wpextract/extract.py index 1109a86..4420216 100644 --- a/src/extractor/extract.py +++ b/src/wpextract/extract.py @@ -4,22 +4,22 @@ from pandas import DataFrame -from extractor.extractors.categories import load_categories -from extractor.extractors.data.links import LinkRegistry -from extractor.extractors.io import export_df -from extractor.extractors.media import load_media -from extractor.extractors.pages import load_pages -from extractor.extractors.posts import ( +from wpextract.extractors.categories import load_categories +from wpextract.extractors.data.links import LinkRegistry +from wpextract.extractors.io import export_df +from wpextract.extractors.media import load_media +from wpextract.extractors.pages import load_pages +from wpextract.extractors.posts import ( ensure_translations_undirected, load_posts, resolve_post_links, resolve_post_translations, ) -from extractor.extractors.tags import load_tags -from extractor.extractors.users import load_users -from extractor.parse.translations import PickerListType -from extractor.scrape.crawler import ScrapeCrawl -from extractor.util.file import prefix_filename +from wpextract.extractors.tags import load_tags +from wpextract.extractors.users import load_users +from wpextract.parse.translations import PickerListType +from wpextract.scrape.crawler import ScrapeCrawl +from wpextract.util.file import prefix_filename class WPExtractor: diff --git a/src/extractor/extractors/__init__.py b/src/wpextract/extractors/__init__.py similarity index 100% rename from src/extractor/extractors/__init__.py rename to src/wpextract/extractors/__init__.py diff --git a/src/extractor/extractors/categories.py b/src/wpextract/extractors/categories.py similarity index 87% rename from src/extractor/extractors/categories.py rename to src/wpextract/extractors/categories.py index 6ff43cc..1392c32 100644 --- a/src/extractor/extractors/categories.py +++ b/src/wpextract/extractors/categories.py @@ -4,9 +4,9 @@ import numpy as np import pandas as pd -from extractor.extractors.data.links import LinkRegistry -from extractor.extractors.io import load_df -from extractor.util.locale import extract_locale +from wpextract.extractors.data.links import LinkRegistry +from wpextract.extractors.io import load_df +from wpextract.util.locale import extract_locale EXPORT_COLUMNS = [ "name", diff --git a/src/extractor/extractors/data/__init__.py b/src/wpextract/extractors/data/__init__.py similarity index 100% rename from src/extractor/extractors/data/__init__.py rename to src/wpextract/extractors/data/__init__.py diff --git a/src/extractor/extractors/data/images.py b/src/wpextract/extractors/data/images.py similarity index 96% rename from src/extractor/extractors/data/images.py rename to src/wpextract/extractors/data/images.py index 6b9e967..45fed1c 100644 --- a/src/extractor/extractors/data/images.py +++ b/src/wpextract/extractors/data/images.py @@ -3,7 +3,7 @@ from dataclasses import dataclass from typing import List, Optional -from extractor.extractors.data.links import Linkable, LinkRegistry +from wpextract.extractors.data.links import Linkable, LinkRegistry @dataclass diff --git a/src/extractor/extractors/data/link_resolver.py b/src/wpextract/extractors/data/link_resolver.py similarity index 95% rename from src/extractor/extractors/data/link_resolver.py rename to src/wpextract/extractors/data/link_resolver.py index 7df7275..9ff99a6 100644 --- a/src/extractor/extractors/data/link_resolver.py +++ b/src/wpextract/extractors/data/link_resolver.py @@ -2,8 +2,8 @@ from typing import List, Optional from urllib.parse import urlparse, urlunparse -from extractor.extractors.data.links import LinkRegistry, ResolvableLink -from extractor.util.str import remove_ends +from wpextract.extractors.data.links import LinkRegistry, ResolvableLink +from wpextract.util.str import remove_ends def resolve_link( diff --git a/src/extractor/extractors/data/links.py b/src/wpextract/extractors/data/links.py similarity index 100% rename from src/extractor/extractors/data/links.py rename to src/wpextract/extractors/data/links.py diff --git a/src/extractor/extractors/io.py b/src/wpextract/extractors/io.py similarity index 100% rename from src/extractor/extractors/io.py rename to src/wpextract/extractors/io.py diff --git a/src/extractor/extractors/media.py b/src/wpextract/extractors/media.py similarity index 95% rename from src/extractor/extractors/media.py rename to src/wpextract/extractors/media.py index a55b9ed..6c57a9c 100644 --- a/src/extractor/extractors/media.py +++ b/src/wpextract/extractors/media.py @@ -4,9 +4,9 @@ import pandas as pd from bs4 import Tag -from extractor.extractors.data.links import LinkRegistry -from extractor.extractors.io import load_df -from extractor.parse.html import extract_html_text +from wpextract.extractors.data.links import LinkRegistry +from wpextract.extractors.io import load_df +from wpextract.parse.html import extract_html_text EXPORT_COLUMNS = [ "alt_text", diff --git a/src/extractor/extractors/pages.py b/src/wpextract/extractors/pages.py similarity index 89% rename from src/extractor/extractors/pages.py rename to src/wpextract/extractors/pages.py index aaf8a61..1a5a24f 100644 --- a/src/extractor/extractors/pages.py +++ b/src/wpextract/extractors/pages.py @@ -4,11 +4,11 @@ import pandas as pd from tqdm.auto import tqdm -from extractor.extractors.data.links import LinkRegistry -from extractor.extractors.io import load_df -from extractor.parse.content import extract_content_data -from extractor.parse.html import extract_html_text, parse_html -from extractor.util.locale import extract_locale +from wpextract.extractors.data.links import LinkRegistry +from wpextract.extractors.io import load_df +from wpextract.parse.content import extract_content_data +from wpextract.parse.html import extract_html_text, parse_html +from wpextract.util.locale import extract_locale EXPORT_COLUMNS = [ "author", diff --git a/src/extractor/extractors/posts.py b/src/wpextract/extractors/posts.py similarity index 92% rename from src/extractor/extractors/posts.py rename to src/wpextract/extractors/posts.py index a6bce9b..ab007e1 100644 --- a/src/extractor/extractors/posts.py +++ b/src/wpextract/extractors/posts.py @@ -6,16 +6,16 @@ from pandas import DataFrame from tqdm.auto import tqdm -from extractor.extractors.data.images import resolve_images -from extractor.extractors.data.link_resolver import resolve_links -from extractor.extractors.data.links import LinkRegistry -from extractor.extractors.io import load_df -from extractor.parse.content import extract_content_data -from extractor.parse.html import extract_html_text, parse_html -from extractor.parse.translations import PickerListType, extract_translations -from extractor.parse.translations._resolver import TranslationLink -from extractor.scrape.scrape import load_scrape -from extractor.util.locale import extract_locale +from wpextract.extractors.data.images import resolve_images +from wpextract.extractors.data.link_resolver import resolve_links +from wpextract.extractors.data.links import LinkRegistry +from wpextract.extractors.io import load_df +from wpextract.parse.content import extract_content_data +from wpextract.parse.html import extract_html_text, parse_html +from wpextract.parse.translations import PickerListType, extract_translations +from wpextract.parse.translations._resolver import TranslationLink +from wpextract.scrape.scrape import load_scrape +from wpextract.util.locale import extract_locale EXPORT_COLUMNS = [ "author", diff --git a/src/extractor/extractors/tags.py b/src/wpextract/extractors/tags.py similarity index 85% rename from src/extractor/extractors/tags.py rename to src/wpextract/extractors/tags.py index ae02e13..9102c43 100644 --- a/src/extractor/extractors/tags.py +++ b/src/wpextract/extractors/tags.py @@ -3,9 +3,9 @@ import pandas as pd -from extractor.extractors.data.links import LinkRegistry -from extractor.extractors.io import load_df -from extractor.util.locale import extract_locale +from wpextract.extractors.data.links import LinkRegistry +from wpextract.extractors.io import load_df +from wpextract.util.locale import extract_locale EXPORT_COLUMNS = [ "count", diff --git a/src/extractor/extractors/users.py b/src/wpextract/extractors/users.py similarity index 94% rename from src/extractor/extractors/users.py rename to src/wpextract/extractors/users.py index 2656630..935bc2e 100644 --- a/src/extractor/extractors/users.py +++ b/src/wpextract/extractors/users.py @@ -3,7 +3,7 @@ import pandas as pd -from extractor.extractors.io import load_df +from wpextract.extractors.io import load_df EXPORT_COLUMNS = ["avatar", "description", "link", "name", "slug", "url"] diff --git a/src/extractor/parse/__init__.py b/src/wpextract/parse/__init__.py similarity index 100% rename from src/extractor/parse/__init__.py rename to src/wpextract/parse/__init__.py diff --git a/src/extractor/parse/content.py b/src/wpextract/parse/content.py similarity index 94% rename from src/extractor/parse/content.py rename to src/wpextract/parse/content.py index 2747479..26fe6a4 100644 --- a/src/extractor/parse/content.py +++ b/src/wpextract/parse/content.py @@ -6,10 +6,10 @@ import pandas as pd from bs4 import BeautifulSoup, Comment, NavigableString -from extractor.extractors.data.images import MediaUse, ResolvableMediaUse -from extractor.extractors.data.links import Link, ResolvableLink -from extractor.extractors.media import get_caption -from extractor.util.str import squash_whitespace +from wpextract.extractors.data.images import MediaUse, ResolvableMediaUse +from wpextract.extractors.data.links import Link, ResolvableLink +from wpextract.extractors.media import get_caption +from wpextract.util.str import squash_whitespace EXCLUDED_CONTENT_TAGS = {"figcaption"} NEWLINE_TAGS = {"br", "p"} diff --git a/src/extractor/parse/html.py b/src/wpextract/parse/html.py similarity index 94% rename from src/extractor/parse/html.py rename to src/wpextract/parse/html.py index 6a120a9..5f0b181 100644 --- a/src/extractor/parse/html.py +++ b/src/wpextract/parse/html.py @@ -2,7 +2,7 @@ from bs4 import BeautifulSoup -from extractor.util.str import squash_whitespace +from wpextract.util.str import squash_whitespace PROBABLY_HTML = re.compile(r"<|&\S+;") diff --git a/src/extractor/parse/translations/__init__.py b/src/wpextract/parse/translations/__init__.py similarity index 100% rename from src/extractor/parse/translations/__init__.py rename to src/wpextract/parse/translations/__init__.py diff --git a/src/extractor/parse/translations/_extractor.py b/src/wpextract/parse/translations/_extractor.py similarity index 95% rename from src/extractor/parse/translations/_extractor.py rename to src/wpextract/parse/translations/_extractor.py index b678c1f..10f084e 100644 --- a/src/extractor/parse/translations/_extractor.py +++ b/src/wpextract/parse/translations/_extractor.py @@ -4,7 +4,7 @@ import pandas as pd from bs4 import BeautifulSoup -import extractor.parse.translations._pickers as pickers +import wpextract.parse.translations._pickers as pickers PICKERS = [pickers.Polylang, pickers.GenericLangSwitcher] PickerListType = List[Type[pickers.LangPicker]] diff --git a/src/extractor/parse/translations/_pickers.py b/src/wpextract/parse/translations/_pickers.py similarity index 92% rename from src/extractor/parse/translations/_pickers.py rename to src/wpextract/parse/translations/_pickers.py index 9ff40e7..5ebd5c8 100644 --- a/src/extractor/parse/translations/_pickers.py +++ b/src/wpextract/parse/translations/_pickers.py @@ -4,8 +4,8 @@ from bs4 import BeautifulSoup, PageElement, Tag from langcodes import Language -from extractor.parse.translations._resolver import TranslationLink -from extractor.util.str import squash_whitespace +from wpextract.parse.translations._resolver import TranslationLink +from wpextract.util.str import squash_whitespace class LangPicker(ABC): @@ -17,11 +17,11 @@ class LangPicker(ABC): page_doc: BeautifulSoup """The document to extract the language picker from.""" root_el: Tag - """The root element of the language picker, populated if [`LangPicker.matches`][extractor.parse.translations.LangPicker.matches] is succesful.""" + """The root element of the language picker, populated if [`LangPicker.matches`][wpextract.parse.translations.LangPicker.matches] is succesful.""" translations: List[TranslationLink] - """A list of translation links, populated by calling [`LangPicker.add_translation`][extractor.parse.translations.LangPicker.add_translation] within [`LangPicker.extract`][extractor.parse.translations.LangPicker.extract].""" + """A list of translation links, populated by calling [`LangPicker.add_translation`][wpextract.parse.translations.LangPicker.add_translation] within [`LangPicker.extract`][wpextract.parse.translations.LangPicker.extract].""" current_language: Language - """The current language of the page, populated by calling [`LangPicker.set_current_lang`][extractor.parse.translations.LangPicker.set_current_lang] within [`LangPicker.extract`][extractor.parse.translations.LangPicker.extract].""" + """The current language of the page, populated by calling [`LangPicker.set_current_lang`][wpextract.parse.translations.LangPicker.set_current_lang] within [`LangPicker.extract`][wpextract.parse.translations.LangPicker.extract].""" def __init__(self, page_doc: BeautifulSoup): """Inits a language picker searcher. @@ -58,7 +58,7 @@ def matches(self) -> bool: def get_root(self) -> PageElement: """Retrieve the root element of the translation picker. - Using the [`LangPicker.page_doc`][extractor.parse.translations.LangPicker.page_doc] attribute (a [`bs4.BeautifulSoup`][bs4.BeautifulSoup] object representing the whole page), the root element of the picker shoudl be found and returned. + Using the [`LangPicker.page_doc`][wpextract.parse.translations.LangPicker.page_doc] attribute (a [`bs4.BeautifulSoup`][bs4.BeautifulSoup] object representing the whole page), the root element of the picker shoudl be found and returned. Returns: The root element, or None if this picker is not found on the page. diff --git a/src/extractor/parse/translations/_resolver.py b/src/wpextract/parse/translations/_resolver.py similarity index 88% rename from src/extractor/parse/translations/_resolver.py rename to src/wpextract/parse/translations/_resolver.py index 585ab84..72825f0 100644 --- a/src/extractor/parse/translations/_resolver.py +++ b/src/wpextract/parse/translations/_resolver.py @@ -2,7 +2,7 @@ from langcodes import Language -from extractor.extractors.data.links import ResolvableLink +from wpextract.extractors.data.links import ResolvableLink @dataclass diff --git a/src/extractor/scrape/__init__.py b/src/wpextract/scrape/__init__.py similarity index 100% rename from src/extractor/scrape/__init__.py rename to src/wpextract/scrape/__init__.py diff --git a/src/extractor/scrape/crawler.py b/src/wpextract/scrape/crawler.py similarity index 98% rename from src/extractor/scrape/crawler.py rename to src/wpextract/scrape/crawler.py index 06de7d3..686454d 100644 --- a/src/extractor/scrape/crawler.py +++ b/src/wpextract/scrape/crawler.py @@ -6,7 +6,7 @@ from bs4 import BeautifulSoup from tqdm.auto import tqdm -from extractor.scrape.processor import extract_self_url, self_url_strainer +from wpextract.scrape.processor import extract_self_url, self_url_strainer # Increment to invalidate old caches SCRAPE_CRAWL_VERSION = 1 diff --git a/src/extractor/scrape/processor.py b/src/wpextract/scrape/processor.py similarity index 100% rename from src/extractor/scrape/processor.py rename to src/wpextract/scrape/processor.py diff --git a/src/extractor/scrape/scrape.py b/src/wpextract/scrape/scrape.py similarity index 100% rename from src/extractor/scrape/scrape.py rename to src/wpextract/scrape/scrape.py diff --git a/src/extractor/util/__init__.py b/src/wpextract/util/__init__.py similarity index 100% rename from src/extractor/util/__init__.py rename to src/wpextract/util/__init__.py diff --git a/src/extractor/util/args.py b/src/wpextract/util/args.py similarity index 100% rename from src/extractor/util/args.py rename to src/wpextract/util/args.py diff --git a/src/extractor/util/file.py b/src/wpextract/util/file.py similarity index 100% rename from src/extractor/util/file.py rename to src/wpextract/util/file.py diff --git a/src/extractor/util/locale.py b/src/wpextract/util/locale.py similarity index 96% rename from src/extractor/util/locale.py rename to src/wpextract/util/locale.py index 911fbf9..6de0ec5 100644 --- a/src/extractor/util/locale.py +++ b/src/wpextract/util/locale.py @@ -3,7 +3,7 @@ from langcodes import Language, tag_is_valid -from extractor.util.str import remove_ends +from wpextract.util.str import remove_ends EXCLUDED_TAGS = {"tag"} # Tagoi diff --git a/src/extractor/util/str.py b/src/wpextract/util/str.py similarity index 100% rename from src/extractor/util/str.py rename to src/wpextract/util/str.py diff --git a/tests/extract/test_extract.py b/tests/extract/test_extract.py index 9652f85..4306221 100644 --- a/tests/extract/test_extract.py +++ b/tests/extract/test_extract.py @@ -1,7 +1,7 @@ from pathlib import Path import pandas as pd -from extractor.extract import WPExtractor +from wpextract.extract import WPExtractor def _assert_extractor_valid(extractor: WPExtractor): diff --git a/tests/extractors/data/test_images.py b/tests/extractors/data/test_images.py index c1665aa..e5b46c2 100644 --- a/tests/extractors/data/test_images.py +++ b/tests/extractors/data/test_images.py @@ -1,5 +1,5 @@ -from extractor.extractors.data.images import ResolvableMediaUse, resolve_image -from extractor.extractors.data.links import LinkRegistry +from wpextract.extractors.data.images import ResolvableMediaUse, resolve_image +from wpextract.extractors.data.links import LinkRegistry def test_image_resolver(): diff --git a/tests/extractors/data/test_link_resolver.py b/tests/extractors/data/test_link_resolver.py index 5e610cb..cdfec82 100644 --- a/tests/extractors/data/test_link_resolver.py +++ b/tests/extractors/data/test_link_resolver.py @@ -1,6 +1,6 @@ import pytest -from extractor.extractors.data.link_resolver import resolve_link -from extractor.extractors.data.links import LinkRegistry, ResolvableLink +from wpextract.extractors.data.link_resolver import resolve_link +from wpextract.extractors.data.links import LinkRegistry, ResolvableLink def test_link_resolver(): diff --git a/tests/extractors/data/test_links.py b/tests/extractors/data/test_links.py index 5034010..487ad12 100644 --- a/tests/extractors/data/test_links.py +++ b/tests/extractors/data/test_links.py @@ -1,4 +1,4 @@ -from extractor.extractors.data.links import Linkable, LinkRegistry +from wpextract.extractors.data.links import Linkable, LinkRegistry def test_add_link(): diff --git a/tests/extractors/test_categories.py b/tests/extractors/test_categories.py index 13f7f26..15cb113 100644 --- a/tests/extractors/test_categories.py +++ b/tests/extractors/test_categories.py @@ -1,8 +1,8 @@ import numpy as np import pandas as pd import pytest -from extractor.extractors.categories import load_categories -from extractor.extractors.data.links import LinkRegistry +from wpextract.extractors.categories import load_categories +from wpextract.extractors.data.links import LinkRegistry from helpers.df import ordered_col diff --git a/tests/extractors/test_io.py b/tests/extractors/test_io.py index 7270a3c..b545de8 100644 --- a/tests/extractors/test_io.py +++ b/tests/extractors/test_io.py @@ -1,5 +1,5 @@ import pandas as pd -from extractor.extractors.io import ( +from wpextract.extractors.io import ( df_denormalize_to_dict, export_df, load_df, diff --git a/tests/extractors/test_media.py b/tests/extractors/test_media.py index a105362..0dbb14c 100644 --- a/tests/extractors/test_media.py +++ b/tests/extractors/test_media.py @@ -2,8 +2,8 @@ import pandas as pd import pytest -from extractor.extractors.data.links import LinkRegistry -from extractor.extractors.media import load_media +from wpextract.extractors.data.links import LinkRegistry +from wpextract.extractors.media import load_media @pytest.fixture() diff --git a/tests/extractors/test_pages.py b/tests/extractors/test_pages.py index cc098b8..2c9eba0 100644 --- a/tests/extractors/test_pages.py +++ b/tests/extractors/test_pages.py @@ -2,8 +2,8 @@ import pandas as pd import pytest -from extractor.extractors.data.links import LinkRegistry -from extractor.extractors.pages import load_pages +from wpextract.extractors.data.links import LinkRegistry +from wpextract.extractors.pages import load_pages from helpers.df import ordered_col diff --git a/tests/extractors/test_posts.py b/tests/extractors/test_posts.py index 0e6eb66..441d4de 100644 --- a/tests/extractors/test_posts.py +++ b/tests/extractors/test_posts.py @@ -1,18 +1,18 @@ from datetime import datetime from pathlib import Path -import extractor +import wpextract import pandas as pd import pytest from bs4 import BeautifulSoup -from extractor.extractors.data.links import Linkable, LinkRegistry -from extractor.extractors.posts import ( +from wpextract.extractors.data.links import Linkable, LinkRegistry +from wpextract.extractors.posts import ( ensure_translations_undirected, load_posts, resolve_post_media, resolve_post_translations, ) -from extractor.parse.translations._resolver import TranslationLink +from wpextract.parse.translations._resolver import TranslationLink from helpers.df import ordered_col from helpers.file import json_without_cols from pytest_mock import MockerFixture @@ -52,7 +52,7 @@ def mock_translation_extractor(post_bs: BeautifulSoup, link: str, translation_pi @pytest.fixture() def _do_mock_translation_extractor(mocker: MockerFixture): mocker.patch( - "extractor.extractors.posts.extract_translations", mock_translation_extractor + "wpextract.extractors.posts.extract_translations", mock_translation_extractor ) @@ -131,7 +131,7 @@ def test_language(posts_df): @pytest.fixture() def spy_extractor_data(mocker: MockerFixture): - return mocker.spy(extractor.extractors.posts, "extract_content_data") + return mocker.spy(wpextract.extractors.posts, "extract_content_data") def test_extract_content_call(spy_extractor_data, posts_df): diff --git a/tests/extractors/test_tags.py b/tests/extractors/test_tags.py index 04a29ae..08499ce 100644 --- a/tests/extractors/test_tags.py +++ b/tests/extractors/test_tags.py @@ -1,7 +1,7 @@ import pandas as pd import pytest -from extractor.extractors.data.links import LinkRegistry -from extractor.extractors.tags import load_tags +from wpextract.extractors.data.links import LinkRegistry +from wpextract.extractors.tags import load_tags from helpers.df import ordered_col diff --git a/tests/extractors/test_users.py b/tests/extractors/test_users.py index bb327e8..44244c5 100644 --- a/tests/extractors/test_users.py +++ b/tests/extractors/test_users.py @@ -1,6 +1,6 @@ import pandas as pd import pytest -from extractor.extractors.users import load_users +from wpextract.extractors.users import load_users from helpers.file import json_without_cols diff --git a/tests/parse/test_content.py b/tests/parse/test_content.py index d6c4417..113ab02 100644 --- a/tests/parse/test_content.py +++ b/tests/parse/test_content.py @@ -1,9 +1,9 @@ from pathlib import Path from bs4 import BeautifulSoup -from extractor.extractors.data.images import MediaUse, ResolvableMediaUse -from extractor.extractors.data.links import Link, ResolvableLink -from extractor.parse.content import ( +from wpextract.extractors.data.images import MediaUse, ResolvableMediaUse +from wpextract.extractors.data.links import Link, ResolvableLink +from wpextract.parse.content import ( extract_content_data, extract_embeds, extract_images, diff --git a/tests/parse/translations/test_pickers.py b/tests/parse/translations/test_pickers.py index 538c26d..205ad38 100644 --- a/tests/parse/translations/test_pickers.py +++ b/tests/parse/translations/test_pickers.py @@ -1,10 +1,10 @@ from pathlib import Path from typing import Type -import extractor.parse.translations._pickers as pickers +import wpextract.parse.translations._pickers as pickers import pytest from bs4 import BeautifulSoup -from extractor.parse.translations._resolver import TranslationLink +from wpextract.parse.translations._resolver import TranslationLink from langcodes import Language diff --git a/tests/util/test_args.py b/tests/util/test_args.py index 627c7f1..55fd94c 100644 --- a/tests/util/test_args.py +++ b/tests/util/test_args.py @@ -1,7 +1,7 @@ from argparse import ArgumentTypeError import pytest -from extractor.util.args import directory, empty_directory +from wpextract.util.args import directory, empty_directory def test_directory_when_directory(tmp_path): diff --git a/tests/util/test_file.py b/tests/util/test_file.py index 0c15b45..f9526f5 100644 --- a/tests/util/test_file.py +++ b/tests/util/test_file.py @@ -1,4 +1,4 @@ -from extractor.util.file import prefix_filename +from wpextract.util.file import prefix_filename def test_prefix_filename(): diff --git a/tests/util/test_locale.py b/tests/util/test_locale.py index 37945f8..25cea2a 100644 --- a/tests/util/test_locale.py +++ b/tests/util/test_locale.py @@ -1,5 +1,5 @@ import pytest -from extractor.util.locale import extract_locale +from wpextract.util.locale import extract_locale @pytest.mark.parametrize( diff --git a/tests/util/test_str.py b/tests/util/test_str.py index 67bdb6b..dc9914e 100644 --- a/tests/util/test_str.py +++ b/tests/util/test_str.py @@ -1,5 +1,5 @@ import pytest -from extractor.util.str import ( +from wpextract.util.str import ( ensure_suffix, remove_ends, remove_prefix,