From 3a191e4fc720b9cf0397a3e51675128edbb842a5 Mon Sep 17 00:00:00 2001 From: Freddy Heppell Date: Tue, 9 Jul 2024 20:10:26 +0100 Subject: [PATCH 1/5] package meta and readme updates --- README.md | 22 ++++++++++++++++++---- pyproject.toml | 4 +++- 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 42d9bc5..0934f11 100644 --- a/README.md +++ b/README.md @@ -1,18 +1,30 @@ # WPextract - WordPress Site Extractor -**WPextract is a tool to create datasets from WordPress sites.** +

+ + WPextract is a tool to create datasets from WordPress sites. + +

+ + +

+ + PyPI - Version + + + Conda Version + +

- Archives posts, pages, tags, categories, media (including files), comments, and users - Uses the WordPress API to guarantee 100% accurate and complete content - Resolves internal links and media to IDs - Automatically parses multilingual sites to create parallel datasets -> [!NOTE] -> This software was developed for our EMNLP 2023 paper [_Analysing State-Backed Propaganda Websites: a New Dataset and Linguistic Study_](https://aclanthology.org/2023.emnlp-main.349/). The code has been updated since the paper was written; for archival purposes, the precise version used for the study is [available on Zenodo](https://zenodo.org/records/10008086). ## Quickstart -See the [complete documentation](#) for more detailed usage. +See the [complete documentation](https://gatenlp.github.io/wordpress-site-extractor/) for more detailed usage. 1. Install with `pipx` ```shell-session @@ -37,6 +49,8 @@ Available under the Apache 2.0 license. See [LICENSE](LICENSE) for more informat ## Citing +> This software was developed for our EMNLP 2023 paper [_Analysing State-Backed Propaganda Websites: a New Dataset and Linguistic Study_](https://aclanthology.org/2023.emnlp-main.349/). The code has been updated since the paper was written; for archival purposes, the precise version used for the study is [available on Zenodo](https://zenodo.org/records/10008086). + We'd love to hear about your use of our tool, you can [email us](mailto:frheppell1@sheffield.ac.uk) to let us know! Feel free to create issues and/or pull requests for new features or bugs. If you use this tool in published work, please cite [our EMNLP paper](https://aclanthology.org/2023.emnlp-main.349/): diff --git a/pyproject.toml b/pyproject.toml index 38fa7ca..57061bf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,13 +1,15 @@ [tool.poetry] name="wpextract" -version="0.0.1" +version="1.0.0a0" description="Create a dataset from the WordPress API" authors=["Freddy Heppell "] packages=[ { include = "extractor", from = "src"} ] +homepage="https://gatenlp.github.io/wordpress-site-extractor/" repository="https://github.com/GateNLP/wordpress-site-extractor" license="Apache-2.0" +readme = "README.md" [tool.poetry.scripts] wpextract = "extractor.cli.cli:main" From fbe5d20d9a34938c03c993821d83aa594952a1bb Mon Sep 17 00:00:00 2001 From: Freddy Heppell Date: Tue, 9 Jul 2024 20:16:05 +0100 Subject: [PATCH 2/5] relax dependency constraints --- pyproject.toml | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 57061bf..223cf59 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,13 +21,13 @@ url = "https://pypi.org/simple/" [tool.poetry.dependencies] python = ">=3.9.0,<3.13" -beautifulsoup4 = ">=4.12.3" -langcodes = ">=3.4.0" -lxml = ">=5.2.2" -numpy = ">=1.26.4" -pandas = ">=2.2.2" -tqdm = ">=4.66.4" -requests = "^2.32.3" +beautifulsoup4 = ">=4.12.0" +langcodes = ">=3.3.0" +lxml = ">=5.0.0" +numpy = ">=1.23.0" +pandas = ">=1.5.2" +tqdm = ">=4.65.0" +requests = ">=2.32.3" [tool.poetry.group.dev.dependencies] build = "==0.9.*,>=0.9.0" @@ -37,6 +37,8 @@ pytest-mock = "~3.14.0" ruff = "^0.5.0" +[tool.poetry.group.docs] +optional = true [tool.poetry.group.docs.dependencies] mkdocs-material = "^9.5.28" mkdocstrings = "^0.25.1" From b5d8b31ca1f0a712a3e457b791f0b01195429d2e Mon Sep 17 00:00:00 2001 From: Freddy Heppell Date: Tue, 9 Jul 2024 20:16:26 +0100 Subject: [PATCH 3/5] regen lock --- poetry.lock | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/poetry.lock b/poetry.lock index c107442..998c4bd 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1671,13 +1671,13 @@ reference = "pypi-public" [[package]] name = "setuptools" -version = "70.2.0" +version = "70.3.0" description = "Easily download, build, install, upgrade, and uninstall Python packages" optional = false python-versions = ">=3.8" files = [ - {file = "setuptools-70.2.0-py3-none-any.whl", hash = "sha256:b8b8060bb426838fbe942479c90296ce976249451118ef566a5a0b7d8b78fb05"}, - {file = "setuptools-70.2.0.tar.gz", hash = "sha256:bd63e505105011b25c3c11f753f7e3b8465ea739efddaccef8f0efac2137bac1"}, + {file = "setuptools-70.3.0-py3-none-any.whl", hash = "sha256:fe384da74336c398e0d956d1cae0669bc02eed936cdb1d49b57de1990dc11ffc"}, + {file = "setuptools-70.3.0.tar.gz", hash = "sha256:f171bab1dfbc86b132997f26a119f6056a57950d058587841a0082e8830f9dc5"}, ] [package.extras] @@ -1888,4 +1888,4 @@ reference = "pypi-public" [metadata] lock-version = "2.0" python-versions = ">=3.9.0,<3.13" -content-hash = "ace72c716a8e50e5d331cc0e619e0574561ffe2aed7f4dc427f8ce36139d9df7" +content-hash = "4839f696fac9b937ce620d57bf8345bd9c0a57d2890a03d308b5544d759a7d6e" From e8956cefcba12cf58fff58e7b58541975cf81f7d Mon Sep 17 00:00:00 2001 From: Freddy Heppell Date: Tue, 9 Jul 2024 20:44:36 +0100 Subject: [PATCH 4/5] readme tweaks --- README.md | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index 0934f11..cf411a2 100644 --- a/README.md +++ b/README.md @@ -1,20 +1,9 @@ # WPextract - WordPress Site Extractor -

- - WPextract is a tool to create datasets from WordPress sites. - -

- - -

- - PyPI - Version - - - Conda Version - -

+PyPI - Version +Conda Version + +**WPextract is a tool to create datasets from WordPress sites.** - Archives posts, pages, tags, categories, media (including files), comments, and users - Uses the WordPress API to guarantee 100% accurate and complete content From 8bef3dac7ed24c280ab1220ac7f7a8b1975970ff Mon Sep 17 00:00:00 2001 From: Freddy Heppell Date: Tue, 9 Jul 2024 20:56:23 +0100 Subject: [PATCH 5/5] put back gh markdown admonition --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index cf411a2..c6b796e 100644 --- a/README.md +++ b/README.md @@ -38,6 +38,7 @@ Available under the Apache 2.0 license. See [LICENSE](LICENSE) for more informat ## Citing +> [!NOTE] > This software was developed for our EMNLP 2023 paper [_Analysing State-Backed Propaganda Websites: a New Dataset and Linguistic Study_](https://aclanthology.org/2023.emnlp-main.349/). The code has been updated since the paper was written; for archival purposes, the precise version used for the study is [available on Zenodo](https://zenodo.org/records/10008086). We'd love to hear about your use of our tool, you can [email us](mailto:frheppell1@sheffield.ac.uk) to let us know! Feel free to create issues and/or pull requests for new features or bugs.