From c6e834030779f0fb59aa3888c2f3222101bbdd0f Mon Sep 17 00:00:00 2001 From: Adrien Barbaresi Date: Tue, 3 Dec 2024 16:23:03 +0100 Subject: [PATCH] prepare version 2.0.0 (#759) * prepare version 2.0.0 * update setup and wording * docs: readme and structure * update dependabot and funding * update contributing and history files --- .github/FUNDING.yml | 2 +- .github/dependabot.yml | 18 ------------ CONTRIBUTING.md | 26 +++++++++-------- HISTORY.md | 12 ++++++-- README.md | 65 ++++++++++++++--------------------------- docs/index.rst | 43 +++++++++++++++------------ docs/used-by.rst | 17 ++++------- pyproject.toml | 11 +++---- trafilatura/__init__.py | 2 +- 9 files changed, 85 insertions(+), 111 deletions(-) delete mode 100644 .github/dependabot.yml diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml index 08ccbf34..0f192886 100644 --- a/.github/FUNDING.yml +++ b/.github/FUNDING.yml @@ -1,6 +1,6 @@ # These are supported funding model platforms -github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2] +github: [adbar] patreon: # Replace with a single Patreon username open_collective: # Replace with a single Open Collective username ko_fi: adbarbaresi diff --git a/.github/dependabot.yml b/.github/dependabot.yml deleted file mode 100644 index 8d8927a7..00000000 --- a/.github/dependabot.yml +++ /dev/null @@ -1,18 +0,0 @@ -# https://docs.github.com/en/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file -# https://docs.github.com/en/code-security/dependabot/working-with-dependabot/keeping-your-actions-up-to-date-with-dependabot - -version: 2 -updates: - - package-ecosystem: "pip" # See documentation for possible values - directory: "/" # Location of package manifests - schedule: - interval: "monthly" - # create a group of dependencies to be updated together in one pull request - groups: - # specify a name for the group, which will be used in pull request titles - # and branch names - dependencies: - # define patterns to include dependencies in the group (based on - # dependency name) - patterns: - - "*" # matches all dependencies in the package ecosystem diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index d7018001..4423a4ec 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,12 +1,12 @@ ## How to contribute -Thank you for considering contributing to Trafilatura! Your contributions make the software and its documentation better. +Your contributions make the software and its documentation better. A special thanks to all the [contributors](https://github.com/adbar/trafilatura/graphs/contributors) who have played a part in Trafilatura. There are many ways to contribute, you could: * Improve the documentation: Write tutorials and guides, correct mistakes, or translate existing content. - * Find bugs and submit bug reports: Help making Trafilatura a robust and versatile tool. + * Find bugs and submit bug reports: Help making Trafilatura an even more robust tool. * Submit feature requests: Share your feedback and suggestions. * Write code: Fix bugs or add new features. @@ -14,26 +14,28 @@ There are many ways to contribute, you could: Here are some important resources: * [List of currently open issues](https://github.com/adbar/trafilatura/issues) (no pretention to exhaustivity!) 
-  * [Roadmap and milestones](https://github.com/adbar/trafilatura/milestones)
-  * [How to Contribute to Open Source](https://opensource.guide/how-to-contribute/)
+  * [How to contribute to open source](https://opensource.guide/how-to-contribute/)
 
-## Submitting changes
+## Testing and evaluating the code
 
-Please send a [GitHub Pull Request to trafilatura](https://github.com/adbar/trafilatura/pull/new/master) with a clear list of what you have done (read more about [pull requests](http://help.github.com/pull-requests/)).
+Here is how you can run the tests and code quality checks:
 
-**Working on your first Pull Request?** See this tutorial: [How To Create a Pull Request on GitHub](https://www.digitalocean.com/community/tutorials/how-to-create-a-pull-request-on-github)
+- Install the necessary packages with `pip install trafilatura[dev]`
+- Run `pytest` from trafilatura's directory, or select a particular test suite, for example `realworld_tests.py`, and run `pytest realworld_tests.py` or simply `python3 realworld_tests.py`
+- Run `mypy` on the directory: `mypy trafilatura/`
+- See also the [tests Readme](tests/README.rst) for information on the evaluation benchmark
 
+Pull requests will only be accepted if there are no errors in pytest and mypy.
 
-A special thanks to all the [contributors](https://github.com/adbar/trafilatura/graphs/contributors) who have played a part in Trafilatura.
+If you work on text extraction, it is useful to check that performance on the benchmark is equal or better.
 
-## Testing and evaluating the code
+## Submitting changes
 
-Here is how you can run the tests if you wish to correct the errors and further improve the code:
+Please send a pull request to Trafilatura with a list of what you have done (read more about [pull requests](http://help.github.com/pull-requests/)).
-- Run `pytest` from trafilatura's directory, or select a particular test suite, for example `realworld_tests.py`, and run `pytest realworld_tests.py` or simply `python3 realworld_tests.py` -- See also the [tests Readme](tests/README.rst) for information on the evaluation +**Working on your first Pull Request?** See this tutorial: [How To Create a Pull Request on GitHub](https://www.digitalocean.com/community/tutorials/how-to-create-a-pull-request-on-github) diff --git a/HISTORY.md b/HISTORY.md index 6dba38be..d7059653 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,7 +1,7 @@ ## History / Changelog -## future v2.0.0 +## 2.0.0 Breaking changes: - Python 3.6 and 3.7 deprecated (#709) @@ -12,6 +12,7 @@ Breaking changes: - downloads: remove `decode` argument in `fetch_url()` → use `fetch_response` instead (#724) - deprecated graphical user interface now removed (#713) - extraction: move `max_tree_size` parameter to `settings.cfg` (#742) +- use type hinting (#721, #723, #748) - see [Python](https://trafilatura.readthedocs.io/en/latest/usage-python.html#deprecations) and [CLI](https://trafilatura.readthedocs.io/en/latest/usage-cli.html#deprecations) deprecations in the docs Fixes: @@ -20,11 +21,16 @@ Fixes: - more robust mapping for conversion to HTML (#721) - CLI downloads: use all information in settings file (#734) - downloads: cleaner urllib3 code (#736) -- CLI: print URLs early for feeds and sitemaps with `--list` with @gremid (#744) +- refine table markdown output by @unsleepy22 (#752) +- extraction fix: images in text nodes by @unsleepy22 (#757) Metadata: - more robust URL extraction (#710) +Command-line interface: +- CLI: print URLs early for feeds and sitemaps with `--list` with @gremid (#744) +- CLI: add 126 exit code for high error ratio (#747) + Maintenance: - remove already deprecated functions and args (#716) - add type hints (#723, #728) @@ -33,10 +39,12 @@ Maintenance: - better debug messages in `main_extractor` (#714) - evaluation: review data, update packages, add magic_html (#731) - setup: explicit exports through `__all__` (#740) +- tests: extend coverage (#753) Documentation: - fix link in `docs/index.html` by @nzw0301 (#711) - remove docs from published packages (#743) +- update docs (#745) ## 1.12.2 diff --git a/README.md b/README.md index 77912006..f17387a6 100644 --- a/README.md +++ b/README.md @@ -32,15 +32,16 @@ required, the output can be converted to commonly used formats. Going from HTML bulk to essential parts can alleviate many problems related to text quality, by **focusing on the actual content**, -**avoiding the noise** caused by recurring elements (headers, footers -etc.), and **making sense of the data** with selected information. The -extractor is designed to be **robust and reasonably fast**, it runs in -production on millions of documents. +**avoiding the noise** caused by recurring elements like headers and footers +and by **making sense of the data and metadata** with selected information. +The extractor strikes a balance between limiting noise (precision) and +including all valid parts (recall). It is **robust and reasonably fast**. -The tool's versatility makes it **useful for quantitative and -data-driven approaches**. It is used in the academic domain and beyond -(e.g. in natural language processing, computational social science, -search engine optimization, and information security). 
+Trafilatura is [widely used](https://trafilatura.readthedocs.io/en/latest/used-by.html)
+and integrated into [thousands of projects](https://github.com/adbar/trafilatura/network/dependents)
+by companies like HuggingFace, IBM, and Microsoft Research as well as institutions like
+the Allen Institute, Stanford, the Tokyo Institute of Technology, and
+the University of Munich.
 
 
 ### Features
 
@@ -85,22 +86,6 @@ For more information see the [benchmark section](https://trafilatura.readthedocs
 and the [evaluation readme](https://github.com/adbar/trafilatura/blob/master/tests/README.rst)
 to run the evaluation with the latest data and packages.
 
-**750 documents, 2236 text & 2250 boilerplate segments (2022-05-18), Python 3.8**
-
-| Python Package | Precision | Recall | Accuracy | F-Score | Diff. |
-|----------------|-----------|--------|----------|---------|-------|
-| html_text 0.5.2 | 0.529 | **0.958** | 0.554 | 0.682 | 2.2x |
-| inscriptis 2.2.0 (html to txt) | 0.534 | **0.959** | 0.563 | 0.686 | 3.5x |
-| newspaper3k 0.2.8 | 0.895 | 0.593 | 0.762 | 0.713 | 12x |
-| justext 3.0.0 (custom) | 0.865 | 0.650 | 0.775 | 0.742 | 5.2x |
-| boilerpy3 1.0.6 (article mode) | 0.814 | 0.744 | 0.787 | 0.777 | 4.1x |
-| *baseline (text markup)* | 0.757 | 0.827 | 0.781 | 0.790 | **1x** |
-| goose3 3.1.9 | **0.934** | 0.690 | 0.821 | 0.793 | 22x |
-| readability-lxml 0.8.1 | 0.891 | 0.729 | 0.820 | 0.801 | 5.8x |
-| news-please 1.5.22 | 0.898 | 0.734 | 0.826 | 0.808 | 61x |
-| readabilipy 0.2.0 | 0.877 | 0.870 | 0.874 | 0.874 | 248x |
-| trafilatura 1.2.2 (standard) | 0.914 | 0.904 | **0.910** | **0.909** | 7.1x |
-
 
 #### Other evaluations:
 
@@ -138,7 +123,7 @@ This package is distributed under the [Apache 2.0 license](https://www.apache.or
 
 Versions prior to v1.8.0 are under GPLv3+ license.
 
-## Contributing
+### Contributing
 
 Contributions of all kinds are welcome. Visit the [Contributing page](https://github.com/adbar/trafilatura/blob/master/CONTRIBUTING.md)
@@ -152,13 +137,17 @@ who extended the docs or submitted bug reports, features and bugfixes!
 
 ## Context
 
-Developed with practical applications of academic research in mind, this
-software is part of a broader effort to derive information from web
-documents. Extracting and pre-processing web texts to the exacting
-standards of scientific research presents a substantial challenge. This
-software package simplifies text data collection and enhances corpus
-quality, it is currently used to build [text databases for linguistic
-research](https://www.dwds.de/d/k-web).
+This work started as a PhD project at the crossroads of linguistics and
+NLP, and this expertise has been instrumental in shaping Trafilatura over
+the years. Initially launched to create text databases for research purposes
+at the Berlin-Brandenburg Academy of Sciences (DWDS and ZDL units),
+this package continues to be maintained, but its future development
+depends on community support.
+
+**If you value this software or depend on it for your product, consider
+sponsoring it and contributing to its codebase**. Your support will
+help maintain and enhance this popular package, ensuring its growth,
+robustness, and accessibility for developers and users around the world.
 
 *Trafilatura* is an Italian word for
 [wire drawing](https://en.wikipedia.org/wiki/Wire_drawing) symbolizing the
@@ -171,11 +160,6 @@ Reach out via the software repository or the
 [contact page](https://adrien.barbaresi.eu/) for inquiries, collaborations,
 or feedback. See also social networks for the latest updates.
-This work started as a PhD project at the crossroads of linguistics and -NLP, this expertise has been instrumental in shaping Trafilatura over -the years. It has first been released under its current form in 2019, -its development is referenced in the following publications: - - Barbaresi, A. [Trafilatura: A Web Scraping Library and Command-Line Tool for Text Discovery and Extraction](https://aclanthology.org/2021.acl-demo.15/), Proceedings @@ -212,18 +196,13 @@ acquisition. Here is how to cite it: ### Software ecosystem -Case studies and publications are listed on the [Used By documentation -page](https://trafilatura.readthedocs.io/en/latest/used-by.html). - Jointly developed plugins and additional packages also contribute to the field of web data extraction and analysis: Software ecosystem Corresponding posts can be found on [Bits of -Language](https://adrien.barbaresi.eu/blog/tag/trafilatura.html). The -blog covers a range of topics from technical how-tos, updates on new -features, to discussions on text mining challenges and solutions. +Language](https://adrien.barbaresi.eu/blog/tag/trafilatura.html). Impressive, you have reached the end of the page: Thank you for your interest! diff --git a/docs/index.rst b/docs/index.rst index 02969c85..fb40e4b3 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -40,9 +40,9 @@ Description Trafilatura is a **Python package and command-line tool** designed to gather text on the Web. It includes discovery, extraction and text processing components. Its main applications are **web crawling, downloads, scraping, and extraction** of main texts, metadata and comments. It aims at staying **handy and modular**: no database is required, the output can be converted to commonly used formats. -Going from raw HTML to essential parts can alleviate many problems related to text quality, first by avoiding the **noise caused by recurring elements** (headers, footers, links/blogroll etc.) and second by including information such as author and date in order to **make sense of the data**. The extractor tries to strike a balance between limiting noise (precision) and including all valid parts (recall). It also has to be **robust and reasonably fast**, it runs in production on millions of documents. +Going from raw HTML to essential parts can alleviate many problems related to text quality, by avoiding the **noise caused by recurring elements** like headers and footers and by **making sense of the data and metadata** with selected information. The extractor strikes a balance between limiting noise (precision) and including all valid parts (recall). It is **robust and reasonably fast**. -This tool can be **useful for quantitative research** in corpus linguistics, natural language processing, computational social science and beyond: it is relevant to anyone interested in data science, information extraction, text mining, and scraping-intensive use cases like search engine optimization, business analytics or information security. +Trafilatura is `widely used `_ and integrated into `thousands of projects `_ by companies like HuggingFace, IBM, and Microsoft Research as well as institutions like the Allen Institute, Stanford, the Tokyo Institute of Technology, and the University of Munich. Features @@ -120,25 +120,27 @@ Versions prior to v1.8.0 are under GPLv3+ license. Contributing ------------- +~~~~~~~~~~~~ Contributions of all kinds are welcome. Visit the `Contributing page `_ for more information. Bug reports can be filed on the `dedicated issue page `_. 
 Many thanks to the `contributors `_ who extended the docs or submitted bug reports, features and bugfixes!
 
-Changes
--------
-
-For version history and changes see the `changelog `_.
-
-
 Context
 -------
 
-Originally released to collect data for linguistic research and lexicography at the `Berlin-Brandenburg Academy of Sciences `_, Trafilatura is now `widely used `_.
+This work started as a PhD project at the crossroads of linguistics and NLP,
+and this expertise has been instrumental in shaping Trafilatura over the years.
+Initially launched to create text databases for research purposes
+at the Berlin-Brandenburg Academy of Sciences (DWDS and ZDL units),
+this package continues to be maintained, but its future development
+depends on community support.
 
-Extracting and pre-processing web texts to the exacting standards of scientific research presents a substantial challenge. These documentation pages also provide information on `concepts behind data collection `_ as well as `tutorials `_ on how to gather web texts.
+**If you value this software or depend on it for your product, consider
+sponsoring it and contributing to its codebase**. Your support will
+help maintain and enhance this popular package, ensuring its growth,
+robustness, and accessibility for developers and users around the world.
 
 *Trafilatura* is an Italian word for `wire drawing `_ symbolizing the refinement and conversion process. It is also the way shapes of pasta are formed.
 
@@ -148,9 +150,6 @@ Author
 
 Reach out via the software repository or the `contact page `_ for inquiries, collaborations, or feedback. See also social networks for the latest updates.
 
-This work started as a PhD project at the crossroads of linguistics and NLP, this expertise has been instrumental in shaping Trafilatura over the years. It has first been released under its current form in 2019, its development is referenced in the following publications:
-
-
 - Barbaresi, A. `Trafilatura: A Web Scraping Library and Command-Line Tool for Text Discovery and Extraction `_, Proceedings of ACL/IJCNLP 2021: System Demonstrations, 2021, p. 122-131.
 - Barbaresi, A. "`Generic Web Content Extraction with Open-Source Software `_", Proceedings of KONVENS 2019, Kaleidoscope Abstracts, 2019.
 - Barbaresi, A. "`Efficient construction of metadata-enhanced web corpora `_", Proceedings of the `10th Web as Corpus Workshop (WAC-X) `_, 2016.
@@ -186,8 +185,6 @@ Trafilatura is widely used in the academic domain, chiefly for data acquisition.
 
 Software ecosystem
 ~~~~~~~~~~~~~~~~~~
 
-Case studies and publications are listed on the `Used By documentation page `_.
-
 Jointly developed plugins and additional packages also contribute to the field of web data extraction and analysis:
 
 .. image:: software-ecosystem.png
    :alt: Software ecosystem
    :align: center
    :width: 65%
 
-Corresponding posts on `Bits of Language `_ (blog).
+Corresponding posts can be found on
+`Bits of Language `_.
+The blog covers a range of topics, from technical how-tos and updates on new
+features to discussions on text mining challenges and solutions.
 
 
 Building the docs
@@ -208,6 +208,13 @@ Starting from the ``docs/`` folder of the repository:
 
 
+Changes
+-------
+
+For version history and changes see the `changelog `_.
+
+
+
 Further documentation
 =====================
 
@@ -222,4 +229,4 @@ Further documentation
    used-by
    background
 
-* :ref:`genindex`
+:ref:`genindex`
diff --git a/docs/used-by.rst b/docs/used-by.rst
index a61385e8..021953e8 100644
--- a/docs/used-by.rst
+++ b/docs/used-by.rst
@@ -6,9 +6,9 @@ Uses & citations
 
 Trafilatura is now widely used, integrated into other software packages and cited in research publications. Notable projects and institutional users are listed on this page.
 
-Originally released to collect data for linguistic research and lexicography at the `Berlin-Brandenburg Academy of Sciences `_, Trafilatura is used by numerous institutions, integrated into other software packages and cited in research publications across fields such as linguistics, natural language processing, social sciences, information science, and AI (large language models).
+Initially released to collect data for linguistic research and lexicography at the Berlin-Brandenburg Academy of Sciences, Trafilatura is used by numerous institutions, integrated into other software packages and cited in research publications across fields such as linguistics, natural language processing, computational social science, search engine optimization, information security, and artificial intelligence (large language models).
 
-The tool is recognized for its effectiveness in article extraction, earning accolades as the most efficient open-source library in benchmarks and academic evaluations. It supports language modeling by providing high-quality text data, aids data mining with efficient web data retrieval, and streamlines information extraction from unstructured content. In SEO and business analytics it gathers online data for insights and in information security, it monitors websites for threat detection.
+The tool earns accolades as the most efficient open-source library in benchmarks and academic evaluations. It supports language modeling by providing high-quality text data, aids data mining with efficient web data retrieval, and streamlines information extraction from unstructured content. In SEO and business analytics, it gathers online data for insights; in information security, it monitors websites for threat detection.
 
 If you wish to add further references, please `edit this page `_ and suggest changes by submitting a pull request.
 
@@ -17,8 +17,8 @@ If you wish to add further references, please `edit this page `_
 
 Known institutional users
 ^^^^^^^^^^^^^^^^^^^^^^^^^
 
+- Allen Institute for AI, with the `Dolma corpus `_ used to pre-train the OLMo LLM
 - HuggingFace with `DataTrove `_ to process, filter and deduplicate text data
@@ -34,6 +34,8 @@ Known institutional users
 - Turku University, NLP department with `FinGPT `_ models
 - University of Munich (LMU), Center for Language and Information Processing, `GlotWeb project `_
 
+The Go port `go-trafilatura `_ is used at Microsoft Research.
+
 
 Various software repositories
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
@@ -200,10 +202,3 @@ Publications citing Htmldate
 
 See `citation page of htmldate's documentation `_.
- - -Ports ------ - -Go port - `go-trafilatura `_ diff --git a/pyproject.toml b/pyproject.toml index 0d352adc..0fcb7932 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,6 +44,7 @@ classifiers = [ "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", "Topic :: Internet :: WWW/HTTP", "Topic :: Scientific/Engineering :: Information Analysis", "Topic :: Security", @@ -56,13 +57,13 @@ classifiers = [ ] dependencies = [ "certifi", - "charset_normalizer >= 3.2.0", - "courlan >= 1.3.1", - "htmldate >= 1.9.1", + "charset_normalizer >= 3.4.0", + "courlan >= 1.3.2", + "htmldate >= 1.9.2", "justext >= 3.0.1", # see tests on Github Actions "lxml == 4.9.2 ; platform_system == 'Darwin' and python_version <= '3.8'", - "lxml >= 5.2.2 ; platform_system != 'Darwin' or python_version > '3.8'", + "lxml >= 5.3.0 ; platform_system != 'Darwin' or python_version > '3.8'", "urllib3 >= 1.26, < 3", ] @@ -104,7 +105,7 @@ all = [ "brotli", "cchardet >= 2.1.7; python_version < '3.11'", # build issue "faust-cchardet >= 2.1.19; python_version >= '3.11'", - "htmldate[speed] >= 1.9.1", + "htmldate[speed] >= 1.9.2", "py3langid >= 0.3.0", "pycurl >= 7.45.3", "urllib3[socks]", diff --git a/trafilatura/__init__.py b/trafilatura/__init__.py index 00f81b5a..7017238a 100644 --- a/trafilatura/__init__.py +++ b/trafilatura/__init__.py @@ -7,7 +7,7 @@ __author__ = "Adrien Barbaresi and contributors" __license__ = "Apache-2.0" __copyright__ = "Copyright 2019-present, Adrien Barbaresi" -__version__ = "1.12.2" +__version__ = "2.0.0" import logging
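---

Editor's note: to make the release notes above concrete, here is a minimal usage sketch of the 2.0.0 API. It is not part of the patch: the URL is a placeholder, `output_format` and `with_metadata` are standard `extract()` options rather than anything introduced by this PR, and `fetch_response()` is the replacement the changelog names for the removed `decode` argument of `fetch_url()`.

```python
# Minimal sketch, assuming the documented public API of trafilatura 2.0.0.
# The URL is a placeholder; adjust the extraction options as needed.
import trafilatura
from trafilatura import extract, fetch_url
from trafilatura.downloads import fetch_response

print(trafilatura.__version__)  # should read "2.0.0" once this patch is applied

url = "https://example.org/article.html"  # placeholder URL

# Standard path: download a page as a decoded string and extract the main text.
downloaded = fetch_url(url)
if downloaded is not None:
    print(extract(downloaded, output_format="markdown", with_metadata=True))

# Per the breaking changes above, fetch_url() no longer takes a `decode`
# argument; fetch_response() returns a response object carrying the raw
# payload, which extract() can also process.
response = fetch_response(url)
if response is not None and response.data:
    print(extract(response.data))
```

Running the sketch against a live page should print the installed version, the Markdown output with metadata, and the extraction from the raw response; see the downloads documentation for the exact response attributes in the installed version.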