forked from adbar/trafilatura
-
Notifications
You must be signed in to change notification settings - Fork 0
/
setup.py
112 lines (104 loc) · 4.05 KB
/
setup.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
"""
Scrapes the main text of web pages while preserving some structure
https://github.com/adbar/trafilatura
"""
import re
from pathlib import Path
from setuptools import setup
def get_version(package):
    """Return the package version declared as ``__version__`` in ``__init__.py``.

    Args:
        package: Path of the package directory containing ``__init__.py``.

    Returns:
        The version string of the first ``__version__ = '...'`` assignment
        found in the file.

    Raises:
        RuntimeError: If no ``__version__`` assignment can be found.
        OSError: If ``__init__.py`` cannot be opened or read.
    """
    init_path = Path(package, '__init__.py')
    initfile = init_path.read_text(encoding='utf-8')
    match = re.search(r'__version__ = [\'"]([^\'"]+)[\'"]', initfile)
    # Fail with an explicit message instead of an AttributeError on None.
    if match is None:
        raise RuntimeError(
            'Unable to find a __version__ string in {}'.format(init_path)
        )
    return match.group(1)
def get_long_description():
    """Return the text of ``README.rst``, read as UTF-8.

    The path is relative, so it is resolved against the current working
    directory.
    """
    # Only the README is used; a changelog is not appended.
    return Path('README.rst').read_text(encoding='utf-8')
# Optional dependency groups, kept apart from the core requirements
# (per the original note, this works around some installation problems).
extras = {
    'all': [
        'cchardet >= 2.1.7',
        'htmldate[speed] >= 1.2.3',
        'py3langid >= 0.2.2',
        'pycurl >= 7.45.1',
        'urllib3[brotli]',
    ],
    'gui': [
        'Gooey >= 1.0.1',
    ],
}
# Package metadata and build configuration handed to setuptools.
setup(
    name='trafilatura',
    # Version is parsed from trafilatura/__init__.py rather than duplicated here.
    version=get_version('trafilatura'),
    description='Web scraping library and command-line tool for text discovery and retrieval. Downloads web pages, scrapes main text and comments while preserving some structure, and converts to TXT, CSV, JSON and XML',
    long_description=get_long_description(),
    # The long description comes from README.rst, so declare its markup explicitly.
    long_description_content_type='text/x-rst',
    classifiers=[
        # As from https://pypi.python.org/pypi?%3Aaction=list_classifiers
        'Development Status :: 5 - Production/Stable',
        #'Development Status :: 6 - Mature',
        'Environment :: Console',
        'Intended Audience :: Developers',
        'Intended Audience :: Education',
        'Intended Audience :: Information Technology',
        'Intended Audience :: Science/Research',
        'License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)',
        'Operating System :: MacOS',
        'Operating System :: Microsoft',
        'Operating System :: POSIX',
        'Programming Language :: Python',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.6',
        'Programming Language :: Python :: 3.7',
        'Programming Language :: Python :: 3.8',
        'Programming Language :: Python :: 3.9',
        'Programming Language :: Python :: 3.10',
        #'Programming Language :: Python :: 3.11',
        'Topic :: Internet :: WWW/HTTP',
        'Topic :: Scientific/Engineering :: Information Analysis',
        'Topic :: Security',
        'Topic :: Text Editors :: Text Processing',
        'Topic :: Text Processing :: Linguistic',
        'Topic :: Text Processing :: Markup :: HTML',
        'Topic :: Text Processing :: Markup :: Markdown',
        'Topic :: Text Processing :: Markup :: XML',
        'Topic :: Utilities',
    ],
    keywords=[
        'corpus',
        'html2text',
        'news-crawler',
        'natural-language-processing',
        'scraper',
        'tei-xml',
        'text-extraction',
        'webscraping',
        'web-scraping',
    ],
    url='https://trafilatura.readthedocs.io',
    project_urls={
        "Documentation": "https://trafilatura.readthedocs.io",
        "Source": "https://github.com/adbar/trafilatura",
        "Blog": "https://adrien.barbaresi.eu/blog/tag/trafilatura.html",
    },
    author='Adrien Barbaresi',
    # NOTE(review): this value looks like an obfuscation placeholder rather
    # than a real address -- confirm against the upstream repository.
    author_email='[email protected]',
    license='GPLv3+',
    packages=['trafilatura'],
    # Non-Python files shipped inside the package.
    package_data={
        'trafilatura': [
            'data/tei-schema-pickle.lzma',
            'data/jt-stopwords-pickle.lzma',
            'settings.cfg',
        ],
    },
    include_package_data=True,
    python_requires='>=3.6',
    install_requires=[
        'certifi',
        'charset_normalizer >= 2.1.0',
        'courlan >= 0.7.2',
        'htmldate >= 1.2.3',
        'justext >= 3.0.0',
        'lxml >= 4.6.4',
        'urllib3 >= 1.26, < 2',
    ],
    extras_require=extras,
    # Command-line entry points installed with the package.
    entry_points={
        'console_scripts': [
            'trafilatura=trafilatura.cli:main',
            'trafilatura_gui=trafilatura.gui:main',
        ],
    },
    # platforms='any',
    tests_require=['pytest'],
    zip_safe=False,
)