-
Notifications
You must be signed in to change notification settings - Fork 5
/
pyproject.toml
73 lines (68 loc) · 2.29 KB
/
pyproject.toml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
# PEP 517/518 build configuration: the project is built with Poetry's
# standalone build backend. The lower bound keeps build frontends from
# selecting a pre-1.0 poetry-core release.
[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"
# Project metadata and packaging layout read by Poetry.
[tool.poetry]
name = "web-content-extraction-benchmark"
version = "1.0.0"
description = "Reproduction study on web content extraction."
authors = ["Janek Bevendorff", "Sanket Gupta"]
homepage = "https://webis.de"
repository = "https://github.com/chatnoir-eu/web-content-extraction-benchmark.git"
readme = "README.md"
license = "Apache-2.0"
# The importable package `extraction_benchmark` is located under `src/`.
packages = [
    { include = "extraction_benchmark", from = "src" },
]
# Extra files listed explicitly for inclusion in the built distribution:
# the go_domdistiller_cli file and the BoilerNet model/words/tags files.
include = [
    "src/extraction_benchmark/extractors/go_domdistiller/go_domdistiller_cli",
    "src/extraction_benchmark/extractors/boilernet/model.h5",
    "src/extraction_benchmark/extractors/boilernet/words.json",
    "src/extraction_benchmark/extractors/boilernet/tags.json",
]
# Runtime dependencies of the benchmark, grouped by purpose.
[tool.poetry.dependencies]
python = ">=3.9,<3.12"

# Not needed directly; listed only to fix Poetry's slow dependency resolution.
awscli = ">=1.27.78"

click = "^8.1.3"
cython = "^0.29.33"
jinja2 = "^3.1.2"
lz4 = "^4.3.2"
matplotlib = "^3.7.0"
tensorflow = "^2.11.0"
pandas = "^1.5.3"
levenshtein = "^0.20.9"
openpyxl = "^3.1.1"
rouge-score = "^0.1.2"
scikit-learn = "^1.2.1"

# Main content / HTML extractors
boilerpipe3 = "~1.3"
goose3 = "~3.1.13"
html-text = "~0.5.2"
inscriptis = "~2.3.2"
justext = "~3.0.0"
lxml = "~4.9.2"
news-please = "~1.5.22"
newspaper3k = "~0.2.8"
readability-lxml = "~0.8.1"
resiliparse = "~0.14.7"
trafilatura = "~1.4.1"

# Dragnet and ExtractNet are installed from local wheels because their
# upstream versions have incompatible dependencies. Each entry selects the
# wheel matching one Python minor version; the wheels are linux_x86_64 builds.
dragnet = [
    { path = "third-party/dragnet-2.0.4-cp39-cp39-linux_x86_64.whl", python = "~3.9" },
    { path = "third-party/dragnet-2.0.4-cp310-cp310-linux_x86_64.whl", python = "~3.10" },
    { path = "third-party/dragnet-2.0.4-cp311-cp311-linux_x86_64.whl", python = "~3.11" },
]
extractnet = [
    { path = "third-party/extractnet-2.0.7-cp39-cp39-linux_x86_64.whl", python = "~3.9" },
    { path = "third-party/extractnet-2.0.7-cp310-cp310-linux_x86_64.whl", python = "~3.10" },
    # No Python 3.11 entry: extractnet depends on cchardet, which cannot be
    # built with Python 3.11 and above.
    # {path = "third-party/extractnet-2.0.7-cp311-cp311-linux_x86_64.whl", python = "~3.11"},
]

# BoilerNet dependencies
html5lib = "^1.1"
numpy = "^1.24.2"
tqdm = "^4.64.1"
# Console script: installs a `wceb` command that invokes `main` from the
# `extraction_benchmark.wceb` module.
[tool.poetry.scripts]
wceb = "extraction_benchmark.wceb:main"