# project.yml - spaCy project configuration for the TEI Publisher NER service
# Human-readable project metadata.
title: 'TEI Publisher NER'
description: 'TEI Publisher Named Entity Recognition API and Support Scripts'
# Project-wide variables. Reference any of them elsewhere in this file as
# ${vars.var_name}.
vars:
  # Base name of the pipeline: names the directory under models/ and the
  # generated package
  name: 'ner_demo'
  # Port passed to the "serve" command
  port: 8001
  # Language code passed to the convert script and to "spacy init config"
  lang: 'de'
  # Base URL of the running TEI Publisher instance
  teipublisher: 'http://localhost:8080/exist/apps/tei-publisher'
  # Collection that holds the training data, given as a path relative to
  # TEI Publisher's data collection
  training_collection: 'training'
  # Version string passed to "spacy package"
  version: '0.0.0'
  # Optimization target for the generated config: "efficiency" or "accuracy"
  optimize: 'efficiency'
  # ID of the GPU to use; -1 means CPU
  gpu_id: -1
  # Model providing the vectors for "train-with-vectors" (fetched by "download")
  vectors_model: 'de_core_news_md'
  # Existing pipeline handed to "create-config-update"
  pipeline: 'de_core_news_sm'
# Directories required by the project. The project CLI makes sure that they
# always exist.
directories:
- 'corpus'
- 'configs'
- 'scripts'
- 'packages'
- 'models'
# Workflows are named sequences of the commands defined further down, executed
# in the listed order. Run one via "spacy project run [workflow]". A command
# whose inputs/outputs haven't changed is not re-run.
workflows:
  # Same steps as "all", but converts with the debugging variant of "convert".
  dev:
  - 'clean'
  - 'convert.debug'
  - 'create-config'
  - 'train'
  all:
  - 'clean'
  - 'convert'
  - 'create-config'
  - 'train'
  # - train-with-vectors
  # - evaluate
# Project commands, specified in a style similar to CI config files (e.g. Azure
# pipelines). The name is the command name that lets you trigger the command
# via "spacy project run [command] [path]". The help message is optional and
# shown when executing "spacy project run [optional command] [path] --help".
# Besides "script", an entry may list "deps" (files the command takes as
# input) and "outputs" (files it produces).
commands:
# Runs scripts/cleanup.py on this project's model directory.
- name: 'clean'
  help: 'Remove auxiliary files and directories'
  script:
  - 'python scripts/cleanup.py models/${vars.name}'
# Same cleanup script as "clean", invoked with the additional --all flag.
# NOTE(review): the help text is identical to "clean"; consider describing
# what --all removes in addition.
- name: 'cleanall'
  help: 'Remove auxiliary files and directories'
  script:
  - 'python scripts/cleanup.py models/${vars.name} --all'
# Fetches the model named by vars.vectors_model with "spacy download".
- name: 'download'
  help: 'Download a spaCy model with pretrained vectors'
  script:
  - 'python -m spacy download ${vars.vectors_model}'
# Runs scripts/convert.py with the TEI Publisher NLP data URL of the training
# collection; the train and dev corpora are the declared outputs.
- name: 'convert'
  help: "Convert the data to spaCy's binary format"
  script:
  # Folded scalar: the lines below are joined with single spaces into one
  # command line.
  - >-
    python scripts/convert.py
    --verbose ${vars.lang}
    --url ${vars.teipublisher}/api/nlp/data/${vars.training_collection}
    corpus/train.spacy
    corpus/dev.spacy
  deps:
  - 'scripts/convert.py'
  outputs:
  - 'corpus/train.spacy'
  - 'corpus/dev.spacy'
# Variant of "convert" that additionally passes --debug to scripts/convert.py.
- name: 'convert.debug'
  help: "Convert the data to spaCy's binary format (output input json for debugging)"
  script:
  # Folded scalar: the lines below are joined with single spaces into one
  # command line.
  - >-
    python scripts/convert.py
    --verbose ${vars.lang}
    --debug
    --url ${vars.teipublisher}/api/nlp/data/${vars.training_collection}
    corpus/train.spacy
    corpus/dev.spacy
  deps:
  - 'scripts/convert.py'
  outputs:
  - 'corpus/train.spacy'
  - 'corpus/dev.spacy'
# Runs "spacy debug data" on the generated config with the train/dev corpora.
# Declares no deps or outputs.
- name: 'check'
  help: 'Check the created training data sets'
  script:
  - >-
    python -m spacy debug data configs/config.cfg
    --paths.train corpus/train.spacy
    --paths.dev corpus/dev.spacy
# Generates configs/config.cfg with "spacy init config" for an NER-only
# pipeline; --force is passed, so an existing config is regenerated.
- name: 'create-config'
  help: 'Create a new config with an NER pipeline component'
  script:
  - >-
    python -m spacy init config
    --lang ${vars.lang}
    --pipeline ner
    configs/config.cfg
    --force
    --optimize ${vars.optimize}
  outputs:
  - 'configs/config.cfg'
# Alternative to "create-config": hands the existing pipeline named by
# vars.pipeline to scripts/create-config.py, which writes configs/config.cfg.
- name: 'create-config-update'
  help: 'Create a config, which updates the NER component of an existing pipeline, but keeps all other components'
  script:
  - 'python scripts/create-config.py ${vars.pipeline} ner configs/config.cfg'
  deps:
  - 'scripts/create-config.py'
  outputs:
  - 'configs/config.cfg'
# Trains from configs/config.cfg on the converted corpora and writes the
# result below models/<name>.
- name: 'train'
  help: 'Train the NER model'
  script:
  # Folded scalar: the lines below are joined with single spaces into one
  # command line.
  - >-
    python -m spacy train configs/config.cfg
    --output models/${vars.name}
    --paths.train corpus/train.spacy
    --paths.dev corpus/dev.spacy
    --training.eval_frequency 10
    --training.patience 50
    --gpu-id ${vars.gpu_id}
  deps:
  - 'configs/config.cfg'
  - 'corpus/train.spacy'
  - 'corpus/dev.spacy'
  outputs:
  - 'models/${vars.name}'
# Same invocation as "train", plus the vectors model for initialization and
# static vectors switched on for the tok2vec embedding layer.
- name: 'train-with-vectors'
  help: 'Train the NER model with vectors'
  script:
  # Folded scalar: the lines below are joined with single spaces into one
  # command line.
  - >-
    python -m spacy train configs/config.cfg
    --output models/${vars.name}
    --paths.train corpus/train.spacy
    --paths.dev corpus/dev.spacy
    --training.eval_frequency 10
    --training.patience 50
    --gpu-id ${vars.gpu_id}
    --initialize.vectors ${vars.vectors_model}
    --components.tok2vec.model.embed.include_static_vectors true
  deps:
  - 'configs/config.cfg'
  - 'corpus/train.spacy'
  - 'corpus/dev.spacy'
  outputs:
  - 'models/${vars.name}'
# Evaluates the best trained model on the dev corpus and writes the metrics
# as JSON next to the model.
- name: "evaluate"
  help: "Evaluate the model and export metrics"
  script:
  - "python -m spacy evaluate models/${vars.name}/model-best corpus/dev.spacy --output models/${vars.name}/metrics.json"
  deps:
  - "corpus/dev.spacy"
  - "models/${vars.name}/model-best"
  outputs:
  # Must be the same path as the --output argument above. The previous value
  # ("training/metrics.json") named a file that no command creates, so the
  # metrics file actually written was not tracked as an output.
  - "models/${vars.name}/metrics.json"
# Builds a package from the best trained model into packages/. --force is
# passed to "spacy package".
- name: 'package'
  help: 'Package the trained model as a pip package'
  script:
  - >-
    python -m spacy package models/${vars.name}/model-best packages
    --name ${vars.name}
    --version ${vars.version}
    --force
  deps:
  - 'models/${vars.name}/model-best'
  outputs_no_cache:
  - 'packages/${vars.lang}_${vars.name}-${vars.version}/dist/${vars.lang}_${vars.name}-${vars.version}.tar.gz'
# Launches scripts/visualize_model.py through Streamlit with the best trained
# model and a sample sentence as arguments.
- name: 'visualize-model'
  help: "Visualize the model's output interactively using Streamlit"
  script:
  # Folded scalar: joined with single spaces; the double quotes around the
  # sample sentence are literal characters of the command.
  - >-
    python -m streamlit run scripts/visualize_model.py
    models/${vars.name}/model-best
    "I saw Shaka Khan in London."
  deps:
  - 'scripts/visualize_model.py'
  - 'models/${vars.name}/model-best'
# Starts the API app (scripts.main:app) under uvicorn on vars.port.
# NOTE(review): --reload and --host 0.0.0.0 look like development settings;
# confirm they are intended wherever this service is exposed.
- name: 'serve'
  help: 'Run the NER API as a service to be accessed by TEI Publisher'
  script:
  - >-
    python -m uvicorn scripts.main:app
    --reload
    --host 0.0.0.0
    --port ${vars.port}