From d3a89c7bbcbc8cbf19b82c8b18483371b0830306 Mon Sep 17 00:00:00 2001 From: akb89 Date: Sun, 26 Aug 2018 11:49:39 -0400 Subject: [PATCH 01/15] Preparing v1.1.2: fixed gitlab URL and updated README --- README.md | 7 +++++++ REPLICATION.md | 37 +++++++++++++++++++++++++++++++++++++ setup.py | 4 ++-- 3 files changed, 46 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 824fe18..6f2f248 100644 --- a/README.md +++ b/README.md @@ -168,6 +168,13 @@ pyfn generate \ ``` To also process exemplars, add the `--with_exemplars` option +### Using preprocessing and frame semantic parsing scripts +We created a set of bash scripts to preprocess FrameNet data with various +POS taggers and dependency parsers as well as to run the `SIMPLEFRAMEID`, +`SEMAFOR` and `OPEN-SESAME` frame semantic parsers. +Check out [REPLICATION.md](REPLICATION.md) for a detailed HowTo manual. + + [release-image]:https://img.shields.io/github/release/akb89/pyfn.svg?style=flat-square [release-url]:https://github.com/akb89/pyfn/releases/latest [pypi-image]:https://img.shields.io/pypi/v/pyfn.svg?style=flat-square diff --git a/REPLICATION.md b/REPLICATION.md index fe8ffe5..6242d8c 100644 --- a/REPLICATION.md +++ b/REPLICATION.md @@ -57,6 +57,8 @@ directory named `pyfn`. Your pyfn folder structure should look like: | | |-- ... ``` +**Please strictly follow this directory structure to avoid unexpected errors. `pyfn` relies on a lot of relative path resolutions to make scripts calls shorter, and changing this directory structure can brake everything** + ## Install ``` pip3 install pyfn @@ -117,6 +119,41 @@ butterfly effects in frame semantic parsing. Those instructions can be used to compare the performances of different frame semantic parsers in various experimental setups. +## Scripts +Each script comes with a helper: check it out with `--help`! + +We have made some opinionated choices on how to use the preprocessing and +frame semantic parsing bash scripts. 
Those choices are primarly motivated +by constraints on running *many* experiences at once, in various experimental +setups, and on having commands that are easy to *type*. + +The main choice lies in the directory structure. Each script in the `scripts` +directory expects an XP_DIR argument that specifies the experiments ID. +When you specify: + +``` +./prepare.sh -x 042 ... +``` + +or + +``` +./preprocess.sh -x 042 +``` + +The scripts expects the data to process to be located under `.../experiments/xp_042/data` where the `experiments` dir is at the same level +as the `scripts` dir. + +### prepare.sh + +### preprocess.sh + +### semafor.sh + +### open-sesame.sh + + + ## Citation If you use pyfn please cite: ```tex diff --git a/setup.py b/setup.py index 83d8e34..db7367e 100644 --- a/setup.py +++ b/setup.py @@ -16,8 +16,8 @@ author_email='akb@3azouz.net', long_description=long_description, long_description_content_type='text/markdown', - version='1.1.0', - url='https://gitlab.unige.ch/akb/pyfn', + version='1.1.2', + url='https://gitlab.com/akb89/pyfn', download_url='https://pypi.org/project/pyfn/#files', license='MIT', keywords=['framenet', 'xml', 'marshalling', 'unmarshalling'], From 26d3b40fbeba4fd5ad0828b2a741ff5a75cdf959 Mon Sep 17 00:00:00 2001 From: akb89 Date: Sun, 26 Aug 2018 18:53:38 -0400 Subject: [PATCH 02/15] Fixed #5, updated README for clarity following #4 and updated dependencies --- README.md | 7 ++----- pyfn/main.py | 3 ++- pyfn/utils/files.py | 2 +- setup.py | 2 +- 4 files changed, 6 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 6f2f248..2e8dc65 100644 --- a/README.md +++ b/README.md @@ -74,10 +74,7 @@ For an exhaustive description of all formats, check out [FORMAT.md](FORMAT.md). ## Conversion HowTo The following sections provide examples of commands to convert FN data to and from different formats. All commands can make use of the following options: -1. `--splits`: specify which splits should be converted. 
Use `--splits dev` -to only process dev and test splits and guarantee no overlap between -dev and test. Use `--splits train` to process train dev and test splits and -guarantee no overlap across splits. Default to `--splits test`. +1. `--splits`: specify which splits should be converted. `--splits train` will generate all train/dev/test splits, according to data found under the fndata-1.x/{train/dev/test} directories. `--splits dev` will generate the dev and test splits according to data found under the fndata-1.x/{dev/test} directories. This option will skip the train splits but generate the same dev/test splits that would have been generated with `--splits train`. `--splits test` will generate the test splits according to data found under the fndata-1.x/test directory, and skip the train/dev splits. The test splits generated with `--splits test` will be the same as those generated with the `--splits train` and `--splits dev`. Default to `--splits test`. 2. `--output_sentences`: if specified, will output a `.sentences` file in the process, containing all raw annotated sentences, one sentence per line. 3. `--with_exemplars`: if specified, will process the exemplars (data under @@ -172,7 +169,7 @@ To also process exemplars, add the `--with_exemplars` option We created a set of bash scripts to preprocess FrameNet data with various POS taggers and dependency parsers as well as to run the `SIMPLEFRAMEID`, `SEMAFOR` and `OPEN-SESAME` frame semantic parsers. -Check out [REPLICATION.md](REPLICATION.md) for a detailed HowTo manual. +Check out [REPLICATION.md](REPLICATION.md) for a detailed HowTo manual. [release-image]:https://img.shields.io/github/release/akb89/pyfn.svg?style=flat-square diff --git a/pyfn/main.py b/pyfn/main.py index aeba3e6..7504715 100644 --- a/pyfn/main.py +++ b/pyfn/main.py @@ -51,7 +51,6 @@ def _convert(args): raise InvalidParameterError( 'Source and Target paths are the same! 
Please specify different ' 'source/target paths') - # TODO: add validation for input directory structure if args.source_format == 'fnxml': annosets_dict = fnxml.get_annosets_dict(args.source_path, args.splits, @@ -70,6 +69,8 @@ def _convert(args): 'need to specify the --sent parameter pointing at the ' '.sentences file absolute filepath') annosets = semaforu.unmarshall_annosets(args.source_path, args.sent) + ## Starting marshalling + os.makedirs(args.target_path, exist_ok=True) if args.target_format == 'bios': biosm.marshall_annosets_dict(annosets_dict, args.target_path, args.filter, args.output_sentences, diff --git a/pyfn/utils/files.py b/pyfn/utils/files.py index 3d2789d..81d72a3 100644 --- a/pyfn/utils/files.py +++ b/pyfn/utils/files.py @@ -73,5 +73,5 @@ def get_rolemappings_filepath(target_dirpath): def get_fr_relation_xml_filepath(splits_dirpath): - """Return the absolute path to the frRelation.xl file given splits_dirpath.""" + """Return the absolute path to the frRelation.xml file given splits_dirpath.""" return os.path.join(splits_dirpath, 'frRelation.xml') diff --git a/setup.py b/setup.py index db7367e..467c7a0 100644 --- a/setup.py +++ b/setup.py @@ -37,7 +37,7 @@ 'pyfn = pyfn.main:main' ], }, - tests_require=['pytest==3.7.2', 'pylint==2.1.1', 'pytest-cov==2.5.1', + tests_require=['pytest==3.7.3', 'pylint==2.1.1', 'pytest-cov==2.5.1', 'pydocstyle==2.1.1'], install_requires=['PyYAML==3.13', 'mmh3==2.5.1', 'lxml==4.2.4', 'pytz==2018.5'], From 854fdf40eeeab16b24807ded694ebb9917c36269 Mon Sep 17 00:00:00 2001 From: akb89 Date: Sun, 26 Aug 2018 19:01:30 -0400 Subject: [PATCH 03/15] Fixed link to pypi badge --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2e8dc65..a476a85 100644 --- a/README.md +++ b/README.md @@ -175,7 +175,7 @@ Check out [REPLICATION.md](REPLICATION.md) for a detailed HowTo manual. 
[release-image]:https://img.shields.io/github/release/akb89/pyfn.svg?style=flat-square [release-url]:https://github.com/akb89/pyfn/releases/latest [pypi-image]:https://img.shields.io/pypi/v/pyfn.svg?style=flat-square -[pypi-url]:https://github.com/akb89/pyfn/releases/latest +[pypi-url]:https://pypi.org/project/pyfn/ [build-image]:https://gitlab.com/akb89/pyfn/badges/master/pipeline.svg [build-url]:https://gitlab.com/akb89/pyfn/commits/master [coverage-image]:https://img.shields.io/coveralls/akb89/pyfn/master.svg?style=flat-square From 57a408a8febb8f30d3b7eee3d866d92fe04d93b4 Mon Sep 17 00:00:00 2001 From: akb89 Date: Thu, 30 Aug 2018 12:24:23 -0400 Subject: [PATCH 04/15] Updated pytest dependency --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 467c7a0..332d8f2 100644 --- a/setup.py +++ b/setup.py @@ -37,7 +37,7 @@ 'pyfn = pyfn.main:main' ], }, - tests_require=['pytest==3.7.3', 'pylint==2.1.1', 'pytest-cov==2.5.1', + tests_require=['pytest==3.7.4', 'pylint==2.1.1', 'pytest-cov==2.5.1', 'pydocstyle==2.1.1'], install_requires=['PyYAML==3.13', 'mmh3==2.5.1', 'lxml==4.2.4', 'pytz==2018.5'], From 8b5592088307dce75afb1cda892df2410a0253da Mon Sep 17 00:00:00 2001 From: akb89 Date: Thu, 30 Aug 2018 12:24:40 -0400 Subject: [PATCH 05/15] Updated helper description --- scripts/score.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/score.sh b/scripts/score.sh index 6660141..1bbe612 100755 --- a/scripts/score.sh +++ b/scripts/score.sh @@ -5,7 +5,7 @@ source "$(dirname "${BASH_SOURCE[0]}")/setup.sh" show_help() { cat << EOF Usage: ${0##*/} [-h] -x XP_NUM -p {semafor,open-sesame} -s {dev,test} -f {gold,predicted} -Score frame semantic parsing with the SEMEVAL scoring scripts modified by Kshirsagar et al. (2015). +Score frame semantic parsing with a modified version of the SEMEVAL scoring script. -h, --help display this help and exit -x, --xp XP_NUM xp number written as 3 digits (e.g. 
001) From fad78c2b467bbf51adaccd3f7ec5ecb9e72eda16 Mon Sep 17 00:00:00 2001 From: akb89 Date: Thu, 30 Aug 2018 12:24:54 -0400 Subject: [PATCH 06/15] Refactored README with scripts documentation --- README.md | 452 ++++++++++++++++++++++++++++++++++++++++++++++--- REPLICATION.md | 172 ------------------- 2 files changed, 430 insertions(+), 194 deletions(-) delete mode 100644 REPLICATION.md diff --git a/README.md b/README.md index a476a85..acbf04d 100644 --- a/README.md +++ b/README.md @@ -9,15 +9,14 @@ Welcome to **pyfn**, a Python modules to process FrameNet annotation. -pyfn can be used to convert data to and from: -- FRAMENET XML: the format of the released FrameNet XML data -- SEMEVAL XML: the format of the SEMEVAL 2007 shared task 19 on frame semantic structure extraction -- SEMAFOR CoNLL: the format used by the SEMAFOR parser -- BIOS: the format used by the OPEN-SESAME parser -- CoNLL-X: the format used by various state-of-the-art POS taggers and dependency parsers (see preprocessing considerations for frame semantic parsing in [REPLICATION.md](REPLICATION.md)) +pyfn can be used to: -As well as to generate the `.csv` hierarchy files used by both SEMAFOR and -OPEN-SESAME parsers to integrate the hierarchy feature (see (Kshirsagar et al., 2015) for details). +1. [convert]() data to and from FRAMENET XML, SEMEVAL XML, SEMAFOR CoNLL, BIOS and +CoNLL-X +2. [preprocess]() FrameNet data using a standardized state-of-the-art pipeline +3. [run]() the SEMAFOR and OPEN-SESAME frame semantic parsers +4. 
[build]() your own frame semantic parser using a standard set of python models +to marshall/unmarshall FrameNet XML data This repository also accompanies the (Kabbach et al., 2018) paper: @@ -36,11 +35,6 @@ This repository also accompanies the (Kabbach et al., 2018) paper: } ``` -To use `pyfn` to replicate frame semantic parsing results for SEMAFOR, -OPEN-SESAME and SIMPLEFRAMEID on a common preprocessing pipeline, -or to replicate results reported in (Kabbach et al., 2018), -check out [REPLICATION.md](REPLICATION.md). - ## Dependencies On Unix, you may need to install the following packages: ``` @@ -68,13 +62,32 @@ When using pyfn, your FrameNet splits directory structure should follow: | | |-- lu ``` -## Formats +## Conversion + +pyfn can be used to convert data to and from: +- FRAMENET XML: the format of the released FrameNet XML data +- SEMEVAL XML: the format of the SEMEVAL 2007 shared task 19 on frame semantic structure extraction +- SEMAFOR CoNLL: the format used by the SEMAFOR parser +- BIOS: the format used by the OPEN-SESAME parser +- CoNLL-X: the format used by various state-of-the-art POS taggers and dependency parsers (see preprocessing considerations for frame semantic parsing in [below](#preprocessing-and-frame-semantic-parsing)) + +As well as to generate the `.csv` hierarchy files used by both SEMAFOR and +OPEN-SESAME parsers to integrate the hierarchy feature (see (Kshirsagar et al., 2015) for details). + For an exhaustive description of all formats, check out [FORMAT.md](FORMAT.md). -## Conversion HowTo +### HowTo + The following sections provide examples of commands to convert FN data to and from different formats. All commands can make use of the following options: -1. `--splits`: specify which splits should be converted. `--splits train` will generate all train/dev/test splits, according to data found under the fndata-1.x/{train/dev/test} directories. 
`--splits dev` will generate the dev and test splits according to data found under the fndata-1.x/{dev/test} directories. This option will skip the train splits but generate the same dev/test splits that would have been generated with `--splits train`. `--splits test` will generate the test splits according to data found under the fndata-1.x/test directory, and skip the train/dev splits. The test splits generated with `--splits test` will be the same as those generated with the `--splits train` and `--splits dev`. Default to `--splits test`. +1. `--splits`: specify which splits should be converted. `--splits train` will generate all +train/dev/test splits, according to data found under the fndata-1.x/{train/dev/test} +directories. `--splits dev` will generate the dev and test splits according to data found under +the fndata-1.x/{dev/test} directories. This option will skip the train splits but generate the +same dev/test splits that would have been generated with `--splits train`. `--splits test` will +generate the test splits according to data found under the fndata-1.x/test directory, and skip +the train/dev splits. The test splits generated with `--splits test` will be the same as those +generated with the `--splits train` and `--splits dev`. Default to `--splits test`. 2. `--output_sentences`: if specified, will output a `.sentences` file in the process, containing all raw annotated sentences, one sentence per line. 3. 
`--with_exemplars`: if specified, will process the exemplars (data under @@ -85,7 +98,7 @@ For details on `pyfn` usage, do: ```bash pyfn --help pyfn generate --help -pyfn convert --help +pyfn convert --help ``` ### From FN XML to BIOS @@ -165,11 +178,406 @@ pyfn generate \ ``` To also process exemplars, add the `--with_exemplars` option -### Using preprocessing and frame semantic parsing scripts -We created a set of bash scripts to preprocess FrameNet data with various -POS taggers and dependency parsers as well as to run the `SIMPLEFRAMEID`, -`SEMAFOR` and `OPEN-SESAME` frame semantic parsers. -Check out [REPLICATION.md](REPLICATION.md) for a detailed HowTo manual. + +## Preprocessing and Frame Semantic Parsing +pyfn ships with a set of bash scripts to preprocess FrameNet data with +various POS taggers and dependency parsers, as well as to perform frame +semantic parsing with a variety of open-source parsers. + +Currently supported POS taggers include: +- MXPOST (Ratnaparkhi, 1996) +- NLP4J (Choi, 2016) + +Currently supported dependency parsers include: +- MST (McDonald et al., 2006) +- BIST BARCH (Kiperwasser and Goldberg, 2016) +- BIST BMST (Kiperwasser and Goldberg, 2016) + +Currently supported frame semantic parsers include: +- SIMPLEFRAMEID (Hartmann et al., 2017) for frame identification +- SEMAFOR (Kshirsagar et al., 2015) for argument identification +- OPEN-SESAME (Swayamdipta et al., 2017) for argument identification + +To request support for a POS tagger, a dependency parser or a frame semantic +parser, please create an [issue](https://github.com/akb89/pyfn/issues). + +### Download +To run the preprocessing and frame semantic parsing scripts, first download: +- [data.7z](https://github.com/akb89/pyfn/releases/download/v1.0.0/data.7z) containing all the FrameNet splits for FN 1.5 and FN 1.7 +- [lib.7z](https://github.com/akb89/pyfn/releases/download/v1.0.0/lib.7z) containing all the different external software (taggers, parsers, etc.) 
+- [resources.7z](https://github.com/akb89/pyfn/releases/download/v1.0.0/resources.7z) containing all the required resources +- [scripts.7z](https://github.com/akb89/pyfn/releases/download/v1.0.0/scripts.7z) containing the set of bash scripts to call the different parsers and preprocessing toolkits + +Extract the content of all the archives under a +directory named `pyfn`. Your pyfn folder structure should look like: +``` +. +|-- pyfn +| |-- data +| | |-- fndata-1.5-with-dev +| | |-- fndata-1.7-with-dev +| |-- lib +| | |-- bistparser +| | |-- jmx +| | |-- mstparser +| | |-- nlp4j +| | |-- open-sesame +| | |-- semafor +| | |-- semeval +| |-- resources +| | |-- bestarchybrid.model +| | |-- bestarchybrid.params +| | |-- bestfirstorder.model +| | |-- bestfirstorder.params +| | |-- config-decode-pos.xml +| | |-- nlp4j.plemma.model.all.xz +| | |-- sskip.100.vectors +| | |-- wsj.model +| |-- scripts +| | |-- CoNLLizer.py +| | |-- deparse.sh +| | |-- flatten.sh +| | |-- ... +``` + +**Please strictly follow this directory structure to avoid unexpected errors. `pyfn` relies on a lot of relative path resolutions to make script calls shorter, and changing this directory structure can break everything** + +### Setup NLP4J for POS tagging + +To use NLP4J for POS tagging, modify the `resources/config-decode-pos.xml` +file by replacing the models.pos absolute path to +your `resources/nlp4j.plemma.model.all.xz`: +```xml + + ... 
+ + /absolute/path/to/pyfn/resources/nlp4j.plemma.model.all.xz + +``` + +### Setup DyNET for BIST or OPEN-SESAME + +If you intend to use the BIST parser for dependency parsing or +OPEN-SESAME for frame semantic parsing, you will need +to install DyNET 2.0.2 following: +``` +https://dynet.readthedocs.io/en/2.0.2/python.html +``` + +### Setup SEMAFOR +To use the SEMAFOR frame semantic parser, modify the `scripts/setup.sh` file: +```bash +# SEMAFOR options to be changed according to your env +export JAVA_HOME_BIN="/abs/path/to/java/jdk/bin" +export num_threads=2 # number of threads to use +export min_ram=4g # min RAM allocated to the JVM in GB. Corresponds to the -Xms argument +export max_ram=8g # max RAM allocated to the JVM in GB. Corresponds to the -Xmx argument + +# SEMAFOR hyperparameters +export kbest=1 # keep k-best parse +export lambda=0.000001 # hyperparameter for argument identification. Refer to Kshirsagar et al. (2015) for details. +export batch_size=4000 # number of batches processed at once for argument identification. +export save_every_k_batches=400 # for argument identification +export num_models_to_save=60 # for argument identification +``` + +### Using the SEMEVAL PERL evaluation scripts + +If you intend to use the SEMEVAL perl evaluation scripts, make sure +to have the `App::cpanminus` and `XML::Parser` modules installed: +``` +cpan App::cpanminus +cpanm XML::Parser +``` + +### Using bash scripts + +Each script comes with a helper: check it out with `--help`! + +**Careful!** most scripts expect data output by `pyfn convert ...` +to be located under `pyfn/experiments/xp_XYZ/data` where `XYZ` stands for +the experiments number and is specified using the `-x XYZ` argument, and where +the `experiments` directory is located at the same level as the `scripts` +directory. This opinionated choice has proven extremely useful in launching +scripts by batch on a large set of experiments as it avoids having to input +the full path each time. 
+ +**Make sure to use** + +```bash +pyfn convert \ + --from ... \ + --to ... \ + --source ... \ + --target /abs/path/to/pyfn/experiments/xp_XYZ/data \ + --splits ... +``` + +**BEFORE** calling `preprocess.sh`, `prepare.sh`, `semafor.sh` or +`open-sesame.sh` + +### preprocess.sh + +Use `preprocess.sh` to POS-tag and dependency-parse FrameNet splits generated +with `pyfn convert ...`. The helper should display: + +```shell +Usage: ${0##*/} [-h] -x XP_NUM -t {mxpost,nlp4j} -p {semafor,open-sesame} [-d {mst,bmst,barch}] [-v] +Preprocess FrameNet train/dev/test splits. + + -h, --help display this help and exit + -x, --xp XP_NUM xp number written as 3 digits (e.g. 001) + -t, --tagger {mxpost,nlp4j} pos tagger to be used: 'mxpost' or 'nlp4j' + -p, --parser {semafor,open-sesame} frame semantic parser to be used: 'semafor' or 'open-sesame' + -d, --dep {mst,bmst,barch} dependency parser to be used: 'mst', 'bmst' or 'barch' + -v, --dev if set, script will also preprocess dev splits +``` + +Suppose you generated FrameNet splits for SEMAFOR using: + +```bash +pyfn convert \ + --from fnxml \ + --to semafor \ + --source /path/to/fndata-1.7-with-dev \ + --target /path/to/experiments/xp_001/data \ + --splits train \ + --output_sentences +``` + +You can preprocess those splits with NLP4J and BMST using + +```bash +./preprocess.sh -x 001 -t nlp4j -d bmst -p semafor +``` + +### prepare.sh + +Use `prepare.sh` to automatically generate misc. data required by the +frame semantic parsing pipeline, such as gold SEMEVAL XML files for scoring, +the `framenet.frame.element.map` and the hierarchy `.csv` files +used by SEMAFOR, or the `frames.xml` and `frRelations.xml` files used by +both SEMAFOR and OPEN-SESAME. The helper should display: + +```shell +Usage: ${0##*/} [-h] -x XP_NUM -p {semafor,open-sesame} -s {dev,test} -f FN_DATA_DIR [-u] [-e] +Prepare misc. data for frame semantic parsing. + + -h, --help display this help and exit + -x, --xp XP_NUM xp number written as 3 digits (e.g. 
001) + -p, --parser {semafor,open-sesame} frame semantic parser to be used: 'semafor' or 'open-sesame' + -s, --splits {dev,test} which splits to score: dev or test + -f, --fn FN_DATA_DIR absolute path to FrameNet data directory + -u, --with_hierarchy if specified, will use the hierarchy feature + -e, --with_exemplars if specified, will use the exemplars +``` + +Suppose you generated FrameNet splits for SEMAFOR using: + +```bash +pyfn convert \ + --from fnxml \ + --to semafor \ + --source /path/to/fndata-1.7-with-dev \ + --target /path/to/experiments/xp_001/data \ + --splits train \ + --output_sentences +``` + +You can prepare SEMAFOR data using: + +```bash +./prepare.sh -x 001 -p semafor -s test -f /path/to/fndata-1.7-with-dev +``` + +### frameid.sh + +Use `frameid.sh` to perform frame identification using SIMPLEFRAMEID. +The helper should display: + +```shell +Usage: ${0##*/} [-h] -m {train,decode} -x XP_NUM [-p {semafor,open-sesame}] +Perform frame identification. + + -h, --help display this help and exit + -m, --mode train on all models or decode using a single model + -x, --xp XP_NUM xp number written as 3 digits (e.g. 001) + -p, --parser {semafor,open-sesame} formalize decoded frames for specified parser +``` + +Suppose you generated FrameNet splits for SEMAFOR using: + +```bash +pyfn convert \ + --from fnxml \ + --to semafor \ + --source /path/to/fndata-1.7-with-dev \ + --target /path/to/experiments/xp_101/data \ + --splits train \ + --output_sentences +``` + +*After preprocessing*, you can train the SIMPLEFRAMEID parser using: + +```bash +./frameid.sh -m train -x 101 +``` + +and decode (**before decoding argument identification**) using: + +```bash +./frameid.sh -m decode -x 101 -p semafor +``` + +### semafor.sh + +Use `semafor.sh` to train the SEMAFOR parser or decode the test/dev splits. +The helper should display: + +```shell +Usage: ${0##*/} [-h] -m {train,decode} -x XP_NUM [-s {dev,test}] [-u] +Train or decode with the SEMAFOR parser. 
+ + -h, --help display this help and exit + -m, --mode {train,decode} semafor mode to use: train or decode + -x, --xp XP_NUM xp number written as 3 digits (e.g. 001) + -s, --splits {dev,test} which splits to use in decode mode: dev or test + -u, --with_hierarchy if specified, parser will use the hierarchy feature +``` + +Suppose you generated FrameNet splits for SEMAFOR using: + +```bash +pyfn convert \ + --from fnxml \ + --to semafor \ + --source /path/to/fndata-1.7-with-dev \ + --target /path/to/experiments/xp_001/data \ + --splits train \ + --output_sentences +``` + +*After preprocessing and preparation*, you can train the SEMAFOR parser using: + +```bash +./semafor.sh -m train -x 001 +``` + +and decode the test splits using: + +``` +./semafor.sh -m decode -x 001 -s test +``` + +### open-sesame.sh + +Use `open-sesame.sh` to train the OPEN-SESAME parser or decode the test/dev splits. +The helper should display: + +```shell +Usage: ${0##*/} [-h] -m {train,decode} -x XP_NUM [-s {dev,test}] [-d] [-u] +Train or decode with the OPEN-SESAME parser. + + -h, --help display this help and exit + -m, --mode {train,decode} open-sesame mode to use: train or decode + -x, --xp XP_NUM xp number written as 3 digits (e.g. 
001) + -s, --splits {dev,test} which splits to use in decode mode: dev or test + -d, --with_dep_parses if specified, parser will use dependency parses + -u, --with_hierarchy if specified, parser will use the hierarchy feature +``` + +Suppose you generated FrameNet splits for OPEN-SESAME using: + +```bash +pyfn convert \ + --from fnxml \ + --to bios \ + --source /path/to/fndata-1.7-with-dev \ + --target /path/to/experiments/xp_002/data \ + --splits train \ + --output_sentences \ + --filter overlap_fes +``` + +*After preprocessing and preparation*, you can train the OPEN-SESAME parser using: + +```bash +./open-sesame.sh -m train -x 002 +``` + +and decode the test splits using: + +``` +./open-sesame.sh -m decode -x 002 -s test +``` + +### score.sh + +Use `score.sh` to obtain P/R/F1 scores for frame semantic parsing on +dev/test splits with the SEMEVAL scoring script, using gold or predicted frames. +The helper should display: + +```shell +Usage: ${0##*/} [-h] -x XP_NUM -p {semafor,open-sesame} -s {dev,test} -f {gold,predicted} +Score frame semantic parsing with a modified version of the SEMEVAL scoring script. + + -h, --help display this help and exit + -x, --xp XP_NUM xp number written as 3 digits (e.g. 001) + -p, --parser {semafor,open-sesame} frame semantic parser to be used: 'semafor' or 'open-sesame' + -s, --splits {dev,test} which splits to score: dev or test + -f, --frames {gold,predicted} score with gold or predicted frames +``` + +Note that scoring is done with an updated version of the SEMEVAL perl script, +in order to obtain more robust scores across setups. For a full account +of the modifications, refer to (Kabbach et al., 2018) and to the perl scripts +located under `lib/semeval/`. 
+ +To obtain scores for SEMAFOR using gold frames on test splits, use: + +``` +./score.sh -x XYZ -p semafor -s test -f gold +``` + +To obtain scores for SEMAFOR using predicted frames on test splits, use: + +``` +./score.sh -x XYZ -p semafor -s test -f predicted +``` + +## Replication + +The `experiments` directory provides a detailed set of instructions to +replicate all results reported in (Kabbach et al., 2018) on experimental +butterfly effects in frame semantic parsing. Those instructions can be used +to compare the performances of different frame semantic parsers in various +experimental setups. + + +## Marshalling and Unmarshalling of FrameNet XML data + +`pyfn` provides a set of Python models to process FrameNet XML data. +Those can be used to help you build your own frame semantic parser. + + +## Citation + +If you use pyfn please cite: +```tex +@InProceedings{C18-1267, + author = "Kabbach, Alexandre + and Ribeyre, Corentin + and Herbelot, Aur{\'e}lie", + title = "Butterfly Effects in Frame Semantic Parsing: impact of data processing on model ranking", + booktitle = "Proceedings of the 27th International Conference on Computational Linguistics", + year = "2018", + publisher = "Association for Computational Linguistics", + pages = "3158--3169", + location = "Santa Fe, New Mexico, USA", + url = "http://aclweb.org/anthology/C18-1267" +} +``` [release-image]:https://img.shields.io/github/release/akb89/pyfn.svg?style=flat-square diff --git a/REPLICATION.md b/REPLICATION.md deleted file mode 100644 index 6242d8c..0000000 --- a/REPLICATION.md +++ /dev/null @@ -1,172 +0,0 @@ -# Replication - -pyfn provides a set of models and utils to apply custom preprocessing -pipelines to FrameNet XML data and perform frame semantic parsing using -SEMAFOR, OPEN-SESAME or SIMPLEFRAMEID. 
- -Currently supported POS taggers include: -- MXPOST (Ratnaparkhi, 1996) -- NLP4J (Choi, 2016) - -Currently supported dependency parsers include: -- MST (McDonald et al., 2006) -- BIST BARCH (Kiperwasser and Goldberg, 2016) -- BIST BMST (Kiperwasser and Goldberg, 2016) - -Currently supported frame semantic parsers include: -- SIMPLEFRAMEID (Hartmann et al., 2017) for frame identification -- SEMAFOR (Kshirsagar et al., 2015) for argument identification -- OPEN-SESAME (Swayamdipta et al., 2017) for argument identification - -## Download -Download the following: -- [data.7z](https://github.com/akb89/pyfn/releases/download/v1.0.0/data.7z) containing all the FrameNet splits for FN 1.5 and FN 1.7 -- [lib.7z](https://github.com/akb89/pyfn/releases/download/v1.0.0/lib.7z) containing all the different external softwares (taggers, parsers, etc.) -- [resources.7z](https://github.com/akb89/pyfn/releases/download/v1.0.0/resources.7z) containing all the required resources -- [scripts.7z](https://github.com/akb89/pyfn/releases/download/v1.0.0/scripts.7z) containing the set of bash scripts to call the different parsers and preprocessing toolkits - -Extract the content of all the archives under a -directory named `pyfn`. Your pyfn folder structure should look like: -``` -. -|-- pyfn -| |-- data -| | |-- fndata-1.5-with-dev -| | |-- fndata-1.7-with-dev -| |-- lib -| | |-- bistparser -| | |-- jmx -| | |-- mstparser -| | |-- nlp4j -| | |-- open-sesame -| | |-- semafor -| | |-- semeval -| |-- resources -| | |-- bestarchybrid.model -| | |-- bestarchybrid.params -| | |-- bestfirstorder.model -| | |-- bestfirstorder.params -| | |-- config-decode-pos.xml -| | |-- nlp4j.plemma.model.all.xz -| | |-- sskip.100.vectors -| | |-- wsj.model -| |-- scripts -| | |-- CoNLLizer.py -| | |-- deparse.sh -| | |-- flatten.sh -| | |-- ... -``` - -**Please strictly follow this directory structure to avoid unexpected errors. 
`pyfn` relies on a lot of relative path resolutions to make scripts calls shorter, and changing this directory structure can brake everything** - -## Install -``` -pip3 install pyfn -``` - -## Setup - -### Using NLP4J for POS tagging -To use NLP4J for POS tagging, modify the `resources/config-decode-pos.xml` -file by replacing the models.pos absolute path to -your `resources/nlp4j.plemma.model.all.xz`: -```xml - - ... - - /absolute/path/to/pyfn/resources/nlp4j.plemma.model.all.xz - - -``` - -### Using BIST or OPEN-SESAME -If you intend to use the BIST for dependency parsing or -OPEN-SESAME for frame semantic parsing, you need -to install DyNET 2.0.2 following: -``` -https://dynet.readthedocs.io/en/2.0.2/python.html -``` - -### Using SEMAFOR -To use the SEMAFOR frame semantic parser, modify the `scripts/setup.sh` file: -```bash -# SEMAFOR options to be changed according to your env -export JAVA_HOME_BIN="/abs/path/to/java/jdk/bin" -export num_threads=2 # number of threads to use -export min_ram=4g # min RAM allocated to the JVM in GB. Corresponds to the -Xms argument -export max_ram=8g # max RAM allocated to the JVM in GB. Corresponds to the -Xmx argument - -# SEMAFOR hyperparameters -export kbest=1 # keep k-best parse -export lambda=0.000001 # hyperparameter for argument identification. Refer to Kshirsagar et al. (2015) for details. -export batch_size=4000 # number of batches processed at once for argument identification. 
-export save_every_k_batches=400 # for argument identification -export num_models_to_save=60 # for argument identification -``` - -### Using the SEMEVAL PERL evaluation scripts -If you intend to use the SEMEVAL perl evaluation scripts, make sure -to have the `App::cpanminus` and `XML::Parser` modules installed: -``` -cpan App::cpanminus -cpanm XML::Parser -``` - -## Replication -The `experiments` directory provides a detailed set of instructions to -replicate all results reported in (Kabbach et al., 2018) on experimental -butterfly effects in frame semantic parsing. Those instructions can be used -to compare the performances of different frame semantic parsers in various -experimental setups. - -## Scripts -Each script comes with a helper: check it out with `--help`! - -We have made some opinionated choices on how to use the preprocessing and -frame semantic parsing bash scripts. Those choices are primarly motivated -by constraints on running *many* experiences at once, in various experimental -setups, and on having commands that are easy to *type*. - -The main choice lies in the directory structure. Each script in the `scripts` -directory expects an XP_DIR argument that specifies the experiments ID. -When you specify: - -``` -./prepare.sh -x 042 ... -``` - -or - -``` -./preprocess.sh -x 042 -``` - -The scripts expects the data to process to be located under `.../experiments/xp_042/data` where the `experiments` dir is at the same level -as the `scripts` dir. 
- -### prepare.sh - -### preprocess.sh - -### semafor.sh - -### open-sesame.sh - - - -## Citation -If you use pyfn please cite: -```tex -@InProceedings{C18-1267, - author = "Kabbach, Alexandre - and Ribeyre, Corentin - and Herbelot, Aur{\'e}lie", - title = "Butterfly Effects in Frame Semantic Parsing: impact of data processing on model ranking", - booktitle = "Proceedings of the 27th International Conference on Computational Linguistics", - year = "2018", - publisher = "Association for Computational Linguistics", - pages = "3158--3169", - location = "Santa Fe, New Mexico, USA", - url = "http://aclweb.org/anthology/C18-1267" -} -``` From f35b694631caf9e53cfd0f88f5a4945b69506dca Mon Sep 17 00:00:00 2001 From: akb89 Date: Thu, 30 Aug 2018 15:57:24 -0400 Subject: [PATCH 07/15] Updated description and version --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 332d8f2..bd69abd 100644 --- a/setup.py +++ b/setup.py @@ -11,12 +11,12 @@ setup( name='pyfn', - description='A python module to process FrameNet XML data', + description='A python module to process data for Frame Semantic Parsing', author='Alexandre Kabbach', author_email='akb@3azouz.net', long_description=long_description, long_description_content_type='text/markdown', - version='1.1.2', + version='1.2.0', url='https://gitlab.com/akb89/pyfn', download_url='https://pypi.org/project/pyfn/#files', license='MIT', From 7564e2791b0d40801a13eb9a902222d45b0f79fe Mon Sep 17 00:00:00 2001 From: akb89 Date: Thu, 30 Aug 2018 15:57:46 -0400 Subject: [PATCH 08/15] Added exception for invalid parameters --- pyfn/marshalling/unmarshallers/framenet.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pyfn/marshalling/unmarshallers/framenet.py b/pyfn/marshalling/unmarshallers/framenet.py index 3f1c612..fe0b25a 100644 --- a/pyfn/marshalling/unmarshallers/framenet.py +++ b/pyfn/marshalling/unmarshallers/framenet.py @@ -10,6 +10,7 @@ import 
pyfn.utils.filter as f_utils import pyfn.utils.xml as xml_utils +from pyfn.exceptions.parameter import InvalidParameterError from pyfn.exceptions.xml import XMLProcessingError from pyfn.models.annotationset import AnnotationSet @@ -310,7 +311,7 @@ def _extract_ft_annosets(ft_filepaths, fe_dict, flatten=False): def extract_annosets(splits_dirpath, with_fulltexts, with_exemplars, - fe_dict, flatten=False): + fe_dict={}, flatten=False): """Return a list of pyfn.AnnotationSet extracted from splits paths. The splits directory should contain two subdirectories name 'fulltext' @@ -387,6 +388,10 @@ def _get_fe_dict(frame_xml_filepaths): def _get_annosets_dict_from_fn_xml(fn_splits_dirpath, splits, with_exemplars): + if splits not in ('train', 'dev', 'test'): + raise InvalidParameterError( + 'Invalid splits name `{}`. Should be `train`, `dev` or `test`' + .format(splits)) fe_dict = _get_fe_dict(xml_utils.get_xml_filepaths(fn_splits_dirpath, 'frame')) if splits == 'test': From 44edfdf9b07ce322d1d710fee6a07bec46aa5255 Mon Sep 17 00:00:00 2001 From: akb89 Date: Thu, 30 Aug 2018 15:58:02 -0400 Subject: [PATCH 09/15] Refactored function name for consistency across package --- pyfn/marshalling/unmarshallers/semeval.py | 4 ++-- tests/test_unmarshallers_semeval.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pyfn/marshalling/unmarshallers/semeval.py b/pyfn/marshalling/unmarshallers/semeval.py index 2056c65..d762cc6 100644 --- a/pyfn/marshalling/unmarshallers/semeval.py +++ b/pyfn/marshalling/unmarshallers/semeval.py @@ -5,12 +5,12 @@ import pyfn.marshalling.unmarshallers.framenet as fn_unmarshaller -__all__ = ['unmarshall_semeval07_xml'] +__all__ = ['unmarshall_annosets'] logger = logging.getLogger(__name__) -def unmarshall_semeval07_xml(xml_filepath, fe_dict, flatten=False): +def unmarshall_annosets(xml_filepath, fe_dict={}, flatten=False): """Unmarshall a SemEval 2007 FrameNet XML file from file path. 
Return a generator of AnnotationSet instances extracted from the diff --git a/tests/test_unmarshallers_semeval.py b/tests/test_unmarshallers_semeval.py index 7a7062c..90833ae 100644 --- a/tests/test_unmarshallers_semeval.py +++ b/tests/test_unmarshallers_semeval.py @@ -7,7 +7,7 @@ SEMEVAL_XML_FILE = os.path.join(os.path.dirname(__file__), 'resources', 'semeval.xml') -semeval_annosets_list = list(semeval_unmarshaller.unmarshall_semeval07_xml(SEMEVAL_XML_FILE, {})) +semeval_annosets_list = list(semeval_unmarshaller.unmarshall_annosets(SEMEVAL_XML_FILE, {})) def test_semeval_annoset(): From f2d00aba5dc6174dcf832f2b735c86635a73fa30 Mon Sep 17 00:00:00 2001 From: akb89 Date: Thu, 30 Aug 2018 15:58:16 -0400 Subject: [PATCH 10/15] Updated documentation --- README.md | 191 ++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 171 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index acbf04d..8736718 100644 --- a/README.md +++ b/README.md @@ -7,15 +7,15 @@ [![FrameNet][framenet-image]][framenet-url] [![MIT License][license-image]][license-url] -Welcome to **pyfn**, a Python modules to process FrameNet annotation. +Welcome to `pyfn`, a Python module to process FrameNet annotation. -pyfn can be used to: +`pyfn` can be used to: -1. [convert]() data to and from FRAMENET XML, SEMEVAL XML, SEMAFOR CoNLL, BIOS and +1. [convert](#conversion) data to and from FRAMENET XML, SEMEVAL XML, SEMAFOR CoNLL, BIOS and CoNLL-X -2. [preprocess]() FrameNet data using a standardized state-of-the-art pipeline -3. [run]() the SEMAFOR and OPEN-SESAME frame semantic parsers -4. [build]() your own frame semantic parser using a standard set of python models +2. [preprocess](#preprocessing-and-frame-semantic-parsing) FrameNet data using a standardized state-of-the-art pipeline +3. [run](#preprocessing-and-frame-semantic-parsing) the SEMAFOR and OPEN-SESAME frame semantic parsers +4.
[build](#marshalling-and-unmarshalling-framenet-xml-data) your own frame semantic parser using a standard set of python models to marshall/unmarshall FrameNet XML data This repository also accompanies the (Kabbach et al., 2018) paper: @@ -47,10 +47,10 @@ pip3 install pyfn ``` ## Use -When using pyfn, your FrameNet splits directory structure should follow: +When using `pyfn`, your FrameNet splits directory structure should follow: ``` . -|-- fndata-1.x +|-- fndata-1.x-with-dev | |-- train | | |-- fulltext | | |-- lu @@ -60,11 +60,14 @@ When using pyfn, your FrameNet splits directory structure should follow: | |-- test | | |-- fulltext | | |-- lu +| |-- frame +| |-- frRelation.xml +| |-- semTypes.xml ``` ## Conversion -pyfn can be used to convert data to and from: +`pyfn` can be used to convert data to and from: - FRAMENET XML: the format of the released FrameNet XML data - SEMEVAL XML: the format of the SEMEVAL 2007 shared task 19 on frame semantic structure extraction - SEMAFOR CoNLL: the format used by the SEMAFOR parser @@ -98,7 +101,7 @@ For details on `pyfn` usage, do: ```bash pyfn --help pyfn generate --help -convert --help +pyfn convert --help ``` ### From FN XML to BIOS @@ -180,7 +183,7 @@ To also process exemplars, add the `--with_exemplars` option ## Preprocessing and Frame Semantic Parsing -pyfn ships in with a set of bash scripts to preprocess FrameNet data with +`pyfn` ships in with a set of bash scripts to preprocess FrameNet data with various POS taggers and dependency parsers, as well as to perform frame semantic parsing with a variety of open-source parsers. @@ -199,7 +202,7 @@ Currently supported frame semantic parsers include: - OPEN-SESAME (Swayamdipta et al., 2017) for argument identification To request support for a POS tagger, a dependency parser or a frame semantic -parser, please create an [issue](https://github.com/akb89/pyfn/issues). +parser, please create an [issue](https://github.com/akb89/pyfn/issues) on Github/Gitlab. 
### Download To run the preprocessing and frame semantic parsing scripts, first download: @@ -322,7 +325,7 @@ pyfn convert \ Use `preprocess.sh` to POS-tag and dependency-parse FrameNet splits generated with `pyfn convert ...`. The helper should display: -```shell +``` Usage: ${0##*/} [-h] -x XP_NUM -t {mxpost,nlp4j} -p {semafor,open-sesame} [-d {mst,bmst,barch}] [-v] Preprocess FrameNet train/dev/test splits. @@ -360,7 +363,7 @@ the `framenet.frame.element.map` and the hierarchy `.csv` files used by SEMAFOR, or the `frames.xml` and `frRelations.xml` files used by both SEMAFOR and OPEN-SESAME. The helper should display: -```shell +``` Usage: ${0##*/} [-h] -x XP_NUM -p {semafor,open-sesame} -s {dev,test} -f FN_DATA_DIR [-u] [-e] Prepare misc. data for frame semantic parsing. @@ -396,7 +399,7 @@ You can prepare SEMAFOR data using: Use `frameid.sh` to perform frame identification using SIMPLEFRAMEID. The helper should display: -```shell +``` Usage: ${0##*/} [-h] -m {train,decode} -x XP_NUM [-p {semafor,open-sesame}] Perform frame identification. @@ -435,7 +438,7 @@ and decode (**before decoding argument identification**) using: Use `semafor.sh` to train the SEMAFOR parser or decode the test/dev splits. The helper should display: -```shell +``` Usage: ${0##*/} [-h] -m {train,decode} -x XP_NUM [-s {dev,test}] [-u] Train or decode with the SEMAFOR parser. @@ -475,7 +478,7 @@ and decode the test splits using: Use `open-sesame.sh` to train the OPEN-SESMAE parser or decode the test/dev splits. The helper should display: -```shell +``` Usage: ${0##*/} [-h] -m {train,decode} -x XP_NUM [-s {dev,test}] [-d] [-u] Train or decode with the OPEN-SESAME parser. @@ -518,7 +521,7 @@ Use `score.sh` to obtain P/R/F1 scores for frame semantic parsing on dev/test splits with the SEMEVAL scoring script, using gold of predicted frames. 
The helper should display: -```shell +``` Usage: ${0##*/} [-h] -x XP_NUM -p {semafor,open-sesame} -s {dev,test} -f {gold,predicted} Score frame semantic parsing with a modified version of the SEMEVAL scoring script. @@ -555,15 +558,163 @@ to compare the performances of different frame semantic parsers in various experimental setups. -## Marshalling and Unmarshalling of FrameNet XML data +## Marshalling and Unmarshalling FrameNet XML data `pyfn` provides a set of Python models to process FrameNet XML data. Those can be used to help you build you own frame semantic parser. +The core of the `pyfn` models is the `AnnotationSet` corresponding to an +XML `` tag. It stores various information +regarding a given set of FrameNet annotation for a given target in a given sentence. +The notable innovations are the `labelstore` and the `valenceunitstore`, which +store FrameNet labels (FE/PT/GF) in their original formats, and in custom +formats which may prove useful for frame semantic parsing. + +Explore the various models under the `pyfn.models` directory of the `pyfn` +package. + +### Unmarshalling FrameNet XML data + +To convert a list of fulltext.xml files and/or lu.xml files to a generator +over `pyfn.AnnotationSet` objects, with no overlap between train/dev/test splits, use: + +```python +import pyfn.marshalling.unmarshallers.framenet as fn_unmarshaller + +if __name__ == '__main__': + splits_dirpath = '/abs/path/to/framenet-1.x-with-dev/' + splits = 'train' + with_exemplars = False + annosets_dict = fn_unmarshaller.get_annosets_dict(splits_dirpath, + splits, with_exemplars) +``` +`splits_dirpath` should point at the directory containing train/dev/test +splits directories (see detailed structure [above](#use)). + +`get_annosets_dict` will return a string to AnnotationSet generator dict. +It will ensure no overlap between train/dev/test splits. + +Calling `get_annosets_dict` on `splits='test'` will return a dictionary +with a single `'test'` key. 
Calling `get_annosets_dict` on `splits='dev'` +will return a dictionary with two keys: `'dev'` and `'test'`. +Calling `get_annosets_dict` on `splits='train'` will return a dictionary +with three keys: `'train'`, `'dev'` and `'test'`. + +To iterate over the list of AnnotationSet objects of each key, you can +then do: + +```python +for (splits, annosets) in annosets_dict.items(): + print('Iterating over annotationsets for splits: {}'.format(splits)) + for annoset in annosets: + print('annoset with #id = {}'.format(annoset._id)) +``` + +Or simply, to iterate over the values of a specific key (such as train annosets): + +```python +for annoset in annosets_dict['train']: + print('annoset with #id = {}'.format(annoset._id)) +``` + +Note that for performance, annosets is not a list but a generator. + + +### Unmarshalling OPEN-SESAME BIOS data + +To convert a `.bios` file with its corresponding `.sentences` file to +a generator over `pyfn.AnnotationSet` objects, use: + +```python +import pyfn.marshalling.unmarshallers.bios as bios_unmarshaller + +if __name__ == '__main__': + bios_filepath = '/abs/path/to/.bios' + sent_filepath = '/abs/path/to/.sentences' + annosets = bios_unmarshaller.unmarshall_annosets(bios_filepath, + sent_filepath) + for annoset in annosets: + print('annoset with #id = {}'.format(annoset._id)) +``` + +**Important:** the `.bios` and `.sentences` files must have been generated +with `pyfn convert ... --to bios ...` with the `--filter overlap_fes` +parameter.
+ +### Unmarshalling SEMAFOR CONLL data + +To convert a `.frame.elements` file with its corresponding `.sentences` +file to a generator over `pyfn.AnnotationSet` objects, use: + +```python +import pyfn.marshalling.unmarshallers.semafor as semafor_unmarshaller + +if __name__ == '__main__': + semafor_filepath = '/abs/path/to/.frame.elements' + sent_filepath = '/abs/path/to/.sentences' + annosets = semafor_unmarshaller.unmarshall_annosets(semafor_filepath, + sent_filepath) + for annoset in annosets: + print('annoset with #id = {}'.format(annoset._id)) +``` + +### Unmarshalling SEMEVAL XML data + +To convert a SEMEVAL `.xml` file with its corresponding `.sentences` +file to a generator over `pyfn.AnnotationSet` objects, use: + +```python +import pyfn.marshalling.unmarshallers.semeval as semeval_unmarshaller + +if __name__ == '__main__': + xml_filepath = '/abs/path/to/semeval/.xml' + annosetss = semeval_unmarshaller.unmarshall_annosets(xml_filepath) +``` + +By default `unmarshall_annosets` for SEMEVAL will return a generator over embedded annotationsets. 
To iterate over a single annotationset, use: + +```python +for annosets in annosetss: + for annoset in annosets: + print('annoset with #id = {}'.format(annoset._id)) +``` + +To return a 'flat' list of annosets, pass in the `flatten=True` parameter: + +```python +import pyfn.marshalling.unmarshallers.semeval as semeval_unmarshaller + +if __name__ == '__main__': + xml_filepath = '/abs/path/to/semeval/.xml' + annosets = semeval_unmarshaller.unmarshall_annosets(xml_filepath, flatten=True) + for annoset in annosets: + print('annoset with #id = {}'.format(annoset._id)) +``` + +### Marshalling to FrameNet XML data + +To convert a list of `pyfn.AnnotationSet` objects to a FrameNet-style `.xml` file, use: + +```python + +``` + + +### Marshalling to OPEN-SESAME BIOS data + +To convert a list of `pyfn.AnnotationSet` objects to OPEN-SESAME-style `.bios`, use: + +### Marshalling to SEMAFOR CONLL data + +To convert a list of `pyfn.AnnotationSet` objects to SEMAFOR-style `.frame.elements`, use: + +### Marshalling to SEMEVAL XML data + +To convert a list of `pyfn.AnnotationSet` objects to SEMEVAL-style `.xml`, use: ## Citation -If you use pyfn please cite: +If you use `pyfn` please cite: ```tex @InProceedings{C18-1267, author = "Kabbach, Alexandre From becd65bc8049cca5a1b20cac983598500a8ffb7d Mon Sep 17 00:00:00 2001 From: akb89 Date: Thu, 30 Aug 2018 16:02:18 -0400 Subject: [PATCH 11/15] Attempt to fix anchors --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 8736718..79c705e 100644 --- a/README.md +++ b/README.md @@ -11,11 +11,11 @@ Welcome to `pyfn`, a Python modules to process FrameNet annotation. `pyfn` can be used to: -1. [convert](#conversion) data to and from FRAMENET XML, SEMEVAL XML, SEMAFOR CoNLL, BIOS and +1. [convert](##conversion) data to and from FRAMENET XML, SEMEVAL XML, SEMAFOR CoNLL, BIOS and CoNLL-X -2. 
[preprocess](#preprocessing-and-frame-semantic-parsing) FrameNet data using a standardized state-of-the-art pipeline -3. [run](#preprocessing-and-frame-semantic-parsing) the SEMAFOR and OPEN-SESAME frame semantic parsers -4. [build](#marshalling-and-unmarshalling-framenet-xml-data) your own frame semantic parser using a standard set of python models +2. [preprocess](##preprocessing-and-frame-semantic-parsing) FrameNet data using a standardized state-of-the-art pipeline +3. [run](##preprocessing-and-frame-semantic-parsing) the SEMAFOR and OPEN-SESAME frame semantic parsers +4. [build](##marshalling-and-unmarshalling-framenet-xml-data) your own frame semantic parser using a standard set of python models to marshall/unmarshall FrameNet XML data This repository also accompanies the (Kabbach et al., 2018) paper: From cee032a2214ebd34e98e8bb989588d28316c6dd9 Mon Sep 17 00:00:00 2001 From: akb89 Date: Thu, 30 Aug 2018 16:03:20 -0400 Subject: [PATCH 12/15] backtracking --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 79c705e..8736718 100644 --- a/README.md +++ b/README.md @@ -11,11 +11,11 @@ Welcome to `pyfn`, a Python modules to process FrameNet annotation. `pyfn` can be used to: -1. [convert](##conversion) data to and from FRAMENET XML, SEMEVAL XML, SEMAFOR CoNLL, BIOS and +1. [convert](#conversion) data to and from FRAMENET XML, SEMEVAL XML, SEMAFOR CoNLL, BIOS and CoNLL-X -2. [preprocess](##preprocessing-and-frame-semantic-parsing) FrameNet data using a standardized state-of-the-art pipeline -3. [run](##preprocessing-and-frame-semantic-parsing) the SEMAFOR and OPEN-SESAME frame semantic parsers -4. [build](##marshalling-and-unmarshalling-framenet-xml-data) your own frame semantic parser using a standard set of python models +2. [preprocess](#preprocessing-and-frame-semantic-parsing) FrameNet data using a standardized state-of-the-art pipeline +3. 
[run](#preprocessing-and-frame-semantic-parsing) the SEMAFOR and OPEN-SESAME frame semantic parsers +4. [build](#marshalling-and-unmarshalling-framenet-xml-data) your own frame semantic parser using a standard set of python models to marshall/unmarshall FrameNet XML data This repository also accompanies the (Kabbach et al., 2018) paper: From e59fa65c81a667dba75728bb72e4b74e66ca60c0 Mon Sep 17 00:00:00 2001 From: akb89 Date: Thu, 30 Aug 2018 16:41:01 -0400 Subject: [PATCH 13/15] Updated docstrings and added support for marshalling doc --- README.md | 25 +++++++++++----------- pyfn/marshalling/marshallers/bios.py | 17 ++++++++++++++- pyfn/marshalling/marshallers/semafor.py | 14 ++++++++++++ pyfn/marshalling/marshallers/semeval.py | 9 +++++++- pyfn/marshalling/unmarshallers/framenet.py | 4 +++- pyfn/marshalling/unmarshallers/semeval.py | 4 +++- 6 files changed, 56 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 8736718..be3feb0 100644 --- a/README.md +++ b/README.md @@ -691,26 +691,25 @@ if __name__ == '__main__': print('annoset with #id = {}'.format(annoset._id)) ``` -### Marshalling to FrameNet XML data +### Marshalling to OPEN-SESAME BIOS -To convert a list of `pyfn.AnnotationSet` objects to a FrameNet-style `.xml` file, use: +To convert a dict of `splits` to `pyfn.AnnotationSet` objects to OPEN-SESAME-style `.bios`, refer to +`pyfn.marshalling.marshallers.bios.marshall_annosets_dict` -```python - -``` - - -### Marshalling to OPEN-SESAME BIOS data +### Marshalling to SEMAFOR CONLL -To convert a list of `pyfn.AnnotationSet` objects to OPEN-SESAME-style `.bios`, use: +To convert a dict of `splits` to `pyfn.AnnotationSet` objects to SEMAFOR-style `.frame.elements`, refer to +`pyfn.marshalling.marshallers.semafor.marshall_annosets_dict` -### Marshalling to SEMAFOR CONLL data +### Marshalling to SEMEVAL XML -To convert a list of `pyfn.AnnotationSet` objects to SEMAFOR-style `.frame.elements`, use: +To convert a list of `pyfn.AnnotationSet` 
objects to SEMEVAL-style `.xml`, +refer to `pyfn.marshalling.marshallers.semeval.marshall_annosets` -### Marshalling to SEMEVAL XML data +### Marshalling to .csv hierarchy -To convert a list of `pyfn.AnnotationSet` objects to SEMEVAL-style `.xml`, use: +To convert a list of relations to a `.csv` file, refer to +`pyfn.marshalling.marshallers.hierarchy.marshall_relations` ## Citation diff --git a/pyfn/marshalling/marshallers/bios.py b/pyfn/marshalling/marshallers/bios.py index d5262d4..208f93a 100644 --- a/pyfn/marshalling/marshallers/bios.py +++ b/pyfn/marshalling/marshallers/bios.py @@ -154,7 +154,22 @@ def _marshall_bios(annosets, filtering_options, sent_dict, bios_filepath, def marshall_annosets_dict(annosets_dict, target_dirpath, filtering_options, output_sentences, excluded_frames, excluded_sentences, excluded_annosets): - """Convert a dict of {splits:pyfn.AnnotationSet} to BIOS splits files.""" + """Convert a dict of {splits:pyfn.AnnotationSet} to BIOS splits files. + + Args + ---- + annosets_dict: a splits to annosets dictionary (as generated by + the framenet unmarshaller). + target_dirpath: the absolute path to the target directory where to + save the output file(s) + filtering_options: a list of options to pass to the pyfn.utils.filter. + ('overlap_fes', 'disc_fes', 'disc_targets', 'no_fes', 'non_breaking_spaces') + output_sentences: True or False. Whether or not to also output a .sentences file + listing all sentences (string), one per line. 
+ excluded_frames: a list of frame #id to exclude from the output + excluded_sentences: a list of sentence #id to exclude from the output + excluded_annosets: a list of annotationset #id to exclude from the output + """ for splits_name, annosets in annosets_dict.items(): bios_filepath = files_utils.get_bios_filepath(target_dirpath, splits_name) diff --git a/pyfn/marshalling/marshallers/semafor.py b/pyfn/marshalling/marshallers/semafor.py index 8acc78c..0777b88 100644 --- a/pyfn/marshalling/marshallers/semafor.py +++ b/pyfn/marshalling/marshallers/semafor.py @@ -148,6 +148,20 @@ def marshall_annosets_dict(annosets_dict, target_dirpath, filtering_options, both frame and frame element labels depending on filtering options. The dev/test splits will be converted to a .frames file containing frame labels only. + + Args + ---- + annosets_dict: a splits to annosets dictionary (as generated by + the framenet unmarshaller). + target_dirpath: the absolute path to the target directory where to + save the output file(s) + filtering_options: a list of options to pass to the pyfn.utils.filter. + ('overlap_fes', 'disc_fes', 'disc_targets', 'no_fes', 'non_breaking_spaces') + output_sentences: True or False. Whether or not to also output a .sentences file + listing all sentences (string), one per line. 
+ excluded_frames: a list of frame #id to exclude from the output + excluded_sentences: a list of sentence #id to exclude from the output + excluded_annosets: a list of annotationset #id to exclude from the output """ for splits_name, annosets in annosets_dict.items(): logger.info('Marshalling {} splits to semafor format' diff --git a/pyfn/marshalling/marshallers/semeval.py b/pyfn/marshalling/marshallers/semeval.py index 5a64699..e8204ad 100644 --- a/pyfn/marshalling/marshallers/semeval.py +++ b/pyfn/marshalling/marshallers/semeval.py @@ -112,7 +112,14 @@ def _marshall_annosets(annosets, output_filepath, excluded_frames, def marshall_annosets(annosets, output_filepath, excluded_frames, excluded_sentences, excluded_annosets): - """Marshall a list of pyfn.AnnotationSet objects to SEMEVAL XML.""" + """Marshall a list of pyfn.AnnotationSet objects to SEMEVAL XML. + + annosets: a list of annosets to marshall. + output_filepath: the absolute path to the output .xml file + excluded_frames: a list of frame #id to exclude from the output + excluded_sentences: a list of sentence #id to exclude from the output + excluded_annosets: a list of annotationset #id to exclude from the output + """ logger.info('Marshalling pyfn.AnnotationSet objects to SEMEVAL XML...') if not annosets: raise InvalidParameterError('Input pyfn.AnnotationSet list is empty') diff --git a/pyfn/marshalling/unmarshallers/framenet.py b/pyfn/marshalling/unmarshallers/framenet.py index fe0b25a..3abf952 100644 --- a/pyfn/marshalling/unmarshallers/framenet.py +++ b/pyfn/marshalling/unmarshallers/framenet.py @@ -311,7 +311,7 @@ def _extract_ft_annosets(ft_filepaths, fe_dict, flatten=False): def extract_annosets(splits_dirpath, with_fulltexts, with_exemplars, - fe_dict={}, flatten=False): + fe_dict=None, flatten=False): """Return a list of pyfn.AnnotationSet extracted from splits paths. 
The splits directory should contain two subdirectories name 'fulltext' @@ -322,6 +322,8 @@ def extract_annosets(splits_dirpath, with_fulltexts, with_exemplars, """ logger.info('Extracting pyfn.AnnotationSet items from {}' .format(splits_dirpath)) + if fe_dict is None: + fe_dict = {} ft_annosets = [] ex_annosets = [] if with_fulltexts: diff --git a/pyfn/marshalling/unmarshallers/semeval.py b/pyfn/marshalling/unmarshallers/semeval.py index d762cc6..c77602b 100644 --- a/pyfn/marshalling/unmarshallers/semeval.py +++ b/pyfn/marshalling/unmarshallers/semeval.py @@ -10,7 +10,7 @@ logger = logging.getLogger(__name__) -def unmarshall_annosets(xml_filepath, fe_dict={}, flatten=False): +def unmarshall_annosets(xml_filepath, fe_dict=None, flatten=False): """Unmarshall a SemEval 2007 FrameNet XML file from file path. Return a generator of AnnotationSet instances extracted from the @@ -24,6 +24,8 @@ def unmarshall_annosets(xml_filepath, fe_dict={}, flatten=False): """ logger.info('Unmarshalling SemEval FrameNet XML file: {}' .format(xml_filepath)) + if fe_dict is None: + fe_dict = {} # pylint: disable=R1702 for documents_tag in etree.parse(xml_filepath).getroot().findall( 'documents'): From ec90c45e86b1161a4e77efec5a6b91eec5b3494e Mon Sep 17 00:00:00 2001 From: akb89 Date: Thu, 30 Aug 2018 16:42:46 -0400 Subject: [PATCH 14/15] Updated README --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index be3feb0..7793c27 100644 --- a/README.md +++ b/README.md @@ -469,7 +469,7 @@ pyfn convert \ and decode the test splits using: -``` +```bash ./semafor.sh -m decode -x 001 -s test ``` @@ -511,7 +511,7 @@ pyfn convert \ and decode the test splits using: -``` +```bash ./open-sesame.sh -m decode -x 002 -s test ``` @@ -539,13 +539,13 @@ located under `lib/semeval/`. 
To obtain scores for SEMAFOR using gold frames on test splits, use: -``` +```bash ./score.sh -x XYZ -p semafor -s test -f gold ``` To obtain scores for SEMAFOR using predicted frames on test splits, use: -``` +```bash ./score.sh -x XYZ -p semafor -s test -f predicted ``` From c20c8ececcf9da5388f6ff210c7f9d5bb8955fea Mon Sep 17 00:00:00 2001 From: akb89 Date: Thu, 30 Aug 2018 16:54:20 -0400 Subject: [PATCH 15/15] Fixed docstring indentation --- pyfn/marshalling/marshallers/bios.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/pyfn/marshalling/marshallers/bios.py b/pyfn/marshalling/marshallers/bios.py index 208f93a..f83c70b 100644 --- a/pyfn/marshalling/marshallers/bios.py +++ b/pyfn/marshalling/marshallers/bios.py @@ -156,19 +156,19 @@ def marshall_annosets_dict(annosets_dict, target_dirpath, filtering_options, excluded_sentences, excluded_annosets): """Convert a dict of {splits:pyfn.AnnotationSet} to BIOS splits files. - Args - ---- - annosets_dict: a splits to annosets dictionary (as generated by - the framenet unmarshaller). - target_dirpath: the absolute path to the target directory where to - save the output file(s) - filtering_options: a list of options to pass to the pyfn.utils.filter. - ('overlap_fes', 'disc_fes', 'disc_targets', 'no_fes', 'non_breaking_spaces') - output_sentences: True or False. Whether or not to also output a .sentences file - listing all sentences (string), one per line. - excluded_frames: a list of frame #id to exclude from the output - excluded_sentences: a list of sentence #id to exclude from the output - excluded_annosets: a list of annotationset #id to exclude from the output + Args + ---- + annosets_dict: a splits to annosets dictionary (as generated by + the framenet unmarshaller). + target_dirpath: the absolute path to the target directory where to + save the output file(s) + filtering_options: a list of options to pass to the pyfn.utils.filter. 
+ ('overlap_fes', 'disc_fes', 'disc_targets', 'no_fes', 'non_breaking_spaces') + output_sentences: True or False. Whether or not to also output a .sentences file + listing all sentences (string), one per line. + excluded_frames: a list of frame #id to exclude from the output + excluded_sentences: a list of sentence #id to exclude from the output + excluded_annosets: a list of annotationset #id to exclude from the output """ for splits_name, annosets in annosets_dict.items(): bios_filepath = files_utils.get_bios_filepath(target_dirpath,