From d3a89c7bbcbc8cbf19b82c8b18483371b0830306 Mon Sep 17 00:00:00 2001
From: akb89 <akb@3azouz.net>
Date: Sun, 26 Aug 2018 11:49:39 -0400
Subject: [PATCH 01/15] Preparing v1.1.2: fixed gitlab URL and updated README

---
 README.md      |  7 +++++++
 REPLICATION.md | 37 +++++++++++++++++++++++++++++++++++++
 setup.py       |  4 ++--
 3 files changed, 46 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 824fe18..6f2f248 100644
--- a/README.md
+++ b/README.md
@@ -168,6 +168,13 @@ pyfn generate \
 ```
 To also process exemplars, add the `--with_exemplars` option
 
+### Using preprocessing and frame semantic parsing scripts
+We created a set of bash scripts to preprocess FrameNet data with various
+POS taggers and dependency parsers as well as to run the `SIMPLEFRAMEID`,
+`SEMAFOR` and `OPEN-SESAME` frame semantic parsers.
+Check out [REPLICATION.md](REPLICATION.md) for a detailed HowTo manual. 
+
+
 [release-image]:https://img.shields.io/github/release/akb89/pyfn.svg?style=flat-square
 [release-url]:https://github.com/akb89/pyfn/releases/latest
 [pypi-image]:https://img.shields.io/pypi/v/pyfn.svg?style=flat-square
diff --git a/REPLICATION.md b/REPLICATION.md
index fe8ffe5..6242d8c 100644
--- a/REPLICATION.md
+++ b/REPLICATION.md
@@ -57,6 +57,8 @@ directory named `pyfn`. Your pyfn folder structure should look like:
 |   |   |-- ...
 ```
 
+**Please strictly follow this directory structure to avoid unexpected errors. `pyfn` relies on a lot of relative path resolutions to make scripts calls shorter, and changing this directory structure can brake everything**
+
 ## Install
 ```
 pip3 install pyfn
@@ -117,6 +119,41 @@ butterfly effects in frame semantic parsing. Those instructions can be used
 to compare the performances of different frame semantic parsers in various
 experimental setups.
 
+## Scripts
+Each script comes with a helper: check it out with `--help`!
+
+We have made some opinionated choices on how to use the preprocessing and
+frame semantic parsing bash scripts. Those choices are primarly motivated
+by constraints on running *many* experiences at once, in various experimental
+setups, and on having commands that are easy to *type*.
+
+The main choice lies in the directory structure. Each script in the `scripts`
+directory expects an XP_DIR argument that specifies the experiments ID.
+When you specify:
+
+```
+./prepare.sh -x 042 ...
+```
+
+or
+
+```
+./preprocess.sh -x 042
+```
+
+The scripts expects the data to process to be located under `.../experiments/xp_042/data` where the `experiments` dir is at the same level
+as the `scripts` dir.
+
+### prepare.sh
+
+### preprocess.sh
+
+### semafor.sh
+
+### open-sesame.sh
+
+
+
 ## Citation
 If you use pyfn please cite:
 ```tex
diff --git a/setup.py b/setup.py
index 83d8e34..db7367e 100644
--- a/setup.py
+++ b/setup.py
@@ -16,8 +16,8 @@
     author_email='akb@3azouz.net',
     long_description=long_description,
     long_description_content_type='text/markdown',
-    version='1.1.0',
-    url='https://gitlab.unige.ch/akb/pyfn',
+    version='1.1.2',
+    url='https://gitlab.com/akb89/pyfn',
     download_url='https://pypi.org/project/pyfn/#files',
     license='MIT',
     keywords=['framenet', 'xml', 'marshalling', 'unmarshalling'],

From 26d3b40fbeba4fd5ad0828b2a741ff5a75cdf959 Mon Sep 17 00:00:00 2001
From: akb89 <akb@3azouz.net>
Date: Sun, 26 Aug 2018 18:53:38 -0400
Subject: [PATCH 02/15] Fixed #5, updated README for clarity following #4 and
 updated dependencies

---
 README.md           | 7 ++-----
 pyfn/main.py        | 3 ++-
 pyfn/utils/files.py | 2 +-
 setup.py            | 2 +-
 4 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index 6f2f248..2e8dc65 100644
--- a/README.md
+++ b/README.md
@@ -74,10 +74,7 @@ For an exhaustive description of all formats, check out [FORMAT.md](FORMAT.md).
 ## Conversion HowTo
 The following sections provide examples of commands to convert FN data
 to and from different formats. All commands can make use of the following options:
-1. `--splits`: specify which splits should be converted. Use `--splits dev`
-to only process dev and test splits and guarantee no overlap between
-dev and test. Use `--splits train` to process train dev and test splits and
-guarantee no overlap across splits. Default to `--splits test`.
+1. `--splits`: specify which splits should be converted. `--splits train` will generate all train/dev/test splits, according to data found under the fndata-1.x/{train/dev/test} directories. `--splits dev` will generate the dev and test splits according to data found under the fndata-1.x/{dev/test} directories. This option will skip the train splits but generate the same dev/test splits that would have been generated with `--splits train`. `--splits test` will generate the test splits according to data found under the fndata-1.x/test directory, and skip the train/dev splits. The test splits generated with `--splits test` will be the same as those generated with the `--splits train` and `--splits dev`. Default to `--splits test`.
 2. `--output_sentences`: if specified, will output a `.sentences` file
 in the process, containing all raw annotated sentences, one sentence per line.
 3. `--with_exemplars`: if specified, will process the exemplars (data under
@@ -172,7 +169,7 @@ To also process exemplars, add the `--with_exemplars` option
 We created a set of bash scripts to preprocess FrameNet data with various
 POS taggers and dependency parsers as well as to run the `SIMPLEFRAMEID`,
 `SEMAFOR` and `OPEN-SESAME` frame semantic parsers.
-Check out [REPLICATION.md](REPLICATION.md) for a detailed HowTo manual. 
+Check out [REPLICATION.md](REPLICATION.md) for a detailed HowTo manual.
 
 
 [release-image]:https://img.shields.io/github/release/akb89/pyfn.svg?style=flat-square
diff --git a/pyfn/main.py b/pyfn/main.py
index aeba3e6..7504715 100644
--- a/pyfn/main.py
+++ b/pyfn/main.py
@@ -51,7 +51,6 @@ def _convert(args):
         raise InvalidParameterError(
             'Source and Target paths are the same! Please specify different '
             'source/target paths')
-    # TODO: add validation for input directory structure
     if args.source_format == 'fnxml':
         annosets_dict = fnxml.get_annosets_dict(args.source_path,
                                                 args.splits,
@@ -70,6 +69,8 @@ def _convert(args):
                 'need to specify the --sent parameter pointing at the '
                 '.sentences file absolute filepath')
         annosets = semaforu.unmarshall_annosets(args.source_path, args.sent)
+    ## Starting marshalling
+    os.makedirs(args.target_path, exist_ok=True)
     if args.target_format == 'bios':
         biosm.marshall_annosets_dict(annosets_dict, args.target_path,
                                      args.filter, args.output_sentences,
diff --git a/pyfn/utils/files.py b/pyfn/utils/files.py
index 3d2789d..81d72a3 100644
--- a/pyfn/utils/files.py
+++ b/pyfn/utils/files.py
@@ -73,5 +73,5 @@ def get_rolemappings_filepath(target_dirpath):
 
 
 def get_fr_relation_xml_filepath(splits_dirpath):
-    """Return the absolute path to the frRelation.xl file given splits_dirpath."""
+    """Return the absolute path to the frRelation.xml file given splits_dirpath."""
     return os.path.join(splits_dirpath, 'frRelation.xml')
diff --git a/setup.py b/setup.py
index db7367e..467c7a0 100644
--- a/setup.py
+++ b/setup.py
@@ -37,7 +37,7 @@
             'pyfn = pyfn.main:main'
         ],
     },
-    tests_require=['pytest==3.7.2', 'pylint==2.1.1', 'pytest-cov==2.5.1',
+    tests_require=['pytest==3.7.3', 'pylint==2.1.1', 'pytest-cov==2.5.1',
                    'pydocstyle==2.1.1'],
     install_requires=['PyYAML==3.13', 'mmh3==2.5.1', 'lxml==4.2.4',
                       'pytz==2018.5'],

From 854fdf40eeeab16b24807ded694ebb9917c36269 Mon Sep 17 00:00:00 2001
From: akb89 <akb@3azouz.net>
Date: Sun, 26 Aug 2018 19:01:30 -0400
Subject: [PATCH 03/15] Fixed link to pypi badge

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 2e8dc65..a476a85 100644
--- a/README.md
+++ b/README.md
@@ -175,7 +175,7 @@ Check out [REPLICATION.md](REPLICATION.md) for a detailed HowTo manual.
 [release-image]:https://img.shields.io/github/release/akb89/pyfn.svg?style=flat-square
 [release-url]:https://github.com/akb89/pyfn/releases/latest
 [pypi-image]:https://img.shields.io/pypi/v/pyfn.svg?style=flat-square
-[pypi-url]:https://github.com/akb89/pyfn/releases/latest
+[pypi-url]:https://pypi.org/project/pyfn/
 [build-image]:https://gitlab.com/akb89/pyfn/badges/master/pipeline.svg
 [build-url]:https://gitlab.com/akb89/pyfn/commits/master
 [coverage-image]:https://img.shields.io/coveralls/akb89/pyfn/master.svg?style=flat-square

From 57a408a8febb8f30d3b7eee3d866d92fe04d93b4 Mon Sep 17 00:00:00 2001
From: akb89 <akb@3azouz.net>
Date: Thu, 30 Aug 2018 12:24:23 -0400
Subject: [PATCH 04/15] Updated pytest dependency

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 467c7a0..332d8f2 100644
--- a/setup.py
+++ b/setup.py
@@ -37,7 +37,7 @@
             'pyfn = pyfn.main:main'
         ],
     },
-    tests_require=['pytest==3.7.3', 'pylint==2.1.1', 'pytest-cov==2.5.1',
+    tests_require=['pytest==3.7.4', 'pylint==2.1.1', 'pytest-cov==2.5.1',
                    'pydocstyle==2.1.1'],
     install_requires=['PyYAML==3.13', 'mmh3==2.5.1', 'lxml==4.2.4',
                       'pytz==2018.5'],

From 8b5592088307dce75afb1cda892df2410a0253da Mon Sep 17 00:00:00 2001
From: akb89 <akb@3azouz.net>
Date: Thu, 30 Aug 2018 12:24:40 -0400
Subject: [PATCH 05/15] Updated helper description

---
 scripts/score.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/score.sh b/scripts/score.sh
index 6660141..1bbe612 100755
--- a/scripts/score.sh
+++ b/scripts/score.sh
@@ -5,7 +5,7 @@ source "$(dirname "${BASH_SOURCE[0]}")/setup.sh"
 show_help() {
 cat << EOF
 Usage: ${0##*/} [-h] -x XP_NUM -p {semafor,open-sesame} -s {dev,test} -f {gold,predicted}
-Score frame semantic parsing with the SEMEVAL scoring scripts modified by Kshirsagar et al. (2015).
+Score frame semantic parsing with a modified version of the SEMEVAL scoring script.
 
   -h, --help                           display this help and exit
   -x, --xp      XP_NUM                 xp number written as 3 digits (e.g. 001)

From fad78c2b467bbf51adaccd3f7ec5ecb9e72eda16 Mon Sep 17 00:00:00 2001
From: akb89 <akb@3azouz.net>
Date: Thu, 30 Aug 2018 12:24:54 -0400
Subject: [PATCH 06/15] Refactored README with scripts documentation

---
 README.md      | 452 ++++++++++++++++++++++++++++++++++++++++++++++---
 REPLICATION.md | 172 -------------------
 2 files changed, 430 insertions(+), 194 deletions(-)
 delete mode 100644 REPLICATION.md

diff --git a/README.md b/README.md
index a476a85..acbf04d 100644
--- a/README.md
+++ b/README.md
@@ -9,15 +9,14 @@
 
 Welcome to **pyfn**, a Python modules to process FrameNet annotation.
 
-pyfn can be used to convert data to and from:
-- FRAMENET XML: the format of the released FrameNet XML data
-- SEMEVAL XML: the format of the SEMEVAL 2007 shared task 19 on frame semantic structure extraction
-- SEMAFOR CoNLL: the format used by the SEMAFOR parser
-- BIOS: the format used by the OPEN-SESAME parser
-- CoNLL-X: the format used by various state-of-the-art POS taggers and dependency parsers (see preprocessing considerations for frame semantic parsing in [REPLICATION.md](REPLICATION.md))
+pyfn can be used to:
 
-As well as to generate the `.csv` hierarchy files used by both SEMAFOR and
-OPEN-SESAME parsers to integrate the hierarchy feature (see (Kshirsagar et al., 2015) for details).
+1. [convert](#conversion) data to and from FRAMENET XML, SEMEVAL XML, SEMAFOR CoNLL, BIOS and
+CoNLL-X
+2. [preprocess](#preprocessing-and-frame-semantic-parsing) FrameNet data using a standardized state-of-the-art pipeline
+3. [run](#preprocessing-and-frame-semantic-parsing) the SEMAFOR and OPEN-SESAME frame semantic parsers
+4. [build](#marshalling-and-unmarshalling-of-framenet-xml-data) your own frame semantic parser using a standard set of python models
+to marshall/unmarshall FrameNet XML data
 
 This repository also accompanies the (Kabbach et al., 2018) paper:
 
@@ -36,11 +35,6 @@ This repository also accompanies the (Kabbach et al., 2018) paper:
 }
 ```
 
-To use `pyfn` to replicate frame semantic parsing results for SEMAFOR,
-OPEN-SESAME and SIMPLEFRAMEID on a common preprocessing pipeline,
-or to replicate results reported in (Kabbach et al., 2018),
-check out [REPLICATION.md](REPLICATION.md).
-
 ## Dependencies
 On Unix, you may need to install the following packages:
 ```
@@ -68,13 +62,32 @@ When using pyfn, your FrameNet splits directory structure should follow:
 |   |   |-- lu
 ```
 
-## Formats
+## Conversion
+
+pyfn can be used to convert data to and from:
+- FRAMENET XML: the format of the released FrameNet XML data
+- SEMEVAL XML: the format of the SEMEVAL 2007 shared task 19 on frame semantic structure extraction
+- SEMAFOR CoNLL: the format used by the SEMAFOR parser
+- BIOS: the format used by the OPEN-SESAME parser
+- CoNLL-X: the format used by various state-of-the-art POS taggers and dependency parsers (see preprocessing considerations for frame semantic parsing [below](#preprocessing-and-frame-semantic-parsing))
+
+As well as to generate the `.csv` hierarchy files used by both SEMAFOR and
+OPEN-SESAME parsers to integrate the hierarchy feature (see (Kshirsagar et al., 2015) for details).
+
 For an exhaustive description of all formats, check out [FORMAT.md](FORMAT.md).
 
-## Conversion HowTo
+### HowTo
+
 The following sections provide examples of commands to convert FN data
 to and from different formats. All commands can make use of the following options:
-1. `--splits`: specify which splits should be converted. `--splits train` will generate all train/dev/test splits, according to data found under the fndata-1.x/{train/dev/test} directories. `--splits dev` will generate the dev and test splits according to data found under the fndata-1.x/{dev/test} directories. This option will skip the train splits but generate the same dev/test splits that would have been generated with `--splits train`. `--splits test` will generate the test splits according to data found under the fndata-1.x/test directory, and skip the train/dev splits. The test splits generated with `--splits test` will be the same as those generated with the `--splits train` and `--splits dev`. Default to `--splits test`.
+1. `--splits`: specify which splits should be converted. `--splits train` will generate all
+train/dev/test splits, according to data found under the fndata-1.x/{train/dev/test}
+directories. `--splits dev` will generate the dev and test splits according to data found under
+the fndata-1.x/{dev/test} directories. This option will skip the train splits but generate the
+same dev/test splits that would have been generated with `--splits train`. `--splits test` will
+generate the test splits according to data found under the fndata-1.x/test directory, and skip
+the train/dev splits. The test splits generated with `--splits test` will be the same as those
+generated with the `--splits train` and `--splits dev`. Default to `--splits test`.
 2. `--output_sentences`: if specified, will output a `.sentences` file
 in the process, containing all raw annotated sentences, one sentence per line.
 3. `--with_exemplars`: if specified, will process the exemplars (data under
@@ -85,7 +98,7 @@ For details on `pyfn` usage, do:
 ```bash
 pyfn --help
 pyfn generate --help
-pyfn convert --help
+pyfn convert --help
 ```
 
 ### From FN XML to BIOS
@@ -165,11 +178,406 @@ pyfn generate \
 ```
 To also process exemplars, add the `--with_exemplars` option
 
-### Using preprocessing and frame semantic parsing scripts
-We created a set of bash scripts to preprocess FrameNet data with various
-POS taggers and dependency parsers as well as to run the `SIMPLEFRAMEID`,
-`SEMAFOR` and `OPEN-SESAME` frame semantic parsers.
-Check out [REPLICATION.md](REPLICATION.md) for a detailed HowTo manual.
+
+## Preprocessing and Frame Semantic Parsing
+pyfn ships with a set of bash scripts to preprocess FrameNet data with
+various POS taggers and dependency parsers, as well as to perform frame
+semantic parsing with a variety of open-source parsers.
+
+Currently supported POS taggers include:
+- MXPOST (Ratnaparkhi, 1996)
+- NLP4J (Choi, 2016)
+
+Currently supported dependency parsers include:
+- MST (McDonald et al., 2006)
+- BIST BARCH (Kiperwasser and Goldberg, 2016)
+- BIST BMST (Kiperwasser and Goldberg, 2016)
+
+Currently supported frame semantic parsers include:
+- SIMPLEFRAMEID (Hartmann et al., 2017) for frame identification
+- SEMAFOR (Kshirsagar et al., 2015) for argument identification
+- OPEN-SESAME (Swayamdipta et al., 2017) for argument identification
+
+To request support for a POS tagger, a dependency parser or a frame semantic
+parser, please create an [issue](https://github.com/akb89/pyfn/issues).
+
+### Download
+To run the preprocessing and frame semantic parsing scripts, first download:
+- [data.7z](https://github.com/akb89/pyfn/releases/download/v1.0.0/data.7z) containing all the FrameNet splits for FN 1.5 and FN 1.7
+- [lib.7z](https://github.com/akb89/pyfn/releases/download/v1.0.0/lib.7z) containing all the different external softwares (taggers, parsers, etc.)
+- [resources.7z](https://github.com/akb89/pyfn/releases/download/v1.0.0/resources.7z) containing all the required resources
+- [scripts.7z](https://github.com/akb89/pyfn/releases/download/v1.0.0/scripts.7z) containing the set of bash scripts to call the different parsers and preprocessing toolkits
+
+Extract the content of all the archives under a
+directory named `pyfn`. Your pyfn folder structure should look like:
+```
+.
+|-- pyfn
+|   |-- data
+|   |   |-- fndata-1.5-with-dev
+|   |   |-- fndata-1.7-with-dev
+|   |-- lib
+|   |   |-- bistparser
+|   |   |-- jmx
+|   |   |-- mstparser
+|   |   |-- nlp4j
+|   |   |-- open-sesame
+|   |   |-- semafor
+|   |   |-- semeval
+|   |-- resources
+|   |   |-- bestarchybrid.model
+|   |   |-- bestarchybrid.params
+|   |   |-- bestfirstorder.model
+|   |   |-- bestfirstorder.params
+|   |   |-- config-decode-pos.xml
+|   |   |-- nlp4j.plemma.model.all.xz
+|   |   |-- sskip.100.vectors
+|   |   |-- wsj.model
+|   |-- scripts
+|   |   |-- CoNLLizer.py
+|   |   |-- deparse.sh
+|   |   |-- flatten.sh
+|   |   |-- ...
+```
+
+**Please strictly follow this directory structure to avoid unexpected errors. `pyfn` relies on a lot of relative path resolutions to make scripts calls shorter, and changing this directory structure can break everything.**
+
+### Setup NLP4J for POS tagging
+
+To use NLP4J for POS tagging, modify the `resources/config-decode-pos.xml`
+file by replacing the models.pos absolute path to
+your `resources/nlp4j.plemma.model.all.xz`:
+```xml
+<configuration>
+	...
+	<models>
+		<pos>/absolute/path/to/pyfn/resources/nlp4j.plemma.model.all.xz</pos>
+	</models>
+</configuration>
+```
+
+### Setup DyNET for BIST or OPEN-SESAME
+
+If you intend to use the BIST parser for dependency parsing or
+OPEN-SESAME for frame semantic parsing, you will need
+to install DyNET 2.0.2 following:
+```
+https://dynet.readthedocs.io/en/2.0.2/python.html
+```
+
+### Setup SEMAFOR
+To use the SEMAFOR frame semantic parser, modify the `scripts/setup.sh` file:
+```bash
+# SEMAFOR options to be changed according to your env
+export JAVA_HOME_BIN="/abs/path/to/java/jdk/bin"
+export num_threads=2 # number of threads to use
+export min_ram=4g # min RAM allocated to the JVM in GB. Corresponds to the -Xms argument
+export max_ram=8g # max RAM allocated to the JVM in GB. Corresponds to the -Xmx argument
+
+# SEMAFOR hyperparameters
+export kbest=1 # keep k-best parse
+export lambda=0.000001 # hyperparameter for argument identification. Refer to Kshirsagar et al. (2015) for details.
+export batch_size=4000 # number of batches processed at once for argument identification.
+export save_every_k_batches=400 # for argument identification
+export num_models_to_save=60 # for argument identification
+```
+
+### Using the SEMEVAL PERL evaluation scripts
+
+If you intend to use the SEMEVAL perl evaluation scripts, make sure
+to have the `App::cpanminus` and `XML::Parser` modules installed:
+```
+cpan App::cpanminus
+cpanm XML::Parser
+```
+
+### Using bash scripts
+
+Each script comes with a helper: check it out with `--help`!
+
+**Careful!** most scripts expect data output by `pyfn convert ...`
+to be located under `pyfn/experiments/xp_XYZ/data` where `XYZ` stands for
+the experiments number and is specified using the `-x XYZ` argument, and where
+the `experiments` directory is located at the same level as the `scripts`
+directory. This opinionated choice has proven extremely useful in launching
+scripts by batch on a large set of experiments as it avoids having to input
+the full path each time.
+
+**Make sure to use**
+
+```bash
+pyfn convert \
+  --from ... \
+  --to ... \
+  --source ... \
+  --target /abs/path/to/pyfn/experiments/xp_XYZ/data \
+  --splits ...
+```
+
+**BEFORE** calling `preprocess.sh`, `prepare.sh`, `semafor.sh` or
+`open-sesame.sh`
+
+### preprocess.sh
+
+Use `preprocess.sh` to POS-tag and dependency-parse FrameNet splits generated
+with `pyfn convert ...`. The helper should display:
+
+```shell
+Usage: ${0##*/} [-h] -x XP_NUM -t {mxpost,nlp4j} -p {semafor,open-sesame} [-d {mst,bmst,barch}] [-v]
+Preprocess FrameNet train/dev/test splits.
+
+  -h, --help                           display this help and exit
+  -x, --xp      XP_NUM                 xp number written as 3 digits (e.g. 001)
+  -t, --tagger  {mxpost,nlp4j}         pos tagger to be used: 'mxpost' or 'nlp4j'
+  -p, --parser  {semafor,open-sesame}  frame semantic parser to be used: 'semafor' or 'open-sesame'
+  -d, --dep     {mst,bmst,barch}       dependency parser to be used: 'mst', 'bmst' or 'barch'
+  -v, --dev                            if set, script will also preprocess dev splits
+```
+
+Suppose you generated FrameNet splits for SEMAFOR using:
+
+```bash
+pyfn convert \
+  --from fnxml \
+  --to semafor \
+  --source /path/to/fndata-1.7-with-dev \
+  --target /path/to/experiments/xp_001/data \
+  --splits train \
+  --output_sentences
+```
+
+You can preprocess those splits with NLP4J and BMST using
+
+```bash
+./preprocess.sh -x 001 -t nlp4j -d bmst -p semafor
+```
+
+### prepare.sh
+
+Use `prepare.sh` to automatically generate misc. data required by the
+frame semantic parsing pipeline, such as gold SEMEVAL XML files for scoring,
+the `framenet.frame.element.map` and the hierarchy `.csv` files
+used by SEMAFOR, or the `frames.xml` and `frRelation.xml` files used by
+both SEMAFOR and OPEN-SESAME. The helper should display:
+
+```shell
+Usage: ${0##*/} [-h] -x XP_NUM -p {semafor,open-sesame} -s {dev,test} -f FN_DATA_DIR [-u] [-e]
+Prepare misc. data for frame semantic parsing.
+
+  -h, --help                                   display this help and exit
+  -x, --xp              XP_NUM                 xp number written as 3 digits (e.g. 001)
+  -p, --parser          {semafor,open-sesame}  frame semantic parser to be used: 'semafor' or 'open-sesame'
+  -s, --splits          {dev,test}             which splits to score: dev or test
+  -f, --fn              FN_DATA_DIR            absolute path to FrameNet data directory
+  -u, --with_hierarchy                         if specified, will use the hierarchy feature
+  -e, --with_exemplars                         if specified, will use the exemplars
+```
+
+Suppose you generated FrameNet splits for SEMAFOR using:
+
+```bash
+pyfn convert \
+  --from fnxml \
+  --to semafor \
+  --source /path/to/fndata-1.7-with-dev \
+  --target /path/to/experiments/xp_001/data \
+  --splits train \
+  --output_sentences
+```
+
+You can prepare SEMAFOR data using:
+
+```bash
+./prepare.sh -x 001 -p semafor -s test -f /path/to/fndata-1.7-with-dev
+```
+
+### frameid.sh
+
+Use `frameid.sh` to perform frame identification using SIMPLEFRAMEID.
+The helper should display:
+
+```shell
+Usage: ${0##*/} [-h] -m {train,decode} -x XP_NUM [-p {semafor,open-sesame}]
+Perform frame identification.
+
+  -h, --help                            display this help and exit
+  -m, --mode                            train on all models or decode using a single model
+  -x, --xp       XP_NUM                 xp number written as 3 digits (e.g. 001)
+  -p, --parser   {semafor,open-sesame}  formalize decoded frames for specified parser
+```
+
+Suppose you generated FrameNet splits for SEMAFOR using:
+
+```bash
+pyfn convert \
+  --from fnxml \
+  --to semafor \
+  --source /path/to/fndata-1.7-with-dev \
+  --target /path/to/experiments/xp_101/data \
+  --splits train \
+  --output_sentences
+```
+
+*After preprocessing*, you can train the SIMPLEFRAMEID parser using:
+
+```bash
+./frameid.sh -m train -x 101
+```
+
+and decode (**before decoding argument identification**) using:
+
+```bash
+./frameid.sh -m decode -x 101 -p semafor
+```
+
+### semafor.sh
+
+Use `semafor.sh` to train the SEMAFOR parser or decode the test/dev splits.
+The helper should display:
+
+```shell
+Usage: ${0##*/} [-h] -m {train,decode} -x XP_NUM [-s {dev,test}] [-u]
+Train or decode with the SEMAFOR parser.
+
+  -h, --help                             display this help and exit
+  -m, --mode            {train,decode}   semafor mode to use: train or decode
+  -x, --xp              XP_NUM           xp number written as 3 digits (e.g. 001)
+  -s, --splits          {dev,test}       which splits to use in decode mode: dev or test
+  -u, --with_hierarchy                   if specified, parser will use the hierarchy feature
+```
+
+Suppose you generated FrameNet splits for SEMAFOR using:
+
+```bash
+pyfn convert \
+  --from fnxml \
+  --to semafor \
+  --source /path/to/fndata-1.7-with-dev \
+  --target /path/to/experiments/xp_001/data \
+  --splits train \
+  --output_sentences
+```
+
+*After preprocessing and preparation*, you can train the SEMAFOR parser using:
+
+```bash
+./semafor.sh -m train -x 001
+```
+
+and decode the test splits using:
+
+```
+./semafor.sh -m decode -x 001 -s test
+```
+
+### open-sesame.sh
+
+Use `open-sesame.sh` to train the OPEN-SESAME parser or decode the test/dev splits.
+The helper should display:
+
+```shell
+Usage: ${0##*/} [-h] -m {train,decode} -x XP_NUM [-s {dev,test}] [-d] [-u]
+Train or decode with the OPEN-SESAME parser.
+
+  -h, --help                              display this help and exit
+  -m, --mode              {train,decode}  open-sesame mode to use: train or decode
+  -x, --xp                XP_NUM          xp number written as 3 digits (e.g. 001)
+  -s, --splits            {dev,test}      which splits to use in decode mode: dev or test
+  -d, --with_dep_parses                   if specified, parser will use dependency parses
+  -u, --with_hierarchy                    if specified, parser will use the hierarchy feature
+```
+
+Suppose you generated FrameNet splits for OPEN-SESAME using:
+
+```bash
+pyfn convert \
+  --from fnxml \
+  --to bios \
+  --source /path/to/fndata-1.7-with-dev \
+  --target /path/to/experiments/xp_002/data \
+  --splits train \
+  --output_sentences \
+  --filter overlap_fes
+```
+
+*After preprocessing and preparation*, you can train the OPEN-SESAME parser using:
+
+```bash
+./open-sesame.sh -m train -x 002
+```
+
+and decode the test splits using:
+
+```
+./open-sesame.sh -m decode -x 002 -s test
+```
+
+### score.sh
+
+Use `score.sh` to obtain P/R/F1 scores for frame semantic parsing on
+dev/test splits with the SEMEVAL scoring script, using gold or predicted frames.
+The helper should display:
+
+```shell
+Usage: ${0##*/} [-h] -x XP_NUM -p {semafor,open-sesame} -s {dev,test} -f {gold,predicted}
+Score frame semantic parsing with a modified version of the SEMEVAL scoring script.
+
+  -h, --help                           display this help and exit
+  -x, --xp      XP_NUM                 xp number written as 3 digits (e.g. 001)
+  -p, --parser  {semafor,open-sesame}  frame semantic parser to be used: 'semafor' or 'open-sesame'
+  -s, --splits  {dev,test}             which splits to score: dev or test
+  -f, --frames  {gold,predicted}       score with gold or predicted frames
+```
+
+Note that scoring is done with an updated version of the SEMEVAL perl script,
+in order to obtain more robust scores across setups. For a full account
+of the modifications, refer to (Kabbach et al., 2018) and to the perl scripts
+located under `lib/semeval/`.
+
+To obtain scores for SEMAFOR using gold frames on test splits, use:
+
+```
+./score.sh -x XYZ -p semafor -s test -f gold
+```
+
+To obtain scores for SEMAFOR using predicted frames on test splits, use:
+
+```
+./score.sh -x XYZ -p semafor -s test -f predicted
+```
+
+## Replication
+
+The `experiments` directory provides a detailed set of instructions to
+replicate all results reported in (Kabbach et al., 2018) on experimental
+butterfly effects in frame semantic parsing. Those instructions can be used
+to compare the performances of different frame semantic parsers in various
+experimental setups.
+
+
+## Marshalling and Unmarshalling of FrameNet XML data
+
+`pyfn` provides a set of Python models to process FrameNet XML data.
+Those can be used to help you build your own frame semantic parser.
+
+
+## Citation
+
+If you use pyfn please cite:
+```tex
+@InProceedings{C18-1267,
+  author = 	"Kabbach, Alexandre
+		and Ribeyre, Corentin
+		and Herbelot, Aur{\'e}lie",
+  title = 	"Butterfly Effects in Frame Semantic Parsing: impact of data processing on model ranking",
+  booktitle = 	"Proceedings of the 27th International Conference on Computational Linguistics",
+  year = 	"2018",
+  publisher = 	"Association for Computational Linguistics",
+  pages = 	"3158--3169",
+  location = 	"Santa Fe, New Mexico, USA",
+  url = 	"http://aclweb.org/anthology/C18-1267"
+}
+```
 
 
 [release-image]:https://img.shields.io/github/release/akb89/pyfn.svg?style=flat-square
diff --git a/REPLICATION.md b/REPLICATION.md
deleted file mode 100644
index 6242d8c..0000000
--- a/REPLICATION.md
+++ /dev/null
@@ -1,172 +0,0 @@
-# Replication
-
-pyfn provides a set of models and utils to apply custom preprocessing
-pipelines to FrameNet XML data and perform frame semantic parsing using
-SEMAFOR, OPEN-SESAME or SIMPLEFRAMEID.
-
-Currently supported POS taggers include:
-- MXPOST (Ratnaparkhi, 1996)
-- NLP4J (Choi, 2016)
-
-Currently supported dependency parsers include:
-- MST (McDonald et al., 2006)
-- BIST BARCH (Kiperwasser and Goldberg, 2016)
-- BIST BMST (Kiperwasser and Goldberg, 2016)
-
-Currently supported frame semantic parsers include:
-- SIMPLEFRAMEID (Hartmann et al., 2017) for frame identification
-- SEMAFOR (Kshirsagar et al., 2015) for argument identification
-- OPEN-SESAME (Swayamdipta et al., 2017) for argument identification
-
-## Download
-Download the following:
-- [data.7z](https://github.com/akb89/pyfn/releases/download/v1.0.0/data.7z) containing all the FrameNet splits for FN 1.5 and FN 1.7
-- [lib.7z](https://github.com/akb89/pyfn/releases/download/v1.0.0/lib.7z) containing all the different external softwares (taggers, parsers, etc.)
-- [resources.7z](https://github.com/akb89/pyfn/releases/download/v1.0.0/resources.7z) containing all the required resources
-- [scripts.7z](https://github.com/akb89/pyfn/releases/download/v1.0.0/scripts.7z) containing the set of bash scripts to call the different parsers and preprocessing toolkits
-
-Extract the content of all the archives under a
-directory named `pyfn`. Your pyfn folder structure should look like:
-```
-.
-|-- pyfn
-|   |-- data
-|   |   |-- fndata-1.5-with-dev
-|   |   |-- fndata-1.7-with-dev
-|   |-- lib
-|   |   |-- bistparser
-|   |   |-- jmx
-|   |   |-- mstparser
-|   |   |-- nlp4j
-|   |   |-- open-sesame
-|   |   |-- semafor
-|   |   |-- semeval
-|   |-- resources
-|   |   |-- bestarchybrid.model
-|   |   |-- bestarchybrid.params
-|   |   |-- bestfirstorder.model
-|   |   |-- bestfirstorder.params
-|   |   |-- config-decode-pos.xml
-|   |   |-- nlp4j.plemma.model.all.xz
-|   |   |-- sskip.100.vectors
-|   |   |-- wsj.model
-|   |-- scripts
-|   |   |-- CoNLLizer.py
-|   |   |-- deparse.sh
-|   |   |-- flatten.sh
-|   |   |-- ...
-```
-
-**Please strictly follow this directory structure to avoid unexpected errors. `pyfn` relies on a lot of relative path resolutions to make scripts calls shorter, and changing this directory structure can brake everything**
-
-## Install
-```
-pip3 install pyfn
-```
-
-## Setup
-
-### Using NLP4J for POS tagging
-To use NLP4J for POS tagging, modify the `resources/config-decode-pos.xml`
-file by replacing the models.pos absolute path to
-your `resources/nlp4j.plemma.model.all.xz`:
-```xml
-<configuration>
-	...
-	<models>
-		<pos>/absolute/path/to/pyfn/resources/nlp4j.plemma.model.all.xz</pos>
-	</models>
-</configuration>
-```
-
-### Using BIST or OPEN-SESAME
-If you intend to use the BIST for dependency parsing or
-OPEN-SESAME for frame semantic parsing, you need
-to install DyNET 2.0.2 following:
-```
-https://dynet.readthedocs.io/en/2.0.2/python.html
-```
-
-### Using SEMAFOR
-To use the SEMAFOR frame semantic parser, modify the `scripts/setup.sh` file:
-```bash
-# SEMAFOR options to be changed according to your env
-export JAVA_HOME_BIN="/abs/path/to/java/jdk/bin"
-export num_threads=2 # number of threads to use
-export min_ram=4g # min RAM allocated to the JVM in GB. Corresponds to the -Xms argument
-export max_ram=8g # max RAM allocated to the JVM in GB. Corresponds to the -Xmx argument
-
-# SEMAFOR hyperparameters
-export kbest=1 # keep k-best parse
-export lambda=0.000001 # hyperparameter for argument identification. Refer to Kshirsagar et al. (2015) for details.
-export batch_size=4000 # number of batches processed at once for argument identification.
-export save_every_k_batches=400 # for argument identification
-export num_models_to_save=60 # for argument identification
-```
-
-### Using the SEMEVAL PERL evaluation scripts
-If you intend to use the SEMEVAL perl evaluation scripts, make sure
-to have the `App::cpanminus` and `XML::Parser` modules installed:
-```
-cpan App::cpanminus
-cpanm XML::Parser
-```
-
-## Replication
-The `experiments` directory provides a detailed set of instructions to
-replicate all results reported in (Kabbach et al., 2018) on experimental
-butterfly effects in frame semantic parsing. Those instructions can be used
-to compare the performances of different frame semantic parsers in various
-experimental setups.
-
-## Scripts
-Each script comes with a helper: check it out with `--help`!
-
-We have made some opinionated choices on how to use the preprocessing and
-frame semantic parsing bash scripts. Those choices are primarly motivated
-by constraints on running *many* experiences at once, in various experimental
-setups, and on having commands that are easy to *type*.
-
-The main choice lies in the directory structure. Each script in the `scripts`
-directory expects an XP_DIR argument that specifies the experiments ID.
-When you specify:
-
-```
-./prepare.sh -x 042 ...
-```
-
-or
-
-```
-./preprocess.sh -x 042
-```
-
-The scripts expects the data to process to be located under `.../experiments/xp_042/data` where the `experiments` dir is at the same level
-as the `scripts` dir.
-
-### prepare.sh
-
-### preprocess.sh
-
-### semafor.sh
-
-### open-sesame.sh
-
-
-
-## Citation
-If you use pyfn please cite:
-```tex
-@InProceedings{C18-1267,
-  author = 	"Kabbach, Alexandre
-		and Ribeyre, Corentin
-		and Herbelot, Aur{\'e}lie",
-  title = 	"Butterfly Effects in Frame Semantic Parsing: impact of data processing on model ranking",
-  booktitle = 	"Proceedings of the 27th International Conference on Computational Linguistics",
-  year = 	"2018",
-  publisher = 	"Association for Computational Linguistics",
-  pages = 	"3158--3169",
-  location = 	"Santa Fe, New Mexico, USA",
-  url = 	"http://aclweb.org/anthology/C18-1267"
-}
-```

From f35b694631caf9e53cfd0f88f5a4945b69506dca Mon Sep 17 00:00:00 2001
From: akb89 <akb@3azouz.net>
Date: Thu, 30 Aug 2018 15:57:24 -0400
Subject: [PATCH 07/15] Updated description and version

---
 setup.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/setup.py b/setup.py
index 332d8f2..bd69abd 100644
--- a/setup.py
+++ b/setup.py
@@ -11,12 +11,12 @@
 
 setup(
     name='pyfn',
-    description='A python module to process FrameNet XML data',
+    description='A python module to process data for Frame Semantic Parsing',
     author='Alexandre Kabbach',
     author_email='akb@3azouz.net',
     long_description=long_description,
     long_description_content_type='text/markdown',
-    version='1.1.2',
+    version='1.2.0',
     url='https://gitlab.com/akb89/pyfn',
     download_url='https://pypi.org/project/pyfn/#files',
     license='MIT',

From 7564e2791b0d40801a13eb9a902222d45b0f79fe Mon Sep 17 00:00:00 2001
From: akb89 <akb@3azouz.net>
Date: Thu, 30 Aug 2018 15:57:46 -0400
Subject: [PATCH 08/15] Added exception for invalid parameters

---
 pyfn/marshalling/unmarshallers/framenet.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/pyfn/marshalling/unmarshallers/framenet.py b/pyfn/marshalling/unmarshallers/framenet.py
index 3f1c612..fe0b25a 100644
--- a/pyfn/marshalling/unmarshallers/framenet.py
+++ b/pyfn/marshalling/unmarshallers/framenet.py
@@ -10,6 +10,7 @@
 import pyfn.utils.filter as f_utils
 import pyfn.utils.xml as xml_utils
 
+from pyfn.exceptions.parameter import InvalidParameterError
 from pyfn.exceptions.xml import XMLProcessingError
 
 from pyfn.models.annotationset import AnnotationSet
@@ -310,7 +311,7 @@ def _extract_ft_annosets(ft_filepaths, fe_dict, flatten=False):
 
 
 def extract_annosets(splits_dirpath, with_fulltexts, with_exemplars,
-                     fe_dict, flatten=False):
+                     fe_dict={}, flatten=False):
     """Return a list of pyfn.AnnotationSet extracted from splits paths.
 
     The splits directory should contain two subdirectories name 'fulltext'
@@ -387,6 +388,10 @@ def _get_fe_dict(frame_xml_filepaths):
 
 
 def _get_annosets_dict_from_fn_xml(fn_splits_dirpath, splits, with_exemplars):
+    if splits not in ('train', 'dev', 'test'):
+        raise InvalidParameterError(
+            'Invalid splits name `{}`. Should be `train`, `dev` or `test`'
+            .format(splits))
     fe_dict = _get_fe_dict(xml_utils.get_xml_filepaths(fn_splits_dirpath,
                                                        'frame'))
     if splits == 'test':

From 44edfdf9b07ce322d1d710fee6a07bec46aa5255 Mon Sep 17 00:00:00 2001
From: akb89 <akb@3azouz.net>
Date: Thu, 30 Aug 2018 15:58:02 -0400
Subject: [PATCH 09/15] Refactored function name for consistency across package

---
 pyfn/marshalling/unmarshallers/semeval.py | 4 ++--
 tests/test_unmarshallers_semeval.py       | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/pyfn/marshalling/unmarshallers/semeval.py b/pyfn/marshalling/unmarshallers/semeval.py
index 2056c65..d762cc6 100644
--- a/pyfn/marshalling/unmarshallers/semeval.py
+++ b/pyfn/marshalling/unmarshallers/semeval.py
@@ -5,12 +5,12 @@
 
 import pyfn.marshalling.unmarshallers.framenet as fn_unmarshaller
 
-__all__ = ['unmarshall_semeval07_xml']
+__all__ = ['unmarshall_annosets']
 
 logger = logging.getLogger(__name__)
 
 
-def unmarshall_semeval07_xml(xml_filepath, fe_dict, flatten=False):
+def unmarshall_annosets(xml_filepath, fe_dict={}, flatten=False):
     """Unmarshall a SemEval 2007 FrameNet XML file from file path.
 
     Return a generator of AnnotationSet instances extracted from the
diff --git a/tests/test_unmarshallers_semeval.py b/tests/test_unmarshallers_semeval.py
index 7a7062c..90833ae 100644
--- a/tests/test_unmarshallers_semeval.py
+++ b/tests/test_unmarshallers_semeval.py
@@ -7,7 +7,7 @@
 
 SEMEVAL_XML_FILE = os.path.join(os.path.dirname(__file__), 'resources', 'semeval.xml')
 
-semeval_annosets_list = list(semeval_unmarshaller.unmarshall_semeval07_xml(SEMEVAL_XML_FILE, {}))
+semeval_annosets_list = list(semeval_unmarshaller.unmarshall_annosets(SEMEVAL_XML_FILE, {}))
 
 
 def test_semeval_annoset():

From f2d00aba5dc6174dcf832f2b735c86635a73fa30 Mon Sep 17 00:00:00 2001
From: akb89 <akb@3azouz.net>
Date: Thu, 30 Aug 2018 15:58:16 -0400
Subject: [PATCH 10/15] Updated documentation

---
 README.md | 191 ++++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 171 insertions(+), 20 deletions(-)

diff --git a/README.md b/README.md
index acbf04d..8736718 100644
--- a/README.md
+++ b/README.md
@@ -7,15 +7,15 @@
 [![FrameNet][framenet-image]][framenet-url]
 [![MIT License][license-image]][license-url]
 
-Welcome to **pyfn**, a Python modules to process FrameNet annotation.
+Welcome to `pyfn`, a Python module to process FrameNet annotation.
 
-pyfn can be used to:
+`pyfn` can be used to:
 
-1. [convert]() data to and from FRAMENET XML, SEMEVAL XML, SEMAFOR CoNLL, BIOS and
+1. [convert](#conversion) data to and from FRAMENET XML, SEMEVAL XML, SEMAFOR CoNLL, BIOS and
 CoNLL-X
-2. [preprocess]() FrameNet data using a standardized state-of-the-art pipeline
-3. [run]() the SEMAFOR and OPEN-SESAME frame semantic parsers
-4. [build]() your own frame semantic parser using a standard set of python models
+2. [preprocess](#preprocessing-and-frame-semantic-parsing) FrameNet data using a standardized state-of-the-art pipeline
+3. [run](#preprocessing-and-frame-semantic-parsing) the SEMAFOR and OPEN-SESAME frame semantic parsers
+4. [build](#marshalling-and-unmarshalling-framenet-xml-data) your own frame semantic parser using a standard set of python models
 to marshall/unmarshall FrameNet XML data
 
 This repository also accompanies the (Kabbach et al., 2018) paper:
@@ -47,10 +47,10 @@ pip3 install pyfn
 ```
 
 ## Use
-When using pyfn, your FrameNet splits directory structure should follow:
+When using `pyfn`, your FrameNet splits directory structure should follow:
 ```
 .
-|-- fndata-1.x
+|-- fndata-1.x-with-dev
 |   |-- train
 |   |   |-- fulltext
 |   |   |-- lu
@@ -60,11 +60,14 @@ When using pyfn, your FrameNet splits directory structure should follow:
 |   |-- test
 |   |   |-- fulltext
 |   |   |-- lu
+|   |-- frame
+|   |-- frRelation.xml
+|   |-- semTypes.xml
 ```
 
 ## Conversion
 
-pyfn can be used to convert data to and from:
+`pyfn` can be used to convert data to and from:
 - FRAMENET XML: the format of the released FrameNet XML data
 - SEMEVAL XML: the format of the SEMEVAL 2007 shared task 19 on frame semantic structure extraction
 - SEMAFOR CoNLL: the format used by the SEMAFOR parser
@@ -98,7 +101,7 @@ For details on `pyfn` usage, do:
 ```bash
 pyfn --help
 pyfn generate --help
-convert --help
+pyfn convert --help
 ```
 
 ### From FN XML to BIOS
@@ -180,7 +183,7 @@ To also process exemplars, add the `--with_exemplars` option
 
 
 ## Preprocessing and Frame Semantic Parsing
-pyfn ships in with a set of bash scripts to preprocess FrameNet data with
+`pyfn` ships in with a set of bash scripts to preprocess FrameNet data with
 various POS taggers and dependency parsers, as well as to perform frame
 semantic parsing with a variety of open-source parsers.
 
@@ -199,7 +202,7 @@ Currently supported frame semantic parsers include:
 - OPEN-SESAME (Swayamdipta et al., 2017) for argument identification
 
 To request support for a POS tagger, a dependency parser or a frame semantic
-parser, please create an [issue](https://github.com/akb89/pyfn/issues).
+parser, please create an [issue](https://github.com/akb89/pyfn/issues) on Github/Gitlab.
 
 ### Download
 To run the preprocessing and frame semantic parsing scripts, first download:
@@ -322,7 +325,7 @@ pyfn convert \
 Use `preprocess.sh` to POS-tag and dependency-parse FrameNet splits generated
 with `pyfn convert ...`. The helper should display:
 
-```shell
+```
 Usage: ${0##*/} [-h] -x XP_NUM -t {mxpost,nlp4j} -p {semafor,open-sesame} [-d {mst,bmst,barch}] [-v]
 Preprocess FrameNet train/dev/test splits.
 
@@ -360,7 +363,7 @@ the `framenet.frame.element.map` and the hierarchy `.csv` files
 used by SEMAFOR, or the `frames.xml` and `frRelations.xml` files used by
 both SEMAFOR and OPEN-SESAME. The helper should display:
 
-```shell
+```
 Usage: ${0##*/} [-h] -x XP_NUM -p {semafor,open-sesame} -s {dev,test} -f FN_DATA_DIR [-u] [-e]
 Prepare misc. data for frame semantic parsing.
 
@@ -396,7 +399,7 @@ You can prepare SEMAFOR data using:
 Use `frameid.sh` to perform frame identification using SIMPLEFRAMEID.
 The helper should display:
 
-```shell
+```
 Usage: ${0##*/} [-h] -m {train,decode} -x XP_NUM [-p {semafor,open-sesame}]
 Perform frame identification.
 
@@ -435,7 +438,7 @@ and decode (**before decoding argument identification**) using:
 Use `semafor.sh` to train the SEMAFOR parser or decode the test/dev splits.
 The helper should display:
 
-```shell
+```
 Usage: ${0##*/} [-h] -m {train,decode} -x XP_NUM [-s {dev,test}] [-u]
 Train or decode with the SEMAFOR parser.
 
@@ -475,7 +478,7 @@ and decode the test splits using:
 Use `open-sesame.sh` to train the OPEN-SESMAE parser or decode the test/dev splits.
 The helper should display:
 
-```shell
+```
 Usage: ${0##*/} [-h] -m {train,decode} -x XP_NUM [-s {dev,test}] [-d] [-u]
 Train or decode with the OPEN-SESAME parser.
 
@@ -518,7 +521,7 @@ Use `score.sh` to obtain P/R/F1 scores for frame semantic parsing on
 dev/test splits with the SEMEVAL scoring script, using gold of predicted frames.
 The helper should display:
 
-```shell
+```
 Usage: ${0##*/} [-h] -x XP_NUM -p {semafor,open-sesame} -s {dev,test} -f {gold,predicted}
 Score frame semantic parsing with a modified version of the SEMEVAL scoring script.
 
@@ -555,15 +558,163 @@ to compare the performances of different frame semantic parsers in various
 experimental setups.
 
 
-## Marshalling and Unmarshalling of FrameNet XML data
+## Marshalling and Unmarshalling FrameNet XML data
 
 `pyfn` provides a set of Python models to process FrameNet XML data.
 Those can be used to help you build you own frame semantic parser.
 
+The core of the `pyfn` models is the `AnnotationSet` corresponding to an
+XML `<annotationSet>` tag. It stores various information
+regarding a given set of FrameNet annotation for a given target in a given sentence.
+The notable innovations are the `labelstore` and the `valenceunitstore`, which
+store FrameNet labels (FE/PT/GF) in their original formats, and in custom
+formats which may prove useful for frame semantic parsing.
+
+Explore the various models under the `pyfn.models` directory of the `pyfn`
+package.
+
+### Unmarshalling FrameNet XML data
+
+To convert a list of fulltext.xml files and/or lu.xml files to a generator
+over `pyfn.AnnotationSet` objects, with no overlap between train/dev/test splits, use:
+
+```python
+import pyfn.marshalling.unmarshallers.framenet as fn_unmarshaller
+
+if __name__ == '__main__':
+  splits_dirpath = '/abs/path/to/fndata-1.x-with-dev/'
+  splits = 'train'
+  with_exemplars = False
+  annosets_dict = fn_unmarshaller.get_annosets_dict(splits_dirpath,
+                                                    splits, with_exemplars)
+```
+`splits_dirpath` should point at the directory containing train/dev/test
+splits directories (see detailed structure [above](#use)).
+
+`get_annosets_dict` will return a dict mapping splits names (strings) to generators over `AnnotationSet` objects.
+It will ensure no overlap between train/dev/test splits.
+
+Calling `get_annosets_dict` on `splits='test'` will return a dictionary
+with a single `'test'` key. Calling `get_annosets_dict` on `splits='dev'`
+will return a dictionary with two keys: `'dev'` and `'test'`.
+Calling `get_annosets_dict` on `splits='train'` will return a dictionary
+with three keys: `'train'`, `'dev'` and `'test'`.
+
+To iterate over the list of AnnotationSet objects of each key, you can
+then do:
+
+```python
+for (splits, annosets) in annosets_dict.items():
+  print('Iterating over annotationsets for splits: {}'.format(splits))
+  for annoset in annosets:
+    print('annoset with #id = {}'.format(annoset._id))
+```
+
+Or simply, to iterate over the values of a specific key (such as train annosets):
+
+```python
+for annoset in annosets_dict['train']:
+    print('annoset with #id = {}'.format(annoset._id))
+```
+
+Note that for performance, annosets is not a list but a generator.
+
+
+### Unmarshalling OPEN-SESAME BIOS data
+
+To convert a `.bios` file with its corresponding `.sentences` file to
+a generator over `pyfn.AnnotationSet` objects, use:
+
+```python
+import pyfn.marshalling.unmarshallers.bios as bios_unmarshaller
+
+if __name__ == '__main__':
+  bios_filepath = '/abs/path/to/.bios'
+  sent_filepath = '/abs/path/to/.sentences'
+  annosets = bios_unmarshaller.unmarshall_annosets(bios_filepath,
+                                                   sent_filepath)
+  for annoset in annosets:
+    print('annoset with #id = {}'.format(annoset._id))
+```
+
+**Important** the `.bios` and `.sentences` files must have been generated
+with `pyfn convert ... --to bios ...` with the `--filter overlap_fes`
+parameter.
+
+### Unmarshalling SEMAFOR CONLL data
+
+To convert a `.frame.elements` file with its corresponding `.sentences`
+file to a generator over `pyfn.AnnotationSet` objects, use:
+
+```python
+import pyfn.marshalling.unmarshallers.semafor as semafor_unmarshaller
+
+if __name__ == '__main__':
+  semafor_filepath = '/abs/path/to/.frame.elements'
+  sent_filepath = '/abs/path/to/.sentences'
+  annosets = semafor_unmarshaller.unmarshall_annosets(semafor_filepath,
+                                                      sent_filepath)
+  for annoset in annosets:
+    print('annoset with #id = {}'.format(annoset._id))
+```
+
+### Unmarshalling SEMEVAL XML data
+
+To convert a SEMEVAL `.xml` file to a generator over
+`pyfn.AnnotationSet` objects, use:
+
+```python
+import pyfn.marshalling.unmarshallers.semeval as semeval_unmarshaller
+
+if __name__ == '__main__':
+  xml_filepath = '/abs/path/to/semeval/.xml'
+  annosetss = semeval_unmarshaller.unmarshall_annosets(xml_filepath)
+```
+
+By default, `unmarshall_annosets` for SEMEVAL will return a generator over embedded (nested) collections of annotationsets. To iterate over each individual annotationset, use:
+
+```python
+for annosets in annosetss:
+  for annoset in annosets:
+    print('annoset with #id = {}'.format(annoset._id))
+```
+
+To return a 'flat' list of annosets, pass in the `flatten=True` parameter:
+
+```python
+import pyfn.marshalling.unmarshallers.semeval as semeval_unmarshaller
+
+if __name__ == '__main__':
+  xml_filepath = '/abs/path/to/semeval/.xml'
+  annosets = semeval_unmarshaller.unmarshall_annosets(xml_filepath, flatten=True)
+  for annoset in annosets:
+    print('annoset with #id = {}'.format(annoset._id))
+```
+
+### Marshalling to FrameNet XML data
+
+To convert a list of `pyfn.AnnotationSet` objects to a FrameNet-style `.xml` file, use:
+
+```python
+
+```
+
+
+### Marshalling to OPEN-SESAME BIOS data
+
+To convert a list of `pyfn.AnnotationSet` objects to OPEN-SESAME-style `.bios`, use:
+
+### Marshalling to SEMAFOR CONLL data
+
+To convert a list of `pyfn.AnnotationSet` objects to SEMAFOR-style `.frame.elements`, use:
+
+### Marshalling to SEMEVAL XML data
+
+To convert a list of `pyfn.AnnotationSet` objects to SEMEVAL-style `.xml`, use:
 
 ## Citation
 
-If you use pyfn please cite:
+If you use `pyfn` please cite:
 ```tex
 @InProceedings{C18-1267,
   author = 	"Kabbach, Alexandre

From becd65bc8049cca5a1b20cac983598500a8ffb7d Mon Sep 17 00:00:00 2001
From: akb89 <akb@3azouz.net>
Date: Thu, 30 Aug 2018 16:02:18 -0400
Subject: [PATCH 11/15] Attempt to fix anchors

---
 README.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 8736718..79c705e 100644
--- a/README.md
+++ b/README.md
@@ -11,11 +11,11 @@ Welcome to `pyfn`, a Python modules to process FrameNet annotation.
 
 `pyfn` can be used to:
 
-1. [convert](#conversion) data to and from FRAMENET XML, SEMEVAL XML, SEMAFOR CoNLL, BIOS and
+1. [convert](##conversion) data to and from FRAMENET XML, SEMEVAL XML, SEMAFOR CoNLL, BIOS and
 CoNLL-X
-2. [preprocess](#preprocessing-and-frame-semantic-parsing) FrameNet data using a standardized state-of-the-art pipeline
-3. [run](#preprocessing-and-frame-semantic-parsing) the SEMAFOR and OPEN-SESAME frame semantic parsers
-4. [build](#marshalling-and-unmarshalling-framenet-xml-data) your own frame semantic parser using a standard set of python models
+2. [preprocess](##preprocessing-and-frame-semantic-parsing) FrameNet data using a standardized state-of-the-art pipeline
+3. [run](##preprocessing-and-frame-semantic-parsing) the SEMAFOR and OPEN-SESAME frame semantic parsers
+4. [build](##marshalling-and-unmarshalling-framenet-xml-data) your own frame semantic parser using a standard set of python models
 to marshall/unmarshall FrameNet XML data
 
 This repository also accompanies the (Kabbach et al., 2018) paper:

From cee032a2214ebd34e98e8bb989588d28316c6dd9 Mon Sep 17 00:00:00 2001
From: akb89 <akb@3azouz.net>
Date: Thu, 30 Aug 2018 16:03:20 -0400
Subject: [PATCH 12/15] backtracking

---
 README.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 79c705e..8736718 100644
--- a/README.md
+++ b/README.md
@@ -11,11 +11,11 @@ Welcome to `pyfn`, a Python modules to process FrameNet annotation.
 
 `pyfn` can be used to:
 
-1. [convert](##conversion) data to and from FRAMENET XML, SEMEVAL XML, SEMAFOR CoNLL, BIOS and
+1. [convert](#conversion) data to and from FRAMENET XML, SEMEVAL XML, SEMAFOR CoNLL, BIOS and
 CoNLL-X
-2. [preprocess](##preprocessing-and-frame-semantic-parsing) FrameNet data using a standardized state-of-the-art pipeline
-3. [run](##preprocessing-and-frame-semantic-parsing) the SEMAFOR and OPEN-SESAME frame semantic parsers
-4. [build](##marshalling-and-unmarshalling-framenet-xml-data) your own frame semantic parser using a standard set of python models
+2. [preprocess](#preprocessing-and-frame-semantic-parsing) FrameNet data using a standardized state-of-the-art pipeline
+3. [run](#preprocessing-and-frame-semantic-parsing) the SEMAFOR and OPEN-SESAME frame semantic parsers
+4. [build](#marshalling-and-unmarshalling-framenet-xml-data) your own frame semantic parser using a standard set of python models
 to marshall/unmarshall FrameNet XML data
 
 This repository also accompanies the (Kabbach et al., 2018) paper:

From e59fa65c81a667dba75728bb72e4b74e66ca60c0 Mon Sep 17 00:00:00 2001
From: akb89 <akb@3azouz.net>
Date: Thu, 30 Aug 2018 16:41:01 -0400
Subject: [PATCH 13/15] Updated docstrings and added support for marshalling
 doc

---
 README.md                                  | 25 +++++++++++-----------
 pyfn/marshalling/marshallers/bios.py       | 17 ++++++++++++++-
 pyfn/marshalling/marshallers/semafor.py    | 14 ++++++++++++
 pyfn/marshalling/marshallers/semeval.py    |  9 +++++++-
 pyfn/marshalling/unmarshallers/framenet.py |  4 +++-
 pyfn/marshalling/unmarshallers/semeval.py  |  4 +++-
 6 files changed, 56 insertions(+), 17 deletions(-)

diff --git a/README.md b/README.md
index 8736718..be3feb0 100644
--- a/README.md
+++ b/README.md
@@ -691,26 +691,25 @@ if __name__ == '__main__':
     print('annoset with #id = {}'.format(annoset._id))
 ```
 
-### Marshalling to FrameNet XML data
+### Marshalling to OPEN-SESAME BIOS
 
-To convert a list of `pyfn.AnnotationSet` objects to a FrameNet-style `.xml` file, use:
+To convert a dict mapping `splits` names to `pyfn.AnnotationSet` objects into OPEN-SESAME-style `.bios` files, refer to
+`pyfn.marshalling.marshallers.bios.marshall_annosets_dict`
 
-```python
-
-```
-
-
-### Marshalling to OPEN-SESAME BIOS data
+### Marshalling to SEMAFOR CONLL
 
-To convert a list of `pyfn.AnnotationSet` objects to OPEN-SESAME-style `.bios`, use:
+To convert a dict mapping `splits` names to `pyfn.AnnotationSet` objects into SEMAFOR-style `.frame.elements` files, refer to
+`pyfn.marshalling.marshallers.semafor.marshall_annosets_dict`
 
-### Marshalling to SEMAFOR CONLL data
+### Marshalling to SEMEVAL XML
 
-To convert a list of `pyfn.AnnotationSet` objects to SEMAFOR-style `.frame.elements`, use:
+To convert a list of `pyfn.AnnotationSet` objects to SEMEVAL-style `.xml`,
+refer to `pyfn.marshalling.marshallers.semeval.marshall_annosets`
 
-### Marshalling to SEMEVAL XML data
+### Marshalling to .csv hierarchy
 
-To convert a list of `pyfn.AnnotationSet` objects to SEMEVAL-style `.xml`, use:
+To convert a list of relations to a `.csv` file, refer to
+`pyfn.marshalling.marshallers.hierarchy.marshall_relations`
 
 ## Citation
 
diff --git a/pyfn/marshalling/marshallers/bios.py b/pyfn/marshalling/marshallers/bios.py
index d5262d4..208f93a 100644
--- a/pyfn/marshalling/marshallers/bios.py
+++ b/pyfn/marshalling/marshallers/bios.py
@@ -154,7 +154,22 @@ def _marshall_bios(annosets, filtering_options, sent_dict, bios_filepath,
 def marshall_annosets_dict(annosets_dict, target_dirpath, filtering_options,
                            output_sentences, excluded_frames,
                            excluded_sentences, excluded_annosets):
-    """Convert a dict of {splits:pyfn.AnnotationSet} to BIOS splits files."""
+    """Convert a dict of {splits:pyfn.AnnotationSet} to BIOS splits files.
+
+       Args
+       ----
+           annosets_dict: a splits to annosets dictionary (as generated by
+           the framenet unmarshaller).
+           target_dirpath: the absolute path to the target directory where to
+           save the output file(s)
+           filtering_options: a list of options to pass to the pyfn.utils.filter.
+           ('overlap_fes', 'disc_fes', 'disc_targets', 'no_fes', 'non_breaking_spaces')
+           output_sentences: True or False. Whether or not to also output a .sentences file
+           listing all sentences (string), one per line.
+           excluded_frames: a list of frame #id to exclude from the output
+           excluded_sentences: a list of sentence #id to exclude from the output
+           excluded_annosets: a list of annotationset #id to exclude from the output
+    """
     for splits_name, annosets in annosets_dict.items():
         bios_filepath = files_utils.get_bios_filepath(target_dirpath,
                                                       splits_name)
diff --git a/pyfn/marshalling/marshallers/semafor.py b/pyfn/marshalling/marshallers/semafor.py
index 8acc78c..0777b88 100644
--- a/pyfn/marshalling/marshallers/semafor.py
+++ b/pyfn/marshalling/marshallers/semafor.py
@@ -148,6 +148,20 @@ def marshall_annosets_dict(annosets_dict, target_dirpath, filtering_options,
     both frame and frame element labels depending on filtering options.
     The dev/test splits will be converted to a .frames file containing
     frame labels only.
+
+     Args
+     ----
+         annosets_dict: a splits to annosets dictionary (as generated by
+         the framenet unmarshaller).
+         target_dirpath: the absolute path to the target directory where to
+         save the output file(s)
+         filtering_options: a list of options to pass to the pyfn.utils.filter.
+         ('overlap_fes', 'disc_fes', 'disc_targets', 'no_fes', 'non_breaking_spaces')
+         output_sentences: True or False. Whether or not to also output a .sentences file
+         listing all sentences (string), one per line.
+         excluded_frames: a list of frame #id to exclude from the output
+         excluded_sentences: a list of sentence #id to exclude from the output
+         excluded_annosets: a list of annotationset #id to exclude from the output
     """
     for splits_name, annosets in annosets_dict.items():
         logger.info('Marshalling {} splits to semafor format'
diff --git a/pyfn/marshalling/marshallers/semeval.py b/pyfn/marshalling/marshallers/semeval.py
index 5a64699..e8204ad 100644
--- a/pyfn/marshalling/marshallers/semeval.py
+++ b/pyfn/marshalling/marshallers/semeval.py
@@ -112,7 +112,14 @@ def _marshall_annosets(annosets, output_filepath, excluded_frames,
 
 def marshall_annosets(annosets, output_filepath, excluded_frames,
                       excluded_sentences, excluded_annosets):
-    """Marshall a list of pyfn.AnnotationSet objects to SEMEVAL XML."""
+    """Marshall a list of pyfn.AnnotationSet objects to SEMEVAL XML.
+
+    annosets: a list of annosets to marshall.
+    output_filepath: the absolute path to the output .xml file
+    excluded_frames: a list of frame #id to exclude from the output
+    excluded_sentences: a list of sentence #id to exclude from the output
+    excluded_annosets: a list of annotationset #id to exclude from the output
+    """
     logger.info('Marshalling pyfn.AnnotationSet objects to SEMEVAL XML...')
     if not annosets:
         raise InvalidParameterError('Input pyfn.AnnotationSet list is empty')
diff --git a/pyfn/marshalling/unmarshallers/framenet.py b/pyfn/marshalling/unmarshallers/framenet.py
index fe0b25a..3abf952 100644
--- a/pyfn/marshalling/unmarshallers/framenet.py
+++ b/pyfn/marshalling/unmarshallers/framenet.py
@@ -311,7 +311,7 @@ def _extract_ft_annosets(ft_filepaths, fe_dict, flatten=False):
 
 
 def extract_annosets(splits_dirpath, with_fulltexts, with_exemplars,
-                     fe_dict={}, flatten=False):
+                     fe_dict=None, flatten=False):
     """Return a list of pyfn.AnnotationSet extracted from splits paths.
 
     The splits directory should contain two subdirectories name 'fulltext'
@@ -322,6 +322,8 @@ def extract_annosets(splits_dirpath, with_fulltexts, with_exemplars,
     """
     logger.info('Extracting pyfn.AnnotationSet items from {}'
                 .format(splits_dirpath))
+    if fe_dict is None:
+        fe_dict = {}
     ft_annosets = []
     ex_annosets = []
     if with_fulltexts:
diff --git a/pyfn/marshalling/unmarshallers/semeval.py b/pyfn/marshalling/unmarshallers/semeval.py
index d762cc6..c77602b 100644
--- a/pyfn/marshalling/unmarshallers/semeval.py
+++ b/pyfn/marshalling/unmarshallers/semeval.py
@@ -10,7 +10,7 @@
 logger = logging.getLogger(__name__)
 
 
-def unmarshall_annosets(xml_filepath, fe_dict={}, flatten=False):
+def unmarshall_annosets(xml_filepath, fe_dict=None, flatten=False):
     """Unmarshall a SemEval 2007 FrameNet XML file from file path.
 
     Return a generator of AnnotationSet instances extracted from the
@@ -24,6 +24,8 @@ def unmarshall_annosets(xml_filepath, fe_dict={}, flatten=False):
     """
     logger.info('Unmarshalling SemEval FrameNet XML file: {}'
                 .format(xml_filepath))
+    if fe_dict is None:
+        fe_dict = {}
     # pylint: disable=R1702
     for documents_tag in etree.parse(xml_filepath).getroot().findall(
             'documents'):

From ec90c45e86b1161a4e77efec5a6b91eec5b3494e Mon Sep 17 00:00:00 2001
From: akb89 <akb@3azouz.net>
Date: Thu, 30 Aug 2018 16:42:46 -0400
Subject: [PATCH 14/15] Updated README

---
 README.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index be3feb0..7793c27 100644
--- a/README.md
+++ b/README.md
@@ -469,7 +469,7 @@ pyfn convert \
 
 and decode the test splits using:
 
-```
+```bash
 ./semafor.sh -m decode -x 001 -s test
 ```
 
@@ -511,7 +511,7 @@ pyfn convert \
 
 and decode the test splits using:
 
-```
+```bash
 ./open-sesame.sh -m decode -x 002 -s test
 ```
 
@@ -539,13 +539,13 @@ located under `lib/semeval/`.
 
 To obtain scores for SEMAFOR using gold frames on test splits, use:
 
-```
+```bash
 ./score.sh -x XYZ -p semafor -s test -f gold
 ```
 
 To obtain scores for SEMAFOR using predicted frames on test splits, use:
 
-```
+```bash
 ./score.sh -x XYZ -p semafor -s test -f predicted
 ```
 

From c20c8ececcf9da5388f6ff210c7f9d5bb8955fea Mon Sep 17 00:00:00 2001
From: akb89 <akb@3azouz.net>
Date: Thu, 30 Aug 2018 16:54:20 -0400
Subject: [PATCH 15/15] Fixed docstring indentation

---
 pyfn/marshalling/marshallers/bios.py | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/pyfn/marshalling/marshallers/bios.py b/pyfn/marshalling/marshallers/bios.py
index 208f93a..f83c70b 100644
--- a/pyfn/marshalling/marshallers/bios.py
+++ b/pyfn/marshalling/marshallers/bios.py
@@ -156,19 +156,19 @@ def marshall_annosets_dict(annosets_dict, target_dirpath, filtering_options,
                            excluded_sentences, excluded_annosets):
     """Convert a dict of {splits:pyfn.AnnotationSet} to BIOS splits files.
 
-       Args
-       ----
-           annosets_dict: a splits to annosets dictionary (as generated by
-           the framenet unmarshaller).
-           target_dirpath: the absolute path to the target directory where to
-           save the output file(s)
-           filtering_options: a list of options to pass to the pyfn.utils.filter.
-           ('overlap_fes', 'disc_fes', 'disc_targets', 'no_fes', 'non_breaking_spaces')
-           output_sentences: True or False. Whether or not to also output a .sentences file
-           listing all sentences (string), one per line.
-           excluded_frames: a list of frame #id to exclude from the output
-           excluded_sentences: a list of sentence #id to exclude from the output
-           excluded_annosets: a list of annotationset #id to exclude from the output
+    Args
+    ----
+        annosets_dict: a splits to annosets dictionary (as generated by
+        the framenet unmarshaller).
+        target_dirpath: the absolute path to the target directory where to
+        save the output file(s)
+        filtering_options: a list of options to pass to pyfn.utils.filter:
+        ('overlap_fes', 'disc_fes', 'disc_targets', 'no_fes', 'non_breaking_spaces')
+        output_sentences: True or False. Whether to also output a .sentences file
+        listing all sentences (strings), one per line.
+        excluded_frames: a list of frame #id to exclude from the output
+        excluded_sentences: a list of sentence #id to exclude from the output
+        excluded_annosets: a list of annotationset #id to exclude from the output
     """
     for splits_name, annosets in annosets_dict.items():
         bios_filepath = files_utils.get_bios_filepath(target_dirpath,