From 8ae69186491d9ba23cf590696654bc3ab1105ef4 Mon Sep 17 00:00:00 2001 From: Emerson Rocha Date: Thu, 11 Nov 2021 16:52:59 -0300 Subject: [PATCH] fititnt/hxltm-action#5: data-normalization #1 --- .gitignore | 5 +++- CHANGELOG.md | 10 +++++++ README.md | 8 +++--- scripts/data-original-download.sh | 2 +- scripts/data-original-prepare.sh | 43 +++++++++++++++++++++++++++++-- 5 files changed, 61 insertions(+), 7 deletions(-) create mode 100644 CHANGELOG.md diff --git a/.gitignore b/.gitignore index 71dbdef..cd87353 100644 --- a/.gitignore +++ b/.gitignore @@ -8,4 +8,7 @@ data/original/terminologies.zip data/original/tico19-testset.zip !.gitignore !README.md -tmp/ \ No newline at end of file +tmp/ + +# temp +data/original/terminology/facebook/*.csv \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..5693b7f --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,10 @@ +# Changelog + +## [Unreleased] +### Added +- TODO + +## [0.9.0] - 2021-11-11 +### Added +- **Fiat lux!** +- Draft of scripts to download data from TICO-19 original sources diff --git a/README.md b/README.md index 6f7c1ce..0ba79bf 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,10 @@ # tico-19-hxltm -**[draft] Public domain datasets from Translation Initiative for COVID-19 -on the format HXLTM (Multilingual Terminology in Humanitarian Language Exchange)** +**[draft] Public domain datasets from +[Translation Initiative for COVID-19](https://tico-19.github.io) on the format +HXLTM (Multilingual Terminology in Humanitarian Language Exchange).** -> TODO: move to @EticaAI organization +> TODO: move to [@EticaAI](https://github.com/EticaAI) organization and + publish on a subdomain. 
## License diff --git a/scripts/data-original-download.sh b/scripts/data-original-download.sh index 0a34bf1..a98c039 100755 --- a/scripts/data-original-download.sh +++ b/scripts/data-original-download.sh @@ -10,7 +10,7 @@ # # OPTIONS: --- # -# REQUIREMENTS: --- +# REQUIREMENTS: - git # BUGS: --- # NOTES: --- # AUTHORS: Emerson Rocha diff --git a/scripts/data-original-prepare.sh b/scripts/data-original-prepare.sh index d6ba97c..3bf181e 100755 --- a/scripts/data-original-prepare.sh +++ b/scripts/data-original-prepare.sh @@ -9,7 +9,8 @@ # # OPTIONS: --- # -# REQUIREMENTS: --- +# REQUIREMENTS: - rename +# - rsync # BUGS: --- # NOTES: --- # AUTHORS: Emerson Rocha @@ -22,6 +23,7 @@ # ============================================================================== set -e +PWD_NOW=$(pwd) TMP_DIR="tmp" DATA_DIR="data" DATA_ORIGINAL_DIR="data/original" @@ -29,7 +31,44 @@ DATA_ORIGINAL_GIT_DIR="tmp/original-git" set -x rsync --archive --verbose "${DATA_ORIGINAL_GIT_DIR}/data/" "$DATA_ORIGINAL_DIR/" -set +x +# set +x + + +# cd "$DATA_ORIGINAL_DIR/terminologies" +# pwd + +# Copy +find "$DATA_ORIGINAL_DIR/terminologies/" -name 'f_*' -type f -exec cp "{}" "$DATA_ORIGINAL_DIR/terminology/facebook" \; + +# Rename +# find "$DATA_ORIGINAL_DIR/terminology/facebook/" -name 'f_*' -type f -exec ls "{}" \; + +# rename 's/f_//' "$DATA_ORIGINAL_DIR/terminology/facebook/*.csv" + +# find "$DATA_ORIGINAL_DIR/terminology/facebook/" -name 'f_*' -type f -exec rename 's/f_//_' "{}" \; + + +# echo 'oi' +# find f_* -type f | sed -n "s/f_//" | xargs print +# echo 'bye' +# echo 'oi2' +# find f_* -type f -exec sed -n "s/f_//" {} \; +# echo 'bye2' +# echo 'oi2' +# find ./ -type f -exec sed -i -e 's/f_//g' {} \; +# echo 'bye2' +# find f_* -type f -print0 +# # ecfind f_* -type f -print0 | xargs --null -I{} mv {} {}_renamed +# echo 'bye3' + + + +# find . -type f | +# sed -n "s/\(.*\)factory\.py$/& \1service\.py/p" | +# xargs -p -n 2 mv + +# for + # if [ ! 
-d "${DATA_ORIGINAL_DIR}/terminology" ]; then # mkdir "${DATA_ORIGINAL_DIR}/terminology"