From 70d67162ec689aa6ac735b63c6d94f1bb9e2b887 Mon Sep 17 00:00:00 2001 From: Florent Ravenel Date: Mon, 4 Dec 2023 22:20:59 +0100 Subject: [PATCH 1/2] feat(RegEx): Add Remove emojis from text --- RegEx/RegEx_Remove_emojis_from_text.ipynb | 252 ++++++++++++++++++++++ 1 file changed, 252 insertions(+) create mode 100644 RegEx/RegEx_Remove_emojis_from_text.ipynb diff --git a/RegEx/RegEx_Remove_emojis_from_text.ipynb b/RegEx/RegEx_Remove_emojis_from_text.ipynb new file mode 100644 index 0000000000..2bed7cc18d --- /dev/null +++ b/RegEx/RegEx_Remove_emojis_from_text.ipynb @@ -0,0 +1,252 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "a5079ec3-1272-4ba9-8087-9ca2775b875c", + "metadata": { + "papermill": {}, + "tags": [] + }, + "source": [ + "\"Naas\"" + ] + }, + { + "cell_type": "markdown", + "id": "c2fb4534-d4bc-4a8d-aa7e-38fd8cbbb8e8", + "metadata": { + "papermill": {}, + "tags": [] + }, + "source": [ + "# RegEx - Remove emojis from text" + ] + }, + { + "cell_type": "markdown", + "id": "574b8aa0-d7f4-4968-bf81-cb9cb16fc2a9", + "metadata": { + "papermill": {}, + "tags": [] + }, + "source": [ + "**Tags:** #regex #text #emoji #remove #string #python" + ] + }, + { + "cell_type": "markdown", + "id": "88c2e429-f0e0-44e1-a4ff-9ce3ea2ba00a", + "metadata": { + "papermill": {}, + "tags": [] + }, + "source": [ + "**Author:** [Florent Ravenel](https://www.linkedin.com/in/florent-ravenel/)" + ] + }, + { + "cell_type": "markdown", + "id": "fba0b1cc-4994-44a2-9907-b29f6d97ebe7", + "metadata": { + "papermill": {}, + "tags": [] + }, + "source": [ + "**Last update:** 2023-12-04 (Created: 2023-12-04)" + ] + }, + { + "cell_type": "markdown", + "id": "94759cd1-345e-404e-a314-91c437f218bf", + "metadata": { + "papermill": {}, + "tags": [] + }, + "source": [ + "**Description:** This notebook will show how to remove emojis from a text using RegEx and Python. It is usefull for organizations that need to clean text from emojis." + ] + }, + { + "cell_type": "markdown", + "id": "696eb3c2-cfbf-4774-9212-8b6c561366c1", + "metadata": { + "papermill": {}, + "tags": [] + }, + "source": [ + "**References:**\n- [Regular Expressions - Python Documentation](https://docs.python.org/3/library/re.html)\n- [Remove Emojis from Text - Stack Overflow](https://stackoverflow.com/questions/33404752/removing-emojis-from-a-string-in-python)" + ] + }, + { + "cell_type": "markdown", + "id": "6dec562d-c2e4-428e-99e3-4e4b296531dd", + "metadata": { + "papermill": {}, + "tags": [] + }, + "source": [ + "## Input" + ] + }, + { + "cell_type": "markdown", + "id": "f3212bcb-8f14-48e8-851c-5641f82f2472", + "metadata": { + "papermill": {}, + "tags": [] + }, + "source": [ + "### Import libraries" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ae43c8fe-0b1b-4d08-9288-f57bd10e33e8", + "metadata": { + "papermill": {}, + "tags": [] + }, + "source": "import re", + "outputs": [] + }, + { + "cell_type": "markdown", + "id": "331c8e10-7d6b-4ca8-8fa4-9fb9703d8d66", + "metadata": { + "papermill": {}, + "tags": [] + }, + "source": [ + "### Setup variables\n- `text`: Text containing emojis" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f34796c6-2a63-420e-b9aa-908dcd5e3d06", + "metadata": { + "papermill": {}, + "tags": [] + }, + "source": "text = \"This is a text with emojis \ud83d\ude0a\ud83d\ude0a\ud83d\ude0a\"", + "outputs": [] + }, + { + "cell_type": "markdown", + "id": "f761910f-065c-4283-9862-b7a3d4e0bda3", + "metadata": { + "papermill": {}, + "tags": [] + }, + "source": [ + "## Model" + ] + }, + { + "cell_type": "markdown", + "id": "dcfa31dd-e669-4560-ab25-39a9d44acdf6", + "metadata": { + "papermill": {}, + "tags": [] + }, + "source": [ + "### Remove emojis from text" + ] + }, + { + "cell_type": "markdown", + "id": "62834ce1-dee1-4dee-a047-2cca8f1e6ba5", + "metadata": { + "papermill": {}, + "tags": [] + }, + "source": [ + "This function will remove all emojis from a text using RegEx." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a9f88fa6-8e9c-4a2d-a7ea-630c4b03fe18", + "metadata": { + "papermill": {}, + "tags": [] + }, + "source": "def remove_emojis(text):\n return re.sub(r\"[^\\w\\s]\", \"\", text)", + "outputs": [] + }, + { + "cell_type": "markdown", + "id": "97a7aac6-6f66-451c-800e-450ca28f6bba", + "metadata": { + "papermill": {}, + "tags": [] + }, + "source": [ + "## Output" + ] + }, + { + "cell_type": "markdown", + "id": "61492b45-5d3b-4da5-9adf-016273345594", + "metadata": { + "papermill": {}, + "tags": [] + }, + "source": [ + "### Display result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "58b4118c-3a12-4bdd-8425-bb2d02039619", + "metadata": { + "papermill": {}, + "tags": [] + }, + "source": "print(remove_emojis(text))", + "outputs": [] + }, + { + "cell_type": "markdown", + "id": "f527d917-519a-47ab-93b8-1264ca217b80", + "metadata": { + "papermill": {}, + "tags": [] + }, + "source": [ + " " + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.6" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": {}, + "version_major": 2, + "version_minor": 0 + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file From 5537c6e1bbf43d45eb99142d014df6bbfb032137 Mon Sep 17 00:00:00 2001 From: Florent Ravenel Date: Tue, 5 Dec 2023 11:46:53 +0100 Subject: [PATCH 2/2] feat: update template --- RegEx/RegEx_Remove_emojis_from_text.ipynb | 61 ++++++++++++++++++----- template.ipynb | 11 +--- 2 files changed, 50 insertions(+), 22 deletions(-) diff --git a/RegEx/RegEx_Remove_emojis_from_text.ipynb b/RegEx/RegEx_Remove_emojis_from_text.ipynb index 2bed7cc18d..a7d59b8e81 100644 --- a/RegEx/RegEx_Remove_emojis_from_text.ipynb +++ b/RegEx/RegEx_Remove_emojis_from_text.ipynb @@ -52,7 +52,7 @@ "tags": [] }, "source": [ - "**Last update:** 2023-12-04 (Created: 2023-12-04)" + "**Last update:** 2023-12-05 (Created: 2023-12-05)" ] }, { @@ -63,7 +63,7 @@ "tags": [] }, "source": [ - "**Description:** This notebook will show how to remove emojis from a text using RegEx and Python. It is usefull for organizations that need to clean text from emojis." + "**Description:** This notebook will show how to remove emojis from a text using RegEx and Python." ] }, { @@ -74,7 +74,9 @@ "tags": [] }, "source": [ - "**References:**\n- [Regular Expressions - Python Documentation](https://docs.python.org/3/library/re.html)\n- [Remove Emojis from Text - Stack Overflow](https://stackoverflow.com/questions/33404752/removing-emojis-from-a-string-in-python)" + "**References:**\n", + "- [Regular Expressions - Python Documentation](https://docs.python.org/3/library/re.html)\n", + "- [Remove Emojis from Text - Stack Overflow](https://stackoverflow.com/questions/33404752/removing-emojis-from-a-string-in-python)" ] }, { @@ -107,8 +109,10 @@ "papermill": {}, "tags": [] }, - "source": "import re", - "outputs": [] + "outputs": [], + "source": [ + "import re" + ] }, { "cell_type": "markdown", @@ -118,7 +122,8 @@ "tags": [] }, "source": [ - "### Setup variables\n- `text`: Text containing emojis" + "### Setup variables\n", + "- `text`: Text containing emojis" ] }, { @@ -129,8 +134,10 @@ "papermill": {}, "tags": [] }, - "source": "text = \"This is a text with emojis \ud83d\ude0a\ud83d\ude0a\ud83d\ude0a\"", - "outputs": [] + "outputs": [], + "source": [ + "text = \"This is a text with emojis 😊😊😊\"" + ] }, { "cell_type": "markdown", @@ -173,8 +180,34 @@ "papermill": {}, "tags": [] }, - "source": "def remove_emojis(text):\n return re.sub(r\"[^\\w\\s]\", \"\", text)", - "outputs": [] + "outputs": [], + "source": [ + "def remove_emojis(text):\n", + " # Emoji pattern\n", + " emoji_pattern = re.compile(\"[\"\n", + " u\"\\U0001F600-\\U0001F64F\" # emoticons\n", + " u\"\\U0001F300-\\U0001F5FF\" # symbols & pictographs\n", + " u\"\\U0001F680-\\U0001F6FF\" # transport & map symbols\n", + " u\"\\U0001F1E0-\\U0001F1FF\" # flags (iOS)\n", + " u\"\\U00002500-\\U00002BEF\" # chinese char\n", + " u\"\\U00002702-\\U000027B0\"\n", + " u\"\\U00002702-\\U000027B0\"\n", + " u\"\\U000024C2-\\U0001F251\"\n", + " u\"\\U0001f926-\\U0001f937\"\n", + " u\"\\U00010000-\\U0010ffff\"\n", + " u\"\\u2640-\\u2642\"\n", + " u\"\\u2600-\\u2B55\"\n", + " u\"\\u200d\"\n", + " u\"\\u23cf\"\n", + " u\"\\u23e9\"\n", + " u\"\\u231a\"\n", + " u\"\\ufe0f\" # dingbats\n", + " u\"\\u3030\"\n", + " \"]+\", flags=re.UNICODE)\n", + " # Remove emojis from the text\n", + " text = emoji_pattern.sub(r'', text)\n", + " return text.strip()" + ] }, { "cell_type": "markdown", @@ -206,8 +239,10 @@ "papermill": {}, "tags": [] }, - "source": "print(remove_emojis(text))", - "outputs": [] + "outputs": [], + "source": [ + "print(remove_emojis(text))" + ] }, { "cell_type": "markdown", @@ -249,4 +284,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} \ No newline at end of file +} diff --git a/template.ipynb b/template.ipynb index 4a816365f5..63580c3239 100644 --- a/template.ipynb +++ b/template.ipynb @@ -2,20 +2,13 @@ "cells": [ { "cell_type": "markdown", - "id": "latin-packing", + "id": "88c104cc-bf08-4242-821b-b3a40908152a", "metadata": { - "execution": { - "iopub.execute_input": "2021-02-23T14:22:16.610471Z", - "iopub.status.busy": "2021-02-23T14:22:16.610129Z", - "iopub.status.idle": "2021-02-23T14:22:16.627784Z", - "shell.execute_reply": "2021-02-23T14:22:16.626866Z", - "shell.execute_reply.started": "2021-02-23T14:22:16.610384Z" - }, "papermill": {}, "tags": [] }, "source": [ - "\"Naas\"" + "\"Naas.png\"" ] }, {