From 0e5a6f0b15e3db2635f106f76694f9d3f46435c8 Mon Sep 17 00:00:00 2001
From: Lea Gihlein <85543649+lea-33@users.noreply.github.com>
Date: Wed, 29 Jan 2025 13:09:20 +0100
Subject: [PATCH] Add Export_to_DALIA.ipynb

---
 scripts/Export_to_DALIA.ipynb | 2252 +++++++++++++++++++++++++++++++++
 1 file changed, 2252 insertions(+)
 create mode 100644 scripts/Export_to_DALIA.ipynb
diff --git a/scripts/Export_to_DALIA.ipynb b/scripts/Export_to_DALIA.ipynb
new file mode 100644
index 00000000..f8568ac0
--- /dev/null
+++ b/scripts/Export_to_DALIA.ipynb
@@ -0,0 +1,2252 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "7a19e5fa-6f3a-4a8a-9244-6bf9fdebad76",
+   "metadata": {},
+   "source": [
+    "### Test conversion of YAML to DALIA format"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2e055672-a937-4e46-926e-fdf6c527d628",
+   "metadata": {},
+   "source": [
+    "#### Load the YAML as a pandas DataFrame"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "7396751e-9b56-4bf6-bc35-e6e38f6c108c",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>authors</th>\n",
+       "      <th>name</th>\n",
+       "      <th>tags</th>\n",
+       "      <th>type</th>\n",
+       "      <th>url</th>\n",
+       "      <th>license</th>\n",
+       "      <th>event_date</th>\n",
+       "      <th>event_location</th>\n",
+       "      <th>description</th>\n",
+       "      <th>num_downloads</th>\n",
+       "      <th>publication_date</th>\n",
+       "      <th>fingerprint</th>\n",
+       "      <th>author</th>\n",
+       "      <th>submission_date</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>[Elisabeth Kugler]</td>\n",
+       "      <td>Sharing Your Poster on Figshare: A Community G...</td>\n",
+       "      <td>[Sharing, Research Data Management]</td>\n",
+       "      <td>[Blog]</td>\n",
+       "      <td>https://focalplane.biologists.com/2023/07/26/s...</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>[Marcelo Zoccoler]</td>\n",
+       "      <td>Running Deep-Learning Scripts in the BiA-PoL O...</td>\n",
+       "      <td>[Python, Artificial Intelligence, Bioimage Ana...</td>\n",
+       "      <td>[Blog]</td>\n",
+       "      <td>https://biapol.github.io/blog/marcelo_zoccoler...</td>\n",
+       "      <td>CC-BY-4.0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>[Robert Haase]</td>\n",
+       "      <td>Browsing the Open Microscopy Image Data Resour...</td>\n",
+       "      <td>[OMERO, Python]</td>\n",
+       "      <td>[Blog]</td>\n",
+       "      <td>https://biapol.github.io/blog/robert_haase/bro...</td>\n",
+       "      <td>CC-BY-4.0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>[Mara Lampert]</td>\n",
+       "      <td>Getting started with Mambaforge and Python</td>\n",
+       "      <td>[Python, Conda, Mamba]</td>\n",
+       "      <td>[Blog]</td>\n",
+       "      <td>https://biapol.github.io/blog/mara_lampert/get...</td>\n",
+       "      <td>CC-BY-4.0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>[Jennifer Waters]</td>\n",
+       "      <td>Promoting Data Management at the Nikon Imaging...</td>\n",
+       "      <td>[Research Data Management]</td>\n",
+       "      <td>[Blog]</td>\n",
+       "      <td>https://datamanagement.hms.harvard.edu/news/pr...</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "              authors                                               name  \\\n",
+       "0  [Elisabeth Kugler]  Sharing Your Poster on Figshare: A Community G...   \n",
+       "1  [Marcelo Zoccoler]  Running Deep-Learning Scripts in the BiA-PoL O...   \n",
+       "2      [Robert Haase]  Browsing the Open Microscopy Image Data Resour...   \n",
+       "3      [Mara Lampert]         Getting started with Mambaforge and Python   \n",
+       "4   [Jennifer Waters]  Promoting Data Management at the Nikon Imaging...   \n",
+       "\n",
+       "                                                tags    type  \\\n",
+       "0                [Sharing, Research Data Management]  [Blog]   \n",
+       "1  [Python, Artificial Intelligence, Bioimage Ana...  [Blog]   \n",
+       "2                                    [OMERO, Python]  [Blog]   \n",
+       "3                             [Python, Conda, Mamba]  [Blog]   \n",
+       "4                         [Research Data Management]  [Blog]   \n",
+       "\n",
+       "                                                 url    license event_date  \\\n",
+       "0  https://focalplane.biologists.com/2023/07/26/s...        NaN        NaN   \n",
+       "1  https://biapol.github.io/blog/marcelo_zoccoler...  CC-BY-4.0        NaN   \n",
+       "2  https://biapol.github.io/blog/robert_haase/bro...  CC-BY-4.0        NaN   \n",
+       "3  https://biapol.github.io/blog/mara_lampert/get...  CC-BY-4.0        NaN   \n",
+       "4  https://datamanagement.hms.harvard.edu/news/pr...        NaN        NaN   \n",
+       "\n",
+       "  event_location description  num_downloads publication_date fingerprint  \\\n",
+       "0            NaN         NaN            NaN              NaN         NaN   \n",
+       "1            NaN         NaN            NaN              NaN         NaN   \n",
+       "2            NaN         NaN            NaN              NaN         NaN   \n",
+       "3            NaN         NaN            NaN              NaN         NaN   \n",
+       "4            NaN         NaN            NaN              NaN         NaN   \n",
+       "\n",
+       "  author submission_date  \n",
+       "0    NaN             NaN  \n",
+       "1    NaN             NaN  \n",
+       "2    NaN             NaN  \n",
+       "3    NaN             NaN  \n",
+       "4    NaN             NaN  "
+      ]
+     },
+     "execution_count": 1,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "\n",
+    "from generate_link_lists import load_dataframe\n",
+    "\n",
+    "# Directory handed to load_dataframe below\n",
+    "source = \"../resources/\"\n",
+    "# CSV export target; not used in this cell\n",
+    "destination = '../docs/export/DALIA_training_materials.csv'\n",
+    "\n",
+    "df = load_dataframe(source)\n",
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c65fd523-cffc-4f20-9ed4-1012d4a33b1b",
+   "metadata": {},
+   "source": [
+    "#### 1. For entries that have an author column, write its value to the authors column"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "f2bf12bd-56dc-49b4-902d-e68050d715ce",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>authors</th>\n",
+       "      <th>name</th>\n",
+       "      <th>tags</th>\n",
+       "      <th>type</th>\n",
+       "      <th>url</th>\n",
+       "      <th>license</th>\n",
+       "      <th>event_date</th>\n",
+       "      <th>event_location</th>\n",
+       "      <th>description</th>\n",
+       "      <th>num_downloads</th>\n",
+       "      <th>publication_date</th>\n",
+       "      <th>fingerprint</th>\n",
+       "      <th>author</th>\n",
+       "      <th>submission_date</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>394</th>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Virtual-I2K-2024-multiview-stitcher</td>\n",
+       "      <td>[Big Data, Bioimageanalysis]</td>\n",
+       "      <td>[Github Repository, Tutorial]</td>\n",
+       "      <td>[https://github.com/m-albert/Virtual-I2K-2024-...</td>\n",
+       "      <td>BSD-3-CLAUSE</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Repository accompanying the multiview-stitcher...</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>2024-10-30T07:38:11+00:00</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Marvin Albert</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>397</th>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Prompt-Engineering-LLMs-Course</td>\n",
+       "      <td>[Llms, Prompt Engineering, Code Generation]</td>\n",
+       "      <td>[Github Repository, Tutorial]</td>\n",
+       "      <td>https://github.com/HelmholtzAI-Consultants-Mun...</td>\n",
+       "      <td>MIT</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td></td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>2024-09-11T07:45:30+00:00</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Isra Mekki</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "    authors                                 name  \\\n",
+       "394     NaN  Virtual-I2K-2024-multiview-stitcher   \n",
+       "397     NaN       Prompt-Engineering-LLMs-Course   \n",
+       "\n",
+       "                                            tags  \\\n",
+       "394                 [Big Data, Bioimageanalysis]   \n",
+       "397  [Llms, Prompt Engineering, Code Generation]   \n",
+       "\n",
+       "                              type  \\\n",
+       "394  [Github Repository, Tutorial]   \n",
+       "397  [Github Repository, Tutorial]   \n",
+       "\n",
+       "                                                   url       license  \\\n",
+       "394  [https://github.com/m-albert/Virtual-I2K-2024-...  BSD-3-CLAUSE   \n",
+       "397  https://github.com/HelmholtzAI-Consultants-Mun...           MIT   \n",
+       "\n",
+       "    event_date event_location  \\\n",
+       "394        NaN            NaN   \n",
+       "397        NaN            NaN   \n",
+       "\n",
+       "                                           description  num_downloads  \\\n",
+       "394  Repository accompanying the multiview-stitcher...            NaN   \n",
+       "397                                                               NaN   \n",
+       "\n",
+       "              publication_date fingerprint         author submission_date  \n",
+       "394  2024-10-30T07:38:11+00:00         NaN  Marvin Albert             NaN  \n",
+       "397  2024-09-11T07:45:30+00:00         NaN     Isra Mekki             NaN  "
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Show the rows where the 'author' column is not missing\n",
+    "df.loc[df['author'].notna()]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "1371bc6c-23d9-46db-857c-41dd73e861c2",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>authors</th>\n",
+       "      <th>name</th>\n",
+       "      <th>tags</th>\n",
+       "      <th>type</th>\n",
+       "      <th>url</th>\n",
+       "      <th>license</th>\n",
+       "      <th>event_date</th>\n",
+       "      <th>event_location</th>\n",
+       "      <th>description</th>\n",
+       "      <th>num_downloads</th>\n",
+       "      <th>publication_date</th>\n",
+       "      <th>fingerprint</th>\n",
+       "      <th>author</th>\n",
+       "      <th>submission_date</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>394</th>\n",
+       "      <td>Marvin Albert</td>\n",
+       "      <td>Virtual-I2K-2024-multiview-stitcher</td>\n",
+       "      <td>[Big Data, Bioimageanalysis]</td>\n",
+       "      <td>[Github Repository, Tutorial]</td>\n",
+       "      <td>[https://github.com/m-albert/Virtual-I2K-2024-...</td>\n",
+       "      <td>BSD-3-CLAUSE</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Repository accompanying the multiview-stitcher...</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>2024-10-30T07:38:11+00:00</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Marvin Albert</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>397</th>\n",
+       "      <td>Isra Mekki</td>\n",
+       "      <td>Prompt-Engineering-LLMs-Course</td>\n",
+       "      <td>[Llms, Prompt Engineering, Code Generation]</td>\n",
+       "      <td>[Github Repository, Tutorial]</td>\n",
+       "      <td>https://github.com/HelmholtzAI-Consultants-Mun...</td>\n",
+       "      <td>MIT</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td></td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>2024-09-11T07:45:30+00:00</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Isra Mekki</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "           authors                                 name  \\\n",
+       "394  Marvin Albert  Virtual-I2K-2024-multiview-stitcher   \n",
+       "397     Isra Mekki       Prompt-Engineering-LLMs-Course   \n",
+       "\n",
+       "                                            tags  \\\n",
+       "394                 [Big Data, Bioimageanalysis]   \n",
+       "397  [Llms, Prompt Engineering, Code Generation]   \n",
+       "\n",
+       "                              type  \\\n",
+       "394  [Github Repository, Tutorial]   \n",
+       "397  [Github Repository, Tutorial]   \n",
+       "\n",
+       "                                                   url       license  \\\n",
+       "394  [https://github.com/m-albert/Virtual-I2K-2024-...  BSD-3-CLAUSE   \n",
+       "397  https://github.com/HelmholtzAI-Consultants-Mun...           MIT   \n",
+       "\n",
+       "    event_date event_location  \\\n",
+       "394        NaN            NaN   \n",
+       "397        NaN            NaN   \n",
+       "\n",
+       "                                           description  num_downloads  \\\n",
+       "394  Repository accompanying the multiview-stitcher...            NaN   \n",
+       "397                                                               NaN   \n",
+       "\n",
+       "              publication_date fingerprint         author submission_date  \n",
+       "394  2024-10-30T07:38:11+00:00         NaN  Marvin Albert             NaN  \n",
+       "397  2024-09-11T07:45:30+00:00         NaN     Isra Mekki             NaN  "
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Copy 'author' into 'authors' for every row where 'author' is set (one masked assignment, no row loop)\n",
+    "# NOTE(review): this stores a plain string while other rows appear to hold lists in 'authors' - confirm\n",
+    "has_author = df['author'].notna()\n",
+    "df.loc[has_author, 'authors'] = df.loc[has_author, 'author']\n",
+    "df[has_author]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "51bdf851-201b-4c7d-8f9e-49ba5fb00eac",
+   "metadata": {},
+   "source": [
+    "#### 2. Exclude entries without mandatory attributes (License, Authors, Title, Link)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "11c39326-61e1-422d-99df-240f4b9b5c86",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>authors</th>\n",
+       "      <th>name</th>\n",
+       "      <th>tags</th>\n",
+       "      <th>type</th>\n",
+       "      <th>url</th>\n",
+       "      <th>license</th>\n",
+       "      <th>event_date</th>\n",
+       "      <th>event_location</th>\n",
+       "      <th>description</th>\n",
+       "      <th>num_downloads</th>\n",
+       "      <th>publication_date</th>\n",
+       "      <th>fingerprint</th>\n",
+       "      <th>author</th>\n",
+       "      <th>submission_date</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>[Marcelo Zoccoler]</td>\n",
+       "      <td>Running Deep-Learning Scripts in the BiA-PoL O...</td>\n",
+       "      <td>[Python, Artificial Intelligence, Bioimage Ana...</td>\n",
+       "      <td>[Blog]</td>\n",
+       "      <td>https://biapol.github.io/blog/marcelo_zoccoler...</td>\n",
+       "      <td>CC-BY-4.0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>[Robert Haase]</td>\n",
+       "      <td>Browsing the Open Microscopy Image Data Resour...</td>\n",
+       "      <td>[OMERO, Python]</td>\n",
+       "      <td>[Blog]</td>\n",
+       "      <td>https://biapol.github.io/blog/robert_haase/bro...</td>\n",
+       "      <td>CC-BY-4.0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>[Mara Lampert]</td>\n",
+       "      <td>Getting started with Mambaforge and Python</td>\n",
+       "      <td>[Python, Conda, Mamba]</td>\n",
+       "      <td>[Blog]</td>\n",
+       "      <td>https://biapol.github.io/blog/mara_lampert/get...</td>\n",
+       "      <td>CC-BY-4.0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9</th>\n",
+       "      <td>[Robert Haase]</td>\n",
+       "      <td>Managing Scientific Python environments using ...</td>\n",
+       "      <td>[Python, Conda, Mamba]</td>\n",
+       "      <td>[Blog]</td>\n",
+       "      <td>https://focalplane.biologists.com/2022/12/08/m...</td>\n",
+       "      <td>CC-BY-4.0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>29</th>\n",
+       "      <td>[Robert Haase et al.]</td>\n",
+       "      <td>BioImage Analysis Notebooks</td>\n",
+       "      <td>[Python, Bioimage Analysis]</td>\n",
+       "      <td>[Book, Notebook]</td>\n",
+       "      <td>https://haesleinhuepf.github.io/BioImageAnalys...</td>\n",
+       "      <td>[CC-BY-4.0, BSD-3-CLAUSE]</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                  authors                                               name  \\\n",
+       "1      [Marcelo Zoccoler]  Running Deep-Learning Scripts in the BiA-PoL O...   \n",
+       "2          [Robert Haase]  Browsing the Open Microscopy Image Data Resour...   \n",
+       "3          [Mara Lampert]         Getting started with Mambaforge and Python   \n",
+       "9          [Robert Haase]  Managing Scientific Python environments using ...   \n",
+       "29  [Robert Haase et al.]                        BioImage Analysis Notebooks   \n",
+       "\n",
+       "                                                 tags              type  \\\n",
+       "1   [Python, Artificial Intelligence, Bioimage Ana...            [Blog]   \n",
+       "2                                     [OMERO, Python]            [Blog]   \n",
+       "3                              [Python, Conda, Mamba]            [Blog]   \n",
+       "9                              [Python, Conda, Mamba]            [Blog]   \n",
+       "29                        [Python, Bioimage Analysis]  [Book, Notebook]   \n",
+       "\n",
+       "                                                  url  \\\n",
+       "1   https://biapol.github.io/blog/marcelo_zoccoler...   \n",
+       "2   https://biapol.github.io/blog/robert_haase/bro...   \n",
+       "3   https://biapol.github.io/blog/mara_lampert/get...   \n",
+       "9   https://focalplane.biologists.com/2022/12/08/m...   \n",
+       "29  https://haesleinhuepf.github.io/BioImageAnalys...   \n",
+       "\n",
+       "                      license event_date event_location description  \\\n",
+       "1                   CC-BY-4.0        NaN            NaN         NaN   \n",
+       "2                   CC-BY-4.0        NaN            NaN         NaN   \n",
+       "3                   CC-BY-4.0        NaN            NaN         NaN   \n",
+       "9                   CC-BY-4.0        NaN            NaN         NaN   \n",
+       "29  [CC-BY-4.0, BSD-3-CLAUSE]        NaN            NaN         NaN   \n",
+       "\n",
+       "    num_downloads publication_date fingerprint author submission_date  \n",
+       "1             NaN              NaN         NaN    NaN             NaN  \n",
+       "2             NaN              NaN         NaN    NaN             NaN  \n",
+       "3             NaN              NaN         NaN    NaN             NaN  \n",
+       "9             NaN              NaN         NaN    NaN             NaN  \n",
+       "29            NaN              NaN         NaN    NaN             NaN  "
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data = df.dropna(subset=['license', 'authors', 'name', 'url']).loc[lambda d: d['license'].str.lower() != 'unknown'].copy()\n",
+    "data.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "5e708904-0161-4fb6-8bf8-c2f6dc3dbbea",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Total number of entries found: 536\n",
+      "Number of entries found with all mandatory entries: 330\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(f'Total number of entries found: {df.shape[0]}')\n",
+    "print(f'Number of entries found with all mandatory entries: {data.shape[0]}')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "8ce34a4c-f14f-40b0-8254-a4234d1f9d23",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>authors</th>\n",
+       "      <th>name</th>\n",
+       "      <th>tags</th>\n",
+       "      <th>type</th>\n",
+       "      <th>url</th>\n",
+       "      <th>license</th>\n",
+       "      <th>event_date</th>\n",
+       "      <th>event_location</th>\n",
+       "      <th>description</th>\n",
+       "      <th>num_downloads</th>\n",
+       "      <th>publication_date</th>\n",
+       "      <th>fingerprint</th>\n",
+       "      <th>author</th>\n",
+       "      <th>submission_date</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>[Marcelo Zoccoler]</td>\n",
+       "      <td>Running Deep-Learning Scripts in the BiA-PoL O...</td>\n",
+       "      <td>[Python, Artificial Intelligence, Bioimage Ana...</td>\n",
+       "      <td>[Blog]</td>\n",
+       "      <td>https://biapol.github.io/blog/marcelo_zoccoler...</td>\n",
+       "      <td>CC-BY-4.0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>[Robert Haase]</td>\n",
+       "      <td>Browsing the Open Microscopy Image Data Resour...</td>\n",
+       "      <td>[OMERO, Python]</td>\n",
+       "      <td>[Blog]</td>\n",
+       "      <td>https://biapol.github.io/blog/robert_haase/bro...</td>\n",
+       "      <td>CC-BY-4.0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>[Mara Lampert]</td>\n",
+       "      <td>Getting started with Mambaforge and Python</td>\n",
+       "      <td>[Python, Conda, Mamba]</td>\n",
+       "      <td>[Blog]</td>\n",
+       "      <td>https://biapol.github.io/blog/mara_lampert/get...</td>\n",
+       "      <td>CC-BY-4.0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9</th>\n",
+       "      <td>[Robert Haase]</td>\n",
+       "      <td>Managing Scientific Python environments using ...</td>\n",
+       "      <td>[Python, Conda, Mamba]</td>\n",
+       "      <td>[Blog]</td>\n",
+       "      <td>https://focalplane.biologists.com/2022/12/08/m...</td>\n",
+       "      <td>CC-BY-4.0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>29</th>\n",
+       "      <td>[Robert Haase et al.]</td>\n",
+       "      <td>BioImage Analysis Notebooks</td>\n",
+       "      <td>[Python, Bioimage Analysis]</td>\n",
+       "      <td>[Book, Notebook]</td>\n",
+       "      <td>https://haesleinhuepf.github.io/BioImageAnalys...</td>\n",
+       "      <td>[CC-BY-4.0, BSD-3-CLAUSE]</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                  authors                                               name  \\\n",
+       "1      [Marcelo Zoccoler]  Running Deep-Learning Scripts in the BiA-PoL O...   \n",
+       "2          [Robert Haase]  Browsing the Open Microscopy Image Data Resour...   \n",
+       "3          [Mara Lampert]         Getting started with Mambaforge and Python   \n",
+       "9          [Robert Haase]  Managing Scientific Python environments using ...   \n",
+       "29  [Robert Haase et al.]                        BioImage Analysis Notebooks   \n",
+       "\n",
+       "                                                 tags              type  \\\n",
+       "1   [Python, Artificial Intelligence, Bioimage Ana...            [Blog]   \n",
+       "2                                     [OMERO, Python]            [Blog]   \n",
+       "3                              [Python, Conda, Mamba]            [Blog]   \n",
+       "9                              [Python, Conda, Mamba]            [Blog]   \n",
+       "29                        [Python, Bioimage Analysis]  [Book, Notebook]   \n",
+       "\n",
+       "                                                  url  \\\n",
+       "1   https://biapol.github.io/blog/marcelo_zoccoler...   \n",
+       "2   https://biapol.github.io/blog/robert_haase/bro...   \n",
+       "3   https://biapol.github.io/blog/mara_lampert/get...   \n",
+       "9   https://focalplane.biologists.com/2022/12/08/m...   \n",
+       "29  https://haesleinhuepf.github.io/BioImageAnalys...   \n",
+       "\n",
+       "                      license event_date event_location description  \\\n",
+       "1                   CC-BY-4.0        NaN            NaN         NaN   \n",
+       "2                   CC-BY-4.0        NaN            NaN         NaN   \n",
+       "3                   CC-BY-4.0        NaN            NaN         NaN   \n",
+       "9                   CC-BY-4.0        NaN            NaN         NaN   \n",
+       "29  [CC-BY-4.0, BSD-3-CLAUSE]        NaN            NaN         NaN   \n",
+       "\n",
+       "    num_downloads publication_date fingerprint author submission_date  \n",
+       "1             NaN              NaN         NaN    NaN             NaN  \n",
+       "2             NaN              NaN         NaN    NaN             NaN  \n",
+       "3             NaN              NaN         NaN    NaN             NaN  \n",
+       "9             NaN              NaN         NaN    NaN             NaN  \n",
+       "29            NaN              NaN         NaN    NaN             NaN  "
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1b18efe8-8833-478a-9971-8ec727580fa1",
+   "metadata": {},
+   "source": [
+    "#### 3. Change the format of the **Tags** and **License** columns to fit the DALIA format"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "5318cd0d-7b81-47df-bab6-e64a8afbf9a2",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/tmp/ipykernel_12251/210055857.py:1: SettingWithCopyWarning: \n",
+      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+      "Try using .loc[row_indexer,col_indexer] = value instead\n",
+      "\n",
+      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+      "  data[\"tags\"] = data[\"tags\"].apply(lambda x: ' * '.join(x) if isinstance(x, list) else x) #Tags\n",
+      "/tmp/ipykernel_12251/210055857.py:2: SettingWithCopyWarning: \n",
+      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+      "Try using .loc[row_indexer,col_indexer] = value instead\n",
+      "\n",
+      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+      "  data[\"license\"] = data[\"license\"].apply(lambda x: ' * '.join(x) if isinstance(x, list) else x) #License\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>authors</th>\n",
+       "      <th>name</th>\n",
+       "      <th>tags</th>\n",
+       "      <th>type</th>\n",
+       "      <th>url</th>\n",
+       "      <th>license</th>\n",
+       "      <th>event_date</th>\n",
+       "      <th>event_location</th>\n",
+       "      <th>description</th>\n",
+       "      <th>num_downloads</th>\n",
+       "      <th>publication_date</th>\n",
+       "      <th>fingerprint</th>\n",
+       "      <th>author</th>\n",
+       "      <th>submission_date</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>[Marcelo Zoccoler]</td>\n",
+       "      <td>Running Deep-Learning Scripts in the BiA-PoL O...</td>\n",
+       "      <td>Python * Artificial Intelligence * Bioimage An...</td>\n",
+       "      <td>[Blog]</td>\n",
+       "      <td>https://biapol.github.io/blog/marcelo_zoccoler...</td>\n",
+       "      <td>CC-BY-4.0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>[Robert Haase]</td>\n",
+       "      <td>Browsing the Open Microscopy Image Data Resour...</td>\n",
+       "      <td>OMERO * Python</td>\n",
+       "      <td>[Blog]</td>\n",
+       "      <td>https://biapol.github.io/blog/robert_haase/bro...</td>\n",
+       "      <td>CC-BY-4.0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>[Mara Lampert]</td>\n",
+       "      <td>Getting started with Mambaforge and Python</td>\n",
+       "      <td>Python * Conda * Mamba</td>\n",
+       "      <td>[Blog]</td>\n",
+       "      <td>https://biapol.github.io/blog/mara_lampert/get...</td>\n",
+       "      <td>CC-BY-4.0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9</th>\n",
+       "      <td>[Robert Haase]</td>\n",
+       "      <td>Managing Scientific Python environments using ...</td>\n",
+       "      <td>Python * Conda * Mamba</td>\n",
+       "      <td>[Blog]</td>\n",
+       "      <td>https://focalplane.biologists.com/2022/12/08/m...</td>\n",
+       "      <td>CC-BY-4.0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>29</th>\n",
+       "      <td>[Robert Haase et al.]</td>\n",
+       "      <td>BioImage Analysis Notebooks</td>\n",
+       "      <td>Python * Bioimage Analysis</td>\n",
+       "      <td>[Book, Notebook]</td>\n",
+       "      <td>https://haesleinhuepf.github.io/BioImageAnalys...</td>\n",
+       "      <td>CC-BY-4.0 * BSD-3-CLAUSE</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                  authors                                               name  \\\n",
+       "1      [Marcelo Zoccoler]  Running Deep-Learning Scripts in the BiA-PoL O...   \n",
+       "2          [Robert Haase]  Browsing the Open Microscopy Image Data Resour...   \n",
+       "3          [Mara Lampert]         Getting started with Mambaforge and Python   \n",
+       "9          [Robert Haase]  Managing Scientific Python environments using ...   \n",
+       "29  [Robert Haase et al.]                        BioImage Analysis Notebooks   \n",
+       "\n",
+       "                                                 tags              type  \\\n",
+       "1   Python * Artificial Intelligence * Bioimage An...            [Blog]   \n",
+       "2                                      OMERO * Python            [Blog]   \n",
+       "3                              Python * Conda * Mamba            [Blog]   \n",
+       "9                              Python * Conda * Mamba            [Blog]   \n",
+       "29                         Python * Bioimage Analysis  [Book, Notebook]   \n",
+       "\n",
+       "                                                  url  \\\n",
+       "1   https://biapol.github.io/blog/marcelo_zoccoler...   \n",
+       "2   https://biapol.github.io/blog/robert_haase/bro...   \n",
+       "3   https://biapol.github.io/blog/mara_lampert/get...   \n",
+       "9   https://focalplane.biologists.com/2022/12/08/m...   \n",
+       "29  https://haesleinhuepf.github.io/BioImageAnalys...   \n",
+       "\n",
+       "                     license event_date event_location description  \\\n",
+       "1                  CC-BY-4.0        NaN            NaN         NaN   \n",
+       "2                  CC-BY-4.0        NaN            NaN         NaN   \n",
+       "3                  CC-BY-4.0        NaN            NaN         NaN   \n",
+       "9                  CC-BY-4.0        NaN            NaN         NaN   \n",
+       "29  CC-BY-4.0 * BSD-3-CLAUSE        NaN            NaN         NaN   \n",
+       "\n",
+       "    num_downloads publication_date fingerprint author submission_date  \n",
+       "1             NaN              NaN         NaN    NaN             NaN  \n",
+       "2             NaN              NaN         NaN    NaN             NaN  \n",
+       "3             NaN              NaN         NaN    NaN             NaN  \n",
+       "9             NaN              NaN         NaN    NaN             NaN  \n",
+       "29            NaN              NaN         NaN    NaN             NaN  "
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Join list-valued cells with ' * ' for the DALIA export; assign() returns a new frame, so nothing is written into a slice of another DataFrame (no SettingWithCopyWarning)\n",
+    "data = data.assign(**{col: data[col].apply(lambda x: ' * '.join(x) if isinstance(x, list) else x) for col in (\"tags\", \"license\")})\n",
+    "data.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "ac322332-6c61-4764-b8c2-760c33518429",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/tmp/ipykernel_12251/2387137408.py:21: SettingWithCopyWarning: \n",
+      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+      "Try using .loc[row_indexer,col_indexer] = value instead\n",
+      "\n",
+      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+      "  data[\"license\"] = data[\"license\"].replace(license_mapping)\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Map known license spellings/typos to the spelling expected as valid input (mostly SPDX identifiers); replace() only matches whole cell values, so joined multi-license strings need their own key\n",
+    "license_mapping = {\n",
+    "    'APACHE-2.0 LICENSE': 'Apache-2.0',\n",
+    "    'CC0 1.0 UNIVERSAL': 'CC0-1.0',\n",
+    "    'CC-BY-4.0 * BSD-3-CLAUSE': 'CC-BY-4.0 * BSD-3-Clause',\n",
+    "    'CC0 (MOSTLY, BUT CAN DIFFER DEPENDING ON RESOURCE)': 'CC0-1.0',\n",
+    "    'CCY-BY-SA-4.0': 'CC-BY-SA-4.0',\n",
+    "    'YOUTTUBE STANDARD LICENSE': 'YOUTUBE STANDARD LICENSE',\n",
+    "    'CC-BY-NC-SA': 'CC-BY-NC-SA-4.0',\n",
+    "    'BSD3-CLAUSE': 'BSD-3-Clause',\n",
+    "    'CC-ZERO': 'CC0-1.0',\n",
+    "    'BSD 3-Clause \"New\" or \"Revised\" License': 'BSD-3-Clause',\n",
+    "    'cc-by-4.0': 'CC-BY-4.0',\n",
+    "    'Creative Commons Attribution Share Alike 4.0 International': 'CC-BY-SA-4.0',\n",
+    "    'GNU General Public License v3.0': 'GPL-3.0-only',\n",
+    "    'CC BY-NC-SA 4.0': 'CC-BY-NC-SA-4.0',\n",
+    "    'BSD-3-CLAUSE': 'BSD-3-Clause',\n",
+    "    'BSD-2-CLAUSE': 'BSD-2-Clause',\n",
+    "    'APACHE-2.0': 'Apache-2.0'\n",
+    "}\n",
+    "data = data.assign(license=data[\"license\"].replace(license_mapping))  # new frame instead of writing into a slice (no SettingWithCopyWarning)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "03fbeeb5-67d4-403c-a780-8757f738b9bb",
+   "metadata": {},
+   "source": [
+    "#### 4. Map the **Type** column to the **LearningResourceType** and **MediaType** columns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "8a104889-190a-4504-af64-c5a019392ad3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Mapping from a `type` value to its DALIA LearningResourceType label; None means no LearningResourceType is emitted for that type\n",
+    "type_to_learning_resource = {\n",
+    "    \"Application\": \"Software Application\",\n",
+    "    \"Big Data\": \"Data\",\n",
+    "    \"Bioimage Analysis\": \"Other\",\n",
+    "    \"Blog\": \"Web Page\",\n",
+    "    \"Blog Post\": \"Text\",\n",
+    "    \"Book\": \"Book\",\n",
+    "    \"Book Chapter\": \"Book\",\n",
+    "    \"Code\": None,  # only gets a MediaType ('code')\n",
+    "    \"Collection\": \"Other\",\n",
+    "    \"Conference Abstract\": \"Text\",\n",
+    "    \"Data\": \"Data\",\n",
+    "    \"Document\": \"Text\",\n",
+    "    \"Documentation\": \"Text\",\n",
+    "    \"Event\": \"Other\",\n",
+    "    \"Forum Post\": \"Text\",\n",
+    "    \"Github Repository\": \"Other\",\n",
+    "    \"Jupyter Book\": \"Code Notebook\",\n",
+    "    \"Notebook\": \"Code Notebook\",\n",
+    "    \"Online Course\": \"Course\",\n",
+    "    \"Online Tutorial\": \"Tutorial\",\n",
+    "    \"Open Source Software\": \"Software Application\",\n",
+    "    \"Poster\": \"Poster\",\n",
+    "    \"Practicals\": \"Course\",\n",
+    "    \"Preprint\": \"Text\",\n",
+    "    \"Presentation\": \"Presentation\",\n",
+    "    \"Publication\": \"Article\",\n",
+    "    \"Python\": None,  # mapped to nothing here and in type_to_media_type\n",
+    "    \"Report\": \"Report\",\n",
+    "    \"Slide\": \"Presentation\",\n",
+    "    \"Slides\": \"Presentation\",\n",
+    "    \"Tutorial\": \"Tutorial\",\n",
+    "    \"Video\": None,  # only gets a MediaType ('video')\n",
+    "    \"Videos\": None,  # only gets a MediaType ('video')\n",
+    "    \"Website\": \"Web Page\",\n",
+    "    \"Workshop\": \"Course\",\n",
+    "    \"Youtube Channel\": \"Other\"\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "cd6e0ac8-2382-4a26-9e77-ee739081396f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "type_to_media_type = {  # `type` value -> DALIA MediaType label; None means no MediaType is emitted for that type\n",
+    "    \"Application\": None,\n",
+    "    \"Big Data\": None,\n",
+    "    \"Bioimage Analysis\": None,\n",
+    "    \"Blog\": \"text\",\n",
+    "    \"Blog Post\": \"text\",\n",
+    "    \"Book\": \"text\",\n",
+    "    \"Book Chapter\": \"text\",\n",
+    "    \"Code\": \"code\",\n",
+    "    \"Collection\": None,\n",
+    "    \"Conference Abstract\": \"text\",\n",
+    "    \"Data\": None,\n",
+    "    \"Document\": \"text\",\n",
+    "    \"Documentation\": \"text\",\n",
+    "    \"Event\": None,\n",
+    "    \"Forum Post\": \"text\",\n",
+    "    \"Github Repository\": None,\n",
+    "    \"Jupyter Book\": \"code\",\n",
+    "    \"Notebook\": \"code\",\n",
+    "    \"Online Course\": None,\n",
+    "    \"Online Tutorial\": None,\n",
+    "    \"Open Source Software\": None,\n",
+    "    \"Poster\": None,\n",
+    "    \"Practicals\": None,\n",
+    "    \"Preprint\": \"text\",\n",
+    "    \"Presentation\": \"presentation\",\n",
+    "    \"Publication\": \"text\",\n",
+    "    \"Python\": None,  # mapped to nothing here and in type_to_learning_resource\n",
+    "    \"Report\": \"text\",\n",
+    "    \"Slide\": \"presentation\",\n",
+    "    \"Slides\": \"presentation\",\n",
+    "    \"Tutorial\": None,\n",
+    "    \"Video\": \"video\",\n",
+    "    \"Videos\": \"video\",\n",
+    "    \"Website\": None,\n",
+    "    \"Workshop\": None,\n",
+    "    \"Youtube Channel\": \"video\"\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "c32c15e5-2d12-4051-b238-44a94afcc5d1",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/tmp/ipykernel_12251/3151956629.py:30: SettingWithCopyWarning: \n",
+      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+      "Try using .loc[row_indexer,col_indexer] = value instead\n",
+      "\n",
+      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+      "  data[\"LearningResourceType\"] = data[\"type\"].apply(map_learning_resource)\n",
+      "/tmp/ipykernel_12251/3151956629.py:31: SettingWithCopyWarning: \n",
+      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+      "Try using .loc[row_indexer,col_indexer] = value instead\n",
+      "\n",
+      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+      "  data[\"MediaType\"] = data[\"type\"].apply(map_media_type)\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>authors</th>\n",
+       "      <th>name</th>\n",
+       "      <th>tags</th>\n",
+       "      <th>type</th>\n",
+       "      <th>url</th>\n",
+       "      <th>license</th>\n",
+       "      <th>event_date</th>\n",
+       "      <th>event_location</th>\n",
+       "      <th>description</th>\n",
+       "      <th>num_downloads</th>\n",
+       "      <th>publication_date</th>\n",
+       "      <th>fingerprint</th>\n",
+       "      <th>author</th>\n",
+       "      <th>submission_date</th>\n",
+       "      <th>LearningResourceType</th>\n",
+       "      <th>MediaType</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>[Marcelo Zoccoler]</td>\n",
+       "      <td>Running Deep-Learning Scripts in the BiA-PoL O...</td>\n",
+       "      <td>Python * Artificial Intelligence * Bioimage An...</td>\n",
+       "      <td>[Blog]</td>\n",
+       "      <td>https://biapol.github.io/blog/marcelo_zoccoler...</td>\n",
+       "      <td>CC-BY-4.0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Web Page</td>\n",
+       "      <td>text</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>[Robert Haase]</td>\n",
+       "      <td>Browsing the Open Microscopy Image Data Resour...</td>\n",
+       "      <td>OMERO * Python</td>\n",
+       "      <td>[Blog]</td>\n",
+       "      <td>https://biapol.github.io/blog/robert_haase/bro...</td>\n",
+       "      <td>CC-BY-4.0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Web Page</td>\n",
+       "      <td>text</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>[Mara Lampert]</td>\n",
+       "      <td>Getting started with Mambaforge and Python</td>\n",
+       "      <td>Python * Conda * Mamba</td>\n",
+       "      <td>[Blog]</td>\n",
+       "      <td>https://biapol.github.io/blog/mara_lampert/get...</td>\n",
+       "      <td>CC-BY-4.0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Web Page</td>\n",
+       "      <td>text</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9</th>\n",
+       "      <td>[Robert Haase]</td>\n",
+       "      <td>Managing Scientific Python environments using ...</td>\n",
+       "      <td>Python * Conda * Mamba</td>\n",
+       "      <td>[Blog]</td>\n",
+       "      <td>https://focalplane.biologists.com/2022/12/08/m...</td>\n",
+       "      <td>CC-BY-4.0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Web Page</td>\n",
+       "      <td>text</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>29</th>\n",
+       "      <td>[Robert Haase et al.]</td>\n",
+       "      <td>BioImage Analysis Notebooks</td>\n",
+       "      <td>Python * Bioimage Analysis</td>\n",
+       "      <td>[Book, Notebook]</td>\n",
+       "      <td>https://haesleinhuepf.github.io/BioImageAnalys...</td>\n",
+       "      <td>CC-BY-4.0 * BSD-3-Clause</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Book * Code Notebook</td>\n",
+       "      <td>text * code</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                  authors                                               name  \\\n",
+       "1      [Marcelo Zoccoler]  Running Deep-Learning Scripts in the BiA-PoL O...   \n",
+       "2          [Robert Haase]  Browsing the Open Microscopy Image Data Resour...   \n",
+       "3          [Mara Lampert]         Getting started with Mambaforge and Python   \n",
+       "9          [Robert Haase]  Managing Scientific Python environments using ...   \n",
+       "29  [Robert Haase et al.]                        BioImage Analysis Notebooks   \n",
+       "\n",
+       "                                                 tags              type  \\\n",
+       "1   Python * Artificial Intelligence * Bioimage An...            [Blog]   \n",
+       "2                                      OMERO * Python            [Blog]   \n",
+       "3                              Python * Conda * Mamba            [Blog]   \n",
+       "9                              Python * Conda * Mamba            [Blog]   \n",
+       "29                         Python * Bioimage Analysis  [Book, Notebook]   \n",
+       "\n",
+       "                                                  url  \\\n",
+       "1   https://biapol.github.io/blog/marcelo_zoccoler...   \n",
+       "2   https://biapol.github.io/blog/robert_haase/bro...   \n",
+       "3   https://biapol.github.io/blog/mara_lampert/get...   \n",
+       "9   https://focalplane.biologists.com/2022/12/08/m...   \n",
+       "29  https://haesleinhuepf.github.io/BioImageAnalys...   \n",
+       "\n",
+       "                     license event_date event_location description  \\\n",
+       "1                  CC-BY-4.0        NaN            NaN         NaN   \n",
+       "2                  CC-BY-4.0        NaN            NaN         NaN   \n",
+       "3                  CC-BY-4.0        NaN            NaN         NaN   \n",
+       "9                  CC-BY-4.0        NaN            NaN         NaN   \n",
+       "29  CC-BY-4.0 * BSD-3-Clause        NaN            NaN         NaN   \n",
+       "\n",
+       "    num_downloads publication_date fingerprint author submission_date  \\\n",
+       "1             NaN              NaN         NaN    NaN             NaN   \n",
+       "2             NaN              NaN         NaN    NaN             NaN   \n",
+       "3             NaN              NaN         NaN    NaN             NaN   \n",
+       "9             NaN              NaN         NaN    NaN             NaN   \n",
+       "29            NaN              NaN         NaN    NaN             NaN   \n",
+       "\n",
+       "    LearningResourceType    MediaType  \n",
+       "1               Web Page         text  \n",
+       "2               Web Page         text  \n",
+       "3               Web Page         text  \n",
+       "9               Web Page         text  \n",
+       "29  Book * Code Notebook  text * code  "
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "def join_mapped_types(entry, mapping):\n",
+    "    \"\"\"Translate one `type` cell into a ' * '-separated string of mapped labels.\n",
+    "\n",
+    "    Args:\n",
+    "        entry: list of type names, a single type name, or None/NaN.\n",
+    "        mapping: dict from type name to target label (None = emit nothing).\n",
+    "            Names absent from `mapping` are ignored.\n",
+    "\n",
+    "    Returns:\n",
+    "        The distinct mapped labels joined with ' * ' in first-seen order, or \"\"\n",
+    "        when the cell is empty or nothing maps.\n",
+    "    \"\"\"\n",
+    "    if entry is None or (isinstance(entry, float) and pd.isna(entry)):\n",
+    "        return \"\"\n",
+    "    names = entry if isinstance(entry, list) else [entry]\n",
+    "    labels = (mapping.get(name) for name in names)\n",
+    "    # dict.fromkeys de-duplicates but keeps order; a set of str iterates in a\n",
+    "    # different order from run to run, which made the export non-reproducible.\n",
+    "    return \" * \".join(dict.fromkeys(label for label in labels if label is not None))\n",
+    "\n",
+    "def map_learning_resource(entry):\n",
+    "    \"\"\"Map a `type` cell to its LearningResourceType string.\"\"\"\n",
+    "    return join_mapped_types(entry, type_to_learning_resource)\n",
+    "\n",
+    "def map_media_type(entry):\n",
+    "    \"\"\"Map a `type` cell to its MediaType string.\"\"\"\n",
+    "    return join_mapped_types(entry, type_to_media_type)\n",
+    "\n",
+    "# Apply the mapping functions; assign() adds both columns on a new frame instead of writing into a slice (no SettingWithCopyWarning)\n",
+    "data = data.assign(LearningResourceType=data[\"type\"].apply(map_learning_resource),\n",
+    "                   MediaType=data[\"type\"].apply(map_media_type))\n",
+    "\n",
+    "data.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "19f96ee9-e203-4b19-82b2-96a0e4088383",
+   "metadata": {},
+   "source": [
+    "#### 5. Change the author names to fit the DALIA format (for persons: `surname, given name`; for organizations: the organization name)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "827776ce-3be9-4b28-b664-687c7d4fc4ab",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/tmp/ipykernel_12251/970863209.py:37: SettingWithCopyWarning: \n",
+      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+      "Try using .loc[row_indexer,col_indexer] = value instead\n",
+      "\n",
+      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+      "  data[\"Authors\"] = data[\"authors\"].apply(normalize_author_format)\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>authors</th>\n",
+       "      <th>name</th>\n",
+       "      <th>tags</th>\n",
+       "      <th>type</th>\n",
+       "      <th>url</th>\n",
+       "      <th>license</th>\n",
+       "      <th>event_date</th>\n",
+       "      <th>event_location</th>\n",
+       "      <th>description</th>\n",
+       "      <th>num_downloads</th>\n",
+       "      <th>publication_date</th>\n",
+       "      <th>fingerprint</th>\n",
+       "      <th>author</th>\n",
+       "      <th>submission_date</th>\n",
+       "      <th>LearningResourceType</th>\n",
+       "      <th>MediaType</th>\n",
+       "      <th>Authors</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>[Marcelo Zoccoler]</td>\n",
+       "      <td>Running Deep-Learning Scripts in the BiA-PoL O...</td>\n",
+       "      <td>Python * Artificial Intelligence * Bioimage An...</td>\n",
+       "      <td>[Blog]</td>\n",
+       "      <td>https://biapol.github.io/blog/marcelo_zoccoler...</td>\n",
+       "      <td>CC-BY-4.0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Web Page</td>\n",
+       "      <td>text</td>\n",
+       "      <td>Zoccoler, Marcelo</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>[Robert Haase]</td>\n",
+       "      <td>Browsing the Open Microscopy Image Data Resour...</td>\n",
+       "      <td>OMERO * Python</td>\n",
+       "      <td>[Blog]</td>\n",
+       "      <td>https://biapol.github.io/blog/robert_haase/bro...</td>\n",
+       "      <td>CC-BY-4.0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Web Page</td>\n",
+       "      <td>text</td>\n",
+       "      <td>Haase, Robert</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>[Mara Lampert]</td>\n",
+       "      <td>Getting started with Mambaforge and Python</td>\n",
+       "      <td>Python * Conda * Mamba</td>\n",
+       "      <td>[Blog]</td>\n",
+       "      <td>https://biapol.github.io/blog/mara_lampert/get...</td>\n",
+       "      <td>CC-BY-4.0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Web Page</td>\n",
+       "      <td>text</td>\n",
+       "      <td>Lampert, Mara</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9</th>\n",
+       "      <td>[Robert Haase]</td>\n",
+       "      <td>Managing Scientific Python environments using ...</td>\n",
+       "      <td>Python * Conda * Mamba</td>\n",
+       "      <td>[Blog]</td>\n",
+       "      <td>https://focalplane.biologists.com/2022/12/08/m...</td>\n",
+       "      <td>CC-BY-4.0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Web Page</td>\n",
+       "      <td>text</td>\n",
+       "      <td>Haase, Robert</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>29</th>\n",
+       "      <td>[Robert Haase et al.]</td>\n",
+       "      <td>BioImage Analysis Notebooks</td>\n",
+       "      <td>Python * Bioimage Analysis</td>\n",
+       "      <td>[Book, Notebook]</td>\n",
+       "      <td>https://haesleinhuepf.github.io/BioImageAnalys...</td>\n",
+       "      <td>CC-BY-4.0 * BSD-3-Clause</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Book * Code Notebook</td>\n",
+       "      <td>text * code</td>\n",
+       "      <td>Robert Haase et al.</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                  authors                                               name  \\\n",
+       "1      [Marcelo Zoccoler]  Running Deep-Learning Scripts in the BiA-PoL O...   \n",
+       "2          [Robert Haase]  Browsing the Open Microscopy Image Data Resour...   \n",
+       "3          [Mara Lampert]         Getting started with Mambaforge and Python   \n",
+       "9          [Robert Haase]  Managing Scientific Python environments using ...   \n",
+       "29  [Robert Haase et al.]                        BioImage Analysis Notebooks   \n",
+       "\n",
+       "                                                 tags              type  \\\n",
+       "1   Python * Artificial Intelligence * Bioimage An...            [Blog]   \n",
+       "2                                      OMERO * Python            [Blog]   \n",
+       "3                              Python * Conda * Mamba            [Blog]   \n",
+       "9                              Python * Conda * Mamba            [Blog]   \n",
+       "29                         Python * Bioimage Analysis  [Book, Notebook]   \n",
+       "\n",
+       "                                                  url  \\\n",
+       "1   https://biapol.github.io/blog/marcelo_zoccoler...   \n",
+       "2   https://biapol.github.io/blog/robert_haase/bro...   \n",
+       "3   https://biapol.github.io/blog/mara_lampert/get...   \n",
+       "9   https://focalplane.biologists.com/2022/12/08/m...   \n",
+       "29  https://haesleinhuepf.github.io/BioImageAnalys...   \n",
+       "\n",
+       "                     license event_date event_location description  \\\n",
+       "1                  CC-BY-4.0        NaN            NaN         NaN   \n",
+       "2                  CC-BY-4.0        NaN            NaN         NaN   \n",
+       "3                  CC-BY-4.0        NaN            NaN         NaN   \n",
+       "9                  CC-BY-4.0        NaN            NaN         NaN   \n",
+       "29  CC-BY-4.0 * BSD-3-Clause        NaN            NaN         NaN   \n",
+       "\n",
+       "    num_downloads publication_date fingerprint author submission_date  \\\n",
+       "1             NaN              NaN         NaN    NaN             NaN   \n",
+       "2             NaN              NaN         NaN    NaN             NaN   \n",
+       "3             NaN              NaN         NaN    NaN             NaN   \n",
+       "9             NaN              NaN         NaN    NaN             NaN   \n",
+       "29            NaN              NaN         NaN    NaN             NaN   \n",
+       "\n",
+       "    LearningResourceType    MediaType              Authors  \n",
+       "1               Web Page         text    Zoccoler, Marcelo  \n",
+       "2               Web Page         text        Haase, Robert  \n",
+       "3               Web Page         text        Lampert, Mara  \n",
+       "9               Web Page         text        Haase, Robert  \n",
+       "29  Book * Code Notebook  text * code  Robert Haase et al.  "
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "import re\n",
+    "\n",
+    "def normalize_author_format(authors):\n",
+    "    \"\"\"Normalize author names to 'Surname, Prename' and join them with ' * '.\n",
+    "\n",
+    "    Args:\n",
+    "        authors: A list of author names, or a single string in which several\n",
+    "            names are separated by commas, '*' or the word 'and'.\n",
+    "            None/NaN is treated as 'no authors'.\n",
+    "\n",
+    "    Returns:\n",
+    "        The reformatted names joined with ' * '; an empty string for\n",
+    "        missing input.\n",
+    "    \"\"\"\n",
+    "    et_al = ('et', 'al.')\n",
+    "\n",
+    "    def reformat_name(name):\n",
+    "        \"\"\"Reformat one name; names that are not plain 2-3 word names are kept.\"\"\"\n",
+    "        # Already in 'Surname, Prename' format\n",
+    "        if \",\" in name:\n",
+    "            return name.strip()\n",
+    "        parts = name.split()\n",
+    "        # Entries such as 'Robert Haase et al.' must not be reordered\n",
+    "        if len(parts) in (2, 3) and all(p not in et_al for p in parts):\n",
+    "            # Last word is the surname; given names stay separated by a space\n",
+    "            return f\"{parts[-1]}, {' '.join(parts[:-1])}\"\n",
+    "        return name.strip()  # Return unchanged if not a simple name format\n",
+    "\n",
+    "    if authors is None or isinstance(authors, float):\n",
+    "        # Missing value (None or NaN): iterating over it would raise TypeError\n",
+    "        return \"\"\n",
+    "    if isinstance(authors, str):\n",
+    "        # Split inline lists like 'Prename Surname, Prename Surname' and drop\n",
+    "        # empty fragments left by adjacent separators (e.g. ', and')\n",
+    "        pieces = re.split(r\",\\s*|\\*|\\band\\b\", authors)\n",
+    "        authors = [a.strip() for a in pieces if a.strip()]\n",
+    "    elif isinstance(authors, list):\n",
+    "        authors = [str(a).strip() for a in authors]  # Ensure all elements are strings\n",
+    "\n",
+    "    # Join all processed names with ' * '\n",
+    "    return \" * \".join(reformat_name(author) for author in authors)\n",
+    "\n",
+    "\n",
+    "# Apply the normalization function to every row; the result is stored in a\n",
+    "# separate \"Authors\" column, the original \"authors\" column is not modified here\n",
+    "data[\"Authors\"] = data[\"authors\"].apply(normalize_author_format)\n",
+    "\n",
+    "data.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0983e2a5-3f93-4cfa-9f40-3f41233fe77e",
+   "metadata": {},
+   "source": [
+    "#### 6. Rename the columns that already fit the DALIA format to their corresponding names in DALIA"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "4213ac0c-3274-408e-a86d-bc9e61832de8",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Title</th>\n",
+       "      <th>Keywords</th>\n",
+       "      <th>Link</th>\n",
+       "      <th>License</th>\n",
+       "      <th>Description</th>\n",
+       "      <th>PublicationDate</th>\n",
+       "      <th>LearningResourceType</th>\n",
+       "      <th>MediaType</th>\n",
+       "      <th>Authors</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Running Deep-Learning Scripts in the BiA-PoL O...</td>\n",
+       "      <td>Python * Artificial Intelligence * Bioimage An...</td>\n",
+       "      <td>https://biapol.github.io/blog/marcelo_zoccoler...</td>\n",
+       "      <td>CC-BY-4.0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Web Page</td>\n",
+       "      <td>text</td>\n",
+       "      <td>Zoccoler, Marcelo</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>Browsing the Open Microscopy Image Data Resour...</td>\n",
+       "      <td>OMERO * Python</td>\n",
+       "      <td>https://biapol.github.io/blog/robert_haase/bro...</td>\n",
+       "      <td>CC-BY-4.0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Web Page</td>\n",
+       "      <td>text</td>\n",
+       "      <td>Haase, Robert</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>Getting started with Mambaforge and Python</td>\n",
+       "      <td>Python * Conda * Mamba</td>\n",
+       "      <td>https://biapol.github.io/blog/mara_lampert/get...</td>\n",
+       "      <td>CC-BY-4.0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Web Page</td>\n",
+       "      <td>text</td>\n",
+       "      <td>Lampert, Mara</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9</th>\n",
+       "      <td>Managing Scientific Python environments using ...</td>\n",
+       "      <td>Python * Conda * Mamba</td>\n",
+       "      <td>https://focalplane.biologists.com/2022/12/08/m...</td>\n",
+       "      <td>CC-BY-4.0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Web Page</td>\n",
+       "      <td>text</td>\n",
+       "      <td>Haase, Robert</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>29</th>\n",
+       "      <td>BioImage Analysis Notebooks</td>\n",
+       "      <td>Python * Bioimage Analysis</td>\n",
+       "      <td>https://haesleinhuepf.github.io/BioImageAnalys...</td>\n",
+       "      <td>CC-BY-4.0 * BSD-3-Clause</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Book * Code Notebook</td>\n",
+       "      <td>text * code</td>\n",
+       "      <td>Robert Haase et al.</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                                Title  \\\n",
+       "1   Running Deep-Learning Scripts in the BiA-PoL O...   \n",
+       "2   Browsing the Open Microscopy Image Data Resour...   \n",
+       "3          Getting started with Mambaforge and Python   \n",
+       "9   Managing Scientific Python environments using ...   \n",
+       "29                        BioImage Analysis Notebooks   \n",
+       "\n",
+       "                                             Keywords  \\\n",
+       "1   Python * Artificial Intelligence * Bioimage An...   \n",
+       "2                                      OMERO * Python   \n",
+       "3                              Python * Conda * Mamba   \n",
+       "9                              Python * Conda * Mamba   \n",
+       "29                         Python * Bioimage Analysis   \n",
+       "\n",
+       "                                                 Link  \\\n",
+       "1   https://biapol.github.io/blog/marcelo_zoccoler...   \n",
+       "2   https://biapol.github.io/blog/robert_haase/bro...   \n",
+       "3   https://biapol.github.io/blog/mara_lampert/get...   \n",
+       "9   https://focalplane.biologists.com/2022/12/08/m...   \n",
+       "29  https://haesleinhuepf.github.io/BioImageAnalys...   \n",
+       "\n",
+       "                     License Description PublicationDate  \\\n",
+       "1                  CC-BY-4.0         NaN             NaN   \n",
+       "2                  CC-BY-4.0         NaN             NaN   \n",
+       "3                  CC-BY-4.0         NaN             NaN   \n",
+       "9                  CC-BY-4.0         NaN             NaN   \n",
+       "29  CC-BY-4.0 * BSD-3-Clause         NaN             NaN   \n",
+       "\n",
+       "    LearningResourceType    MediaType              Authors  \n",
+       "1               Web Page         text    Zoccoler, Marcelo  \n",
+       "2               Web Page         text        Haase, Robert  \n",
+       "3               Web Page         text        Lampert, Mara  \n",
+       "9               Web Page         text        Haase, Robert  \n",
+       "29  Book * Code Notebook  text * code  Robert Haase et al.  "
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Give the reusable columns their DALIA names, then discard the columns\n",
+    "# that are not part of the export\n",
+    "data = (\n",
+    "    data\n",
+    "    .rename(columns={\n",
+    "        'name': 'Title',\n",
+    "        'license': 'License',\n",
+    "        'url': 'Link',\n",
+    "        'description': 'Description',\n",
+    "        'publication_date': 'PublicationDate',\n",
+    "        'tags': 'Keywords',\n",
+    "    })\n",
+    "    .drop(columns=[\n",
+    "        'event_date',\n",
+    "        'event_location',\n",
+    "        'num_downloads',\n",
+    "        'submission_date',\n",
+    "        'fingerprint',\n",
+    "        'author',\n",
+    "        'type',\n",
+    "        'authors',\n",
+    "    ])\n",
+    ")\n",
+    "\n",
+    "data.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "69d43abb-d409-49bf-ab21-b79729441d1f",
+   "metadata": {},
+   "source": [
+    "#### 7. Introduce the **Community Column**: NFDI4BioImage if it is listed in the tags"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "5a79c41e-6037-44c2-8cdd-0988197de047",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Title</th>\n",
+       "      <th>Keywords</th>\n",
+       "      <th>Link</th>\n",
+       "      <th>License</th>\n",
+       "      <th>Description</th>\n",
+       "      <th>PublicationDate</th>\n",
+       "      <th>LearningResourceType</th>\n",
+       "      <th>MediaType</th>\n",
+       "      <th>Authors</th>\n",
+       "      <th>Community</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Running Deep-Learning Scripts in the BiA-PoL O...</td>\n",
+       "      <td>Python * Artificial Intelligence * Bioimage An...</td>\n",
+       "      <td>https://biapol.github.io/blog/marcelo_zoccoler...</td>\n",
+       "      <td>CC-BY-4.0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Web Page</td>\n",
+       "      <td>text</td>\n",
+       "      <td>Zoccoler, Marcelo</td>\n",
+       "      <td>None</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>Browsing the Open Microscopy Image Data Resour...</td>\n",
+       "      <td>OMERO * Python</td>\n",
+       "      <td>https://biapol.github.io/blog/robert_haase/bro...</td>\n",
+       "      <td>CC-BY-4.0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Web Page</td>\n",
+       "      <td>text</td>\n",
+       "      <td>Haase, Robert</td>\n",
+       "      <td>None</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>Getting started with Mambaforge and Python</td>\n",
+       "      <td>Python * Conda * Mamba</td>\n",
+       "      <td>https://biapol.github.io/blog/mara_lampert/get...</td>\n",
+       "      <td>CC-BY-4.0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Web Page</td>\n",
+       "      <td>text</td>\n",
+       "      <td>Lampert, Mara</td>\n",
+       "      <td>None</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9</th>\n",
+       "      <td>Managing Scientific Python environments using ...</td>\n",
+       "      <td>Python * Conda * Mamba</td>\n",
+       "      <td>https://focalplane.biologists.com/2022/12/08/m...</td>\n",
+       "      <td>CC-BY-4.0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Web Page</td>\n",
+       "      <td>text</td>\n",
+       "      <td>Haase, Robert</td>\n",
+       "      <td>None</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>29</th>\n",
+       "      <td>BioImage Analysis Notebooks</td>\n",
+       "      <td>Python * Bioimage Analysis</td>\n",
+       "      <td>https://haesleinhuepf.github.io/BioImageAnalys...</td>\n",
+       "      <td>CC-BY-4.0 * BSD-3-Clause</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Book * Code Notebook</td>\n",
+       "      <td>text * code</td>\n",
+       "      <td>Robert Haase et al.</td>\n",
+       "      <td>None</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                                Title  \\\n",
+       "1   Running Deep-Learning Scripts in the BiA-PoL O...   \n",
+       "2   Browsing the Open Microscopy Image Data Resour...   \n",
+       "3          Getting started with Mambaforge and Python   \n",
+       "9   Managing Scientific Python environments using ...   \n",
+       "29                        BioImage Analysis Notebooks   \n",
+       "\n",
+       "                                             Keywords  \\\n",
+       "1   Python * Artificial Intelligence * Bioimage An...   \n",
+       "2                                      OMERO * Python   \n",
+       "3                              Python * Conda * Mamba   \n",
+       "9                              Python * Conda * Mamba   \n",
+       "29                         Python * Bioimage Analysis   \n",
+       "\n",
+       "                                                 Link  \\\n",
+       "1   https://biapol.github.io/blog/marcelo_zoccoler...   \n",
+       "2   https://biapol.github.io/blog/robert_haase/bro...   \n",
+       "3   https://biapol.github.io/blog/mara_lampert/get...   \n",
+       "9   https://focalplane.biologists.com/2022/12/08/m...   \n",
+       "29  https://haesleinhuepf.github.io/BioImageAnalys...   \n",
+       "\n",
+       "                     License Description PublicationDate  \\\n",
+       "1                  CC-BY-4.0         NaN             NaN   \n",
+       "2                  CC-BY-4.0         NaN             NaN   \n",
+       "3                  CC-BY-4.0         NaN             NaN   \n",
+       "9                  CC-BY-4.0         NaN             NaN   \n",
+       "29  CC-BY-4.0 * BSD-3-Clause         NaN             NaN   \n",
+       "\n",
+       "    LearningResourceType    MediaType              Authors Community  \n",
+       "1               Web Page         text    Zoccoler, Marcelo      None  \n",
+       "2               Web Page         text        Haase, Robert      None  \n",
+       "3               Web Page         text        Lampert, Mara      None  \n",
+       "9               Web Page         text        Haase, Robert      None  \n",
+       "29  Book * Code Notebook  text * code  Robert Haase et al.      None  "
+      ]
+     },
+     "execution_count": 14,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "def include_community(entry):\n",
+    "    \"\"\"Return 'NFDI4Bioimage' if the keywords of an entry contain that tag.\n",
+    "\n",
+    "    Args:\n",
+    "        entry: Keywords as a list of strings or as one string in which the\n",
+    "            keywords are separated by '*'.\n",
+    "\n",
+    "    Returns:\n",
+    "        'NFDI4Bioimage' if any keyword equals 'nfdi4bioimage' (case-insensitive,\n",
+    "        surrounding whitespace ignored), otherwise None.\n",
+    "    \"\"\"\n",
+    "    if isinstance(entry, str):\n",
+    "        # A delimited string has to be split first, otherwise only entries whose\n",
+    "        # single keyword is the tag itself would match\n",
+    "        entry = entry.split('*')\n",
+    "    if isinstance(entry, list):\n",
+    "        if any(e.strip().lower() == 'nfdi4bioimage' for e in entry if isinstance(e, str)):\n",
+    "            return 'NFDI4Bioimage'\n",
+    "    return None\n",
+    "\n",
+    "\n",
+    "# Apply the function to the Keywords column and store the result as Community\n",
+    "data['Community'] = data['Keywords'].apply(include_community)\n",
+    "data.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "78b5ec11-6e2a-4b6d-8ff3-faa58325b232",
+   "metadata": {},
+   "source": [
+    "#### 8. Introduce the **FileFormat** column from the files of linked Zenodo records, with a fallback based on the MediaType"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "89a3f72f-e614-4fe3-afc7-fc22345e104e",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "import requests\n",
+    "import re\n",
+    "import time\n",
+    "\n",
+    "# Function to extract record ID from a Zenodo link\n",
+    "def extract_zenodo_record_id(url):\n",
+    "    \"\"\"Return the record ID contained in a Zenodo record URL, or None.\n",
+    "\n",
+    "    Args:\n",
+    "        url: The link to inspect; anything that is not a string yields None.\n",
+    "\n",
+    "    Returns:\n",
+    "        The numeric record ID as a string, or None if `url` is not a Zenodo\n",
+    "        record link.\n",
+    "    \"\"\"\n",
+    "    if not isinstance(url, str):\n",
+    "        return None\n",
+    "    # Dots are escaped so that only the real host name matches; 'records?' also\n",
+    "    # accepts the older '/record/<id>' URL form\n",
+    "    match = re.search(r\"https?://(?:www\\.)?zenodo\\.org/records?/(\\d+)\", url)\n",
+    "    return match.group(1) if match else None\n",
+    "\n",
+    "# Function to fetch file formats from Zenodo using the record ID\n",
+    "def fetch_file_formats(record_id, timeout=30):\n",
+    "    \"\"\"Look up the file extensions of a Zenodo record.\n",
+    "\n",
+    "    Args:\n",
+    "        record_id: Zenodo record ID; a falsy value skips the lookup.\n",
+    "        timeout: Seconds to wait for the Zenodo API before giving up.\n",
+    "\n",
+    "    Returns:\n",
+    "        The sorted, lower-cased file extensions joined with ' * ', or None if\n",
+    "        there is no record ID, no file name with an extension, or the request\n",
+    "        failed.\n",
+    "    \"\"\"\n",
+    "    if not record_id:\n",
+    "        return None\n",
+    "    api_url = f\"https://zenodo.org/api/records/{record_id}\"\n",
+    "    try:\n",
+    "        time.sleep(1)  # Add a 1-second delay between requests\n",
+    "        # Without a timeout a stalled connection would block the notebook indefinitely\n",
+    "        response = requests.get(api_url, timeout=timeout)\n",
+    "        response.raise_for_status()  # Raise an error for non-2xx responses\n",
+    "        record = response.json()\n",
+    "        # The extension is the part of the file name after the last dot\n",
+    "        file_types = {\n",
+    "            file[\"key\"].split(\".\")[-1].lower()\n",
+    "            for file in record.get(\"files\", [])\n",
+    "            if \".\" in file[\"key\"]\n",
+    "        }\n",
+    "        return \" * \".join(sorted(file_types)) if file_types else None\n",
+    "    except Exception as e:\n",
+    "        # Best effort: report the problem and let the caller continue\n",
+    "        print(f\"Error fetching file formats for record ID {record_id}: {e}\")\n",
+    "        return None\n",
+    "\n",
+    "# Function to process a single URL or a list of URLs\n",
+    "def process_links(link_input):\n",
+    "    \"\"\"Return the file formats of the first usable Zenodo link.\n",
+    "\n",
+    "    Args:\n",
+    "        link_input: One URL string or a list of URL strings.\n",
+    "\n",
+    "    Returns:\n",
+    "        The result of fetch_file_formats for the first link that contains a\n",
+    "        Zenodo record ID and yields a non-empty result, otherwise None.\n",
+    "    \"\"\"\n",
+    "    if isinstance(link_input, str):\n",
+    "        candidates = (link_input,)\n",
+    "    elif isinstance(link_input, list):\n",
+    "        # Generator: each list entry is stripped only when it is actually reached\n",
+    "        candidates = (link.strip() for link in link_input)\n",
+    "    else:\n",
+    "        candidates = ()\n",
+    "\n",
+    "    for candidate in candidates:\n",
+    "        record_id = extract_zenodo_record_id(candidate)\n",
+    "        if not record_id:\n",
+    "            continue\n",
+    "        file_format = fetch_file_formats(record_id)\n",
+    "        if file_format:  # Stop at the first valid result\n",
+    "            return file_format\n",
+    "    return None  # No link produced any file formats\n",
+    "\n",
+    "# Process the DataFrame; links to Zenodo records trigger API requests, each\n",
+    "# preceded by a 1-second pause, so this cell can take a while\n",
+    "data[\"FileFormat\"] = data[\"Link\"].apply(process_links)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "74d9702a-00cb-4408-8815-c26fd9a4fdee",
+   "metadata": {},
+   "source": [
+    "Additionally, map the MediaType column to a default file format wherever FileFormat was not filled in the previous step (this only works for the `audio` and `video` MediaTypes)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "965d0a41-9762-47cb-8bac-7042d35960c8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def map_file_format(media_type, file_format):\n",
+    "    \"\"\"Fill in a default file format for audio and video entries.\n",
+    "\n",
+    "    Args:\n",
+    "        media_type: MediaType value of the entry.\n",
+    "        file_format: FileFormat found so far; may be None, NaN or blank.\n",
+    "\n",
+    "    Returns:\n",
+    "        `file_format` if it is a non-blank string, otherwise '.mp3' for\n",
+    "        'audio', '.mp4' for 'video' and '' for every other media type.\n",
+    "    \"\"\"\n",
+    "    # Only a non-blank string counts as an existing entry; checking the type\n",
+    "    # first keeps NaN (a float) from failing on .strip()\n",
+    "    if isinstance(file_format, str) and file_format.strip() != \"\":\n",
+    "        return file_format\n",
+    "    # Map media types to specific file formats\n",
+    "    # NOTE(review): these defaults carry a leading dot, the extensions taken from\n",
+    "    # Zenodo file names do not - confirm which form DALIA expects\n",
+    "    if media_type == \"audio\":\n",
+    "        return \".mp3\"\n",
+    "    elif media_type == \"video\":\n",
+    "        return \".mp4\"\n",
+    "    else:\n",
+    "        return \"\"  # Return empty string if no mapping is needed\n",
+    "\n",
+    "# Apply the mapping function row by row; it receives the MediaType and the\n",
+    "# current FileFormat of each row and its result overwrites FileFormat\n",
+    "data[\"FileFormat\"] = data.apply(\n",
+    "    lambda row: map_file_format(row[\"MediaType\"], row[\"FileFormat\"]),\n",
+    "    axis=1\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "461e7dc1-7572-4664-887d-ec36f4ed2656",
+   "metadata": {},
+   "source": [
+    "Now also correct the format of the Link column:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "0de3c9ba-a0b8-434d-bd79-896ad87cf1c1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Entries with several links hold them as a list: join those with the \" * \"\n",
+    "# delimiter; values that are not lists are kept unchanged\n",
+    "data[\"Link\"] = data[\"Link\"].apply(lambda x: ' * '.join(x) if isinstance(x, list) else x)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "22a0d7cc-e4d7-4c77-807c-662fb44ffbe0",
+   "metadata": {},
+   "source": [
+    "#### 9. Extract the Language of each Entry\n",
+    "This is done using the [xlm-roberta-base-language-detection](https://huggingface.co/papluca/xlm-roberta-base-language-detection) model via the transformers package pipeline."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "5f3abf90-990f-4ccc-8da3-05bb26e6538e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from transformers import pipeline\n",
+    "\n",
+    "# Hugging Face checkpoint of the language-detection model\n",
+    "model_ckpt = \"papluca/xlm-roberta-base-language-detection\"\n",
+    "# Build the text-classification pipeline once; detect_language() reuses it on every call\n",
+    "pipe = pipeline(\"text-classification\", model=model_ckpt)\n",
+    "\n",
+    "def detect_language(text):\n",
+    "    \"\"\"Return 'en' or 'de' for the language detected in `text`, else ''.\n",
+    "\n",
+    "    Args:\n",
+    "        text: The text to classify.\n",
+    "\n",
+    "    Returns:\n",
+    "        'en' or 'de' if that is the top prediction of the model, otherwise ''.\n",
+    "        Missing or blank input yields '' without calling the model.\n",
+    "    \"\"\"\n",
+    "    # NaN/None or blank text carries no language information\n",
+    "    if not isinstance(text, str) or not text.strip():\n",
+    "        return \"\"\n",
+    "    # One input with top_k=1: [0][0] is the single best prediction for it\n",
+    "    lang = pipe([text], top_k=1, truncation=True)[0][0][\"label\"]\n",
+    "    return lang if lang in [\"en\", \"de\"] else \"\"\n",
+    "\n",
+    "# The language is detected from the title only\n",
+    "data[\"Language\"] = data[\"Title\"].apply(detect_language)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0cd61d11-5907-43ba-968c-bb851d003631",
+   "metadata": {},
+   "source": [
+    "### Export the data to a csv that now fits the DALIA Format"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "63071e24-8d4e-4885-ae78-74669bbe5557",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Exported 330 rows.\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Save the converted table as CSV; index=False keeps the row index out of the file\n",
+    "data.to_csv(destination, index=False)\n",
+    "\n",
+    "# Report how many rows were written\n",
+    "num_rows = data.shape[0]\n",
+    "print(f\"Exported {num_rows} rows.\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

	authors	name	tags	type	url	license	event_date	event_location	description	num_downloads	publication_date	fingerprint	author	submission_date
0	[Elisabeth Kugler]	Sharing Your Poster on Figshare: A Community G...	[Sharing, Research Data Management]	[Blog]	https://focalplane.biologists.com/2023/07/26/s...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
1	[Marcelo Zoccoler]	Running Deep-Learning Scripts in the BiA-PoL O...	[Python, Artificial Intelligence, Bioimage Ana...	[Blog]	https://biapol.github.io/blog/marcelo_zoccoler...	CC-BY-4.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
2	[Robert Haase]	Browsing the Open Microscopy Image Data Resour...	[OMERO, Python]	[Blog]	https://biapol.github.io/blog/robert_haase/bro...	CC-BY-4.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
3	[Mara Lampert]	Getting started with Mambaforge and Python	[Python, Conda, Mamba]	[Blog]	https://biapol.github.io/blog/mara_lampert/get...	CC-BY-4.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
4	[Jennifer Waters]	Promoting Data Management at the Nikon Imaging...	[Research Data Management]	[Blog]	https://datamanagement.hms.harvard.edu/news/pr...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
	authors	name	tags	type	url	license	event_date	event_location	description	num_downloads	publication_date	fingerprint	author	submission_date
394	NaN	Virtual-I2K-2024-multiview-stitcher	[Big Data, Bioimageanalysis]	[Github Repository, Tutorial]	[https://github.com/m-albert/Virtual-I2K-2024-...	BSD-3-CLAUSE	NaN	NaN	Repository accompanying the multiview-stitcher...	NaN	2024-10-30T07:38:11+00:00	NaN	Marvin Albert	NaN
397	NaN	Prompt-Engineering-LLMs-Course	[Llms, Prompt Engineering, Code Generation]	[Github Repository, Tutorial]	https://github.com/HelmholtzAI-Consultants-Mun...	MIT	NaN	NaN		NaN	2024-09-11T07:45:30+00:00	NaN	Isra Mekki	NaN
	authors	name	tags	type	url	license	event_date	event_location	description	num_downloads	publication_date	fingerprint	author	submission_date
1	[Marcelo Zoccoler]	Running Deep-Learning Scripts in the BiA-PoL O...	Python * Artificial Intelligence * Bioimage An...	[Blog]	https://biapol.github.io/blog/marcelo_zoccoler...	CC-BY-4.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
2	[Robert Haase]	Browsing the Open Microscopy Image Data Resour...	OMERO * Python	[Blog]	https://biapol.github.io/blog/robert_haase/bro...	CC-BY-4.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
3	[Mara Lampert]	Getting started with Mambaforge and Python	Python * Conda * Mamba	[Blog]	https://biapol.github.io/blog/mara_lampert/get...	CC-BY-4.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
9	[Robert Haase]	Managing Scientific Python environments using ...	Python * Conda * Mamba	[Blog]	https://focalplane.biologists.com/2022/12/08/m...	CC-BY-4.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
29	[Robert Haase et al.]	BioImage Analysis Notebooks	Python * Bioimage Analysis	[Book, Notebook]	https://haesleinhuepf.github.io/BioImageAnalys...	CC-BY-4.0 * BSD-3-CLAUSE	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN