From 0e5a6f0b15e3db2635f106f76694f9d3f46435c8 Mon Sep 17 00:00:00 2001
From: Lea Gihlein <85543649+lea-33@users.noreply.github.com>
Date: Wed, 29 Jan 2025 13:09:20 +0100
Subject: [PATCH] Add Export_to_DALIA.ipynb
---
scripts/Export_to_DALIA.ipynb | 2252 +++++++++++++++++++++++++++++++++
1 file changed, 2252 insertions(+)
create mode 100644 scripts/Export_to_DALIA.ipynb
diff --git a/scripts/Export_to_DALIA.ipynb b/scripts/Export_to_DALIA.ipynb
new file mode 100644
index 00000000..f8568ac0
--- /dev/null
+++ b/scripts/Export_to_DALIA.ipynb
@@ -0,0 +1,2252 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "7a19e5fa-6f3a-4a8a-9244-6bf9fdebad76",
+ "metadata": {},
+ "source": [
+ "### Test conversion of YAML to the DALIA format"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2e055672-a937-4e46-926e-fdf6c527d628",
+ "metadata": {},
+ "source": [
+ "#### Load the YAML data as a pandas DataFrame"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "7396751e-9b56-4bf6-bc35-e6e38f6c108c",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " authors | \n",
+ " name | \n",
+ " tags | \n",
+ " type | \n",
+ " url | \n",
+ " license | \n",
+ " event_date | \n",
+ " event_location | \n",
+ " description | \n",
+ " num_downloads | \n",
+ " publication_date | \n",
+ " fingerprint | \n",
+ " author | \n",
+ " submission_date | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " [Elisabeth Kugler] | \n",
+ " Sharing Your Poster on Figshare: A Community G... | \n",
+ " [Sharing, Research Data Management] | \n",
+ " [Blog] | \n",
+ " https://focalplane.biologists.com/2023/07/26/s... | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " [Marcelo Zoccoler] | \n",
+ " Running Deep-Learning Scripts in the BiA-PoL O... | \n",
+ " [Python, Artificial Intelligence, Bioimage Ana... | \n",
+ " [Blog] | \n",
+ " https://biapol.github.io/blog/marcelo_zoccoler... | \n",
+ " CC-BY-4.0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " [Robert Haase] | \n",
+ " Browsing the Open Microscopy Image Data Resour... | \n",
+ " [OMERO, Python] | \n",
+ " [Blog] | \n",
+ " https://biapol.github.io/blog/robert_haase/bro... | \n",
+ " CC-BY-4.0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " [Mara Lampert] | \n",
+ " Getting started with Mambaforge and Python | \n",
+ " [Python, Conda, Mamba] | \n",
+ " [Blog] | \n",
+ " https://biapol.github.io/blog/mara_lampert/get... | \n",
+ " CC-BY-4.0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " [Jennifer Waters] | \n",
+ " Promoting Data Management at the Nikon Imaging... | \n",
+ " [Research Data Management] | \n",
+ " [Blog] | \n",
+ " https://datamanagement.hms.harvard.edu/news/pr... | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " authors name \\\n",
+ "0 [Elisabeth Kugler] Sharing Your Poster on Figshare: A Community G... \n",
+ "1 [Marcelo Zoccoler] Running Deep-Learning Scripts in the BiA-PoL O... \n",
+ "2 [Robert Haase] Browsing the Open Microscopy Image Data Resour... \n",
+ "3 [Mara Lampert] Getting started with Mambaforge and Python \n",
+ "4 [Jennifer Waters] Promoting Data Management at the Nikon Imaging... \n",
+ "\n",
+ " tags type \\\n",
+ "0 [Sharing, Research Data Management] [Blog] \n",
+ "1 [Python, Artificial Intelligence, Bioimage Ana... [Blog] \n",
+ "2 [OMERO, Python] [Blog] \n",
+ "3 [Python, Conda, Mamba] [Blog] \n",
+ "4 [Research Data Management] [Blog] \n",
+ "\n",
+ " url license event_date \\\n",
+ "0 https://focalplane.biologists.com/2023/07/26/s... NaN NaN \n",
+ "1 https://biapol.github.io/blog/marcelo_zoccoler... CC-BY-4.0 NaN \n",
+ "2 https://biapol.github.io/blog/robert_haase/bro... CC-BY-4.0 NaN \n",
+ "3 https://biapol.github.io/blog/mara_lampert/get... CC-BY-4.0 NaN \n",
+ "4 https://datamanagement.hms.harvard.edu/news/pr... NaN NaN \n",
+ "\n",
+ " event_location description num_downloads publication_date fingerprint \\\n",
+ "0 NaN NaN NaN NaN NaN \n",
+ "1 NaN NaN NaN NaN NaN \n",
+ "2 NaN NaN NaN NaN NaN \n",
+ "3 NaN NaN NaN NaN NaN \n",
+ "4 NaN NaN NaN NaN NaN \n",
+ "\n",
+ " author submission_date \n",
+ "0 NaN NaN \n",
+ "1 NaN NaN \n",
+ "2 NaN NaN \n",
+ "3 NaN NaN \n",
+ "4 NaN NaN "
+ ]
+ },
+ "execution_count": 1,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "\n",
+ "from generate_link_lists import load_dataframe\n",
+ "\n",
+ "# Input location handed to load_dataframe\n",
+ "source = \"../resources/\"\n",
+ "# Target path for the exported CSV file\n",
+ "destination = '../docs/export/DALIA_training_materials.csv'\n",
+ "\n",
+ "df = load_dataframe(source)\n",
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c65fd523-cffc-4f20-9ed4-1012d4a33b1b",
+ "metadata": {},
+ "source": [
+ "#### 1. For entries that have an author column, copy its value into the authors column"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "f2bf12bd-56dc-49b4-902d-e68050d715ce",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " authors | \n",
+ " name | \n",
+ " tags | \n",
+ " type | \n",
+ " url | \n",
+ " license | \n",
+ " event_date | \n",
+ " event_location | \n",
+ " description | \n",
+ " num_downloads | \n",
+ " publication_date | \n",
+ " fingerprint | \n",
+ " author | \n",
+ " submission_date | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 394 | \n",
+ " NaN | \n",
+ " Virtual-I2K-2024-multiview-stitcher | \n",
+ " [Big Data, Bioimageanalysis] | \n",
+ " [Github Repository, Tutorial] | \n",
+ " [https://github.com/m-albert/Virtual-I2K-2024-... | \n",
+ " BSD-3-CLAUSE | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " Repository accompanying the multiview-stitcher... | \n",
+ " NaN | \n",
+ " 2024-10-30T07:38:11+00:00 | \n",
+ " NaN | \n",
+ " Marvin Albert | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 397 | \n",
+ " NaN | \n",
+ " Prompt-Engineering-LLMs-Course | \n",
+ " [Llms, Prompt Engineering, Code Generation] | \n",
+ " [Github Repository, Tutorial] | \n",
+ " https://github.com/HelmholtzAI-Consultants-Mun... | \n",
+ " MIT | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " | \n",
+ " NaN | \n",
+ " 2024-09-11T07:45:30+00:00 | \n",
+ " NaN | \n",
+ " Isra Mekki | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " authors name \\\n",
+ "394 NaN Virtual-I2K-2024-multiview-stitcher \n",
+ "397 NaN Prompt-Engineering-LLMs-Course \n",
+ "\n",
+ " tags \\\n",
+ "394 [Big Data, Bioimageanalysis] \n",
+ "397 [Llms, Prompt Engineering, Code Generation] \n",
+ "\n",
+ " type \\\n",
+ "394 [Github Repository, Tutorial] \n",
+ "397 [Github Repository, Tutorial] \n",
+ "\n",
+ " url license \\\n",
+ "394 [https://github.com/m-albert/Virtual-I2K-2024-... BSD-3-CLAUSE \n",
+ "397 https://github.com/HelmholtzAI-Consultants-Mun... MIT \n",
+ "\n",
+ " event_date event_location \\\n",
+ "394 NaN NaN \n",
+ "397 NaN NaN \n",
+ "\n",
+ " description num_downloads \\\n",
+ "394 Repository accompanying the multiview-stitcher... NaN \n",
+ "397 NaN \n",
+ "\n",
+ " publication_date fingerprint author submission_date \n",
+ "394 2024-10-30T07:38:11+00:00 NaN Marvin Albert NaN \n",
+ "397 2024-09-11T07:45:30+00:00 NaN Isra Mekki NaN "
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Show the rows that carry a value in the singular 'author' column\n",
+ "df.loc[df['author'].notna()]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "1371bc6c-23d9-46db-857c-41dd73e861c2",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " authors | \n",
+ " name | \n",
+ " tags | \n",
+ " type | \n",
+ " url | \n",
+ " license | \n",
+ " event_date | \n",
+ " event_location | \n",
+ " description | \n",
+ " num_downloads | \n",
+ " publication_date | \n",
+ " fingerprint | \n",
+ " author | \n",
+ " submission_date | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 394 | \n",
+ " Marvin Albert | \n",
+ " Virtual-I2K-2024-multiview-stitcher | \n",
+ " [Big Data, Bioimageanalysis] | \n",
+ " [Github Repository, Tutorial] | \n",
+ " [https://github.com/m-albert/Virtual-I2K-2024-... | \n",
+ " BSD-3-CLAUSE | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " Repository accompanying the multiview-stitcher... | \n",
+ " NaN | \n",
+ " 2024-10-30T07:38:11+00:00 | \n",
+ " NaN | \n",
+ " Marvin Albert | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 397 | \n",
+ " Isra Mekki | \n",
+ " Prompt-Engineering-LLMs-Course | \n",
+ " [Llms, Prompt Engineering, Code Generation] | \n",
+ " [Github Repository, Tutorial] | \n",
+ " https://github.com/HelmholtzAI-Consultants-Mun... | \n",
+ " MIT | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " | \n",
+ " NaN | \n",
+ " 2024-09-11T07:45:30+00:00 | \n",
+ " NaN | \n",
+ " Isra Mekki | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " authors name \\\n",
+ "394 Marvin Albert Virtual-I2K-2024-multiview-stitcher \n",
+ "397 Isra Mekki Prompt-Engineering-LLMs-Course \n",
+ "\n",
+ " tags \\\n",
+ "394 [Big Data, Bioimageanalysis] \n",
+ "397 [Llms, Prompt Engineering, Code Generation] \n",
+ "\n",
+ " type \\\n",
+ "394 [Github Repository, Tutorial] \n",
+ "397 [Github Repository, Tutorial] \n",
+ "\n",
+ " url license \\\n",
+ "394 [https://github.com/m-albert/Virtual-I2K-2024-... BSD-3-CLAUSE \n",
+ "397 https://github.com/HelmholtzAI-Consultants-Mun... MIT \n",
+ "\n",
+ " event_date event_location \\\n",
+ "394 NaN NaN \n",
+ "397 NaN NaN \n",
+ "\n",
+ " description num_downloads \\\n",
+ "394 Repository accompanying the multiview-stitcher... NaN \n",
+ "397 NaN \n",
+ "\n",
+ " publication_date fingerprint author submission_date \n",
+ "394 2024-10-30T07:38:11+00:00 NaN Marvin Albert NaN \n",
+ "397 2024-09-11T07:45:30+00:00 NaN Isra Mekki NaN "
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Copy the singular 'author' value into 'authors' wherever 'author' is set\n",
+ "has_author = df['author'].notna()\n",
+ "df.loc[has_author, 'authors'] = df.loc[has_author, 'author']\n",
+ "\n",
+ "df[has_author]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "51bdf851-201b-4c7d-8f9e-49ba5fb00eac",
+ "metadata": {},
+ "source": [
+ "#### 2. Exclude entries without mandatory attributes (License, Authors, Title, Link)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "11c39326-61e1-422d-99df-240f4b9b5c86",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " authors | \n",
+ " name | \n",
+ " tags | \n",
+ " type | \n",
+ " url | \n",
+ " license | \n",
+ " event_date | \n",
+ " event_location | \n",
+ " description | \n",
+ " num_downloads | \n",
+ " publication_date | \n",
+ " fingerprint | \n",
+ " author | \n",
+ " submission_date | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 | \n",
+ " [Marcelo Zoccoler] | \n",
+ " Running Deep-Learning Scripts in the BiA-PoL O... | \n",
+ " [Python, Artificial Intelligence, Bioimage Ana... | \n",
+ " [Blog] | \n",
+ " https://biapol.github.io/blog/marcelo_zoccoler... | \n",
+ " CC-BY-4.0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " [Robert Haase] | \n",
+ " Browsing the Open Microscopy Image Data Resour... | \n",
+ " [OMERO, Python] | \n",
+ " [Blog] | \n",
+ " https://biapol.github.io/blog/robert_haase/bro... | \n",
+ " CC-BY-4.0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " [Mara Lampert] | \n",
+ " Getting started with Mambaforge and Python | \n",
+ " [Python, Conda, Mamba] | \n",
+ " [Blog] | \n",
+ " https://biapol.github.io/blog/mara_lampert/get... | \n",
+ " CC-BY-4.0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " [Robert Haase] | \n",
+ " Managing Scientific Python environments using ... | \n",
+ " [Python, Conda, Mamba] | \n",
+ " [Blog] | \n",
+ " https://focalplane.biologists.com/2022/12/08/m... | \n",
+ " CC-BY-4.0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 29 | \n",
+ " [Robert Haase et al.] | \n",
+ " BioImage Analysis Notebooks | \n",
+ " [Python, Bioimage Analysis] | \n",
+ " [Book, Notebook] | \n",
+ " https://haesleinhuepf.github.io/BioImageAnalys... | \n",
+ " [CC-BY-4.0, BSD-3-CLAUSE] | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " authors name \\\n",
+ "1 [Marcelo Zoccoler] Running Deep-Learning Scripts in the BiA-PoL O... \n",
+ "2 [Robert Haase] Browsing the Open Microscopy Image Data Resour... \n",
+ "3 [Mara Lampert] Getting started with Mambaforge and Python \n",
+ "9 [Robert Haase] Managing Scientific Python environments using ... \n",
+ "29 [Robert Haase et al.] BioImage Analysis Notebooks \n",
+ "\n",
+ " tags type \\\n",
+ "1 [Python, Artificial Intelligence, Bioimage Ana... [Blog] \n",
+ "2 [OMERO, Python] [Blog] \n",
+ "3 [Python, Conda, Mamba] [Blog] \n",
+ "9 [Python, Conda, Mamba] [Blog] \n",
+ "29 [Python, Bioimage Analysis] [Book, Notebook] \n",
+ "\n",
+ " url \\\n",
+ "1 https://biapol.github.io/blog/marcelo_zoccoler... \n",
+ "2 https://biapol.github.io/blog/robert_haase/bro... \n",
+ "3 https://biapol.github.io/blog/mara_lampert/get... \n",
+ "9 https://focalplane.biologists.com/2022/12/08/m... \n",
+ "29 https://haesleinhuepf.github.io/BioImageAnalys... \n",
+ "\n",
+ " license event_date event_location description \\\n",
+ "1 CC-BY-4.0 NaN NaN NaN \n",
+ "2 CC-BY-4.0 NaN NaN NaN \n",
+ "3 CC-BY-4.0 NaN NaN NaN \n",
+ "9 CC-BY-4.0 NaN NaN NaN \n",
+ "29 [CC-BY-4.0, BSD-3-CLAUSE] NaN NaN NaN \n",
+ "\n",
+ " num_downloads publication_date fingerprint author submission_date \n",
+ "1 NaN NaN NaN NaN NaN \n",
+ "2 NaN NaN NaN NaN NaN \n",
+ "3 NaN NaN NaN NaN NaN \n",
+ "9 NaN NaN NaN NaN NaN \n",
+ "29 NaN NaN NaN NaN NaN "
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data = df[~df['license'].str.lower().isin(['unknown']) & df['license'].notna() & df['authors'].notna() & df['name'].notna() & df['url'].notna()].copy()  # .copy(): later cells assign columns on data; an independent frame avoids SettingWithCopyWarning\n",
+ "data.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "5e708904-0161-4fb6-8bf8-c2f6dc3dbbea",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Total number of entries found: 536\n",
+ "Number of entries found with all mandatory entries: 330\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(f'Total number of entries found: {df.shape[0]}')\n",
+ "print(f'Number of entries found with all mandatory entries: {data.shape[0]}')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "8ce34a4c-f14f-40b0-8254-a4234d1f9d23",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " authors | \n",
+ " name | \n",
+ " tags | \n",
+ " type | \n",
+ " url | \n",
+ " license | \n",
+ " event_date | \n",
+ " event_location | \n",
+ " description | \n",
+ " num_downloads | \n",
+ " publication_date | \n",
+ " fingerprint | \n",
+ " author | \n",
+ " submission_date | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 | \n",
+ " [Marcelo Zoccoler] | \n",
+ " Running Deep-Learning Scripts in the BiA-PoL O... | \n",
+ " [Python, Artificial Intelligence, Bioimage Ana... | \n",
+ " [Blog] | \n",
+ " https://biapol.github.io/blog/marcelo_zoccoler... | \n",
+ " CC-BY-4.0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " [Robert Haase] | \n",
+ " Browsing the Open Microscopy Image Data Resour... | \n",
+ " [OMERO, Python] | \n",
+ " [Blog] | \n",
+ " https://biapol.github.io/blog/robert_haase/bro... | \n",
+ " CC-BY-4.0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " [Mara Lampert] | \n",
+ " Getting started with Mambaforge and Python | \n",
+ " [Python, Conda, Mamba] | \n",
+ " [Blog] | \n",
+ " https://biapol.github.io/blog/mara_lampert/get... | \n",
+ " CC-BY-4.0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " [Robert Haase] | \n",
+ " Managing Scientific Python environments using ... | \n",
+ " [Python, Conda, Mamba] | \n",
+ " [Blog] | \n",
+ " https://focalplane.biologists.com/2022/12/08/m... | \n",
+ " CC-BY-4.0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 29 | \n",
+ " [Robert Haase et al.] | \n",
+ " BioImage Analysis Notebooks | \n",
+ " [Python, Bioimage Analysis] | \n",
+ " [Book, Notebook] | \n",
+ " https://haesleinhuepf.github.io/BioImageAnalys... | \n",
+ " [CC-BY-4.0, BSD-3-CLAUSE] | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " authors name \\\n",
+ "1 [Marcelo Zoccoler] Running Deep-Learning Scripts in the BiA-PoL O... \n",
+ "2 [Robert Haase] Browsing the Open Microscopy Image Data Resour... \n",
+ "3 [Mara Lampert] Getting started with Mambaforge and Python \n",
+ "9 [Robert Haase] Managing Scientific Python environments using ... \n",
+ "29 [Robert Haase et al.] BioImage Analysis Notebooks \n",
+ "\n",
+ " tags type \\\n",
+ "1 [Python, Artificial Intelligence, Bioimage Ana... [Blog] \n",
+ "2 [OMERO, Python] [Blog] \n",
+ "3 [Python, Conda, Mamba] [Blog] \n",
+ "9 [Python, Conda, Mamba] [Blog] \n",
+ "29 [Python, Bioimage Analysis] [Book, Notebook] \n",
+ "\n",
+ " url \\\n",
+ "1 https://biapol.github.io/blog/marcelo_zoccoler... \n",
+ "2 https://biapol.github.io/blog/robert_haase/bro... \n",
+ "3 https://biapol.github.io/blog/mara_lampert/get... \n",
+ "9 https://focalplane.biologists.com/2022/12/08/m... \n",
+ "29 https://haesleinhuepf.github.io/BioImageAnalys... \n",
+ "\n",
+ " license event_date event_location description \\\n",
+ "1 CC-BY-4.0 NaN NaN NaN \n",
+ "2 CC-BY-4.0 NaN NaN NaN \n",
+ "3 CC-BY-4.0 NaN NaN NaN \n",
+ "9 CC-BY-4.0 NaN NaN NaN \n",
+ "29 [CC-BY-4.0, BSD-3-CLAUSE] NaN NaN NaN \n",
+ "\n",
+ " num_downloads publication_date fingerprint author submission_date \n",
+ "1 NaN NaN NaN NaN NaN \n",
+ "2 NaN NaN NaN NaN NaN \n",
+ "3 NaN NaN NaN NaN NaN \n",
+ "9 NaN NaN NaN NaN NaN \n",
+ "29 NaN NaN NaN NaN NaN "
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1b18efe8-8833-478a-9971-8ec727580fa1",
+ "metadata": {},
+ "source": [
+ "#### 3. Change the format of the **Tags** and **License** columns to fit the DALIA format"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "5318cd0d-7b81-47df-bab6-e64a8afbf9a2",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/tmp/ipykernel_12251/210055857.py:1: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " data[\"tags\"] = data[\"tags\"].apply(lambda x: ' * '.join(x) if isinstance(x, list) else x) #Tags\n",
+ "/tmp/ipykernel_12251/210055857.py:2: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " data[\"license\"] = data[\"license\"].apply(lambda x: ' * '.join(x) if isinstance(x, list) else x) #License\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " authors | \n",
+ " name | \n",
+ " tags | \n",
+ " type | \n",
+ " url | \n",
+ " license | \n",
+ " event_date | \n",
+ " event_location | \n",
+ " description | \n",
+ " num_downloads | \n",
+ " publication_date | \n",
+ " fingerprint | \n",
+ " author | \n",
+ " submission_date | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 | \n",
+ " [Marcelo Zoccoler] | \n",
+ " Running Deep-Learning Scripts in the BiA-PoL O... | \n",
+ " Python * Artificial Intelligence * Bioimage An... | \n",
+ " [Blog] | \n",
+ " https://biapol.github.io/blog/marcelo_zoccoler... | \n",
+ " CC-BY-4.0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " [Robert Haase] | \n",
+ " Browsing the Open Microscopy Image Data Resour... | \n",
+ " OMERO * Python | \n",
+ " [Blog] | \n",
+ " https://biapol.github.io/blog/robert_haase/bro... | \n",
+ " CC-BY-4.0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " [Mara Lampert] | \n",
+ " Getting started with Mambaforge and Python | \n",
+ " Python * Conda * Mamba | \n",
+ " [Blog] | \n",
+ " https://biapol.github.io/blog/mara_lampert/get... | \n",
+ " CC-BY-4.0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " [Robert Haase] | \n",
+ " Managing Scientific Python environments using ... | \n",
+ " Python * Conda * Mamba | \n",
+ " [Blog] | \n",
+ " https://focalplane.biologists.com/2022/12/08/m... | \n",
+ " CC-BY-4.0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 29 | \n",
+ " [Robert Haase et al.] | \n",
+ " BioImage Analysis Notebooks | \n",
+ " Python * Bioimage Analysis | \n",
+ " [Book, Notebook] | \n",
+ " https://haesleinhuepf.github.io/BioImageAnalys... | \n",
+ " CC-BY-4.0 * BSD-3-CLAUSE | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " authors name \\\n",
+ "1 [Marcelo Zoccoler] Running Deep-Learning Scripts in the BiA-PoL O... \n",
+ "2 [Robert Haase] Browsing the Open Microscopy Image Data Resour... \n",
+ "3 [Mara Lampert] Getting started with Mambaforge and Python \n",
+ "9 [Robert Haase] Managing Scientific Python environments using ... \n",
+ "29 [Robert Haase et al.] BioImage Analysis Notebooks \n",
+ "\n",
+ " tags type \\\n",
+ "1 Python * Artificial Intelligence * Bioimage An... [Blog] \n",
+ "2 OMERO * Python [Blog] \n",
+ "3 Python * Conda * Mamba [Blog] \n",
+ "9 Python * Conda * Mamba [Blog] \n",
+ "29 Python * Bioimage Analysis [Book, Notebook] \n",
+ "\n",
+ " url \\\n",
+ "1 https://biapol.github.io/blog/marcelo_zoccoler... \n",
+ "2 https://biapol.github.io/blog/robert_haase/bro... \n",
+ "3 https://biapol.github.io/blog/mara_lampert/get... \n",
+ "9 https://focalplane.biologists.com/2022/12/08/m... \n",
+ "29 https://haesleinhuepf.github.io/BioImageAnalys... \n",
+ "\n",
+ " license event_date event_location description \\\n",
+ "1 CC-BY-4.0 NaN NaN NaN \n",
+ "2 CC-BY-4.0 NaN NaN NaN \n",
+ "3 CC-BY-4.0 NaN NaN NaN \n",
+ "9 CC-BY-4.0 NaN NaN NaN \n",
+ "29 CC-BY-4.0 * BSD-3-CLAUSE NaN NaN NaN \n",
+ "\n",
+ " num_downloads publication_date fingerprint author submission_date \n",
+ "1 NaN NaN NaN NaN NaN \n",
+ "2 NaN NaN NaN NaN NaN \n",
+ "3 NaN NaN NaN NaN NaN \n",
+ "9 NaN NaN NaN NaN NaN \n",
+ "29 NaN NaN NaN NaN NaN "
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "for column in (\"tags\", \"license\"):  # list values become ' * '-separated strings, other values stay as they are\n",
+ "    data[column] = data[column].apply(lambda x: ' * '.join(x) if isinstance(x, list) else x)\n",
+ "data.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "ac322332-6c61-4764-b8c2-760c33518429",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/tmp/ipykernel_12251/2387137408.py:21: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " data[\"license\"] = data[\"license\"].replace(license_mapping)\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Map license spellings found in the data to the identifiers used for the export (exact-match replacement of whole values)\n",
+ "license_mapping = {\n",
+ " 'APACHE-2.0 LICENSE' : 'Apache-2.0',\n",
+ " 'CC0 1.0 UNIVERSAL' : 'CC0-1.0',\n",
+ " 'CC-BY-4.0 * BSD-3-CLAUSE' : 'CC-BY-4.0 * BSD-3-Clause',\n",
+ " 'CC0 (MOSTLY, BUT CAN DIFFER DEPENDING ON RESOURCE)' : 'CC0-1.0',\n",
+ " 'CCY-BY-SA-4.0' : 'CC-BY-SA-4.0',\n",
+ " 'YOUTTUBE STANDARD LICENSE' : 'YOUTUBE STANDARD LICENSE',\n",
+ " 'CC-BY-NC-SA' : 'CC-BY-NC-SA-4.0',\n",
+ " 'BSD3-CLAUSE' : 'BSD-3-Clause',\n",
+ " 'CC-ZERO' : 'CC0-1.0',\n",
+ " 'BSD 3-Clause \"New\" or \"Revised\" License' : 'BSD-3-Clause',\n",
+ " 'cc-by-4.0' : 'CC-BY-4.0',\n",
+ " 'Creative Commons Attribution Share Alike 4.0 International' : 'CC-BY-SA-4.0',\n",
+ " 'GNU General Public License v3.0' : 'GPL-3.0-only',\n",
+ " 'CC BY-NC-SA 4.0' : 'CC-BY-NC-SA-4.0',\n",
+ " 'BSD-3-CLAUSE' : 'BSD-3-Clause',\n",
+ " 'BSD-2-CLAUSE' : 'BSD-2-Clause',\n",
+ " 'APACHE-2.0' : 'Apache-2.0'\n",
+ "}\n",
+ "data[\"license\"] = data[\"license\"].replace(license_mapping)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "03fbeeb5-67d4-403c-a780-8757f738b9bb",
+ "metadata": {},
+ "source": [
+ "#### 4. Map the **Type** column to the **LearningResourceType** and **MediaType** columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "8a104889-190a-4504-af64-c5a019392ad3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Create Mapping for the Type Column:\n",
+ "type_to_learning_resource = {\n",
+ " \"Application\": \"Software Application\",\n",
+ " \"Big Data\": \"Data\",\n",
+ " \"Bioimage Analysis\": \"Other\",\n",
+ " \"Blog\": \"Web Page\",\n",
+ " \"Blog Post\": \"Text\",\n",
+ " \"Book\": \"Book\",\n",
+ " \"Book Chapter\": \"Book\",\n",
+ " \"Code\": None,\n",
+ " \"Collection\": \"Other\",\n",
+ " \"Conference Abstract\": \"Text\",\n",
+ " \"Data\": \"Data\",\n",
+ " \"Document\": \"Text\",\n",
+ " \"Documentation\": \"Text\",\n",
+ " \"Event\": \"Other\",\n",
+ " \"Forum Post\": \"Text\",\n",
+ " \"Github Repository\": \"Other\",\n",
+ " \"Jupyter Book\": \"Code Notebook\",\n",
+ " \"Notebook\": \"Code Notebook\",\n",
+ " \"Online Course\": \"Course\",\n",
+ " \"Online Tutorial\": \"Tutorial\",\n",
+ " \"Open Source Software\": \"Software Application\",\n",
+ " \"Poster\": \"Poster\",\n",
+ " \"Practicals\": \"Course\",\n",
+ " \"Preprint\": \"Text\",\n",
+ " \"Presentation\": \"Presentation\",\n",
+ " \"Publication\": \"Article\",\n",
+ " \"Python\": None,\n",
+ " \"Report\": \"Report\",\n",
+ " \"Slide\": \"Presentation\",\n",
+ " \"Slides\": \"Presentation\",\n",
+ " \"Tutorial\": \"Tutorial\",\n",
+ " \"Video\": None,\n",
+ " \"Videos\": None,\n",
+ " \"Website\": \"Web Page\",\n",
+ " \"Workshop\": \"Course\",\n",
+ " \"Youtube Channel\": \"Other\"\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "cd6e0ac8-2382-4a26-9e77-ee739081396f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "type_to_media_type = {\n",
+ " \"Application\": None,\n",
+ " \"Big Data\": None,\n",
+ " \"Bioimage Analysis\": None,\n",
+ " \"Blog\": \"text\",\n",
+ " \"Blog Post\": \"text\",\n",
+ " \"Book\": \"text\",\n",
+ " \"Book Chapter\": \"text\",\n",
+ " \"Code\": \"code\",\n",
+ " \"Collection\": None,\n",
+ " \"Conference Abstract\": \"text\",\n",
+ " \"Data\": None,\n",
+ " \"Document\": \"text\",\n",
+ " \"Documentation\": \"text\",\n",
+ " \"Event\": None,\n",
+ " \"Forum Post\": \"text\",\n",
+ " \"Github Repository\": None,\n",
+ " \"Jupyter Book\": \"code\",\n",
+ " \"Notebook\": \"code\",\n",
+ " \"Online Course\": None,\n",
+ " \"Online Tutorial\": None,\n",
+ " \"Open Source Software\": None,\n",
+ " \"Poster\": None,\n",
+ " \"Practicals\": None,\n",
+ " \"Preprint\": \"text\",\n",
+ " \"Presentation\": \"presentation\",\n",
+ " \"Publication\": \"text\",\n",
+ " \"Python\": None,\n",
+ " \"Report\": \"text\",\n",
+ " \"Slide\": \"presentation\",\n",
+ " \"Slides\": \"presentation\",\n",
+ " \"Tutorial\": None,\n",
+ " \"Video\": \"video\",\n",
+ " \"Videos\": \"video\",\n",
+ " \"Website\": None,\n",
+ " \"Workshop\": None,\n",
+ " \"Youtube Channel\": \"video\"\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "c32c15e5-2d12-4051-b238-44a94afcc5d1",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/tmp/ipykernel_12251/3151956629.py:30: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " data[\"LearningResourceType\"] = data[\"type\"].apply(map_learning_resource)\n",
+ "/tmp/ipykernel_12251/3151956629.py:31: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " data[\"MediaType\"] = data[\"type\"].apply(map_media_type)\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " authors | \n",
+ " name | \n",
+ " tags | \n",
+ " type | \n",
+ " url | \n",
+ " license | \n",
+ " event_date | \n",
+ " event_location | \n",
+ " description | \n",
+ " num_downloads | \n",
+ " publication_date | \n",
+ " fingerprint | \n",
+ " author | \n",
+ " submission_date | \n",
+ " LearningResourceType | \n",
+ " MediaType | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 | \n",
+ " [Marcelo Zoccoler] | \n",
+ " Running Deep-Learning Scripts in the BiA-PoL O... | \n",
+ " Python * Artificial Intelligence * Bioimage An... | \n",
+ " [Blog] | \n",
+ " https://biapol.github.io/blog/marcelo_zoccoler... | \n",
+ " CC-BY-4.0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " Web Page | \n",
+ " text | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " [Robert Haase] | \n",
+ " Browsing the Open Microscopy Image Data Resour... | \n",
+ " OMERO * Python | \n",
+ " [Blog] | \n",
+ " https://biapol.github.io/blog/robert_haase/bro... | \n",
+ " CC-BY-4.0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " Web Page | \n",
+ " text | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " [Mara Lampert] | \n",
+ " Getting started with Mambaforge and Python | \n",
+ " Python * Conda * Mamba | \n",
+ " [Blog] | \n",
+ " https://biapol.github.io/blog/mara_lampert/get... | \n",
+ " CC-BY-4.0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " Web Page | \n",
+ " text | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " [Robert Haase] | \n",
+ " Managing Scientific Python environments using ... | \n",
+ " Python * Conda * Mamba | \n",
+ " [Blog] | \n",
+ " https://focalplane.biologists.com/2022/12/08/m... | \n",
+ " CC-BY-4.0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " Web Page | \n",
+ " text | \n",
+ "
\n",
+ " \n",
+ " 29 | \n",
+ " [Robert Haase et al.] | \n",
+ " BioImage Analysis Notebooks | \n",
+ " Python * Bioimage Analysis | \n",
+ " [Book, Notebook] | \n",
+ " https://haesleinhuepf.github.io/BioImageAnalys... | \n",
+ " CC-BY-4.0 * BSD-3-Clause | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " Book * Code Notebook | \n",
+ " text * code | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " authors name \\\n",
+ "1 [Marcelo Zoccoler] Running Deep-Learning Scripts in the BiA-PoL O... \n",
+ "2 [Robert Haase] Browsing the Open Microscopy Image Data Resour... \n",
+ "3 [Mara Lampert] Getting started with Mambaforge and Python \n",
+ "9 [Robert Haase] Managing Scientific Python environments using ... \n",
+ "29 [Robert Haase et al.] BioImage Analysis Notebooks \n",
+ "\n",
+ " tags type \\\n",
+ "1 Python * Artificial Intelligence * Bioimage An... [Blog] \n",
+ "2 OMERO * Python [Blog] \n",
+ "3 Python * Conda * Mamba [Blog] \n",
+ "9 Python * Conda * Mamba [Blog] \n",
+ "29 Python * Bioimage Analysis [Book, Notebook] \n",
+ "\n",
+ " url \\\n",
+ "1 https://biapol.github.io/blog/marcelo_zoccoler... \n",
+ "2 https://biapol.github.io/blog/robert_haase/bro... \n",
+ "3 https://biapol.github.io/blog/mara_lampert/get... \n",
+ "9 https://focalplane.biologists.com/2022/12/08/m... \n",
+ "29 https://haesleinhuepf.github.io/BioImageAnalys... \n",
+ "\n",
+ " license event_date event_location description \\\n",
+ "1 CC-BY-4.0 NaN NaN NaN \n",
+ "2 CC-BY-4.0 NaN NaN NaN \n",
+ "3 CC-BY-4.0 NaN NaN NaN \n",
+ "9 CC-BY-4.0 NaN NaN NaN \n",
+ "29 CC-BY-4.0 * BSD-3-Clause NaN NaN NaN \n",
+ "\n",
+ " num_downloads publication_date fingerprint author submission_date \\\n",
+ "1 NaN NaN NaN NaN NaN \n",
+ "2 NaN NaN NaN NaN NaN \n",
+ "3 NaN NaN NaN NaN NaN \n",
+ "9 NaN NaN NaN NaN NaN \n",
+ "29 NaN NaN NaN NaN NaN \n",
+ "\n",
+ " LearningResourceType MediaType \n",
+ "1 Web Page text \n",
+ "2 Web Page text \n",
+ "3 Web Page text \n",
+ "9 Web Page text \n",
+ "29 Book * Code Notebook text * code "
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "def map_learning_resource(entry):\n",
+ "    \"\"\"Translate the resource type(s) of one entry into DALIA learning resource types.\n",
+ "\n",
+ "    Parameters\n",
+ "    ----------\n",
+ "    entry : list, str, None or NaN\n",
+ "        Value of the `type` column: a list of type names or a single type name.\n",
+ "\n",
+ "    Returns\n",
+ "    -------\n",
+ "    str\n",
+ "        The mapped learning resource types joined with \" * \". Types that are not\n",
+ "        in `type_to_learning_resource` or that are mapped to None are skipped,\n",
+ "        so the result can be an empty string.\n",
+ "    \"\"\"\n",
+ "    # Skip empty or NaN rows\n",
+ "    if entry is None or (isinstance(entry, float) and pd.isna(entry)):\n",
+ "        return \"\"\n",
+ "    items = entry if isinstance(entry, list) else [entry]\n",
+ "    # dict.fromkeys() removes duplicates like a set would, but keeps the order of\n",
+ "    # the input, so the joined string is the same on every run\n",
+ "    matches = dict.fromkeys(\n",
+ "        type_to_learning_resource[item]\n",
+ "        for item in items\n",
+ "        if item in type_to_learning_resource\n",
+ "    )\n",
+ "    return \" * \".join(m for m in matches if m is not None)\n",
+ "\n",
+ "def map_media_type(entry):\n",
+ "    \"\"\"Translate the resource type(s) of one entry into DALIA media types.\n",
+ "\n",
+ "    Parameters\n",
+ "    ----------\n",
+ "    entry : list, str, None or NaN\n",
+ "        Value of the `type` column: a list of type names or a single type name.\n",
+ "\n",
+ "    Returns\n",
+ "    -------\n",
+ "    str\n",
+ "        The mapped media types joined with \" * \". Types that are not in\n",
+ "        `type_to_media_type` or that are mapped to None are skipped, so the\n",
+ "        result can be an empty string.\n",
+ "    \"\"\"\n",
+ "    # Skip empty or NaN rows\n",
+ "    if entry is None or (isinstance(entry, float) and pd.isna(entry)):\n",
+ "        return \"\"\n",
+ "    items = entry if isinstance(entry, list) else [entry]\n",
+ "    # dict.fromkeys() removes duplicates like a set would, but keeps the order of\n",
+ "    # the input, so the joined string is the same on every run\n",
+ "    matches = dict.fromkeys(\n",
+ "        type_to_media_type[item]\n",
+ "        for item in items\n",
+ "        if item in type_to_media_type\n",
+ "    )\n",
+ "    return \" * \".join(m for m in matches if m is not None)\n",
+ "\n",
+ "# Apply the mapping functions.\n",
+ "# `assign` returns a new DataFrame instead of writing into `data` in place, which\n",
+ "# avoids the SettingWithCopyWarning pandas raises when `data` is a slice of\n",
+ "# another frame.\n",
+ "data = data.assign(\n",
+ "    LearningResourceType=data[\"type\"].apply(map_learning_resource),\n",
+ "    MediaType=data[\"type\"].apply(map_media_type),\n",
+ ")\n",
+ "\n",
+ "data.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "19f96ee9-e203-4b19-82b2-96a0e4088383",
+ "metadata": {},
+ "source": [
+ "#### 5. Change the author names to fit the DALIA format (for persons: surname, prename and for organizations: organization-name)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "827776ce-3be9-4b28-b664-687c7d4fc4ab",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/tmp/ipykernel_12251/970863209.py:37: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " data[\"Authors\"] = data[\"authors\"].apply(normalize_author_format)\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " authors | \n",
+ " name | \n",
+ " tags | \n",
+ " type | \n",
+ " url | \n",
+ " license | \n",
+ " event_date | \n",
+ " event_location | \n",
+ " description | \n",
+ " num_downloads | \n",
+ " publication_date | \n",
+ " fingerprint | \n",
+ " author | \n",
+ " submission_date | \n",
+ " LearningResourceType | \n",
+ " MediaType | \n",
+ " Authors | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 | \n",
+ " [Marcelo Zoccoler] | \n",
+ " Running Deep-Learning Scripts in the BiA-PoL O... | \n",
+ " Python * Artificial Intelligence * Bioimage An... | \n",
+ " [Blog] | \n",
+ " https://biapol.github.io/blog/marcelo_zoccoler... | \n",
+ " CC-BY-4.0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " Web Page | \n",
+ " text | \n",
+ " Zoccoler, Marcelo | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " [Robert Haase] | \n",
+ " Browsing the Open Microscopy Image Data Resour... | \n",
+ " OMERO * Python | \n",
+ " [Blog] | \n",
+ " https://biapol.github.io/blog/robert_haase/bro... | \n",
+ " CC-BY-4.0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " Web Page | \n",
+ " text | \n",
+ " Haase, Robert | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " [Mara Lampert] | \n",
+ " Getting started with Mambaforge and Python | \n",
+ " Python * Conda * Mamba | \n",
+ " [Blog] | \n",
+ " https://biapol.github.io/blog/mara_lampert/get... | \n",
+ " CC-BY-4.0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " Web Page | \n",
+ " text | \n",
+ " Lampert, Mara | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " [Robert Haase] | \n",
+ " Managing Scientific Python environments using ... | \n",
+ " Python * Conda * Mamba | \n",
+ " [Blog] | \n",
+ " https://focalplane.biologists.com/2022/12/08/m... | \n",
+ " CC-BY-4.0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " Web Page | \n",
+ " text | \n",
+ " Haase, Robert | \n",
+ "
\n",
+ " \n",
+ " 29 | \n",
+ " [Robert Haase et al.] | \n",
+ " BioImage Analysis Notebooks | \n",
+ " Python * Bioimage Analysis | \n",
+ " [Book, Notebook] | \n",
+ " https://haesleinhuepf.github.io/BioImageAnalys... | \n",
+ " CC-BY-4.0 * BSD-3-Clause | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " Book * Code Notebook | \n",
+ " text * code | \n",
+ " Robert Haase et al. | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " authors name \\\n",
+ "1 [Marcelo Zoccoler] Running Deep-Learning Scripts in the BiA-PoL O... \n",
+ "2 [Robert Haase] Browsing the Open Microscopy Image Data Resour... \n",
+ "3 [Mara Lampert] Getting started with Mambaforge and Python \n",
+ "9 [Robert Haase] Managing Scientific Python environments using ... \n",
+ "29 [Robert Haase et al.] BioImage Analysis Notebooks \n",
+ "\n",
+ " tags type \\\n",
+ "1 Python * Artificial Intelligence * Bioimage An... [Blog] \n",
+ "2 OMERO * Python [Blog] \n",
+ "3 Python * Conda * Mamba [Blog] \n",
+ "9 Python * Conda * Mamba [Blog] \n",
+ "29 Python * Bioimage Analysis [Book, Notebook] \n",
+ "\n",
+ " url \\\n",
+ "1 https://biapol.github.io/blog/marcelo_zoccoler... \n",
+ "2 https://biapol.github.io/blog/robert_haase/bro... \n",
+ "3 https://biapol.github.io/blog/mara_lampert/get... \n",
+ "9 https://focalplane.biologists.com/2022/12/08/m... \n",
+ "29 https://haesleinhuepf.github.io/BioImageAnalys... \n",
+ "\n",
+ " license event_date event_location description \\\n",
+ "1 CC-BY-4.0 NaN NaN NaN \n",
+ "2 CC-BY-4.0 NaN NaN NaN \n",
+ "3 CC-BY-4.0 NaN NaN NaN \n",
+ "9 CC-BY-4.0 NaN NaN NaN \n",
+ "29 CC-BY-4.0 * BSD-3-Clause NaN NaN NaN \n",
+ "\n",
+ " num_downloads publication_date fingerprint author submission_date \\\n",
+ "1 NaN NaN NaN NaN NaN \n",
+ "2 NaN NaN NaN NaN NaN \n",
+ "3 NaN NaN NaN NaN NaN \n",
+ "9 NaN NaN NaN NaN NaN \n",
+ "29 NaN NaN NaN NaN NaN \n",
+ "\n",
+ " LearningResourceType MediaType Authors \n",
+ "1 Web Page text Zoccoler, Marcelo \n",
+ "2 Web Page text Haase, Robert \n",
+ "3 Web Page text Lampert, Mara \n",
+ "9 Web Page text Haase, Robert \n",
+ "29 Book * Code Notebook text * code Robert Haase et al. "
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "import re\n",
+ "\n",
+ "def normalize_author_format(authors):\n",
+ "    \"\"\"Bring author names into the DALIA format \"Surname, Prename\", joined by \" * \".\n",
+ "\n",
+ "    Parameters\n",
+ "    ----------\n",
+ "    authors : list, str, None or NaN\n",
+ "        Author names of one entry. A string is split into single names at\n",
+ "        commas, \"*\" and the word \"and\".\n",
+ "\n",
+ "    Returns\n",
+ "    -------\n",
+ "    str\n",
+ "        The reformatted names joined with \" * \". Names made of two or three\n",
+ "        words are reordered; all other names (e.g. with \"et al.\" or a comma)\n",
+ "        are only stripped of surrounding whitespace. Missing input gives an\n",
+ "        empty string.\n",
+ "    \"\"\"\n",
+ "    # Helper function to reformat a single name\n",
+ "    def reformat_name(name):\n",
+ "        name = name.strip()\n",
+ "        parts = name.split()\n",
+ "        # Leave names untouched that are already in \"Surname, Prename\" format\n",
+ "        # or that carry an \"et al.\" suffix\n",
+ "        if \",\" in name or any(p in (\"et\", \"al.\") for p in parts):\n",
+ "            return name\n",
+ "        if len(parts) == 2:\n",
+ "            return f\"{parts[1]}, {parts[0]}\"\n",
+ "        if len(parts) == 3:\n",
+ "            # First and middle name stay separated by a space\n",
+ "            return f\"{parts[2]}, {parts[0]} {parts[1]}\"\n",
+ "        return name  # Return unchanged if not a simple name format\n",
+ "\n",
+ "    # Missing values have no authors to format (NaN is the only float that\n",
+ "    # is not equal to itself)\n",
+ "    if authors is None or (isinstance(authors, float) and authors != authors):\n",
+ "        return \"\"\n",
+ "\n",
+ "    # Convert single strings to lists for uniform processing\n",
+ "    if isinstance(authors, str):\n",
+ "        # Split inline lists like \"Prename Surname, Prename Surname and Prename Surname\"\n",
+ "        authors = re.split(r\",\\s*|\\*|\\band\\b\", authors)\n",
+ "\n",
+ "    # Ensure all elements are strings and drop empty fragments left over from splitting\n",
+ "    names = [str(a).strip() for a in authors]\n",
+ "\n",
+ "    # Join all processed names with \"*\"\n",
+ "    return \" * \".join(reformat_name(n) for n in names if n)\n",
+ "\n",
+ "\n",
+ "# Apply the normalization function.\n",
+ "# `assign` builds a new DataFrame instead of writing into `data` in place, which\n",
+ "# avoids the SettingWithCopyWarning pandas raises when `data` is a slice of\n",
+ "# another frame.\n",
+ "data = data.assign(Authors=data[\"authors\"].apply(normalize_author_format))\n",
+ "\n",
+ "data.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0983e2a5-3f93-4cfa-9f40-3f41233fe77e",
+ "metadata": {},
+ "source": [
+ "#### 6. Change the names of the columns that already fit the DALIA format to their corresponding names in DALIA"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "id": "4213ac0c-3274-408e-a86d-bc9e61832de8",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Title | \n",
+ " Keywords | \n",
+ " Link | \n",
+ " License | \n",
+ " Description | \n",
+ " PublicationDate | \n",
+ " LearningResourceType | \n",
+ " MediaType | \n",
+ " Authors | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 | \n",
+ " Running Deep-Learning Scripts in the BiA-PoL O... | \n",
+ " Python * Artificial Intelligence * Bioimage An... | \n",
+ " https://biapol.github.io/blog/marcelo_zoccoler... | \n",
+ " CC-BY-4.0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " Web Page | \n",
+ " text | \n",
+ " Zoccoler, Marcelo | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " Browsing the Open Microscopy Image Data Resour... | \n",
+ " OMERO * Python | \n",
+ " https://biapol.github.io/blog/robert_haase/bro... | \n",
+ " CC-BY-4.0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " Web Page | \n",
+ " text | \n",
+ " Haase, Robert | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " Getting started with Mambaforge and Python | \n",
+ " Python * Conda * Mamba | \n",
+ " https://biapol.github.io/blog/mara_lampert/get... | \n",
+ " CC-BY-4.0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " Web Page | \n",
+ " text | \n",
+ " Lampert, Mara | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " Managing Scientific Python environments using ... | \n",
+ " Python * Conda * Mamba | \n",
+ " https://focalplane.biologists.com/2022/12/08/m... | \n",
+ " CC-BY-4.0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " Web Page | \n",
+ " text | \n",
+ " Haase, Robert | \n",
+ "
\n",
+ " \n",
+ " 29 | \n",
+ " BioImage Analysis Notebooks | \n",
+ " Python * Bioimage Analysis | \n",
+ " https://haesleinhuepf.github.io/BioImageAnalys... | \n",
+ " CC-BY-4.0 * BSD-3-Clause | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " Book * Code Notebook | \n",
+ " text * code | \n",
+ " Robert Haase et al. | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Title \\\n",
+ "1 Running Deep-Learning Scripts in the BiA-PoL O... \n",
+ "2 Browsing the Open Microscopy Image Data Resour... \n",
+ "3 Getting started with Mambaforge and Python \n",
+ "9 Managing Scientific Python environments using ... \n",
+ "29 BioImage Analysis Notebooks \n",
+ "\n",
+ " Keywords \\\n",
+ "1 Python * Artificial Intelligence * Bioimage An... \n",
+ "2 OMERO * Python \n",
+ "3 Python * Conda * Mamba \n",
+ "9 Python * Conda * Mamba \n",
+ "29 Python * Bioimage Analysis \n",
+ "\n",
+ " Link \\\n",
+ "1 https://biapol.github.io/blog/marcelo_zoccoler... \n",
+ "2 https://biapol.github.io/blog/robert_haase/bro... \n",
+ "3 https://biapol.github.io/blog/mara_lampert/get... \n",
+ "9 https://focalplane.biologists.com/2022/12/08/m... \n",
+ "29 https://haesleinhuepf.github.io/BioImageAnalys... \n",
+ "\n",
+ " License Description PublicationDate \\\n",
+ "1 CC-BY-4.0 NaN NaN \n",
+ "2 CC-BY-4.0 NaN NaN \n",
+ "3 CC-BY-4.0 NaN NaN \n",
+ "9 CC-BY-4.0 NaN NaN \n",
+ "29 CC-BY-4.0 * BSD-3-Clause NaN NaN \n",
+ "\n",
+ " LearningResourceType MediaType Authors \n",
+ "1 Web Page text Zoccoler, Marcelo \n",
+ "2 Web Page text Haase, Robert \n",
+ "3 Web Page text Lampert, Mara \n",
+ "9 Web Page text Haase, Robert \n",
+ "29 Book * Code Notebook text * code Robert Haase et al. "
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Give the columns that DALIA knows their DALIA names, then discard the\n",
+ "# columns that are not part of the export\n",
+ "data = (\n",
+ "    data\n",
+ "    .rename(columns={\n",
+ "        'name': 'Title',\n",
+ "        'license': 'License',\n",
+ "        'url': 'Link',\n",
+ "        'description': 'Description',\n",
+ "        'publication_date': 'PublicationDate',\n",
+ "        'tags': 'Keywords',\n",
+ "    })\n",
+ "    .drop(columns=[\n",
+ "        'event_date',\n",
+ "        'event_location',\n",
+ "        'num_downloads',\n",
+ "        'submission_date',\n",
+ "        'fingerprint',\n",
+ "        'author',\n",
+ "        'type',\n",
+ "        'authors',\n",
+ "    ])\n",
+ ")\n",
+ "\n",
+ "data.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "69d43abb-d409-49bf-ab21-b79729441d1f",
+ "metadata": {},
+ "source": [
+ "#### 7. Introduce the **Community Column**: NFDI4BioImage if it is listed in the tags"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "id": "5a79c41e-6037-44c2-8cdd-0988197de047",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Title | \n",
+ " Keywords | \n",
+ " Link | \n",
+ " License | \n",
+ " Description | \n",
+ " PublicationDate | \n",
+ " LearningResourceType | \n",
+ " MediaType | \n",
+ " Authors | \n",
+ " Community | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 | \n",
+ " Running Deep-Learning Scripts in the BiA-PoL O... | \n",
+ " Python * Artificial Intelligence * Bioimage An... | \n",
+ " https://biapol.github.io/blog/marcelo_zoccoler... | \n",
+ " CC-BY-4.0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " Web Page | \n",
+ " text | \n",
+ " Zoccoler, Marcelo | \n",
+ " None | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " Browsing the Open Microscopy Image Data Resour... | \n",
+ " OMERO * Python | \n",
+ " https://biapol.github.io/blog/robert_haase/bro... | \n",
+ " CC-BY-4.0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " Web Page | \n",
+ " text | \n",
+ " Haase, Robert | \n",
+ " None | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " Getting started with Mambaforge and Python | \n",
+ " Python * Conda * Mamba | \n",
+ " https://biapol.github.io/blog/mara_lampert/get... | \n",
+ " CC-BY-4.0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " Web Page | \n",
+ " text | \n",
+ " Lampert, Mara | \n",
+ " None | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " Managing Scientific Python environments using ... | \n",
+ " Python * Conda * Mamba | \n",
+ " https://focalplane.biologists.com/2022/12/08/m... | \n",
+ " CC-BY-4.0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " Web Page | \n",
+ " text | \n",
+ " Haase, Robert | \n",
+ " None | \n",
+ "
\n",
+ " \n",
+ " 29 | \n",
+ " BioImage Analysis Notebooks | \n",
+ " Python * Bioimage Analysis | \n",
+ " https://haesleinhuepf.github.io/BioImageAnalys... | \n",
+ " CC-BY-4.0 * BSD-3-Clause | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " Book * Code Notebook | \n",
+ " text * code | \n",
+ " Robert Haase et al. | \n",
+ " None | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Title \\\n",
+ "1 Running Deep-Learning Scripts in the BiA-PoL O... \n",
+ "2 Browsing the Open Microscopy Image Data Resour... \n",
+ "3 Getting started with Mambaforge and Python \n",
+ "9 Managing Scientific Python environments using ... \n",
+ "29 BioImage Analysis Notebooks \n",
+ "\n",
+ " Keywords \\\n",
+ "1 Python * Artificial Intelligence * Bioimage An... \n",
+ "2 OMERO * Python \n",
+ "3 Python * Conda * Mamba \n",
+ "9 Python * Conda * Mamba \n",
+ "29 Python * Bioimage Analysis \n",
+ "\n",
+ " Link \\\n",
+ "1 https://biapol.github.io/blog/marcelo_zoccoler... \n",
+ "2 https://biapol.github.io/blog/robert_haase/bro... \n",
+ "3 https://biapol.github.io/blog/mara_lampert/get... \n",
+ "9 https://focalplane.biologists.com/2022/12/08/m... \n",
+ "29 https://haesleinhuepf.github.io/BioImageAnalys... \n",
+ "\n",
+ " License Description PublicationDate \\\n",
+ "1 CC-BY-4.0 NaN NaN \n",
+ "2 CC-BY-4.0 NaN NaN \n",
+ "3 CC-BY-4.0 NaN NaN \n",
+ "9 CC-BY-4.0 NaN NaN \n",
+ "29 CC-BY-4.0 * BSD-3-Clause NaN NaN \n",
+ "\n",
+ " LearningResourceType MediaType Authors Community \n",
+ "1 Web Page text Zoccoler, Marcelo None \n",
+ "2 Web Page text Haase, Robert None \n",
+ "3 Web Page text Lampert, Mara None \n",
+ "9 Web Page text Haase, Robert None \n",
+ "29 Book * Code Notebook text * code Robert Haase et al. None "
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "def include_community(entry):\n",
+ "    \"\"\"Return the community label of an entry based on its keywords.\n",
+ "\n",
+ "    Parameters\n",
+ "    ----------\n",
+ "    entry : list, str or other\n",
+ "        Keywords of one entry, either as a list of tags or as one string in\n",
+ "        which the tags are separated by \"*\" (e.g. \"Python * NFDI4BioImage\").\n",
+ "\n",
+ "    Returns\n",
+ "    -------\n",
+ "    str or None\n",
+ "        'NFDI4Bioimage' if one of the keywords equals \"nfdi4bioimage\"\n",
+ "        (case-insensitive, surrounding whitespace ignored), otherwise None.\n",
+ "    \"\"\"\n",
+ "    if isinstance(entry, str):\n",
+ "        # The tags may already be joined into one \"*\"-delimited string, so the\n",
+ "        # single tags have to be recovered before they can be compared\n",
+ "        entry = entry.split(\"*\")\n",
+ "    if isinstance(entry, list):\n",
+ "        if any(e.strip().lower() == 'nfdi4bioimage' for e in entry if isinstance(e, str)):\n",
+ "            return 'NFDI4Bioimage'\n",
+ "    return None\n",
+ "\n",
+ "\n",
+ "# Fill the Community column by applying include_community to the Keywords of every row\n",
+ "data['Community'] = data['Keywords'].apply(include_community)\n",
+ "data.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "78b5ec11-6e2a-4b6d-8ff3-faa58325b232",
+ "metadata": {},
+ "source": [
+ "#### 8. Introduce the **FileFormat** column by fetching the file extensions of linked Zenodo records"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "id": "89a3f72f-e614-4fe3-afc7-fc22345e104e",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "import requests\n",
+ "import re\n",
+ "import time\n",
+ "\n",
+ "def extract_zenodo_record_id(url):\n",
+ "    \"\"\"Extract the numeric record ID from a Zenodo record link.\n",
+ "\n",
+ "    Both the current (\"https://zenodo.org/records/<id>\") and the legacy\n",
+ "    (\"https://zenodo.org/record/<id>\") form of the link are recognised.\n",
+ "\n",
+ "    Parameters\n",
+ "    ----------\n",
+ "    url : str\n",
+ "        Link to inspect. Values that are not strings are ignored.\n",
+ "\n",
+ "    Returns\n",
+ "    -------\n",
+ "    str or None\n",
+ "        The record ID as a string of digits, or None if `url` does not\n",
+ "        contain a Zenodo record link.\n",
+ "    \"\"\"\n",
+ "    if not isinstance(url, str):\n",
+ "        return None\n",
+ "    # The dots are escaped so that they only match a literal \".\"\n",
+ "    match = re.search(r\"https?://(?:www\\.)?zenodo\\.org/records?/(\\d+)\", url)\n",
+ "    return match.group(1) if match else None\n",
+ "\n",
+ "def fetch_file_formats(record_id, timeout=30, delay=1):\n",
+ "    \"\"\"Look up the file extensions of all files attached to a Zenodo record.\n",
+ "\n",
+ "    Parameters\n",
+ "    ----------\n",
+ "    record_id : str\n",
+ "        ID of the Zenodo record. Empty values are skipped without a request.\n",
+ "    timeout : float, optional\n",
+ "        Seconds to wait for the Zenodo API before the request is given up.\n",
+ "    delay : float, optional\n",
+ "        Seconds to pause before each request.\n",
+ "\n",
+ "    Returns\n",
+ "    -------\n",
+ "    str or None\n",
+ "        The lower-cased, sorted file extensions joined with \" * \", or None if\n",
+ "        the record has no file with an extension or the request failed.\n",
+ "    \"\"\"\n",
+ "    if not record_id:\n",
+ "        return None\n",
+ "    api_url = f\"https://zenodo.org/api/records/{record_id}\"\n",
+ "    try:\n",
+ "        time.sleep(delay)  # Add a delay between requests\n",
+ "        # Without a timeout a stalled connection would block the notebook indefinitely\n",
+ "        response = requests.get(api_url, timeout=timeout)\n",
+ "        response.raise_for_status()  # Raise an error for non-2xx responses\n",
+ "        # Named `record` so that it is not confused with the notebook's `data` DataFrame\n",
+ "        record = response.json()\n",
+ "        file_types = {\n",
+ "            file[\"key\"].split(\".\")[-1].lower()\n",
+ "            for file in record.get(\"files\", [])\n",
+ "            if \".\" in file[\"key\"]\n",
+ "        }\n",
+ "        return \" * \".join(sorted(file_types)) if file_types else None\n",
+ "    except Exception as e:\n",
+ "        # Deliberately best-effort: a single failing record is reported and skipped\n",
+ "        print(f\"Error fetching file formats for record ID {record_id}: {e}\")\n",
+ "        return None\n",
+ "\n",
+ "def process_links(link_input):\n",
+ "    \"\"\"Determine the file formats behind one link or a list of links.\n",
+ "\n",
+ "    A single string is looked up directly. For a list, the links are tried in\n",
+ "    order and the first non-empty result is returned. Any other input, or\n",
+ "    input without a usable Zenodo link, gives None.\n",
+ "    \"\"\"\n",
+ "    if isinstance(link_input, list):\n",
+ "        for raw_link in link_input:\n",
+ "            zenodo_id = extract_zenodo_record_id(raw_link.strip())\n",
+ "            if not zenodo_id:\n",
+ "                continue\n",
+ "            formats = fetch_file_formats(zenodo_id)\n",
+ "            if formats:  # Stop at the first link that yields formats\n",
+ "                return formats\n",
+ "        return None\n",
+ "    if not isinstance(link_input, str):\n",
+ "        return None\n",
+ "    zenodo_id = extract_zenodo_record_id(link_input)\n",
+ "    return fetch_file_formats(zenodo_id) if zenodo_id else None\n",
+ "\n",
+ "# Fill the FileFormat column by running process_links on the Link of every row\n",
+ "data[\"FileFormat\"] = data[\"Link\"].apply(process_links)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "74d9702a-00cb-4408-8815-c26fd9a4fdee",
+ "metadata": {},
+ "source": [
+ "Additionally, map the MediaType column to a default file format if FileFormat was not already filled in the previous step (only works for the MediaTypes `audio` and `video`)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "id": "965d0a41-9762-47cb-8bac-7042d35960c8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def map_file_format(media_type, file_format):\n",
+ "    \"\"\"Fill in a default file format for audio and video entries.\n",
+ "\n",
+ "    Parameters\n",
+ "    ----------\n",
+ "    media_type : str\n",
+ "        Value of the MediaType column.\n",
+ "    file_format : str, None or NaN\n",
+ "        File format that was already determined for the entry, if any.\n",
+ "\n",
+ "    Returns\n",
+ "    -------\n",
+ "    str\n",
+ "        `file_format` if it is a non-blank string, otherwise \".mp3\" for\n",
+ "        \"audio\", \".mp4\" for \"video\" and an empty string for any other\n",
+ "        media type.\n",
+ "    \"\"\"\n",
+ "    # If FileFormat already has a valid entry, return it as is. The isinstance\n",
+ "    # check also treats NaN (a float) as missing instead of failing on .strip()\n",
+ "    if isinstance(file_format, str) and file_format.strip() != \"\":\n",
+ "        return file_format\n",
+ "    # Map media types to specific file formats\n",
+ "    if media_type == \"audio\":\n",
+ "        return \".mp3\"\n",
+ "    elif media_type == \"video\":\n",
+ "        return \".mp4\"\n",
+ "    else:\n",
+ "        return \"\"  # Return empty string if no mapping is needed\n",
+ "\n",
+ "# Apply the mapping function row by row (axis=1), so that each call receives\n",
+ "# the MediaType and the FileFormat of the same entry\n",
+ "data[\"FileFormat\"] = data.apply(\n",
+ " lambda row: map_file_format(row[\"MediaType\"], row[\"FileFormat\"]),\n",
+ " axis=1\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "461e7dc1-7572-4664-887d-ec36f4ed2656",
+ "metadata": {},
+ "source": [
+ "Now also correct the Format of the Link Column:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "id": "0de3c9ba-a0b8-434d-bd79-896ad87cf1c1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Join entries that hold a list of links into one string with ' * ' as delimiter;\n",
+ "# entries that are not lists are kept unchanged\n",
+ "data[\"Link\"] = data[\"Link\"].apply(lambda x: ' * '.join(x) if isinstance(x, list) else x) #URL"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "22a0d7cc-e4d7-4c77-807c-662fb44ffbe0",
+ "metadata": {},
+ "source": [
+ "#### 9. Extract the Language of each Entry\n",
+ "This is done using the [xlm-roberta-base-language-detection](https://huggingface.co/papluca/xlm-roberta-base-language-detection) model via the transformers package pipeline."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "id": "5f3abf90-990f-4ccc-8da3-05bb26e6538e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from transformers import pipeline\n",
+ "\n",
+ "# Identifier of the language detection model on the Hugging Face hub\n",
+ "model_ckpt = \"papluca/xlm-roberta-base-language-detection\"\n",
+ "# Text classification pipeline that runs this model\n",
+ "pipe = pipeline(\"text-classification\", model=model_ckpt)\n",
+ "\n",
+ "def detect_language(text):\n",
+ "    \"\"\"Detect the language of a text with the language detection pipeline.\n",
+ "\n",
+ "    Parameters\n",
+ "    ----------\n",
+ "    text : str\n",
+ "        Text to classify, e.g. the title of an entry.\n",
+ "\n",
+ "    Returns\n",
+ "    -------\n",
+ "    str\n",
+ "        \"en\" or \"de\" if that is the top-ranked label of the pipeline,\n",
+ "        otherwise an empty string. Missing or blank input also gives an\n",
+ "        empty string.\n",
+ "    \"\"\"\n",
+ "    # Missing titles (None / NaN) and blank strings are not passed to the model\n",
+ "    if not isinstance(text, str) or not text.strip():\n",
+ "        return \"\"\n",
+ "    lang = pipe([text], top_k=1, truncation=True)[0][0][\"label\"]\n",
+ "    return lang if lang in [\"en\", \"de\"] else \"\"\n",
+ "\n",
+ "# Fill the Language column by running detect_language on the Title of every row\n",
+ "data[\"Language\"] = data[\"Title\"].apply(detect_language)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0cd61d11-5907-43ba-968c-bb851d003631",
+ "metadata": {},
+ "source": [
+ "### Export the data to a csv that now fits the DALIA Format"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "id": "63071e24-8d4e-4885-ae78-74669bbe5557",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Exported 330 rows.\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Write the converted table to the destination file without the index column\n",
+ "data.to_csv(destination, index=False)\n",
+ "\n",
+ "# Report how many entries were exported\n",
+ "num_rows = len(data)\n",
+ "print(f\"Exported {num_rows} rows.\")"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.12"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}