From 0e5a6f0b15e3db2635f106f76694f9d3f46435c8 Mon Sep 17 00:00:00 2001 From: Lea Gihlein <85543649+lea-33@users.noreply.github.com> Date: Wed, 29 Jan 2025 13:09:20 +0100 Subject: [PATCH] Add Export_to_DALIA.ipynb --- scripts/Export_to_DALIA.ipynb | 2252 +++++++++++++++++++++++++++++++++ 1 file changed, 2252 insertions(+) create mode 100644 scripts/Export_to_DALIA.ipynb diff --git a/scripts/Export_to_DALIA.ipynb b/scripts/Export_to_DALIA.ipynb new file mode 100644 index 00000000..f8568ac0 --- /dev/null +++ b/scripts/Export_to_DALIA.ipynb @@ -0,0 +1,2252 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "7a19e5fa-6f3a-4a8a-9244-6bf9fdebad76", + "metadata": {}, + "source": [ + "### Test Conversion of yml to DALIA format" + ] + }, + { + "cell_type": "markdown", + "id": "2e055672-a937-4e46-926e-fdf6c527d628", + "metadata": {}, + "source": [ + "#### Load the Yml as a pandas DF" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "7396751e-9b56-4bf6-bc35-e6e38f6c108c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
authorsnametagstypeurllicenseevent_dateevent_locationdescriptionnum_downloadspublication_datefingerprintauthorsubmission_date
0[Elisabeth Kugler]Sharing Your Poster on Figshare: A Community G...[Sharing, Research Data Management][Blog]https://focalplane.biologists.com/2023/07/26/s...NaNNaNNaNNaNNaNNaNNaNNaNNaN
1[Marcelo Zoccoler]Running Deep-Learning Scripts in the BiA-PoL O...[Python, Artificial Intelligence, Bioimage Ana...[Blog]https://biapol.github.io/blog/marcelo_zoccoler...CC-BY-4.0NaNNaNNaNNaNNaNNaNNaNNaN
2[Robert Haase]Browsing the Open Microscopy Image Data Resour...[OMERO, Python][Blog]https://biapol.github.io/blog/robert_haase/bro...CC-BY-4.0NaNNaNNaNNaNNaNNaNNaNNaN
3[Mara Lampert]Getting started with Mambaforge and Python[Python, Conda, Mamba][Blog]https://biapol.github.io/blog/mara_lampert/get...CC-BY-4.0NaNNaNNaNNaNNaNNaNNaNNaN
4[Jennifer Waters]Promoting Data Management at the Nikon Imaging...[Research Data Management][Blog]https://datamanagement.hms.harvard.edu/news/pr...NaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", + "
" + ], + "text/plain": [ + " authors name \\\n", + "0 [Elisabeth Kugler] Sharing Your Poster on Figshare: A Community G... \n", + "1 [Marcelo Zoccoler] Running Deep-Learning Scripts in the BiA-PoL O... \n", + "2 [Robert Haase] Browsing the Open Microscopy Image Data Resour... \n", + "3 [Mara Lampert] Getting started with Mambaforge and Python \n", + "4 [Jennifer Waters] Promoting Data Management at the Nikon Imaging... \n", + "\n", + " tags type \\\n", + "0 [Sharing, Research Data Management] [Blog] \n", + "1 [Python, Artificial Intelligence, Bioimage Ana... [Blog] \n", + "2 [OMERO, Python] [Blog] \n", + "3 [Python, Conda, Mamba] [Blog] \n", + "4 [Research Data Management] [Blog] \n", + "\n", + " url license event_date \\\n", + "0 https://focalplane.biologists.com/2023/07/26/s... NaN NaN \n", + "1 https://biapol.github.io/blog/marcelo_zoccoler... CC-BY-4.0 NaN \n", + "2 https://biapol.github.io/blog/robert_haase/bro... CC-BY-4.0 NaN \n", + "3 https://biapol.github.io/blog/mara_lampert/get... CC-BY-4.0 NaN \n", + "4 https://datamanagement.hms.harvard.edu/news/pr... 
NaN NaN \n", + "\n", + " event_location description num_downloads publication_date fingerprint \\\n", + "0 NaN NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN NaN \n", + "3 NaN NaN NaN NaN NaN \n", + "4 NaN NaN NaN NaN NaN \n", + "\n", + " author submission_date \n", + "0 NaN NaN \n", + "1 NaN NaN \n", + "2 NaN NaN \n", + "3 NaN NaN \n", + "4 NaN NaN " + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "# This file exports selected data as csv file\n", + "source = \"../resources/\"\n", + "#destination = './docs/export/training_materials.csv'\n", + "destination = '../docs/export/DALIA_training_materials.csv'\n", + "\n", + "from generate_link_lists import load_dataframe\n", + "\n", + "df = load_dataframe(source)\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "c65fd523-cffc-4f20-9ed4-1012d4a33b1b", + "metadata": {}, + "source": [ + "#### 1. For entries with an author column, write that value to the authors column" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "f2bf12bd-56dc-49b4-902d-e68050d715ce", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
authorsnametagstypeurllicenseevent_dateevent_locationdescriptionnum_downloadspublication_datefingerprintauthorsubmission_date
394NaNVirtual-I2K-2024-multiview-stitcher[Big Data, Bioimageanalysis][Github Repository, Tutorial][https://github.com/m-albert/Virtual-I2K-2024-...BSD-3-CLAUSENaNNaNRepository accompanying the multiview-stitcher...NaN2024-10-30T07:38:11+00:00NaNMarvin AlbertNaN
397NaNPrompt-Engineering-LLMs-Course[Llms, Prompt Engineering, Code Generation][Github Repository, Tutorial]https://github.com/HelmholtzAI-Consultants-Mun...MITNaNNaNNaN2024-09-11T07:45:30+00:00NaNIsra MekkiNaN
\n", + "
" + ], + "text/plain": [ + " authors name \\\n", + "394 NaN Virtual-I2K-2024-multiview-stitcher \n", + "397 NaN Prompt-Engineering-LLMs-Course \n", + "\n", + " tags \\\n", + "394 [Big Data, Bioimageanalysis] \n", + "397 [Llms, Prompt Engineering, Code Generation] \n", + "\n", + " type \\\n", + "394 [Github Repository, Tutorial] \n", + "397 [Github Repository, Tutorial] \n", + "\n", + " url license \\\n", + "394 [https://github.com/m-albert/Virtual-I2K-2024-... BSD-3-CLAUSE \n", + "397 https://github.com/HelmholtzAI-Consultants-Mun... MIT \n", + "\n", + " event_date event_location \\\n", + "394 NaN NaN \n", + "397 NaN NaN \n", + "\n", + " description num_downloads \\\n", + "394 Repository accompanying the multiview-stitcher... NaN \n", + "397 NaN \n", + "\n", + " publication_date fingerprint author submission_date \n", + "394 2024-10-30T07:38:11+00:00 NaN Marvin Albert NaN \n", + "397 2024-09-11T07:45:30+00:00 NaN Isra Mekki NaN " + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#check which entries have 'author' column\n", + "df[df['author'].notna()]" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "1371bc6c-23d9-46db-857c-41dd73e861c2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
authorsnametagstypeurllicenseevent_dateevent_locationdescriptionnum_downloadspublication_datefingerprintauthorsubmission_date
394Marvin AlbertVirtual-I2K-2024-multiview-stitcher[Big Data, Bioimageanalysis][Github Repository, Tutorial][https://github.com/m-albert/Virtual-I2K-2024-...BSD-3-CLAUSENaNNaNRepository accompanying the multiview-stitcher...NaN2024-10-30T07:38:11+00:00NaNMarvin AlbertNaN
397Isra MekkiPrompt-Engineering-LLMs-Course[Llms, Prompt Engineering, Code Generation][Github Repository, Tutorial]https://github.com/HelmholtzAI-Consultants-Mun...MITNaNNaNNaN2024-09-11T07:45:30+00:00NaNIsra MekkiNaN
\n", + "
" + ], + "text/plain": [ + " authors name \\\n", + "394 Marvin Albert Virtual-I2K-2024-multiview-stitcher \n", + "397 Isra Mekki Prompt-Engineering-LLMs-Course \n", + "\n", + " tags \\\n", + "394 [Big Data, Bioimageanalysis] \n", + "397 [Llms, Prompt Engineering, Code Generation] \n", + "\n", + " type \\\n", + "394 [Github Repository, Tutorial] \n", + "397 [Github Repository, Tutorial] \n", + "\n", + " url license \\\n", + "394 [https://github.com/m-albert/Virtual-I2K-2024-... BSD-3-CLAUSE \n", + "397 https://github.com/HelmholtzAI-Consultants-Mun... MIT \n", + "\n", + " event_date event_location \\\n", + "394 NaN NaN \n", + "397 NaN NaN \n", + "\n", + " description num_downloads \\\n", + "394 Repository accompanying the multiview-stitcher... NaN \n", + "397 NaN \n", + "\n", + " publication_date fingerprint author submission_date \n", + "394 2024-10-30T07:38:11+00:00 NaN Marvin Albert NaN \n", + "397 2024-09-11T07:45:30+00:00 NaN Isra Mekki NaN " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Iterate over rows to change the information to the authors column\n", + "for index, entry in df[df['author'].notna()].iterrows():\n", + " df.loc[index, 'authors'] = entry['author']\n", + " \n", + "df[df['author'].notna()]" + ] + }, + { + "cell_type": "markdown", + "id": "51bdf851-201b-4c7d-8f9e-49ba5fb00eac", + "metadata": {}, + "source": [ + "#### 2. Exclude entries without mandatory attributes (License, Authors, Title, Link)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "11c39326-61e1-422d-99df-240f4b9b5c86", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
authorsnametagstypeurllicenseevent_dateevent_locationdescriptionnum_downloadspublication_datefingerprintauthorsubmission_date
1[Marcelo Zoccoler]Running Deep-Learning Scripts in the BiA-PoL O...[Python, Artificial Intelligence, Bioimage Ana...[Blog]https://biapol.github.io/blog/marcelo_zoccoler...CC-BY-4.0NaNNaNNaNNaNNaNNaNNaNNaN
2[Robert Haase]Browsing the Open Microscopy Image Data Resour...[OMERO, Python][Blog]https://biapol.github.io/blog/robert_haase/bro...CC-BY-4.0NaNNaNNaNNaNNaNNaNNaNNaN
3[Mara Lampert]Getting started with Mambaforge and Python[Python, Conda, Mamba][Blog]https://biapol.github.io/blog/mara_lampert/get...CC-BY-4.0NaNNaNNaNNaNNaNNaNNaNNaN
9[Robert Haase]Managing Scientific Python environments using ...[Python, Conda, Mamba][Blog]https://focalplane.biologists.com/2022/12/08/m...CC-BY-4.0NaNNaNNaNNaNNaNNaNNaNNaN
29[Robert Haase et al.]BioImage Analysis Notebooks[Python, Bioimage Analysis][Book, Notebook]https://haesleinhuepf.github.io/BioImageAnalys...[CC-BY-4.0, BSD-3-CLAUSE]NaNNaNNaNNaNNaNNaNNaNNaN
\n", + "
" + ], + "text/plain": [ + " authors name \\\n", + "1 [Marcelo Zoccoler] Running Deep-Learning Scripts in the BiA-PoL O... \n", + "2 [Robert Haase] Browsing the Open Microscopy Image Data Resour... \n", + "3 [Mara Lampert] Getting started with Mambaforge and Python \n", + "9 [Robert Haase] Managing Scientific Python environments using ... \n", + "29 [Robert Haase et al.] BioImage Analysis Notebooks \n", + "\n", + " tags type \\\n", + "1 [Python, Artificial Intelligence, Bioimage Ana... [Blog] \n", + "2 [OMERO, Python] [Blog] \n", + "3 [Python, Conda, Mamba] [Blog] \n", + "9 [Python, Conda, Mamba] [Blog] \n", + "29 [Python, Bioimage Analysis] [Book, Notebook] \n", + "\n", + " url \\\n", + "1 https://biapol.github.io/blog/marcelo_zoccoler... \n", + "2 https://biapol.github.io/blog/robert_haase/bro... \n", + "3 https://biapol.github.io/blog/mara_lampert/get... \n", + "9 https://focalplane.biologists.com/2022/12/08/m... \n", + "29 https://haesleinhuepf.github.io/BioImageAnalys... \n", + "\n", + " license event_date event_location description \\\n", + "1 CC-BY-4.0 NaN NaN NaN \n", + "2 CC-BY-4.0 NaN NaN NaN \n", + "3 CC-BY-4.0 NaN NaN NaN \n", + "9 CC-BY-4.0 NaN NaN NaN \n", + "29 [CC-BY-4.0, BSD-3-CLAUSE] NaN NaN NaN \n", + "\n", + " num_downloads publication_date fingerprint author submission_date \n", + "1 NaN NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN NaN \n", + "3 NaN NaN NaN NaN NaN \n", + "9 NaN NaN NaN NaN NaN \n", + "29 NaN NaN NaN NaN NaN " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data = df[~df['license'].str.lower().isin(['unknown']) & df['license'].notna() & df['authors'].notna() & df['name'].notna()& df['url'].notna()]\n", + "data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "5e708904-0161-4fb6-8bf8-c2f6dc3dbbea", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total number of entries found: 536\n", + "Number 
of entries found with all mandatory entries: 330\n" + ] + } + ], + "source": [ + "print(f'Total number of entries found: {len(df)}')\n", + "print(f'Number of entries found with all mandatory entries: {len(data)}')" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "8ce34a4c-f14f-40b0-8254-a4234d1f9d23", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
authorsnametagstypeurllicenseevent_dateevent_locationdescriptionnum_downloadspublication_datefingerprintauthorsubmission_date
1[Marcelo Zoccoler]Running Deep-Learning Scripts in the BiA-PoL O...[Python, Artificial Intelligence, Bioimage Ana...[Blog]https://biapol.github.io/blog/marcelo_zoccoler...CC-BY-4.0NaNNaNNaNNaNNaNNaNNaNNaN
2[Robert Haase]Browsing the Open Microscopy Image Data Resour...[OMERO, Python][Blog]https://biapol.github.io/blog/robert_haase/bro...CC-BY-4.0NaNNaNNaNNaNNaNNaNNaNNaN
3[Mara Lampert]Getting started with Mambaforge and Python[Python, Conda, Mamba][Blog]https://biapol.github.io/blog/mara_lampert/get...CC-BY-4.0NaNNaNNaNNaNNaNNaNNaNNaN
9[Robert Haase]Managing Scientific Python environments using ...[Python, Conda, Mamba][Blog]https://focalplane.biologists.com/2022/12/08/m...CC-BY-4.0NaNNaNNaNNaNNaNNaNNaNNaN
29[Robert Haase et al.]BioImage Analysis Notebooks[Python, Bioimage Analysis][Book, Notebook]https://haesleinhuepf.github.io/BioImageAnalys...[CC-BY-4.0, BSD-3-CLAUSE]NaNNaNNaNNaNNaNNaNNaNNaN
\n", + "
" + ], + "text/plain": [ + " authors name \\\n", + "1 [Marcelo Zoccoler] Running Deep-Learning Scripts in the BiA-PoL O... \n", + "2 [Robert Haase] Browsing the Open Microscopy Image Data Resour... \n", + "3 [Mara Lampert] Getting started with Mambaforge and Python \n", + "9 [Robert Haase] Managing Scientific Python environments using ... \n", + "29 [Robert Haase et al.] BioImage Analysis Notebooks \n", + "\n", + " tags type \\\n", + "1 [Python, Artificial Intelligence, Bioimage Ana... [Blog] \n", + "2 [OMERO, Python] [Blog] \n", + "3 [Python, Conda, Mamba] [Blog] \n", + "9 [Python, Conda, Mamba] [Blog] \n", + "29 [Python, Bioimage Analysis] [Book, Notebook] \n", + "\n", + " url \\\n", + "1 https://biapol.github.io/blog/marcelo_zoccoler... \n", + "2 https://biapol.github.io/blog/robert_haase/bro... \n", + "3 https://biapol.github.io/blog/mara_lampert/get... \n", + "9 https://focalplane.biologists.com/2022/12/08/m... \n", + "29 https://haesleinhuepf.github.io/BioImageAnalys... \n", + "\n", + " license event_date event_location description \\\n", + "1 CC-BY-4.0 NaN NaN NaN \n", + "2 CC-BY-4.0 NaN NaN NaN \n", + "3 CC-BY-4.0 NaN NaN NaN \n", + "9 CC-BY-4.0 NaN NaN NaN \n", + "29 [CC-BY-4.0, BSD-3-CLAUSE] NaN NaN NaN \n", + "\n", + " num_downloads publication_date fingerprint author submission_date \n", + "1 NaN NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN NaN \n", + "3 NaN NaN NaN NaN NaN \n", + "9 NaN NaN NaN NaN NaN \n", + "29 NaN NaN NaN NaN NaN " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.head()" + ] + }, + { + "cell_type": "markdown", + "id": "1b18efe8-8833-478a-9971-8ec727580fa1", + "metadata": {}, + "source": [ + "#### 3. 
Change the format of the **Tags** and **License** columns to fit the DALIA format" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "5318cd0d-7b81-47df-bab6-e64a8afbf9a2", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_12251/210055857.py:1: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " data[\"tags\"] = data[\"tags\"].apply(lambda x: ' * '.join(x) if isinstance(x, list) else x) #Tags\n", + "/tmp/ipykernel_12251/210055857.py:2: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " data[\"license\"] = data[\"license\"].apply(lambda x: ' * '.join(x) if isinstance(x, list) else x) #License\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
authorsnametagstypeurllicenseevent_dateevent_locationdescriptionnum_downloadspublication_datefingerprintauthorsubmission_date
1[Marcelo Zoccoler]Running Deep-Learning Scripts in the BiA-PoL O...Python * Artificial Intelligence * Bioimage An...[Blog]https://biapol.github.io/blog/marcelo_zoccoler...CC-BY-4.0NaNNaNNaNNaNNaNNaNNaNNaN
2[Robert Haase]Browsing the Open Microscopy Image Data Resour...OMERO * Python[Blog]https://biapol.github.io/blog/robert_haase/bro...CC-BY-4.0NaNNaNNaNNaNNaNNaNNaNNaN
3[Mara Lampert]Getting started with Mambaforge and PythonPython * Conda * Mamba[Blog]https://biapol.github.io/blog/mara_lampert/get...CC-BY-4.0NaNNaNNaNNaNNaNNaNNaNNaN
9[Robert Haase]Managing Scientific Python environments using ...Python * Conda * Mamba[Blog]https://focalplane.biologists.com/2022/12/08/m...CC-BY-4.0NaNNaNNaNNaNNaNNaNNaNNaN
29[Robert Haase et al.]BioImage Analysis NotebooksPython * Bioimage Analysis[Book, Notebook]https://haesleinhuepf.github.io/BioImageAnalys...CC-BY-4.0 * BSD-3-CLAUSENaNNaNNaNNaNNaNNaNNaNNaN
\n", + "
" + ], + "text/plain": [ + " authors name \\\n", + "1 [Marcelo Zoccoler] Running Deep-Learning Scripts in the BiA-PoL O... \n", + "2 [Robert Haase] Browsing the Open Microscopy Image Data Resour... \n", + "3 [Mara Lampert] Getting started with Mambaforge and Python \n", + "9 [Robert Haase] Managing Scientific Python environments using ... \n", + "29 [Robert Haase et al.] BioImage Analysis Notebooks \n", + "\n", + " tags type \\\n", + "1 Python * Artificial Intelligence * Bioimage An... [Blog] \n", + "2 OMERO * Python [Blog] \n", + "3 Python * Conda * Mamba [Blog] \n", + "9 Python * Conda * Mamba [Blog] \n", + "29 Python * Bioimage Analysis [Book, Notebook] \n", + "\n", + " url \\\n", + "1 https://biapol.github.io/blog/marcelo_zoccoler... \n", + "2 https://biapol.github.io/blog/robert_haase/bro... \n", + "3 https://biapol.github.io/blog/mara_lampert/get... \n", + "9 https://focalplane.biologists.com/2022/12/08/m... \n", + "29 https://haesleinhuepf.github.io/BioImageAnalys... \n", + "\n", + " license event_date event_location description \\\n", + "1 CC-BY-4.0 NaN NaN NaN \n", + "2 CC-BY-4.0 NaN NaN NaN \n", + "3 CC-BY-4.0 NaN NaN NaN \n", + "9 CC-BY-4.0 NaN NaN NaN \n", + "29 CC-BY-4.0 * BSD-3-CLAUSE NaN NaN NaN \n", + "\n", + " num_downloads publication_date fingerprint author submission_date \n", + "1 NaN NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN NaN \n", + "3 NaN NaN NaN NaN NaN \n", + "9 NaN NaN NaN NaN NaN \n", + "29 NaN NaN NaN NaN NaN " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data[\"tags\"] = data[\"tags\"].apply(lambda x: ' * '.join(x) if isinstance(x, list) else x) #Tags\n", + "data[\"license\"] = data[\"license\"].apply(lambda x: ' * '.join(x) if isinstance(x, list) else x) #License\n", + "data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "ac322332-6c61-4764-b8c2-760c33518429", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": 
"stream", + "text": [ + "/tmp/ipykernel_12251/2387137408.py:21: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " data[\"license\"] = data[\"license\"].replace(license_mapping)\n" + ] + } + ], + "source": [ + "# Map the License Entries to valid input\n", + "license_mapping = {\n", + " 'APACHE-2.0 LICENSE' : 'Apache-2.0',\n", + " 'CC0 1.0 UNIVERSAL' : 'CC0-1.0',\n", + " 'CC-BY-4.0 * BSD-3-CLAUSE' : 'CC-BY-4.0 * BSD-3-Clause',\n", + " 'CC0 (MOSTLY, BUT CAN DIFFER DEPENDING ON RESOURCE)' : 'CC0-1.0',\n", + " 'CCY-BY-SA-4.0' : 'CC-BY-SA-4.0',\n", + " 'YOUTTUBE STANDARD LICENSE' : 'YOUTUBE STANDARD LICENSE',\n", + " 'CC-BY-NC-SA' : 'CC-BY-NC-SA-4.0',\n", + " 'BSD3-CLAUSE' : 'BSD-3-Clause',\n", + " 'CC-ZERO' : 'CC0-1.0',\n", + " 'BSD 3-Clause \"New\" or \"Revised\" License' : 'BSD-3-Clause',\n", + " 'cc-by-4.0' : ' CC-BY-4.0',\n", + " 'Creative Commons Attribution Share Alike 4.0 International' : 'CC-BY-SA-4.0',\n", + " 'GNU General Public License v3.0' : 'GPL-3.0-only',\n", + " 'CC BY-NC-SA 4.0' : 'CC-BY-NC-SA-4.0',\n", + " 'BSD-3-CLAUSE' : 'BSD-3-Clause',\n", + " 'BSD-2-CLAUSE' : 'BSD-2-Clause',\n", + " 'APACHE-2.0' : 'Apache-2.0'\n", + "}\n", + "data[\"license\"] = data[\"license\"].replace(license_mapping)" + ] + }, + { + "cell_type": "markdown", + "id": "03fbeeb5-67d4-403c-a780-8757f738b9bb", + "metadata": {}, + "source": [ + "#### 4. 
Morph the **Type** column into the **LearningResourceType** and **MediaType** columns" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "8a104889-190a-4504-af64-c5a019392ad3", + "metadata": {}, + "outputs": [], + "source": [ + "# Create Mapping for the Type Column:\n", + "type_to_learning_resource = {\n", + " \"Application\": \"Software Application\",\n", + " \"Big Data\": \"Data\",\n", + " \"Bioimage Analysis\": \"Other\",\n", + " \"Blog\": \"Web Page\",\n", + " \"Blog Post\": \"Text\",\n", + " \"Book\": \"Book\",\n", + " \"Book Chapter\": \"Book\",\n", + " \"Code\": None,\n", + " \"Collection\": \"Other\",\n", + " \"Conference Abstract\": \"Text\",\n", + " \"Data\": \"Data\",\n", + " \"Document\": \"Text\",\n", + " \"Documentation\": \"Text\",\n", + " \"Event\": \"Other\",\n", + " \"Forum Post\": \"Text\",\n", + " \"Github Repository\": \"Other\",\n", + " \"Jupyter Book\": \"Code Notebook\",\n", + " \"Notebook\": \"Code Notebook\",\n", + " \"Online Course\": \"Course\",\n", + " \"Online Tutorial\": \"Tutorial\",\n", + " \"Open Source Software\": \"Software Application\",\n", + " \"Poster\": \"Poster\",\n", + " \"Practicals\": \"Course\",\n", + " \"Preprint\": \"Text\",\n", + " \"Presentation\": \"Presentation\",\n", + " \"Publication\": \"Article\",\n", + " \"Python\": None,\n", + " \"Report\": \"Report\",\n", + " \"Slide\": \"Presentation\",\n", + " \"Slides\": \"Presentation\",\n", + " \"Tutorial\": \"Tutorial\",\n", + " \"Video\": None,\n", + " \"Videos\": None,\n", + " \"Website\": \"Web Page\",\n", + " \"Workshop\": \"Course\",\n", + " \"Youtube Channel\": \"Other\"\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "cd6e0ac8-2382-4a26-9e77-ee739081396f", + "metadata": {}, + "outputs": [], + "source": [ + "type_to_media_type = {\n", + " \"Application\": None,\n", + " \"Big Data\": None,\n", + " \"Bioimage Analysis\": None,\n", + " \"Blog\": \"text\",\n", + " \"Blog Post\": \"text\",\n", + " \"Book\": 
\"text\",\n", + " \"Book Chapter\": \"text\",\n", + " \"Code\": \"code\",\n", + " \"Collection\": None,\n", + " \"Conference Abstract\": \"text\",\n", + " \"Data\": None,\n", + " \"Document\": \"text\",\n", + " \"Documentation\": \"text\",\n", + " \"Event\": None,\n", + " \"Forum Post\": \"text\",\n", + " \"Github Repository\": None,\n", + " \"Jupyter Book\": \"code\",\n", + " \"Notebook\": \"code\",\n", + " \"Online Course\": None,\n", + " \"Online Tutorial\": None,\n", + " \"Open Source Software\": None,\n", + " \"Poster\": None,\n", + " \"Practicals\": None,\n", + " \"Preprint\": \"text\",\n", + " \"Presentation\": \"presentation\",\n", + " \"Publication\": \"text\",\n", + " \"Python\": None,\n", + " \"Report\": \"text\",\n", + " \"Slide\": \"presentation\",\n", + " \"Slides\": \"presentation\",\n", + " \"Tutorial\": None,\n", + " \"Video\": \"video\",\n", + " \"Videos\": \"video\",\n", + " \"Website\": None,\n", + " \"Workshop\": None,\n", + " \"Youtube Channel\": \"video\"\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "c32c15e5-2d12-4051-b238-44a94afcc5d1", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_12251/3151956629.py:30: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " data[\"LearningResourceType\"] = data[\"type\"].apply(map_learning_resource)\n", + "/tmp/ipykernel_12251/3151956629.py:31: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: 
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " data[\"MediaType\"] = data[\"type\"].apply(map_media_type)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
authorsnametagstypeurllicenseevent_dateevent_locationdescriptionnum_downloadspublication_datefingerprintauthorsubmission_dateLearningResourceTypeMediaType
1[Marcelo Zoccoler]Running Deep-Learning Scripts in the BiA-PoL O...Python * Artificial Intelligence * Bioimage An...[Blog]https://biapol.github.io/blog/marcelo_zoccoler...CC-BY-4.0NaNNaNNaNNaNNaNNaNNaNNaNWeb Pagetext
2[Robert Haase]Browsing the Open Microscopy Image Data Resour...OMERO * Python[Blog]https://biapol.github.io/blog/robert_haase/bro...CC-BY-4.0NaNNaNNaNNaNNaNNaNNaNNaNWeb Pagetext
3[Mara Lampert]Getting started with Mambaforge and PythonPython * Conda * Mamba[Blog]https://biapol.github.io/blog/mara_lampert/get...CC-BY-4.0NaNNaNNaNNaNNaNNaNNaNNaNWeb Pagetext
9[Robert Haase]Managing Scientific Python environments using ...Python * Conda * Mamba[Blog]https://focalplane.biologists.com/2022/12/08/m...CC-BY-4.0NaNNaNNaNNaNNaNNaNNaNNaNWeb Pagetext
29[Robert Haase et al.]BioImage Analysis NotebooksPython * Bioimage Analysis[Book, Notebook]https://haesleinhuepf.github.io/BioImageAnalys...CC-BY-4.0 * BSD-3-ClauseNaNNaNNaNNaNNaNNaNNaNNaNBook * Code Notebooktext * code
\n", + "
" + ], + "text/plain": [ + " authors name \\\n", + "1 [Marcelo Zoccoler] Running Deep-Learning Scripts in the BiA-PoL O... \n", + "2 [Robert Haase] Browsing the Open Microscopy Image Data Resour... \n", + "3 [Mara Lampert] Getting started with Mambaforge and Python \n", + "9 [Robert Haase] Managing Scientific Python environments using ... \n", + "29 [Robert Haase et al.] BioImage Analysis Notebooks \n", + "\n", + " tags type \\\n", + "1 Python * Artificial Intelligence * Bioimage An... [Blog] \n", + "2 OMERO * Python [Blog] \n", + "3 Python * Conda * Mamba [Blog] \n", + "9 Python * Conda * Mamba [Blog] \n", + "29 Python * Bioimage Analysis [Book, Notebook] \n", + "\n", + " url \\\n", + "1 https://biapol.github.io/blog/marcelo_zoccoler... \n", + "2 https://biapol.github.io/blog/robert_haase/bro... \n", + "3 https://biapol.github.io/blog/mara_lampert/get... \n", + "9 https://focalplane.biologists.com/2022/12/08/m... \n", + "29 https://haesleinhuepf.github.io/BioImageAnalys... \n", + "\n", + " license event_date event_location description \\\n", + "1 CC-BY-4.0 NaN NaN NaN \n", + "2 CC-BY-4.0 NaN NaN NaN \n", + "3 CC-BY-4.0 NaN NaN NaN \n", + "9 CC-BY-4.0 NaN NaN NaN \n", + "29 CC-BY-4.0 * BSD-3-Clause NaN NaN NaN \n", + "\n", + " num_downloads publication_date fingerprint author submission_date \\\n", + "1 NaN NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN NaN \n", + "3 NaN NaN NaN NaN NaN \n", + "9 NaN NaN NaN NaN NaN \n", + "29 NaN NaN NaN NaN NaN \n", + "\n", + " LearningResourceType MediaType \n", + "1 Web Page text \n", + "2 Web Page text \n", + "3 Web Page text \n", + "9 Web Page text \n", + "29 Book * Code Notebook text * code " + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def map_learning_resource(entry):\n", + " # Skip empty or NaN rows\n", + " if entry is None or (isinstance(entry, float) and pd.isna(entry)):\n", + " return \"\"\n", + " # Use a set to avoid duplicates\n", + " matches = set()\n", + 
" if isinstance(entry, list):\n", + " for item in entry:\n", + " if item in type_to_learning_resource:\n", + " matches.add(type_to_learning_resource[item])\n", + " elif entry in type_to_learning_resource:\n", + " matches.add(type_to_learning_resource[entry])\n", + " return \" * \".join([m for m in matches if m is not None])\n", + "\n", + "def map_media_type(entry):\n", + " # Skip empty or NaN rows\n", + " if entry is None or (isinstance(entry, float) and pd.isna(entry)):\n", + " return \"\"\n", + " # Use a set to avoid duplicates\n", + " matches = set()\n", + " if isinstance(entry, list):\n", + " for item in entry:\n", + " if item in type_to_media_type:\n", + " matches.add(type_to_media_type[item])\n", + " elif entry in type_to_media_type:\n", + " matches.add(type_to_media_type[entry])\n", + " return \" * \".join([m for m in matches if m is not None])\n", + "\n", + "# Apply the mapping functions\n", + "data[\"LearningResourceType\"] = data[\"type\"].apply(map_learning_resource)\n", + "data[\"MediaType\"] = data[\"type\"].apply(map_media_type)\n", + "\n", + "data.head()" + ] + }, + { + "cell_type": "markdown", + "id": "19f96ee9-e203-4b19-82b2-96a0e4088383", + "metadata": {}, + "source": [ + "#### 5. 
Change the author names to fit the DALIA format (for persons: surname, prename and for organizations: organization-name)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "827776ce-3be9-4b28-b664-687c7d4fc4ab", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_12251/970863209.py:37: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " data[\"Authors\"] = data[\"authors\"].apply(normalize_author_format)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
authorsnametagstypeurllicenseevent_dateevent_locationdescriptionnum_downloadspublication_datefingerprintauthorsubmission_dateLearningResourceTypeMediaTypeAuthors
1[Marcelo Zoccoler]Running Deep-Learning Scripts in the BiA-PoL O...Python * Artificial Intelligence * Bioimage An...[Blog]https://biapol.github.io/blog/marcelo_zoccoler...CC-BY-4.0NaNNaNNaNNaNNaNNaNNaNNaNWeb PagetextZoccoler, Marcelo
2[Robert Haase]Browsing the Open Microscopy Image Data Resour...OMERO * Python[Blog]https://biapol.github.io/blog/robert_haase/bro...CC-BY-4.0NaNNaNNaNNaNNaNNaNNaNNaNWeb PagetextHaase, Robert
3[Mara Lampert]Getting started with Mambaforge and PythonPython * Conda * Mamba[Blog]https://biapol.github.io/blog/mara_lampert/get...CC-BY-4.0NaNNaNNaNNaNNaNNaNNaNNaNWeb PagetextLampert, Mara
9[Robert Haase]Managing Scientific Python environments using ...Python * Conda * Mamba[Blog]https://focalplane.biologists.com/2022/12/08/m...CC-BY-4.0NaNNaNNaNNaNNaNNaNNaNNaNWeb PagetextHaase, Robert
29[Robert Haase et al.]BioImage Analysis NotebooksPython * Bioimage Analysis[Book, Notebook]https://haesleinhuepf.github.io/BioImageAnalys...CC-BY-4.0 * BSD-3-ClauseNaNNaNNaNNaNNaNNaNNaNNaNBook * Code Notebooktext * codeRobert Haase et al.
\n", + "
" + ], + "text/plain": [ + " authors name \\\n", + "1 [Marcelo Zoccoler] Running Deep-Learning Scripts in the BiA-PoL O... \n", + "2 [Robert Haase] Browsing the Open Microscopy Image Data Resour... \n", + "3 [Mara Lampert] Getting started with Mambaforge and Python \n", + "9 [Robert Haase] Managing Scientific Python environments using ... \n", + "29 [Robert Haase et al.] BioImage Analysis Notebooks \n", + "\n", + " tags type \\\n", + "1 Python * Artificial Intelligence * Bioimage An... [Blog] \n", + "2 OMERO * Python [Blog] \n", + "3 Python * Conda * Mamba [Blog] \n", + "9 Python * Conda * Mamba [Blog] \n", + "29 Python * Bioimage Analysis [Book, Notebook] \n", + "\n", + " url \\\n", + "1 https://biapol.github.io/blog/marcelo_zoccoler... \n", + "2 https://biapol.github.io/blog/robert_haase/bro... \n", + "3 https://biapol.github.io/blog/mara_lampert/get... \n", + "9 https://focalplane.biologists.com/2022/12/08/m... \n", + "29 https://haesleinhuepf.github.io/BioImageAnalys... \n", + "\n", + " license event_date event_location description \\\n", + "1 CC-BY-4.0 NaN NaN NaN \n", + "2 CC-BY-4.0 NaN NaN NaN \n", + "3 CC-BY-4.0 NaN NaN NaN \n", + "9 CC-BY-4.0 NaN NaN NaN \n", + "29 CC-BY-4.0 * BSD-3-Clause NaN NaN NaN \n", + "\n", + " num_downloads publication_date fingerprint author submission_date \\\n", + "1 NaN NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN NaN \n", + "3 NaN NaN NaN NaN NaN \n", + "9 NaN NaN NaN NaN NaN \n", + "29 NaN NaN NaN NaN NaN \n", + "\n", + " LearningResourceType MediaType Authors \n", + "1 Web Page text Zoccoler, Marcelo \n", + "2 Web Page text Haase, Robert \n", + "3 Web Page text Lampert, Mara \n", + "9 Web Page text Haase, Robert \n", + "29 Book * Code Notebook text * code Robert Haase et al. 
" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "import re\n", + "\n", + "def normalize_author_format(authors):\n", + " # Helper function to reformat a single name\n", + " def reformat_name(name):\n", + " # Check if it's already in \"Surname, Prename\" format\n", + " if \",\" in name:\n", + " return name.strip()\n", + " # If in \"Prename Surname\" format, convert to \"Surname, Prename\"\n", + " parts = name.split()\n", + " et_al = ['et', 'al.']\n", + " if len(parts) == 2 and all(p not in et_al for p in parts):\n", + " return f\"{parts[1]}, {parts[0]}\"\n", + " if len(parts) == 3 and all(p not in et_al for p in parts):\n", + " return f\"{parts[2]}, {parts[0]}{parts[1]}\"\n", + " return name.strip() # Return unchanged if not a simple name format\n", + "\n", + "\n", + " # Convert single strings to lists for uniform processing\n", + " if isinstance(authors, str):\n", + " # Split on commas for inline lists like \"Prename Surname, Prename Surname\"\n", + " authors = [a.strip() for a in re.split(r\",\\s*|\\*|\\band\\b\", authors)]\n", + " elif isinstance(authors, list):\n", + " authors = [str(a).strip() for a in authors] # Ensure all elements are strings\n", + "\n", + " # Process each author entry\n", + " formatted_authors = []\n", + " for author in authors:\n", + " formatted_authors.append(reformat_name(author))\n", + "\n", + " # Join all processed names with \"*\"\n", + " return \" * \".join(formatted_authors)\n", + "\n", + "\n", + "# Apply the normalization function\n", + "data[\"Authors\"] = data[\"authors\"].apply(normalize_author_format)\n", + "\n", + "data.head()" + ] + }, + { + "cell_type": "markdown", + "id": "0983e2a5-3f93-4cfa-9f40-3f41233fe77e", + "metadata": {}, + "source": [ + "#### 6. 
Change the names of the columns that already fit the DALIA format to their corresponding names in DALIA" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "4213ac0c-3274-408e-a86d-bc9e61832de8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TitleKeywordsLinkLicenseDescriptionPublicationDateLearningResourceTypeMediaTypeAuthors
1Running Deep-Learning Scripts in the BiA-PoL O...Python * Artificial Intelligence * Bioimage An...https://biapol.github.io/blog/marcelo_zoccoler...CC-BY-4.0NaNNaNWeb PagetextZoccoler, Marcelo
2Browsing the Open Microscopy Image Data Resour...OMERO * Pythonhttps://biapol.github.io/blog/robert_haase/bro...CC-BY-4.0NaNNaNWeb PagetextHaase, Robert
3Getting started with Mambaforge and PythonPython * Conda * Mambahttps://biapol.github.io/blog/mara_lampert/get...CC-BY-4.0NaNNaNWeb PagetextLampert, Mara
9Managing Scientific Python environments using ...Python * Conda * Mambahttps://focalplane.biologists.com/2022/12/08/m...CC-BY-4.0NaNNaNWeb PagetextHaase, Robert
29BioImage Analysis NotebooksPython * Bioimage Analysishttps://haesleinhuepf.github.io/BioImageAnalys...CC-BY-4.0 * BSD-3-ClauseNaNNaNBook * Code Notebooktext * codeRobert Haase et al.
\n", + "
" + ], + "text/plain": [ + " Title \\\n", + "1 Running Deep-Learning Scripts in the BiA-PoL O... \n", + "2 Browsing the Open Microscopy Image Data Resour... \n", + "3 Getting started with Mambaforge and Python \n", + "9 Managing Scientific Python environments using ... \n", + "29 BioImage Analysis Notebooks \n", + "\n", + " Keywords \\\n", + "1 Python * Artificial Intelligence * Bioimage An... \n", + "2 OMERO * Python \n", + "3 Python * Conda * Mamba \n", + "9 Python * Conda * Mamba \n", + "29 Python * Bioimage Analysis \n", + "\n", + " Link \\\n", + "1 https://biapol.github.io/blog/marcelo_zoccoler... \n", + "2 https://biapol.github.io/blog/robert_haase/bro... \n", + "3 https://biapol.github.io/blog/mara_lampert/get... \n", + "9 https://focalplane.biologists.com/2022/12/08/m... \n", + "29 https://haesleinhuepf.github.io/BioImageAnalys... \n", + "\n", + " License Description PublicationDate \\\n", + "1 CC-BY-4.0 NaN NaN \n", + "2 CC-BY-4.0 NaN NaN \n", + "3 CC-BY-4.0 NaN NaN \n", + "9 CC-BY-4.0 NaN NaN \n", + "29 CC-BY-4.0 * BSD-3-Clause NaN NaN \n", + "\n", + " LearningResourceType MediaType Authors \n", + "1 Web Page text Zoccoler, Marcelo \n", + "2 Web Page text Haase, Robert \n", + "3 Web Page text Lampert, Mara \n", + "9 Web Page text Haase, Robert \n", + "29 Book * Code Notebook text * code Robert Haase et al. 
" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Rename columns\n", + "data = data.rename(columns={'name': 'Title', 'license': 'License', 'url': 'Link', 'description': 'Description', 'publication_date': 'PublicationDate', 'tags': 'Keywords'})\n", + "\n", + "# Remove unwanted columns with no important data\n", + "data = data.drop(columns=['event_date', 'event_location', 'num_downloads', 'submission_date', 'fingerprint', 'author', 'type', 'authors'])\n", + "\n", + "data.head()" + ] + }, + { + "cell_type": "markdown", + "id": "69d43abb-d409-49bf-ab21-b79729441d1f", + "metadata": {}, + "source": [ + "#### 7. Introduce the **Community Column**: NFDI4BioImage if it is listed in the tags" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "5a79c41e-6037-44c2-8cdd-0988197de047", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TitleKeywordsLinkLicenseDescriptionPublicationDateLearningResourceTypeMediaTypeAuthorsCommunity
1Running Deep-Learning Scripts in the BiA-PoL O...Python * Artificial Intelligence * Bioimage An...https://biapol.github.io/blog/marcelo_zoccoler...CC-BY-4.0NaNNaNWeb PagetextZoccoler, MarceloNone
2Browsing the Open Microscopy Image Data Resour...OMERO * Pythonhttps://biapol.github.io/blog/robert_haase/bro...CC-BY-4.0NaNNaNWeb PagetextHaase, RobertNone
3Getting started with Mambaforge and PythonPython * Conda * Mambahttps://biapol.github.io/blog/mara_lampert/get...CC-BY-4.0NaNNaNWeb PagetextLampert, MaraNone
9Managing Scientific Python environments using ...Python * Conda * Mambahttps://focalplane.biologists.com/2022/12/08/m...CC-BY-4.0NaNNaNWeb PagetextHaase, RobertNone
29BioImage Analysis NotebooksPython * Bioimage Analysishttps://haesleinhuepf.github.io/BioImageAnalys...CC-BY-4.0 * BSD-3-ClauseNaNNaNBook * Code Notebooktext * codeRobert Haase et al.None
\n", + "
" + ], + "text/plain": [ + " Title \\\n", + "1 Running Deep-Learning Scripts in the BiA-PoL O... \n", + "2 Browsing the Open Microscopy Image Data Resour... \n", + "3 Getting started with Mambaforge and Python \n", + "9 Managing Scientific Python environments using ... \n", + "29 BioImage Analysis Notebooks \n", + "\n", + " Keywords \\\n", + "1 Python * Artificial Intelligence * Bioimage An... \n", + "2 OMERO * Python \n", + "3 Python * Conda * Mamba \n", + "9 Python * Conda * Mamba \n", + "29 Python * Bioimage Analysis \n", + "\n", + " Link \\\n", + "1 https://biapol.github.io/blog/marcelo_zoccoler... \n", + "2 https://biapol.github.io/blog/robert_haase/bro... \n", + "3 https://biapol.github.io/blog/mara_lampert/get... \n", + "9 https://focalplane.biologists.com/2022/12/08/m... \n", + "29 https://haesleinhuepf.github.io/BioImageAnalys... \n", + "\n", + " License Description PublicationDate \\\n", + "1 CC-BY-4.0 NaN NaN \n", + "2 CC-BY-4.0 NaN NaN \n", + "3 CC-BY-4.0 NaN NaN \n", + "9 CC-BY-4.0 NaN NaN \n", + "29 CC-BY-4.0 * BSD-3-Clause NaN NaN \n", + "\n", + " LearningResourceType MediaType Authors Community \n", + "1 Web Page text Zoccoler, Marcelo None \n", + "2 Web Page text Haase, Robert None \n", + "3 Web Page text Lampert, Mara None \n", + "9 Web Page text Haase, Robert None \n", + "29 Book * Code Notebook text * code Robert Haase et al. 
# ---- Cell: Community column -------------------------------------------------

def include_community(entry):
    """Return 'NFDI4Bioimage' when the keywords contain the NFDI4BioImage tag.

    Parameters
    ----------
    entry : str, list or other
        Either a list of tags or a single string in which tags are separated
        by "*" (the delimiter used in the Keywords column).

    Returns
    -------
    str or None
        'NFDI4Bioimage' if any tag equals "nfdi4bioimage" (case-insensitive,
        surrounding whitespace ignored), otherwise None.
    """
    if isinstance(entry, str):
        # Keywords hold several tags in one " * "-delimited string, so the
        # individual tags have to be compared, not the whole string.
        tags = entry.split("*")
    elif isinstance(entry, list):
        tags = entry
    else:
        return None

    for tag in tags:
        if isinstance(tag, str) and tag.strip().lower() == 'nfdi4bioimage':
            return 'NFDI4Bioimage'
    return None


# Apply the function
data['Community'] = data['Keywords'].apply(include_community)
data.head()


# ---- Cell: FileFormat column (file extensions fetched from Zenodo) ----------

import requests
import re
import time


def extract_zenodo_record_id(url):
    """Extract the numeric record ID from a Zenodo record URL.

    Accepts both the current ``/records/<id>`` and the legacy ``/record/<id>``
    URL forms. Returns the ID as a string, or None when ``url`` is not a
    string or is not a Zenodo record link.
    """
    if not isinstance(url, str):
        return None
    match = re.search(r"https?://(?:www\.)?zenodo\.org/records?/(\d+)", url)
    return match.group(1) if match else None


def fetch_file_formats(record_id, timeout=30):
    """Fetch the file extensions of a Zenodo record via the Zenodo REST API.

    Parameters
    ----------
    record_id : str or None
        Zenodo record ID; a falsy value short-circuits to None.
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up.

    Returns
    -------
    str or None
        The distinct lower-cased extensions (without leading dot), sorted and
        joined with " * ", or None when the record has no such files or the
        request fails. Failures are printed, not raised.
    """
    if not record_id:
        return None
    api_url = f"https://zenodo.org/api/records/{record_id}"
    try:
        time.sleep(1)  # Add a 1-second delay between requests
        # Without a timeout a stalled connection would block the notebook forever
        response = requests.get(api_url, timeout=timeout)
        response.raise_for_status()  # Raise an error for non-2xx responses
        record = response.json()
        file_types = {
            file["key"].split(".")[-1].lower()
            for file in record.get("files", [])
            if "." in file["key"]
        }
        return " * ".join(sorted(file_types)) if file_types else None
    except Exception as e:
        # Best effort: one failing record must not abort the whole export
        print(f"Error fetching file formats for record ID {record_id}: {e}")
        return None


def process_links(link_input):
    """Return the file formats of the first Zenodo link that yields any.

    Parameters
    ----------
    link_input : str, list or other
        A single URL or a list of URLs; non-string items are skipped.

    Returns
    -------
    str or None
        Result of ``fetch_file_formats`` for the first Zenodo record link with
        a non-empty result, otherwise None.
    """
    if isinstance(link_input, str):
        links = [link_input]
    elif isinstance(link_input, list):
        links = link_input
    else:
        return None

    for link in links:
        if not isinstance(link, str):
            continue
        record_id = extract_zenodo_record_id(link.strip())
        if record_id:
            file_format = fetch_file_formats(record_id)
            if file_format:  # Return on first valid result
                return file_format
    return None  # Return None if no valid formats are found


# Process the DataFrame
data["FileFormat"] = data["Link"].apply(process_links)
# ---- Cell: fill FileFormat from MediaType where still empty -----------------

def map_file_format(media_type, file_format):
    """Return ``file_format`` if already set, else a default derived from ``media_type``.

    Parameters
    ----------
    media_type : object
        Value of the MediaType column; only the exact values "audio" and
        "video" have a default.
    file_format : object
        Value of the FileFormat column; anything that is not a non-blank
        string (None, NaN, "") counts as missing.

    Returns
    -------
    str
        The existing file format, ".mp3" for audio, ".mp4" for video,
        otherwise "".
    """
    # If FileFormat already has a valid entry, return it as is.
    # The isinstance check also covers NaN, which has no .strip().
    if isinstance(file_format, str) and file_format.strip() != "":
        return file_format

    # Map media types to specific file formats.
    # NOTE(review): these defaults carry a leading dot, while the extensions
    # fetched from Zenodo in the previous cell do not -- confirm which form
    # DALIA expects.
    if media_type == "audio":
        return ".mp3"
    if media_type == "video":
        return ".mp4"
    return ""  # Return empty string if no mapping is needed


# Apply the mapping function
data["FileFormat"] = data.apply(
    lambda row: map_file_format(row["MediaType"], row["FileFormat"]),
    axis=1
)


# ---- Cell: Link column format ------------------------------------------------

# Make * Delimiter for the Links if there is more than one for some entries
data["Link"] = data["Link"].apply(lambda x: ' * '.join(x) if isinstance(x, list) else x)  # URL
# ---- Cell: Language column ---------------------------------------------------

from transformers import pipeline

model_ckpt = "papluca/xlm-roberta-base-language-detection"
pipe = pipeline("text-classification", model=model_ckpt)


def detect_language(text):
    """Return "en" or "de" for ``text``, or "" when neither can be determined.

    Parameters
    ----------
    text : object
        The text to classify; anything that is not a non-blank string
        (None, NaN, "") yields "" without calling the model.

    Returns
    -------
    str
        The top-1 label of the language-detection pipeline if it is "en" or
        "de", otherwise "".
    """
    # Missing titles would otherwise be passed to the tokenizer as non-text input
    if not isinstance(text, str) or not text.strip():
        return ""
    lang = pipe([text], top_k=1, truncation=True)[0][0]["label"]
    return lang if lang in ("en", "de") else ""


data["Language"] = data["Title"].apply(detect_language)


# ---- Cell: export --------------------------------------------------------------

# save selected data
data.to_csv(destination, index=False)

num_rows = data.shape[0]
print(f"Exported {num_rows} rows.")