Skip to content

Commit

Permalink
improved bibtex entries for ML domain page (#1455)
Browse files Browse the repository at this point in the history
  • Loading branch information
bianchini88 authored Jan 25, 2024
1 parent 5ce5660 commit 88ef096
Show file tree
Hide file tree
Showing 2 changed files with 138 additions and 109 deletions.
231 changes: 130 additions & 101 deletions _bibliography/references.bib
Original file line number Diff line number Diff line change
@@ -1,104 +1,133 @@
@article{Huerta2023,
  author  = {Huerta, E. A. and others},
  title   = {{FAIR} for {AI}: An interdisciplinary and international community building perspective},
  journal = {Scientific Data},
  year    = {2023},
  volume  = {10},
  number  = {1},
  pages   = {487},
  doi     = {10.1038/s41597-023-02298-6},
  url     = {https://doi.org/10.1038/s41597-023-02298-6}
}

@article{Walsh2021,
  author  = {Walsh, Ian and others},
  title   = {Author Correction: {DOME}: recommendations for supervised machine learning validation in biology},
  journal = {Nature Methods},
  year    = {2021},
  volume  = {18},
  number  = {11},
  pages   = {1409--1410},
  doi     = {10.1038/s41592-021-01304-2},
  url     = {https://doi.org/10.1038/s41592-021-01304-2}
}

@inproceedings{Katz2021WorkingTU,
  author    = {Katz, Daniel S. and Psomopoulos, Fotis E. and Castro, Leyla Jael},
  title     = {Working Towards Understanding the Role of {FAIR} for Machine Learning},
  booktitle = {Workshop on Data and Research Objects Management for Linked Open Science},
  year      = {2021},
  url       = {https://api.semanticscholar.org/CorpusID:242926042}
}

@article{Ravi2022,
  author  = {Ravi, Nikil and others},
  title   = {{FAIR} principles for {AI} models with a practical application for accelerated high energy diffraction microscopy},
  journal = {Scientific Data},
  year    = {2022},
  volume  = {9},
  number  = {1},
  pages   = {657},
  doi     = {10.1038/s41597-022-01712-9},
  url     = {https://doi.org/10.1038/s41597-022-01712-9}
}

@article{Williamson2023,
  author  = {Williamson, Hugh F. and others},
  title   = {Data management challenges for artificial intelligence in plant and agricultural research},
  journal = {F1000Research},
  year    = {2023},
  volume  = {10},
  pages   = {324},
  doi     = {10.12688/f1000research.52204.2}
}

@incollection{Sansone2023,
  author    = {Sansone, Susanna-Assunta and Rocca-Serra, Philippe and Wilkinson, Mark and Harland, Lee},
  title     = {{FAIR}: Making Data {AI-Ready}},
  booktitle = {Artificial Intelligence for Science},
  publisher = {World Scientific},
  year      = {2023},
  chapter   = {33},
  pages     = {627--640},
  doi       = {10.1142/9789811265679_0033},
  url       = {https://www.worldscientific.com/doi/abs/10.1142/9789811265679_0033}
}

@incollection{Psomopoulos2023,
  author    = {Psomopoulos, Fotis and Goble, Carole and Castro, Leyla Jael and Harrow, Jennifer and Tosatto, Silvio C. E.},
  title     = {A Roadmap for Defining Machine Learning Standards in Life Sciences},
  booktitle = {Artificial Intelligence for Science},
  publisher = {World Scientific},
  year      = {2023},
  chapter   = {22},
  pages     = {399--410},
  doi       = {10.1142/9789811265679_0022},
  url       = {https://www.worldscientific.com/doi/abs/10.1142/9789811265679_0022}
}

@article{Comess2020,
  author  = {Comess, Saskia and others},
  title   = {Bringing Big Data to Bear in Environmental Public Health: Challenges and Recommendations},
  journal = {Frontiers in Artificial Intelligence},
  year    = {2020},
  volume  = {3},
  pages   = {31},
  doi     = {10.3389/frai.2020.00031}
}

@article{Berisha2021,
  author  = {Berisha, Visar and Krantsevich, Chelsea and Hahn, P. Richard and Hahn, Shira and Dasarathy, Gautam and Turaga, Pavan and Liss, Julie},
  title   = {Digital medicine and the curse of dimensionality},
  journal = {npj Digital Medicine},
  year    = {2021},
  volume  = {4},
  number  = {1},
  pages   = {153},
  doi     = {10.1038/s41746-021-00521-5},
  url     = {https://doi.org/10.1038/s41746-021-00521-5}
}
@article{walsh2021Author,
  author     = {Walsh, Ian and Fishman, Dmytro and {Garcia-Gasulla}, Dario and Titma, Tiina and Pollastri, Gianluca and {ELIXIR Machine Learning Focus Group} and Capriotti, Emidio and Casadio, Rita and {Capella-Gutierrez}, Salvador and Cirillo, Davide and Del Conte, Alessio and Dimopoulos, Alexandros C. and Del Angel, Victoria Dominguez and Dopazo, Joaquin and Fariselli, Piero and Fern{\'a}ndez, Jos{\'e} Maria and Huber, Florian and Kreshuk, Anna and Lenaerts, Tom and Martelli, Pier Luigi and Navarro, Arcadi and {\'O} Broin, Pilib and Pi{\~n}ero, Janet and Piovesan, Damiano and Reczko, Martin and Ronzano, Francesco and Satagopam, Venkata and Savojardo, Castrense and Spiwok, Vojtech and Tangaro, Marco Antonio and Tartari, Giacomo and Salgado, David and Valencia, Alfonso and Zambelli, Federico and Harrow, Jennifer and Psomopoulos, Fotis E. and Tosatto, Silvio C. E.},
  title      = {Author {{Correction}}: {{DOME}}: Recommendations for Supervised Machine Learning Validation in Biology},
  shorttitle = {Author {{Correction}}},
  journal    = {Nature Methods},
  year       = {2021},
  month      = nov,
  volume     = {18},
  number     = {11},
  pages      = {1409--1410},
  issn       = {1548-7091, 1548-7105},
  doi        = {10.1038/s41592-021-01304-2},
  urldate    = {2024-01-25},
  langid     = {english}
}

@article{williamson2023Data,
  author  = {Williamson, Hugh F. and Brettschneider, Julia and Caccamo, Mario and Davey, Robert P. and Goble, Carole and Kersey, Paul J. and May, Sean and Morris, Richard J. and Ostler, Richard and Pridmore, Tony and Rawlings, Chris and Studholme, David and Tsaftaris, Sotirios A. and Leonelli, Sabina},
  title   = {Data Management Challenges for Artificial Intelligence in Plant and Agricultural Research},
  journal = {F1000Research},
  year    = {2023},
  month   = jan,
  volume  = {10},
  pages   = {324},
  issn    = {2046-1402},
  doi     = {10.12688/f1000research.52204.2},
  urldate = {2024-01-25},
  langid  = {english}
}

@article{ravi2022FAIR,
  author  = {Ravi, Nikil and Chaturvedi, Pranshu and Huerta, E. A. and Liu, Zhengchun and Chard, Ryan and Scourtas, Aristana and Schmidt, K. J. and Chard, Kyle and Blaiszik, Ben and Foster, Ian},
  title   = {{{FAIR}} Principles for {{AI}} Models with a Practical Application for Accelerated High Energy Diffraction Microscopy},
  journal = {Scientific Data},
  year    = {2022},
  month   = nov,
  volume  = {9},
  number  = {1},
  pages   = {657},
  issn    = {2052-4463},
  doi     = {10.1038/s41597-022-01712-9},
  urldate = {2024-01-25},
  langid  = {english}
}

@incollection{sansone2023FAIR,
  author     = {Sansone, Susanna-Assunta and {Rocca-Serra}, Philippe and Wilkinson, Mark and Harland, Lee},
  title      = {{{FAIR}}: {{Making Data AI-Ready}}},
  shorttitle = {{{FAIR}}},
  booktitle  = {Artificial {{Intelligence}} for {{Science}}},
  editor     = {Choudhary, Alok and Fox, Geoffrey and Hey, Tony},
  publisher  = {World Scientific},
  year       = {2023},
  month      = apr,
  pages      = {627--640},
  isbn       = {9789811265662 9789811265679},
  doi        = {10.1142/9789811265679_0033},
  urldate    = {2024-01-25},
  langid     = {english}
}

@article{huerta2023FAIR,
  author     = {Huerta, E. A. and Blaiszik, Ben and Brinson, L. Catherine and Bouchard, Kristofer E. and Diaz, Daniel and Doglioni, Caterina and Duarte, Javier M. and Emani, Murali and Foster, Ian and Fox, Geoffrey and Harris, Philip and Heinrich, Lukas and Jha, Shantenu and Katz, Daniel S. and Kindratenko, Volodymyr and Kirkpatrick, Christine R. and {Lassila-Perini}, Kati and Madduri, Ravi K. and Neubauer, Mark S. and Psomopoulos, Fotis E. and Roy, Avik and R{\"u}bel, Oliver and Zhao, Zhizhen and Zhu, Ruike},
  title      = {{{FAIR}} for {{AI}}: {{An}} Interdisciplinary and International Community Building Perspective},
  shorttitle = {{{FAIR}} for {{AI}}},
  journal    = {Scientific Data},
  year       = {2023},
  month      = jul,
  volume     = {10},
  number     = {1},
  pages      = {487},
  issn       = {2052-4463},
  doi        = {10.1038/s41597-023-02298-6},
  urldate    = {2024-01-25},
  langid     = {english}
}

@incollection{psomopoulos2023Roadmap,
  author    = {Psomopoulos, Fotis and Goble, Carole and Castro, Leyla Jael and Harrow, Jennifer and Tosatto, Silvio C. E.},
  title     = {A {{Roadmap}} for {{Defining Machine Learning Standards}} in {{Life Sciences}}},
  booktitle = {Artificial {{Intelligence}} for {{Science}}},
  editor    = {Choudhary, Alok and Fox, Geoffrey and Hey, Tony},
  publisher = {World Scientific},
  year      = {2023},
  month     = apr,
  pages     = {399--410},
  isbn      = {9789811265662 9789811265679},
  doi       = {10.1142/9789811265679_0022},
  urldate   = {2024-01-25},
  langid    = {english}
}

@article{berisha2021Digital,
  author  = {Berisha, Visar and Krantsevich, Chelsea and Hahn, P. Richard and Hahn, Shira and Dasarathy, Gautam and Turaga, Pavan and Liss, Julie},
  title   = {Digital Medicine and the Curse of Dimensionality},
  journal = {npj Digital Medicine},
  year    = {2021},
  month   = oct,
  volume  = {4},
  number  = {1},
  pages   = {153},
  issn    = {2398-6352},
  doi     = {10.1038/s41746-021-00521-5},
  urldate = {2024-01-25},
  langid  = {english}
}

@inproceedings{castro2021Working,
  author    = {Castro, Leyla Jael and Katz, Daniel S. and Psomopoulos, Fotis},
  title     = {Working {{Towards Understanding}} the {{Role}} of {{FAIR}} for {{Machine Learning}}},
  booktitle = {Workshop on Data and Research Objects Management for Linked Open Science},
  publisher = {{PUBLISSO}},
  year      = {2021},
  doi       = {10.4126/FRL01-006429415},
  urldate   = {2024-01-25},
  langid    = {english},
  keywords  = {FAIR,Machine Learning,Metadata}
}

@article{comess2020Bringing,
  author     = {Comess, Saskia and Akbay, Alexia and Vasiliou, Melpomene and Hines, Ronald N. and Joppa, Lucas and Vasiliou, Vasilis and Kleinstreuer, Nicole},
  title      = {Bringing {{Big Data}} to {{Bear}} in {{Environmental Public Health}}: {{Challenges}} and {{Recommendations}}},
  shorttitle = {Bringing {{Big Data}} to {{Bear}} in {{Environmental Public Health}}},
  journal    = {Frontiers in Artificial Intelligence},
  year       = {2020},
  month      = may,
  volume     = {3},
  pages      = {31},
  issn       = {2624-8212},
  doi        = {10.3389/frai.2020.00031},
  urldate    = {2024-01-25}
}

@article{field2008migs,
Expand Down
16 changes: 8 additions & 8 deletions pages/your_domain/machine_learning.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,20 +15,20 @@ training:
## What is machine learning (on this page)?

### Description
The definition of Machine Learning (ML) can become so vague that it can reach the point of becoming a philosophical question. There are two main approaches; the first one considers the model as the centre of the definition. In this perspective, ML is defined mainly by the characteristics and capabilities of the model employed. This approach focuses on the technical aspects of ML, by emphasising factors such as algorithms, data processing and model performance {% cite Ravi2022 %}.
The definition of Machine Learning (ML) can become so vague that it can reach the point of becoming a philosophical question. There are two main approaches; the first one considers the model as the centre of the definition. In this perspective, ML is defined mainly by the characteristics and capabilities of the model employed. This approach focuses on the technical aspects of ML, by emphasising factors such as algorithms, data processing and model performance {% cite ravi2022FAIR %}.
In contrast, the second approach places the entire ML process at the centre of the definition of ML. In this form, the entire process is seen as a single entity, which includes all internal aspects/steps such as data collection, preprocessing, feature engineering, model training, evaluation and deployment. This approach acknowledges that all these stages are interconnected.

For the purposes of this page, we will be using the first definition, i.e. the model-centred approach. The reasons behind this choice mainly include the technical focus with a strong emphasis on the technical aspects of ML and the performance evaluation because it provides a clear framework for comparing against other models.

## What is FAIR in machine learning?

### Description
As global standards for good data stewardship, the FAIR principles have become instrumental in ML, impacting policies and practices across sectors {% cite Psomopoulos2023 %}. Embraced by policymakers and research institutes, implementing FAIR in ML enhances technical performance and proves economically advantageous.
As global standards for good data stewardship, the FAIR principles have become instrumental in ML, impacting policies and practices across sectors {% cite psomopoulos2023Roadmap %}. Embraced by policymakers and research institutes, implementing FAIR in ML enhances technical performance and proves economically advantageous.

FAIR is an aspirational guideline, not a stringent standard, propelling better data utilisation in ML. Applying the FAIR principles in ML involves creating easily discoverable models with comprehensive metadata, openly accessible with clear licensing, compatible with other systems via standardised formats, and designed for reuse with meticulous documentation, licensing and versioning {% cite Huerta2023 %}.
Actualising these principles necessitates infrastructural adjustments and stakeholder education {% cite Katz2021WorkingTU %}.
FAIR is an aspirational guideline, not a stringent standard, propelling better data utilisation in ML. Applying the FAIR principles in ML involves creating easily discoverable models with comprehensive metadata, openly accessible with clear licensing, compatible with other systems via standardised formats, and designed for reuse with meticulous documentation, licensing and versioning {% cite huerta2023FAIR %}.
Actualising these principles necessitates infrastructural adjustments and stakeholder education {% cite castro2021Working %}.

FAIR ML holds immense potential in life sciences, from accelerating drug discovery processes to driving innovative bioinformatics applications through data interoperability from various sources. The transformative power of FAIR extends to enhancing predictive modelling, genetic research, disease diagnosis, and much more, demonstrating the criticality of its adoption for the future of life sciences {% cite Ravi2022 %}.
FAIR ML holds immense potential in life sciences, from accelerating drug discovery processes to driving innovative bioinformatics applications through data interoperability from various sources. The transformative power of FAIR extends to enhancing predictive modelling, genetic research, disease diagnosis, and much more, demonstrating the criticality of its adoption for the future of life sciences {% cite ravi2022FAIR %}.


### Considerations
Expand All @@ -44,7 +44,7 @@ There are several key considerations in applying the FAIR principles in ML and t
- Evaluation: How can FAIR models be transparently evaluated? What role does documentation of performance metrics, validation procedures, and evaluation results play in providing users with an understanding of a model’s capabilities and limitations?

### Solutions
Applying the FAIR principles in ML is the focal point of initiatives such as the ELIXIR {% tool "dome" %} {% cite Walsh2021 %} recommendations and the [ELIXIR ML Focus Group](https://elixir-europe.org/focus-groups/machine-learning). The following solutions apply to the above-mentioned considerations at a high level:
Applying the FAIR principles in ML is the focal point of initiatives such as the ELIXIR {% tool "dome" %} {% cite walsh2021Author %} recommendations and the [ELIXIR ML Focus Group](https://elixir-europe.org/focus-groups/machine-learning). The following solutions apply to the above-mentioned considerations at a high level:
- Use community-backed and standardised metadata that includes details about the model's authors, creation date, model type, training data, intended tasks, and performance metrics (such as {% tool "bioimage" %}, {% tool "schema-org" %}, {% tool "dome" %}, and {% tool "onnx" %}).
- Assign a unique and persistent identifier to each model. This identifier should be linked to the model's metadata to improve searchability.
- Ensure that models are open-sourced and shared on public platforms (e.g. {% tool "github" %} or {% tool "huggingface" %}) to improve accessibility.
Expand All @@ -61,9 +61,9 @@ Applying the FAIR principles in ML is the focal point of initiatives such as the
## Data readiness for the ML models

### Description
The success of an ML model depends on the input data. Finding an appropriate dataset can be a challenge. The data has to be cleaned, explored, and evaluated before ML model training. Data preparation is often the most time-consuming step in the AI lifecycle. The “Garbage in, Garbage out” principle (GIGO) has to be kept in mind at this stage: models trained on unreliable data will produce unreliable predictions {% cite Sansone2023 %}.
The success of an ML model depends on the input data. Finding an appropriate dataset can be a challenge. The data has to be cleaned, explored, and evaluated before ML model training. Data preparation is often the most time-consuming step in the AI lifecycle. The “Garbage in, Garbage out” principle (GIGO) has to be kept in mind at this stage: models trained on unreliable data will produce unreliable predictions {% cite sansone2023FAIR %}.

To prepare data for ML models, several steps need to be followed. Firstly, data should be collected from reliable and diverse resources, ensuring it represents the problem domain accurately. Then, data cleaning techniques such as removing duplicates, handling missing values, and addressing outliers should be applied to enhance data quality. A commonly used practice for this task is the implementation of {% tool "scikit-learn" %}. Regarding features, on one hand there is the selection or extraction that can be performed to identify the most relevant and informative attributes for the model {% cite Comess2020 %}. On the other hand, there is the problem of the number of features which exceeds that of observations due to the nature of data in life sciences {% cite Berisha2021 %}. Furthermore, data normalisation and standardisation might be necessary to scale features appropriately. Also, over or under-sampling can be used to handle an imbalanced dataset {% cite Williamson2023 %}.
To prepare data for ML models, several steps need to be followed. Firstly, data should be collected from reliable and diverse resources, ensuring it represents the problem domain accurately. Then, data cleaning techniques such as removing duplicates, handling missing values, and addressing outliers should be applied to enhance data quality. A commonly used practice for this task is the implementation of {% tool "scikit-learn" %}. Regarding features, on one hand there is the selection or extraction that can be performed to identify the most relevant and informative attributes for the model {% cite comess2020Bringing %}. On the other hand, there is the problem of the number of features which exceeds that of observations due to the nature of data in life sciences {% cite berisha2021Digital %}. Furthermore, data normalisation and standardisation might be necessary to scale features appropriately. Also, over or under-sampling can be used to handle an imbalanced dataset {% cite williamson2023Data %}.

### Considerations
- The type of the data (text, images, time series, and others).
Expand Down

0 comments on commit 88ef096

Please sign in to comment.