custom.bib

% Journal article on active learning for clinical named entity recognition.
% The page range uses an en-dash (double hyphen) and sits on a single line.
@article{Chen2015ASO,
  author  = {Chen, Yukun and Lasko, Thomas A. and Mei, Qiaozhu and Denny, Joshua Charles and Xu, Hua},
  title   = {A study of active learning methods for named entity recognition in clinical text},
  journal = {Journal of Biomedical Informatics},
  year    = {2015},
  volume  = {58},
  pages   = {11--18},
  url     = {https://api.semanticscholar.org/CorpusID:3724046}
}

% Microsoft Presidio project reference (project website).
% The product name and the PII acronym are brace-protected so that
% sentence-casing styles do not lowercase them.
@misc{Presidio,
  author       = {Mendels, Omri and Peled, Coby and Vaisman Levy, Nava and Rosenthal, Tomer and Lahiani, Limor and others},
  title        = {{Microsoft Presidio}: Context aware, pluggable and customizable {PII} anonymization service for text and images},
  year         = {2018},
  organization = {Microsoft},
  url          = {https://microsoft.github.io/presidio}
}

% Kaggle competition page for the PII data detection challenge.
% howpublished carries the host platform because classic styles do not
% print publisher for misc entries; publisher is kept for other tooling.
@misc{pii-detection-removal-from-educational-data,
  author       = {Holmes, Langdon and Crossley, Scott and Baffour, Perpetual and King, Jules and Burleigh, Lauryn and Demkin, Maggie and Holbrook, Ryan and Reade, Walter and Howard, Addison},
  title        = {The {Learning Agency Lab} -- {PII} Data Detection},
  howpublished = {Kaggle},
  publisher    = {Kaggle},
  year         = {2024},
  url          = {https://kaggle.com/competitions/pii-detection-removal-from-educational-data}
}

% Longformer arXiv preprint. This is the same work as the dblp-exported
% entry abs-2004-05150 in this file; the key is kept so existing citations
% still resolve. The arXiv identifier lives in the eprint fields rather
% than in the journal field.
@article{Beltagy2020Longformer,
  author     = {Beltagy, Iz and Peters, Matthew E. and Cohan, Arman},
  title      = {Longformer: The Long-Document Transformer},
  journal    = {CoRR},
  volume     = {abs/2004.05150},
  year       = {2020},
  eprint     = {2004.05150},
  eprinttype = {arXiv},
  url        = {https://arxiv.org/abs/2004.05150}
}


% spaCy software record (the DOI points at a Zenodo deposit).
% Typed as misc because there is no journal; only the case-sensitive
% names in the title are brace-protected. The trailing fields are
% BibSonomy export metadata and are ignored by bibliography styles.
@misc{honnibal2020spacy,
  author       = {Honnibal, Matthew and Montani, Ines and Van Landeghem, Sofie and Boyd, Adriane},
  title        = {{spaCy}: Industrial-strength Natural Language Processing in {Python}},
  howpublished = {Zenodo},
  year         = {2020},
  doi          = {10.5281/zenodo.1212303},
  keywords     = {nlp},
  added-at     = {2023-05-22T04:49:27.000+0200},
  timestamp    = {2023-05-22T04:49:27.000+0200},
  biburl       = {https://www.bibsonomy.org/bibtex/2616669ca18ac051794c0459373696942/rerry},
  interhash    = {2d1b3a0bb97e51df1b88d8852cd5ac01},
  intrahash    = {616669ca18ac051794c0459373696942}
}

% PySyft book chapter.
% NOTE(review): booktitle and publisher were filled in by hand from the
% Springer volume listing -- confirm against the publisher record, then
% add editor, pages and DOI.
@incollection{Ziller2021PySyftAL,
  author    = {Alexander Ziller and Andrew Trask and Antonio Lopardo and
               Benjamin Szymkow and Bobby Wagner and Emma Bluemke and
               Jean-Mickael Nounahon and Jonathan Passerat-Palmbach and
               Kritika Prakash and Nick Rose and Th{\'e}o Ryffel and
               Zarreen Naowal Reza and Georgios Kaissis},
  title     = {{PySyft}: A Library for Easy Federated Learning},
  booktitle = {Federated Learning Systems: Towards Next-Generation {AI}},
  publisher = {Springer},
  year      = {2021},
  url       = {https://api.semanticscholar.org/CorpusID:236690571}
}

% Master's thesis (TU Delft) on detecting PII in Git commits.
% month uses the predefined macro; the day of the former ISO date field
% is kept in the ignored day field so year/month are not given twice.
% program, mentor, committee, abstract and keywords are repository
% metadata that standard styles ignore.
@mastersthesis{vanderplas2022detecting,
  author = {van der Plas, Niek},
  title = {Detecting {PII} in {Git} Commits},
  school = {Delft University of Technology},
  type = {Master's thesis},
  year = {2022},
  month = jul,
  day = {4},
  url = {http://resolver.tudelft.nl/uuid:fe195c17-ecf5-4811-a987-89f238a6802f},
  note = {TU Delft Electrical Engineering, Mathematics and Computer Science},
  program = {Computer Science | Software Technology},
  abstract = {With the advancement of technology, organizations are experiencing more trouble with keeping their data private with it often leaked to the public via their code-repositories or databases. There are methods to counter the leakage of data while pushing code to a repository however, these are heavily reliant on regular expressions. Personal names, locations and other Personally Identifiable Information (PII) do not follow a reoccurring pattern and can thus only be prevented by manual code reviews, which are also prone to errors. A tool to detect these PII should be designed as an initial measure to counteract the leakage. In this paper, we propose a heavily modifiable tool in which we combine the strength of regular expressions with a state-of-the-art machine learning model to detect a variety of important PII within the code changes of Python software projects. We use CodeBERT, a RoBERTa-like Transformer model, as our PII recognizer. This recognizer is fine-tuned using the Scikit-learn library of which we injected the git commits with fake sensitive data. To test and improve the quality of the model and the entire tool, we design an experimental methodology to find the optimal value for the hyper parameters of the model, compare it against another Transformer model and run the fine-tuned model against several other code-bases with different programming languages. The outcome of these experiments benefit the quality of the model in a positive way and allows us to design a robust tool with a well-performing machine learning model to detect a variety of entities. This tool can be personalized to any business and mitigate a significant part of the potential data leaks.},
  keywords = {PII detection, git commits, codebert, machine learning},
  mentor = {Cruz, Luis ORCID 0000-0002-1615-355X},
  committee = {Oliveira, Luiz and van Deursen, A. ORCID 0000-0003-4850-3312}
}

% Video published on the Databricks YouTube channel.
% The link lives in the url field (styles add the URL formatting
% themselves); month uses the predefined macro. day, address and
% description are not printed by standard styles for misc entries.
@misc{AzizStraiton2023PIIDetection,
  author       = {Aziz, Ajmal and Straiton, Rachael},
  title        = {{PII} Detection at Scale on the Lakehouse},
  howpublished = {YouTube},
  year         = {2023},
  month        = jul,
  day          = {26},
  note         = {Channel: Databricks},
  url          = {https://www.youtube.com/watch?v=nTAKQuxZ9lI},
  organization = {Databricks},
  address      = {San Francisco},
  description  = {SEEK is Australia's largest online employment marketplace and a market leader spanning ten countries across Asia Pacific and Latin America. SEEK provides employment opportunities for roughly 16 million monthly active users and processes 25 million candidate applications to listings. Processing millions of resumes involves handling and managing highly sensitive candidate information, usually inputted in a highly unstructured format. With recent high-profile data leaks in Australia, personally identifiable information (PII) protection has become a major focus area for large digital organizations.}
}

% Longformer preprint as exported from dblp (CoRR, arXiv 2004.05150).
% Bibliographic fields come first, dblp provenance fields last.
@article{abs-2004-05150,
  author     = {Beltagy, Iz and Peters, Matthew E. and Cohan, Arman},
  title      = {Longformer: The Long-Document Transformer},
  journal    = {CoRR},
  volume     = {abs/2004.05150},
  year       = {2020},
  eprinttype = {arXiv},
  eprint     = {2004.05150},
  url        = {https://arxiv.org/abs/2004.05150},
  timestamp  = {Tue, 14 Apr 2020 16:40:34 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2004-05150.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}

% Faker library (GitHub repository reference).
% Additional contributors are expressed with the "and others" token so
% they are not parsed as a person named "Other Contributors".
% version and license are informational fields ignored by classic styles.
@misc{Faraglia2023Faker,
  author  = {Faraglia, Daniele and others},
  title   = {Faker},
  year    = {2023},
  version = {1.2.0},
  license = {MIT},
  url     = {https://github.com/joke2k/faker}
}

% Weights and Biases experiment-tracking software.
% The product name is brace-protected so sentence-casing styles keep
% its capitalisation.
@misc{wandb,
  author = {Biewald, Lukas},
  title  = {Experiment Tracking with {Weights and Biases}},
  year   = {2020},
  note   = {Software available from wandb.com},
  url    = {https://www.wandb.com/}
}