diff --git a/.gitignore b/.gitignore index 47d8e24..405f6f7 100644 --- a/.gitignore +++ b/.gitignore @@ -160,4 +160,5 @@ cython_debug/ #.idea/ # Repo specific -WSC_non_mapped_lines.txt \ No newline at end of file +WSC_non_mapped_lines.txt +build.bat \ No newline at end of file diff --git a/README.md b/README.md index 6e3e4b7..0d752ba 100644 --- a/README.md +++ b/README.md @@ -10,10 +10,18 @@ For example, obstructive apneas may be formatted as: 'Obs Apnea', 'OBS Apnea', ' Other issues relates to missing information in the annotations' columns, inconsistent time formats (most are 24h, some am/pm) and other dirty bits that are not necessary and complicate automatic parsing. ## How to use this script -Clone the repo or download the `wsc_clean.py` file and run it from command line as: +Clone the repo or download the `wsc_clean.py` and `mappings.txt` files and run the script from the command line as: `python wsc_clean.py ` +The package can also be installed from PyPI: + +`pip install wisconsinsc_cleaner` + +and executed as: + +`wsc_clean ` + ## Content of this repo A single python script (no installation needed) parses all the annotation files and produce another set of annotation files with the suffix `.uniform.txt`. The mapping of annotations is available in the `mappings.txt` file in the form `A|B|C` (see [https://zzz.bwh.harvard.edu/luna/ref/annotations/#remap] for details), meaning that every instance of `B` or `C` will be mapped as `A`. If a mapping does not exist, the original value is returned with a prefix `misc:`. 
diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..c0604e5 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,34 @@ +[build-system] +requires = [ + "setuptools" +] +build-backend = "setuptools.build_meta" + +[tool.setuptools.package-data] +wisconsinsc_cleaner = ["*.txt"] + +[project] +name = "wisconsinsc_cleaner" +version = "0.0.1" +description = "A tool to clean and uniform annotation files from Wisconsin Sleep Cohort (WSC), distributed by NSRR" +readme = "README.md" +requires-python = ">=3.8" +classifiers = [ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", + "Development Status :: 4 - Beta", +] +keywords = ['data storage', 'data cleaning', 'medical records'] + +[project.scripts] +wsc_clean = "wisconsinsc_cleaner.wsc_clean:main" + +[project.urls] +"Homepage" = "https://github.com/LucaCerina/WisconsinSC_cleaner" +"Bug Tracker" = "https://github.com/LucaCerina/WisconsinSC_cleaner/issues" + +[metadata] +author = "LucaCerina" +author_email = "lccerina@duck.com" +license = "MIT" \ No newline at end of file diff --git a/mappings.txt b/wisconsinsc_cleaner/mappings.txt similarity index 99% rename from mappings.txt rename to wisconsinsc_cleaner/mappings.txt index 71fb3b6..5469bfb 100644 --- a/mappings.txt +++ b/wisconsinsc_cleaner/mappings.txt @@ -122,7 +122,7 @@ biocal:look_left|LOOK LEFT|eyes left|eyes left and back|move eyes left & center| biocal:look_right|LOOK RIGHT|eyes right|look r|eyes right and back|move eyes right & center|move eyes right, back biocal:look_up|LOOK UP|eyes up|eyes up and back|move eyes up, back|up biocal:look_down|LOOK DOWN|eyes down and back|move eyes down, back|eyes down -biocal:look_left_righteyes left and right +biocal:look_left_right|eyes left and right biocal:teeth_grit|GRIT TEETH biocal:clenc_jaw|clench jaw||clench teeth|clench diff --git a/wsc_clean.py b/wisconsinsc_cleaner/wsc_clean.py similarity index 98% rename from wsc_clean.py rename 
to wisconsinsc_cleaner/wsc_clean.py index 8852dde..9e13e94 100644 --- a/wsc_clean.py +++ b/wisconsinsc_cleaner/wsc_clean.py @@ -6,6 +6,7 @@ from csv import DictReader, DictWriter from datetime import datetime, timedelta from glob import glob +from importlib import resources from itertools import zip_longest from time import perf_counter from typing import Tuple, Union @@ -38,10 +39,12 @@ def datetime_sorter(start_time:datetime, input_time:datetime) -> int: delta += timedelta(days=1) return delta.total_seconds() -def load_mappings(mapping_file:str) -> dict: +def load_mappings() -> dict: """Convert maps in mappings.txt file to a dictionary to be used later. Raise ValueError in case of badly formatted map """ + # TODO bit of a clunky path here + mapping_file = resources._get_package('wisconsinsc_cleaner').__path__._path[0]+'/mappings.txt' assert Path(mapping_file).exists(), f"Mapping file {mapping_file} not found." with open(mapping_file) as map_file: maps = map_file.read().splitlines() @@ -414,7 +417,7 @@ def process_gamma_log(recording:str, input_filename:str, output_filename:str, ma return no_error, unmapped -if __name__ == "__main__": +def main(): # Get data folder if len(sys.argv)<2: print("Error! Usage wsc_clean.py ") @@ -426,7 +429,7 @@ def process_gamma_log(recording:str, input_filename:str, output_filename:str, ma # Load annotations mappings print("Loading mappings") - mapping = load_mappings('./mappings.txt') + mapping = load_mappings() # Get all recordings print("Identifying recordings") @@ -483,4 +486,6 @@ def process_gamma_log(recording:str, input_filename:str, output_filename:str, ma with open(f'./{non_mapped_filename}', 'w', encoding='utf-8') as nfile: for line in non_mapped_lines: nfile.write(line+'\n') - \ No newline at end of file + +if __name__ == "__main__": + main() \ No newline at end of file