From e3d1b5ed03d5ff2e29ad9789314291bcb96ecd92 Mon Sep 17 00:00:00 2001 From: Karim-53 <33978275+Karim-53@users.noreply.github.com> Date: Mon, 7 Jun 2021 19:49:43 +0200 Subject: [PATCH 1/7] fix --surah argument --- download.py | 24 +++++++++++++----------- requirements.txt | 1 + 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/download.py b/download.py index 6c666a2..aa2bb3c 100644 --- a/download.py +++ b/download.py @@ -3,7 +3,11 @@ A file for downloading audio recordings from the Tarteel V1 dataset. Contributed by @kareemn. -Example usage: python download.py -s 1 --use-cache +Example usage 1: download only the audio related to surah 1 (Al-Fatiha) 140 Mb +python download.py -s 1 --use-cache --keep-downloaded-audio + +Example usage 2: download the entire audio dataset +python download.py --use-cache --keep-downloaded-audio """ from argparse import ArgumentParser @@ -27,7 +31,7 @@ parser.add_argument('--local-csv-filename', type=str, default='local.csv') parser.add_argument('--cache-dir', type=str, default='.cache') parser.add_argument('-u', '--use-cache', action='store_true') -parser.add_argument('-s', '--surah', type=int) +parser.add_argument('-s', '--surah', type=int, default=0) parser.add_argument('-k', '--keep-downloaded-audio', action='store_true') parser.add_argument( '--log', choices=['DEBUG', 'INFO', 'WARNING', 'CRITICAL'], default='INFO', @@ -65,7 +69,7 @@ def download_entry_audio(entry, download_audio_dir, raw_audio_dir, use_cache=Tru # Ensure the proper surah directory structure for the downloaded audio. downloaded_ayah_audio_dir = file_utils.prepare_ayah_directory( - download_audio_dir, surah_num, ayah_num) + download_audio_dir, surah_num, ayah_num) # Download and save the initially downloaded audio recording to the given path. download_recording_from_url(url, downloaded_ayah_audio_dir, use_cache) @@ -84,13 +88,13 @@ def download_entry_audio(entry, download_audio_dir, raw_audio_dir, use_cache=Tru # Prepare all requisite cache directories. 
subcache_directory_names = (DATASET_CSV_CACHE, DOWNLOADED_AUDIO_CACHE, RAW_AUDIO_CACHE) csv_cache_dir, downloaded_audio_dir, raw_audio_dir = file_utils.prepare_cache_directories( - subcache_directory_names, - cache_directory, - use_cache) + subcache_directory_names, + cache_directory, + use_cache) # Create path to dataset csv. path_to_dataset_csv = file_utils.get_path_to_dataset_csv( - csv_cache_dir, args.local_csv_filename) + csv_cache_dir, args.local_csv_filename) # If we have decided not to use the cache, download the dataset CSV. if not use_cache: @@ -99,7 +103,7 @@ def download_entry_audio(entry, download_audio_dir, raw_audio_dir, use_cache=Tru # If csv is not in specified location, then throw an error. if not file_utils.does_cached_csv_dataset_exist(path_to_dataset_csv): logging.info('Dataset CSV not found at {}. Downloading to location...'.format( - path_to_dataset_csv)) + path_to_dataset_csv)) download_csv_dataset(args.csv_url, path_to_dataset_csv) else: logging.info("Using cached copy of dataset csv at {}.".format(path_to_dataset_csv)) @@ -112,9 +116,7 @@ def download_entry_audio(entry, download_audio_dir, raw_audio_dir, use_cache=Tru # Download the audio in the dataset. for entry in tqdm(labeled_entries, desc='Audio Files'): - if surah_to_download and entry[0] == str(surah_to_download): - download_entry_audio(entry, downloaded_audio_dir, raw_audio_dir, use_cache) - else: + if surah_to_download == 0 or entry[0] == str(surah_to_download): download_entry_audio(entry, downloaded_audio_dir, raw_audio_dir, use_cache) # If we don't want to keep the raw audio, remove it from the cache. 
diff --git a/requirements.txt b/requirements.txt index 3c49ac6..2f9ab86 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,6 @@ deepspeech==0.7.1 google-cloud-speech==1.3.2 +tensorflow librosa numpy==1.18.2 pandas==0.25.3 From dcd679e13e9ce2895677576bc62de69478002993 Mon Sep 17 00:00:00 2001 From: Karim-53 <33978275+Karim-53@users.noreply.github.com> Date: Mon, 7 Jun 2021 20:25:04 +0200 Subject: [PATCH 2/7] pin tf to a version Co-Authored-By: Anas Abou Allaban <16828657+piraka9011@users.noreply.github.com> --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 2f9ab86..a2f8385 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ deepspeech==0.7.1 google-cloud-speech==1.3.2 -tensorflow +tensorflow==2.5.0 librosa numpy==1.18.2 pandas==0.25.3 From dd23dffd8599f8d28148f215a9d2ed3e3283e670 Mon Sep 17 00:00:00 2001 From: Karim-53 <33978275+Karim-53@users.noreply.github.com> Date: Thu, 10 Jun 2021 18:39:04 +0200 Subject: [PATCH 3/7] refactor --- requirements.txt | 1 + utils/files.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index a2f8385..5517700 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,7 @@ deepspeech==0.7.1 google-cloud-speech==1.3.2 tensorflow==2.5.0 +dill librosa numpy==1.18.2 pandas==0.25.3 diff --git a/utils/files.py b/utils/files.py index 5db9d2e..907493c 100644 --- a/utils/files.py +++ b/utils/files.py @@ -103,7 +103,7 @@ def clean_cache_directories(cache_directory: str = DEFAULT_CACHE_DIRECTORY) -> N # If the cache directory doesn't exist, then just make an empty one. 
if not os.path.isdir(cache_directory): os.makedirs(cache_directory) - + for subdirectory in os.listdir(cache_directory): logging.info("Removing cache_subdirectory {}.".format(subdirectory)) shutil.rmtree(os.path.join(cache_directory, subdirectory)) From b24704c9eca3abba98d19ebbe5f1573e0c21bee3 Mon Sep 17 00:00:00 2001 From: Karim-53 <33978275+Karim-53@users.noreply.github.com> Date: Thu, 10 Jun 2021 18:39:08 +0200 Subject: [PATCH 4/7] Create .editorconfig --- .editorconfig | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 .editorconfig diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..32a1e27 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,9 @@ +root = true + +[*] +charset = utf-8 +end_of_line = lf +indent_size = 4 +indent_style = space +insert_final_newline = true +trim_trailing_whitespace = true \ No newline at end of file From e134020a92312a456426e7b77bd6f8d88b61ea50 Mon Sep 17 00:00:00 2001 From: Karim-53 <33978275+Karim-53@users.noreply.github.com> Date: Thu, 10 Jun 2021 19:24:39 +0200 Subject: [PATCH 5/7] Safe delete environment.yml --- CONTRIBUTING.md | 60 ++++++++++++++++++++++--------------------------- README.md | 8 +++++-- environment.yml | 19 ---------------- 3 files changed, 33 insertions(+), 54 deletions(-) delete mode 100644 environment.yml diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 884d1f8..eb6157b 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,63 +1,57 @@ # Contributing -Tarteel-ML is an open-source project, which means you can help us make it better! -Check out the Issues tab to see open issues. -You're welcome to start with those issues that are tagged with `Good First Issue`, +Tarteel-ML is an open-source project, which means you can help us make it better! +Check out the Issues tab to see open issues. +You're welcome to start with those issues that are tagged with `Good First Issue`, tackle other issues, or create your own issues. 
## Getting started Thank you for considering contributing to Tarteel-ML! Here are step-by-step instructions. ### Installing Dependencies -Before starting, you will need to install a few dependencies. We use the + +1. Before starting, you will need to install a few dependencies. We use the [Anaconda Python distribution](https://www.anaconda.com/) for dependency management, and our instructions assume you use it to. You can download it at this [link](https://www.anaconda.com/download/). -Once you have installed Anaconda and verified it is being used, download and `cd` into the -Tarteel-ML repository and run the following commands to install all dependencies. -```commandline -conda env create -f environment.yml -``` +2. Once you have installed Anaconda and verified it is being used, clone and `cd` into the +Tarteel-ML repository. -After this, activate the `tarteel` environment. -```commandline -source activate tarteel-ml -``` -You should now be ready to contribute to Tarteel-ML! When you are done, remember to deactivate the -environment. +3. We highly recommend creating a specific env for this repo by running the following commands to install all dependencies. + ```commandline + conda env create --file requirements.txt + ``` + +4. After this, activate the `tarteel-ml` environment. + ```commandline + source activate tarteel-ml + ``` + +You should now be ready to contribute to Tarteel-ML! When you are done, remember to deactivate the environment. ```commandline source deactivate tarteel-ml ``` ### Adding New Dependencies -Use the `conda install` command to add any new dependencies and ensure that the environment +Use the `pip install ` command to add any new dependencies and ensure that the environment resolves. Pull requests with new dependencies that break the existing environment for others will be rejected. -After adding your new dependencies in Anaconda, add it (with the version number) in `environment.yml` -under `dependencies`. 
- -#### What if the dependency I want to add isn't in Anaconda? -For any dependencies not present in Anaconda, there is a way to install with `pip`, the default -Python library manager. Use the command -``` -which pip -``` -ensure that your pip binary is the one installed by Anaconda (the output of the command should be -similar to `/anaconda3/envs/tarteel/bin/pip`). You should then `pip install` the library and add it -to the file `environment.yml` under `pip:`. - +After adding your new dependencies in Anaconda, add them (with their version numbers) to `requirements.txt`. ### Conventions #### Pull Requests -Whenever submitting a new PR, create a new branch named using the convention `/`. -Make sure to include descriptive and clear commit messages, while also referencing any issues your -PR addresses. Your pull request will be reviewed by the maintainers of this repository, and upon -approval, will be merged into the master branch. +- Whenever submitting a new PR, create a new branch named using the convention `/`. + +- Make sure to include descriptive and clear commit messages, while also referencing any issues your +PR addresses. + +- Your pull request will be reviewed by the maintainers of this repository, and upon +approval, will be merged into the master branch. #### Documentation Tarteel-ML requires that your code be well-commented and that you explain clearly what your changes diff --git a/README.md b/README.md index e4229a9..b829b20 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,8 @@ -# Tarteel Machine Learning + +[![Platform: Linux | macOS](https://img.shields.io/badge/Platform-Linux|MacOS-lightgrey)]() +[![Python Version](https://img.shields.io/badge/python-v3.6-blue)]() + +# Tarteel Machine Learning This repo is designed to house code related to Tarteel machine learning related tasks. 
:microscope: @@ -17,7 +21,7 @@ If you found this repo helpful, please keep it's contributors in your duaa :rais ## Getting Started :beginner: -### Prerequisites +### Prerequisites We use Python 3.7 for our development. However, any Python above 3.6 should work. diff --git a/environment.yml b/environment.yml deleted file mode 100644 index 35cbc47..0000000 --- a/environment.yml +++ /dev/null @@ -1,19 +0,0 @@ -name: tarteel-ml -channels: - - conda-forge - - defaults -dependencies: - - dill=0.2.8.2 - - ffmpeg=4.0 - - jupyter=1.0.0 - - matplotlib=3.0.2 - - numpy=1.15.4 - - pandas=0.24.1 - - pip=19.1 - - python=3.6.8 - - requests=2.21.0 - - scikit-learn=0.20.2 - - sox=14.4.2 - - tensorflow=1.12.0 - - pip: - - pyAudioAnalysis==0.2.5 From dbe7e30b66dba25233552934021caa7718284d21 Mon Sep 17 00:00:00 2001 From: Karim-53 <33978275+Karim-53@users.noreply.github.com> Date: Thu, 10 Jun 2021 19:25:27 +0200 Subject: [PATCH 6/7] update and sort requirement.txt --- requirements.txt | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/requirements.txt b/requirements.txt index db8a064..7ab1047 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,15 +1,18 @@ +PyYAML==5.4 deepspeech==0.7.1 +dill~=0.2.8.2 +ffmpeg==4.0 google-cloud-speech==1.3.2 -tensorflow==2.5.0 -dill librosa +matplotlib numpy==1.18.2 pandas==0.25.3 +pyAudioAnalysis==0.2.5 pydub==0.23.1 python-Levenshtein==0.12.0 -PyYAML==5.4 requests scikit-learn soundfile +sox==14.4.2 +tensorflow==2.5.0 tqdm==4.43.0 - From fbaa13d545baa79e9c13bef0434b8e2a69141d62 Mon Sep 17 00:00:00 2001 From: Karim-53 <33978275+Karim-53@users.noreply.github.com> Date: Thu, 10 Jun 2021 19:44:36 +0200 Subject: [PATCH 7/7] resolve env on google collab --- requirements.txt | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/requirements.txt b/requirements.txt index 7ab1047..6db787f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,18 +1,18 @@ PyYAML==5.4 deepspeech==0.7.1 -dill~=0.2.8.2 
-ffmpeg==4.0 +dill +ffmpeg google-cloud-speech==1.3.2 librosa matplotlib -numpy==1.18.2 -pandas==0.25.3 +numpy +pandas pyAudioAnalysis==0.2.5 pydub==0.23.1 -python-Levenshtein==0.12.0 +python-Levenshtein~=0.12.0 requests scikit-learn soundfile -sox==14.4.2 +sox tensorflow==2.5.0 tqdm==4.43.0