Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

changes to allow using multiprocessing #36

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 10 additions & 16 deletions csv_schema_inference/csv_schema_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,10 @@
import multiprocessing as mp
import datetime as dt
import operator

# backend can be:
# - multiprocessing
# - threading
backend = "multiprocessing"


class DetectType:
Expand Down Expand Up @@ -135,32 +138,23 @@ def execute(self, records, x, obj, d_schema):
return d_schema


def parallel(self, records, obj, d_schema):



def parallel(self, records, obj, d_schema):
cpus = (mp.cpu_count() - 2)

if cpus <= 0:
cpus = mp.cpu_count()

chunk_size = len(records) / cpus

if chunk_size < 1:
cpus = int(chunk_size * 10)
chunk_size = 1
else:
chunk_size = round(chunk_size)


from joblib import Parallel, delayed

pool = mp.Pool(processes=cpus)

results = [pool.apply_async(self.execute, args=(records[x:x+chunk_size], x, obj, d_schema)) for x in range(0, len(records), chunk_size)]
pool.close()
pool.join()

return [p.get() for p in results]
# num_workers can be set based on a variable or command line argument
num_workers = cpus
results = Parallel(n_jobs=num_workers, backend=backend)(delayed(self.execute)(records[x:x+chunk_size], x, obj, d_schema) for x in range(0, len(records), chunk_size))
return results


class CsvSchemaInference:
Expand Down
4 changes: 3 additions & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,6 @@ classifiers =
[options]
packages = find:
python_requires = >=3.7
include_package_data = False
include_package_data = False
install_requires =
joblib
3 changes: 3 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from setuptools import setup
if __name__ == '__main__':
setup()