Parallelization flag for management commands #16

Open
avelis opened this issue Feb 13, 2017 · 6 comments

avelis (Contributor) commented Feb 13, 2017

Not sure if this feature keeps the library as simple as its name suggests. However, for large indexes I think it could be helpful in speeding up the management commands.

avelis (Contributor, Author) commented Feb 13, 2017

Looking at the Elasticsearch Python client, there appears to be a parallel_bulk helper. So, minimally, this feature wouldn't even have to build its own thread pool; it would merely call a different method and accept a thread count to pass along.
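
For reference, here's a minimal, untested sketch of how that helper is called (MyModel, the index/type names, and the document fields are placeholders):

from elasticsearch import Elasticsearch
from elasticsearch.helpers import parallel_bulk

from myapp.models import MyModel  # placeholder model

es = Elasticsearch()

def generate_actions():
    # yield one pre-built bulk action per object
    for obj in MyModel.objects.iterator():
        yield {
            '_op_type': 'index',
            '_index': 'my-index',
            '_type': 'my-type',
            '_id': obj.pk,
            '_source': {'name': obj.name},
        }

# parallel_bulk fans the actions out over a thread pool and yields
# (ok, info) tuples; the generator must be consumed for anything to happen
for ok, info in parallel_bulk(es, generate_actions(), thread_count=4, chunk_size=500):
    if not ok:
        print('Failed:', info)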

jaddison (Owner) commented

I've often thought about attempting to index in parallel, but time hasn't allowed for it yet. I'd certainly consider contributions via a PR for inclusion.

I honestly had no idea parallel_bulk existed. Have you tried it?

avelis (Contributor, Author) commented Feb 15, 2017

@jaddison I have been trying it out recently but ran into other issues, mainly the GIL and the cost of pickling the queryset turning this into a CPU-bound problem. parallel_bulk does index either way, but effectively as a single process. Implementing it would simply delegate the threading responsibility to the ES Python client. YMMV.

avelis (Contributor, Author) commented Feb 15, 2017

@jaddison Here is a rough implementation which extends your ElasticsearchTypeMixin class. I called it ParallelElasticsearchTypeMixin.

Key things to note are the introduction of get_thread_count and bulk_expansion_callback; both are passed into the parallel_bulk helper. The expansion callback does the work the bulk_index for loop was doing. I couldn't figure out how to skip falsy docs, though. Give it a whirl and tell me if it's even worth a PR.

from elasticsearch.helpers import parallel_bulk
from simple_elasticsearch.mixins import ElasticsearchTypeMixin
from simple_elasticsearch.utils import queryset_iterator


class ParallelElasticsearchTypeMixin(ElasticsearchTypeMixin):

    @classmethod
    def get_thread_count(cls):
        return 4

    @classmethod
    def parallel_bulk_index(cls, es=None, queryset=None):
        es = es or cls.get_es()

        if queryset is None:
            queryset = cls.get_queryset()

        bulk_limit = cls.get_bulk_index_limit()
        thread_limit = cls.get_thread_count()
        query_limit = cls.get_query_limit()

        # this requires that `get_queryset` is implemented
        actions_iterator = queryset_iterator(queryset, query_limit)

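        # `parallel_bulk` returns a lazy generator; iterating over it below is
        # what actually drives the threaded bulk requests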
        for success, info in parallel_bulk(client=es, actions=actions_iterator,
                                           chunk_size=bulk_limit, thread_count=thread_limit,
                                           expand_action_callback=cls.bulk_expansion_callback):
            if not success:
                print('Doc failed', info)

    @classmethod
    def bulk_expansion_callback(cls, obj):
        should_delete = not cls.should_index(obj)

        doc = {}
        if not should_delete:
            # allow for the case where a document cannot be indexed;
            # the implementation of `get_document()` should return a
            # falsy value in that case
            doc = cls.get_document(obj) or {}

        data = {
            '_index': cls.get_index_name(),
            '_type': cls.get_type_name(),
            '_id': cls.get_document_id(obj)
        }
        data.update(cls.get_request_params(obj))

        # bulk operation instructions/details
        action = {'delete' if should_delete else 'index': data}

        # the expansion callback must return a tuple: the action alone for
        # deletes, or the action plus the document body for indexing
        if should_delete:
            return (action,)
        return action, doc

jaddison (Owner) commented

Going on gut instinct, and without having tested anything, I can't really see the point of parallel_bulk for the GIL reason you mentioned above. Plus, there's quite a bit of extra overhead in processing Django QuerySets and then passing them into parallel_bulk.

Maybe a better approach for this library would be to use actual process-based multiprocessing to split the entire workload, including data querying, document generation, and sending to ES:

  • create a process-friendly queue holding the chunk ranges [i.e. (1, 500), (501, 1000)] or some similar arrangement
  • create n processes in a pool
  • in a loop, each process:
    • pops an item off the queue
    • retrieves the queryset
    • generates documents
    • pushes to ES
  • the process ends when the queue is depleted

Thoughts?
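
Something like this rough, untested sketch (MyType is a hypothetical ElasticsearchTypeMixin subclass, and the bulk_index(queryset=...) signature is assumed from the snippet above):

import multiprocessing

from django import db

from myapp.models import MyType  # hypothetical ElasticsearchTypeMixin subclass


def index_chunk(bounds):
    start, end = bounds
    # each worker re-queries its slice, generates documents and pushes to ES
    # by reusing the existing bulk path
    MyType.bulk_index(queryset=MyType.get_queryset()[start:end])


def parallel_reindex(chunk_size=500, processes=4):
    total = MyType.get_queryset().count()
    # chunk ranges, e.g. (0, 500), (500, 1000), ...
    chunks = [(o, o + chunk_size) for o in range(0, total, chunk_size)]

    # close DB connections before forking so each worker opens its own
    db.connections.close_all()

    # the pool's internal queue hands chunks to workers until it is depleted
    pool = multiprocessing.Pool(processes=processes)
    try:
        pool.map(index_chunk, chunks)
    finally:
        pool.close()
        pool.join()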

avelis (Contributor, Author) commented Feb 17, 2017

@jaddison That absolutely works for me, and the steps you outlined make sense. For personal use, I ended up using Celery and its chunks workflow to divide the QuerySet into sub-tasks. Each task would asynchronously retrieve its slice of the QuerySet, generate docs, and push to ES.

However, chunks is part of Celery, so that approach would couple the library to a specific queuing technology. I'm not sure whether that's an acceptable complexity to introduce, or whether there's a simpler way to achieve the same result.
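
Roughly, the idea looks like this (an illustrative sketch, not the actual code I used; MyType and the bulk_index(queryset=...) signature are assumed as above):

from celery import shared_task

from myapp.models import MyType  # hypothetical ElasticsearchTypeMixin subclass


@shared_task
def index_range(start, end):
    # each sub-task re-queries its own slice, generates docs and pushes to ES
    # via the existing bulk path
    MyType.bulk_index(queryset=MyType.get_queryset()[start:end])


def queue_reindex(batch_size=500, ranges_per_message=10):
    total = MyType.get_queryset().count()
    ranges = [(o, o + batch_size) for o in range(0, total, batch_size)]

    # chunks() packs `ranges_per_message` (start, end) calls into each task message
    index_range.chunks(ranges, ranges_per_message).group().apply_async()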
