From a930ee0f1f116bac77cf56d1fb0923989613df6d Mon Sep 17 00:00:00 2001
From: Staci Mullins <63313398+stacimc@users.noreply.github.com>
Date: Wed, 28 Feb 2024 13:47:32 -0800
Subject: [PATCH] Add proportional staging index dag (#3763)

* Add API connection
* Pull out common elasticsearch functions
* Remove duplicated tasks
* Add new dag
* Tests
* Add tasks for point alias
* Update dag docs
* Fix tests
* Fix es query for reindexing
* Get stats for source index rather than using stats endpoint
* Raise error when incorrect number of documents are reindexed
* Prevent scroll from being disabled and causing error with auto-slicing
* Turn off slicing for reindex with max_docs instead
* Do not perform multiple source reindexes simultaneously
* Update docs to correctly describe source index as a staging index

  - Since we're reindexing from a staging index, it's also okay to run
    the reindexing tasks in parallel
  - Update broken import

* Lint
* Log reindexed count
* Clarify language to 'staging', simplify math
* Add clarifying comment about aggregations query
* Make minimum of 0 exclusive
* Only refresh index once, clean up es task params
* Clarify arguments to get_staging_source_counts
---
 catalog/dags/common/elasticsearch.py          | 225 ++++++++++++++++++
 .../create_new_es_index.py                    | 124 +---------
 .../create_new_es_index_dag.py                |  31 ++-
 ...te_proportional_by_source_staging_index.py | 106 +++++++++
 ...roportional_by_source_staging_index_dag.py | 191 +++++++++++++++
 .../elasticsearch_cluster/healthcheck_dag.py  |   2 +-
 .../recreate_full_staging_index_dag.py        |  10 +-
 catalog/dags/elasticsearch_cluster/shared.py  |  11 -
 ...roportional_by_source_staging_index_dag.py |  95 ++++++++
 documentation/catalog/reference/DAGs.md       |  72 ++++--
 10 files changed, 704 insertions(+), 163 deletions(-)
 create mode 100644 catalog/dags/common/elasticsearch.py
 create mode 100644 catalog/dags/elasticsearch_cluster/create_proportional_by_source_staging_index/create_proportional_by_source_staging_index.py
 create mode 100644 catalog/dags/elasticsearch_cluster/create_proportional_by_source_staging_index/create_proportional_by_source_staging_index_dag.py
 delete mode 100644 catalog/dags/elasticsearch_cluster/shared.py
 create mode 100644 catalog/tests/dags/elasticsearch_cluster/create_new_es_index/test_create_proportional_by_source_staging_index_dag.py

diff --git a/catalog/dags/common/elasticsearch.py b/catalog/dags/common/elasticsearch.py
new file mode 100644
index 00000000000..92acc57b6fc
--- /dev/null
+++ b/catalog/dags/common/elasticsearch.py
@@ -0,0 +1,225 @@
+import logging
+from datetime import timedelta
+from typing import Literal, Union
+
+from airflow.decorators import task, task_group
+from airflow.models.connection import Connection
+from airflow.providers.elasticsearch.hooks.elasticsearch import ElasticsearchPythonHook
+from airflow.sensors.base import PokeReturnValue
+from airflow.utils.trigger_rule import TriggerRule
+
+from common.constants import REFRESH_POKE_INTERVAL
+
+
+logger = logging.getLogger(__name__)
+
+
+# Index settings that should not be copied over from the base configuration when
+# creating a new index.
+EXCLUDED_INDEX_SETTINGS = {"provided_name", "creation_date", "uuid", "version"}
+
+
+@task
+def get_es_host(environment: str):
+    es_conn = Connection.get_connection_from_secrets(
+        f"elasticsearch_http_{environment}"
+    )
+    return es_conn.get_uri()
+
+
+@task
+def get_index_configuration(
+    source_index: str,
+    es_host: str,
+):
+    """
+    Return the configuration for the index identified by the
+    `source_index` param.
+    `source_index` may be either an index name or an alias, but it must
+    uniquely identify one existing index, or an error will be raised.
+    """
+    es_conn = ElasticsearchPythonHook(hosts=[es_host]).get_conn
+
+    response = es_conn.indices.get(
+        index=source_index,
+        # Return empty dict instead of throwing error if no index can be
+        # found. We raise our own error instead.
+        ignore_unavailable=True,
+    )
+
+    if len(response) != 1:
+        raise ValueError(f"Index {source_index} could not be uniquely identified.")
+
+    # The response has the form:
+    #   { index_name: index_configuration }
+    # However, since `source_index` can be an alias rather than the index name,
+    # we do not necessarily know the index_name so we cannot access the
+    # configuration directly by key. We instead get the first value from the
+    # dict, knowing that we have already ensured in a previous check that there
+    # is exactly one value in the response.
+    config = next(iter(response.values()))
+    return config
+
+
+def remove_excluded_index_settings(index_config):
+    """
+    Remove fields from the given index configuration that should not be
+    included when using it to create a new index.
+    """
+    # Remove fields from the current_index_config that should not be copied
+    # over into the new index (such as uuid)
+    for setting in EXCLUDED_INDEX_SETTINGS:
+        index_config.get("settings", {}).get("index", {}).pop(setting)
+
+    # Aliases should also not be applied automatically
+    index_config.pop("aliases")
+
+    return index_config
+
+
+@task
+def create_index(index_config, es_host: str):
+    es_conn = ElasticsearchPythonHook(hosts=[es_host]).get_conn
+
+    new_index = es_conn.indices.create(**index_config)
+
+    return new_index
+
+
+@task_group(group_id="trigger_and_wait_for_reindex")
+def trigger_and_wait_for_reindex(
+    es_host: str,
+    destination_index: str,
+    source_index: str,
+    timeout: timedelta,
+    requests_per_second: int,
+    query: dict | None = None,
+    max_docs: int | None = None,
+    refresh: bool = True,
+    slices: Union[int, Literal["auto"]] = "auto",
+):
+    @task
+    def trigger_reindex(
+        es_host: str,
+        destination_index: str,
+        source_index: str,
+        query: dict,
+        requests_per_second: int,
+        max_docs: int | None,
+        slices: Union[int, Literal["auto"]],
+    ):
+        es_conn = ElasticsearchPythonHook(hosts=[es_host]).get_conn
+
+        source = {"index": source_index}
+        # An empty query is not accepted; only pass it
+        # if a query was actually supplied
+        if query:
+            source["query"] = query
+
+        response = es_conn.reindex(
+            source=source,
+            dest={"index": destination_index},
+            max_docs=max_docs,
+            # Parallelize indexing when not None
+            slices=slices,
+            # Do not hold the slot while awaiting completion
+            wait_for_completion=False,
+            # Whether to immediately refresh the index after completion to make
+            # the data available for search
+            refresh=refresh,
+            # Throttle
+            requests_per_second=requests_per_second,
+        )
+        return response["task"]
+
+    @task.sensor(
+        poke_interval=REFRESH_POKE_INTERVAL, timeout=timeout, mode="reschedule"
+    )
+    def wait_for_reindex(
+        es_host: str, task_id: str, expected_docs: int | None = None
+    ) -> PokeReturnValue:
+        es_conn = ElasticsearchPythonHook(hosts=[es_host]).get_conn
+
+        response = es_conn.tasks.get(task_id=task_id)
+
+        count = response.get("task", {}).get("status", {}).get("total")
+        if expected_docs and count != expected_docs:
+            logger.info(
+                f"Reindexed {count} documents, but {expected_docs} were expected."
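+                # Note: a count mismatch is only logged; the sensor still
+                # completes once Elasticsearch reports the reindex task done.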
+            )
+        else:
+            logger.info(f"Reindexed {count} documents.")
+
+        return PokeReturnValue(is_done=response.get("completed") is True)
+
+    trigger_reindex_task = trigger_reindex(
+        es_host,
+        destination_index,
+        source_index,
+        query,
+        requests_per_second,
+        max_docs,
+        slices,
+    )
+
+    wait_for_reindex_task = wait_for_reindex(
+        task_id=trigger_reindex_task, expected_docs=max_docs, es_host=es_host
+    )
+
+    trigger_reindex_task >> wait_for_reindex_task
+
+
+@task
+def refresh_index(es_host: str, index_name: str):
+    es_conn = ElasticsearchPythonHook(hosts=[es_host]).get_conn
+    return es_conn.indices.refresh(index=index_name)
+
+
+@task_group(group_id="point_alias")
+def point_alias(index_name: str, alias: str, es_host: str):
+    """
+    Point the target alias to the given index. If the alias is already being
+    used by one or more indices, it will first be removed from all of them.
+    """
+
+    @task.branch
+    def check_if_alias_exists(alias: str, es_host: str):
+        """Check if the alias already exists."""
+        es_conn = ElasticsearchPythonHook(hosts=[es_host]).get_conn
+        return (
+            "point_alias.remove_existing_alias"
+            if es_conn.indices.exists_alias(name=alias)
+            else "point_alias.point_new_alias"
+        )
+
+    @task
+    def remove_existing_alias(alias: str, es_host: str):
+        """Remove the given alias from any indices to which it points."""
+        es_conn = ElasticsearchPythonHook(hosts=[es_host]).get_conn
+        response = es_conn.indices.delete_alias(
+            name=alias,
+            # Remove the alias from _all_ indices to which it currently
+            # applies
+            index="_all",
+        )
+        return response.get("acknowledged")
+
+    @task
+    def point_new_alias(
+        es_host: str,
+        index_name: str,
+        alias: str,
+    ):
+        es_conn = ElasticsearchPythonHook(hosts=[es_host]).get_conn
+        response = es_conn.indices.put_alias(index=index_name, name=alias)
+        return response.get("acknowledged")
+
+    exists_alias = check_if_alias_exists(alias, es_host)
+    remove_alias = remove_existing_alias(alias, es_host)
+
+    point_alias = point_new_alias.override(
+        # The remove_alias task may be skipped.
+        trigger_rule=TriggerRule.NONE_FAILED,
+    )(es_host, index_name, alias)
+
+    exists_alias >> [remove_alias, point_alias]
+    remove_alias >> point_alias
diff --git a/catalog/dags/elasticsearch_cluster/create_new_es_index/create_new_es_index.py b/catalog/dags/elasticsearch_cluster/create_new_es_index/create_new_es_index.py
index 1cb2b970eda..a5e13a7da47 100644
--- a/catalog/dags/elasticsearch_cluster/create_new_es_index/create_new_es_index.py
+++ b/catalog/dags/elasticsearch_cluster/create_new_es_index/create_new_es_index.py
@@ -1,21 +1,14 @@
 import logging
-from datetime import timedelta
 
-from airflow.decorators import task, task_group
-from airflow.providers.elasticsearch.hooks.elasticsearch import ElasticsearchPythonHook
-from airflow.sensors.python import PythonSensor
+from airflow.decorators import task
 
-from common.constants import REFRESH_POKE_INTERVAL
+from common.elasticsearch import remove_excluded_index_settings
 from elasticsearch_cluster.create_new_es_index.utils import merge_configurations
 
 
 logger = logging.getLogger(__name__)
 
 
-# Index settings that should not be copied over from the base configuration when
-# creating a new index.
-EXCLUDED_INDEX_SETTINGS = ["provided_name", "creation_date", "uuid", "version"]
-
 GET_FINAL_INDEX_CONFIG_TASK_NAME = "get_final_index_configuration"
 GET_CURRENT_INDEX_CONFIG_TASK_NAME = "get_current_index_configuration"
 
@@ -35,40 +28,6 @@ def check_override_config(override):
     return GET_CURRENT_INDEX_CONFIG_TASK_NAME
 
 
-@task
-def get_current_index_configuration(
-    source_index: str,
-    es_host: str,
-):
-    """
-    Return the configuration for the current index, identified by the
-    `source_index` param. `source_index` may be either an index name
-    or an alias, but must uniquely identify one existing index or an
-    error will be raised.
-    """
-    es_conn = ElasticsearchPythonHook(hosts=[es_host]).get_conn
-
-    response = es_conn.indices.get(
-        index=source_index,
-        # Return empty dict instead of throwing error if no index can be
-        # found. We raise our own error instead.
-        ignore_unavailable=True,
-    )
-
-    if len(response) != 1:
-        raise ValueError(f"Index {source_index} could not be uniquely identified.")
-
-    # The response has the form:
-    #   { index_name: index_configuration }
-    # However, since `source_index` can be an alias rather than the index name,
-    # we do not necessarily know the index_name so we cannot access the configuration
-    # directly by key. We instead get the first value from the dict, knowing that we
-    # have already ensured in a previous check that there is exactly one value in the
-    # response.
-    config = next(iter(response.values()))
-    return config
-
-
 @task
 def merge_index_configurations(new_index_config, current_index_config):
     """
@@ -76,13 +35,7 @@ def merge_index_configurations(new_index_config, current_index_config):
     return an index configuration in the appropriate format for being passed
     to the `create_index` API.
     """
-    # Do not automatically apply any aliases to the new index
-    current_index_config.pop("aliases")
-
-    # Remove fields from the current_index_config that should not be copied
-    # over into the new index (such as uuid)
-    for setting in EXCLUDED_INDEX_SETTINGS:
-        current_index_config.get("settings", {}).get("index", {}).pop(setting)
+    remove_excluded_index_settings(current_index_config)
 
     # Merge the new configuration values into the current configuration
     return merge_configurations(current_index_config, new_index_config)
@@ -115,74 +68,3 @@ def get_final_index_configuration(
     # Apply the desired index name
     config["index"] = index_name
     return config
-
-
-@task
-def create_index(index_config, es_host: str):
-    es_conn = ElasticsearchPythonHook(hosts=[es_host]).get_conn
-
-    new_index = es_conn.indices.create(**index_config)
-
-    return new_index
-
-
-@task_group(group_id="trigger_and_wait_for_reindex")
-def trigger_and_wait_for_reindex(
-    index_name: str,
-    source_index: str,
-    query: dict,
-    timeout: timedelta,
-    requests_per_second: int,
-    es_host: str,
-):
-    @task
-    def trigger_reindex(
-        index_name: str,
-        source_index: str,
-        query: dict,
-        requests_per_second: int,
-        es_host: str,
-    ):
-        es_conn = ElasticsearchPythonHook(hosts=[es_host]).get_conn
-
-        source = {"index": source_index}
-        # An empty query is not accepted; only pass it
-        # if a query was actually supplied
-        if query:
-            source["query"] = query
-
-        response = es_conn.reindex(
-            source=source,
-            dest={"index": index_name},
-            # Parallelize indexing
-            slices="auto",
-            # Do not hold the slot while awaiting completion
-            wait_for_completion=False,
-            # Immediately refresh the index after completion to make
-            # the data available for search
-            refresh=True,
-            # Throttle
-            requests_per_second=requests_per_second,
-        )
-
-        return response["task"]
-
-    def _wait_for_reindex(task_id: str, es_host: str):
-        es_conn = ElasticsearchPythonHook(hosts=[es_host]).get_conn
-
-        response = es_conn.tasks.get(task_id=task_id)
-        return response.get("completed")
-
-    trigger_reindex_task = trigger_reindex(
-        index_name, source_index, query, requests_per_second, es_host
-    )
-
-    wait_for_reindex = PythonSensor(
-        task_id="wait_for_reindex",
-        python_callable=_wait_for_reindex,
-        timeout=timeout,
-        poke_interval=REFRESH_POKE_INTERVAL,
-        op_kwargs={"task_id": trigger_reindex_task, "es_host": es_host},
-    )
-
-    trigger_reindex_task >> wait_for_reindex
diff --git a/catalog/dags/elasticsearch_cluster/create_new_es_index/create_new_es_index_dag.py b/catalog/dags/elasticsearch_cluster/create_new_es_index/create_new_es_index_dag.py
index 5e2e517a042..3062281ee49 100644
--- a/catalog/dags/elasticsearch_cluster/create_new_es_index/create_new_es_index_dag.py
+++ b/catalog/dags/elasticsearch_cluster/create_new_es_index/create_new_es_index_dag.py
@@ -102,14 +102,21 @@
 from airflow.models.param import Param
 from airflow.utils.trigger_rule import TriggerRule
 
+from common import elasticsearch as es
 from common.constants import AUDIO, DAG_DEFAULT_ARGS, MEDIA_TYPES
 from common.sensors.utils import prevent_concurrency_with_dags
-from elasticsearch_cluster.create_new_es_index import create_new_es_index as es
+from elasticsearch_cluster.create_new_es_index.create_new_es_index import (
+    GET_CURRENT_INDEX_CONFIG_TASK_NAME,
+    GET_FINAL_INDEX_CONFIG_TASK_NAME,
+    check_override_config,
+    get_final_index_configuration,
+    get_index_name,
+    merge_index_configurations,
+)
 from elasticsearch_cluster.create_new_es_index.create_new_es_index_types import (
     CREATE_NEW_INDEX_CONFIGS,
     CreateNewIndex,
 )
-from elasticsearch_cluster.shared import get_es_host
 
 
 logger = logging.getLogger(__name__)
@@ -189,31 +196,29 @@ def create_new_es_index_dag(config: CreateNewIndex):
     with dag:
         prevent_concurrency = prevent_concurrency_with_dags(config.blocking_dags)
 
-        es_host = get_es_host(environment=config.environment)
+        es_host = es.get_es_host(environment=config.environment)
 
-        index_name = es.get_index_name(
+        index_name = get_index_name(
             media_type="{{ params.media_type }}",
             index_suffix="{{ params.index_suffix or ts_nodash }}",
         )
 
-        check_override = es.check_override_config(
-            override="{{ params.override_config }}"
-        )
+        check_override = check_override_config(override="{{ params.override_config }}")
 
-        current_index_config = es.get_current_index_configuration.override(
-            task_id=es.GET_CURRENT_INDEX_CONFIG_TASK_NAME
+        current_index_config = es.get_index_configuration.override(
+            task_id=GET_CURRENT_INDEX_CONFIG_TASK_NAME
         )(
             source_index="{{ params.source_index or params.media_type }}",
            es_host=es_host,
         )
 
-        merged_index_config = es.merge_index_configurations(
+        merged_index_config = merge_index_configurations(
             new_index_config="{{ params.index_config }}",
             current_index_config=current_index_config,
         )
 
-        final_index_config = es.get_final_index_configuration.override(
-            task_id=es.GET_FINAL_INDEX_CONFIG_TASK_NAME,
+        final_index_config = get_final_index_configuration.override(
+            task_id=GET_FINAL_INDEX_CONFIG_TASK_NAME,
             trigger_rule=TriggerRule.NONE_FAILED,
         )(
             override_config="{{ params.override_config }}",
@@ -228,7 +233,7 @@ def create_new_es_index_dag(config: CreateNewIndex):
         )
 
         reindex = es.trigger_and_wait_for_reindex(
-            index_name=index_name,
+            destination_index=index_name,
             source_index="{{ params.source_index or params.media_type }}",
             query="{{ params.query }}",
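+            # query may be empty; trigger_and_wait_for_reindex only passes it
+            # to the Elasticsearch reindex API when one is actually supplied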
             timeout=config.reindex_timeout,
diff --git a/catalog/dags/elasticsearch_cluster/create_proportional_by_source_staging_index/create_proportional_by_source_staging_index.py b/catalog/dags/elasticsearch_cluster/create_proportional_by_source_staging_index/create_proportional_by_source_staging_index.py
new file mode 100644
index 00000000000..14e840c27ac
--- /dev/null
+++ b/catalog/dags/elasticsearch_cluster/create_proportional_by_source_staging_index/create_proportional_by_source_staging_index.py
@@ -0,0 +1,106 @@
+import logging
+
+from airflow.decorators import task
+from airflow.providers.elasticsearch.hooks.elasticsearch import ElasticsearchPythonHook
+
+from common.elasticsearch import remove_excluded_index_settings
+
+
+logger = logging.getLogger(__name__)
+
+
+@task
+def get_source_index(source_index: str, media_type: str):
+    """
+    Get the desired source index. If a source_index was passed in
+    params, we use that; else we default to the filtered media index.
+    """
+    return source_index or f"{media_type}-filtered"
+
+
+@task
+def get_destination_index_name(
+    media_type: str, current_datetime_str: str, percentage_of_prod: float
+):
+    """Get the desired name for the destination index."""
+    percentage_str = round(percentage_of_prod * 100)
+
+    return (
+        f"{media_type}-{percentage_str}-percent-proportional"
+        f"-{current_datetime_str.lower()}"
+    )
+
+
+@task
+def get_destination_alias(media_type: str):
+    return f"{media_type}-subset-by-source"
+
+
+@task
+def get_destination_index_config(source_config: dict, destination_index_name: str):
+    """
+    Build the index configuration for the destination index, based on the
+    source index configuration.
+    """
+    destination_config = remove_excluded_index_settings(source_config)
+
+    # Apply the desired index name
+    destination_config["index"] = destination_index_name
+    return destination_config
+
+
+@task
+def get_staging_source_counts(source_index: str, es_host: str):
+    """
+    Get the count of records per source for the given media type in the
+    staging source index.
+    """
+    es_conn = ElasticsearchPythonHook(hosts=[es_host]).get_conn
+
+    # `size` is the max number of buckets to return for the aggregation
+    # query, and should be set to a number that exceeds the known
+    # number of sources. Read more at:
+    # https://github.com/elastic/elasticsearch/issues/18838
+    size = 100
+
+    response = es_conn.search(
+        index=source_index,
+        # Return 0 records from the search query; we only care about the
+        # aggregations
+        size=0,
+        aggregations={
+            "unique_sources": {
+                "terms": {"field": "source", "size": size, "order": {"_key": "desc"}}
+            }
+        },
+    )
+
+    sources = (
+        response.get("aggregations", {}).get("unique_sources", {}).get("buckets", [])
+    )
+    return {source["key"]: source["doc_count"] for source in sources}
+
+
+@task
+def get_proportional_source_count_kwargs(
+    staging_source_counts: dict[str, int], percentage_of_prod: float
+):
+    """
+    Return a list of kwargs for each mapped task to reindex the
+    documents for each source individually.
+
+    For each task we will have:
+    * `max_docs`: The count of records for this source needed in the new
+                  index in order for the source to make up the same
+                  proportion of the new index as it does in the
+                  source index.
+    * `query`:    An elasticsearch query that will be used to restrict
+                  the reindexing task to records from this source.
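+
+    For example, with `percentage_of_prod=0.25`, a source with 10,000
+    records in the staging source index gets `max_docs=2_500`.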
+ """ + return [ + { + "max_docs": round(staging_total * percentage_of_prod), + "query": {"bool": {"filter": [{"term": {"source": source}}]}}, + } + for source, staging_total in staging_source_counts.items() + ] diff --git a/catalog/dags/elasticsearch_cluster/create_proportional_by_source_staging_index/create_proportional_by_source_staging_index_dag.py b/catalog/dags/elasticsearch_cluster/create_proportional_by_source_staging_index/create_proportional_by_source_staging_index_dag.py new file mode 100644 index 00000000000..28ed4a02a00 --- /dev/null +++ b/catalog/dags/elasticsearch_cluster/create_proportional_by_source_staging_index/create_proportional_by_source_staging_index_dag.py @@ -0,0 +1,191 @@ +""" +# Create Proportional By Source Staging Index DAG + +This DAG is used to create a new staging Elasticsearch index that is a subset +of a staging source index, such that the proportions of records by source in +the new index is equal to the proportions of records by source in the source +index. + + +Required Dagrun Configuration parameters: + +* media_type: The media type for which to create a new index. +* percentage_of_prod: A float indicating the proportion of items to take from each + source from the total amount existing in the staging + source index + +Optional params: + +* source_index: An existing staging Elasticsearch index to use as the basis for + the new index. If not provided, the index aliased to + `-filtered` will be used. + +## When this DAG runs + +This DAG is on a `None` schedule and is run manually. + +## Race conditions + +Because this DAG runs on the staging ingestion server and staging elasticsearch +cluster, it does _not_ interfere with the `data_refresh` or +`create_filtered_index` DAGs. + +However, as the DAG operates on the staging API database it will exit +immediately if any of the following DAGs are running: +* `staging_database_restore` +* `recreate_full_staging_index` +* `create_new_staging_es_index` +""" +from datetime import datetime, timedelta + +from airflow.decorators import dag +from airflow.models.param import Param + +from common import elasticsearch as es +from common import slack +from common.constants import ( + AUDIO, + DAG_DEFAULT_ARGS, + MEDIA_TYPES, + STAGING, +) +from common.sensors.utils import prevent_concurrency_with_dags +from database.staging_database_restore.constants import ( + DAG_ID as STAGING_DB_RESTORE_DAG_ID, +) +from elasticsearch_cluster.create_new_es_index.create_new_es_index_types import ( + CREATE_NEW_INDEX_CONFIGS, +) +from elasticsearch_cluster.create_proportional_by_source_staging_index import ( + create_proportional_by_source_staging_index as create_index, +) +from elasticsearch_cluster.recreate_staging_index.recreate_full_staging_index import ( + DAG_ID as RECREATE_STAGING_INDEX_DAG_ID, +) + + +DAG_ID = "create_proportional_by_source_staging_index" + + +@dag( + dag_id=DAG_ID, + default_args=DAG_DEFAULT_ARGS, + schedule=None, + start_date=datetime(2024, 1, 31), + tags=["database", "elasticsearch"], + max_active_runs=1, + catchup=False, + doc_md=__doc__, + params={ + "media_type": Param( + default=AUDIO, + enum=MEDIA_TYPES, + description="The media type for which to create the index.", + ), + "percentage_of_prod": Param( + default=0.5, + type="number", + exclusiveMinimum=0, + maximum=1, + description=( + "The proportion of items to take of each provider from" + " the total amount existing in the source index." 
+ ), + ), + "source_index": Param( + default=None, + type=["string", "null"], + description=( + "Optionally, the existing staging Elasticsearch index" + " to use as the basis for the new index. If not provided," + " the index aliased to `-filtered` will be used." + ), + ), + }, + render_template_as_native_obj=True, +) +def create_proportional_by_source_staging_index(): + # Fail early if any conflicting DAGs are running + prevent_concurrency = prevent_concurrency_with_dags( + external_dag_ids=[ + STAGING_DB_RESTORE_DAG_ID, + RECREATE_STAGING_INDEX_DAG_ID, + CREATE_NEW_INDEX_CONFIGS[STAGING].dag_id, + ] + ) + + es_host = es.get_es_host(environment=STAGING) + + source_index_name = create_index.get_source_index( + source_index="{{ params.source_index }}", + media_type="{{ params.media_type }}", + ) + + source_config = es.get_index_configuration( + source_index=source_index_name, es_host=es_host + ) + + destination_index_name = create_index.get_destination_index_name( + media_type="{{ params.media_type }}", + current_datetime_str="{{ ts_nodash }}", + percentage_of_prod="{{ params.percentage_of_prod }}", + ) + + destination_alias = create_index.get_destination_alias( + media_type="{{ params.media_type }}" + ) + + destination_index_config = create_index.get_destination_index_config( + source_config=source_config, destination_index_name=destination_index_name + ) + + new_index = es.create_index(index_config=destination_index_config, es_host=es_host) + + staging_source_counts = create_index.get_staging_source_counts( + source_index=source_index_name, es_host=es_host + ) + + desired_source_counts = create_index.get_proportional_source_count_kwargs.override( + task_id="get_desired_source_counts" + )( + staging_source_counts=staging_source_counts, + percentage_of_prod="{{ params.percentage_of_prod }}", + ) + + reindex = es.trigger_and_wait_for_reindex.partial( + destination_index=destination_index_name, + source_index=source_index_name, + timeout=timedelta(hours=12), + requests_per_second="{{ var.value.get('ES_INDEX_THROTTLING_RATE', 20_000) }}", + # When slices are used to parallelize indexing, max_docs does + # not work reliably and the final proportions may be incorrect. 
+        slices=None,
+        # Do not refresh the index after each partial reindex
+        refresh=False,
+        es_host=es_host,
+    ).expand_kwargs(desired_source_counts)
+
+    refresh_destination_index = es.refresh_index(
+        index_name=destination_index_name, es_host=es_host
+    )
+
+    point_alias = es.point_alias(
+        index_name=destination_index_name, alias=destination_alias, es_host=es_host
+    )
+
+    notify_completion = slack.notify_slack(
+        text=f"Reindexing complete for {destination_index_name}.",
+        dag_id=DAG_ID,
+        username="Proportional by Source Staging Index Creation",
+        icon_emoji=":elasticsearch:",
+    )
+
+    # Setup additional dependencies
+    prevent_concurrency >> es_host
+    es_host >> [source_index_name, destination_index_name, destination_alias]
+    staging_source_counts >> desired_source_counts
+    new_index >> staging_source_counts
+    reindex >> refresh_destination_index >> point_alias >> notify_completion
+
+
+create_proportional_by_source_staging_index()
diff --git a/catalog/dags/elasticsearch_cluster/healthcheck_dag.py b/catalog/dags/elasticsearch_cluster/healthcheck_dag.py
index 63132d604b1..50807968d4c 100644
--- a/catalog/dags/elasticsearch_cluster/healthcheck_dag.py
+++ b/catalog/dags/elasticsearch_cluster/healthcheck_dag.py
@@ -25,10 +25,10 @@
 from elasticsearch import Elasticsearch
 
 from common.constants import ENVIRONMENTS, PRODUCTION, Environment
+from common.elasticsearch import get_es_host
 from common.sensors.utils import is_concurrent_with_any
 from common.slack import send_alert, send_message
 from data_refresh.data_refresh_types import DATA_REFRESH_CONFIGS
-from elasticsearch_cluster.shared import get_es_host
 
 
 logger = logging.getLogger(__name__)
diff --git a/catalog/dags/elasticsearch_cluster/recreate_staging_index/recreate_full_staging_index_dag.py b/catalog/dags/elasticsearch_cluster/recreate_staging_index/recreate_full_staging_index_dag.py
index 3b4cb8eb5a0..b91e93edb90 100644
--- a/catalog/dags/elasticsearch_cluster/recreate_staging_index/recreate_full_staging_index_dag.py
+++ b/catalog/dags/elasticsearch_cluster/recreate_staging_index/recreate_full_staging_index_dag.py
@@ -2,7 +2,8 @@
 # Recreate Full Staging Index DAG
 
 This DAG is used to fully recreate a new staging Elasticsearch index for a
-given `media_type`, using records pulled from the staging API database. It is
+given `media_type`, using records pulled from the staging API database rather
+than from a source index (like the `create_new_staging_es_index` DAG does). It is
 used to decouple the steps of creating a new index from the rest of the data
 refresh process.
 
@@ -33,8 +34,11 @@ cluster, it does _not_ interfere with the `data_refresh` or
 `create_filtered_index` DAGs.
 
-However, the DAG will exit immediately if the `staging_database_restore` DAG is
-running, as it operates on the staging API database.
+However, as the DAG operates on the staging API database, it will exit
+immediately if any of the following DAGs are running:
+* `staging_database_restore`
+* `create_proportional_by_source_staging_index`
+* `create_new_staging_es_index`
 """
 from datetime import datetime
diff --git a/catalog/dags/elasticsearch_cluster/shared.py b/catalog/dags/elasticsearch_cluster/shared.py
deleted file mode 100644
index 02fcb315cc6..00000000000
--- a/catalog/dags/elasticsearch_cluster/shared.py
+++ /dev/null
@@ -1,11 +0,0 @@
-from airflow.decorators import task
-from airflow.models.connection import Connection
-from airflow.models.xcom_arg import XComArg
-
-from common.constants import Environment
-
-
-@task
-def get_es_host(environment: Environment) -> XComArg:
-    conn = Connection.get_connection_from_secrets(f"elasticsearch_http_{environment}")
-    return conn.get_uri()
diff --git a/catalog/tests/dags/elasticsearch_cluster/create_new_es_index/test_create_proportional_by_source_staging_index_dag.py b/catalog/tests/dags/elasticsearch_cluster/create_new_es_index/test_create_proportional_by_source_staging_index_dag.py
new file mode 100644
index 00000000000..63f5188c0bf
--- /dev/null
+++ b/catalog/tests/dags/elasticsearch_cluster/create_new_es_index/test_create_proportional_by_source_staging_index_dag.py
@@ -0,0 +1,95 @@
+import pytest
+
+from elasticsearch_cluster.create_proportional_by_source_staging_index.create_proportional_by_source_staging_index import (
+    get_proportional_source_count_kwargs,
+)
+
+
+@pytest.mark.parametrize(
+    "staging_source_counts, percentage_of_prod, expected_results",
+    [
+        (
+            {"jamendo": 10_000, "freesound": 20_000, "wikimedia_audio": 10_000},
+            0.25,
+            [
+                {
+                    "max_docs": 2_500,
+                    "query": {"bool": {"filter": [{"term": {"source": "jamendo"}}]}},
+                },
+                {
+                    "max_docs": 5_000,
+                    "query": {"bool": {"filter": [{"term": {"source": "freesound"}}]}},
+                },
+                {
+                    "max_docs": 2_500,
+                    "query": {
+                        "bool": {"filter": [{"term": {"source": "wikimedia_audio"}}]}
+                    },
+                },
+            ],
+        ),
+        (
+            {
+                "jamendo": 10_000,
+                "freesound": 20_000,
+            },
+            0.0,
+            [
+                {
+                    "max_docs": 0,
+                    "query": {"bool": {"filter": [{"term": {"source": "jamendo"}}]}},
+                },
+                {
+                    "max_docs": 0,
+                    "query": {"bool": {"filter": [{"term": {"source": "freesound"}}]}},
+                },
+            ],
+        ),
+        (
+            {
+                "jamendo": 982,
+                "freesound": 423,
+            },
+            1.0,
+            [
+                {
+                    "max_docs": 982,
+                    "query": {"bool": {"filter": [{"term": {"source": "jamendo"}}]}},
+                },
+                {
+                    "max_docs": 423,
+                    "query": {"bool": {"filter": [{"term": {"source": "freesound"}}]}},
+                },
+            ],
+        ),
+        # Proportions do not divide evenly into the estimated new index total.
+        (
+            # All sources are exactly 1/3 of the index
+            {"flickr": 3_333, "stocksnap": 3_333, "smk": 3_333},
+            0.5,
+            [
+                {
+                    # Note that each source gets 1_666 records (because it is
+                    # rounded), for a total of 4_998 records in the new index.
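+                    # (round(1666.5) == 1666: Python rounds ties to the
+                    # nearest even integer.)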
+ "max_docs": 1_666, + "query": {"bool": {"filter": [{"term": {"source": "flickr"}}]}}, + }, + { + "max_docs": 1_666, + "query": {"bool": {"filter": [{"term": {"source": "stocksnap"}}]}}, + }, + { + "max_docs": 1_666, + "query": {"bool": {"filter": [{"term": {"source": "smk"}}]}}, + }, + ], + ), + ], +) +def test_get_proportional_source_count_kwargs( + staging_source_counts, percentage_of_prod, expected_results +): + actual_results = get_proportional_source_count_kwargs.function( + staging_source_counts, percentage_of_prod + ) + assert actual_results == expected_results diff --git a/documentation/catalog/reference/DAGs.md b/documentation/catalog/reference/DAGs.md index 94055eafe25..15a6875e0a8 100644 --- a/documentation/catalog/reference/DAGs.md +++ b/documentation/catalog/reference/DAGs.md @@ -42,15 +42,16 @@ The following are DAGs grouped by their primary tag: ### Database -| DAG ID | Schedule Interval | -| --------------------------------------------------------------------------------- | ----------------- | -| [`batched_update`](#batched_update) | `None` | -| [`delete_records`](#delete_records) | `None` | -| [`recreate_audio_popularity_calculation`](#recreate_audio_popularity_calculation) | `None` | -| [`recreate_full_staging_index`](#recreate_full_staging_index) | `None` | -| [`recreate_image_popularity_calculation`](#recreate_image_popularity_calculation) | `None` | -| [`report_pending_reported_media`](#report_pending_reported_media) | `@weekly` | -| [`staging_database_restore`](#staging_database_restore) | `@monthly` | +| DAG ID | Schedule Interval | +| --------------------------------------------------------------------------------------------- | ----------------- | +| [`batched_update`](#batched_update) | `None` | +| [`create_proportional_by_source_staging_index`](#create_proportional_by_source_staging_index) | `None` | +| [`delete_records`](#delete_records) | `None` | +| [`recreate_audio_popularity_calculation`](#recreate_audio_popularity_calculation) | `None` | +| [`recreate_full_staging_index`](#recreate_full_staging_index) | `None` | +| [`recreate_image_popularity_calculation`](#recreate_image_popularity_calculation) | `None` | +| [`report_pending_reported_media`](#report_pending_reported_media) | `@weekly` | +| [`staging_database_restore`](#staging_database_restore) | `@monthly` | ### Elasticsearch @@ -144,6 +145,7 @@ The following is documentation associated with each DAG (where available): 1. [`create_filtered_image_index`](#create_filtered_image_index) 1. [`create_new_production_es_index`](#create_new_production_es_index) 1. [`create_new_staging_es_index`](#create_new_staging_es_index) +1. [`create_proportional_by_source_staging_index`](#create_proportional_by_source_staging_index) 1. [`delete_records`](#delete_records) 1. [`europeana_workflow`](#europeana_workflow) 1. [`finnish_museums_workflow`](#finnish_museums_workflow) @@ -696,6 +698,43 @@ The resulting, merged configuration will be: } ``` +### `create_proportional_by_source_staging_index` + +#### Create Proportional By Source Staging Index DAG + +This DAG is used to create a new staging Elasticsearch index that is a subset of +a staging source index, such that the proportions of records by source in the +new index is equal to the proportions of records by source in the source index. + +Required Dagrun Configuration parameters: + +- media_type: The media type for which to create a new index. 
+- percentage_of_prod: A float indicating the proportion of items to take from
+  each source, relative to the total amount existing in the staging source
+  index.
+
+Optional params:
+
+- source_index: An existing staging Elasticsearch index to use as the basis
+  for the new index. If not provided, the index aliased to
+  `<media_type>-filtered` will be used.
+
+##### When this DAG runs
+
+This DAG is on a `None` schedule and is run manually.
+
+##### Race conditions
+
+Because this DAG runs on the staging ingestion server and staging elasticsearch
+cluster, it does _not_ interfere with the `data_refresh` or
+`create_filtered_index` DAGs.
+
+However, as the DAG operates on the staging API database, it will exit
+immediately if any of the following DAGs are running:
+
+- `staging_database_restore`
+- `recreate_full_staging_index`
+- `create_new_staging_es_index`
+
 ### `delete_records`
 
 #### Delete Records DAG
@@ -1086,9 +1125,10 @@ code is deployed for the calculation.
 #### Recreate Full Staging Index DAG
 
 This DAG is used to fully recreate a new staging Elasticsearch index for a given
-`media_type`, using records pulled from the staging API database. It is used to
-decouple the steps of creating a new index from the rest of the data refresh
-process.
+`media_type`, using records pulled from the staging API database rather than
+from a source index (like the `create_new_staging_es_index` DAG does). It is
+used to decouple the steps of creating a new index from the rest of the data
+refresh process.
 
 Staging index creation is handled by the _staging_ ingestion server. The DAG
 triggers the ingestion server `REINDEX` action to create a new index in the
@@ -1117,8 +1157,12 @@ Because this DAG runs on the staging ingestion server and staging elasticsearch
 cluster, it does _not_ interfere with the `data_refresh` or
 `create_filtered_index` DAGs.
 
-However, the DAG will exit immediately if the `staging_database_restore` DAG is
-running, as it operates on the staging API database.
+However, as the DAG operates on the staging API database, it will exit
+immediately if any of the following DAGs are running:
+
+- `staging_database_restore`
+- `create_proportional_by_source_staging_index`
+- `create_new_staging_es_index`
 
 ### `recreate_image_popularity_calculation`