From 90180247fe23cedcdcc32249fdb9d7b25bf6051d Mon Sep 17 00:00:00 2001 From: john-gom <116556069+john-gom@users.noreply.github.com> Date: Tue, 31 Oct 2023 12:29:32 +0000 Subject: [PATCH] feat: Use query service for obsolete queries (#9228) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Split dockerfile to avoid timeouts * Pass query parameters on to query service * Ensure obsolete queries are cached separately * Remove references to products_tags * PR suggestion Co-authored-by: Stéphane Gigandet * Fix use of request_param --------- Co-authored-by: Stéphane Gigandet --- .github/labeler.yml | 1 - .github/workflows/daily.yml | 33 ------- Dockerfile | 4 +- Makefile | 4 - lib/ProductOpener/Data.pm | 47 +++------- lib/ProductOpener/Display.pm | 103 +++------------------- scripts/checkmongodb.pl | 12 --- scripts/refresh_products_tags.js | 143 ------------------------------- tests/integration/unknown_tags.t | 2 +- 9 files changed, 30 insertions(+), 319 deletions(-) delete mode 100644 scripts/refresh_products_tags.js diff --git a/.github/labeler.yml b/.github/labeler.yml index 31f90dd25df87..2510d085e859c 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -12,7 +12,6 @@ nginx: mongodb: - conf/mongodb/create_indexes.js -- scripts/refresh_products_tags.js - .github/workflows/mongo-deploy.yml - scripts/checkmongodb.pl - scripts/update_all_products_from_dir_in_mongodb.pl diff --git a/.github/workflows/daily.yml b/.github/workflows/daily.yml index a5a97318fcdea..83d4179c3feee 100644 --- a/.github/workflows/daily.yml +++ b/.github/workflows/daily.yml @@ -66,36 +66,3 @@ jobs: # make task fail intentionally false fi - - refresh-products-tags: - runs-on: ubuntu-latest - timeout-minutes: 180 - strategy: - matrix: - env: [mongo-dev] - environment: ${{ matrix.env }} - concurrency: ${{ matrix.env }} - needs: dev-db-sync - steps: - - name: Set various variables for the staging (.net) deployment - if: matrix.env == 'mongo-dev' - run: | - # deploy target - echo "SSH_HOST=10.1.0.200" >> $GITHUB_ENV - echo "SSH_PROXY_HOST=ovh1.openfoodfacts.org" >> $GITHUB_ENV - echo "SSH_USERNAME=off" >> $GITHUB_ENV - - name: Refresh MongoDB products_tags collection - uses: appleboy/ssh-action@master - with: - host: ${{ env.SSH_HOST }} - username: ${{ env.SSH_USERNAME }} - key: ${{ secrets.SSH_PRIVATE_KEY }} - proxy_host: ${{ env.SSH_PROXY_HOST }} - proxy_username: ${{ env.SSH_USERNAME }} - proxy_key: ${{ secrets.SSH_PRIVATE_KEY }} - command_timeout: 180m - script_stop: false - script: | - cd ${{ matrix.env }} - make refresh_product_tags - make restart_db diff --git a/Dockerfile b/Dockerfile index 575c12ba537de..6e41735fa7dc1 100644 --- a/Dockerfile +++ b/Dockerfile @@ -68,7 +68,9 @@ RUN --mount=type=cache,id=apt-cache,target=/var/cache/apt set -x && \ liblog-any-adapter-log4perl-perl \ # NB: not available in ubuntu 1804 LTS: libgeoip2-perl \ - libemail-valid-perl \ + libemail-valid-perl +RUN --mount=type=cache,id=apt-cache,target=/var/cache/apt set -x && \ + apt install -y \ # # cpan dependencies that can be satisfied by apt even if the package itself can't: # diff --git a/Makefile b/Makefile index 5e032d3705c62..9a6361325f2ed 100644 --- a/Makefile +++ b/Makefile @@ -197,10 +197,6 @@ create_mongodb_indexes: ${DOCKER_COMPOSE} exec -T mongodb //bin/sh -c "mongo off /data/db/create_indexes.js" refresh_product_tags: - @echo "🥫 Refreshing products tags (update MongoDB products_tags collection) …" -# get id for mongodb container - docker cp scripts/refresh_products_tags.js $(shell docker-compose ps -q mongodb):/data/db - ${DOCKER_COMPOSE} exec -T mongodb //bin/sh -c "mongo off /data/db/refresh_products_tags.js" @echo "🥫 Refreshing product data cached in Postgres …" ${DOCKER_COMPOSE} run --rm backend perl /opt/product-opener/scripts/refresh_postgres.pl ${from} diff --git a/lib/ProductOpener/Data.pm b/lib/ProductOpener/Data.pm index 5501b2ddb0a65..9ba937258a4d2 100644 --- a/lib/ProductOpener/Data.pm +++ b/lib/ProductOpener/Data.pm @@ -27,19 +27,10 @@ ProductOpener::Data - methods to create or get the mongoDB client and fetch "dat The module implements the methods required to fetch certain collections from the MongoDB database. The functions used in this module are responsible for executing queries, to get connection to database and also to select the collection required. -The module exposes 2 distinct kinds of collections, products and products_tags, returned by the Data::get_products_collections -and the Data::get_products_tags_collections methods respectively. - The products collection contains a complete document for each product in the OpenFoodFacts database which exposes all available information about the product. -The products_tags collection contains a stripped down version of the data in the products collection, where each -product entry has a select few fields, including fields used in tags. The main purpose of having this copy is to -improve performance of aggregate queries for an improved user experience and more efficient resource usage. This -collection was initially proposed in L on -GitHub, where some additional context is available. - -Obsolete products that have been withdrawn from the market have separate collections: products_obsolete and products_obsolete_tags +Obsolete products that have been withdrawn from the market have separate collections: products_obsolete =cut @@ -74,6 +65,7 @@ use ProductOpener::Config qw/:all/; use MongoDB; use Tie::IxHash; use JSON::PP; +use CGI ':cgi-lib'; use Log::Any qw($log); use Action::CircuitBreaker; @@ -119,25 +111,27 @@ sub execute_query ($sub) { )->run(); } -sub execute_aggregate_tags_query ($aggregate_parameters) { - return execute_tags_query('aggregate', $aggregate_parameters); +sub execute_aggregate_tags_query ($query) { + return execute_tags_query('aggregate', $query); } -sub execute_count_tags_query ($query_ref) { - return execute_tags_query('count', $query_ref); +sub execute_count_tags_query ($query) { + return execute_tags_query('count', $query); } -sub execute_tags_query ($type, $parameters) { +sub execute_tags_query ($type, $query) { if ((defined $query_url) and (length($query_url) > 0)) { $query_url =~ s/^\s+|\s+$//g; - my $path = "$query_url/$type"; - $log->debug('Executing PostgreSQL ' . $type . ' query on ' . $path, {query => $parameters}) + my $params = Vars(); + my $url = URI->new("$query_url/$type"); + $url->query_form($params); + $log->debug('Executing PostgreSQL ' . $type . ' query on ' . $url, {query => $query}) if $log->is_debug(); my $ua = LWP::UserAgent->new(); my $resp = $ua->post( - $path, - Content => encode_json($parameters), + $url, + Content => encode_json($query), 'Content-Type' => 'application/json; charset=utf-8' ); if ($resp->is_success) { @@ -184,16 +178,6 @@ This is useful when moving products to another flavour If set to a true value, the function returns a collection that contains only obsolete products, otherwise it returns the collection with products that are not obsolete. -=head4 tags - -If set to a true value, the function may return a smaller collection that contains only the *_tags fields, -in order to speed aggregate queries. The smaller collection is created every night, -and may therefore contain slightly stale data. - -As of 2023/03/13, we return the products_tags collection for non obsolete products. -For obsolete products, we currently return the products_obsolete collection, but we might -create a separate products_obsolete_tags collection in the future, if it becomes necessary to create one. - =head3 Return values Returns a mongoDB collection object. @@ -206,11 +190,6 @@ sub get_products_collection ($parameters_ref = {}) { if ($parameters_ref->{obsolete}) { $collection .= '_obsolete'; } - # We don't have a products_obsolete_tags collection at this point - # if it changes, the following elsif should be changed to a if - elsif ($parameters_ref->{tags}) { - $collection .= '_tags'; - } return get_collection($database, $collection, $parameters_ref->{timeout}); } diff --git a/lib/ProductOpener/Display.pm b/lib/ProductOpener/Display.pm index 1bd05ff9e85b4..9ef1789ad188c 100644 --- a/lib/ProductOpener/Display.pm +++ b/lib/ProductOpener/Display.pm @@ -1515,6 +1515,14 @@ sub can_use_query_cache() { and (not $server_options{producers_platform})); } +sub generate_query_cache_key ($name, $context_ref, $request_ref) { + # Generates a cache key taking the obsolete parameter into account + if (scalar request_param($request_ref, "obsolete")) { + $name .= '_obsolete'; + } + return generate_cache_key($name, $context_ref); +} + sub query_list_of_tags ($request_ref, $query_ref) { add_country_and_owner_filters_to_query($request_ref, $query_ref); @@ -1607,7 +1615,7 @@ sub query_list_of_tags ($request_ref, $query_ref) { } #get cache results for aggregate query - my $key = generate_cache_key("aggregate", $aggregate_parameters); + my $key = generate_query_cache_key("aggregate", $aggregate_parameters, $request_ref); $log->debug("MongoDB query key", {key => $key}) if $log->is_debug(); my $results = get_cache_results($key, $request_ref); @@ -1617,33 +1625,6 @@ sub query_list_of_tags ($request_ref, $query_ref) { # or if we are on the producers platform if (can_use_query_cache()) { $results = execute_aggregate_tags_query($aggregate_parameters); - - if (not defined $results) { - eval { - $log->debug("Executing MongoDB aggregate query on products_tags collection", - {query => $aggregate_parameters}) - if $log->is_debug(); - $results = execute_query( - sub { - return get_products_collection( - get_products_collection_request_parameters($request_ref, {tags => 1})) - ->aggregate($aggregate_parameters, {allowDiskUse => 1}); - } - ); - # the return value of aggregate has changed from version 0.702 - # and v1.4.5 of the perl MongoDB module - $results = [$results->all] if defined $results; - }; - my $err = $@; - if ($err) { - $log->warn("MongoDB error", {error => $err}) if $log->is_warn(); - } - else { - $log->info("MongoDB query ok", {error => $err}) if $log->is_info(); - } - - $log->debug("MongoDB query done", {error => $err}) if $log->is_debug(); - } } if (not defined $results) { @@ -1702,7 +1683,7 @@ sub query_list_of_tags ($request_ref, $query_ref) { else { #get total count for aggregate (without limit) and put result in cache - my $key_count = generate_cache_key("aggregate_count", $aggregate_count_parameters); + my $key_count = generate_query_cache_key("aggregate_count", $aggregate_count_parameters, $request_ref); $log->debug("MongoDB aggregate count query key", {key => $key_count}) if $log->is_debug(); my $results_count = get_cache_results($key_count, $request_ref); @@ -1713,21 +1694,6 @@ sub query_list_of_tags ($request_ref, $query_ref) { # or if we are on the producers platform if (can_use_query_cache()) { $count_results = execute_aggregate_tags_query($aggregate_count_parameters); - - if (not defined $count_results) { - eval { - $log->debug("Executing MongoDB aggregate count query on products_tags collection", - {query => $aggregate_count_parameters}) - if $log->is_debug(); - $count_results = execute_query( - sub { - return get_products_collection( - get_products_collection_request_parameters($request_ref, {tags => 1})) - ->aggregate($aggregate_count_parameters, {allowDiskUse => 1}); - } - ); - } - } } if (not defined $count_results) { @@ -4629,8 +4595,6 @@ This function looks at the request object to set parameters to pass to the get_p =head4 $additional_parameters_ref An optional reference to a hash of parameters that should be added to the parameters extracted from the request object. -This is useful in particular to request the query to be executed on the smaller products_tags category, by passing -{ tags = 1 } =head3 Return value @@ -5215,7 +5179,7 @@ sub search_and_display_products ($request_ref, $query_ref, $sort_by, $limit, $pa skip => $skip ]; - my $key = generate_cache_key("search_products", $mongodb_query_ref); + my $key = generate_query_cache_key("search_products", $mongodb_query_ref, $request_ref); $log->debug("MongoDB query key - search_products", {key => $key}) if $log->is_debug(); @@ -5258,7 +5222,7 @@ sub search_and_display_products ($request_ref, $query_ref, $sort_by, $limit, $pa $request_ref->{structured_response}{page_count} = $page_count; - # The page count may be higher than the count from the products_tags collection which is updated every night + # The page count may be higher than the count from the query service which is updated every night # in that case, set $count to $page_count # It's also possible that the count query had a timeout and that $count is 0 even though we have results if ($page_count > $count) { @@ -5536,7 +5500,7 @@ sub estimate_result_count ($request_ref, $query_ref, $cache_results_flag) { } elsif (keys %{$query_ref} > 0) { #check if count results is in cache - my $key_count = generate_cache_key("search_products_count", $query_ref); + my $key_count = generate_query_cache_key("search_products_count", $query_ref, $request_ref); $log->debug("MongoDB query key - search_products_count", {key => $key_count}) if $log->is_debug(); $count = get_cache_results($key_count, $request_ref); if (not defined $count) { @@ -5546,47 +5510,6 @@ sub estimate_result_count ($request_ref, $query_ref, $cache_results_flag) { # Count queries are very expensive, if possible, execute them on the postgres cache if (can_use_query_cache()) { $count = execute_count_tags_query($query_ref); - - if (not defined $count) { - # Count queries are very expensive, if possible, execute them on the smaller products_tags collection - my $only_tags_filters = 1; - - if ($server_options{producers_platform}) { - $only_tags_filters = 0; - } - else { - - foreach my $field (keys %$query_ref) { - if ($field !~ /_tags$/) { - $log->debug( - "non tags field in query filters, cannot use smaller products_tags collection", - {field => $field, value => $query_ref->{field}}) - if $log->is_debug(); - $only_tags_filters = 0; - last; - } - } - } - - if ( ($only_tags_filters) - and ((not defined single_param("no_cache")) or (single_param("no_cache") == 0))) - { - - $count = execute_query( - sub { - $log->debug("count_documents on smaller products_tags collection", {key => $key_count}) - if $log->is_debug(); - return get_products_collection( - get_products_collection_request_parameters($request_ref, {tags => 1})) - ->count_documents($query_ref); - } - ); - $err = $@; - if ($err) { - $log->warn("MongoDB error during count", {error => $err}) if $log->is_warn(); - } - } - } } if (not defined $count) { diff --git a/scripts/checkmongodb.pl b/scripts/checkmongodb.pl index 2fb30a4c98620..ac0e525da0927 100755 --- a/scripts/checkmongodb.pl +++ b/scripts/checkmongodb.pl @@ -40,18 +40,6 @@ ($) return; } -my $count; - -eval { - $count = execute_query( - sub { - return get_products_tags_collection()->count_documents({}); - } - ); -}; - -print STDERR "count: $count\n"; - if ($@) { my $hostname = `hostname`; my $msg = "Host: $hostname - Mongodb down: $@\n"; diff --git a/scripts/refresh_products_tags.js b/scripts/refresh_products_tags.js deleted file mode 100644 index efc09e5fe7676..0000000000000 --- a/scripts/refresh_products_tags.js +++ /dev/null @@ -1,143 +0,0 @@ -/*global db*/ -/* - Name : refresh_products_tags.js - Description : Refresh products_tags collection by copying *_tags fields from products collection - Version : 1.1 - Usage : mongo dbname refresh_products_tags.js - Add in crontab for daily refresh : - 00 02 * * * /usr/bin/mongo off /etc/mongo/refresh_products_tags.js >> /var/log/mongodb/refresh_products_tags.log -*/ - -print(Date() + ' : Refresh products_tags collection...'); -db.products.aggregate( [ -{"$project" : - { - creator:1, - countries_tags:1, - brands_tags:1, - categories_tags:1, - labels_tags:1, - packaging_tags:1, - origins_tags:1, - manufacturing_places_tags:1, - emb_codes_tags:1, - ingredients_tags:1, - additives_tags:1, - vitamins_tags:1, - minerals_tags:1, - amino_acids_tags:1, - nucleotides_tags:1, - allergens_tags:1, - traces_tags:1, - nova_groups_tags:1, - nutrition_grades_tags:1, - languages_tags:1, - creator_tags:1, - editors_tags:1, - states_tags:1, - entry_dates_tags:1, - last_edit_dates_tags:1, - codes_tags:1, - nutrient_levels_tags:1, - stores_tags:1, - informers_tags:1, - photographers_tags:1, - checkers_tags:1, - correctors_tags:1, - ingredients_from_palm_oil_tags:1, - ingredients_that_may_be_from_palm_oil_tags:1, - purchase_places_tags:1, - ingredients_n_tags:1, - pnns_groups_1_tags:1, - pnns_groups_2_tags:1, - misc_tags:1, - quality_tags:1, - unknown_nutrients_tags:1, - last_image_dates_tags:1, - cities_tags:1, - ingredients_analysis_tags:1, - popularity_tags:1, - data_sources_tags:1, - data_quality_tags:1, - data_quality_bugs_tags:1, - data_quality_info_tags:1, - data_quality_warnings_tags:1, - data_quality_errors_tags:1, - teams_tags:1, - categories_properties_tags:1, - ecoscore_tags:1, - owners_tags:1, - food_groups_tags:1, - weighers_tags:1, - }}, -{"$out": "products_tags"} -]); - -// estimatedDocumentCount() is available from mongodb 4.0.3 -//print(Date() + ' : ' + db.products_tags.estimatedDocumentCount() + ' products refreshed in products_tags collection.'); -print(Date() + ' : ' + db.products_tags.count() + ' products refreshed in products_tags collection.'); - - -print(Date() + ' : Create indexes for products_tags collection (if not already existing)...'); - -db.products_tags.createIndex({creator:1}, { background: true }); -db.products_tags.createIndex({countries_tags:1}, { background: true }); -db.products_tags.createIndex({brands_tags:1}, { background: true }); -db.products_tags.createIndex({categories_tags:1}, { background: true }); -db.products_tags.createIndex({labels_tags:1}, { background: true }); -db.products_tags.createIndex({packaging_tags:1}, { background: true }); -db.products_tags.createIndex({origins_tags:1}, { background: true }); -db.products_tags.createIndex({manufacturing_places_tags:1}, { background: true }); -db.products_tags.createIndex({emb_codes_tags:1}, { background: true }); -db.products_tags.createIndex({ingredients_tags:1}, { background: true }); -db.products_tags.createIndex({additives_tags:1}, { background: true }); -db.products_tags.createIndex({vitamins_tags:1}, { background: true }); -db.products_tags.createIndex({minerals_tags:1}, { background: true }); -db.products_tags.createIndex({amino_acids_tags:1}, { background: true }); -db.products_tags.createIndex({nucleotides_tags:1}, { background: true }); -db.products_tags.createIndex({allergens_tags:1}, { background: true }); -db.products_tags.createIndex({traces_tags:1}, { background: true }); -db.products_tags.createIndex({nova_groups_tags:1}, { background: true }); -db.products_tags.createIndex({nutrition_grades_tags:1}, { background: true }); -db.products_tags.createIndex({languages_tags:1}, { background: true }); -db.products_tags.createIndex({creator_tags:1}, { background: true }); -db.products_tags.createIndex({editors_tags:1}, { background: true }); -db.products_tags.createIndex({states_tags:1}, { background: true }); -db.products_tags.createIndex({entry_dates_tags:1}, { background: true }); -db.products_tags.createIndex({last_edit_dates_tags:1}, { background: true }); -db.products_tags.createIndex({codes_tags:1}, { background: true }); -db.products_tags.createIndex({nutrient_levels_tags:1}, { background: true }); -db.products_tags.createIndex({stores_tags:1}, { background: true }); -db.products_tags.createIndex({informers_tags:1}, { background: true }); -db.products_tags.createIndex({photographers_tags:1}, { background: true }); -db.products_tags.createIndex({checkers_tags:1}, { background: true }); -db.products_tags.createIndex({correctors_tags:1}, { background: true }); -db.products_tags.createIndex({ingredients_from_palm_oil_tags:1}, { background: true }); -db.products_tags.createIndex({ingredients_that_may_be_from_palm_oil_tags:1}, { background: true }); -db.products_tags.createIndex({purchase_places_tags:1}, { background: true }); -db.products_tags.createIndex({ingredients_n_tags:1}, { background: true }); -db.products_tags.createIndex({pnns_groups_1_tags:1}, { background: true }); -db.products_tags.createIndex({pnns_groups_2_tags:1}, { background: true }); -db.products_tags.createIndex({misc_tags:1}, { background: true }); -db.products_tags.createIndex({quality_tags:1}, { background: true }); -db.products_tags.createIndex({unknown_nutrients_tags:1}, { background: true }); -db.products_tags.createIndex({last_image_dates_tags:1}, { background: true }); -db.products_tags.createIndex({cities_tags:1}, { background: true }); -db.products_tags.createIndex({ingredients_analysis_tags:1}, { background: true }); -db.products_tags.createIndex({popularity_tags:1}, { background: true }); -db.products_tags.createIndex({data_sources_tags:1}, { background: true }); -db.products_tags.createIndex({data_quality_tags:1}, { background: true }); -db.products_tags.createIndex({data_quality_bugs_tags:1}, { background: true }); -db.products_tags.createIndex({data_quality_info_tags:1}, { background: true }); -db.products_tags.createIndex({data_quality_warnings_tags:1}, { background: true }); -db.products_tags.createIndex({data_quality_errors_tags:1}, { background: true }); -db.products_tags.createIndex({teams_tags:1}, { background: true }); -db.products_tags.createIndex({categories_properties_tags:1}, { background: true }); -db.products_tags.createIndex({owners_tags:1}, { background: true }); -db.products_tags.createIndex({ecoscore_tags:1}, { background: true }); -db.products_tags.createIndex({nutriscore_score_opposite: -1}, { background: true }); -db.products_tags.createIndex({ecoscore_score: -1}, { background: true }); -db.products_tags.createIndex({popularity_key: -1}, { background: true }); -db.products_tags.createIndex({food_groups_tags:1}, { background: true }); - -print(Date() + ' : Refresh done.'); diff --git a/tests/integration/unknown_tags.t b/tests/integration/unknown_tags.t index 87b6dea985654..ead817a085625 100644 --- a/tests/integration/unknown_tags.t +++ b/tests/integration/unknown_tags.t @@ -119,7 +119,7 @@ my $tests_ref = [ { test_case => 'ingredient-someunknowningredient-does-not-exist-but-not-empty-labels', method => 'GET', - # we need &no_cache=1 in order to get results (otherwise we use the products_tags collection) + # we need &no_cache=1 in order to get results (otherwise we use the query service) path => '/ingredient/someunknowningredient/labels&no_cache=1', expected_status_code => 200, expected_type => 'html',