Skip to content

Commit

Permalink
feat: Use query service for obsolete queries (#9228)
Browse files Browse the repository at this point in the history
* Split dockerfile to avoid timeouts

* Pass query parameters on to query service

* Ensure obsolete queries are cached separately

* Remove references to products_tags

* PR suggestion

Co-authored-by: Stéphane Gigandet <[email protected]>

* Fix use of request_param

---------

Co-authored-by: Stéphane Gigandet <[email protected]>
  • Loading branch information
john-gom and stephanegigandet authored Oct 31, 2023
1 parent 67f09c1 commit 9018024
Show file tree
Hide file tree
Showing 9 changed files with 30 additions and 319 deletions.
1 change: 0 additions & 1 deletion .github/labeler.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@ nginx:

mongodb:
- conf/mongodb/create_indexes.js
- scripts/refresh_products_tags.js
- .github/workflows/mongo-deploy.yml
- scripts/checkmongodb.pl
- scripts/update_all_products_from_dir_in_mongodb.pl
Expand Down
33 changes: 0 additions & 33 deletions .github/workflows/daily.yml
Original file line number Diff line number Diff line change
Expand Up @@ -66,36 +66,3 @@ jobs:
# make task fail intentionally
false
fi
refresh-products-tags:
runs-on: ubuntu-latest
timeout-minutes: 180
strategy:
matrix:
env: [mongo-dev]
environment: ${{ matrix.env }}
concurrency: ${{ matrix.env }}
needs: dev-db-sync
steps:
- name: Set various variables for the staging (.net) deployment
if: matrix.env == 'mongo-dev'
run: |
# deploy target
echo "SSH_HOST=10.1.0.200" >> $GITHUB_ENV
echo "SSH_PROXY_HOST=ovh1.openfoodfacts.org" >> $GITHUB_ENV
echo "SSH_USERNAME=off" >> $GITHUB_ENV
- name: Refresh MongoDB products_tags collection
uses: appleboy/ssh-action@master
with:
host: ${{ env.SSH_HOST }}
username: ${{ env.SSH_USERNAME }}
key: ${{ secrets.SSH_PRIVATE_KEY }}
proxy_host: ${{ env.SSH_PROXY_HOST }}
proxy_username: ${{ env.SSH_USERNAME }}
proxy_key: ${{ secrets.SSH_PRIVATE_KEY }}
command_timeout: 180m
script_stop: false
script: |
cd ${{ matrix.env }}
make refresh_product_tags
make restart_db
4 changes: 3 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,9 @@ RUN --mount=type=cache,id=apt-cache,target=/var/cache/apt set -x && \
liblog-any-adapter-log4perl-perl \
# NB: not available in ubuntu 1804 LTS:
libgeoip2-perl \
libemail-valid-perl \
libemail-valid-perl
RUN --mount=type=cache,id=apt-cache,target=/var/cache/apt set -x && \
apt install -y \
#
# cpan dependencies that can be satisfied by apt even if the package itself can't:
#
Expand Down
4 changes: 0 additions & 4 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -197,10 +197,6 @@ create_mongodb_indexes:
${DOCKER_COMPOSE} exec -T mongodb //bin/sh -c "mongo off /data/db/create_indexes.js"

refresh_product_tags:
@echo "🥫 Refreshing products tags (update MongoDB products_tags collection) …"
# get id for mongodb container
docker cp scripts/refresh_products_tags.js $(shell docker-compose ps -q mongodb):/data/db
${DOCKER_COMPOSE} exec -T mongodb //bin/sh -c "mongo off /data/db/refresh_products_tags.js"
@echo "🥫 Refreshing product data cached in Postgres …"
${DOCKER_COMPOSE} run --rm backend perl /opt/product-opener/scripts/refresh_postgres.pl ${from}

Expand Down
47 changes: 13 additions & 34 deletions lib/ProductOpener/Data.pm
Original file line number Diff line number Diff line change
Expand Up @@ -27,19 +27,10 @@ ProductOpener::Data - methods to create or get the mongoDB client and fetch "dat
The module implements the methods required to fetch certain collections from the MongoDB database.
The functions used in this module are responsible for executing queries, to get connection to database and also to select the collection required.
The module exposes 2 distinct kinds of collections, products and products_tags, returned by the Data::get_products_collections
and the Data::get_products_tags_collections methods respectively.
The products collection contains a complete document for each product in the OpenFoodFacts database which exposes all
available information about the product.
The products_tags collection contains a stripped down version of the data in the products collection, where each
product entry has a select few fields, including fields used in tags. The main purpose of having this copy is to
improve performance of aggregate queries for an improved user experience and more efficient resource usage. This
collection was initially proposed in L<issue#1610|https://github.com/openfoodfacts/openfoodfacts-server/issues/1610> on
GitHub, where some additional context is available.
Obsolete products that have been withdrawn from the market have separate collections: products_obsolete and products_obsolete_tags
Obsolete products that have been withdrawn from the market have separate collections: products_obsolete
=cut

Expand Down Expand Up @@ -74,6 +65,7 @@ use ProductOpener::Config qw/:all/;
use MongoDB;
use Tie::IxHash;
use JSON::PP;
use CGI ':cgi-lib';
use Log::Any qw($log);

use Action::CircuitBreaker;
Expand Down Expand Up @@ -119,25 +111,27 @@ sub execute_query ($sub) {
)->run();
}

sub execute_aggregate_tags_query ($aggregate_parameters) {
return execute_tags_query('aggregate', $aggregate_parameters);
sub execute_aggregate_tags_query ($query) {
return execute_tags_query('aggregate', $query);
}

sub execute_count_tags_query ($query_ref) {
return execute_tags_query('count', $query_ref);
sub execute_count_tags_query ($query) {
return execute_tags_query('count', $query);
}

sub execute_tags_query ($type, $parameters) {
sub execute_tags_query ($type, $query) {
if ((defined $query_url) and (length($query_url) > 0)) {
$query_url =~ s/^\s+|\s+$//g;
my $path = "$query_url/$type";
$log->debug('Executing PostgreSQL ' . $type . ' query on ' . $path, {query => $parameters})
my $params = Vars();
my $url = URI->new("$query_url/$type");
$url->query_form($params);
$log->debug('Executing PostgreSQL ' . $type . ' query on ' . $url, {query => $query})
if $log->is_debug();

my $ua = LWP::UserAgent->new();
my $resp = $ua->post(
$path,
Content => encode_json($parameters),
$url,
Content => encode_json($query),
'Content-Type' => 'application/json; charset=utf-8'
);
if ($resp->is_success) {
Expand Down Expand Up @@ -184,16 +178,6 @@ This is useful when moving products to another flavour
If set to a true value, the function returns a collection that contains only obsolete products,
otherwise it returns the collection with products that are not obsolete.
=head4 tags
If set to a true value, the function may return a smaller collection that contains only the *_tags fields,
in order to speed aggregate queries. The smaller collection is created every night,
and may therefore contain slightly stale data.
As of 2023/03/13, we return the products_tags collection for non obsolete products.
For obsolete products, we currently return the products_obsolete collection, but we might
create a separate products_obsolete_tags collection in the future, if it becomes necessary to create one.
=head3 Return values
Returns a mongoDB collection object.
Expand All @@ -206,11 +190,6 @@ sub get_products_collection ($parameters_ref = {}) {
if ($parameters_ref->{obsolete}) {
$collection .= '_obsolete';
}
# We don't have a products_obsolete_tags collection at this point
# if it changes, the following elsif should be changed to a if
elsif ($parameters_ref->{tags}) {
$collection .= '_tags';
}
return get_collection($database, $collection, $parameters_ref->{timeout});
}

Expand Down
103 changes: 13 additions & 90 deletions lib/ProductOpener/Display.pm
Original file line number Diff line number Diff line change
Expand Up @@ -1515,6 +1515,14 @@ sub can_use_query_cache() {
and (not $server_options{producers_platform}));
}

sub generate_query_cache_key ($name, $context_ref, $request_ref) {
# Generates a cache key taking the obsolete parameter into account
if (scalar request_param($request_ref, "obsolete")) {
$name .= '_obsolete';
}
return generate_cache_key($name, $context_ref);
}

sub query_list_of_tags ($request_ref, $query_ref) {

add_country_and_owner_filters_to_query($request_ref, $query_ref);
Expand Down Expand Up @@ -1607,7 +1615,7 @@ sub query_list_of_tags ($request_ref, $query_ref) {
}

#get cache results for aggregate query
my $key = generate_cache_key("aggregate", $aggregate_parameters);
my $key = generate_query_cache_key("aggregate", $aggregate_parameters, $request_ref);
$log->debug("MongoDB query key", {key => $key}) if $log->is_debug();
my $results = get_cache_results($key, $request_ref);

Expand All @@ -1617,33 +1625,6 @@ sub query_list_of_tags ($request_ref, $query_ref) {
# or if we are on the producers platform
if (can_use_query_cache()) {
$results = execute_aggregate_tags_query($aggregate_parameters);

if (not defined $results) {
eval {
$log->debug("Executing MongoDB aggregate query on products_tags collection",
{query => $aggregate_parameters})
if $log->is_debug();
$results = execute_query(
sub {
return get_products_collection(
get_products_collection_request_parameters($request_ref, {tags => 1}))
->aggregate($aggregate_parameters, {allowDiskUse => 1});
}
);
# the return value of aggregate has changed from version 0.702
# and v1.4.5 of the perl MongoDB module
$results = [$results->all] if defined $results;
};
my $err = $@;
if ($err) {
$log->warn("MongoDB error", {error => $err}) if $log->is_warn();
}
else {
$log->info("MongoDB query ok", {error => $err}) if $log->is_info();
}

$log->debug("MongoDB query done", {error => $err}) if $log->is_debug();
}
}

if (not defined $results) {
Expand Down Expand Up @@ -1702,7 +1683,7 @@ sub query_list_of_tags ($request_ref, $query_ref) {
else {

#get total count for aggregate (without limit) and put result in cache
my $key_count = generate_cache_key("aggregate_count", $aggregate_count_parameters);
my $key_count = generate_query_cache_key("aggregate_count", $aggregate_count_parameters, $request_ref);
$log->debug("MongoDB aggregate count query key", {key => $key_count}) if $log->is_debug();
my $results_count = get_cache_results($key_count, $request_ref);

Expand All @@ -1713,21 +1694,6 @@ sub query_list_of_tags ($request_ref, $query_ref) {
# or if we are on the producers platform
if (can_use_query_cache()) {
$count_results = execute_aggregate_tags_query($aggregate_count_parameters);

if (not defined $count_results) {
eval {
$log->debug("Executing MongoDB aggregate count query on products_tags collection",
{query => $aggregate_count_parameters})
if $log->is_debug();
$count_results = execute_query(
sub {
return get_products_collection(
get_products_collection_request_parameters($request_ref, {tags => 1}))
->aggregate($aggregate_count_parameters, {allowDiskUse => 1});
}
);
}
}
}

if (not defined $count_results) {
Expand Down Expand Up @@ -4629,8 +4595,6 @@ This function looks at the request object to set parameters to pass to the get_p
=head4 $additional_parameters_ref
An optional reference to a hash of parameters that should be added to the parameters extracted from the request object.
This is useful in particular to request the query to be executed on the smaller products_tags category, by passing
{ tags = 1 }
=head3 Return value
Expand Down Expand Up @@ -5215,7 +5179,7 @@ sub search_and_display_products ($request_ref, $query_ref, $sort_by, $limit, $pa
skip => $skip
];

my $key = generate_cache_key("search_products", $mongodb_query_ref);
my $key = generate_query_cache_key("search_products", $mongodb_query_ref, $request_ref);

$log->debug("MongoDB query key - search_products", {key => $key}) if $log->is_debug();

Expand Down Expand Up @@ -5258,7 +5222,7 @@ sub search_and_display_products ($request_ref, $query_ref, $sort_by, $limit, $pa

$request_ref->{structured_response}{page_count} = $page_count;

# The page count may be higher than the count from the products_tags collection which is updated every night
# The page count may be higher than the count from the query service which is updated every night
# in that case, set $count to $page_count
# It's also possible that the count query had a timeout and that $count is 0 even though we have results
if ($page_count > $count) {
Expand Down Expand Up @@ -5536,7 +5500,7 @@ sub estimate_result_count ($request_ref, $query_ref, $cache_results_flag) {
}
elsif (keys %{$query_ref} > 0) {
#check if count results is in cache
my $key_count = generate_cache_key("search_products_count", $query_ref);
my $key_count = generate_query_cache_key("search_products_count", $query_ref, $request_ref);
$log->debug("MongoDB query key - search_products_count", {key => $key_count}) if $log->is_debug();
$count = get_cache_results($key_count, $request_ref);
if (not defined $count) {
Expand All @@ -5546,47 +5510,6 @@ sub estimate_result_count ($request_ref, $query_ref, $cache_results_flag) {
# Count queries are very expensive, if possible, execute them on the postgres cache
if (can_use_query_cache()) {
$count = execute_count_tags_query($query_ref);

if (not defined $count) {
# Count queries are very expensive, if possible, execute them on the smaller products_tags collection
my $only_tags_filters = 1;

if ($server_options{producers_platform}) {
$only_tags_filters = 0;
}
else {

foreach my $field (keys %$query_ref) {
if ($field !~ /_tags$/) {
$log->debug(
"non tags field in query filters, cannot use smaller products_tags collection",
{field => $field, value => $query_ref->{field}})
if $log->is_debug();
$only_tags_filters = 0;
last;
}
}
}

if ( ($only_tags_filters)
and ((not defined single_param("no_cache")) or (single_param("no_cache") == 0)))
{

$count = execute_query(
sub {
$log->debug("count_documents on smaller products_tags collection", {key => $key_count})
if $log->is_debug();
return get_products_collection(
get_products_collection_request_parameters($request_ref, {tags => 1}))
->count_documents($query_ref);
}
);
$err = $@;
if ($err) {
$log->warn("MongoDB error during count", {error => $err}) if $log->is_warn();
}
}
}
}

if (not defined $count) {
Expand Down
12 changes: 0 additions & 12 deletions scripts/checkmongodb.pl
Original file line number Diff line number Diff line change
Expand Up @@ -40,18 +40,6 @@ ($)
return;
}

my $count;

eval {
$count = execute_query(
sub {
return get_products_tags_collection()->count_documents({});
}
);
};

print STDERR "count: $count\n";

if ($@) {
my $hostname = `hostname`;
my $msg = "Host: $hostname - Mongodb down: $@\n";
Expand Down
Loading

0 comments on commit 9018024

Please sign in to comment.