Skip to content

Commit

Permalink
chore: improve database performance with indices and improved functions
Browse files Browse the repository at this point in the history
  • Loading branch information
ewan-escience committed Jul 26, 2024
1 parent 8e813bc commit 52bd311
Show file tree
Hide file tree
Showing 11 changed files with 128 additions and 117 deletions.
6 changes: 3 additions & 3 deletions data-generation/main.js
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ function generateUniqueCaseInsensitiveString(randomStringGenerator) {
throw 'Tried to generate a unique (ignoring case) string for 10000 times but failed to do so';
}

function generateMentions(amountExtra = 100) {
function generateMentions(amountExtra = 1000) {
const mentionTypes = [
'blogPost',
'book',
Expand Down Expand Up @@ -91,7 +91,7 @@ function generateMentions(amountExtra = 100) {
return result;
}

function generateSoftware(amount = 500) {
function generateSoftware(amount = 1000) {
// real software has a real concept DOI
const amountRealSoftware = Math.min(conceptDois.length, amount);
const brandNames = [];
Expand Down Expand Up @@ -399,7 +399,7 @@ function generateSoftwareHighlights(ids) {
return result;
}

function generateProjects(amount = 500) {
function generateProjects(amount = 1000) {
const result = [];

const projectStatuses = ['finished', 'running', 'starting'];
Expand Down
12 changes: 10 additions & 2 deletions database/009-create-keyword-and-category.sql
Original file line number Diff line number Diff line change
Expand Up @@ -42,12 +42,16 @@ CREATE TABLE keyword_for_software (
PRIMARY KEY (software, keyword)
);

CREATE INDEX keyword_for_software_keyword_idx ON keyword_for_software(keyword);

CREATE TABLE keyword_for_project (
project UUID references project (id),
keyword UUID references keyword (id),
PRIMARY KEY (project, keyword)
);

CREATE INDEX keyword_for_project_keyword_idx ON keyword_for_project(keyword);

-- ADD basic keywords from topics and tags
INSERT into keyword (value)
VALUES
Expand All @@ -72,6 +76,8 @@ CREATE TABLE keyword_for_community (
PRIMARY KEY (community, keyword)
);

CREATE INDEX keyword_for_community_keyword_idx ON keyword_for_community(keyword);


----------------
-- Categories --
Expand All @@ -92,8 +98,8 @@ CREATE TABLE category (
CONSTRAINT highlight_must_be_top_level_category CHECK (NOT ((properties->>'is_highlight')::boolean AND parent IS NOT NULL))
);

CREATE INDEX category_parent_idx ON category (parent);
CREATE INDEX category_community_idx ON category (community);
CREATE INDEX category_parent_idx ON category(parent);
CREATE INDEX category_community_idx ON category(community);


CREATE TABLE category_for_software (
Expand All @@ -102,6 +108,8 @@ CREATE TABLE category_for_software (
PRIMARY KEY (software_id, category_id)
);

CREATE INDEX category_for_software_category_id_idx ON category_for_software(category_id);


-- sanitize categories

Expand Down
2 changes: 2 additions & 0 deletions database/010-create-research-domain.sql
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,8 @@ CREATE TABLE research_domain_for_project (
PRIMARY KEY (project, research_domain)
);

CREATE INDEX research_domain_for_project_research_domain_idx ON research_domain_for_project(research_domain);


-- insert RSE list of research domains

Expand Down
9 changes: 9 additions & 0 deletions database/011-create-mention-table.sql
Original file line number Diff line number Diff line change
Expand Up @@ -83,20 +83,26 @@ CREATE TABLE mention_for_software (
PRIMARY KEY (mention, software)
);

CREATE INDEX mention_for_software_software_idx ON mention_for_software(software);


CREATE TABLE reference_paper_for_software (
mention UUID REFERENCES mention (id),
software UUID REFERENCES software (id),
PRIMARY KEY (mention, software)
);

CREATE INDEX reference_paper_for_software_software_idx ON reference_paper_for_software(software);


CREATE TABLE citation_for_mention (
mention UUID REFERENCES mention (id),
citation UUID REFERENCES mention (id),
PRIMARY KEY (mention, citation)
);

CREATE INDEX citation_for_mention_citation_idx ON citation_for_mention(citation);


CREATE FUNCTION search_mentions_for_software(software_id UUID, search_text VARCHAR) RETURNS SETOF mention STABLE LANGUAGE plpgsql AS
$$
Expand All @@ -120,6 +126,7 @@ CREATE TABLE output_for_project (
PRIMARY KEY (mention, project)
);

CREATE INDEX output_for_project_project_idx ON output_for_project(project);


CREATE TABLE impact_for_project (
Expand All @@ -128,6 +135,8 @@ CREATE TABLE impact_for_project (
PRIMARY KEY (mention, project)
);

CREATE INDEX impact_for_project_project_idx ON impact_for_project(project);


CREATE FUNCTION search_impact_for_project(project_id UUID, search_text VARCHAR) RETURNS SETOF mention STABLE LANGUAGE plpgsql AS
$$
Expand Down
2 changes: 2 additions & 0 deletions database/012-create-release-table.sql
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ CREATE TABLE release_version (
PRIMARY KEY (release_id, mention_id)
);

CREATE INDEX release_version_mention_id_idx ON release_version(mention_id);


CREATE FUNCTION z_delete_old_releases() RETURNS TRIGGER LANGUAGE plpgsql SECURITY DEFINER AS
$$
Expand Down
5 changes: 3 additions & 2 deletions database/014-create-organisation-table.sql
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,9 @@ CREATE TABLE organisation (
UNIQUE (slug, parent)
);

CREATE UNIQUE INDEX unique_slug_for_top_level_org_idx ON organisation (slug, (parent IS NULL)) WHERE parent IS NULL;
CREATE UNIQUE INDEX unique_name_and_parent_idx ON organisation (name, parent);
CREATE INDEX organisation_parent_idx ON organisation(parent);
CREATE UNIQUE INDEX unique_slug_for_top_level_org_idx ON organisation(slug, (parent IS NULL)) WHERE parent IS NULL;
CREATE UNIQUE INDEX unique_name_and_parent_idx ON organisation(name, parent);

CREATE FUNCTION check_cycle_organisations() RETURNS TRIGGER STABLE LANGUAGE plpgsql SECURITY DEFINER AS
$$
Expand Down
4 changes: 4 additions & 0 deletions database/015-inter-relation-tables.sql
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,8 @@ CREATE TABLE software_for_organisation (
PRIMARY KEY (software, organisation)
);

CREATE INDEX software_for_organisation_organisation_idx ON software_for_organisation(organisation);

CREATE FUNCTION sanitise_update_software_for_organisation() RETURNS TRIGGER LANGUAGE plpgsql AS
$$
BEGIN
Expand Down Expand Up @@ -143,6 +145,8 @@ CREATE TABLE project_for_organisation (
PRIMARY KEY (project, organisation, role)
);

CREATE INDEX project_for_organisation_organisation_idx ON project_for_organisation(organisation);

CREATE FUNCTION sanitise_update_project_for_organisation() RETURNS TRIGGER LANGUAGE plpgsql AS
$$
BEGIN
Expand Down
2 changes: 1 addition & 1 deletion database/100-create-api-views.sql
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,7 @@ CREATE FUNCTION releases_by_organisation() RETURNS TABLE (
release_authors VARCHAR
) LANGUAGE sql STABLE AS
$$
SELECT DISTINCT
SELECT DISTINCT ON (organisation_id, software_id, mention.id)
organisation.id AS organisation_id,
software.id AS software_id,
software.slug AS software_slug,
Expand Down
95 changes: 43 additions & 52 deletions database/104-software-views.sql
Original file line number Diff line number Diff line change
Expand Up @@ -83,18 +83,7 @@ WHERE
)
GROUP BY
reference_paper_for_software.software,
mention.id,
mention.doi,
mention.url,
mention.title,
mention.authors,
mention.publisher,
mention.publication_year,
mention.journal,
mention.page,
mention.image_url,
mention.mention_type,
mention.source
mention.id
;
$$;

Expand All @@ -116,46 +105,48 @@ CREATE FUNCTION mentions_by_software() RETURNS TABLE (
source VARCHAR
) LANGUAGE sql STABLE AS
$$
-- mentions for software
SELECT
mention_for_software.software,
mention.id,
mention.doi,
mention.url,
mention.title,
mention.authors,
mention.publisher,
mention.publication_year,
mention.journal,
mention.page,
mention.image_url,
mention.mention_type,
mention.source
FROM
mention
INNER JOIN
mention_for_software ON mention_for_software.mention = mention.id
-- will deduplicate identical entries
-- from scraped citations
UNION
-- scraped citations from reference papers
SELECT
software,
id,
doi,
url,
title,
authors,
publisher,
publication_year,
journal,
page,
image_url,
mention_type,
source
FROM
citation_by_software()
;
WITH mentions_and_citations AS (
-- mentions for software
SELECT
mention_for_software.software,
mention.id,
mention.doi,
mention.url,
mention.title,
mention.authors,
mention.publisher,
mention.publication_year,
mention.journal,
mention.page,
mention.image_url,
mention.mention_type,
mention.source
FROM
mention
INNER JOIN
mention_for_software ON mention_for_software.mention = mention.id
-- does not deduplicate identical entries, but we will do so below with DISTINCT
-- from scraped citations
UNION ALL
-- scraped citations from reference papers
SELECT
software,
id,
doi,
url,
title,
authors,
publisher,
publication_year,
journal,
page,
image_url,
mention_type,
source
FROM
citation_by_software()
)
SELECT DISTINCT ON (mentions_and_citations.software, mentions_and_citations.id) * FROM mentions_and_citations;
$$;

-- COUNT mentions per software
Expand Down
101 changes: 46 additions & 55 deletions database/106-project-views.sql
Original file line number Diff line number Diff line change
Expand Up @@ -55,18 +55,7 @@ WHERE
)
GROUP BY
output_for_project.project,
mention.id,
mention.doi,
mention.url,
mention.title,
mention.authors,
mention.publisher,
mention.publication_year,
mention.journal,
mention.page,
mention.image_url,
mention.mention_type,
mention.source
mention.id
;
$$;

Expand All @@ -90,49 +79,51 @@ CREATE FUNCTION impact_by_project() RETURNS TABLE (
note VARCHAR
) LANGUAGE sql STABLE AS
$$
-- impact for project
SELECT
impact_for_project.project,
mention.id,
mention.doi,
mention.url,
mention.title,
mention.authors,
mention.publisher,
mention.publication_year,
mention.journal,
mention.page,
mention.image_url,
mention.mention_type,
mention.source,
mention.note
FROM
mention
INNER JOIN
impact_for_project ON impact_for_project.mention = mention.id
-- will deduplicate identical entries
-- from scraped citations
UNION
-- scraped citations from reference papers
SELECT
project,
id,
doi,
url,
title,
authors,
publisher,
publication_year,
journal,
page,
image_url,
mention_type,
source,
-- scraped citations have no note prop
-- we need this prop in the edit impact section
NULL as note
FROM
citation_by_project()
WITH impact_and_citations AS (
-- impact for project
SELECT
impact_for_project.project,
mention.id,
mention.doi,
mention.url,
mention.title,
mention.authors,
mention.publisher,
mention.publication_year,
mention.journal,
mention.page,
mention.image_url,
mention.mention_type,
mention.source,
mention.note
FROM
mention
INNER JOIN
impact_for_project ON impact_for_project.mention = mention.id
-- does not deduplicate identical entries, but we will do so below with DISTINCT
-- from scraped citations
UNION ALL
-- scraped citations from reference papers
SELECT
project,
id,
doi,
url,
title,
authors,
publisher,
publication_year,
journal,
page,
image_url,
mention_type,
source,
-- scraped citations have no note prop
-- we need this prop in the edit impact section
NULL as note
FROM
citation_by_project()
) SELECT DISTINCT ON (impact_and_citations.project, impact_and_citations.id) * FROM impact_and_citations;
;
$$;

Expand Down
Loading

0 comments on commit 52bd311

Please sign in to comment.