Skip to content

Commit

Permalink
Fix ever growing db tables
Browse files Browse the repository at this point in the history
  • Loading branch information
dennis-tra committed Sep 26, 2021
1 parent 6d60e93 commit 5476677
Show file tree
Hide file tree
Showing 42 changed files with 12,838 additions and 4,960 deletions.
1 change: 0 additions & 1 deletion migrations/000033_update_raw_visits_trigger.up.sql
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ BEGIN;
DROP TRIGGER insert_raw_visit ON raw_visits;
DROP FUNCTION normalize_raw_visit();


CREATE FUNCTION normalize_raw_visit() RETURNS TRIGGER AS
$normalize_raw_visit$
DECLARE
Expand Down
14 changes: 14 additions & 0 deletions migrations/000037_create_protocols_set_table.down.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
-- Down migration for 000037: detach peers and visits from the protocol sets
-- and remove the protocol tables. All statements run in one transaction.
BEGIN;

-- peers: drop the foreign key first, then the column it was defined on.
ALTER TABLE peers
    DROP CONSTRAINT fk_peers_protocols_set_id,
    DROP COLUMN protocols_set_id;

-- visits: same two steps.
ALTER TABLE visits
    DROP CONSTRAINT fk_visits_protocols_set_id,
    DROP COLUMN protocols_set_id;

-- Remove the set table and the protocol lookup table.
DROP TABLE protocols_sets, protocols;

COMMIT;
119 changes: 119 additions & 0 deletions migrations/000037_create_protocols_set_table.up.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
-- Up migration 000037. Runs as a single transaction.
BEGIN;

-- intarray provides the integer-array functions and operator classes that
-- the statements further down in this migration rely on.
CREATE EXTENSION IF NOT EXISTS intarray;

-- One row per distinct protocol the crawler came across.
CREATE TABLE protocols
(
    -- Surrogate key of the protocol
    id         SERIAL PRIMARY KEY,
    -- Last update time; used to retrieve the ID after an upsert operation
    updated_at TIMESTAMPTZ   NOT NULL,
    -- Creation time
    created_at TIMESTAMPTZ   NOT NULL,
    -- The protocol string itself
    protocol   VARCHAR(1000) NOT NULL,

    -- Every protocol string may be stored only once
    CONSTRAINT uq_protocols_protocol UNIQUE (protocol)
);

-- Copy every row of the properties table whose property is 'protocol'.
INSERT INTO protocols (protocol, updated_at, created_at)
SELECT prop.value,
       prop.updated_at,
       prop.created_at
FROM properties AS prop
WHERE prop.property = 'protocol';

-- The set of protocols a particular peer supports doesn't change very often
-- between crawls, which makes the visits_x_properties table blow up quickly.
-- This table stores each distinct set of protocols once; every visit is then
-- linked to just one of these sets.
CREATE TABLE protocols_sets
(
    -- Surrogate key of the protocol set
    id           SERIAL PRIMARY KEY,
    -- IDs of the protocols in this set
    protocol_ids INT[] NOT NULL,

    -- Reject a second row that holds an identical array
    EXCLUDE USING GIST (protocol_ids WITH =)
);

-- GIN index for efficient lookups of particular protocol sets.
CREATE INDEX idx_protocols_sets_protocol_ids
    ON protocols_sets USING GIN (protocol_ids);

-- Scratch table that lives until COMMIT: for every visit, the sorted and
-- de-duplicated array of protocol IDs recorded for it.
CREATE TEMP TABLE visits_agg_protocols ON COMMIT DROP AS (
    SELECT visit_id,
           uniq(sort(array_agg(prot.id))) AS protocols_set
    FROM visits_x_properties AS vxp
             INNER JOIN properties AS p ON p.id = vxp.property_id
             INNER JOIN protocols AS prot ON prot.protocol = p.value
    WHERE p.property = 'protocol'
    GROUP BY visit_id
);

-- Scratch table with each distinct protocol set found across the visits.
CREATE TEMP TABLE distinct_visits_agg_protocols ON COMMIT DROP AS (
    SELECT DISTINCT protocols_set
    FROM visits_agg_protocols
);

-- Persist every distinct set as one row of protocols_sets.
INSERT INTO protocols_sets (protocol_ids)
SELECT d.protocols_set
FROM distinct_visits_agg_protocols AS d;

-- Nullable column that links a visit to its set of protocols.
ALTER TABLE visits
    ADD COLUMN protocols_set_id INT;

-- Point each visit at the stored set whose array equals the array that was
-- aggregated for that visit.
UPDATE visits
SET protocols_set_id = ps.id
FROM visits_agg_protocols AS ag
         INNER JOIN protocols_sets AS ps ON ps.protocol_ids = ag.protocols_set
WHERE visits.id = ag.visit_id;

-- Enforce the link with a foreign key.
ALTER TABLE visits
    ADD CONSTRAINT fk_visits_protocols_set_id
        FOREIGN KEY (protocols_set_id)
            REFERENCES protocols_sets (id)
            ON DELETE NO ACTION;


-- Scratch table that lives until COMMIT: for every peer, the sorted and
-- de-duplicated array of protocol IDs recorded for it in peers_x_properties.
CREATE TEMP TABLE peers_agg_protocols ON COMMIT DROP AS (
    SELECT peer_id,
           uniq(sort(array_agg(prot.id))) AS protocols_set
    FROM peers_x_properties AS pxp
             INNER JOIN properties AS p ON p.id = pxp.property_id
             INNER JOIN protocols AS prot ON prot.protocol = p.value
    WHERE p.property = 'protocol'
    GROUP BY peer_id
);

-- Up to this point protocols_sets only contains sets that were observed on
-- visits. Store every peer set that is still missing; otherwise the join in
-- the UPDATE below finds no match for such a peer and its protocols_set_id
-- silently stays NULL. DISTINCT plus NOT EXISTS keeps the exclusion
-- constraint on protocol_ids satisfied.
INSERT INTO protocols_sets (protocol_ids)
SELECT DISTINCT ag.protocols_set
FROM peers_agg_protocols AS ag
WHERE NOT EXISTS (
    SELECT 1
    FROM protocols_sets AS ps
    WHERE ps.protocol_ids = ag.protocols_set
);

-- Nullable column on the peers table that links a peer to its set of protocols.
ALTER TABLE peers
    ADD COLUMN protocols_set_id INT;

-- Point each peer at the stored set whose array equals the array that was
-- aggregated for that peer.
WITH peers_x_protocols_sets AS (
    SELECT ag.peer_id AS peer_id,
           ps.id      AS protocols_set_id
    FROM protocols_sets AS ps
             INNER JOIN peers_agg_protocols AS ag ON ag.protocols_set = ps.protocol_ids
)
UPDATE peers
SET protocols_set_id = pxps.protocols_set_id
FROM peers_x_protocols_sets AS pxps
WHERE pxps.peer_id = peers.id;

-- Enforce the link with a foreign key.
ALTER TABLE peers
    ADD CONSTRAINT fk_peers_protocols_set_id
        FOREIGN KEY (protocols_set_id)
            REFERENCES protocols_sets (id)
            ON DELETE NO ACTION;

-- End the transaction
COMMIT;
13 changes: 13 additions & 0 deletions migrations/000038_create_agent_versions_table.down.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
-- Down migration for 000038: detach peers and visits from agent_versions and
-- remove that table. All statements run in one transaction.
BEGIN;

-- peers: drop the foreign key first, then the column it was defined on.
ALTER TABLE peers
    DROP CONSTRAINT fk_peers_agent_version_id,
    DROP COLUMN agent_version_id;

-- visits: same two steps.
ALTER TABLE visits
    DROP CONSTRAINT fk_visits_agent_version_id,
    DROP COLUMN agent_version_id;

-- Remove the agent version lookup table.
DROP TABLE agent_versions;

COMMIT;
70 changes: 70 additions & 0 deletions migrations/000038_create_agent_versions_table.up.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
-- Up migration 000038. Runs as a single transaction.
BEGIN;

-- One row per distinct agent version string.
CREATE TABLE agent_versions
(
    -- Surrogate key of the agent version
    id            SERIAL PRIMARY KEY,
    -- Last update time; used to retrieve the ID after an upsert operation
    updated_at    TIMESTAMPTZ   NOT NULL,
    -- Creation time
    created_at    TIMESTAMPTZ   NOT NULL,
    -- The agent version string itself
    agent_version VARCHAR(1000) NOT NULL,

    -- Every agent version string may be stored only once
    CONSTRAINT uq_agent_versions_agent_version UNIQUE (agent_version)
);

-- Copy every row of the properties table whose property is 'agent_version'.
INSERT INTO agent_versions (agent_version, updated_at, created_at)
SELECT prop.value,
       prop.updated_at,
       prop.created_at
FROM properties AS prop
WHERE prop.property = 'agent_version';

-- Nullable column that links a visit to its agent version.
ALTER TABLE visits
    ADD COLUMN agent_version_id INT;

-- Resolve each visit's 'agent_version' property to the matching
-- agent_versions row and store that row's ID on the visit.
WITH visit_agent_versions AS (
    SELECT visit_id,
           av.id AS agent_version_id
    FROM visits_x_properties AS vxp
             INNER JOIN properties AS p ON p.id = vxp.property_id
             INNER JOIN agent_versions AS av ON av.agent_version = p.value
    WHERE p.property = 'agent_version'
)
UPDATE visits
SET agent_version_id = vav.agent_version_id
FROM visit_agent_versions AS vav
WHERE visits.id = vav.visit_id;

-- Enforce the link with a foreign key.
ALTER TABLE visits
    ADD CONSTRAINT fk_visits_agent_version_id
        FOREIGN KEY (agent_version_id)
            REFERENCES agent_versions (id)
            ON DELETE NO ACTION;


-- Nullable column on the peers table that links a peer to its agent version.
ALTER TABLE peers
    ADD COLUMN agent_version_id INT;

-- Resolve each peer's 'agent_version' property to the matching
-- agent_versions row and store that row's ID on the peer.
WITH peer_agent_versions AS (
    SELECT peer_id,
           av.id AS agent_version_id
    FROM peers_x_properties AS pxp
             INNER JOIN properties AS p ON p.id = pxp.property_id
             INNER JOIN agent_versions AS av ON av.agent_version = p.value
    WHERE p.property = 'agent_version'
)
UPDATE peers
SET agent_version_id = pav.agent_version_id
FROM peer_agent_versions AS pav
WHERE peers.id = pav.peer_id;

-- Enforce the link with a foreign key.
ALTER TABLE peers
    ADD CONSTRAINT fk_peers_agent_version_id
        FOREIGN KEY (agent_version_id)
            REFERENCES agent_versions (id)
            ON DELETE NO ACTION;

-- End the transaction
COMMIT;
Empty file.
60 changes: 60 additions & 0 deletions migrations/000039_migrate_crawl_properties.up.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
-- Up migration 000039. Runs as a single transaction.
BEGIN;

-- Keep the existing table under a temporary name; its rows are copied into
-- the new layout later in this migration.
ALTER TABLE crawl_properties
    RENAME TO crawl_properties_old;

-- New layout: instead of pointing at a generic property row, a row references
-- a protocol, references an agent version, or carries a dial error directly.
-- The data copy later in this migration fills exactly one of those three per row.
CREATE TABLE crawl_properties
(
    -- Surrogate key of the row
    id               SERIAL PRIMARY KEY,
    -- The crawl this count belongs to. Deliberately a plain INT and not SERIAL:
    -- the value is a foreign key, so it must always be supplied by the writer
    -- and must not fall back to a sequence default.
    crawl_id         INT         NOT NULL,
    -- Set when the row counts a protocol
    protocol_id      INT,
    -- Set when the row counts an agent version
    agent_version_id INT,
    -- Set when the row counts a dial error (dial_error type is defined outside this migration)
    error            dial_error,
    -- The counted number of occurrences
    count            INT         NOT NULL,
    created_at       TIMESTAMPTZ NOT NULL,
    updated_at       TIMESTAMPTZ NOT NULL
);

-- Rows of a crawl are removed together with the crawl.
ALTER TABLE crawl_properties
    ADD CONSTRAINT fk_crawl_properties_crawl_id
        FOREIGN KEY (crawl_id)
            REFERENCES crawls (id)
            ON DELETE CASCADE;

ALTER TABLE crawl_properties
    ADD CONSTRAINT fk_crawl_properties_protocol_id
        FOREIGN KEY (protocol_id)
            REFERENCES protocols (id)
            ON DELETE NO ACTION;

ALTER TABLE crawl_properties
    ADD CONSTRAINT fk_crawl_properties_agent_version_id
        FOREIGN KEY (agent_version_id)
            REFERENCES agent_versions (id)
            ON DELETE NO ACTION;

-- Copy the 'protocol' counts, resolving each property value to its protocols row.
INSERT INTO crawl_properties (crawl_id, protocol_id, count, created_at, updated_at)
SELECT cp.crawl_id,
       prot.id AS protocol_id,
       cp.count,
       cp.created_at,
       cp.updated_at
FROM crawl_properties_old AS cp
         INNER JOIN properties AS p ON p.id = cp.property_id
         INNER JOIN protocols AS prot ON prot.protocol = p.value
WHERE p.property = 'protocol';

-- Copy the 'agent_version' counts, resolving each property value to its agent_versions row.
INSERT INTO crawl_properties (crawl_id, agent_version_id, count, created_at, updated_at)
SELECT cp.crawl_id,
       av.id AS agent_version_id,
       cp.count,
       cp.created_at,
       cp.updated_at
FROM crawl_properties_old AS cp
         INNER JOIN properties AS p ON p.id = cp.property_id
         INNER JOIN agent_versions AS av ON av.agent_version = p.value
WHERE p.property = 'agent_version';

-- Copy the 'error' counts; the property value is cast to the dial_error type.
INSERT INTO crawl_properties (crawl_id, error, count, created_at, updated_at)
SELECT cp.crawl_id,
       CAST(p.value AS dial_error),
       cp.count,
       cp.created_at,
       cp.updated_at
FROM crawl_properties_old AS cp
         INNER JOIN properties AS p ON p.id = cp.property_id
WHERE p.property = 'error';

-- The renamed original table is no longer needed once its rows are copied.
DROP TABLE crawl_properties_old;

COMMIT;
1 change: 1 addition & 0 deletions migrations/000040_drop_properties_table.down.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
-- Intentionally contains no statements: the tables dropped by the up migration are not recreated.
9 changes: 9 additions & 0 deletions migrations/000040_drop_properties_table.up.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
-- Up migration 000040: remove the generic properties table together with the
-- two tables that link visits and peers to it. Runs as a single transaction.
BEGIN;

DROP TABLE visits_x_properties, peers_x_properties, properties;

COMMIT;
10 changes: 10 additions & 0 deletions migrations/000041_create_multi_addresses_set_table.down.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
-- Down migration for 000041: detach visits from the multi address sets and
-- remove the set table. All statements run in one transaction.
BEGIN;

-- visits: drop the foreign key first, then the column it was defined on.
ALTER TABLE visits
    DROP CONSTRAINT fk_visits_multi_addresses_set_id,
    DROP COLUMN multi_addresses_set_id;

DROP TABLE multi_addresses_sets;

COMMIT;
62 changes: 62 additions & 0 deletions migrations/000041_create_multi_addresses_set_table.up.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
-- Up migration 000041. Runs as a single transaction.
BEGIN;

-- Each distinct set of multi addresses is stored once in this table.
CREATE TABLE multi_addresses_sets
(
    -- Surrogate key of the multi address set
    id                SERIAL PRIMARY KEY,
    created_at        TIMESTAMPTZ NOT NULL,
    updated_at        TIMESTAMPTZ NOT NULL,
    -- IDs of the multi addresses in this set
    multi_address_ids INT[]       NOT NULL,

    -- Reject a second row that holds an identical array.
    -- NOTE(review): gist__intbig_ops presumably comes from the intarray
    -- extension enabled in migration 000037 - confirm it is installed.
    EXCLUDE USING GIST (multi_address_ids gist__intbig_ops WITH =)
);

-- Scratch table that lives until COMMIT: for every visit, the sorted and
-- de-duplicated array of its multi address IDs.
CREATE TEMP TABLE visits_agg_multi_addresses ON COMMIT DROP AS (
    SELECT visit_id,
           uniq(sort(array_agg(multi_address_id))) AS multi_address_ids
    FROM visits_x_multi_addresses
    GROUP BY visit_id
);

-- Scratch table with each distinct multi address set found across the visits.
CREATE TEMP TABLE distinct_visits_agg_multi_addresses ON COMMIT DROP AS (
    SELECT DISTINCT multi_address_ids
    FROM visits_agg_multi_addresses
);

-- Persist every distinct set as one row of multi_addresses_sets.
INSERT INTO multi_addresses_sets (multi_address_ids, updated_at, created_at)
SELECT d.multi_address_ids,
       NOW(),
       NOW()
FROM distinct_visits_agg_multi_addresses AS d;


-- Nullable column that links a visit to its set of multi addresses.
ALTER TABLE visits
    ADD COLUMN multi_addresses_set_id INT;

-- Point each visit at the stored set whose array equals the array that was
-- aggregated for that visit.
UPDATE visits
SET multi_addresses_set_id = mas.id
FROM visits_agg_multi_addresses AS ag
         INNER JOIN multi_addresses_sets AS mas ON mas.multi_address_ids = ag.multi_address_ids
WHERE visits.id = ag.visit_id;

-- Enforce the link with a foreign key.
ALTER TABLE visits
    ADD CONSTRAINT fk_visits_multi_addresses_set_id
        FOREIGN KEY (multi_addresses_set_id)
            REFERENCES multi_addresses_sets (id)
            ON DELETE NO ACTION;

-- GIN index on the array column of the set table.
CREATE INDEX idx_multi_addresses_sets_multi_address_ids
    ON multi_addresses_sets USING GIN (multi_address_ids);

COMMIT;
Empty file.
7 changes: 7 additions & 0 deletions migrations/000042_drop_multi_address_x_tables.up.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
-- Up migration 000042: remove the table that holds (visit_id, multi_address_id) pairs.
-- Migration 000041 aggregated those pairs into multi_addresses_sets and linked
-- each visit to a set via visits.multi_addresses_set_id.
-- Begin the transaction
BEGIN;

-- NOTE(review): the matching down migration appears to be an empty file, so
-- this drop is not reverted on rollback - confirm that is intended.
DROP TABLE visits_x_multi_addresses;

-- End the transaction
COMMIT;
Loading

0 comments on commit 5476677

Please sign in to comment.