Skip to content

Commit

Permalink
Fixes #9, set schema fix
Browse files: browse the repository at this point in the history
  • Loading branch information
JolanThomassin committed Dec 13, 2023
1 parent: afffd40; commit: 42965b0
Show file tree
Hide file tree
Showing 9 changed files with 73 additions and 97 deletions.
2 changes: 1 addition & 1 deletion sql/2023-11-21-clone-schema.sql
Original file line number Diff line number Diff line change
@@ -1 +1 @@
SELECT clone_schema('louis_v005', '"louis_0.0.6"', TRUE);
SELECT clone_schema('louis_v005', '"louis_0.0.6"', TRUE);
2 changes: 1 addition & 1 deletion sql/2023-11-23-create-chunk-score-table.sql
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@ CREATE TABLE chunk_score (
id uuid default uuid_generate_v4 (),
score FLOAT,
score_type VARCHAR(50)
);
);
4 changes: 2 additions & 2 deletions sql/2023-11-23-fill-chunk-score-table.sql
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ BEGIN
SELECT
COUNT(*) INTO tr_count
FROM
"louis_0.0.6".chunk
chunk
WHERE
id = chunk_id
AND html_content LIKE '%<tr>%';
Expand All @@ -28,4 +28,4 @@ RETURN tr_count;

END;

$ $ LANGUAGE plpgsql;
$ $ LANGUAGE plpgsql;
16 changes: 8 additions & 8 deletions sql/2023-11-28-chunk-didactic-score.sql
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
-- Set the search path to the louis_006 schema
-- Set the search path to the louis_0.0.6 schema
SET
search_path TO "louis_0.0.6";

Expand All @@ -25,9 +25,9 @@ SELECT
) AS tr_proportion,
'didactic' AS score_type
FROM
"louis_0.0.6".chunk ch
INNER JOIN "louis_0.0.6".html_content_to_chunk hctc ON ch.id = hctc.chunk_id
INNER JOIN "louis_0.0.6".html_content hc ON hctc.md5hash = hc.md5hash
chunk ch
INNER JOIN html_content_to_chunk hctc ON ch.id = hctc.chunk_id
INNER JOIN html_content hc ON hctc.md5hash = hc.md5hash
CROSS JOIN (
SELECT
MIN(
Expand All @@ -37,9 +37,9 @@ FROM
LENGTH(content) - LENGTH(REPLACE(content, '<tr>', ''))
) / LENGTH('<tr>') AS max_val
FROM
"louis_0.0.6".chunk ch
INNER JOIN "louis_0.0.6".html_content_to_chunk hctc ON ch.id = hctc.chunk_id
INNER JOIN "louis_0.0.6".html_content hc ON hctc.md5hash = hc.md5hash
chunk ch
INNER JOIN html_content_to_chunk hctc ON ch.id = hctc.chunk_id
INNER JOIN html_content hc ON hctc.md5hash = hc.md5hash
) AS length_values
ORDER BY
tr_proportion DESC;
tr_proportion DESC;
79 changes: 52 additions & 27 deletions sql/2023-11-28-create-histogram.sql
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
-- Set the search path to the louis_006 schema
SET search_path TO "louis_0.0.6";

-- Set the search path to the louis_0.0.6 schema
SET
search_path TO "louis_0.0.6";

WITH tr_counts AS (
SELECT
c.id,
(LENGTH(h.content) - LENGTH(REPLACE(h.content, '<tr>', ''))) / LENGTH('<tr>') AS tr_count
(
LENGTH(h.content) - LENGTH(REPLACE(h.content, '<tr>', ''))
) / LENGTH('<tr>') AS tr_count
FROM
crawl c
INNER JOIN html_content h ON c.md5hash = h.md5hash
Expand All @@ -23,28 +26,50 @@ histogram AS (
bucket_min,
bucket_max,
COUNT(t.score) AS freq,
REPEAT('',
(COUNT(t.score)::FLOAT / GREATEST(MAX(COUNT(t.score)) OVER(), 1) * 30)::INT
REPEAT(
'',
(
COUNT(t.score) :: FLOAT / GREATEST(MAX(COUNT(t.score)) OVER(), 1) * 30
) :: INT
) AS bar
FROM (
SELECT
CAST(generate_series(0, 9) AS FLOAT) / 10.0 AS bucket_min,
CAST(generate_series(1, 10) AS FLOAT) / 10.0 AS bucket_max
) AS buckets
LEFT JOIN (
SELECT
id,
tr_count,
CASE
WHEN max_tr_count = min_tr_count THEN 0.0
ELSE 1.0 - ((tr_count - min_tr_count)::FLOAT / (max_tr_count - min_tr_count))
END AS score
FROM
tr_stats
) t ON t.score >= buckets.bucket_min AND
(t.score < buckets.bucket_max OR
(buckets.bucket_max = 1.0 AND t.score <= buckets.bucket_max))
GROUP BY buckets.bucket_min, buckets.bucket_max
ORDER BY buckets.bucket_min, buckets.bucket_max
FROM
(
SELECT
CAST(generate_series(0, 9) AS FLOAT) / 10.0 AS bucket_min,
CAST(generate_series(1, 10) AS FLOAT) / 10.0 AS bucket_max
) AS buckets
LEFT JOIN (
SELECT
id,
tr_count,
CASE
WHEN max_tr_count = min_tr_count THEN 0.0
ELSE 1.0 - (
(tr_count - min_tr_count) :: FLOAT / (max_tr_count - min_tr_count)
)
END AS score
FROM
tr_stats
) t ON t.score >= buckets.bucket_min
AND (
t.score < buckets.bucket_max
OR (
buckets.bucket_max = 1.0
AND t.score <= buckets.bucket_max
)
)
GROUP BY
buckets.bucket_min,
buckets.bucket_max
ORDER BY
buckets.bucket_min,
buckets.bucket_max
)
SELECT bucket_min, bucket_max, freq, bar FROM histogram;
SELECT
bucket_min,
bucket_max,
freq,
bar
FROM
histogram;

2 changes: 1 addition & 1 deletion sql/2023-11-28-print-schema-table.sql
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
SELECT table_name, column_name, data_type
FROM information_schema.columns
WHERE table_schema = 'louis_006';
WHERE table_schema = 'louis_0.0.6';
4 changes: 4 additions & 0 deletions sql/2023-11-30-tr-count.sql
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
-- Set the search path to the louis_0.0.6 schema
SET
search_path TO "louis_0.0.6";

WITH tr_counts AS (
SELECT
id,
Expand Down
55 changes: 1 addition & 54 deletions sql/2023-12-11-score-by-element.sql
Original file line number Diff line number Diff line change
@@ -1,57 +1,4 @@
-- Set the search path to the louis_006 schema
SET
search_path TO "louis_0.0.6";

WITH tr_counts AS (
SELECT
id,
(LENGTH(h.content) - LENGTH(REPLACE(h.content, '<tr>', ''))) / LENGTH('<tr>') AS tr_count
FROM
crawl c
INNER JOIN html_content h ON c.md5hash = h.md5hash
),
tr_stats AS (
SELECT
id,
tr_count,
MAX(tr_count) OVER () AS max_tr_count,
MIN(tr_count) OVER () AS min_tr_count
FROM
tr_counts
),
histogram AS (
SELECT
bucket_min,
bucket_max,
COUNT(t.score) AS freq,
REPEAT('',
(COUNT(t.score)::FLOAT / GREATEST(MAX(COUNT(t.score)) OVER(), 1) * 30)::INT
) AS bar
FROM (
SELECT
CAST(generate_series(0, 9) AS FLOAT) / 10.0 AS bucket_min,
CAST(generate_series(1, 10) AS FLOAT) / 10.0 AS bucket_max
) AS buckets
LEFT JOIN (
SELECT
id,
tr_count,
CASE
WHEN max_tr_count = min_tr_count THEN 0.0
ELSE 1.0 - ((tr_count - min_tr_count)::FLOAT / (max_tr_count - min_tr_count))
END AS score
FROM
tr_stats
) t ON t.score >= buckets.bucket_min AND
(t.score < buckets.bucket_max OR
(buckets.bucket_max = 1.0 AND t.score <= buckets.bucket_max))
GROUP BY buckets.bucket_min, buckets.bucket_max
ORDER BY buckets.bucket_min, buckets.bucket_max
)
SELECT bucket_min, bucket_max, freq, bar FROM histogram;


-- Set the search path to the louis_006 schema
-- Set the search path to the louis_0.0.6 schema
SET
search_path TO "louis_0.0.6";

Expand Down
6 changes: 3 additions & 3 deletions sql/2023-12-12-create-tabletest.sql
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
-- Set the search path to the louis_006 schema
-- Set the search path to the louis_0.0.6 schema
SET
search_path TO "louis_0.0.6";

CREATE TABLE (
content VARCHAR(255) -- Change 255 to the desired length of the string
);

TRUNCATE TABLE "louis_0.0.6".tabletest;
TRUNCATE TABLE tabletest;

CREATE TABLE "louis_0.0.6".tabletest (
CREATE TABLE tabletest (
id SERIAL PRIMARY KEY,
html_content TEXT
);
Expand Down

0 comments on commit 42965b0

Please sign in to comment.