From 4d0ccc17b0d84dff576f00e7a42c2ee6e7b0ac09 Mon Sep 17 00:00:00 2001 From: Doug Martin Date: Wed, 22 May 2024 15:56:46 -0400 Subject: [PATCH] feat: Hash usernames using SHA1 when hiding names [PT-184362179] --- query-creator/create-query/app.js | 4 +- query-creator/create-query/steps/aws.js | 46 +++++++++++++------ query-creator/create-query/steps/env-vars.js | 3 ++ .../create-query/tests/unit/generated-sql.js | 14 +++--- query-creator/env.sample.json | 3 +- query-creator/template.yaml | 6 ++- 6 files changed, 50 insertions(+), 26 deletions(-) diff --git a/query-creator/create-query/app.js b/query-creator/create-query/app.js index f5022011..d74cf428 100644 --- a/query-creator/create-query/app.js +++ b/query-creator/create-query/app.js @@ -139,8 +139,8 @@ const learnersReport = async (params, body, tokenServiceEnv, debugSQL, reportSer const doLearnerLogReporting= async () => { // generate the sql for the query const sql = narrowLearners - ? aws.generateNarrowLogSQL(queryIdsPerRunnable, authDomain, reportServiceSource) // hideNames not needed here as no learner info is output - : aws.generateLearnerLogSQL(queryIdsPerRunnable, authDomain, reportServiceSource, hideNames); + ? aws.generateNarrowLogSQL(queryIdsPerRunnable, hideNames) + : aws.generateLearnerLogSQL(queryIdsPerRunnable, hideNames); if (debugSQL) { sqlOutput.push(sql); diff --git a/query-creator/create-query/steps/aws.js b/query-creator/create-query/steps/aws.js index 6fc51e07..c18b3a1b 100644 --- a/query-creator/create-query/steps/aws.js +++ b/query-creator/create-query/steps/aws.js @@ -4,6 +4,9 @@ const request = require("./request"); const PAGE_SIZE = 2000; +const usernameHashSalt = process.env.USERNAME_HASH_SALT || "no-username-salt-provided"; +const maybeHashUsername = (hash, col, skipAs) => hash ? `to_hex(sha1(cast(('${usernameHashSalt}' || ${col}) as varbinary)))${!skipAs ? " as username" : ""}` : col + // Column format of: // { name: "column name", value: "main value on each row", header: "optional first row value"} const selectFromColumn = (column) => { @@ -276,6 +279,12 @@ exports.generateNoResourceSQL = (runnableInfo, hideNames) => { value: "arbitrary(l.student_id)" } } + if (md === "username") { + return { + name: md, + value: `arbitrary(${maybeHashUsername(hideNames, "l.username", true)})` + } + } return { name: md, value: md === "resource_url" ? "null" : `arbitrary(l.${md})` @@ -377,7 +386,7 @@ exports.generateSQL = (runnableInfo, usageReport, authDomain, sourceKey, hideNam WHERE a.escaped_url = '${escapeSingleQuote(escapedUrl)}' GROUP BY l.run_remote_endpoint)`) - learnerAndAnswerQueries.push(`learners_and_answers_${resIndex} AS ( SELECT run_remote_endpoint remote_endpoint, runnable_url as resource_url, learner_id, student_id, user_id, offering_id, ${studentNameCol}, username, school, class, class_id, permission_forms, last_run, teachers, grouped_answers_${resIndex}.kv1 kv1, grouped_answers_${resIndex}.submitted submitted, grouped_answers_${resIndex}.source_key source_key, + learnerAndAnswerQueries.push(`learners_and_answers_${resIndex} AS ( SELECT run_remote_endpoint remote_endpoint, runnable_url as resource_url, learner_id, student_id, user_id, offering_id, ${studentNameCol}, ${maybeHashUsername(hideNames, "username")}, school, class, class_id, permission_forms, last_run, teachers, grouped_answers_${resIndex}.kv1 kv1, grouped_answers_${resIndex}.submitted submitted, grouped_answers_${resIndex}.source_key source_key, IF (kv1 is null, 0, cardinality(array_intersect(map_keys(kv1),map_keys(activities_${resIndex}.questions)))) num_answers, cardinality(filter(map_values(activities_${resIndex}.questions), x->x.required=TRUE)) num_required_questions, IF (submitted is null, 0, cardinality(filter(map_values(submitted), x->x=TRUE))) num_required_answers @@ -419,7 +428,7 @@ exports.generateSQL = (runnableInfo, usageReport, authDomain, sourceKey, hideNam const uniqueUserClassQuery = `unique_user_class AS (SELECT class_id, user_id, arbitrary(student_id) as student_id, arbitrary(${hideNames ? "student_id" : "student_name"}) as student_name, - arbitrary(username) as username, + arbitrary(${maybeHashUsername(hideNames, "username", true)}) as username, arbitrary(school) as school, arbitrary(class) as class, arbitrary(permission_forms) as permission_forms, @@ -543,25 +552,31 @@ exports.generateSQL = (runnableInfo, usageReport, authDomain, sourceKey, hideNam GROUP BY l.run_remote_endpoint )` */ +const getLogCols = (hideNames) => { + return ["id", "session", "username", "application", "activity", "event", "event_value", "time", "parameters", "extras", "run_remote_endpoint", "timestamp"] + .map(col => `"log"."${col}"`) + .map(col => col === `"log"."username"` ? maybeHashUsername(hideNames, `"log"."username"`) : col) +} + +const getLearnerCols = (hideNames) => { + return ["learner_id", "run_remote_endpoint", "class_id", "runnable_url", "student_id", "class", "school", "user_id", "offering_id", "permission_forms", "username", "student_name", "teachers", "last_run", "query_id"] + .map(col => `"learner"."${col}"`) + .map(col => col === `"learner"."username"` ? maybeHashUsername(hideNames, `"learner"."username"`) : col) + .map(col => col === `"learner"."student_name"` && hideNames ? `"learner"."student_id" as student_name` : col) +} + /* Generates a very wide row including all fields from the log and learner. */ -exports.generateLearnerLogSQL = (queryIdsPerRunnable, authDomain, sourceKey, hideNames) => { +exports.generateLearnerLogSQL = (queryIdsPerRunnable, hideNames) => { const logDb = process.env.LOG_ATHENA_DB_NAME; const runnableUrls = Object.keys(queryIdsPerRunnable); const queryIds = Object.values(queryIdsPerRunnable); - const logCols = [ - "id", "session", "username", "application", "activity", "event", "event_value", "time", "parameters", "extras", "run_remote_endpoint", "timestamp" - ].map(col => `"log"."${col}"`) - - const learnerCols = [ - "learner_id", "run_remote_endpoint", "class_id", "runnable_url", "student_id", "class", "school", "user_id", "offering_id", "permission_forms", "username", "student_name", "teachers", "last_run", "query_id" - ] - .map(col => `"learner"."${col}"`) - .map(col => col === `"learner"."student_name"` && hideNames ? `"learner"."student_id" as student_name` : col).join(", ") + const logCols = getLogCols(hideNames) + const learnerCols = getLearnerCols(hideNames) - const cols = logCols.concat(learnerCols) + const cols = logCols.concat(learnerCols).join(", ") return ` -- name ${runnableUrls.join(", ")} @@ -615,17 +630,18 @@ exports.generateUserLogSQL = (usernames, activities, start_date, end_date) => { /* Generates a smaller row of event details only, no portal info. */ -exports.generateNarrowLogSQL = (queryIdsPerRunnable, authDomain, sourceKey) => { +exports.generateNarrowLogSQL = (queryIdsPerRunnable, hideNames) => { const logDb = process.env.LOG_ATHENA_DB_NAME; const runnableUrls = Object.keys(queryIdsPerRunnable); const queryIds = Object.values(queryIdsPerRunnable); + const logCols = getLogCols(hideNames).join(", "); return ` -- name ${runnableUrls.join(", ")} -- type learner event log ⎯ [qids: ${queryIds.join(", ")}] -- reportType narrow-learner-event-log - SELECT log.* + SELECT ${logCols} FROM "${logDb}"."logs_by_time" log INNER JOIN "report-service"."learners" learner ON diff --git a/query-creator/create-query/steps/env-vars.js b/query-creator/create-query/steps/env-vars.js index c5ab73c6..ca63dab4 100644 --- a/query-creator/create-query/steps/env-vars.js +++ b/query-creator/create-query/steps/env-vars.js @@ -19,4 +19,7 @@ exports.validate = () => { if (!process.env.LOG_ATHENA_DB_NAME) { missingVar("LOG_ATHENA_DB_NAME"); } + if (!process.env.USERNAME_HASH_SALT) { + missingVar("USERNAME_HASH_SALT"); + } } \ No newline at end of file diff --git a/query-creator/create-query/tests/unit/generated-sql.js b/query-creator/create-query/tests/unit/generated-sql.js index 287a31b2..fe979b15 100644 --- a/query-creator/create-query/tests/unit/generated-sql.js +++ b/query-creator/create-query/tests/unit/generated-sql.js @@ -297,7 +297,7 @@ grouped_answers_2 AS ( WHERE a.escaped_url = 'https---authoring-staging-concord-org-activities-000001' GROUP BY l.run_remote_endpoint), -learners_and_answers_1 AS ( SELECT run_remote_endpoint remote_endpoint, runnable_url as resource_url, learner_id, student_id, user_id, offering_id, student_id as student_name, username, school, class, class_id, permission_forms, last_run, teachers, grouped_answers_1.kv1 kv1, grouped_answers_1.submitted submitted, grouped_answers_1.source_key source_key, +learners_and_answers_1 AS ( SELECT run_remote_endpoint remote_endpoint, runnable_url as resource_url, learner_id, student_id, user_id, offering_id, student_id as student_name, to_hex(sha1(cast(('no-username-salt-provided' || username) as varbinary))) as username, school, class, class_id, permission_forms, last_run, teachers, grouped_answers_1.kv1 kv1, grouped_answers_1.submitted submitted, grouped_answers_1.source_key source_key, IF (kv1 is null, 0, cardinality(array_intersect(map_keys(kv1),map_keys(activities_1.questions)))) num_answers, cardinality(filter(map_values(activities_1.questions), x->x.required=TRUE)) num_required_questions, IF (submitted is null, 0, cardinality(filter(map_values(submitted), x->x=TRUE))) num_required_answers @@ -307,7 +307,7 @@ learners_and_answers_1 AS ( SELECT run_remote_endpoint remote_endpoint, runnable ON l.run_remote_endpoint = grouped_answers_1.remote_endpoint WHERE l.query_id = '123456789'), -learners_and_answers_2 AS ( SELECT run_remote_endpoint remote_endpoint, runnable_url as resource_url, learner_id, student_id, user_id, offering_id, student_id as student_name, username, school, class, class_id, permission_forms, last_run, teachers, grouped_answers_2.kv1 kv1, grouped_answers_2.submitted submitted, grouped_answers_2.source_key source_key, +learners_and_answers_2 AS ( SELECT run_remote_endpoint remote_endpoint, runnable_url as resource_url, learner_id, student_id, user_id, offering_id, student_id as student_name, to_hex(sha1(cast(('no-username-salt-provided' || username) as varbinary))) as username, school, class, class_id, permission_forms, last_run, teachers, grouped_answers_2.kv1 kv1, grouped_answers_2.submitted submitted, grouped_answers_2.source_key source_key, IF (kv1 is null, 0, cardinality(array_intersect(map_keys(kv1),map_keys(activities_2.questions)))) num_answers, cardinality(filter(map_values(activities_2.questions), x->x.required=TRUE)) num_required_questions, IF (submitted is null, 0, cardinality(filter(map_values(submitted), x->x=TRUE))) num_required_answers @@ -320,7 +320,7 @@ learners_and_answers_2 AS ( SELECT run_remote_endpoint remote_endpoint, runnable unique_user_class AS (SELECT class_id, user_id, arbitrary(student_id) as student_id, arbitrary(student_id) as student_name, - arbitrary(username) as username, + arbitrary(to_hex(sha1(cast(('no-username-salt-provided' || username) as varbinary)))) as username, arbitrary(school) as school, arbitrary(class) as class, arbitrary(permission_forms) as permission_forms, @@ -677,7 +677,7 @@ grouped_answers_2 AS ( WHERE a.escaped_url = 'https---authoring-staging-concord-org-activities-000001' GROUP BY l.run_remote_endpoint), -learners_and_answers_1 AS ( SELECT run_remote_endpoint remote_endpoint, runnable_url as resource_url, learner_id, student_id, user_id, offering_id, student_id as student_name, username, school, class, class_id, permission_forms, last_run, teachers, grouped_answers_1.kv1 kv1, grouped_answers_1.submitted submitted, grouped_answers_1.source_key source_key, +learners_and_answers_1 AS ( SELECT run_remote_endpoint remote_endpoint, runnable_url as resource_url, learner_id, student_id, user_id, offering_id, student_id as student_name, to_hex(sha1(cast(('no-username-salt-provided' || username) as varbinary))) as username, school, class, class_id, permission_forms, last_run, teachers, grouped_answers_1.kv1 kv1, grouped_answers_1.submitted submitted, grouped_answers_1.source_key source_key, IF (kv1 is null, 0, cardinality(array_intersect(map_keys(kv1),map_keys(activities_1.questions)))) num_answers, cardinality(filter(map_values(activities_1.questions), x->x.required=TRUE)) num_required_questions, IF (submitted is null, 0, cardinality(filter(map_values(submitted), x->x=TRUE))) num_required_answers @@ -687,7 +687,7 @@ learners_and_answers_1 AS ( SELECT run_remote_endpoint remote_endpoint, runnable ON l.run_remote_endpoint = grouped_answers_1.remote_endpoint WHERE l.query_id = '123456789'), -learners_and_answers_2 AS ( SELECT run_remote_endpoint remote_endpoint, runnable_url as resource_url, learner_id, student_id, user_id, offering_id, student_id as student_name, username, school, class, class_id, permission_forms, last_run, teachers, grouped_answers_2.kv1 kv1, grouped_answers_2.submitted submitted, grouped_answers_2.source_key source_key, +learners_and_answers_2 AS ( SELECT run_remote_endpoint remote_endpoint, runnable_url as resource_url, learner_id, student_id, user_id, offering_id, student_id as student_name, to_hex(sha1(cast(('no-username-salt-provided' || username) as varbinary))) as username, school, class, class_id, permission_forms, last_run, teachers, grouped_answers_2.kv1 kv1, grouped_answers_2.submitted submitted, grouped_answers_2.source_key source_key, IF (kv1 is null, 0, cardinality(array_intersect(map_keys(kv1),map_keys(activities_2.questions)))) num_answers, cardinality(filter(map_values(activities_2.questions), x->x.required=TRUE)) num_required_questions, IF (submitted is null, 0, cardinality(filter(map_values(submitted), x->x=TRUE))) num_required_answers @@ -700,7 +700,7 @@ learners_and_answers_2 AS ( SELECT run_remote_endpoint remote_endpoint, runnable unique_user_class AS (SELECT class_id, user_id, arbitrary(student_id) as student_id, arbitrary(student_id) as student_name, - arbitrary(username) as username, + arbitrary(to_hex(sha1(cast(('no-username-salt-provided' || username) as varbinary)))) as username, arbitrary(school) as school, arbitrary(class) as class, arbitrary(permission_forms) as permission_forms, @@ -812,7 +812,7 @@ array_join(transform(teachers, teacher -> teacher.district), ',') AS teacher_dis array_join(transform(teachers, teacher -> teacher.state), ',') AS teacher_states, array_join(transform(teachers, teacher -> teacher.email), ',') AS teacher_emails FROM -( SELECT l.run_remote_endpoint remote_endpoint, arbitrary(l.student_id) AS student_id, arbitrary(l.user_id) AS user_id, arbitrary(l.student_id) AS student_name, arbitrary(l.username) AS username, arbitrary(l.school) AS school, arbitrary(l.class) AS class, arbitrary(l.class_id) AS class_id, arbitrary(l.learner_id) AS learner_id, null AS resource_url, arbitrary(l.last_run) AS last_run, arbitrary(l.permission_forms) AS permission_forms, arbitrary(l.teachers) teachers +( SELECT l.run_remote_endpoint remote_endpoint, arbitrary(l.student_id) AS student_id, arbitrary(l.user_id) AS user_id, arbitrary(l.student_id) AS student_name, arbitrary(to_hex(sha1(cast(('no-username-salt-provided' || l.username) as varbinary)))) AS username, arbitrary(l.school) AS school, arbitrary(l.class) AS class, arbitrary(l.class_id) AS class_id, arbitrary(l.learner_id) AS learner_id, null AS resource_url, arbitrary(l.last_run) AS last_run, arbitrary(l.permission_forms) AS permission_forms, arbitrary(l.teachers) teachers FROM "report-service"."learners" l WHERE l.query_id IN ('123456789') GROUP BY l.run_remote_endpoint ) diff --git a/query-creator/env.sample.json b/query-creator/env.sample.json index 57cb6e83..b319b590 100644 --- a/query-creator/env.sample.json +++ b/query-creator/env.sample.json @@ -6,6 +6,7 @@ "PORTAL_REPORT_URL": "https://portal-report.concord.org/branch/master/index.html", "FIREBASE_APP": "report-service-dev", "LOG_ATHENA_DB_NAME": "log_ingester_qa", - "RESEARCHER_REPORTS_URL": "https://researcher-reports.concord.org/branch/master/" + "RESEARCHER_REPORTS_URL": "https://researcher-reports.concord.org/branch/master/", + "USERNAME_HASH_SALT": "replace-this-with-random-salt-bytes" } } diff --git a/query-creator/template.yaml b/query-creator/template.yaml index 9ae336d4..56cedb0a 100644 --- a/query-creator/template.yaml +++ b/query-creator/template.yaml @@ -25,10 +25,13 @@ Parameters: Description: Firebase app name PortalReportUrl: Type: String - Description: Url to the Portal Report where reseachers can load learner's models + Description: Url to the Portal Report where researchers can load learner's models LogAthenaDBName: Type: String Description: The Athena Database Name + UsernameHashSalt: + Type: String + Description: Salt used to hash usernames. This is available in 1Password in the "Query Creator Username Hash Salts (staging and production)" note. # More info about Globals: https://github.com/awslabs/serverless-application-model/blob/master/docs/globals.rst Globals: @@ -43,6 +46,7 @@ Globals: FIREBASE_APP: !Ref FirebaseApp PORTAL_REPORT_URL: !Ref PortalReportUrl LOG_ATHENA_DB_NAME: !Ref LogAthenaDBName + USERNAME_HASH_SALT: !Ref UsernameHashSalt Resources: CreateQueryFunction: