Skip to content

Commit

Permalink
feat: Hash usernames using SHA1 when hiding names [PT-184362179]
Browse files Browse the repository at this point in the history
  • Loading branch information
dougmartin committed May 23, 2024
1 parent c01fcf3 commit 4d0ccc1
Show file tree
Hide file tree
Showing 6 changed files with 50 additions and 26 deletions.
4 changes: 2 additions & 2 deletions query-creator/create-query/app.js
Original file line number Diff line number Diff line change
Expand Up @@ -139,8 +139,8 @@ const learnersReport = async (params, body, tokenServiceEnv, debugSQL, reportSer
const doLearnerLogReporting= async () => {
// generate the sql for the query
const sql = narrowLearners
? aws.generateNarrowLogSQL(queryIdsPerRunnable, authDomain, reportServiceSource) // hideNames not needed here as no learner info is output
: aws.generateLearnerLogSQL(queryIdsPerRunnable, authDomain, reportServiceSource, hideNames);
? aws.generateNarrowLogSQL(queryIdsPerRunnable, hideNames)
: aws.generateLearnerLogSQL(queryIdsPerRunnable, hideNames);

if (debugSQL) {
sqlOutput.push(sql);
Expand Down
46 changes: 31 additions & 15 deletions query-creator/create-query/steps/aws.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@ const request = require("./request");

const PAGE_SIZE = 2000;

// Salt prepended to usernames before hashing. Read once from the
// USERNAME_HASH_SALT environment variable; a fixed placeholder is used when
// the variable is unset or empty.
const usernameHashSalt = process.env.USERNAME_HASH_SALT || "no-username-salt-provided";

/**
 * Builds the SQL fragment used to select a username column, optionally
 * replacing it with a salted SHA1 hex digest.
 *
 * @param {boolean} hash - When falsy, `col` is returned unchanged.
 * @param {string} col - SQL column reference to select or hash (e.g. `l.username`).
 * @param {boolean} [skipAs] - When truthy, omit the trailing ` as username`
 *   alias (for use inside another expression such as `arbitrary(...)`).
 * @param {string} [salt] - Salt to embed in the SQL; defaults to the module-level salt.
 * @returns {string} Either `col` or the hashing expression, with or without the alias.
 */
const maybeHashUsername = (hash, col, skipAs, salt = usernameHashSalt) => {
  if (!hash) {
    return col;
  }
  // The salt is embedded inside a single-quoted SQL string literal, so any
  // single quote it contains must be doubled or it would terminate the literal.
  const saltLiteral = String(salt).replace(/'/g, "''");
  const hashed = `to_hex(sha1(cast(('${saltLiteral}' || ${col}) as varbinary)))`;
  return skipAs ? hashed : `${hashed} as username`;
};

// Column format of:
// { name: "column name", value: "main value on each row", header: "optional first row value"}
const selectFromColumn = (column) => {
Expand Down Expand Up @@ -276,6 +279,12 @@ exports.generateNoResourceSQL = (runnableInfo, hideNames) => {
value: "arbitrary(l.student_id)"
}
}
if (md === "username") {
return {
name: md,
value: `arbitrary(${maybeHashUsername(hideNames, "l.username", true)})`
}
}
return {
name: md,
value: md === "resource_url" ? "null" : `arbitrary(l.${md})`
Expand Down Expand Up @@ -377,7 +386,7 @@ exports.generateSQL = (runnableInfo, usageReport, authDomain, sourceKey, hideNam
WHERE a.escaped_url = '${escapeSingleQuote(escapedUrl)}'
GROUP BY l.run_remote_endpoint)`)

learnerAndAnswerQueries.push(`learners_and_answers_${resIndex} AS ( SELECT run_remote_endpoint remote_endpoint, runnable_url as resource_url, learner_id, student_id, user_id, offering_id, ${studentNameCol}, username, school, class, class_id, permission_forms, last_run, teachers, grouped_answers_${resIndex}.kv1 kv1, grouped_answers_${resIndex}.submitted submitted, grouped_answers_${resIndex}.source_key source_key,
learnerAndAnswerQueries.push(`learners_and_answers_${resIndex} AS ( SELECT run_remote_endpoint remote_endpoint, runnable_url as resource_url, learner_id, student_id, user_id, offering_id, ${studentNameCol}, ${maybeHashUsername(hideNames, "username")}, school, class, class_id, permission_forms, last_run, teachers, grouped_answers_${resIndex}.kv1 kv1, grouped_answers_${resIndex}.submitted submitted, grouped_answers_${resIndex}.source_key source_key,
IF (kv1 is null, 0, cardinality(array_intersect(map_keys(kv1),map_keys(activities_${resIndex}.questions)))) num_answers,
cardinality(filter(map_values(activities_${resIndex}.questions), x->x.required=TRUE)) num_required_questions,
IF (submitted is null, 0, cardinality(filter(map_values(submitted), x->x=TRUE))) num_required_answers
Expand Down Expand Up @@ -419,7 +428,7 @@ exports.generateSQL = (runnableInfo, usageReport, authDomain, sourceKey, hideNam
const uniqueUserClassQuery = `unique_user_class AS (SELECT class_id, user_id,
arbitrary(student_id) as student_id,
arbitrary(${hideNames ? "student_id" : "student_name"}) as student_name,
arbitrary(username) as username,
arbitrary(${maybeHashUsername(hideNames, "username", true)}) as username,
arbitrary(school) as school,
arbitrary(class) as class,
arbitrary(permission_forms) as permission_forms,
Expand Down Expand Up @@ -543,25 +552,31 @@ exports.generateSQL = (runnableInfo, usageReport, authDomain, sourceKey, hideNam
GROUP BY l.run_remote_endpoint )`
*/

/**
 * Lists the select expressions for every column taken from the "log" table.
 *
 * Each field is emitted as a quoted `"log"."<field>"` reference, except
 * `username`, which is routed through maybeHashUsername so it can be hashed
 * when names are hidden.
 *
 * @param {boolean} hideNames - Forwarded to maybeHashUsername for the username column.
 * @returns {string[]} One SQL select expression per log column, in a fixed order.
 */
const getLogCols = (hideNames) => {
  const logFields = [
    "id",
    "session",
    "username",
    "application",
    "activity",
    "event",
    "event_value",
    "time",
    "parameters",
    "extras",
    "run_remote_endpoint",
    "timestamp",
  ];

  return logFields.map((field) => {
    const qualified = `"log"."${field}"`;
    return field === "username" ? maybeHashUsername(hideNames, qualified) : qualified;
  });
};

/**
 * Lists the select expressions for every column taken from the "learner" table.
 *
 * Each field is emitted as a quoted `"learner"."<field>"` reference, with two
 * exceptions: `username` is routed through maybeHashUsername, and, when
 * hideNames is truthy, `student_name` is replaced by the student id aliased
 * as student_name.
 *
 * @param {boolean} hideNames - Whether identifying columns should be masked.
 * @returns {string[]} One SQL select expression per learner column, in a fixed order.
 */
const getLearnerCols = (hideNames) => {
  const learnerFields = [
    "learner_id",
    "run_remote_endpoint",
    "class_id",
    "runnable_url",
    "student_id",
    "class",
    "school",
    "user_id",
    "offering_id",
    "permission_forms",
    "username",
    "student_name",
    "teachers",
    "last_run",
    "query_id",
  ];

  return learnerFields.map((field) => {
    const qualified = `"learner"."${field}"`;
    if (field === "username") {
      return maybeHashUsername(hideNames, qualified);
    }
    if (field === "student_name" && hideNames) {
      return `"learner"."student_id" as student_name`;
    }
    return qualified;
  });
};

/*
Generates a very wide row including all fields from the log and learner.
*/
exports.generateLearnerLogSQL = (queryIdsPerRunnable, authDomain, sourceKey, hideNames) => {
exports.generateLearnerLogSQL = (queryIdsPerRunnable, hideNames) => {
const logDb = process.env.LOG_ATHENA_DB_NAME;
const runnableUrls = Object.keys(queryIdsPerRunnable);
const queryIds = Object.values(queryIdsPerRunnable);

const logCols = [
"id", "session", "username", "application", "activity", "event", "event_value", "time", "parameters", "extras", "run_remote_endpoint", "timestamp"
].map(col => `"log"."${col}"`)

const learnerCols = [
"learner_id", "run_remote_endpoint", "class_id", "runnable_url", "student_id", "class", "school", "user_id", "offering_id", "permission_forms", "username", "student_name", "teachers", "last_run", "query_id"
]
.map(col => `"learner"."${col}"`)
.map(col => col === `"learner"."student_name"` && hideNames ? `"learner"."student_id" as student_name` : col).join(", ")
const logCols = getLogCols(hideNames)
const learnerCols = getLearnerCols(hideNames)

const cols = logCols.concat(learnerCols)
const cols = logCols.concat(learnerCols).join(", ")

return `
-- name ${runnableUrls.join(", ")}
Expand Down Expand Up @@ -615,17 +630,18 @@ exports.generateUserLogSQL = (usernames, activities, start_date, end_date) => {
/*
Generates a smaller row of event details only, no portal info.
*/
exports.generateNarrowLogSQL = (queryIdsPerRunnable, authDomain, sourceKey) => {
exports.generateNarrowLogSQL = (queryIdsPerRunnable, hideNames) => {
const logDb = process.env.LOG_ATHENA_DB_NAME;
const runnableUrls = Object.keys(queryIdsPerRunnable);
const queryIds = Object.values(queryIdsPerRunnable);
const logCols = getLogCols(hideNames).join(", ");

return `
-- name ${runnableUrls.join(", ")}
-- type learner event log ⎯ [qids: ${queryIds.join(", ")}]
-- reportType narrow-learner-event-log
SELECT log.*
SELECT ${logCols}
FROM "${logDb}"."logs_by_time" log
INNER JOIN "report-service"."learners" learner
ON
Expand Down
3 changes: 3 additions & 0 deletions query-creator/create-query/steps/env-vars.js
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,7 @@ exports.validate = () => {
if (!process.env.LOG_ATHENA_DB_NAME) {
missingVar("LOG_ATHENA_DB_NAME");
}
if (!process.env.USERNAME_HASH_SALT) {
missingVar("USERNAME_HASH_SALT");
}
}
14 changes: 7 additions & 7 deletions query-creator/create-query/tests/unit/generated-sql.js
Original file line number Diff line number Diff line change
Expand Up @@ -297,7 +297,7 @@ grouped_answers_2 AS (
WHERE a.escaped_url = 'https---authoring-staging-concord-org-activities-000001'
GROUP BY l.run_remote_endpoint),
learners_and_answers_1 AS ( SELECT run_remote_endpoint remote_endpoint, runnable_url as resource_url, learner_id, student_id, user_id, offering_id, student_id as student_name, username, school, class, class_id, permission_forms, last_run, teachers, grouped_answers_1.kv1 kv1, grouped_answers_1.submitted submitted, grouped_answers_1.source_key source_key,
learners_and_answers_1 AS ( SELECT run_remote_endpoint remote_endpoint, runnable_url as resource_url, learner_id, student_id, user_id, offering_id, student_id as student_name, to_hex(sha1(cast(('no-username-salt-provided' || username) as varbinary))) as username, school, class, class_id, permission_forms, last_run, teachers, grouped_answers_1.kv1 kv1, grouped_answers_1.submitted submitted, grouped_answers_1.source_key source_key,
IF (kv1 is null, 0, cardinality(array_intersect(map_keys(kv1),map_keys(activities_1.questions)))) num_answers,
cardinality(filter(map_values(activities_1.questions), x->x.required=TRUE)) num_required_questions,
IF (submitted is null, 0, cardinality(filter(map_values(submitted), x->x=TRUE))) num_required_answers
Expand All @@ -307,7 +307,7 @@ learners_and_answers_1 AS ( SELECT run_remote_endpoint remote_endpoint, runnable
ON l.run_remote_endpoint = grouped_answers_1.remote_endpoint
WHERE l.query_id = '123456789'),
learners_and_answers_2 AS ( SELECT run_remote_endpoint remote_endpoint, runnable_url as resource_url, learner_id, student_id, user_id, offering_id, student_id as student_name, username, school, class, class_id, permission_forms, last_run, teachers, grouped_answers_2.kv1 kv1, grouped_answers_2.submitted submitted, grouped_answers_2.source_key source_key,
learners_and_answers_2 AS ( SELECT run_remote_endpoint remote_endpoint, runnable_url as resource_url, learner_id, student_id, user_id, offering_id, student_id as student_name, to_hex(sha1(cast(('no-username-salt-provided' || username) as varbinary))) as username, school, class, class_id, permission_forms, last_run, teachers, grouped_answers_2.kv1 kv1, grouped_answers_2.submitted submitted, grouped_answers_2.source_key source_key,
IF (kv1 is null, 0, cardinality(array_intersect(map_keys(kv1),map_keys(activities_2.questions)))) num_answers,
cardinality(filter(map_values(activities_2.questions), x->x.required=TRUE)) num_required_questions,
IF (submitted is null, 0, cardinality(filter(map_values(submitted), x->x=TRUE))) num_required_answers
Expand All @@ -320,7 +320,7 @@ learners_and_answers_2 AS ( SELECT run_remote_endpoint remote_endpoint, runnable
unique_user_class AS (SELECT class_id, user_id,
arbitrary(student_id) as student_id,
arbitrary(student_id) as student_name,
arbitrary(username) as username,
arbitrary(to_hex(sha1(cast(('no-username-salt-provided' || username) as varbinary)))) as username,
arbitrary(school) as school,
arbitrary(class) as class,
arbitrary(permission_forms) as permission_forms,
Expand Down Expand Up @@ -677,7 +677,7 @@ grouped_answers_2 AS (
WHERE a.escaped_url = 'https---authoring-staging-concord-org-activities-000001'
GROUP BY l.run_remote_endpoint),
learners_and_answers_1 AS ( SELECT run_remote_endpoint remote_endpoint, runnable_url as resource_url, learner_id, student_id, user_id, offering_id, student_id as student_name, username, school, class, class_id, permission_forms, last_run, teachers, grouped_answers_1.kv1 kv1, grouped_answers_1.submitted submitted, grouped_answers_1.source_key source_key,
learners_and_answers_1 AS ( SELECT run_remote_endpoint remote_endpoint, runnable_url as resource_url, learner_id, student_id, user_id, offering_id, student_id as student_name, to_hex(sha1(cast(('no-username-salt-provided' || username) as varbinary))) as username, school, class, class_id, permission_forms, last_run, teachers, grouped_answers_1.kv1 kv1, grouped_answers_1.submitted submitted, grouped_answers_1.source_key source_key,
IF (kv1 is null, 0, cardinality(array_intersect(map_keys(kv1),map_keys(activities_1.questions)))) num_answers,
cardinality(filter(map_values(activities_1.questions), x->x.required=TRUE)) num_required_questions,
IF (submitted is null, 0, cardinality(filter(map_values(submitted), x->x=TRUE))) num_required_answers
Expand All @@ -687,7 +687,7 @@ learners_and_answers_1 AS ( SELECT run_remote_endpoint remote_endpoint, runnable
ON l.run_remote_endpoint = grouped_answers_1.remote_endpoint
WHERE l.query_id = '123456789'),
learners_and_answers_2 AS ( SELECT run_remote_endpoint remote_endpoint, runnable_url as resource_url, learner_id, student_id, user_id, offering_id, student_id as student_name, username, school, class, class_id, permission_forms, last_run, teachers, grouped_answers_2.kv1 kv1, grouped_answers_2.submitted submitted, grouped_answers_2.source_key source_key,
learners_and_answers_2 AS ( SELECT run_remote_endpoint remote_endpoint, runnable_url as resource_url, learner_id, student_id, user_id, offering_id, student_id as student_name, to_hex(sha1(cast(('no-username-salt-provided' || username) as varbinary))) as username, school, class, class_id, permission_forms, last_run, teachers, grouped_answers_2.kv1 kv1, grouped_answers_2.submitted submitted, grouped_answers_2.source_key source_key,
IF (kv1 is null, 0, cardinality(array_intersect(map_keys(kv1),map_keys(activities_2.questions)))) num_answers,
cardinality(filter(map_values(activities_2.questions), x->x.required=TRUE)) num_required_questions,
IF (submitted is null, 0, cardinality(filter(map_values(submitted), x->x=TRUE))) num_required_answers
Expand All @@ -700,7 +700,7 @@ learners_and_answers_2 AS ( SELECT run_remote_endpoint remote_endpoint, runnable
unique_user_class AS (SELECT class_id, user_id,
arbitrary(student_id) as student_id,
arbitrary(student_id) as student_name,
arbitrary(username) as username,
arbitrary(to_hex(sha1(cast(('no-username-salt-provided' || username) as varbinary)))) as username,
arbitrary(school) as school,
arbitrary(class) as class,
arbitrary(permission_forms) as permission_forms,
Expand Down Expand Up @@ -812,7 +812,7 @@ array_join(transform(teachers, teacher -> teacher.district), ',') AS teacher_dis
array_join(transform(teachers, teacher -> teacher.state), ',') AS teacher_states,
array_join(transform(teachers, teacher -> teacher.email), ',') AS teacher_emails
FROM
( SELECT l.run_remote_endpoint remote_endpoint, arbitrary(l.student_id) AS student_id, arbitrary(l.user_id) AS user_id, arbitrary(l.student_id) AS student_name, arbitrary(l.username) AS username, arbitrary(l.school) AS school, arbitrary(l.class) AS class, arbitrary(l.class_id) AS class_id, arbitrary(l.learner_id) AS learner_id, null AS resource_url, arbitrary(l.last_run) AS last_run, arbitrary(l.permission_forms) AS permission_forms, arbitrary(l.teachers) teachers
( SELECT l.run_remote_endpoint remote_endpoint, arbitrary(l.student_id) AS student_id, arbitrary(l.user_id) AS user_id, arbitrary(l.student_id) AS student_name, arbitrary(to_hex(sha1(cast(('no-username-salt-provided' || l.username) as varbinary)))) AS username, arbitrary(l.school) AS school, arbitrary(l.class) AS class, arbitrary(l.class_id) AS class_id, arbitrary(l.learner_id) AS learner_id, null AS resource_url, arbitrary(l.last_run) AS last_run, arbitrary(l.permission_forms) AS permission_forms, arbitrary(l.teachers) teachers
FROM "report-service"."learners" l
WHERE l.query_id IN ('123456789')
GROUP BY l.run_remote_endpoint )
Expand Down
3 changes: 2 additions & 1 deletion query-creator/env.sample.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
"PORTAL_REPORT_URL": "https://portal-report.concord.org/branch/master/index.html",
"FIREBASE_APP": "report-service-dev",
"LOG_ATHENA_DB_NAME": "log_ingester_qa",
"RESEARCHER_REPORTS_URL": "https://researcher-reports.concord.org/branch/master/"
"RESEARCHER_REPORTS_URL": "https://researcher-reports.concord.org/branch/master/",
"USERNAME_HASH_SALT": "replace-this-with-random-salt-bytes"
}
}
6 changes: 5 additions & 1 deletion query-creator/template.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,13 @@ Parameters:
Description: Firebase app name
PortalReportUrl:
Type: String
Description: Url to the Portal Report where reseachers can load learner's models
Description: Url to the Portal Report where researchers can load learner's models
LogAthenaDBName:
Type: String
Description: The Athena Database Name
UsernameHashSalt:
Type: String
Description: Salt used to hash usernames. This is available in 1Password in the "Query Creator Username Hash Salts (staging and production)" note.

# More info about Globals: https://github.com/awslabs/serverless-application-model/blob/master/docs/globals.rst
Globals:
Expand All @@ -43,6 +46,7 @@ Globals:
FIREBASE_APP: !Ref FirebaseApp
PORTAL_REPORT_URL: !Ref PortalReportUrl
LOG_ATHENA_DB_NAME: !Ref LogAthenaDBName
USERNAME_HASH_SALT: !Ref UsernameHashSalt

Resources:
CreateQueryFunction:
Expand Down

0 comments on commit 4d0ccc1

Please sign in to comment.