Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Hash usernames using SHA1 when hiding names [PT-184362179] #247

Merged
merged 1 commit into from
May 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions query-creator/create-query/app.js
Original file line number Diff line number Diff line change
Expand Up @@ -139,8 +139,8 @@ const learnersReport = async (params, body, tokenServiceEnv, debugSQL, reportSer
const doLearnerLogReporting= async () => {
// generate the sql for the query
const sql = narrowLearners
? aws.generateNarrowLogSQL(queryIdsPerRunnable, authDomain, reportServiceSource) // hideNames not needed here as no learner info is output
: aws.generateLearnerLogSQL(queryIdsPerRunnable, authDomain, reportServiceSource, hideNames);
? aws.generateNarrowLogSQL(queryIdsPerRunnable, hideNames)
: aws.generateLearnerLogSQL(queryIdsPerRunnable, hideNames);

if (debugSQL) {
sqlOutput.push(sql);
Expand Down
46 changes: 31 additions & 15 deletions query-creator/create-query/steps/aws.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@ const request = require("./request");

const PAGE_SIZE = 2000;

// Salt prepended to every username before hashing. Falls back to a fixed
// placeholder when USERNAME_HASH_SALT is unset or empty.
const usernameHashSalt = process.env.USERNAME_HASH_SALT || "no-username-salt-provided";

/**
 * Builds the SQL fragment used to select a username column.
 *
 * @param {boolean} hash - when truthy, emit a salted SHA1 hex digest of the column; when falsy, return `col` unchanged.
 * @param {string} col - SQL column reference to select or hash (e.g. `l.username`).
 * @param {boolean} [skipAs] - when truthy, omit the trailing ` as username` alias so the
 *   fragment can be nested inside another expression such as `arbitrary(...)`.
 * @returns {string} the SQL fragment.
 */
const maybeHashUsername = (hash, col, skipAs) => {
  if (!hash) {
    return col;
  }
  // The salt is embedded in a single-quoted SQL string literal, so any single
  // quote it contains must be doubled to keep the generated statement valid.
  const saltLiteral = usernameHashSalt.replace(/'/g, "''");
  const alias = skipAs ? "" : " as username";
  return `to_hex(sha1(cast(('${saltLiteral}' || ${col}) as varbinary)))${alias}`;
};

// Column format of:
// { name: "column name", value: "main value on each row", header: "optional first row value"}
const selectFromColumn = (column) => {
Expand Down Expand Up @@ -276,6 +279,12 @@ exports.generateNoResourceSQL = (runnableInfo, hideNames) => {
value: "arbitrary(l.student_id)"
}
}
if (md === "username") {
return {
name: md,
value: `arbitrary(${maybeHashUsername(hideNames, "l.username", true)})`
}
}
return {
name: md,
value: md === "resource_url" ? "null" : `arbitrary(l.${md})`
Expand Down Expand Up @@ -377,7 +386,7 @@ exports.generateSQL = (runnableInfo, usageReport, authDomain, sourceKey, hideNam
WHERE a.escaped_url = '${escapeSingleQuote(escapedUrl)}'
GROUP BY l.run_remote_endpoint)`)

learnerAndAnswerQueries.push(`learners_and_answers_${resIndex} AS ( SELECT run_remote_endpoint remote_endpoint, runnable_url as resource_url, learner_id, student_id, user_id, offering_id, ${studentNameCol}, username, school, class, class_id, permission_forms, last_run, teachers, grouped_answers_${resIndex}.kv1 kv1, grouped_answers_${resIndex}.submitted submitted, grouped_answers_${resIndex}.source_key source_key,
learnerAndAnswerQueries.push(`learners_and_answers_${resIndex} AS ( SELECT run_remote_endpoint remote_endpoint, runnable_url as resource_url, learner_id, student_id, user_id, offering_id, ${studentNameCol}, ${maybeHashUsername(hideNames, "username")}, school, class, class_id, permission_forms, last_run, teachers, grouped_answers_${resIndex}.kv1 kv1, grouped_answers_${resIndex}.submitted submitted, grouped_answers_${resIndex}.source_key source_key,
IF (kv1 is null, 0, cardinality(array_intersect(map_keys(kv1),map_keys(activities_${resIndex}.questions)))) num_answers,
cardinality(filter(map_values(activities_${resIndex}.questions), x->x.required=TRUE)) num_required_questions,
IF (submitted is null, 0, cardinality(filter(map_values(submitted), x->x=TRUE))) num_required_answers
Expand Down Expand Up @@ -419,7 +428,7 @@ exports.generateSQL = (runnableInfo, usageReport, authDomain, sourceKey, hideNam
const uniqueUserClassQuery = `unique_user_class AS (SELECT class_id, user_id,
arbitrary(student_id) as student_id,
arbitrary(${hideNames ? "student_id" : "student_name"}) as student_name,
arbitrary(username) as username,
arbitrary(${maybeHashUsername(hideNames, "username", true)}) as username,
arbitrary(school) as school,
arbitrary(class) as class,
arbitrary(permission_forms) as permission_forms,
Expand Down Expand Up @@ -543,25 +552,31 @@ exports.generateSQL = (runnableInfo, usageReport, authDomain, sourceKey, hideNam
GROUP BY l.run_remote_endpoint )`
*/

/**
 * Lists the select expressions for the log columns, each qualified with the
 * "log" alias.
 *
 * @param {boolean} hideNames - passed to maybeHashUsername to decide whether the
 *   username column is replaced by its hashed form.
 * @returns {string[]} one SQL select expression per log column, in a fixed order.
 */
const getLogCols = (hideNames) => {
  const names = [
    "id",
    "session",
    "username",
    "application",
    "activity",
    "event",
    "event_value",
    "time",
    "parameters",
    "extras",
    "run_remote_endpoint",
    "timestamp",
  ];
  return names.map((name) => {
    const qualified = `"log"."${name}"`;
    // Only the username column is routed through the hashing helper.
    return name === "username" ? maybeHashUsername(hideNames, qualified) : qualified;
  });
};

/**
 * Lists the select expressions for the learner columns, each qualified with
 * the "learner" alias.
 *
 * @param {boolean} hideNames - when truthy, student_name is replaced by
 *   student_id (aliased as student_name); the flag is also passed to
 *   maybeHashUsername for the username column.
 * @returns {string[]} one SQL select expression per learner column, in a fixed order.
 */
const getLearnerCols = (hideNames) => {
  const names = [
    "learner_id",
    "run_remote_endpoint",
    "class_id",
    "runnable_url",
    "student_id",
    "class",
    "school",
    "user_id",
    "offering_id",
    "permission_forms",
    "username",
    "student_name",
    "teachers",
    "last_run",
    "query_id",
  ];
  return names.map((name) => {
    if (name === "username") {
      return maybeHashUsername(hideNames, `"learner"."username"`);
    }
    if (name === "student_name" && hideNames) {
      // Substitute the id for the real name while keeping the output column name.
      return `"learner"."student_id" as student_name`;
    }
    return `"learner"."${name}"`;
  });
};

/*
Generates a very wide row including all fields from the log and learner.
*/
exports.generateLearnerLogSQL = (queryIdsPerRunnable, authDomain, sourceKey, hideNames) => {
exports.generateLearnerLogSQL = (queryIdsPerRunnable, hideNames) => {
const logDb = process.env.LOG_ATHENA_DB_NAME;
const runnableUrls = Object.keys(queryIdsPerRunnable);
const queryIds = Object.values(queryIdsPerRunnable);

const logCols = [
"id", "session", "username", "application", "activity", "event", "event_value", "time", "parameters", "extras", "run_remote_endpoint", "timestamp"
].map(col => `"log"."${col}"`)

const learnerCols = [
"learner_id", "run_remote_endpoint", "class_id", "runnable_url", "student_id", "class", "school", "user_id", "offering_id", "permission_forms", "username", "student_name", "teachers", "last_run", "query_id"
]
.map(col => `"learner"."${col}"`)
.map(col => col === `"learner"."student_name"` && hideNames ? `"learner"."student_id" as student_name` : col).join(", ")
const logCols = getLogCols(hideNames)
const learnerCols = getLearnerCols(hideNames)

const cols = logCols.concat(learnerCols)
const cols = logCols.concat(learnerCols).join(", ")

return `
-- name ${runnableUrls.join(", ")}
Expand Down Expand Up @@ -615,17 +630,18 @@ exports.generateUserLogSQL = (usernames, activities, start_date, end_date) => {
/*
Generates a smaller row of event details only, no portal info.
*/
exports.generateNarrowLogSQL = (queryIdsPerRunnable, authDomain, sourceKey) => {
exports.generateNarrowLogSQL = (queryIdsPerRunnable, hideNames) => {
const logDb = process.env.LOG_ATHENA_DB_NAME;
const runnableUrls = Object.keys(queryIdsPerRunnable);
const queryIds = Object.values(queryIdsPerRunnable);
const logCols = getLogCols(hideNames).join(", ");

return `
-- name ${runnableUrls.join(", ")}
-- type learner event log ⎯ [qids: ${queryIds.join(", ")}]
-- reportType narrow-learner-event-log

SELECT log.*
SELECT ${logCols}
FROM "${logDb}"."logs_by_time" log
INNER JOIN "report-service"."learners" learner
ON
Expand Down
3 changes: 3 additions & 0 deletions query-creator/create-query/steps/env-vars.js
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,7 @@ exports.validate = () => {
if (!process.env.LOG_ATHENA_DB_NAME) {
missingVar("LOG_ATHENA_DB_NAME");
}
if (!process.env.USERNAME_HASH_SALT) {
missingVar("USERNAME_HASH_SALT");
}
}
14 changes: 7 additions & 7 deletions query-creator/create-query/tests/unit/generated-sql.js
Original file line number Diff line number Diff line change
Expand Up @@ -297,7 +297,7 @@ grouped_answers_2 AS (
WHERE a.escaped_url = 'https---authoring-staging-concord-org-activities-000001'
GROUP BY l.run_remote_endpoint),

learners_and_answers_1 AS ( SELECT run_remote_endpoint remote_endpoint, runnable_url as resource_url, learner_id, student_id, user_id, offering_id, student_id as student_name, username, school, class, class_id, permission_forms, last_run, teachers, grouped_answers_1.kv1 kv1, grouped_answers_1.submitted submitted, grouped_answers_1.source_key source_key,
learners_and_answers_1 AS ( SELECT run_remote_endpoint remote_endpoint, runnable_url as resource_url, learner_id, student_id, user_id, offering_id, student_id as student_name, to_hex(sha1(cast(('no-username-salt-provided' || username) as varbinary))) as username, school, class, class_id, permission_forms, last_run, teachers, grouped_answers_1.kv1 kv1, grouped_answers_1.submitted submitted, grouped_answers_1.source_key source_key,
IF (kv1 is null, 0, cardinality(array_intersect(map_keys(kv1),map_keys(activities_1.questions)))) num_answers,
cardinality(filter(map_values(activities_1.questions), x->x.required=TRUE)) num_required_questions,
IF (submitted is null, 0, cardinality(filter(map_values(submitted), x->x=TRUE))) num_required_answers
Expand All @@ -307,7 +307,7 @@ learners_and_answers_1 AS ( SELECT run_remote_endpoint remote_endpoint, runnable
ON l.run_remote_endpoint = grouped_answers_1.remote_endpoint
WHERE l.query_id = '123456789'),

learners_and_answers_2 AS ( SELECT run_remote_endpoint remote_endpoint, runnable_url as resource_url, learner_id, student_id, user_id, offering_id, student_id as student_name, username, school, class, class_id, permission_forms, last_run, teachers, grouped_answers_2.kv1 kv1, grouped_answers_2.submitted submitted, grouped_answers_2.source_key source_key,
learners_and_answers_2 AS ( SELECT run_remote_endpoint remote_endpoint, runnable_url as resource_url, learner_id, student_id, user_id, offering_id, student_id as student_name, to_hex(sha1(cast(('no-username-salt-provided' || username) as varbinary))) as username, school, class, class_id, permission_forms, last_run, teachers, grouped_answers_2.kv1 kv1, grouped_answers_2.submitted submitted, grouped_answers_2.source_key source_key,
IF (kv1 is null, 0, cardinality(array_intersect(map_keys(kv1),map_keys(activities_2.questions)))) num_answers,
cardinality(filter(map_values(activities_2.questions), x->x.required=TRUE)) num_required_questions,
IF (submitted is null, 0, cardinality(filter(map_values(submitted), x->x=TRUE))) num_required_answers
Expand All @@ -320,7 +320,7 @@ learners_and_answers_2 AS ( SELECT run_remote_endpoint remote_endpoint, runnable
unique_user_class AS (SELECT class_id, user_id,
arbitrary(student_id) as student_id,
arbitrary(student_id) as student_name,
arbitrary(username) as username,
arbitrary(to_hex(sha1(cast(('no-username-salt-provided' || username) as varbinary)))) as username,
arbitrary(school) as school,
arbitrary(class) as class,
arbitrary(permission_forms) as permission_forms,
Expand Down Expand Up @@ -677,7 +677,7 @@ grouped_answers_2 AS (
WHERE a.escaped_url = 'https---authoring-staging-concord-org-activities-000001'
GROUP BY l.run_remote_endpoint),

learners_and_answers_1 AS ( SELECT run_remote_endpoint remote_endpoint, runnable_url as resource_url, learner_id, student_id, user_id, offering_id, student_id as student_name, username, school, class, class_id, permission_forms, last_run, teachers, grouped_answers_1.kv1 kv1, grouped_answers_1.submitted submitted, grouped_answers_1.source_key source_key,
learners_and_answers_1 AS ( SELECT run_remote_endpoint remote_endpoint, runnable_url as resource_url, learner_id, student_id, user_id, offering_id, student_id as student_name, to_hex(sha1(cast(('no-username-salt-provided' || username) as varbinary))) as username, school, class, class_id, permission_forms, last_run, teachers, grouped_answers_1.kv1 kv1, grouped_answers_1.submitted submitted, grouped_answers_1.source_key source_key,
IF (kv1 is null, 0, cardinality(array_intersect(map_keys(kv1),map_keys(activities_1.questions)))) num_answers,
cardinality(filter(map_values(activities_1.questions), x->x.required=TRUE)) num_required_questions,
IF (submitted is null, 0, cardinality(filter(map_values(submitted), x->x=TRUE))) num_required_answers
Expand All @@ -687,7 +687,7 @@ learners_and_answers_1 AS ( SELECT run_remote_endpoint remote_endpoint, runnable
ON l.run_remote_endpoint = grouped_answers_1.remote_endpoint
WHERE l.query_id = '123456789'),

learners_and_answers_2 AS ( SELECT run_remote_endpoint remote_endpoint, runnable_url as resource_url, learner_id, student_id, user_id, offering_id, student_id as student_name, username, school, class, class_id, permission_forms, last_run, teachers, grouped_answers_2.kv1 kv1, grouped_answers_2.submitted submitted, grouped_answers_2.source_key source_key,
learners_and_answers_2 AS ( SELECT run_remote_endpoint remote_endpoint, runnable_url as resource_url, learner_id, student_id, user_id, offering_id, student_id as student_name, to_hex(sha1(cast(('no-username-salt-provided' || username) as varbinary))) as username, school, class, class_id, permission_forms, last_run, teachers, grouped_answers_2.kv1 kv1, grouped_answers_2.submitted submitted, grouped_answers_2.source_key source_key,
IF (kv1 is null, 0, cardinality(array_intersect(map_keys(kv1),map_keys(activities_2.questions)))) num_answers,
cardinality(filter(map_values(activities_2.questions), x->x.required=TRUE)) num_required_questions,
IF (submitted is null, 0, cardinality(filter(map_values(submitted), x->x=TRUE))) num_required_answers
Expand All @@ -700,7 +700,7 @@ learners_and_answers_2 AS ( SELECT run_remote_endpoint remote_endpoint, runnable
unique_user_class AS (SELECT class_id, user_id,
arbitrary(student_id) as student_id,
arbitrary(student_id) as student_name,
arbitrary(username) as username,
arbitrary(to_hex(sha1(cast(('no-username-salt-provided' || username) as varbinary)))) as username,
arbitrary(school) as school,
arbitrary(class) as class,
arbitrary(permission_forms) as permission_forms,
Expand Down Expand Up @@ -812,7 +812,7 @@ array_join(transform(teachers, teacher -> teacher.district), ',') AS teacher_dis
array_join(transform(teachers, teacher -> teacher.state), ',') AS teacher_states,
array_join(transform(teachers, teacher -> teacher.email), ',') AS teacher_emails
FROM
( SELECT l.run_remote_endpoint remote_endpoint, arbitrary(l.student_id) AS student_id, arbitrary(l.user_id) AS user_id, arbitrary(l.student_id) AS student_name, arbitrary(l.username) AS username, arbitrary(l.school) AS school, arbitrary(l.class) AS class, arbitrary(l.class_id) AS class_id, arbitrary(l.learner_id) AS learner_id, null AS resource_url, arbitrary(l.last_run) AS last_run, arbitrary(l.permission_forms) AS permission_forms, arbitrary(l.teachers) teachers
( SELECT l.run_remote_endpoint remote_endpoint, arbitrary(l.student_id) AS student_id, arbitrary(l.user_id) AS user_id, arbitrary(l.student_id) AS student_name, arbitrary(to_hex(sha1(cast(('no-username-salt-provided' || l.username) as varbinary)))) AS username, arbitrary(l.school) AS school, arbitrary(l.class) AS class, arbitrary(l.class_id) AS class_id, arbitrary(l.learner_id) AS learner_id, null AS resource_url, arbitrary(l.last_run) AS last_run, arbitrary(l.permission_forms) AS permission_forms, arbitrary(l.teachers) teachers
FROM "report-service"."learners" l
WHERE l.query_id IN ('123456789')
GROUP BY l.run_remote_endpoint )
Expand Down
3 changes: 2 additions & 1 deletion query-creator/env.sample.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
"PORTAL_REPORT_URL": "https://portal-report.concord.org/branch/master/index.html",
"FIREBASE_APP": "report-service-dev",
"LOG_ATHENA_DB_NAME": "log_ingester_qa",
"RESEARCHER_REPORTS_URL": "https://researcher-reports.concord.org/branch/master/"
"RESEARCHER_REPORTS_URL": "https://researcher-reports.concord.org/branch/master/",
"USERNAME_HASH_SALT": "replace-this-with-random-salt-bytes"
}
}
6 changes: 5 additions & 1 deletion query-creator/template.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,13 @@ Parameters:
Description: Firebase app name
PortalReportUrl:
Type: String
Description: Url to the Portal Report where reseachers can load learner's models
Description: Url to the Portal Report where researchers can load learner's models
LogAthenaDBName:
Type: String
Description: The Athena Database Name
UsernameHashSalt:
Type: String
Description: Salt used to hash usernames. This is available in 1Password in the "Query Creator Username Hash Salts (staging and production)" note.

# More info about Globals: https://github.com/awslabs/serverless-application-model/blob/master/docs/globals.rst
Globals:
Expand All @@ -43,6 +46,7 @@ Globals:
FIREBASE_APP: !Ref FirebaseApp
PORTAL_REPORT_URL: !Ref PortalReportUrl
LOG_ATHENA_DB_NAME: !Ref LogAthenaDBName
USERNAME_HASH_SALT: !Ref UsernameHashSalt

Resources:
CreateQueryFunction:
Expand Down
Loading