Skip to content

Commit b28e44d

Browse files
authored
Merge pull request #1774 from boazsender/automated-transcriptions
Working end-to-end transcription integrations.
2 parents 5b11899 + 64475c4 commit b28e44d

File tree

2 files changed

+182
-112
lines changed

2 files changed

+182
-112
lines changed

functions/src/events/scrapeEvents.ts

Lines changed: 93 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,76 @@ class SessionScraper extends EventScraper<SessionContent, Session> {
137137
}
138138
}
139139

140+
/**
 * Submits a hearing video to AssemblyAI for transcription and records the
 * credentials needed to authenticate the resulting webhook call.
 *
 * A fresh random token is generated per submission. AssemblyAI is told to
 * send it back in the `x-maple-webhook` header; only its SHA-256 hash is
 * stored, under `events/hearing-<EventId>/private/webhookAuth`.
 *
 * @param EventId - Numeric hearing id used to build the `hearing-<id>` doc id.
 * @param maybeVideoUrl - URL of the media file AssemblyAI should transcribe.
 * @returns The id of the submitted AssemblyAI transcript.
 */
const submitTranscription = async ({
  EventId,
  maybeVideoUrl
}: {
  EventId: number
  maybeVideoUrl: string
}) => {
  const newToken = randomBytes(16).toString("hex")

  // test with: "https://ngrokid.ngrok-free.app/demo-dtp/us-central1/transcription"
  const webhookUrl =
    process.env.NODE_ENV === "development"
      ? "https://us-central1-digital-testimony-dev.cloudfunctions.net/transcription"
      : "https://us-central1-digital-testimony-prod.cloudfunctions.net/transcription"

  // Store the token hash BEFORE submitting. If this write fails we have not
  // yet started a transcription job whose webhook we could never
  // authenticate; if the submit below fails, the stored hash is simply
  // overwritten by the next attempt.
  await db
    .collection("events")
    .doc(`hearing-${String(EventId)}`)
    .collection("private")
    .doc("webhookAuth")
    .set({
      videoAssemblyWebhookToken: sha256(newToken)
    })

  const transcript = await assembly.transcripts.submit({
    // test with: "https://assemblyaiusercontent.com/playground/aKUqpEtmYmI.flac"
    audio: maybeVideoUrl,
    webhook_url: webhookUrl,
    speaker_labels: true,
    webhook_auth_header_name: "x-maple-webhook",
    webhook_auth_header_value: newToken
  })

  return transcript.id
}
174+
175+
/**
 * Fetches a hearing's detail page from malegislature.gov and extracts the
 * URL of its video, if one has been posted.
 *
 * @param EventId - Numeric hearing id used in the detail page URL.
 * @returns The `src` of the first `<source>` element nested in a `<video>`,
 *   or `null` when the page cannot be fetched, is empty, or has no such
 *   element (or the element has an empty `src`).
 */
const getHearingVideoUrl = async (EventId: number) => {
  const pageUrl = `https://malegislature.gov/Events/Hearings/Detail/${EventId}`

  const response = await fetch(pageUrl)
  // An error response is not a hearing page; don't bother parsing it.
  if (!response.ok) {
    return null
  }

  const html = await response.text()
  if (!html) {
    return null
  }

  // Telling jsdom the document's URL lets a relative `src` attribute resolve
  // to an absolute URL, which is what a third-party service needs to fetch.
  const dom = new JSDOM(html, { url: pageUrl })
  const firstVideoSource =
    dom.window.document.querySelector<HTMLSourceElement>("video source")

  // Treat a missing element and an empty `src` the same way: no video yet.
  return firstVideoSource?.src || null
}
193+
194+
/**
 * Decides whether the video for a hearing should be looked up now.
 *
 * Reads `events/hearing-<EventId>` and answers `false` when the document has
 * no data or already carries a truthy `videoFetchedAt`. Otherwise the stored
 * data is validated with `Hearing.check` and the answer is whatever
 * `withinCutoff` returns for the hearing's `startsAt` date.
 *
 * @param EventId - Numeric hearing id used to build the `hearing-<id>` doc id.
 */
const shouldScrapeVideo = async (EventId: number) => {
  const snapshot = await db
    .collection("events")
    .doc(`hearing-${String(EventId)}`)
    .get()
  const stored = snapshot.data()

  // Nothing saved for this hearing yet, or its video was already fetched.
  if (!stored || stored.videoFetchedAt) {
    return false
  }

  const startsAt = Hearing.check(stored).startsAt.toDate()
  return withinCutoff(new Date(startsAt))
}
209+
140210
class HearingScraper extends EventScraper<HearingListItem, Hearing> {
141211
constructor() {
142212
super("every 60 minutes", 240)
@@ -150,88 +220,33 @@ class HearingScraper extends EventScraper<HearingListItem, Hearing> {
150220
async getEvent({ EventId }: HearingListItem /* e.g. 4962 */) {
151221
const data = await api.getHearing(EventId)
152222
const content = HearingContent.check(data)
153-
const eventInDb = await db
154-
.collection("events")
155-
.doc(`hearing-${String(EventId)}`)
156-
.get()
157-
const eventData = eventInDb.data()
158-
const hearing = Hearing.check(eventData)
159-
const shouldScrape = withinCutoff(hearing.startsAt.toDate())
160-
161-
let payload: Hearing = {
223+
224+
if (await shouldScrapeVideo(EventId)) {
225+
const maybeVideoUrl = await getHearingVideoUrl(EventId)
226+
if (maybeVideoUrl) {
227+
const transcriptId = await submitTranscription({
228+
maybeVideoUrl,
229+
EventId
230+
})
231+
232+
return {
233+
id: `hearing-${EventId}`,
234+
type: "hearing",
235+
content,
236+
...this.timestamps(content),
237+
videoURL: maybeVideoUrl,
238+
videoFetchedAt: Timestamp.now(),
239+
videoTranscriptionId: transcriptId // using the assembly Id as our transcriptionId
240+
} as Hearing
241+
}
242+
}
243+
244+
return {
162245
id: `hearing-${EventId}`,
163246
type: "hearing",
164247
content,
165248
...this.timestamps(content)
166-
}
167-
if (hearing) {
168-
payload = {
169-
...payload,
170-
videoURL: hearing.videoURL,
171-
videoFetchedAt: hearing.videoFetchedAt,
172-
videoAssemblyId: hearing.videoAssemblyId
173-
}
174-
}
175-
let maybeVideoURL = null
176-
let transcript = null
177-
178-
if (!hearing.videoFetchedAt && shouldScrape) {
179-
const req = await fetch(
180-
`https://malegislature.gov/Events/Hearings/Detail/${EventId}`
181-
)
182-
const res = await req.text()
183-
if (res) {
184-
const dom = new JSDOM(res)
185-
if (dom) {
186-
const maybeVideoSource =
187-
dom.window.document.querySelectorAll("video source")
188-
if (maybeVideoSource.length && maybeVideoSource[0]) {
189-
const newToken = randomBytes(16).toString("hex")
190-
const firstVideoSource = maybeVideoSource[0] as HTMLSourceElement
191-
maybeVideoURL = firstVideoSource.src
192-
193-
transcript = await assembly.transcripts.submit({
194-
webhook_url:
195-
process.env.NODE_ENV === "development"
196-
? "https://us-central1-digital-testimony-dev.cloudfunctions.net/transcription"
197-
: "https://us-central1-digital-testimony-prod.cloudfunctions.net/transcription",
198-
webhook_auth_header_name: "X-Maple-Webhook",
199-
webhook_auth_header_value: newToken,
200-
audio: firstVideoSource.src,
201-
auto_highlights: true,
202-
custom_topics: true,
203-
entity_detection: true,
204-
iab_categories: false,
205-
format_text: true,
206-
punctuate: true,
207-
speaker_labels: true,
208-
summarization: true,
209-
summary_model: "informative",
210-
summary_type: "bullets"
211-
})
212-
213-
await db
214-
.collection("events")
215-
.doc(`hearing-${String(EventId)}`)
216-
.collection("private")
217-
.doc("webhookAuth")
218-
.set({
219-
videoAssemblyWebhookToken: sha256(newToken)
220-
})
221-
222-
payload = {
223-
...payload,
224-
videoURL: maybeVideoURL,
225-
videoFetchedAt: Timestamp.now(),
226-
videoAssemblyId: transcript.id
227-
}
228-
}
229-
}
230-
}
231-
}
232-
233-
const event: Hearing = payload
234-
return event
249+
} as Hearing
235250
}
236251
}
237252

functions/src/webhooks/transcription.ts

Lines changed: 89 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -1,57 +1,112 @@
11
import * as functions from "firebase-functions"
22
import { AssemblyAI } from "assemblyai"
3-
import { db } from "../firebase"
3+
import { db, Timestamp } from "../firebase"
44
import { sha256 } from "js-sha256"
55

66
const assembly = new AssemblyAI({
77
apiKey: process.env.ASSEMBLY_API_KEY ? process.env.ASSEMBLY_API_KEY : ""
88
})
99

1010
export const transcription = functions.https.onRequest(async (req, res) => {
11-
if (
12-
req.headers["X-Maple-Webhook"] &&
13-
req.headers["webhook_auth_header_value"]
14-
) {
11+
if (req.headers["x-maple-webhook"]) {
1512
if (req.body.status === "completed") {
13+
// If we get a request with the right header and status, get the
14+
// transcription from the assembly API.
1615
const transcript = await assembly.transcripts.get(req.body.transcript_id)
1716
if (transcript && transcript.webhook_auth) {
18-
const maybeEventInDb = await db
17+
// If there is a transcript and the transcript has an auth property,
18+
// look for an event (aka Hearing) in the DB with a matching ID.
19+
const maybeEventsInDb = await db
1920
.collection("events")
20-
.where("videoAssemblyId", "==", transcript.id)
21+
.where("videoTranscriptionId", "==", transcript.id)
2122
.get()
22-
if (maybeEventInDb.docs.length) {
23-
const authenticatedEventsInDb = maybeEventInDb.docs.filter(
24-
async e => {
25-
const hashedToken = sha256(
26-
String(req.headers["webhook_auth_header_value"])
27-
)
2823

29-
const tokenInDb = await db
30-
.collection("events")
31-
.doc(e.id)
32-
.collection("private")
33-
.doc("webhookAuth")
34-
.get()
35-
const tokenInDbData = tokenInDb.data()
36-
if (tokenInDbData) {
37-
return hashedToken === tokenInDbData.videoAssemblyWebhookToken
38-
}
39-
return false
24+
if (maybeEventsInDb.docs.length) {
25+
// If we have a match look for one that matches a hash of the token
26+
// we gave Assembly. There should only be one of these but firestore
27+
// gives us an array. If there is more than one member, something is
28+
// wrong
29+
const authenticatedEventIds = [] as string[]
30+
const hashedToken = sha256(String(req.headers["x-maple-webhook"]))
31+
32+
for (const index in maybeEventsInDb.docs) {
33+
const doc = maybeEventsInDb.docs[index]
34+
35+
const tokenDocInDb = await db
36+
.collection("events")
37+
.doc(doc.id)
38+
.collection("private")
39+
.doc("webhookAuth")
40+
.get()
41+
42+
const tokenDataInDb = tokenDocInDb.data()?.videoAssemblyWebhookToken
43+
44+
if (hashedToken === tokenDataInDb) {
45+
authenticatedEventIds.push(doc.id)
4046
}
41-
)
42-
if (authenticatedEventsInDb) {
47+
}
48+
49+
// Log edge cases
50+
if (maybeEventsInDb.docs.length === 0) {
51+
console.log("No matching event in db.")
52+
}
53+
if (authenticatedEventIds.length === 0) {
54+
console.log("No authenticated events in db.")
55+
}
56+
if (authenticatedEventIds.length > 1) {
57+
console.log("More than one matching event in db.")
58+
}
59+
60+
if (authenticatedEventIds.length === 1) {
61+
// If there is one authenticated event, pull out the parts we want to
62+
// save and try to save them in the db.
63+
const { id, text, audio_url, utterances } = transcript
4364
try {
44-
await db
65+
const transcriptionInDb = await db
4566
.collection("transcriptions")
46-
.doc(transcript.id)
47-
.set({ _timestamp: new Date(), ...transcript })
67+
.doc(id)
4868

49-
authenticatedEventsInDb.forEach(async d => {
50-
await d.ref.update({
51-
["webhook_auth_header_value"]: null
52-
})
69+
await transcriptionInDb.set({
70+
id,
71+
text,
72+
createdAt: Timestamp.now(),
73+
audio_url
5374
})
54-
console.log("transcript saved in db")
75+
76+
// Put each `utterance` in a separate doc in an utterances
77+
// collection. Previously had done the same for `words` but
78+
// got worried about collection size and write times since
79+
// `words` can be tens of thousands of members.
80+
if (utterances) {
81+
const writer = db.bulkWriter()
82+
for (let utterance of utterances) {
83+
const { speaker, confidence, start, end, text } = utterance
84+
85+
writer.set(
86+
db
87+
.collection("transcriptions")
88+
.doc(`${transcript.id}`)
89+
.collection("utterances")
90+
.doc(),
91+
{ speaker, confidence, start, end, text }
92+
)
93+
}
94+
95+
await writer.close()
96+
}
97+
98+
// Delete the hashed webhook auth token from our db now that
99+
// we're done.
100+
for (const index in authenticatedEventIds) {
101+
await db
102+
.collection("events")
103+
.doc(authenticatedEventIds[index])
104+
.collection("private")
105+
.doc("webhookAuth")
106+
.set({
107+
videoAssemblyWebhookToken: null
108+
})
109+
}
55110
} catch (error) {
56111
console.log(error)
57112
}

0 commit comments

Comments
 (0)