@@ -137,6 +137,76 @@ class SessionScraper extends EventScraper<SessionContent, Session> {
137
137
}
138
138
}
139
139
140
/**
 * Submits a hearing video to AssemblyAI for transcription and stores the hash
 * of the per-submission webhook token under the hearing's private
 * `webhookAuth` document.
 *
 * A fresh random token is generated for every submission. AssemblyAI is asked
 * to send it back in the `x-maple-webhook` header of its webhook call; only
 * `sha256(token)` is written to Firestore, never the raw token.
 *
 * @param EventId - numeric hearing id; the Firestore doc is `hearing-<EventId>`
 * @param maybeVideoUrl - URL of the media file AssemblyAI should transcribe
 * @returns the id of the submitted AssemblyAI transcript
 */
const submitTranscription = async ({
  EventId,
  maybeVideoUrl
}: {
  EventId: number
  maybeVideoUrl: string
}) => {
  const newToken = randomBytes(16).toString("hex")

  // Persist the token hash BEFORE submitting the job. Once the job exists,
  // AssemblyAI may call the webhook at any time (an early error status can be
  // reported quickly), and the stored hash is presumably what the webhook
  // handler checks the header against -- it must already be in place by then.
  await db
    .collection("events")
    .doc(`hearing-${String(EventId)}`)
    .collection("private")
    .doc("webhookAuth")
    .set({
      videoAssemblyWebhookToken: sha256(newToken)
    })

  // test with: "https://ngrokid.ngrok-free.app/demo-dtp/us-central1/transcription",
  const webhookUrl =
    process.env.NODE_ENV === "development"
      ? "https://us-central1-digital-testimony-dev.cloudfunctions.net/transcription"
      : "https://us-central1-digital-testimony-prod.cloudfunctions.net/transcription"

  const transcript = await assembly.transcripts.submit({
    audio:
      // test with: "https://assemblyaiusercontent.com/playground/aKUqpEtmYmI.flac",
      maybeVideoUrl,
    webhook_url: webhookUrl,
    speaker_labels: true,
    webhook_auth_header_name: "x-maple-webhook",
    webhook_auth_header_value: newToken
  })

  return transcript.id
}
174
+
175
/**
 * Looks up the video URL embedded in a hearing's detail page on
 * malegislature.gov.
 *
 * The page is fetched and parsed with JSDOM, and the first `<source>` element
 * nested inside a `<video>` element is used.
 *
 * @param EventId - numeric hearing id used to build the detail page URL
 * @returns the `src` of the first `video source` element, or `null` when the
 *   request is not successful, the body is empty, or no usable source exists
 */
const getHearingVideoUrl = async (EventId: number): Promise<string | null> => {
  const response = await fetch(
    `https://malegislature.gov/Events/Hearings/Detail/${EventId}`
  )
  // A non-2xx response carries an error page, not a hearing page; scraping it
  // could only produce a wrong answer.
  if (!response.ok) {
    return null
  }

  const html = await response.text()
  if (!html) {
    return null
  }

  const firstVideoSource = new JSDOM(
    html
  ).window.document.querySelector<HTMLSourceElement>("video source")
  if (!firstVideoSource) {
    return null
  }

  // An empty `src` is as useless to callers as a missing element, so report
  // both the same way.
  return firstVideoSource.src ? firstVideoSource.src : null
}
193
+
194
/**
 * Decides whether the video for a hearing should be scraped on this run.
 *
 * Reads `events/hearing-<EventId>` and answers `false` when the document has
 * no data or already carries a `videoFetchedAt` value. Otherwise the stored
 * data is validated with `Hearing.check` and the answer is whatever
 * `withinCutoff` returns for the hearing's `startsAt` date.
 *
 * @param EventId - numeric hearing id
 */
const shouldScrapeVideo = async (EventId: number) => {
  const snapshot = await db
    .collection("events")
    .doc(`hearing-${String(EventId)}`)
    .get()
  const stored = snapshot.data()

  // Nothing stored yet, or the video was already fetched: no scrape needed.
  if (!stored || stored.videoFetchedAt) {
    return false
  }

  const startsAt = Hearing.check(stored).startsAt.toDate()
  return withinCutoff(new Date(startsAt))
}
209
+
140
210
class HearingScraper extends EventScraper < HearingListItem , Hearing > {
141
211
constructor ( ) {
142
212
super ( "every 60 minutes" , 240 )
@@ -150,88 +220,33 @@ class HearingScraper extends EventScraper<HearingListItem, Hearing> {
150
220
/**
 * Builds the `Hearing` event for one hearing list item.
 *
 * The hearing is fetched through `api.getHearing` and validated with
 * `HearingContent.check`. When `shouldScrapeVideo` says so and a video URL is
 * found on the hearing page, the video is submitted for transcription and the
 * returned event additionally carries `videoURL`, `videoFetchedAt` and
 * `videoTranscriptionId`.
 *
 * NOTE(review): the video fields are only present on the run that submits the
 * transcription; later runs return the event without them. Confirm that the
 * code persisting this event merges with the stored document rather than
 * overwriting it.
 */
async getEvent({ EventId }: HearingListItem /* e.g. 4962 */) {
  const content = HearingContent.check(await api.getHearing(EventId))

  // Fields every hearing event carries, with or without video data.
  const basePayload = () => ({
    id: `hearing-${EventId}`,
    type: "hearing" as const,
    content,
    ...this.timestamps(content)
  })

  if (!(await shouldScrapeVideo(EventId))) {
    return basePayload() as Hearing
  }

  const maybeVideoUrl = await getHearingVideoUrl(EventId)
  if (!maybeVideoUrl) {
    return basePayload() as Hearing
  }

  const transcriptId = await submitTranscription({ maybeVideoUrl, EventId })

  return {
    ...basePayload(),
    videoURL: maybeVideoUrl,
    videoFetchedAt: Timestamp.now(),
    videoTranscriptionId: transcriptId // using the assembly Id as our transcriptionId
  } as Hearing
}
236
251
}
237
252
0 commit comments