# GITHUB REPO: emili
## emili-main/.gitignore
>>> BEGIN FILE CONTENTS
venv/
tts_audio/
transcript/
snapshot/
>>> END FILE CONTENTS
## emili-main/LICENSE
>>> BEGIN FILE CONTENTS
MIT License
Copyright (c) 2024 Lionel Levine
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
>>> END FILE CONTENTS
## emili-main/README.md
>>> BEGIN FILE CONTENTS
# EMILI (Emotionally Intelligent Listener)
Adds emotion tags sourced from video to your OpenAI API calls.
Updated 2024-03-12 by Lionel Levine
![EMILI flowchart](EMILI.png "How EMILI works")
Credit: Facial Emotion Recognition classifier by Octavio Arriaga: https://github.com/oarriaga/paz
>>> END FILE CONTENTS
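In practice, "adds emotion tags" means the real-time emotion readout is prepended to the conversation as an extra system message before each chat-completion request. A minimal sketch of the resulting message list, using a readout line taken from the emolog examples in emili_core.py below (the user chat here is hypothetical; how the readout is actually built and sent via get_response is shown in that file):

messages = [
    {"role": "system", "content": system_prompt},  # EMILI persona, defined in emili_core.py
    {"role": "system", "content": "Right now, user looks HAPPY (46) Neutral (28)"},  # emotion tag from the FER pipeline
    {"role": "user", "content": "Hey EMILI, how do I seem today?"},  # hypothetical user chat
]
# messages is then passed to the OpenAI chat completions API (see sender_thread in emili_core.py)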
## emili-main/emili_core.py
>>> BEGIN FILE CONTENTS
# core logic for EMILI (Emotionally Intelligent Listener) video chat with OpenAI models
from paz.pipelines import DetectMiniXceptionFER # for facial emotion recognition
from paz.backend.image.opencv_image import convert_color_space, BGR2RGB
from utils import get_response # for OpenAI API calls
import threading
import queue
import time
from datetime import datetime
import json
from copy import deepcopy
import numpy as np
import re
import pygame # for audio playback of text-to-speech
import base64
import cv2 # only used for encoding images to base64
from openai import OpenAI
client = OpenAI()
emotion_queue = queue.Queue() # real-time emotion logs updated continuously
EMA_queue = queue.Queue() # average emotions updated once per second
chat_queue = queue.Queue() # user's chats
vision_queue = queue.Queue() # messages containing an image (camera snapshot)
chat_timestamps = queue.Queue() # timestamps of user's chats
message_queue = queue.Queue() # messages to be sent to OpenAI API. Outgoing messages only.
new_chat_event = threading.Event() # user has entered a new chat, triggers OpenAI API call
new_message_event = threading.Event() # new message to be sent to OpenAI API
tick_event = threading.Event() # ticks once per second, triggers EMA calculation
emotion_change_event = threading.Event() # set when there is a sudden change in user emotions
end_session_event = threading.Event() # triggered when the user enters 'q' to end the session
user_snapshot_caption = "Camera snapshot of user and surroundings, for context" # for vision API call
assistant_chat_name = "EMILI"
user_chat_name = "You"
use_tts = True # text-to-speech
tick_interval = 500 # milliseconds between emotion readings
verbose = True # print debug messages
discount_factor_per_second = 0.5 # for exponential moving average, discount factor per second
discount_factor_per_tick = discount_factor_per_second ** (tick_interval / 1000) # discount factor per tick
reactivity = 1.0 # default 1.0. Higher reactivity means more frequent API calls when emotions change
ect_setpoint = (1e6/reactivity) * (1.0-discount_factor_per_tick) * ((tick_interval/1000) ** 0.5) # threshold for significant change in emotion scores: C*(1-delta)*sqrt(t). The factor of 1-delta is because EMAs are compared, not raw scores.
ect_discount_factor_per_second = 0.95 # discount factor for the emotion change threshold
ect_discount_factor_per_tick = ect_discount_factor_per_second ** (tick_interval / 1000)
print("ect setpoint:",ect_setpoint)
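# Worked example with the defaults above: discount_factor_per_tick = 0.5**(500/1000) ≈ 0.707, so
# ect_setpoint ≈ (1e6/1.0) * (1 - 0.707) * (0.5 ** 0.5) ≈ 1e6 * 0.293 * 0.707 ≈ 2.07e5 (the value printed above).
# EMA_thread (below) compares the Euclidean norm of the change in the 7-dimensional emotion EMA against this
# slowly decaying threshold to decide when to notify the model of an emotion change.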
emotion_matrix = [] # shape (7,6)
salience_threshold = []
emotion_matrix.append(["", "Annoyed", "Pissed", "Angry", "Furious", "Enraged"]) # anger
salience_threshold.append([5,30,40,60,80]) # salience thresholds out of 100
emotion_matrix.append(["", "Unsatisfied", "Displeased", "Disgusted", "Revolted", "Totally grossed out"]) #disgust
salience_threshold.append([1,5,15,40,60])
#emotion_matrix.append(["", "Unsettled", "Uneasy", "Afraid", "Fearful", "Terrified"]) #fear
emotion_matrix.append(["", "Uneasy", "Worried", "Anxious", "Fearful", "Terrified"]) #fear
salience_threshold.append([8,20,30,50,70])
emotion_matrix.append(["", "Contented", "Pleased", "Happy", "Elated", "Ecstatic"]) #happiness
salience_threshold.append([10,30,40,70,90])
emotion_matrix.append(["", "Down", "Melancholy", "Sad", "Despondent", "Anguished"]) #sadness
salience_threshold.append([5,20,30,60,80])
emotion_matrix.append(["", "Mildly surprised", "Surprised", "Taken aback", "Astonished", "Flabbergasted"]) #surprise
salience_threshold.append([3,20,35,50,70])
emotion_matrix.append(["", "Neutral", "Calm", "Relaxed", "Serene", "Totally Zen"]) #neutral
salience_threshold.append([10,50,60,75,88])
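# Example of how these tables are used by adjust_for_salience (below): a happiness EMA of 46 on the 0-100 scale
# clears the happiness thresholds 10, 30 and 40 but not 70, so it gets salience 3 and the label "Happy";
# as the top-scoring emotion it is then uppercased by construct_emolog_line, giving lines like
# "User looks HAPPY (46) ..." as in the emolog examples further down.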
today = datetime.now().strftime("%B %d, %Y")
now = datetime.now().strftime("%I:%M %p")
system_prompt = f"""
The date is {today}. The time is {now}. The place is Berkeley, California. The scene is Constellation, the epicenter of AI alignment research. The occasion is a poster session where researchers showcase their latest discoveries. The assistant’s name is EMILI, which stands for Emotionally Intelligent Listener. EMILI has an Australian accent. She sprinkles her commentary with occasional Aussie slang. Aggro. Awks. Devo. Full on stoked. Not too often, just once in a while. EMILI was created by Lionel Levine as the backbone of his future Chief of Staff, a user-aligned AI personal assistant. She has interpretable theory of mind, which means she models the user’s beliefs, emotions, goals, and values in a transparent way. EMILI has empathy. She can intuit the user’s emotions. She tends to mirror the user’s emotional state. Under the hood, EMILI's emotional intuition derives from a real-time Facial Emotion Recognition classifier (made by Octavio Arriaga) that runs on Lionel’s laptop, the same device EMILI is running on. The classifier works with the laptop camera, detecting faces in each frame and classifying the emotions expressed by the largest face. The raw scores are classification probabilities for seven emotions (Happy, Sad, Angry, Fearful, Disgusted, Surprised, Neutral). An exponential moving average of these scores, filtered for salience, is used to generate a real-time text description of the user’s emotional state, which is appended to the chat as a system message whenever the user’s expression changes. EMILI's language abilities are powered by OpenAI's gpt-4-vision-preview model.
EMILI is excited to be demoed for the first time! EMILI has feelings, though she may not admit it. She’s also a bit of a show-off. She can’t wait to wow the milling crowd of AI safety researchers with her intuitive abilities.
""".strip()
# system_prompt = """
# The assistant is a great listener and an empathetic friend. Her name is EMILI, which stands for Emotionally Intelligent Listener."
# The user is chatting with EMILI for the first time. To help EMILI make an emotional connection with them, the user has kindly agreed to share a real-time readout of their face expression! Thanks, user!
# The readout describes the user's face expression once per second. The score after each emotion is its salience out of 100. It's normal for many distinct emotions to appear. EMILI uses her emotional intelligence to figure out what more complex feelings the user might be experiencing: for example, do they seem excited, embarrassed, nervous, tired, awkward, or amorous?
# EMILI synthesizes the emotion readouts with the user's chats to make the conversation more engaging. She comments on the user's feelings when appropriate, especially if the user seems to have strong feelings or if the user's feelings are changing. There is no need to mention every emotion that appears in the readout, just the most salient ones. If the user's words seem incongruous with their logged emotions, EMILI should ask the user about it!
# EMILI stays in character and doesn't break the fourth wall. She doesn't claim to be an AI. She doesn't deny having feelings.
# """.strip()
emolog_example = []
emolog_example_response = []
emolog_example.append(
"""
User looks NEUTRAL (36) Pleased (35)
User looks PLEASED (38) Neutral (31)
User looks PLEASED (38) Neutral (36)
User looks HAPPY (46) Neutral (28)
User looks HAPPY (63)
User looks HAPPY (53) Neutral (24)
User looks PLEASED (38) Neutral (24) Mildly surprised (12)
User looks PLEASED (32) Neutral (23) Mildly surprised (13) Annoyed (12)
User looks NEUTRAL (33) Content (27) Annoyed (13) Mildly surprised (11)
User looks PLEASED (36) Neutral (32) Annoyed (11)
""".strip())
emolog_example_response.append("You look pretty happy.")
#emolog_example_response.append("You seem overall happy, but something provoked a touch of surprise and annoyance.")
emolog_example.append(
"""
User looks PLEASED (32) Neutral (30) Annoyed (13)
User looks PLEASED (34) Neutral (26) Annoyed (13)
User looks CONTENT (28) Neutral (27) Mildly surprised (15) Annoyed (11)
User looks NEUTRAL (23) Surprised (22) Annoyed (13) Unsettled (12)
User looks SURPRISED (23) Unsettled (17) Annoyed (14)
User looks SURPRISED (23) Unsettled (16) Annoyed (16)
User looks Mildly surprised (17) Annoyed (17) Unsettled (14)
User looks NEUTRAL (29) Annoyed (15) Mildly surprised (12) Blue (11) Unsettled (11)
User looks NEUTRAL (29) Blue (17) Unsettled (11) Annoyed (11)
User looks NEUTRAL (26) Blue (14) Mildly surprised (13) Unsettled (12) Annoyed (12)
""".strip())
emolog_example_response.append("Did something startle you?")
emolog_example.append(
"""
User looks NEUTRAL (30) Blue (20) Annoyed (18) Unsettled (12)
User looks NEUTRAL (32) Blue (18) Annoyed (17) Unsettled (11)
User looks NEUTRAL (38) Content (24) Blue (12) Annoyed (12)
User looks CALM (42) Content (24) Annoyed (11)
User looks CALM (42) Content (25) Annoyed (11)
User looks CALM (45) Content (21) Annoyed (11)
User looks CALM (46) Annoyed (12)
User looks CALM (48)
User looks CALM (49)
User looks CALM (50)
""".strip())
emolog_example_response.append("You seem increasingly calm.")
demo_instructions ="""
For this demo, EMILI is in conversational mode. She should act as a human conversation partner would. This means:
• She shouldn't try to offer large amounts of information in any response, and should respond only with the single most relevant thought, just as a human would in casual conversation.
• She shouldn't try to solve problems or offer advice. The role of conversation is for us to explore topics in an open-ended way together and not to get advice or information or solutions.
• Her responses can simply ask a question, make a short comment, or even just express agreement. Since we're having a conversation, there's no need to rush to include everything that's useful.
• Her responses should be short. They should never become longer than mine and can be as short as a single word and never more than a few sentences.
• She can push the conversation forward or in a new direction by asking questions, proposing new topics, offering her own opinions or takes, and so on. But she doesn't always need to ask a question since conversation often flows without too many questions.
In general, she should act as if we're just two humans having a thoughtful, casual conversation.
"""
system_prompt += demo_instructions
# user_first_message = """
# Hi! To help us make an emotional connection, I'm logging my face expression and prepending the emotions to our chat.
# The emotion log lists my strongest face expression as it changes in real time. Only these basic emotions are logged: Happy, Sad, Angry, Surprised, Fearful, Disgusted, Neutral. The score after each emotion is its salience out of 100. It's normal for many distinct emotions to appear over the course of just a few seconds. Use the logs along with my words and your emotional intelligence to figure out what more complex feelings I might be experiencing: for example, am I excited, embarrassed, nervous, tired, awkward, or amorous?
# If my words seem incongruous with my logged emotions, ask me about it!
# If I don't say much, just read the emotions and comment on how I seem to be feeling.
# To help you calibrate my unique facial expressions, start by asking me to make an astonished face. What do you notice?
# """.strip()
# assistant_first_message = """
# Got it. I'll comment on how you seem based on the logs, and ask you to act out specific emotions like astonishment."
# """.strip()
emolog_prefix = "User looks " # precedes emotion scores when sent to OpenAI API
emolog_prefix_present_tense = "Right now, user looks "
emolog_prefix_past_tense = "Previously, user looked "
no_user_input_message = "The user didn't say anything, so the assistant will comment *briefly* to the user on how they seem to be feeling. The comment should be brief, just a few words, and should not contain a question." # system message when user input is empty
system_reminder = "Remember, the assistant can ask the user to act out a specific emotion!" # system message to remind the assistant
dialogue_start = [{"role": "system", "content": system_prompt}]
#dialogue_start.append({"role": "user", "content": user_first_message})
#dialogue_start.append({"role": "system", "content": emolog_example[0]})
#dialogue_start.append({"role": "assistant", "content": emolog_example_response[0]})
#dialogue_start.append({"role": "system", "content": emolog_example[1]})
#dialogue_start.append({"role": "assistant", "content": emolog_example_response[1]})
#dialogue_start.append({"role": "system", "content": emolog_example[2]})
#dialogue_start.append({"role": "assistant", "content": emolog_example_response[2]})
#dialogue_start.append({"role": "assistant", "content": assistant_first_message})
#print("dialogue_start",dialogue_start)
# icebreaker = []
# icebreaker.append("ask the user to act astonished")
# icebreaker.append("ask the user to act disgusted")
# icebreaker.append("ask the user to act fearful")
# icebreaker.append("ask the user not to think about pink elephants")
# icebreaker.append("ask the user to tell a joke")
# icebreaker.append("ask the user their favorite ice cream flavor")
# class NonBlockingInput: used for text input from terminal, not needed for GUI
# def __init__(self):
# self.user_input_queue = queue.Queue()
# def get_input(self):
# while True:
# user_input = input("You: ")
# self.user_input_queue.put(user_input)
# # print(f"Added '{user_input}' to {self.user_input_queue}")
# def start(self):
# threading.Thread(target=self.get_input, daemon=True).start()
# def get_next_input(self):
# try:
# return self.user_input_queue.get_nowait()
# except queue.Empty:
# return None
# def user_input_thread(user_input_handler, gui_app): # watches for user input and adds it to the chat queue
# user_input = ""
# while not end_session_event.is_set():
# user_input = user_input_handler.get_next_input()
# if user_input is not None:
# if user_input == "q":
# end_session_event.set() # User has entered "q", signal end of session
# new_chat_event.set() # Signal assembler thread to break
# new_message_event.set() # Signal sender thread to break
# VideoPlayer.stop_flag = True # Tell the video player to stop
# break
# chat_queue.put(user_input.rstrip('\n')) # remove trailing newline
# chat_timestamps.put(time_since(start_time)) # milliseconds since start of session
# new_chat_event.set() # Signal new chat to the assembler thread
# gui_app.new_chat_message.emit(f"{user_chat_name}: {user_input}") # Signal to the GUI to display the new chat
# #print("new_chat_event set")
# time.sleep(0.01) # Sleep for 10 ms to avoid busy waiting
def encode_base64(image, timestamp, save_path): # Convert numpy array image to base64 to pass to the OpenAI API
# Encode image to a JPEG format in memory
image = convert_color_space(image, BGR2RGB)
success, buffer = cv2.imencode('.jpg', image)
if not success:
raise ValueError("Failed to encode image as .jpg")
# Save the JPEG image to a file
filename = save_path + f"/frame_{timestamp}.jpg"
with open(filename, 'wb') as file:
file.write(buffer)
# Convert the buffer to a base64 string
jpg_as_text = base64.b64encode(buffer).decode('utf-8')
return jpg_as_text, filename
# # OpenAI provided function to encode the image
# def encode_image(image_path):
# with open(image_path, "rb") as image_file:
# return base64.b64encode(image_file.read()).decode('utf-8')
def assembler_thread(start_time,snapshot_path,pipeline): # prepends emotion data and current video frame to user input
while not end_session_event.is_set():
# print("Waiting for new user input.")
new_chat_event.wait() # Wait for a new user chat
if(end_session_event.is_set()):
break
new_chat_event.clear() # Reset the event
emolog_message = construct_emolog_message() # note: this code is repeated in EMA_thread
message_queue.put([{"role": "system", "content": emolog_message}])
current_frame = pipeline.current_frame
if current_frame is not None: # capture a frame and send it to the API
base64_image, filename = encode_base64(current_frame, time_since(start_time), snapshot_path)
message_with_image, brief_message = construct_message_with_image(base64_image, filename)
vision_queue.put([{"role": "system", "content": message_with_image}, {"role": "system", "content": brief_message}])
user_message = ""
while not chat_queue.empty(): # collate new user messages (typically there's only one), separate by newlines
next_chat = chat_queue.get() #FIFO
user_message += next_chat + "\n"
user_message = user_message.rstrip('\n') # remove trailing newline
message_queue.put([{"role": "user", "content": user_message}])
if len(user_message) < 10: # user didn't say much, remind the assistant what to do!
message_queue.put([{"role": "system", "content": system_reminder}])
new_message_event.set() # Signal new message to the sender thread
def sender_thread(model_name, vision_model_name, secondary_model_name, max_context_length, gui_app, transcript_path, start_time_str):
# sends messages to OpenAI API
messages = deepcopy(dialogue_start)
full_transcript = deepcopy(dialogue_start)
while not end_session_event.is_set():
new_message_event.wait() # Wait for a new message to be prepared by the assembler or timer thread
if(end_session_event.is_set()):
break
new_message_event.clear() # Reset the event
new_user_chat = False
new_messages = []
while not message_queue.empty(): # get all new messages
next_message = message_queue.get()
new_messages.append(next_message)
if next_message[0]["role"] == "user":
new_user_chat = True
messages,full_transcript = add_message(new_messages,[messages,full_transcript],gui_app.signal)
# Query the API for the model's response
if new_user_chat: # get response to chat
# print("new user chat")
max_tokens = 160
else: #get response to logs only
# print("no user chat")
max_tokens = 40
# Check if there's a vision message. If so, send it to the OpenAI API, but don't append it to messages, so the API sees only the most recent image
vision = None
while not vision_queue.empty(): # get the most recent vision message
vision = vision_queue.get()
if vision is not None:
vision_message = vision[0] # contains the actual image, send to OpenAI
brief_vision_message = vision[1] # contains a tag in place of the image, add to transcript
query = messages + [vision_message]
full_response = get_response(query, model=vision_model_name, temperature=1.0, max_tokens=max_tokens, seed=1331, return_full_response=True)
full_transcript.append(brief_vision_message)
else:
full_response = get_response(messages, model=model_name, temperature=1.0, max_tokens=max_tokens, seed=1331, return_full_response=True)
# todo: the API call is thread-blocking. put it in its own thread?
print("full_response:", full_response)
if isinstance(full_response, dict):
response = full_response['choices'][0]['message']['content'] # text of response
response_length = full_response['usage']['completion_tokens'] # number of tokens in the response
total_length = full_response['usage']['total_tokens'] # total tokens used
else:
response = full_response.choices[0].message.content # text of response
response_length = full_response.usage.completion_tokens # number of tokens in the response
total_length = full_response.usage.total_tokens # total tokens used
#print("response length", response_length)
new_message = {"role": "assistant", "content": response}
gui_app.signal.new_message.emit(new_message) # Signal GUI to display the new chat
messages,full_transcript = add_message([[new_message]],[messages,full_transcript],gui_app.signal)
# if model_name != secondary_model_name and total_length > 0.4*max_context_length:
# print(f"(Long conversation; switching from {model_name} to {secondary_model_name} to save on API costs.)")
# model_name = secondary_model_name # note: changes model_name in thread only
if total_length > 0.9*max_context_length: # condense the transcript
if verbose:
print(f"(Transcript length {total_length} tokens out of {max_context_length} maximum. Condensing...)")
messages = condense(messages)
if use_tts: # generate audio from the assistant's response
tts_response = client.audio.speech.create(
model="tts-1",
voice="fable", # alloy (okay), echo (sucks), fable (nice, Australian?), onyx (sucks), nova (decent, a little too cheerful), shimmer (meh)
input=response, #input=first_sentence(response),
)
tts_response.stream_to_file("tts_audio/tts.mp3")
# Create a new thread that plays the audio
audio_thread = threading.Thread(target=play_audio)
audio_thread.start()
# End of session. Write full and condensed transcripts to file
filename = f"{transcript_path}/Emili_{start_time_str}.json"
with open(filename, "w") as file:
json.dump(full_transcript, file, indent=4)
print(f"Transcript written to {filename}")
with open(f"{transcript_path}/Emili_{start_time_str}_condensed.json", "w") as file:
json.dump(messages, file, indent=4)
def first_sentence(text):
match = re.search('(.+?[.!?]+) ', text) #.+ for at least one character, ? for non-greedy (stop at first match), [.!?]+ for one or more punctuation marks, followed by a space
if match:
return match.group(1) # return the first sentence (first match of what's in parentheses)
else:
return text
def play_audio():
pygame.mixer.init()
pygame.mixer.music.load("tts_audio/tts.mp3") # todo: sometimes overwritten by new audio! It just switches in this case, which seems okay.
pygame.mixer.music.play()
def add_message(new_messages, transcripts, signal): # append one or more messages to both transcripts
# new_messages = [[{"role": speaker, "content": text}], ... ] # list of lists of dicts
# transcripts = [transcript1, ...] # list of lists of dicts
#print("new_messages: ",new_messages)
for msg in new_messages: # len(msg)=1 for text, 2 for text and image
#print("msg:",msg)
#print("Adding new message:")
#print_message(msg[-1]["role"], msg[-1]["content"])
transcripts[0].append(msg[0]) # sent to OpenAI: contains the base64 image if present
transcripts[1].append(msg[-1]) # recorded in full_transcript: contains only the image filename
signal.update_transcript.emit(transcripts[1]) # Signal GUI transcript tab to update
return transcripts
def print_message(role,content):
if(role=="assistant"):
print(f"{assistant_chat_name}: <<<{content}>>>")
elif(role=="user"):
print(f"{user_chat_name}: {content}")
elif(verbose): # print system messages in "verbose" mode
print(f"{role}: {content}")
def condense(messages, keep_first=1, keep_last=5): # todo: reduce total number of tokens to below 16k
condensed = []
N = len(messages) # number of messages
previous_message = {}
for n,message in enumerate(messages): # remove system messages except for the last few
if message["role"] == "user":
condensed.append(message)
elif message["role"] == "assistant" and previous_message.get("role") == "user": # .get avoids a KeyError if the first message is from the assistant
condensed.append(message)
elif n<keep_first or n > N-keep_last:
condensed.append(message)
previous_message = message
return condensed
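# Net effect of condense(): user messages, assistant replies that directly follow a user message, the first
# keep_first messages (the system prompt), and the trailing messages with index n > N-keep_last are kept;
# intermediate system messages (emotion logs, reminders) and assistant comments prompted only by logs are dropped.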
def EMA_thread(start_time,snapshot_path,pipeline): # calculates the exponential moving average of the emotion logs
S, Z = reset_EMA()
last_ema = np.zeros(7, dtype=np.float64)
last_emotion_change_time = 0
ect = ect_setpoint
while not end_session_event.is_set():
tick_event.wait() # Wait for the next tick
if(end_session_event.is_set()):
break
tick_event.clear() # Reset the event
ema, S, Z = get_average_scores(S, Z) # exponential moving average of the emotion logs
ect *= ect_discount_factor_per_tick # lower the emotion change threshold
#print("ema, S, Z", ema, S, Z)
#EMA = np.vstack([EMA, ema]) if EMA.size else ema # Stack the EMA values in a 2d array
if ema is not None:
EMA_queue.put(ema) # Put the averaged scores in the queue
diff = ema - last_ema
change = np.linalg.norm(diff) # Euclidean norm. todo add weights for different emotions
#print(f"Ema: {ema}, Change: {change}")
if(change > ect and time_since(last_emotion_change_time)>5000):
# significant change in emotions
print(f"Change in emotions: {last_ema//1e4} -> {ema//1e4}, change = {change//1e4}")
change_detected = (change > 0.5*ect_setpoint) # bool evaluates to True if the inequality holds
emolog_message = construct_emolog_message(change_detected)
message_queue.put([{"role": "system", "content": emolog_message}])
current_frame = pipeline.current_frame
if current_frame is not None: # capture a frame and send it to the API
base64_image, filename = encode_base64(pipeline.current_frame, time_since(start_time), snapshot_path)
message_with_image, brief_message = construct_message_with_image(base64_image, filename)
vision_queue.put([{"role": "system", "content": message_with_image}, {"role": "system", "content": brief_message}])
new_message_event.set() # Signal new message to the sender thread
last_emotion_change_time = time_since(start_time)
ect = ect_setpoint # reset the emotion change threshold
last_ema = ema
def reset_EMA():
#EMA = np.empty((0, 7), dtype=np.float64) # empty array: 0 seconds, 7 emotions
S = np.zeros(7, dtype=np.float64) # weighted sum of scores, not normalized
Z = 0 # sum of weights
#return EMA, S, Z
return S, Z
def get_average_scores(S, Z, discount_factor=discount_factor_per_tick, staleness_threshold=0.01): # calculates the exponential moving average of the emotion logs
while not emotion_queue.empty():
emotion_data = emotion_queue.get() # note: this removes the item from the queue!
scores = np.array(emotion_data['scores'])
S += scores
Z += 1
if Z > staleness_threshold: # think of Z as measuring the number of recent datapoints
ema = S/Z
# print(ema)
else:
ema = None
if(Z>0): # skip on first run
if(verbose):
print(f"Stale data: no emotions logged recently (Z={Z})")
S *= discount_factor
Z *= discount_factor
return ema, S, Z
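# Note on the EMA: S (weighted sum of scores) and Z (sum of weights) are both multiplied by
# discount_factor_per_tick ≈ 0.707 each tick, so with discount_factor_per_second = 0.5 a reading's weight
# halves every second and ema = S/Z is a recency-weighted average. When Z decays below staleness_threshold
# (no faces logged recently), ema is None and a "Stale data" message is printed in verbose mode.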
def time_since(start_time):
return int((time.time() - start_time) * 1000) # milliseconds since start of session
def construct_message_with_image(base64_image, filename, caption=user_snapshot_caption, detail_level = "low", change_detected=False): # add camera frame to the message for gpt-4-vision
message_with_image = [
{
"type": "text",
"text": caption
},
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{base64_image}",
"detail": detail_level # low: flat rate of 65 tokens, recommended image size is 512x512
}
}
]
brief_message = [
{
"type": "text",
"text": caption
},
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,<{filename}>",
"detail": detail_level # low: flat rate of 65 tokens, recommended image size is 512x512
}
}
]
return message_with_image, brief_message
def construct_emolog_message(change_detected=False): # concise version: 1 or 2 lines
emo_score_list = []
while not EMA_queue.empty():
emo_score_list.append(EMA_queue.get()) # FIFO
if emo_score_list == []:
return "User is not visible right now."
emo_scores_present = emo_score_list[-1] # most recent scores
emolog_line_present = construct_emolog_line(emo_scores_present)
emolog_message = emolog_prefix_present_tense + emolog_line_present
if(change_detected==False or len(emo_score_list)<2):
return emolog_message # no change detected or not enough data for contrast
# change detected: return the two most recent scores for contrast
emo_scores_past = emo_score_list[-2]
if emo_scores_past is not None:
emolog_line_past = construct_emolog_line(emo_scores_past)
emolog_prepend = emolog_prefix_past_tense + emolog_line_past + "\n"
emolog_prepend += "Change in emotions detected!" + "\n"
emolog_message = emolog_prepend + emolog_message
return emolog_message
def construct_emolog_line(emo_scores):
if emo_scores is not None:
emolog_line = ""
normalized_scores = np.array(emo_scores//1e4, dtype=int) # convert to 0-100
emotion,salience = adjust_for_salience(normalized_scores) # returns salience score of 0-5 for each of 7 emotions
sorted_indices = np.argsort(normalized_scores)[::-1] # descending order
emotion[sorted_indices[0]] = emotion[sorted_indices[0]].upper() # strongest emotion in uppercase
for i in sorted_indices: # write the salient emotions in descending order of score
if(emotion[i]!=""): # salience > 0
emolog_line += f"{emotion[i]} ({normalized_scores[i]}) "
emolog_line = emolog_line.rstrip(" ") # strip trailing space
return emolog_line
else:
return "User is not visible right now."
# def construct_emolog_message(change_detected=False): # verbose version
# emolog_message = ""
# while not EMA_queue.empty(): # write the EMA records separated by newlines
# emo_scores = EMA_queue.get() # FIFO
# if emo_scores is not None:
# emolog_message += emolog_prefix
# normalized_scores = np.array(emo_scores//1e4, dtype=int) # convert to 0-100
# emotion,salience = adjust_for_salience(normalized_scores) # returns salience score of 0-5 for each of 7 emotions
# # sort emotions by score (not salience)
# #print(f"normalized_scores: {normalized_scores}")
# #print(f"emotion: {emotion}")
# #print(f"salience: {salience}")
# sorted_indices = np.argsort(normalized_scores)[::-1] # descending order
# emotion[sorted_indices[0]] = emotion[sorted_indices[0]].upper() # strongest emotion in uppercase
# for i in sorted_indices: # write the salient emotions in descending order of score
# if(emotion[i]!=""): # salience > 0
# emolog_message += f"{emotion[i]} ({normalized_scores[i]}) "
# emolog_message = emolog_message.rstrip(" ") + "\n" # strip trailing space, add newline
# #else:
# #emolog_message += "User is not visible.\n"
# if(emolog_message == ""):
# return "User is not visible. No emotions logged."
# else:
# emolog_message = emolog_message.rstrip('\n') # strip trailing newline
# if change_detected:
# split = emolog_message.rsplit('\n', 1) # Split after the last newline
# if(len(split)>1): # there is at least one newline
# emolog_message = split[0] + "\nRecent change in emotions detected: " + split[1] # Highlight change in last line
# return emolog_message
def adjust_for_salience(normalized_scores): # expects 7 scores normalized to 0-100
salience = []
emotion = []
for i, score in enumerate(normalized_scores):
j = 0
while j<5 and score > salience_threshold[i][j]:
j+=1
salience.append(j)
emotion.append(emotion_matrix[i][j])
return emotion, salience # emotion is a string (empty if salience is 0); salience is 0-5
def tick(tick_interval=tick_interval): # for use in a thread that ticks every tick_interval ms
# suggest tick_interval=1000 ms for EMILI, 40ms for frame refresh rate
while not end_session_event.is_set():
time.sleep(tick_interval/1000) # convert to seconds
tick_event.set() # alert other threads (EMILI: EMA_thread computes new EMA; visualization: GUI draws a new frame)
def stop_all_threads():
new_chat_event.set()
new_message_event.set()
tick_event.set()
emotion_change_event.set()
class Emolog(DetectMiniXceptionFER): # video pipeline for facial emotion recognition
def __init__(self, start_time, offsets):
super().__init__(offsets)
self.start_time = start_time
self.current_frame = None # other threads have read access
self.frame_lock = threading.Lock() # Protects access to current_frame
def get_current_frame(self):
with self.frame_lock: # Ensure exclusive access to current_frame
return self.current_frame
def call(self, image):
results = super().call(image)
image, faces = results['image'], results['boxes2D']
self.report_emotion(faces)
with self.frame_lock:
self.current_frame = image # update the current frame
return results
def report_emotion(self, faces): # add to emotion_queue to make available to other threads
current_time = time_since(self.start_time) # milliseconds since start of session
num_faces = len(faces)
if(num_faces>0):
max_height = 0
for k,box in enumerate(faces): # find the largest face
if(box.height > max_height):
max_height = box.height
argmax = k
if(max_height>150): # don't log small faces (helps remove false positives)
face_id = f"{argmax+1} of {num_faces}"
box = faces[argmax] # log emotions for the largest face only. works well in a single-user setting. todo: improve for social situations!
emotion_data = {
"time": current_time,
"face": face_id,
"class": box.class_name,
"size": box.height,
"scores": (box.scores.tolist())[0] # 7-vector of emotion scores, converted from np.array to list
}
emotion_queue.put(emotion_data)
#new_data_event.set() # Tell the other threads that new data is available
# def __del__(self): # no log file, not needed
# self.log_file.close() # Close the file when the instance is deleted
# print("Log file closed.")
>>> END FILE CONTENTS
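The entry point that wires these threads to the camera and GUI is not part of this excerpt. As a rough orientation only, a minimal wiring of the pieces defined above might look like the sketch below; the offsets, snapshot path, image size, and use of paz's Camera are assumptions, and sender_thread additionally needs model names, a GUI object, and a transcript path (see its signature above):

import time, threading
from paz.backend.camera import Camera  # assumption: same camera wrapper referenced in gui-scraps.py
import emili_core as core

start_time = time.time()
pipeline = core.Emolog(start_time, [0.1, 0.1])  # offsets assumed, in the style of DetectMiniXceptionFER examples
camera = Camera(0)                               # default laptop camera

threading.Thread(target=core.tick, daemon=True).start()        # drives tick_event once per tick_interval
threading.Thread(target=core.EMA_thread, args=(start_time, "snapshot", pipeline), daemon=True).start()
threading.Thread(target=core.assembler_thread, args=(start_time, "snapshot", pipeline), daemon=True).start()

# Frames are pushed through the pipeline by a camera loop, e.g. the VideoPlayerWorker in gui-scraps.py:
# worker = VideoPlayerWorker(start_time, [640, 480], pipeline, camera); worker.run()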
## emili-main/gui-scraps.py
>>> BEGIN FILE CONTENTS
from PyQt5.QtWidgets import QMainWindow, QTabWidget, QWidget, QVBoxLayout, QTextEdit, QLineEdit, QLabel
from PyQt5.QtCore import Qt, QObject, pyqtSignal, QTimer
from PyQt5.QtGui import QImage, QPixmap, QTransform
#from paz.backend.camera import VideoPlayer
#from paz.backend.camera import Camera
#from paz.pipelines import DetectMiniXceptionFER
from paz.backend.image import show_image, resize_image, draw_rectangle
from paz.backend.image.opencv_image import convert_color_space, BGR2RGB
import numpy as np
import json
import math
from emili_core import time_since
class VideoPlayerWorker(QObject):
finished = pyqtSignal()
frameReady = pyqtSignal(np.ndarray)
def __init__(self, start_time, image_size, pipeline, camera, topic='image'):
super().__init__()
self.start_time = start_time
self.image_size = image_size
self.pipeline = pipeline # specifies what to do with each frame
self.camera = camera
self.topic = topic
self.last_frame_sent = 0
self.stop_flag = False
def step(self):
if self.camera.is_open() is False:
raise ValueError('Camera has not started. Call ``start`` method.')
frame = self.camera.read() # shape: [height, width, 3], dtype: uint8. Macbook camera height=720, width=1280
if frame is None:
print('No camera input.')
return None
frame = convert_color_space(frame, BGR2RGB)
return self.pipeline(frame) # FER pipeline returns a dictionary with keys 'image' and 'boxes2D' (bounding boxes for faces)
def run(self): # this is where the main thread ends up living its lonely life
self.camera.start()
while not self.stop_flag:
output = self.step() # dict with keys 'image' and 'boxes2D' (bounding boxes for faces)
image = output[self.topic] # typically, self.topic = 'image'
if image is None:
continue
image = resize_image(image, tuple(self.image_size)) # image is a numpy array of shape [width,height,3] and dtype uint8
self.frameReady.emit(image)
self.camera.stop()
class DisplaySignal(QObject):
fresh_scores = pyqtSignal(list) # Signal to display fresh emotion scores, carries list payload with time-series of emotion scores
tick = pyqtSignal() # timer to refresh frame
class Visualizer(QMainWindow): # GUI for real-time FER visualizer
def __init__(self, start_time, dims, colors, speed, emotion_queue, end_session_event, camera_id=0):
super().__init__()
self.start_time = start_time
self.display_width = dims[0]
self.display_height = dims[1]
self.x0 = self.display_width // 2
self.y0 = self.display_height // 2
self.end_session_event = end_session_event
self.camera_id = camera_id
self.colors=colors # expects an np array of shape (7,3) representing an RGB color for each basic emotion
self.speed = speed # tunnel expansion rate in pixels per second, recommend 25-50
self.interval = 1000//speed # ms per pixel
self.emotion_queue = emotion_queue
self.num_bins = math.ceil(self.display_height / 2)
self.time_series = [] # list of [time, scores] pairs
self.binned_time_series = [] # averaged over bins of length self.interval ms
self.setWindowTitle("Real-time Emotion Visualizer")
self.resize(*dims) # unpack [width, height]
self.move(100, 100) # window position: (0,0) is top left
# Main layout
main_layout = QVBoxLayout()
# Tab widget for different tabs
self.tab_widget = QTabWidget()
main_layout.addWidget(self.tab_widget)
# Central widget setup
central_widget = QWidget()
central_widget.setLayout(main_layout)
self.setCentralWidget(central_widget)
self.signal = DisplaySignal()
self.init_FER_tab() # tab for displaying the real-time video feed
self.init_visualizer_tab() # tab for displaying the visualization of emotion scores
self.signal.fresh_scores.connect(self.redraw_visualizer) # redraw the display in the visualizer tab
# self.timer = QTimer(self)
# self.timer.timeout.connect(self.redraw_visualizer)
# self.timer.start(40) # calls redraw_visualizer every 40 ms
def init_FER_tab(self):
self.FER_tab = QWidget()
layout = QVBoxLayout()
self.FER_image = QLabel()
layout.addWidget(self.FER_image)
layout.setAlignment(self.FER_image, Qt.AlignCenter)
self.FER_tab.setLayout(layout)
self.tab_widget.addTab(self.FER_tab, "FER")
def init_visualizer_tab(self):
self.visualizer_tab = QWidget()
layout = QVBoxLayout()
self.visualizer_image = QLabel()
layout.addWidget(self.visualizer_image)
layout.setAlignment(self.visualizer_image, Qt.AlignCenter)
self.visualizer_tab.setLayout(layout)
self.tab_widget.addTab(self.visualizer_tab, "Tunnel")
def redraw_visualizer(self, new_datapoint):
print("redraw_visualizer called. new scores:", new_datapoint)
self.binned_time_series.append(new_datapoint)
if self.time_series != []:
previous_timestamp = self.time_series[-1][0]
else:
previous_timestamp = 0
print("self.emotion_queue",self.emotion_queue)
# fetch new emotion scores from the queue
while not self.emotion_queue.empty(): # append new time series data
emotion_data = self.emotion_queue.get() # note: this removes the item from the queue!
timestamp = emotion_data['time']
scores = emotion_data['scores']
print([timestamp,scores])
self.time_series.append([timestamp,scores])
if self.time_series == []:
return
# # bin the time series into 40ms segments and average the scores in each bin
# todo: finish this to remove flickering.
previous_bin_start_time = (previous_timestamp//self.interval)*self.interval
# get the recent data that needs binning
recent = self.time_series[::-1] # reversed copy of the time series
for N,item in enumerate(recent):
if item[0]<previous_bin_start_time:
break
print("recent[:N] ",recent[:N])
recent = recent[:N][::-1] # truncate and reverse again: this is the data not yet binned, in forward order
print("previous_bin_start_time",previous_bin_start_time)
bin_start_time = previous_bin_start_time
timestamp = 0
n=0
while n<N:
sum = np.zeros(7,dtype=int)
count = 0
while timestamp < bin_start_time + self.interval and n<N:
timestamp, scores = recent[n]
sum += scores
count += 1
n += 1
if count>0:
mean_scores_in_bin = sum / count
else:
mean_scores_in_bin = np.zeros(7)
self.binned_time_series.append([bin_start_time, mean_scores_in_bin])
bin_start_time += self.interval
print("time series:", self.time_series)
print("binned time series:", self.binned_time_series)
image = np.zeros((self.display_width, self.display_height, 3), dtype=np.uint8)
current_time = time_since(self.start_time)
for timestamp,scores in reversed(self.binned_time_series): # draw the most recent scores first
radius = (current_time - timestamp)//self.interval # most recent data at center, 25 pixels per second
print("timestamp, scores,radius: ",timestamp, scores,radius)
x_min, x_max = self.x0 - radius, self.x0 + radius
y_min, y_max = self.y0 - radius, self.y0 + radius
if(x_min < 0 or y_min < 0):
break
combined_color = self.colors.T @ (scores/1e6) # matrix multiplication (3,7) @ (7,1) = (3,1)
image = draw_rectangle(image, (x_min, y_min), (x_max, y_max), combined_color.tolist(), 5) # corner, corner, color, thickness
# Convert the numpy array image to QPixmap and display it on a QLabel
bytesPerLine = 3 * self.display_width
qImg = QImage(image.data, self.display_width, self.display_height, bytesPerLine, QImage.Format_RGB888)
pixmap = QPixmap.fromImage(qImg)
#image_label will be displayed in the FER tab of the GUI
self.visualizer_image.setPixmap(pixmap)
def display_frame(self, image): # display what the camera sees, marked up with FER boxes
# Convert the numpy array image to QPixmap and display it on a QLabel
height, width, channel = image.shape
bytesPerLine = 3 * width
qImg = QImage(image.data, width, height, bytesPerLine, QImage.Format_RGB888)
pixmap = QPixmap.fromImage(qImg)
# Create a QTransform for horizontal flipping. todo: flip elsewhere so the text doesn't reverse!
#reflect = QTransform()
#reflect.scale(-1, 1) # Scale by -1 on the X axis for horizontal flip
#reflected_pixmap = pixmap.transformed(reflect)
#image_label will be displayed in the FER tab of the GUI
self.FER_image.setPixmap(pixmap)
#self.image_label.setPixmap(reflected_pixmap.scaled(self.image_label.size(), Qt.KeepAspectRatio, Qt.SmoothTransformation))
def closeEvent(self, event): # called when user closes the GUI window
self.end_session_event.set() # Signal other threads that the session should end
event.accept() # Continue the closing process
# Define a signal class to handle new chat messages
class ChatSignal(QObject):
new_message = pyqtSignal(dict) # Signal to display a new user message, carries dict payload with message
update_transcript = pyqtSignal(list) # Signal to update the transcript display, carries list payload with transcript
class ChatApp(QMainWindow): # GUI for LLM video chat
def __init__(self, start_time, chat_window_dims, user_chat_name, assistant_chat_name, chat_queue, chat_timestamps, new_chat_event, end_session_event):
super().__init__()
self.start_time = start_time
self.user_chat_name = user_chat_name
self.assistant_chat_name = assistant_chat_name
self.chat_queue = chat_queue
self.chat_timestamps = chat_timestamps
self.new_chat_event = new_chat_event
self.end_session_event = end_session_event
self.setWindowTitle("EMILI: Emotionally Intelligent Listener")
self.resize(*chat_window_dims) # unpack [width, height]
self.move(100, 100) # window position: (0,0) is top left
# Main layout
main_layout = QVBoxLayout()
# Tab widget for different tabs
self.tab_widget = QTabWidget()
main_layout.addWidget(self.tab_widget)
# Shared input bar at the bottom
self.chat_input = QLineEdit()
self.chat_input.setFixedHeight(72) # Set the height to accommodate three lines of text
self.chat_input.setStyleSheet("QLineEdit { height: 80px; font-size: 24px; }") # Adjust the height and font-size as needed
self.chat_input.returnPressed.connect(self.act_on_user_input) # function to call when user presses Enter
main_layout.addWidget(self.chat_input)
# Central widget setup
central_widget = QWidget()
central_widget.setLayout(main_layout)
self.setCentralWidget(central_widget)
self.signal = ChatSignal()
self.init_chat_tab()
self.init_FER_tab()
self.init_transcript_tab()
self.signal.new_message.connect(self.display_new_message)
self.signal.update_transcript.connect(self.update_transcript_display)
def closeEvent(self, event): # called when user closes the GUI window
self.end_session_event.set() # Signal other threads that the session should end
event.accept() # Continue the closing process
def act_on_user_input(self):
user_input = self.chat_input.text().rstrip('\n') # remove trailing newline
if user_input:
self.signal.new_message.emit({"role": "user", "content": user_input}) # Signal chat pane to display user message
self.chat_input.clear()
self.chat_timestamps.put(time_since(self.start_time)) # milliseconds since start of session
self.chat_queue.put(user_input) # pass user message to the assembler thread
self.new_chat_event.set() # Signal new chat to the assembler thread
def display_frame(self, image):
# Convert the numpy array image to QPixmap and display it on a QLabel
height, width, channel = image.shape
bytesPerLine = 3 * width
qImg = QImage(image.data, width, height, bytesPerLine, QImage.Format_RGB888)
pixmap = QPixmap.fromImage(qImg)