Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixing issue #362 #364

Merged
merged 7 commits into from
Dec 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions examples/Analyze_a_Video_Classification.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -282,7 +282,7 @@
],
"source": [
"model = genai.GenerativeModel(model_name=\"models/gemini-1.5-flash\", system_instruction=system_prompt)\n",
"response = model.generate_content([video_file])\n",
"response = model.generate_content([\"Please identify the animal(s) in this video\",video_file])\n",
"print(response.text)"
]
},
Expand All @@ -292,7 +292,9 @@
"id": "CYLXPx2lq45r"
},
"source": [
"As you can see, the model accurately named the animal and provided a correct Latin name. You can delete the video to prevent unnecessary data storage."
"As you can see, the model accurately named the animal and provided a correct Latin name. \n",
"\n",
"You can delete the video to prevent unnecessary data storage."
]
},
{
Expand Down
12 changes: 11 additions & 1 deletion gemini-2/live_api_starter.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@
# And to run this script, ensure the GOOGLE_API_KEY environment
# variable is set to the key you obtained from Google AI Studio.

# Pass "--mode screen" to stream your screen to the model instead of
# your camera feed.

import asyncio
import base64
import io
Expand All @@ -33,13 +36,20 @@
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--mode', type=str, default='camera', help='pixels to stream from', choices=['camera', 'screen'])
parser.add_argument(
"--mode",
type=str,
default="camera",
help="pixels to stream from",
choices=["camera", "screen"],
)
args = parser.parse_args()

from google import genai

if sys.version_info < (3, 11, 0):
import taskgroup, exceptiongroup

asyncio.TaskGroup = taskgroup.TaskGroup
asyncio.ExceptionGroup = exceptiongroup.ExceptionGroup

Expand Down
49 changes: 48 additions & 1 deletion gemini-2/websockets/live_api_starter.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@
# And to run this script, ensure the GOOGLE_API_KEY environment
# variable is set to the key you obtained from Google AI Studio.

# Pass "--mode screen" to stream your screen to the model instead of
# your camera feed.

import asyncio
import base64
import json
Expand All @@ -29,11 +32,24 @@
import cv2
import pyaudio
import PIL.Image
import mss
import argparse

# Command-line options: --mode selects which pixels are streamed, either
# "camera" (the default) or "screen". Arguments are parsed at import time.
parser = argparse.ArgumentParser()
parser.add_argument(
"--mode",
type=str,
default="camera",
help="pixels to stream from",
choices=["camera", "screen"],
)
args = parser.parse_args()

from websockets.asyncio.client import connect

if sys.version_info < (3, 11, 0):
import taskgroup, exceptiongroup

asyncio.TaskGroup = taskgroup.TaskGroup
asyncio.ExceptionGroup = exceptiongroup.ExceptionGroup

Expand All @@ -46,6 +62,8 @@
host = "generativelanguage.googleapis.com"
model = "gemini-2.0-flash-exp"

MODE = args.mode

api_key = os.environ["GOOGLE_API_KEY"]
uri = f"wss://{host}/ws/google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContent?key={api_key}"

Expand Down Expand Up @@ -119,6 +137,32 @@ async def get_frames(self):
# Release the VideoCapture object
cap.release()

def _get_screen(self):
    """Grab one screenshot and return it as a base64-encoded JPEG payload.

    Captures ``monitors[0]`` as reported by mss, encodes the pixels as JPEG
    and returns a dict with two keys: ``"mime_type"`` (always
    ``"image/jpeg"``) and ``"data"`` (the JPEG bytes, base64-encoded and
    decoded to ``str``).
    """
    # This method is called repeatedly, so the mss instance is scoped with a
    # context manager to release the capture backend after every grab
    # instead of leaking one instance per frame.
    with mss.mss() as sct:
        shot = sct.grab(sct.monitors[0])

    # Build the PIL image directly from the raw RGB bytes; this yields the
    # same pixels as encoding to PNG and decoding it again, without the
    # extra compression round-trip.
    img = PIL.Image.frombytes("RGB", shot.size, shot.rgb)

    image_io = io.BytesIO()
    img.save(image_io, format="jpeg")

    return {
        "mime_type": "image/jpeg",
        "data": base64.b64encode(image_io.getvalue()).decode(),
    }

async def get_screen(self):
    """Keep capturing the screen and queueing each capture for sending.

    Each capture runs in a worker thread via ``asyncio.to_thread`` so the
    blocking grab does not stall the event loop. After a capture the
    coroutine waits one second, then puts the frame on ``self.out_queue``.
    The loop ends if ``self._get_screen`` ever returns ``None``.
    """
    while (shot := await asyncio.to_thread(self._get_screen)) is not None:
        # Pause first, then enqueue: same ordering as the capture loop has
        # always used.
        await asyncio.sleep(1.0)
        await self.out_queue.put(shot)

async def send_realtime(self):
while True:
msg = await self.out_queue.get()
Expand Down Expand Up @@ -210,7 +254,10 @@ async def run(self):

tg.create_task(self.send_realtime())
tg.create_task(self.listen_audio())
tg.create_task(self.get_frames())
if MODE == "camera":
tg.create_task(self.get_frames())
elif MODE == "screen":
tg.create_task(self.get_screen())
tg.create_task(self.receive_audio())
tg.create_task(self.play_audio())

Expand Down
Loading