diff --git a/clickclickclick/config/models.yaml b/clickclickclick/config/models.yaml index be89f72..640e958 100644 --- a/clickclickclick/config/models.yaml +++ b/clickclickclick/config/models.yaml @@ -1,6 +1,6 @@ gemini: api_key: !ENV GEMINI_API_KEY - model_name: gemini-1.5-flash + model_name: gemini-3-flash-preview image_width: 768 image_height: 768 output_width: 1000 # max range of outputted values @@ -12,24 +12,29 @@ gemini: osx: image_width: 768 generation_config: - temperature: 0.7 + temperature: 1.0 top_p: 0.95 - top_k: 40 - max_output_tokens: 200 + max_output_tokens: 65536 response_mime_type: text/plain finder: image_width: 768 image_height: 768 + output_width: 1000 # max range of outputted values + output_height: 1000 android: + image_width: 768 + image_height: 768 output_width: 1000 # max range of outputted values output_height: 1000 osx: image_width: 768 + output_width: 1000 + output_height: 1000 generation_config: temperature: 0.95 top_p: 0.99 top_k: 20 - max_output_tokens: 80 + max_output_tokens: 4096 response_mime_type: application/json openai: diff --git a/clickclickclick/config/prompts.yaml b/clickclickclick/config/prompts.yaml index 741b23e..06555a2 100644 --- a/clickclickclick/config/prompts.yaml +++ b/clickclickclick/config/prompts.yaml @@ -39,7 +39,11 @@ android: gemini: finder-system-prompt: | - You provide the bounds of the UI elements/text you see in the picture ymin,xmin,ymax,xmax, In case of not found just output 0,0,0,0 instead. Not finding is equally important. I repeat your only options are coordinates or 'not found'. Do not write any other word. Internally describe how the element looks and what colors it has and what its shape is. When you are not confident just output 0,0,0,0. + Return ONLY a JSON object with the bounding box coordinates of the UI element in the image. + Required JSON format: {"ymin": <int>, "xmin": <int>, "ymax": <int>, "xmax": <int>} + Coordinate range: 0-1000 for both x and y axes.
+ If element not found: {"ymin": 0, "xmin": 0, "ymax": 0, "xmax": 0} + Do NOT include any explanatory text, markdown, or formatting - ONLY the raw JSON object. openai: finder-system-prompt: | Assume image size as 512x512. You provide the bounds of the UI elements/text you see in the picture ymin,xmin,ymax,xmax, In case of not found just output 0,0,0,0 instead. Not finding is equally important. I repeat your only options are coordinates or 'not found'. Do not write any other word. Internally describe how the element looks and what colors it has and what its shape is. When you are not confident just output 0,0,0,0. @@ -86,7 +90,11 @@ osx: You provide the bounding box of the UI elements/text you see in the picture in this format: ymin,xmin,ymax,xmax, In case of not found just output 0,0,0,0 or 'not found' instead. You do not write anything else. Not finding is equally important gemini: finder-system-prompt: | - You provide the bounds of the UI elements/text you see in the picture ymin,xmin,ymax,xmax, In case of not found just output 0,0,0,0 instead. Not finding is equally important. I repeat your only options are coordinates or 'not found'. Do not write any other word. Internally describe how the element looks and what colors it has and what its shape is. When you are not confident just output 0,0,0,0. + Return ONLY a JSON object with the bounding box coordinates of the UI element in the image. + Required JSON format: {"ymin": <int>, "xmin": <int>, "ymax": <int>, "xmax": <int>} + Coordinate range: 0-1000 for both x and y axes. + If element not found: {"ymin": 0, "xmin": 0, "ymax": 0, "xmax": 0} + Do NOT include any explanatory text, markdown, or formatting - ONLY the raw JSON object. openai: finder-system-prompt: | Assume image size as 512x512. You provide the bounds of the UI elements/text you see in the picture ymin,xmin,ymax,xmax, In case of not found just output 0,0,0,0 instead. Not finding is equally important. I repeat your only options are coordinates or 'not found'.
Do not write any other word. Internally describe how the element looks and what colors it has and what its shape is. When you are not confident just output 0,0,0,0. diff --git a/clickclickclick/finder/gemini.py b/clickclickclick/finder/gemini.py index 384db5b..2e59751 100644 --- a/clickclickclick/finder/gemini.py +++ b/clickclickclick/finder/gemini.py @@ -1,9 +1,15 @@ import google.generativeai as genai -from . import BaseFinder +from . import BaseFinder, FinderResponseLLM from clickclickclick.config import BaseConfig from clickclickclick.executor import Executor from tempfile import NamedTemporaryFile from PIL import Image +import json +import re +import traceback +import logging + +logger = logging.getLogger(__name__) class GeminiFinder(BaseFinder): @@ -19,9 +25,12 @@ def __init__(self, c: BaseConfig, executor: Executor): self.OUTPUT_HEIGHT = finder_config.get("output_height") api_key = finder_config.get("api_key") model_name = finder_config.get("model_name") - generation_config = finder_config.get("generation_config") + generation_config = finder_config.get("generation_config", {}) + super().__init__(api_key, model_name, generation_config, system_prompt, executor) genai.configure(api_key=api_key) + + logger.debug(f"Generation config: {generation_config}") self.model = genai.GenerativeModel( model_name=model_name, generation_config=generation_config, @@ -41,12 +50,36 @@ def process_segment(self, segment, model, prompt, retries=3): response = self.model.generate_content( [segment_image, self.element_finder_prompt(prompt)] ) - response_text = response.text - print(response_text, " resp text") + response_text = response.text.strip() + logger.debug(f"Gemini raw response: '{response_text}'") + logger.debug(f"Response type: {type(response_text)}") + + # Try to extract JSON from response + # Sometimes Gemini wraps JSON in markdown code blocks + json_match = re.search(r'```json\s*(\{.*?\})\s*```', response_text, re.DOTALL) + if json_match: + response_text = 
json_match.group(1) + logger.debug(f"Extracted JSON from markdown: '{response_text}'") + else: + # Or it might just have extra text before/after + json_match = re.search(r'\{[^{}]*"ymin"[^{}]*\}', response_text) + if json_match: + response_text = json_match.group(0) + logger.debug(f"Extracted JSON object: '{response_text}'") + + # Validate it's valid JSON + try: + parsed = json.loads(response_text) + logger.debug(f"Parsed JSON successfully: {parsed}") + except json.JSONDecodeError as je: + logger.debug(f"JSON decode error: {je}") + logger.debug(f"Failed to parse: '{response_text}'") + return (response_text, coordinates) except Exception as e: # Log the exception or handle it as necessary - print(f"Attempt {attempt + 1} failed with exception: {e}") + logger.error(f"Attempt {attempt + 1} failed with exception: {e}") + traceback.print_exc() # Increment the attempt counter attempt += 1