instavm · Vasiliy-b · Jan 21, 2026 · Jan 21, 2026 · gemini-code-assist · Jan 21, 2026
diff --git a/clickclickclick/config/models.yaml b/clickclickclick/config/models.yaml
@@ -1,6 +1,6 @@
 gemini:
   api_key: !ENV GEMINI_API_KEY
-  model_name: gemini-1.5-flash
+  model_name: gemini-3-flash-preview
   image_width: 768
   image_height: 768
   output_width: 1000  # max range of outputted values
@@ -12,24 +12,29 @@ gemini:
     osx:
       image_width: 768
     generation_config:
-      temperature: 0.7
+      temperature: 1.0
       top_p: 0.95
-      top_k: 40
-      max_output_tokens: 200
+      max_output_tokens: 65536
       response_mime_type: text/plain
   finder:
     image_width: 768
     image_height: 768
+    output_width: 1000  # max range of outputted values
+    output_height: 1000
     android:
+      image_width: 768
+      image_height: 768
       output_width: 1000  # max range of outputted values
       output_height: 1000
     osx:
       image_width: 768
+      output_width: 1000
+      output_height: 1000
     generation_config:
       temperature: 0.95
       top_p: 0.99
       top_k: 20
-      max_output_tokens: 80
+      max_output_tokens: 4096
       response_mime_type: application/json
 
 openai:

diff --git a/clickclickclick/config/prompts.yaml b/clickclickclick/config/prompts.yaml
@@ -39,7 +39,11 @@ android:
 
   gemini:
     finder-system-prompt: |
-      You provide the bounds of the UI elements/text you see in the picture ymin,xmin,ymax,xmax, In case of not found just output 0,0,0,0 instead. Not finding is equally important. I repeat your only options are coordinates or 'not found'. Do not write any other word. Internally describe how the element looks and what colors it has and what its shape is. When you are not confident just output 0,0,0,0.
+      Return ONLY a JSON object with the bounding box coordinates of the UI element in the image.
+      Required JSON format: {"ymin": <int>, "xmin": <int>, "ymax": <int>, "xmax": <int>}
+      Coordinate range: 0-1000 for both x and y axes.
+      If element not found: {"ymin": 0, "xmin": 0, "ymax": 0, "xmax": 0}
+      Do NOT include any explanatory text, markdown, or formatting - ONLY the raw JSON object.
   openai:
     finder-system-prompt: |
       Assume image size as 512x512. You provide the bounds of the UI elements/text you see in the picture ymin,xmin,ymax,xmax, In case of not found just output 0,0,0,0 instead. Not finding is equally important. I repeat your only options are coordinates or 'not found'. Do not write any other word. Internally describe how the element looks and what colors it has and what its shape is. When you are not confident just output 0,0,0,0.
@@ -86,7 +90,11 @@ osx:
       You provide the bounding box of the UI elements/text you see in the picture in this format: ymin,xmin,ymax,xmax, In case of not found just output 0,0,0,0 or 'not found' instead. You do not write anything else. Not finding is equally important
   gemini:
     finder-system-prompt: |
-      You provide the bounds of the UI elements/text you see in the picture ymin,xmin,ymax,xmax, In case of not found just output 0,0,0,0 instead. Not finding is equally important. I repeat your only options are coordinates or 'not found'. Do not write any other word. Internally describe how the element looks and what colors it has and what its shape is. When you are not confident just output 0,0,0,0.
+      Return ONLY a JSON object with the bounding box coordinates of the UI element in the image.
+      Required JSON format: {"ymin": <int>, "xmin": <int>, "ymax": <int>, "xmax": <int>}
+      Coordinate range: 0-1000 for both x and y axes.
+      If element not found: {"ymin": 0, "xmin": 0, "ymax": 0, "xmax": 0}
+      Do NOT include any explanatory text, markdown, or formatting - ONLY the raw JSON object.
   openai:
     finder-system-prompt: |
       Assume image size as 512x512. You provide the bounds of the UI elements/text you see in the picture ymin,xmin,ymax,xmax, In case of not found just output 0,0,0,0 instead. Not finding is equally important. I repeat your only options are coordinates or 'not found'. Do not write any other word. Internally describe how the element looks and what colors it has and what its shape is. When you are not confident just output 0,0,0,0.

diff --git a/clickclickclick/finder/gemini.py b/clickclickclick/finder/gemini.py
@@ -1,9 +1,11 @@
 import google.generativeai as genai
-from . import BaseFinder
+from . import BaseFinder, FinderResponseLLM
 from clickclickclick.config import BaseConfig
 from clickclickclick.executor import Executor
 from tempfile import NamedTemporaryFile
 from PIL import Image
+import json
+import re
-import re
+import re
+import traceback
-import re
+import re
+import traceback
 
 
 class GeminiFinder(BaseFinder):
@@ -19,9 +21,12 @@ def __init__(self, c: BaseConfig, executor: Executor):
         self.OUTPUT_HEIGHT = finder_config.get("output_height")
         api_key = finder_config.get("api_key")
         model_name = finder_config.get("model_name")
-        generation_config = finder_config.get("generation_config")
+        generation_config = finder_config.get("generation_config", {})
+
         super().__init__(api_key, model_name, generation_config, system_prompt, executor)
         genai.configure(api_key=api_key)
+
+        print(f"DEBUG - Generation config: {generation_config}")
-        print(f"DEBUG - Generation config: {generation_config}")
+        logger.debug(f"DEBUG - Generation config: {generation_config}")
-        print(f"DEBUG - Generation config: {generation_config}")
+        logger.debug(f"DEBUG - Generation config: {generation_config}")
         self.model = genai.GenerativeModel(
             model_name=model_name,
             generation_config=generation_config,
@@ -41,12 +46,37 @@ def process_segment(self, segment, model, prompt, retries=3):
                     response = self.model.generate_content(
                         [segment_image, self.element_finder_prompt(prompt)]
                     )
-                    response_text = response.text
-                    print(response_text, " resp text")
+                    response_text = response.text.strip()
+                    print(f"DEBUG - Gemini raw response: '{response_text}'")
+                    print(f"DEBUG - Response type: {type(response_text)}")
+
+                    # Try to extract JSON from response
+                    # Sometimes Gemini wraps JSON in markdown code blocks
+                    json_match = re.search(r'```json\s*(\{.*?\})\s*```', response_text, re.DOTALL)
+                    if json_match:
+                        response_text = json_match.group(1)
+                        print(f"DEBUG - Extracted JSON from markdown: '{response_text}'")
+
+                    # Or it might just have extra text before/after
+                    json_match = re.search(r'\{[^{}]*"ymin"[^{}]*\}', response_text)
+                    if json_match:
+                        response_text = json_match.group(0)
+                        print(f"DEBUG - Extracted JSON object: '{response_text}'")
-                    json_match = re.search(r'```json\s*(\{.*?\})\s*```', response_text, re.DOTALL)
-                    if json_match:
-                        response_text = json_match.group(1)
-                        print(f"DEBUG - Extracted JSON from markdown: '{response_text}'")
-
-                    # Or it might just have extra text before/after
-                    json_match = re.search(r'\{[^{}]*"ymin"[^{}]*\}', response_text)
-                    if json_match:
-                        response_text = json_match.group(0)
-                        print(f"DEBUG - Extracted JSON object: '{response_text}'")
+                    json_match = re.search(r'```json\s*(\{.*?\})\s*```', response_text, re.DOTALL)
+                    if json_match:
+                        response_text = json_match.group(1)
+                        print(f"DEBUG - Extracted JSON from markdown: '{response_text}'")
+                    else:
+                        # Or it might just have extra text before/after
+                        json_match = re.search(r'\{[^{}]*"ymin"[^{}]*\}', response_text)
+                        if json_match:
+                            response_text = json_match.group(0)
+                            print(f"DEBUG - Extracted JSON object: '{response_text}'")
-                    json_match = re.search(r'```json\s*(\{.*?\})\s*```', response_text, re.DOTALL)
-                    if json_match:
-                        response_text = json_match.group(1)
-                        print(f"DEBUG - Extracted JSON from markdown: '{response_text}'")
-
-                    # Or it might just have extra text before/after
-                    json_match = re.search(r'\{[^{}]*"ymin"[^{}]*\}', response_text)
-                    if json_match:
-                        response_text = json_match.group(0)
-                        print(f"DEBUG - Extracted JSON object: '{response_text}'")
+                    json_match = re.search(r'```json\s*(\{.*?\})\s*```', response_text, re.DOTALL)
+                    if json_match:
+                        response_text = json_match.group(1)
+                        print(f"DEBUG - Extracted JSON from markdown: '{response_text}'")
+                    else:
+                        # Or it might just have extra text before/after
+                        json_match = re.search(r'\{[^{}]*"ymin"[^{}]*\}', response_text)
+                        if json_match:
+                            response_text = json_match.group(0)
+                            print(f"DEBUG - Extracted JSON object: '{response_text}'")
+
+                    # Check that the extracted text parses as JSON; a failure is only logged, the text is still returned
+                    try:
+                        parsed = json.loads(response_text)
+                        print(f"DEBUG - Parsed JSON successfully: {parsed}")
+                    except json.JSONDecodeError as je:
+                        print(f"DEBUG - JSON decode error: {je}")
+                        print(f"DEBUG - Failed to parse: '{response_text}'")
+
                     return (response_text, coordinates)
             except Exception as e:
                 # Log the exception or handle it as necessary
                 print(f"Attempt {attempt + 1} failed with exception: {e}")
+                import traceback
+                traceback.print_exc()
 
                 # Increment the attempt counter
                 attempt += 1