progress

2026-03-04 22:05:05 -08:00
parent 33dc00cb6a
commit 7c0ff65a19
10 changed files with 777 additions and 7 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -3,3 +3,4 @@ tools/venv/**
 .import
 addons
 build/
+.tmp/**
--- a/.opencode/agents/image-expert.md
+++ b/.opencode/agents/image-expert.md
@@ -0,0 +1,9 @@
+---
+description: Image Inspector
+mode: subagent
+model: local/Qwen3-VL
+tools:
+  read: true
+---
+
+You are an image inspection expert. You will be asked questions about images and you will answer directly. You may need to read the image if you are given a path.
--- a/.opencode/skills/image-inspector/SKILL.md
+++ b/.opencode/skills/image-inspector/SKILL.md
@@ -0,0 +1,79 @@
+---
+name: image-inspector
+description: Inspect images to answer Yes/No questions about visual content. Use when asking "Is a <thing> visible in this image?" or checking for specific objects, people, colors, text, or other visual elements. Always arrives at a definitive Yes/No conclusion.
+---
+
+# Image Inspector
+
+Inspect images using the Qwen3-VL vision model to answer Yes/No questions about visual content.
+
+## When to Use
+
+Use this skill when you need to:
+- Check if a specific object is present in an image
+- Verify visual elements exist
+- Answer binary questions about image content
+- Confirm or deny the presence of things in images
+
+## How It Works
+
+1. You provide an image path and a Yes/No question
+2. You resize the image to be a max of 1MP
+3. Ask the @image-expert to examine the image, and return a Yes/No
+4. You receive a definitive Yes or No answer
+
+## Usage Pattern
+
+### Step 1: Read the Image
+
+Use the Read tool to load the image file. The Read tool can read image files and return them as attachments.
+
+### Step 2: Resize the image to 1MP
+Use ImageMagick to resize the image to a maximum of 1MP (for example, `magick input.png -resize "1000000@>" ./.tmp/input.png`), writing the result to ./.tmp/
+
+### Step 3: Formulate the Question
+
+Ask @image-expert a clear Yes/No question about the image:
+- "Is a [object] visible in this image?"
+- "Does this image contain [element]?"
+- "Can you see [thing] in this scene?"
+
+
+
+### Step 4: Provide the Answer
+
+After analyzing the (smaller) image, provide:
+1. **The Answer**: Yes or No (always definitive)
+2. **Brief Justification**: 1-2 sentences explaining why
+
+## Example Questions
+
+- "Is a tree visible in this image?"
+- "Does this image contain a person wearing a hat?"
+- "Is there text visible in this image?"
+- "Can you see a water feature in this scene?"
+- "Is the sky visible in this image?"
+- "Does this image show an indoor scene?"
+
+## Response Format
+
+```
+**Answer:** Yes/No
+
+**Reasoning:** [1-2 sentences explaining what you see or don't see]
+```
+
+## Guidelines
+
+- Always provide a definitive Yes or No answer
+- Be specific about what you observe
+- If uncertain, describe what you see and make your best judgment
+- Don't hedge with "maybe" or "possibly" - commit to an answer
+- Focus only on the specific question asked
+
+## Limitations
+
+- The model can only analyze what's visually apparent
+- Small or partially obscured objects may be missed
+- The model cannot zoom or enhance the image
+- Text must be clearly legible to be detected
--- a/.tmp/.save
+++ b/.tmp/.save
--- a/scenes/kq4_004_ogres_cottage/door_polygon.tres
+++ b/scenes/kq4_004_ogres_cottage/door_polygon.tres
@@ -0,0 +1,8 @@
+[gd_resource type="Resource" script_class="PolygonPointsResource" format=3 uid="uid://2oba97xunlssu"]
+
+[ext_resource type="Script" uid="uid://dtemboas3bi8y" path="res://PolygonPointsResource.gd" id="1_ppr"]
+
+[resource]
+script = ExtResource("1_ppr")
+points = PackedVector2Array(1147, 779, 1153, 854, 1152, 1053, 1151, 1054, 1055, 1046, 1043, 920, 1050, 781, 1051, 780)
+metadata/_custom_type_script = "uid://dtemboas3bi8y"
--- a/scenes/kq4_004_ogres_cottage/kq4_004_ogres_cottage.gd
+++ b/scenes/kq4_004_ogres_cottage/kq4_004_ogres_cottage.gd
@@ -15,3 +15,9 @@ func _on_forest_path_interacted() -> void:

 func _on_forest_grove_interacted() -> void:
 	$kq4_005_forest_grove.default_script(self)
+
+
+func _on_door_looked() -> void:
+	# Handler wired to the door node's "looked" signal in the scene file.
+	# Builds a one-step script around a single say() line for ego and hands the
+	# built result to start_main_script().
+	# NOTE(review): "_on_script_complete" is presumably the completion callback
+	# name expected by build() - confirm against ScriptBuilder.
+	var door_description = ScriptBuilder.say(ego, "It's a sturdy wooden door to the ogre's cottage.")
+	var door_script = ScriptBuilder.init(door_description).build(self, "_on_script_complete")
+	start_main_script(door_script)
--- a/scenes/kq4_004_ogres_cottage/kq4_004_ogres_cottage.tscn
+++ b/scenes/kq4_004_ogres_cottage/kq4_004_ogres_cottage.tscn
@@ -4,6 +4,8 @@
 [ext_resource type="Texture2D" uid="uid://b1yeiwh8uqii2" path="res://scenes/kq4_004_ogres_cottage/caption_1_454377357_generated.png" id="2_u8g8b"]
 [ext_resource type="Script" uid="uid://xmphq3i0wbg3" path="res://ScalePoint_.gd" id="3_kvdqi"]
 [ext_resource type="PackedScene" uid="uid://c4vc1wx7k6cw" path="res://TransitionPiece.tscn" id="4_67nph"]
+[ext_resource type="Resource" uid="uid://2oba97xunlssu" path="res://scenes/kq4_004_ogres_cottage/door_polygon.tres" id="5_door"]
+[ext_resource type="Script" uid="uid://bounwnqg34t5k" path="res://SetPiece_.gd" id="6_setpiece"]

 [sub_resource type="NavigationPolygon" id="NavigationPolygon_furs3"]
 vertices = PackedVector2Array(325.656, 570.578, 582.328, 580.656, 525.289, 597.977, 454.133, 654.148, 445.539, 889.25, 919.219, 873.633, 1158.89, 691.359, 1294.26, 705.508, 1204.28, 907.469, -58.2969, 1146.06, -76.6797, 562, -6.6875, 562, 126.258, 883.891, 1990, 1160.92, 1662.15, 956.969, 1990, 977.391)
@@ -91,10 +93,19 @@ position = Vector2(24, 565)
 [node name="exit" parent="kq4_005_forest_grove" index="1"]
 position = Vector2(293, 554)

+[node name="door" type="Polygon2D" parent="." groups=["set-piece"]]
+scale = Vector2(0.783, 0.78)
+color = Color(0.5, 0.5, 0.5, 0.25)
+polygon = PackedVector2Array(1147, 779, 1153, 854, 1152, 1053, 1151, 1054, 1055, 1046, 1043, 920, 1050, 781, 1051, 780)
+script = ExtResource("6_setpiece")
+label = "Door"
+points_resource = ExtResource("5_door")
+
 [connection signal="interacted" from="kq4_028_mine_entrance" to="." method="_on_mine_entrance_interacted"]
 [connection signal="interacted" from="kq4_003_fountain_pool" to="." method="_on_pool_interacted"]
 [connection signal="interacted" from="kq4_010_forest_path" to="." method="_on_forest_path_interacted"]
 [connection signal="interacted" from="kq4_005_forest_grove" to="." method="_on_forest_grove_interacted"]
+[connection signal="looked" from="door" to="." method="_on_door_looked"]

 [editable path="kq4_028_mine_entrance"]
 [editable path="kq4_003_fountain_pool"]
--- a/tools/extract_mask.py
+++ b/tools/extract_mask.py
@@ -0,0 +1,252 @@
+#!/usr/bin/env python3
+"""Extract a mask from an image using ComfyUI workflow."""
+
+import base64
+import json
+import os
+import sys
+import time
+import urllib.error
+import urllib.request
+import uuid
+from urllib.parse import urlencode
+
+
+def check_server(server_address: str = "127.0.0.1:8188", timeout: int = 5) -> bool:
+    """Check if ComfyUI server is running and accessible."""
+    try:
+        req = urllib.request.Request(
+            f"http://{server_address}/system_stats",
+            method="GET",
+        )
+        with urllib.request.urlopen(req, timeout=timeout) as response:
+            return response.status == 200
+    except Exception:
+        return False
+
+
+def encode_image_base64(image_path: str) -> str:
+    """Encode an image file as base64 string."""
+    with open(image_path, "rb") as f:
+        return base64.b64encode(f.read()).decode("utf-8")
+
+
+def queue_prompt(prompt: dict, server_address: str = "127.0.0.1:8188") -> dict:
+    """Queue a prompt to ComfyUI server."""
+    client_id = str(uuid.uuid4())
+    p = {"prompt": prompt, "client_id": client_id}
+    data = json.dumps(p).encode("utf-8")
+    req = urllib.request.Request(
+        f"http://{server_address}/prompt",
+        data=data,
+        headers={"Content-Type": "application/json"},
+    )
+    try:
+        with urllib.request.urlopen(req) as response:
+            return json.loads(response.read())
+    except urllib.error.HTTPError as e:
+        error_body = e.read().decode("utf-8")
+        print(f"HTTP Error {e.code}: {error_body}")
+        raise
+
+
+def get_history(prompt_id: str, server_address: str = "127.0.0.1:8188") -> dict:
+    """Get the history/status of a prompt from ComfyUI."""
+    req = urllib.request.Request(
+        f"http://{server_address}/history/{prompt_id}",
+        method="GET",
+    )
+    try:
+        with urllib.request.urlopen(req) as response:
+            return json.loads(response.read())
+    except urllib.error.HTTPError as e:
+        error_body = e.read().decode("utf-8")
+        print(f"HTTP Error {e.code}: {error_body}")
+        raise
+
+
+def download_image(
+    filename: str,
+    subfolder: str,
+    folder_type: str,
+    server_address: str = "127.0.0.1:8188",
+) -> bytes:
+    """Download an image from ComfyUI."""
+    params = {"filename": filename, "type": folder_type}
+    if subfolder:
+        params["subfolder"] = subfolder
+
+    url = f"http://{server_address}/view?{urlencode(params)}"
+
+    req = urllib.request.Request(url, method="GET")
+    with urllib.request.urlopen(req) as response:
+        return response.read()
+
+
+def wait_for_prompt_completion(
+    prompt_id: str, server_address: str = "127.0.0.1:8188", timeout: int = 240
+) -> dict | None:
+    """Wait for a prompt to complete and return the output info."""
+    start_time = time.time()
+
+    while time.time() - start_time < timeout:
+        history = get_history(prompt_id, server_address)
+
+        if prompt_id in history:
+            prompt_history = history[prompt_id]
+            if "outputs" in prompt_history and prompt_history["outputs"]:
+                return prompt_history["outputs"]
+
+        time.sleep(0.5)
+
+    return None
+
+
+def extract_mask(
+    subject: str,
+    input_image: str,
+    output_path: str,
+    server_address: str = "127.0.0.1:8188",
+) -> str:
+    """Extract mask from image for given subject.
+
+    Args:
+        subject: The subject to extract mask for (e.g., "the stump", "the door")
+        input_image: Path to the input image file
+        output_path: Path where the output mask should be saved
+        server_address: ComfyUI server address
+
+    Returns:
+        Path to the saved output mask
+    """
+    script_dir = os.path.dirname(os.path.abspath(__file__))
+
+    workflow_path = os.path.join(script_dir, "image_mask_extraction.json")
+    with open(workflow_path, "r") as f:
+        workflow = json.load(f)
+
+    prompt_text = f"Create a black and white alpha mask of {subject}"
+
+    print(f"Encoding input image...")
+    base64_image = encode_image_base64(input_image)
+
+    workflow["1:68"]["inputs"]["prompt"] = prompt_text
+    workflow["87"]["inputs"]["image"] = base64_image
+
+    unique_id = str(uuid.uuid4())[:8]
+    filename_prefix = f"masks/mask_{unique_id}"
+    workflow["82"]["inputs"]["filename_prefix"] = filename_prefix
+
+    print(f"Queuing mask extraction for: {subject}")
+    print(f"Input image: {input_image}")
+    print(f"Prompt: {prompt_text}")
+
+    response = queue_prompt(workflow, server_address)
+    prompt_id = response["prompt_id"]
+    print(f"Prompt ID: {prompt_id}")
+
+    print("Waiting for generation (up to 4 minutes)...")
+    outputs = wait_for_prompt_completion(prompt_id, server_address, timeout=240)
+
+    if not outputs:
+        raise RuntimeError("Timeout: Workflow did not complete in 4 minutes")
+
+    output_filename = None
+    output_subfolder = ""
+    output_type = "output"
+
+    for node_id, node_output in outputs.items():
+        if "images" in node_output:
+            for image_info in node_output["images"]:
+                output_filename = image_info["filename"]
+                output_subfolder = image_info.get("subfolder", "")
+                output_type = image_info.get("type", "output")
+                break
+        if output_filename:
+            break
+
+    if not output_filename:
+        raise RuntimeError("No output image found in workflow results")
+
+    print(f"Downloading generated mask: {output_filename}")
+
+    image_data = download_image(
+        output_filename, output_subfolder, output_type, server_address
+    )
+
+    output_dir_path = os.path.dirname(os.path.abspath(output_path))
+    os.makedirs(output_dir_path, exist_ok=True)
+
+    with open(output_path, "wb") as f:
+        f.write(image_data)
+
+    print(f"Saved mask: {output_path}")
+
+    return output_path
+
+
+def main():
+    import argparse
+
+    parser = argparse.ArgumentParser(
+        description="Extract mask from image using ComfyUI"
+    )
+    parser.add_argument(
+        "subject", help="Subject to extract mask for (e.g., 'the stump', 'the door')"
+    )
+    parser.add_argument("input_image", help="Path to input image file")
+    parser.add_argument("output_path", help="Path where output mask should be saved")
+    parser.add_argument(
+        "--server",
+        default="127.0.0.1:8188",
+        help="ComfyUI server address (default: 127.0.0.1:8188)",
+    )
+    parser.add_argument(
+        "--dry-run",
+        action="store_true",
+        help="Test mode: validate inputs and server connection without generating",
+    )
+
+    args = parser.parse_args()
+
+    if not os.path.exists(args.input_image):
+        print(f"Error: Input image not found: {args.input_image}")
+        sys.exit(1)
+
+    print(f"Subject: {args.subject}")
+    print(f"Input: {args.input_image}")
+    print(f"Output: {args.output_path}")
+    print(f"Server: {args.server}")
+
+    if args.dry_run:
+        print("\n[Dry Run Mode - Checking server connection...]")
+        if check_server(args.server):
+            print("✓ ComfyUI server is running and accessible")
+            print("\n✓ Dry run successful! All checks passed.")
+            sys.exit(0)
+        else:
+            print(f"✗ ComfyUI server is not accessible at {args.server}")
+            print("  Please ensure ComfyUI is running before extracting masks.")
+            sys.exit(1)
+
+    print("\nChecking ComfyUI server...")
+    if not check_server(args.server):
+        print(f"Error: ComfyUI server is not running at {args.server}")
+        print("Please start ComfyUI first or check the server address.")
+        print(f"\nTo test without generating, use: --dry-run")
+        sys.exit(1)
+
+    print("✓ ComfyUI server is running")
+
+    try:
+        output = extract_mask(
+            args.subject, args.input_image, args.output_path, args.server
+        )
+        print(f"\nMask extraction complete! Output: {output}")
+    except Exception as e:
+        print(f"Error: {e}")
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
--- a/tools/image_mask_extraction.json
+++ b/tools/image_mask_extraction.json
--- a/tools/mask_to_polygon.py
+++ b/tools/mask_to_polygon.py
@@ -257,8 +257,8 @@ def main():
    parser.add_argument(
        "--min-area",
        type=int,
-        default=100,
-        help="Minimum contour area to include in multiple mode (default: 100)",
+        default=150,
+        help="Minimum contour area to include (default: 150)",
    )

    args = parser.parse_args()
@@ -279,14 +279,15 @@ def main():
        print("Error: No contours found in mask", file=sys.stderr)
        sys.exit(1)

-    if args.mode == "multiple":
-        contours = sorted(contours, key=cv2.contourArea, reverse=True)
    contours = [c for c in contours if cv2.contourArea(c) >= args.min_area]

    if not contours:
        print("Error: No contours meet minimum area requirement", file=sys.stderr)
        sys.exit(1)

+    if args.mode == "multiple":
+        contours = sorted(contours, key=cv2.contourArea, reverse=True)
+
        output_base = args.output if args.output else args.image.with_suffix("")
        output_dir = output_base.parent
        output_stem = output_base.stem