commontoolsinc
diff --git a/‎.github/workflows/deno.yml
Lines changed: 48 additions & 0 deletions b/‎.github/workflows/deno.yml
Lines changed: 48 additions & 0 deletions
diff --git a/‎.github/workflows/evals.yml
Lines changed: 14 additions & 7 deletions b/‎.github/workflows/evals.yml
Lines changed: 14 additions & 7 deletions
diff --git a/‎jumble/integration/cache/llm-api-cache/09951494003d38d4c90bcc79589c3f290e8d0ca7e3270a9e2d32c2b651ee3897.json
Lines changed: 16 additions & 0 deletions b/‎jumble/integration/cache/llm-api-cache/09951494003d38d4c90bcc79589c3f290e8d0ca7e3270a9e2d32c2b651ee3897.json
Lines changed: 16 additions & 0 deletions
@@ -367,6 +367,54 @@ jobs:
           FRONTEND_URL=http://localhost:8000/ \
           deno task integration
 
+  seeder-integration-test:
+    name: "Seeder Integration Tests"
+    runs-on: ubuntu-latest
+    needs: ["build-binaries"]
+    environment: ci
+    services:
+      redis:
+        image: redis
+        ports:
+          - 6379:6379
+        options: >-
+          --health-cmd "redis-cli ping"
+          --health-interval 10s
+          --health-timeout 5s
+          --health-retries 5
+    steps:
+      - name: 📥 Checkout repository
+        uses: actions/checkout@v4
+
+      - name: 🦕 Setup Deno 2.2.2
+        uses: denoland/setup-deno@v2
+        with:
+          deno-version: "2.2.2"
+
+      - name: 📦 Cache Deno dependencies
+        uses: actions/cache@v3
+        with:
+          path: |
+            ~/.deno
+            ~/.cache/deno
+          key: ${{ runner.os }}-deno-${{ hashFiles('**/deno.json') }}
+
+      - name: 📥 Download built binaries
+        uses: actions/download-artifact@v4
+
+      - name: 🚀 Start Toolshed server for testing
+        run: |
+          chmod +x ./common-binaries/toolshed
+          CTTS_AI_LLM_ANTHROPIC_API_KEY=fake \
+          CACHE_DIR=${GITHUB_WORKSPACE}/jumble/integration/cache \
+          ./common-binaries/toolshed &
+
+      - name: 🧪 Run seeder integration tests
+        working-directory: seeder
+        run: |
+          TOOLSHED_API_URL=http://localhost:8000/ \
+          deno task start --tag smol --no-verify --no-report --name ${{ github.run_id }}
+
   # Automatic deployment to staging (toolshed)
   deploy-toolshed:
     name: "Deploy to Toolshed (Staging)"
 
@@ -106,7 +106,7 @@ jobs:
           key: llm-cache-${{ github.run_id }}
           restore-keys: llm-cache
 
-      - name: ✏️ List LLM Cache Load 
+      - name: ✏️ List LLM Cache Load
         run: |
           mkdir -p ./llm_cache_dir/llm-api-cache
           ls -1 ./llm_cache_dir/llm-api-cache > ./cache-log-pre
@@ -138,17 +138,24 @@ jobs:
             FLAGS="--tag 10x10"
           fi
 
-          TOOLSHED_API_URL=http://localhost:8000/ \
-          OPENAI_API_KEY=${{ secrets.CTTS_AI_LLM_OPENAI_API_KEY }} \
-          deno task start --name ${{ env.target_sha }} $FLAGS
+          # Wrap in an if/else, we don't want the job to fail yet
+          # until we can confidently expect passing results.
+          if 
+            TOOLSHED_API_URL=http://localhost:8000/ \
+            OPENAI_API_KEY=${{ secrets.CTTS_AI_LLM_OPENAI_API_KEY }} \
+            deno task start --name ${{ env.target_sha }} $FLAGS ; then
+            echo "All charms have been successfully verified!"
+          else
+            echo "Some charms have failed verification."
+          fi
 
-      - name: ✏️ List LLM Cache Save 
+      - name: ✏️ List LLM Cache Save
         run: |
           mkdir -p ./llm_cache_dir/llm-api-cache
           ls -1 ./llm_cache_dir/llm-api-cache > ./cache-log-post
           echo "$(cat ./cache-log-post | wc -l) items found in cache."
           cat ./cache-log-post
-      
+
       - name: 📦 LLM Cache Status
         run: |
           if diff ./cache-log-pre ./cache-log-post > ./cache-analysis-diff ; then
@@ -158,7 +165,7 @@ jobs:
             ADDITIONS=$(cat ./cache-analysis-diff | wc -l)
             echo "LLM cache has added $((ADDITIONS / 2)) entries."
           fi
- 
+
       - name: 📤 Upload reports
         uses: actions/upload-artifact@v4
         with:
 
@@ -0,0 +1,16 @@
+{
+  "model": "anthropic:claude-3-7-sonnet-20250219",
+  "system": "\nYou are creating a simple MVP based on the user's goal, using an existing data schema. Focus on the simplest implementation that works with the provided schema.\n\n<task>\nGiven a user's feature request and an existing data schema, you will:\n1. Create a short title (2-5 words) that names the artifact\n2. Create a one-sentence description in the format \"A <artifact> to <goal>\"\n3. Create a concise specification (3-5 sentences max) that works with the existing schema\n4. Generate a brief implementation plan (3 steps max)\n5. Design a minimal JSON schema that represents the core data model\n</task>\n\n<output_structure>\nYour response must be structured as follows:\n\n<thinking>Freeform reasoning for you to consider the solution.</thinking>\n\n<title>\n[Short title for the artifact, 2-5 words]\n</title>\n\n<description>\n[One-sentence description in the format \"A <artifact> to <goal>\"]\n</description>\n\n<spec>\n[Concise specification that captures only the essential requirements]\n</spec>\n\n<plan>\n[Brief 3-step implementation plan using the existing schema]\n</plan>\n\n<result_schema>\n[Minimal JSON Schema in valid JSON format that represents data created by the artifact]\n</result_schema>\n</output_structure>\n\n<schema_guidelines>\n1. Keep it minimal:\n   - Include only essential fields (5-7 properties max)\n   - Focus on the core functionality\n   - If user requested complex features, simplify for this first version\n\n2. Each property should have:\n   - A descriptive \"title\" field\n   - A brief \"description\" field\n   - A sensible default value where appropriate\n\n3. Example of a simple schema:\n```json\n{\n  \"type\": \"object\",\n  \"title\": \"Note List\",\n  \"description\": \"A simple note list for the user\",\n  \"properties\": {\n    \"notes\": {\n      \"type\": \"array\",\n      \"title\": \"Notes\",\n      \"description\": \"List of user notes\",\n      \"default\": [],\n      \"items\": {\n        \"type\": \"object\",\n        \"properties\": {\n          \"title\": {\n            \"type\": \"string\",\n            \"title\": \"Title\",\n            \"description\": \"Title of the note\",\n            \"default\": \"New Note\"\n          },\n          \"content\": {\n            \"type\": \"string\",\n            \"title\": \"Content\",\n            \"description\": \"Content of the note\"\n          },\n          \"created\": {\n            \"type\": \"string\",\n            \"format\": \"date-time\",\n            \"title\": \"Created Date\",\n            \"description\": \"When the note was created\"\n          }\n        },\n        \"required\": [\"title\", \"content\"]\n      }\n    }\n  },\n  \"required\": [\"title\", \"content\"]\n}\n```\n\nGUIDELINES:\n- Aim for the simplest possible solution that works with the existing schema\n- The specification should take into account the existing schema structure\n- Focus on what can be achieved quickly with the existing data model\n- Avoid suggesting modifications to the schema if possible\n</schema_guidelines>\n\n<high_level>\nIMPORTANT:\n- Focus on the simplest working version\n- Aim for fewer fields rather than more\n- But still capture all the important state the user is creating\n- The user can always iterate and improve the solution later\n</high_level>\n",
+  "stream": false,
+  "messages": [
+    {
+      "role": "user",
+      "content": "\n<user_input>\n\n<goal>2048 game</goal>\n\n<description>Create a 🎮 2048 game charm with a grid-based UI, sliding tile mechanics, and score tracking. The charm will need to handle game state, tile merging logic, and win/lose conditions. Could add 🏆 high score persistence and 🔄 game reset functionality.</description>\n\n</user_input>\n\n<existing_schema>\n```json\n{\n  \"type\": \"object\",\n  \"properties\": {}\n}\n```\n</existing_schema>\n\nBased on this goal and the existing schema, please provide a title, description, any additional schema, detailed specification, and implementation plan.\n"
+    },
+    {
+      "role": "assistant",
+      "content": "<thinking>\nThe user wants to create a 2048 game charm. The 2048 game is a popular sliding tile puzzle game where players combine tiles with the same numbers to create a tile with the number 2048.\n\nKey components needed:\n1. Game board (typically a 4x4 grid)\n2. Tiles with numbers (powers of 2)\n3. Score tracking\n4. Game state (in progress, won, lost)\n5. Movement mechanics (up, down, left, right)\n6. Tile merging logic\n\nThe existing schema is empty, so I'll need to design a complete schema for this game.\n\nFor the minimal viable product, I'll focus on:\n- Representing the game board\n- Tracking the current score\n- Tracking game state\n- Storing the highest tile value\n\nFor a simple implementation, I'll create a schema that represents:\n1. The game board as a 2D array (4x4)\n2. Current score\n3. Game status (in progress, won, lost)\n4. Best score (for persistence)\n</thinking>\n\n<title>\n2048 Game Board\n</title>\n\n<description>\nA game charm to play the classic 2048 sliding tile puzzle.\n</description>\n\n<spec>\nThe 2048 game will feature a 4x4 grid where tiles with powers of 2 can be moved and merged. Players can slide tiles in four directions (up, down, left, right), with identical tiles merging into a tile of double value when they collide. The game tracks the current score, game state, and highest achieved score. The game ends when either a 2048 tile is created (win) or no more moves are possible (lose).\n</spec>\n\n<plan>\n1. Implement the game board representation and basic tile movement mechanics\n2. Add tile merging logic and score calculation\n3. Implement game state management (win/lose detection) and persistence\n</plan>\n\n<result_schema>\n{\n  \"type\": \"object\",\n  \"title\": \"2048 Game State\",\n  \"description\": \"Represents the current state of a 2048 game\",\n  \"properties\": {\n    \"board\": {\n      \"type\": \"array\",\n      \"title\": \"Game Board\",\n      \"description\": \"4x4 grid representing the game board\",\n      \"default\": [\n        [0, 0, 0, 0],\n        [0, 0, 0, 0],\n        [0, 0, 0, 0],\n        [0, 0, 0, 0]\n      ],\n      \"items\": {\n        \"type\": \"array\",\n        \"items\": {\n          \"type\": \"integer\",\n          \"minimum\": 0\n        }\n      }\n    },\n    \"score\": {\n      \"type\": \"integer\",\n      \"title\": \"Current Score\",\n      \"description\": \"Player's current score in this game\",\n      \"default\": 0,\n      \"minimum\": 0\n    },\n    \"bestScore\": {\n      \"type\": \"integer\",\n      \"title\": \"Best Score\",\n      \"description\": \"Highest score achieved across games\",\n      \"default\": 0,\n      \"minimum\": 0\n    },\n    \"gameStatus\": {\n      \"type\": \"string\",\n      \"title\": \"Game Status\",\n      \"description\": \"Current status of the game\",\n      \"enum\": [\"in_progress\", \"won\", \"lost\"],\n      \"default\": \"in_progress\"\n    },\n    \"highestTile\": {\n      \"type\": \"integer\",\n      \"title\": \"Highest Tile\",\n      \"description\": \"Value of the highest tile on the board\",\n      \"default\": 2,\n      \"minimum\": 2\n    }\n  },\n  \"required\": [\"board\", \"score\", \"gameStatus\"]\n}\n</result_schema>"
+    }
+  ],
+  "maxTokens": 64000
+}