Evals

Evals #32

Summary
Jobs
- Evals
Run details
- Usage
- Workflow file

Workflow file for this run

	name: Evals

	# Runs only when dispatched by hand; the dispatcher supplies the commit to
	# evaluate and how large a sample of charms to run.
	on:
	  workflow_dispatch:
	    inputs:
	      commit_sha:
	        type: string
	        required: true
	        description: 'Git commit SHA to evaluate'
	      test_size:
	        type: choice
	        description: 'Number of charms to run'
	        options:
	          - one
	          - ten
	          - all
	  # FOR DEVELOPMENT
	  # pull_request:
	  #   branches:
	  #     - main

	# Workflow-level environment: the two dispatch inputs, available to every
	# step as `env.*` expressions and as ordinary shell variables.
	env:
	  # FOR DEVELOPMENT
	  # target_sha: "4a4e488db80aa074a825923c757572f99137c957"
	  # test_size: "one"
	  target_sha: "${{ github.event.inputs.commit_sha }}"
	  test_size: "${{ github.event.inputs.test_size }}"

	jobs:
	  evals:
	    name: Evals
	    runs-on: ubuntu-latest
	    environment: seeder
	    # Redis service container, published on port 6379 of the runner and
	    # health-checked with `redis-cli ping`.
	    services:
	      redis:
	        image: redis
	        ports:
	          - "6379:6379"
	        options: >-
	          --health-cmd "redis-cli ping"
	          --health-interval 10s
	          --health-timeout 5s
	          --health-retries 5
	    steps:
	- name: ✏️ Log Eval Run Metadata
	run: \|
	echo "Running Evals:"
	echo "- Size: ${{ env.test_size }} charms"
	echo "- SHA: ${{ env.target_sha }}"

	# TODO Upload `seeder` as a deno executable to avoid
	# running from source.
	- name: 📥 Checkout repository
	uses: actions/checkout@v4
	with:
	ref: ${{ env.target_sha }}

	- name: 🦕 Setup Deno
	uses: ./.github/actions/deno-setup

	- name: 📦 Cache Deno dependencies
	uses: actions/cache@v3
	with:
	path: \|
	~/.deno
	~/.cache/deno
	key: ${{ runner.os }}-deno-${{ hashFiles('**/deno.json') }}

	- name: 🔑 Authenticate to Google Cloud
	uses: google-github-actions/auth@v1
	with:
	credentials_json: ${{ secrets.GCP_SA_KEY }}

	- name: ⚙️ Setup Google Cloud SDK
	uses: google-github-actions/setup-gcloud@v1

	- name: 📥 Download built binaries from Google Cloud Storage
	run: \|
	mkdir -p downloaded-artifacts
	gsutil cp gs://commontools-build-artifacts/workspace-artifacts/labs-${{ env.target_sha }}.tar.gz downloaded-artifacts/
	gsutil cp gs://commontools-build-artifacts/workspace-artifacts/labs-${{ env.target_sha }}.hash.txt hash.txt

	# Verify the tarball exists
	if [ ! -f downloaded-artifacts/labs-${{ env.target_sha }}.tar.gz ]; then
	echo "::error::Artifact tarball for commit ${{ env.target_sha }} not found!"
	echo "Make sure this commit was successfully built and artifacts were uploaded."
	exit 1
	fi
	mkdir -p common-binaries
	tar -xzf downloaded-artifacts/labs-${{ env.target_sha }}.tar.gz -C common-binaries

	# CTTS_AI_LLM_GOOGLE_APPLICATION_CREDENTIALS points to `/tmp/gcp-vertex-creds.json`
	- name: ⚙️ Setup Google Vertex Key
	run: \|
	echo '${{ secrets.GOOGLE_APPLICATION_CREDENTIALS }}' > /tmp/gcp-vertex-creds.json

	# We want to essentially use a persistent directory
	# https://github.com/actions/cache/blob/main/tips-and-workarounds.md#update-a-cache
	- name: 📦 Load LLM Cache
	id: load-llm-cache
	uses: actions/cache@v4
	with:
	path: ./llm_cache_dir
	key: llm-cache-${{ github.run_id }}
	restore-keys: llm-cache

	- name: ✏️ List LLM Cache Load
	run: \|
	mkdir -p ./llm_cache_dir/llm-api-cache
	ls -1 ./llm_cache_dir/llm-api-cache > ./cache-log-pre
	echo "$(cat ./cache-log-pre \| wc -l) items found in cache."
	cat ./cache-log-pre

	- name: 🚀 Start Toolshed server
	run: \|
	mkdir -p ./llm_cache_dir
	chmod +x ./common-binaries/toolshed
	CACHE_DIR=./llm_cache_dir \
	CTTS_AI_LLM_ANTHROPIC_API_KEY=${{ secrets.CTTS_AI_LLM_ANTHROPIC_API_KEY }} \
	CTTS_AI_LLM_OPENAI_API_KEY=${{ secrets.CTTS_AI_LLM_OPENAI_API_KEY }} \
	CTTS_AI_LLM_GOOGLE_APPLICATION_CREDENTIALS=${{ secrets.CTTS_AI_LLM_GOOGLE_APPLICATION_CREDENTIALS }} \
	./common-binaries/toolshed &

	# For Astral
	# https://github.com/lino-levan/astral/blob/f5ef833b2c5bde3783564a6b925073d5d46bb4b8/README.md#no-usable-sandbox-with-user-namespace-cloning-enabled
	- name: 🛡️ Disable AppArmor for browser tests
	run: echo 0 \| sudo tee /proc/sys/kernel/apparmor_restrict_unprivileged_userns

	- name: 🧪 Run seeder
	working-directory: packages/seeder
	run: \|
	FLAGS=""
	if [ ${{ env.test_size }} == "one" ]; then
	FLAGS="--tag smol"
	elif [ ${{ env.test_size }} == "ten" ]; then
	FLAGS="--tag 10x10"
	fi

	# Wrap in an if/else, we don't want the job to fail yet
	# until we can confidently expect passing results.
	if
	TOOLSHED_API_URL=http://localhost:8000/ \
	OPENAI_API_KEY=${{ secrets.CTTS_AI_LLM_OPENAI_API_KEY }} \
	deno task start --name ${{ env.target_sha }} $FLAGS ; then
	echo "All charms have been successfully verified!"
	else
	echo "Some charms have failed verification."
	fi

	- name: ✏️ List LLM Cache Save
	run: \|
	mkdir -p ./llm_cache_dir/llm-api-cache
	ls -1 ./llm_cache_dir/llm-api-cache > ./cache-log-post
	echo "$(cat ./cache-log-post \| wc -l) items found in cache."
	cat ./cache-log-post

	- name: 📦 LLM Cache Status
	run: \|
	if diff ./cache-log-pre ./cache-log-post > ./cache-analysis-diff ; then
	echo "No new LLM cache entries added!"
	else
	# `diff` outputs two lines per diff, one for location, one for content
	ADDITIONS=$(cat ./cache-analysis-diff \| wc -l)
	echo "LLM cache has added $((ADDITIONS / 2)) entries."
	fi

	- name: 📤 Upload reports
	uses: actions/upload-artifact@v4
	with:
	name: seeder-results
	path: \|
	./packages/seeder/results

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Evals #32

Workflow file

Evals #32

Uh oh!

Jobs

Run details

Workflow file for this run