Initial Research on Traces for Agento Project - PrototypeJam/agento_app GitHub Wiki
The Following Describes How to Isolate, Collect and Stage Traces for Analysis and Evals
Based on your research and needs, here's a comprehensive explanation of how to collect and manage traces from your AI Agent SDK Python project for analytics and evaluation purposes:
Since OpenAI doesn't provide a public API to download traces from their dashboard, you need to capture traces locally during execution. The OpenAI Agents SDK makes this straightforward with built-in tracing capabilities that you can customize.
The Agents SDK has tracing enabled by default and provides hooks to capture traces locally. Here's the recommended approach:
```python
from agents import Agent, Runner, set_trace_processors, TraceProcessor
import json
from datetime import datetime
import os

class LocalFileTraceProcessor(TraceProcessor):
    def __init__(self, output_dir="traces"):
        self.output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)

    def process_trace(self, trace):
        """Process and save each complete trace"""
        trace_data = {
            "trace_id": trace.trace_id,
            "workflow_name": trace.workflow_name,
            "timestamp": datetime.now().isoformat(),
            "spans": []
        }

        # Extract all spans from the trace
        for span in trace.spans:
            span_data = {
                "span_id": span.span_id,
                "parent_id": span.parent_id,
                "operation": span.operation_name,
                "start_time": span.start_time,
                "end_time": span.end_time,
                "attributes": span.attributes,
                "events": []
            }

            # Capture events within spans (LLM calls, tool calls, etc.)
            for event in span.events:
                event_data = {
                    "name": event.name,
                    "timestamp": event.timestamp,
                    "attributes": event.attributes
                }
                # Extract key data for evals
                if event.name == "llm.generation":
                    event_data["input"] = event.attributes.get("input", "")
                    event_data["output"] = event.attributes.get("output", "")
                    event_data["model"] = event.attributes.get("model", "")
                    event_data["usage"] = event.attributes.get("usage", {})
                span_data["events"].append(event_data)

            trace_data["spans"].append(span_data)

        # Save to file
        filename = f"{self.output_dir}/trace_{trace.trace_id}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
        with open(filename, 'w') as f:
            json.dump(trace_data, f, indent=2, default=str)

    def process_span(self, span):
        """Process individual spans as they complete"""
        # You can optionally process spans in real-time
        pass

# Set up the custom processor
custom_processor = LocalFileTraceProcessor(output_dir="agent_traces")
set_trace_processors([custom_processor])

# Your agent code
agent = Agent(
    name="MyAgent",
    instructions="You are a helpful assistant.",
    model="gpt-4"
)

# Run the agent - traces will be automatically saved
result = Runner.run_sync(agent, "What's the weather like today?")
```
For more sophisticated trace collection that integrates with standard observability tools:
```python
from agents import set_trace_processors, add_trace_processor
from agents.tracing.processors import OTLPHTTPTraceSpanProcessor
from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
import json

# Configure OpenTelemetry
provider = TracerProvider()
trace.set_tracer_provider(provider)

# Export to local OTLP collector (which can write to files)
otlp_exporter = OTLPSpanExporter(
    endpoint="http://localhost:4318/v1/traces",
    headers={"Content-Type": "application/json"}
)
span_processor = BatchSpanProcessor(otlp_exporter)
provider.add_span_processor(span_processor)

# Add OTLP processor to agents SDK
add_trace_processor(
    OTLPHTTPTraceSpanProcessor(endpoint="http://localhost:4318/v1/traces")
)

# Also add a custom processor for immediate local access
class EvalDataProcessor(TraceProcessor):
    def __init__(self, eval_file="eval_data.jsonl"):
        self.eval_file = eval_file

    def process_trace(self, trace):
        # Extract data specifically for evaluations
        for span in trace.spans:
            for event in span.events:
                if event.name == "llm.generation":
                    eval_entry = {
                        "trace_id": trace.trace_id,
                        "span_id": span.span_id,
                        "timestamp": event.timestamp,
                        "input": event.attributes.get("messages", []),
                        "output": event.attributes.get("output", ""),
                        "model": event.attributes.get("model", ""),
                        "temperature": event.attributes.get("temperature", 1.0),
                        "usage": event.attributes.get("usage", {}),
                        "ideal_answer": None  # Placeholder for manual annotation
                    }
                    # Append to JSONL file for easy streaming
                    with open(self.eval_file, 'a') as f:
                        f.write(json.dumps(eval_entry) + '\n')

add_trace_processor(EvalDataProcessor())
```
For your evaluation needs, you want to capture:
- Specific input (complete context window)
- LLM output
- Ideal answer (to be added manually later)
Here's a focused approach:
```python
import glob
import json
import pandas as pd
from typing import List, Dict, Any

class EvaluationDataCollector:
    def __init__(self, output_file="evaluation_dataset.csv"):
        self.output_file = output_file
        self.eval_data = []

    def extract_from_trace_files(self, trace_directory="agent_traces"):
        """Extract evaluation data from saved trace files"""
        for trace_file in glob.glob(f"{trace_directory}/*.json"):
            with open(trace_file, 'r') as f:
                trace_data = json.load(f)

            # Extract LLM calls from trace
            for span in trace_data.get("spans", []):
                for event in span.get("events", []):
                    if event.get("name") == "llm.generation":
                        eval_entry = {
                            "trace_id": trace_data["trace_id"],
                            "workflow_name": trace_data.get("workflow_name", ""),
                            "timestamp": event["timestamp"],
                            "model": event["attributes"].get("model", ""),
                            "input_messages": json.dumps(event["attributes"].get("input", [])),
                            "output": event["attributes"].get("output", ""),
                            "prompt_tokens": event["attributes"].get("usage", {}).get("prompt_tokens", 0),
                            "completion_tokens": event["attributes"].get("usage", {}).get("completion_tokens", 0),
                            "total_tokens": event["attributes"].get("usage", {}).get("total_tokens", 0),
                            "ideal_answer": ""  # To be filled manually
                        }
                        self.eval_data.append(eval_entry)

        # Save to CSV for easy editing in spreadsheets
        df = pd.DataFrame(self.eval_data)
        df.to_csv(self.output_file, index=False)
        print(f"Saved {len(self.eval_data)} evaluation entries to {self.output_file}")
        return df

# Usage
collector = EvaluationDataCollector()
eval_df = collector.extract_from_trace_files()
```
For agent workflows with multiple interactions, you'll want to group related traces:
```python
class WorkflowTraceProcessor(TraceProcessor):
    def __init__(self, output_dir="workflow_traces"):
        self.output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)
        self.active_workflows = {}

    def process_trace(self, trace):
        workflow_id = trace.workflow_name or trace.trace_id

        if workflow_id not in self.active_workflows:
            self.active_workflows[workflow_id] = {
                "workflow_id": workflow_id,
                "start_time": datetime.now().isoformat(),
                "traces": [],
                "all_inputs": [],
                "all_outputs": []
            }

        workflow = self.active_workflows[workflow_id]

        # Extract all LLM interactions
        trace_summary = {
            "trace_id": trace.trace_id,
            "timestamp": datetime.now().isoformat(),
            "llm_calls": []
        }

        for span in trace.spans:
            for event in span.events:
                if event.name == "llm.generation":
                    llm_call = {
                        "input": event.attributes.get("input", ""),
                        "output": event.attributes.get("output", ""),
                        "model": event.attributes.get("model", "")
                    }
                    trace_summary["llm_calls"].append(llm_call)
                    workflow["all_inputs"].append(llm_call["input"])
                    workflow["all_outputs"].append(llm_call["output"])

        workflow["traces"].append(trace_summary)

        # Check if workflow is complete (you'll need to define this logic)
        if self.is_workflow_complete(trace):
            self.save_workflow(workflow_id)
            del self.active_workflows[workflow_id]

    def is_workflow_complete(self, trace):
        # Implement logic to determine if a workflow is done
        # Could be based on specific events, timeouts, or explicit markers
        return trace.attributes.get("workflow_complete", False)

    def save_workflow(self, workflow_id):
        workflow = self.active_workflows[workflow_id]
        workflow["end_time"] = datetime.now().isoformat()

        filename = f"{self.output_dir}/workflow_{workflow_id}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
        with open(filename, 'w') as f:
            json.dump(workflow, f, indent=2, default=str)
```
To use OpenTelemetry effectively, set up a local collector:
```yaml
# otel-collector-config.yaml
receivers:
  otlp:
    protocols:
      http:
        endpoint: localhost:4318

processors:
  batch:

exporters:
  file:
    path: ./traces.json
  logging:
    loglevel: debug

service:
  pipelines:
    traces:
      receivers: [otlp]
      processors: [batch]
      exporters: [file, logging]
```
Run with Docker:
```bash
docker run -p 4318:4318 \
  -v $(pwd)/otel-collector-config.yaml:/etc/otel-collector-config.yaml \
  -v $(pwd)/traces:/traces \
  otel/opentelemetry-collector:latest \
  --config=/etc/otel-collector-config.yaml
```
Convert traces to analytics-friendly formats:
```python
def traces_to_analytics_format(trace_dir="agent_traces"):
    """Convert trace files to a format suitable for analytics tools"""
    all_events = []

    for trace_file in glob.glob(f"{trace_dir}/*.json"):
        with open(trace_file, 'r') as f:
            trace = json.load(f)

        base_info = {
            "trace_id": trace["trace_id"],
            "workflow_name": trace.get("workflow_name", ""),
            "trace_timestamp": trace["timestamp"]
        }

        for span in trace.get("spans", []):
            for event in span.get("events", []):
                event_record = {**base_info}
                event_record.update({
                    "span_id": span["span_id"],
                    "operation": span["operation"],
                    "event_name": event["name"],
                    "event_timestamp": event["timestamp"]
                })

                # Flatten attributes
                for key, value in event.get("attributes", {}).items():
                    if isinstance(value, (str, int, float, bool)):
                        event_record[f"attr_{key}"] = value
                    else:
                        event_record[f"attr_{key}"] = json.dumps(value)

                all_events.append(event_record)

    # Save as CSV for easy import into analytics tools
    df = pd.DataFrame(all_events)
    df.to_csv("traces_analytics.csv", index=False)

    # Also save as Parquet for efficient querying
    df.to_parquet("traces_analytics.parquet", index=False)

    return df
```
- Use the OpenAI Agents SDK's built-in tracing with custom processors to capture traces locally
- Don't rely on downloading from OpenAI's dashboard - there's no API for this
- Structure your trace data for easy extraction of inputs, outputs, and metadata
- Use standard formats (JSON, CSV, Parquet) for compatibility with analytics tools
- Consider OpenTelemetry for more sophisticated trace management and integration with observability platforms
This approach gives you complete control over your trace data while maintaining compatibility with standard analytics workflows and evaluation frameworks.
- The dashboard traces you see are created by an internal ingest-only endpoint (`POST /v1/traces/ingest`). No public "list" or bulk-export route exists today. (OpenAI Community, OpenAI GitHub)
- Therefore, the only dependable way to guarantee you have every prompt/response for later evals is to capture a second copy at run-time.
Tracing is enabled automatically; each agent run emits spans for every LLM generation, tool call, guard-rail, hand-off, etc. (GitHub)
```python
from agents import add_trace_processor, set_tracing_export_api_key
import os

set_tracing_export_api_key(os.environ["OPENAI_API_KEY"])  # keeps the dashboard view
```
Include the raw prompt and completion: spans already carry them. Keep `trace_include_sensitive_data=True` on `RunConfig` (the default) so the context window and model output are preserved. (OpenAI GitHub)
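For concreteness, a minimal sketch of passing that setting explicitly on a run; the exact `RunConfig` fields shown are assumed to match your installed SDK version:

```python
# Sketch: keep prompts/outputs in the exported spans via RunConfig.
# Assumes RunConfig exposes workflow_name and trace_include_sensitive_data
# in your version of the openai-agents SDK.
from agents import Agent, Runner, RunConfig

agent = Agent(name="Assistant", instructions="You are a helpful assistant.")

result = Runner.run_sync(
    agent,
    "Summarise today's tasks.",
    run_config=RunConfig(
        workflow_name="daily_summary",
        trace_include_sensitive_data=True,  # default; set False to redact prompts/outputs
    ),
)
print(result.final_output)
```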
Two lines give you a local copy without losing the dashboard conveniences.
```python
from agents.tracing.processors import OTLPHTTPTraceSpanProcessor

add_trace_processor(
    OTLPHTTPTraceSpanProcessor(
        endpoint="http://localhost:4318/v1/traces"  # any OTLP-compatible collector
    )
)
```
- `OTLPHTTPTraceSpanProcessor` is documented in the SDK reference and can be aimed at Jaeger, Tempo, Langfuse, Arize, Braintrust, Logfire, etc. (OpenAI GitHub, Arize Docs)
- A vanilla OpenTelemetry Collector pointed at that endpoint can fan out to S3/Parquet, Postgres, Elasticsearch or flat JSON files.
| Folder | Contents | Notes |
|---|---|---|
| `raw/` | One JSONL file per day – each line is the full OTLP span for a single LLM call. | Easy Pandas load; append-only. |
| `runs/` | `run-<trace_id>.json` – hierarchy of the whole workflow. | Lets you replay a multi-agent run end-to-end. |
| `eval_sets/` | CSV with three columns: `trace_id`, `actual_output`, `ideal_output`. | You hand-author `ideal_output`; a script can join on `trace_id`. |
A minimal ETL script that copies the prompt and completion into flat columns:
```python
import json, csv, glob

with open('eval_sets/today.csv', 'w', newline='') as f:
    w = csv.writer(f)
    w.writerow(['trace_id', 'input', 'model_output'])
    for fn in glob.glob('raw/2025-06-01*.jsonl'):
        for line in open(fn):
            span = json.loads(line)
            if span.get("attributes", {}).get("openai.event_type") == "LLM_RUN":
                w.writerow([
                    span["trace_id"],
                    span["attributes"]["openai.llm.input"],
                    span["attributes"]["openai.llm.output"]
                ])
```
You now have a spreadsheet-ready file to which you can append your “ideal answer” and run eval metrics.
- Every run already has a unique `trace_id`; keep that as the primary key across spans.
- To group separate agent steps that belong to the same logical "workflow", wrap them in a `with trace("my_workflow_name"):` context; the SDK nests spans correctly. (GitHub)
- Store the top-level "workflow" span JSON separately (the `runs/` folder above) so you can reconstruct the full conversation.
- Collect: the OTLP collector writes JSONL spans to S3 every hour.
- Materialise: a daily job flattens LLM_RUN spans to CSV.
- Annotate: you (or an LLM-assisted script) fill in the `ideal_output` column.
- Score: run BLEU, ROUGE or bespoke rubric graders (see the sketch after this list); store results next to the CSV.

Langfuse has a built-in `evaluate()` helper if you prefer hosted dashboards. (Langfuse)
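As one concrete version of the "Score" step referenced above, a minimal sketch that reads an annotated eval CSV and computes a simple lexical similarity with the standard library. The filename and the `actual_output`/`ideal_output` column names follow the folder-layout table above and are assumptions; swap in BLEU, ROUGE, or a rubric grader as needed:

```python
import csv
from difflib import SequenceMatcher

rows = []
# Assumed input: a CSV with trace_id, actual_output, ideal_output columns
with open("eval_sets/today.csv", newline="") as f:
    for row in csv.DictReader(f):
        # Crude lexical similarity between model output and hand-written ideal answer
        row["score"] = SequenceMatcher(None, row["actual_output"], row["ideal_output"]).ratio()
        rows.append(row)

with open("eval_sets/today_scored.csv", "w", newline="") as f:
    w = csv.DictWriter(f, fieldnames=list(rows[0].keys()))
    w.writeheader()
    w.writerows(rows)
```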
Wrap the ordinary OpenAI Python client with manual logging:
```python
response = client.chat.completions.create(...)

log = {
    "trace_id": response.id,
    "input": messages,
    "output": response.choices[0].message.content,
    "usage": response.usage
}
jsonl_writer.write(log)
```
Automatic OpenInference instrumentation can turn those calls into OTLP spans identical to the agents pipeline. (Arize Docs)
- Set `trace_include_sensitive_data=False` if you need to redact prompts/answers. (OpenAI GitHub)
- `set_tracing_disabled(True)` removes the default exporter so no data leaves your VPC; keep only your custom processor (see the sketch after this list). (TECHCOMMUNITY.MICROSOFT.COM)
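If the goal is simply "no trace data leaves your environment", one way to express that, consistent with the note later in this document that `set_trace_processors` replaces the default exporter, is the sketch below; `LocalFileTraceProcessor` refers to the illustrative file-writing class from the first example above:

```python
# Sketch: register only a local processor so nothing is exported to OpenAI.
# LocalFileTraceProcessor is the illustrative file-writing processor defined earlier.
from agents import set_trace_processors

set_trace_processors([LocalFileTraceProcessor(output_dir="agent_traces")])
```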
- Collector is reachable (< 10 ms latency).
- Span count in your store == span count in dashboard.
- Daily back-ups verified.
- PII filter tested.
- Eval script passes on a test batch.
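For the span-count check, a small sketch that counts spans captured in the local store for one day; the `raw/` JSONL layout and date-prefixed filenames follow the folder table above, and comparing the number against the dashboard remains a manual step:

```python
import glob

# Count locally captured spans for a given day (one JSON object per line).
span_count = 0
for path in glob.glob("raw/2025-06-01*.jsonl"):  # date prefix is an assumption
    with open(path) as f:
        span_count += sum(1 for _ in f)
print(f"Local spans captured: {span_count}")
```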
Once the above is in place you’ll have a reproducible, analytics-friendly record of (1) full context window, (2) model output, (3) your gold-standard answer for every call—ready for ad-hoc Pandas work, BI dashboards, or automated regression tests.
Okay, this is a clear and well-defined need, and the research you've compiled provides excellent guidance. The core takeaway from your research is that you cannot reliably download historical traces from the OpenAI dashboard via an API. The solution lies in capturing the data at runtime from your application.
The OpenAI Agents SDK is the absolute best tool for this, as it offers robust built-in tracing and, crucially, the ability to intercept and process those traces yourself. OpenTelemetry is the industry standard for collecting observable data (including traces), making it the ideal framework to integrate with the Agents SDK for local storage and analysis.
Here's a detailed, complete, and specific plan to achieve your needs, based on your research:
The traces you see in the OpenAI dashboard are primarily for debugging and visualization on their platform. As your research clearly states across all sources (Perplexity, ChatGPT, Grok3, Gemini), there is no public API or bulk export functionality to programmatically download these historical traces after they've been stored by OpenAI.
Therefore, the strategy is to capture all necessary trace data directly from your application as it runs, leveraging the powerful tracing capabilities of the OpenAI Agents SDK and exporting it to a local storage solution of your choice using OpenTelemetry.
This is the foundation for intercepting and storing your trace data.
You'll need the OpenAI Agents SDK, the core OpenTelemetry SDK, and an OpenTelemetry exporter to save your traces. For local files, you'd typically use an OTLP (OpenTelemetry Protocol) exporter pointed at a local OpenTelemetry Collector, which then writes to a file or database. For simplicity, we'll demonstrate using a basic OTLP HTTP exporter.
```bash
pip install openai-agents opentelemetry-sdk opentelemetry-exporter-otlp-proto-http
```
OpenTelemetry works by routing "spans" (units of work within a trace) through a `TracerProvider` and then to an `Exporter`. For your use case, you want an exporter that saves data locally.
```python
import os
import json
from datetime import datetime

# OpenTelemetry imports
from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import SimpleSpanProcessor, ConsoleSpanExporter, SpanExportResult
from opentelemetry.sdk.resources import Resource
from opentelemetry.semconv.resource import ResourceAttributes

# We'll use a custom file exporter for simple local JSON saving.
# In a production setup, you'd send to a local OTel Collector or a database.
class CustomFileSpanExporter(ConsoleSpanExporter):  # Inherit from ConsoleSpanExporter for structure, but override export
    def __init__(self, file_path="traces.jsonl"):
        self.file_path = file_path
        with open(self.file_path, 'w') as f:
            f.write("")  # Clear file on init or ensure append mode

    def export(self, spans):
        # Convert OpenTelemetry spans to a dictionary format for JSON line output.
        # This is a simplified conversion. A full OTel exporter would handle OTLP protobufs.
        # Here, we're just extracting key attributes for your specific needs.
        for span in spans:
            span_dict = {
                "trace_id": format(span.context.trace_id, '032x'),
                "span_id": format(span.context.span_id, '016x'),
                "parent_id": format(span.parent.span_id, '016x') if span.parent else None,
                "name": span.name,
                "start_time_unix_nano": span.start_time,
                "end_time_unix_nano": span.end_time,
                "attributes": dict(span.attributes),  # Convert Attributes to dict
                "events": [
                    {"name": event.name, "timestamp": event.timestamp, "attributes": dict(event.attributes)}
                    for event in span.events
                ],
                "status": {"status_code": span.status.status_code.name, "description": span.status.description},
                "resource_attributes": dict(span.resource.attributes)
            }
            with open(self.file_path, 'a') as f:
                f.write(json.dumps(span_dict) + "\n")
        return SpanExportResult.SUCCESS

# Configure the TracerProvider
resource = Resource.create({
    ResourceAttributes.SERVICE_NAME: "my-ai-agent-app",
    ResourceAttributes.SERVICE_VERSION: "1.0.0",
})
provider = TracerProvider(resource=resource)

file_exporter = CustomFileSpanExporter()
span_processor = SimpleSpanProcessor(file_exporter)
provider.add_span_processor(span_processor)
trace.set_tracer_provider(provider)

# Get a tracer instance (though Agents SDK handles much of this automatically)
otel_tracer = trace.get_tracer(__name__)
```
This is the critical step that allows you to intercept the detailed trace data generated by the Agents SDK. The SDK provides `TraceProcessor` and `add_trace_processor()`.
```python
from agents import set_trace_processors, set_tracing_export_api_key
from openai.lib.streaming._trace import TraceProcessor, TraceEvent, TraceType
import logging

# Set up basic logging to see process
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

class OpenAIAgentOTELProcessor(TraceProcessor):
    """
    A custom trace processor for OpenAI Agents SDK that converts SDK trace events
    into OpenTelemetry spans and then exports them via the configured OTel provider.
    """
    def __init__(self, otel_tracer_instance):
        self.otel_tracer = otel_tracer_instance
        self.span_map = {}  # To keep track of active OTel spans by OpenAI trace/span ID

    def process_trace(self, trace_event: TraceEvent) -> None:
        # Each TraceEvent from the OpenAI Agents SDK corresponds to a unit of work.
        # We map these to OpenTelemetry spans.
        otel_span_name = f"openai_agent.{trace_event.event_type.value.lower()}"

        # Determine parent span for OpenTelemetry hierarchy
        parent_span_context = None
        if trace_event.parent_id and trace_event.parent_id in self.span_map:
            parent_span_context = self.span_map[trace_event.parent_id].context
        elif trace_event.trace_id != trace_event.parent_id and trace_event.trace_id in self.span_map:
            # Handle cases where parent_id is for the root trace itself but not the immediate parent span
            parent_span_context = self.span_map[trace_event.trace_id].context

        with self.otel_tracer.start_as_current_span(
            otel_span_name,
            context=parent_span_context,
            kind=trace.SpanKind.INTERNAL  # Most agent steps are internal
        ) as otel_span:
            # Store the OTel span for child events to reference
            self.span_map[trace_event.id] = otel_span

            # Set core attributes for the span
            otel_span.set_attribute("openai.sdk.trace_id", trace_event.trace_id)
            otel_span.set_attribute("openai.sdk.span_id", trace_event.id)
            if trace_event.parent_id:
                otel_span.set_attribute("openai.sdk.parent_id", trace_event.parent_id)
            otel_span.set_attribute("openai.sdk.event_type", trace_event.event_type.value)
            otel_span.set_attribute("openai.sdk.start_time", trace_event.start_time.isoformat())

            # Add detailed data based on event type
            if trace_event.data:
                # Convert Pydantic models in data to dicts for JSON serialization
                processed_data = {}
                for key, value in trace_event.data.items():
                    try:
                        processed_data[key] = value.model_dump() if hasattr(value, 'model_dump') else value
                    except Exception:
                        processed_data[key] = str(value)  # Fallback to string if cannot dump
                otel_span.set_attribute("openai.sdk.data", json.dumps(processed_data))

                # --- CRUCIAL FOR YOUR EVALS: Extracting Input and LLM Output ---
                if trace_event.event_type == TraceType.LLM_RUN:
                    # Input (everything that went into the context window)
                    if 'input' in processed_data:
                        # For chat models, 'input' is typically a list of message dicts
                        otel_span.set_attribute("llm.input.messages_json", json.dumps(processed_data['input']))
                    # LLM Output
                    if 'output' in processed_data and 'content' in processed_data['output']:
                        otel_span.set_attribute("llm.output.content", processed_data['output']['content'])
                    if 'usage' in processed_data:
                        otel_span.set_attribute("llm.usage.prompt_tokens", processed_data['usage'].get('prompt_tokens'))
                        otel_span.set_attribute("llm.usage.completion_tokens", processed_data['usage'].get('completion_tokens'))
                        otel_span.set_attribute("llm.usage.total_tokens", processed_data['usage'].get('total_tokens'))
                    if 'model' in processed_data:
                        otel_span.set_attribute("llm.model", processed_data['model'])
                elif trace_event.event_type == TraceType.TOOL_CALL:
                    if 'tool_name' in processed_data:
                        otel_span.set_attribute("tool.name", processed_data['tool_name'])
                    if 'tool_args' in processed_data:
                        otel_span.set_attribute("tool.args_json", json.dumps(processed_data['tool_args']))
                elif trace_event.event_type == TraceType.TOOL_OUTPUT:
                    if 'output' in processed_data:
                        otel_span.set_attribute("tool.output", str(processed_data['output']))  # Tool output can be complex
                elif trace_event.event_type == TraceType.ERROR:
                    otel_span.set_status(trace.Status(trace.StatusCode.ERROR, description=str(trace_event.data)))
                    otel_span.record_exception(Exception(str(trace_event.data)))

        # Mark the end of the span if it's a terminal event for that trace_id/span_id.
        # The SDK handles span closure implicitly, but this is for conceptual mapping.
        if trace_event.event_type in [TraceType.AGENT_END, TraceType.LLM_END, TraceType.TOOL_END, TraceType.ERROR]:
            if trace_event.id in self.span_map:
                self.span_map[trace_event.id].end()
                del self.span_map[trace_event.id]
            if trace_event.trace_id == trace_event.id and trace_event.trace_id in self.span_map:
                # If it's the root trace ending, ensure its span ends
                self.span_map[trace_event.trace_id].end()
                del self.span_map[trace_event.trace_id]

# Instantiate and register your custom processor
agent_otel_processor = OpenAIAgentOTELProcessor(otel_tracer)

# IMPORTANT: This replaces OpenAI's default processors.
# If you still want traces in the OpenAI dashboard, you would need to also
# include OpenAI's default exporter in the list if the SDK supported multi-exporters.
# As per research, `set_trace_processors` replaces.
# If you want both, consider sending OTel traces to a collector that then forwards to OpenAI (if that's an option via OTel, which it often is).
# For now, this is for local capture ONLY.
set_trace_processors([agent_otel_processor])

# If you still want to send some basic usage data to OpenAI for cost tracking (optional),
# you might still need this line, but it won't be sending full traces to their dashboard
# if set_trace_processors replaced the default one.
# set_tracing_export_api_key(os.getenv("OPENAI_API_KEY"))
```
Now, when you run your agent, the `OpenAIAgentOTELProcessor` will intercept all trace events, convert them into OpenTelemetry spans, and your `CustomFileSpanExporter` will write them to `traces.jsonl`.
```python
import os
from openai import OpenAI
from agents import Agent, Runner, trace, set_trace_processors

# Make sure to run the OTel and TraceProcessor setup code above first!

# Example Agent Definition and Run
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# Define a tool for the agent
def get_current_weather(location: str):
    """Get the current weather in a given location."""
    if "san francisco" in location.lower():
        return {"temperature": "72F", "conditions": "Sunny"}
    elif "london" in location.lower():
        return {"temperature": "55F", "conditions": "Cloudy"}
    else:
        return {"temperature": "N/A", "conditions": "Unknown"}

# Create an agent
weather_agent = Agent(
    client=client,
    model="gpt-4o",
    instructions="You are a helpful weather assistant. Use the available tools.",
    tools=[get_current_weather]
)

# Run the agent within a custom trace context for better identification.
# The `with trace("your_workflow_name")` block helps categorize the top-level
# interaction in your traces.
with trace("WeatherQueryWorkflow"):
    print("Running Agent 1: What is the weather in San Francisco?")
    result1 = Runner.run_sync(weather_agent, "What is the weather in San Francisco?")
    print(f"Agent 1 Output: {result1.final_output}\n")

with trace("GeneralKnowledgeWorkflow"):
    print("Running Agent 2: What is the capital of France?")
    result2 = Runner.run_sync(weather_agent, "What is the capital of France?")
    print(f"Agent 2 Output: {result2.final_output}\n")

# You might want to explicitly flush the exporter's buffer at the end of your script
# if using a BatchSpanProcessor or similar. SimpleSpanProcessor flushes immediately.
print(f"Traces saved to traces.jsonl")
```
The `traces.jsonl` file will now contain one JSON object per line, each representing an OpenTelemetry span. These spans are highly structured and contain the exact data you need.
In the `traces.jsonl` file, you will be looking for spans where:
- `"name"` starts with `"openai_agent.llm_run"` (indicating an LLM call).
- Inside the `"attributes"` dictionary:
  - Input (Context Window): look for the `llm.input.messages_json` attribute. This will be a JSON string of the `messages` list (including system, user, assistant, and tool messages) that were sent to the LLM.
  - LLM Output: look for the `llm.output.content` attribute. This will be the LLM's generated response string.
  - You'll also find `llm.model` for the model used, and `llm.usage.prompt_tokens`, `llm.usage.completion_tokens` for token counts (see the extraction sketch after this list).
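A small sketch of pulling those fields out of `traces.jsonl`; the attribute names follow the custom processor defined above:

```python
import json

llm_rows = []
with open("traces.jsonl") as f:
    for line in f:
        span = json.loads(line)
        if not span.get("name", "").startswith("openai_agent.llm_run"):
            continue
        attrs = span.get("attributes", {})
        llm_rows.append({
            "trace_id": span["trace_id"],
            "span_id": span["span_id"],
            "input_messages": json.loads(attrs.get("llm.input.messages_json", "[]")),
            "output": attrs.get("llm.output.content", ""),
            "model": attrs.get("llm.model", ""),
        })

print(f"Found {len(llm_rows)} LLM calls")
```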
The hierarchical nature of OpenTelemetry traces, directly mapped from the Agents SDK, natively supports this:
- `trace_id`: identifies the entire workflow or a longer span of interaction. All spans belonging to a single top-level `with trace("WorkflowName")` block will share the same `trace_id`.
- `span_id`: identifies a specific operation within that workflow (e.g., an LLM call, a tool call, an agent step).
- `parent_id`: links a span to its parent span, creating a nested hierarchy that shows the flow of execution within your agent.
- `openai.sdk.event_type`: helps you understand what kind of event each span represents (e.g., `AGENT_RUN`, `LLM_RUN`, `TOOL_CALL`, `TOOL_OUTPUT`).
You can group all spans by their `trace_id` to reconstruct a complete workflow. The `parent_id` attribute allows you to understand the sequence and nesting of operations within that workflow (see the sketch below).
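As a sketch of that reconstruction, assuming the `traces.jsonl` layout written by the exporter above, you can index spans by `trace_id` and `parent_id` and print each workflow as a tree:

```python
import json
from collections import defaultdict

spans_by_trace = defaultdict(list)
with open("traces.jsonl") as f:
    for line in f:
        span = json.loads(line)
        spans_by_trace[span["trace_id"]].append(span)

for trace_id, spans in spans_by_trace.items():
    # Index children by parent_id so the workflow can be printed as a tree
    children = defaultdict(list)
    for s in spans:
        children[s.get("parent_id")].append(s)

    def print_tree(parent_id=None, depth=0):
        for s in sorted(children.get(parent_id, []), key=lambda x: x["start_time_unix_nano"]):
            print("  " * depth + s["name"])
            print_tree(s["span_id"], depth + 1)

    print(f"Workflow {trace_id}:")
    print_tree()
```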
This is external data that you'll craft. To link it to your collected traces:
- For workflow-level evals: use the root `trace_id` of a complete interaction workflow as the primary key. When you craft an ideal answer for a multi-turn conversation or complex agent task, associate it with that `trace_id`.
- For individual LLM call evals: if you want to evaluate specific LLM responses within a larger workflow, you can use a combination of the `trace_id` and the `span_id` of the `llm_run` span.
- Data structure: store your ideal answers in a separate spreadsheet or database table that has columns for `trace_id` (and optionally `span_id`) to enable easy joining with your collected trace data (see the join sketch below).
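A minimal join sketch with pandas; `llm_eval_data.csv` is the file produced by the flattening script below, while `ideal_answers.csv` is a hypothetical hand-authored file with `trace_id`, `span_id`, and `ideal_answer` columns:

```python
import pandas as pd

calls = pd.read_csv("llm_eval_data.csv")    # collected LLM calls (see flattening script below)
ideals = pd.read_csv("ideal_answers.csv")   # hand-authored: trace_id, span_id, ideal_answer

# Left-join so every collected call is kept, with ideal_answer filled where available
eval_set = calls.merge(ideals, on=["trace_id", "span_id"], how="left")
eval_set.to_csv("evaluation_dataset_joined.csv", index=False)
```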
The `traces.jsonl` file contains JSON objects.
- For spreadsheets:
  - You'll need a script (e.g., using Python's `pandas` library) to load `traces.jsonl`, flatten the nested JSON (especially the `attributes` and `data` fields), and then export it to CSV.
  - Focus on extracting key attributes like `trace_id`, `span_id`, `name`, `llm.input.messages_json`, `llm.output.content`, `llm.model`, `llm.usage.total_tokens`, etc., into separate columns.
  - Example using `pandas` to flatten the data for a CSV:

```python
import pandas as pd
import json

data = []
with open('traces.jsonl', 'r') as f:
    for line in f:
        data.append(json.loads(line))

# Convert to DataFrame
df = pd.DataFrame(data)

# Flatten the 'attributes' column for easier access.
# This is a simplified flattening. For deep nesting, you might need recursive flattening.
df_attributes = pd.json_normalize(df['attributes'].apply(lambda x: x if x is not None else {}))
df_merged = pd.concat([df.drop(columns=['attributes', 'events', 'status', 'resource_attributes']), df_attributes], axis=1)

# Convert specific JSON string columns back to parsed JSON objects if needed for deeper analysis.
# For spreadsheets, you might keep them as strings or further flatten specific parts.
# Example: Parse 'llm.input.messages_json' into a structured object for Python analysis
# df_merged['llm.input.messages'] = df_merged['llm.input.messages_json'].apply(lambda x: json.loads(x) if pd.notna(x) else None)

# Select relevant columns for your eval spreadsheet
eval_df = df_merged[[
    'trace_id',
    'span_id',
    'name',  # e.g., openai_agent.llm_run, openai_agent.tool_call
    'openai.sdk.event_type',
    'llm.model',
    'llm.input.messages_json',  # The full input context
    'llm.output.content',       # The LLM's raw output
    'llm.usage.prompt_tokens',
    'llm.usage.completion_tokens',
    'llm.usage.total_tokens',
    # Add other relevant attributes like tool.name, tool.args_json, etc.
]].copy()

# Filter for LLM run traces specifically for evals
llm_eval_df = eval_df[eval_df['openai.sdk.event_type'] == 'LLM_RUN'].copy()

# You might want to assign a unique ID that combines trace_id and span_id for linking
llm_eval_df['eval_id'] = llm_eval_df['trace_id'] + '_' + llm_eval_df['span_id']

# Export to CSV
llm_eval_df.to_csv('llm_eval_data.csv', index=False)
print("LLM evaluation data saved to llm_eval_data.csv")

# For broader analysis of workflows, you can save the full merged DataFrame
# df_merged.to_csv('all_traces_flattened.csv', index=False)
# print("All flattened trace data saved to all_traces_flattened.csv")
```
- For various analytics tools: most modern analytics tools (e.g., BI tools, data science notebooks, custom scripts) can easily import JSON Lines (`.jsonl`) files. You can then use their respective data processing capabilities to flatten and query the data.
- Database storage: for larger volumes or more complex querying, consider exporting traces from your OpenTelemetry Collector directly into a local PostgreSQL, SQLite, or specialized tracing database (like Jaeger or Tempo if you run them locally). This would provide SQL-like access for analysis (see the SQLite sketch below).
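For the SQLite route, a minimal sketch that loads `traces.jsonl` into a queryable table; the table and column names here are illustrative:

```python
import json
import sqlite3

conn = sqlite3.connect("traces.db")
conn.execute(
    "CREATE TABLE IF NOT EXISTS spans "
    "(trace_id TEXT, span_id TEXT, parent_id TEXT, name TEXT, attributes TEXT)"
)

with open("traces.jsonl") as f:
    for line in f:
        s = json.loads(line)
        conn.execute(
            "INSERT INTO spans VALUES (?, ?, ?, ?, ?)",
            (s["trace_id"], s["span_id"], s.get("parent_id"), s["name"],
             json.dumps(s.get("attributes", {}))),
        )
conn.commit()

# Example query: how many LLM-run spans per workflow trace
for row in conn.execute(
    "SELECT trace_id, COUNT(*) FROM spans "
    "WHERE name LIKE 'openai_agent.llm_run%' GROUP BY trace_id"
):
    print(row)
```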
Yes, it matters immensely and is the cornerstone of this solution.
- Built-in structured tracing: the Agents SDK automatically generates rich, structured trace data (`TraceEvent` objects) for every step of your agent's execution: LLM calls, tool invocations, tool outputs, handoffs, guardrail checks, etc. This is far more granular and contextual than just logging raw OpenAI API requests/responses.
- Custom trace processor support: this is the key feature that enables your solution. The `set_trace_processors()` function allows you to hook into this built-in tracing stream and direct the data to your own storage (like OpenTelemetry + local files/database) instead of solely relying on OpenAI's dashboard. Without the SDK, you'd have to manually instrument every LLM call and tool interaction to collect this level of detail.
- Workflow context: the SDK naturally groups related operations into `trace_id`s and `span_id`s, making it easy to reconstruct full agent workflows and long interaction sequences. This is crucial for evaluating multi-turn agents or complex chained tasks.
- Richer data for evals: the SDK's traces provide specific attributes like `llm.input.messages_json` (the full context window), `llm.output.content`, and detailed tool information, which are exactly what you need for comprehensive evaluations.
In summary: if you were just using the standard OpenAI Responses API directly, you would have to manually implement extensive logging around every `client.chat.completions.create()` call, extract inputs/outputs, and manage trace IDs yourself. The Agents SDK automates much of this, making it much easier and more robust to collect the detailed trace data you require for analysis and evaluations.
By following this plan, you will gain full control over your agent's interaction data, enabling detailed analysis in spreadsheets and analytics tools, and providing the precise input, output, and workflow context needed for your evaluation pipeline.
## Complete Plan for Collecting OpenAI Agents SDK Traces for Evaluations

Based on your research, the optimal solution for collecting evaluation data from your OpenAI Agents SDK Python project is implementing a custom TracingProcessor that automatically captures exactly what you need for evaluations 24.
## Why This Approach is Superior

Your research revealed several approaches for collecting agent traces, but the custom TracingProcessor method stands out as the clear winner for evaluation purposes 26. The custom TracingProcessor approach provides complete data control, automatic CSV export, and captures 100% of your interactions without relying on external services or manual processes 2. Unlike the OpenAI Dashboard, which doesn't offer programmatic data export, or third-party tools that require additional costs and setup, this solution gives you immediate local access to all trace data 3.
## What Data Gets Captured for Evaluations

The implementation automatically captures exactly the three components you need for evaluations 2:
- Agent instructions and system prompts
- User input and conversation history
- Available tools and their schemas
- Model configuration parameters
- Full context window content - the complete prompt sent to the LLM 2
- Raw LLM response text
- Tool calls made during execution
- Finish reason and completion status
- Performance metrics (latency, tokens used, estimated costs) 2
- Empty `ideal_answer` field for your hand-crafted responses
- Evaluation scoring fields
- Notes and categorization columns 4

## Implementation Guide

The implementation leverages the OpenAI Agents SDK's built-in tracing capabilities through custom processors 26. Tracing is enabled by default in the SDK and captures comprehensive events including LLM generations, tool calls, handoffs, and guardrails 2.
Requirements (`requirements.txt`):

```
openai-agents>=0.1.0
pandas>=1.5.0
python-dateutil>=2.8.0
```
### Step 2: Add Trace Collection to Your Project

The beauty of this approach is that it requires minimal changes to your existing code 2. Simply add the evaluation collection setup to your main application:
```python
import asyncio
import os

from agents import Agent, Runner, function_tool
from evaluation_trace_collector import setup_evaluation_collection

os.environ["OPENAI_API_KEY"] = "your-api-key-here"

collector = setup_evaluation_collection()

@function_tool
def get_weather(city: str) -> str:
    """Get weather information for a city"""
    return f"The weather in {city} is sunny and 72°F"

@function_tool
def get_order_status(order_id: str) -> str:
    """Get the status of an order"""
    return f"Order {order_id} is shipped and will arrive tomorrow"

customer_service_agent = Agent(
    name="Customer Service Assistant",
    instructions="""You are a helpful customer service representative.
    Answer questions politely and use tools when needed to get accurate information.
    Always explain what you're doing when using tools.""",
    tools=[get_weather, get_order_status],
    model="gpt-4o-mini"
)

async def main():
    """Your main application logic - unchanged!"""

    # Test cases for evaluation
    test_cases = [
        "What's the weather like in San Francisco?",
        "Can you check the status of order #12345?",
        "I need help with my account",
        "What is your return policy?",
        "How do I cancel my subscription?"
    ]

    print("Running agent with evaluation data collection...")

    # Run your agents exactly as before - traces are automatically collected
    for i, user_input in enumerate(test_cases, 1):
        print(f"\n=== Test Case {i} ===")
        print(f"User: {user_input}")

        # Use trace context to group multi-step workflows
        from agents import trace
        with trace(f"Customer_Support_Case_{i}"):
            # Your normal agent execution - no changes!
            result = await Runner.run(customer_service_agent, user_input)
            print(f"Agent: {result.final_output}")

    # 3. EXPORT YOUR EVALUATION DATA
    print("\n=== Exporting Evaluation Data ===")
    master_file = collector.export_all_data()
    print(f"📊 All evaluation data saved to: {master_file}")

    # 4. ADD IDEAL ANSWERS (do this once after collecting data)
    from evaluation_trace_collector import add_ideal_answers_to_csv

    ideal_answers = {
        "What's the weather like in San Francisco?":
            "I'll check the current weather in San Francisco for you. [Uses weather tool] The weather in San Francisco is sunny and 72°F.",
        "Can you check the status of order #12345?":
            "I'll look up the status of order #12345 for you. [Uses order status tool] Order #12345 is shipped and will arrive tomorrow.",
        "I need help with my account":
            "I'd be happy to help you with your account. Could you please provide more details about what specific assistance you need?",
        "What is your return policy?":
            "Our return policy allows returns within 30 days of purchase. Items must be in original condition with receipt. Refunds are processed within 5-7 business days.",
        "How do I cancel my subscription?":
            "I can help you cancel your subscription. To ensure security, I'll need to verify your account first. Please provide your email address or account number."
    }

    updated_file = add_ideal_answers_to_csv(master_file, ideal_answers)
    print(f"📋 Updated CSV with ideal answers: {updated_file}")

    # Clean shutdown
    collector.shutdown()

    print("\n✅ Complete! Your evaluation data is ready for analysis.")
    print("\nNext steps:")
    print(f"1. Open {updated_file} in Excel/Google Sheets")
    print("2. Compare 'llm_output' vs 'ideal_answer' columns")
    print("3. Add evaluation scores and notes")
    print("4. Use data for training/fine-tuning improvements")

if __name__ == "__main__":
    asyncio.run(main())
```
### Step 3: Automatic Data Export

The system automatically exports evaluation data in CSV format after each trace completes 4. You get both individual trace files and a master file containing all collected data, perfect for spreadsheet analysis 4.
## Multi-Step Workflow Support

Your research emphasized the need to handle complex agent workflows with multiple LLM interactions 2. The implementation addresses this through several mechanisms:
- Workflow step tracking: each LLM call gets a sequential step number within the workflow
- Trace grouping: all steps from the same workflow share a common `trace_id`
- Parent-child relationships: spans maintain hierarchical relationships for complex flows
- Multi-step indicators: boolean flags identify workflows with multiple LLM interactions 4
For multi-step workflows, each LLM generation becomes a separate row in your CSV, but they're grouped by `trace_id` so you can analyze the complete workflow sequence 4 (see the sketch below).
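As a sketch of that grouping, with column names following the `EvaluationRecord` fields defined below and an example master-file name:

```python
import pandas as pd

df = pd.read_csv("eval_data/all_eval_data_20250601_150000.csv")  # example master file

# Replay each workflow in the order of its LLM calls
for trace_id, steps in df.sort_values("workflow_step_number").groupby("trace_id"):
    print(f"\nWorkflow {trace_id} ({steps['workflow_name'].iloc[0]}):")
    for _, step in steps.iterrows():
        print(f"  step {step['workflow_step_number']}: {str(step['llm_output'])[:80]!r}")
```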
## CSV Output Structure for Analysis

The exported CSV contains columns specifically designed for evaluation workflows 4:
| Column | Description | Usage |
|---|---|---|
| `user_input` | Original user question | Input for evaluation |
| `full_context` | Complete LLM context window | Full input analysis |
| `llm_output` | Agent's response | Actual output to evaluate |
| `ideal_answer` | Your hand-crafted response | Gold standard comparison |
| `evaluation_score` | Numeric evaluation rating | Scoring field |
| `workflow_step_number` | Step in multi-step flow | Workflow analysis |
| `trace_id` | Groups related interactions | Workflow grouping |
After collecting trace data, you can programmatically add your hand-crafted ideal answers to the CSV files 4. The implementation includes helper functions to map user inputs to ideal responses, creating the complete evaluation dataset you need.
## Integration with Evaluation Tools

The CSV format works seamlessly with various analysis approaches 4:
- Spreadsheet Analysis: Direct import into Excel, Google Sheets, or similar tools
- LLM-as-Judge Evaluation: Structured format for automated evaluation scripts
- Custom Analytics: Compatible with pandas, SQL databases, and data visualization tools
- OpenAI Evaluation Suite: Ready for integration with OpenAI's evaluation frameworks
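For the LLM-as-judge route, a minimal sketch over the exported CSV; the model name, prompt wording, 1-5 scale, and filename are assumptions rather than part of the implementation below:

```python
import pandas as pd
from openai import OpenAI

client = OpenAI()
df = pd.read_csv("eval_data/all_eval_data_with_ideals.csv")  # example filename

def judge(row) -> str:
    prompt = (
        "Rate how well the actual answer matches the ideal answer on a 1-5 scale. "
        "Reply with the number only.\n\n"
        f"Question: {row['user_input']}\n"
        f"Ideal answer: {row['ideal_answer']}\n"
        f"Actual answer: {row['llm_output']}\n"
    )
    resp = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
    )
    return resp.choices[0].message.content.strip()

df["evaluation_score"] = df.apply(judge, axis=1)
df.to_csv("eval_data/all_eval_data_scored.csv", index=False)
```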
## Technical Implementation Details

The core implementation uses a custom `TracingProcessor` class that intercepts trace events from the OpenAI Agents SDK 26. The processor specifically targets `GenerationSpanData` events, which contain the LLM input/output pairs you need for evaluations 2.
```python
import json
import csv
import datetime
from typing import Dict, List, Any, Optional
from pathlib import Path
from dataclasses import dataclass, asdict
import pandas as pd

from agents import Agent, Runner, add_trace_processor
from agents.tracing.processors import TracingProcessor
from agents.tracing.types import Trace, Span

@dataclass
class EvaluationRecord:
    """
    Data structure for evaluation records - exactly what you need for evals
    """
    # Core identifiers
    trace_id: str
    span_id: str
    workflow_name: str
    timestamp: str
# Input data (everything that went into the context window)
agent_instructions: str
user_input: str
conversation_history: List[Dict[str, str]]
tools_available: List[str]
model_name: str
full_context: str # Complete context window content
# Output data (LLM response)
llm_output: str
tool_calls: List[Dict[str, Any]]
finish_reason: str
# Evaluation fields (to be filled later)
ideal_answer: str = "" # Hand-crafted ideal response
evaluation_score: Optional[float] = None
evaluation_notes: str = ""
# Workflow tracking
parent_span_id: Optional[str] = None
workflow_step_number: int = 1
is_multi_step_workflow: bool = False
# Performance metrics
latency_ms: Optional[float] = None
tokens_used: Optional[int] = None
cost_estimate: Optional[float] = None
class EvaluationDataCollector(TracingProcessor):
    """
    Custom trace processor to collect data for evaluations in CSV format
    """
def __init__(self, output_dir: str = "eval_data"):
self.output_dir = Path(output_dir)
self.output_dir.mkdir(exist_ok=True)
# Store evaluation records
self.evaluation_records: List[EvaluationRecord] = []
# Track workflow context
self.workflow_contexts: Dict[str, Dict] = {}
self.step_counters: Dict[str, int] = {}
def on_trace_start(self, trace: Trace) -> None:
"""Called when a new trace starts"""
self.workflow_contexts[trace.trace_id] = {
'workflow_name': trace.workflow_name,
'started_at': trace.started_at,
'metadata': trace.metadata or {}
}
self.step_counters[trace.trace_id] = 0
def on_trace_end(self, trace: Trace) -> None:
"""Called when a trace completes - save data to CSV"""
if trace.trace_id in self.workflow_contexts:
# Save records for this trace to CSV
self._save_trace_records(trace.trace_id)
# Clean up
del self.workflow_contexts[trace.trace_id]
if trace.trace_id in self.step_counters:
del self.step_counters[trace.trace_id]
def on_span_start(self, span: Span[Any]) -> None:
"""Called when a span starts"""
pass
def on_span_end(self, span: Span[Any]) -> None:
"""Called when a span ends - extract evaluation data"""
# Only process LLM generation spans for evaluations
if span.span_data.__class__.__name__ == 'GenerationSpanData':
self._process_generation_span(span)
# Track agent spans for context
elif span.span_data.__class__.__name__ == 'AgentSpanData':
self._process_agent_span(span)
def _process_generation_span(self, span: Span) -> None:
"""Extract LLM input/output for evaluation"""
span_data = span.span_data
trace_id = span.trace_id
# Increment step counter
self.step_counters[trace_id] = self.step_counters.get(trace_id, 0) + 1
# Extract input context (everything that went to the LLM)
messages = getattr(span_data, 'messages', [])
full_context = self._build_full_context(messages)
# Extract agent instructions from the system message
agent_instructions = ""
conversation_history = []
user_input = ""
for msg in messages:
if msg.get('role') == 'system':
agent_instructions = msg.get('content', '')
elif msg.get('role') == 'user':
user_input = msg.get('content', '')
conversation_history.append({
'role': 'user',
'content': msg.get('content', '')
})
elif msg.get('role') == 'assistant':
conversation_history.append({
'role': 'assistant',
'content': msg.get('content', '')
})
# Extract output data
llm_output = ""
tool_calls = []
finish_reason = ""
if hasattr(span_data, 'response') and span_data.response:
response = span_data.response
if hasattr(response, 'choices') and response.choices:
choice = response.choices[0]
llm_output = getattr(choice.message, 'content', '') or ""
tool_calls = getattr(choice.message, 'tool_calls', []) or []
finish_reason = getattr(choice, 'finish_reason', '')
# Extract performance metrics
latency_ms = None
if span.ended_at and span.started_at:
latency_ms = (span.ended_at - span.started_at).total_seconds() * 1000
tokens_used = getattr(getattr(span_data, 'response', None), 'usage', {}).get('total_tokens')
# Determine workflow context
workflow_context = self.workflow_contexts.get(trace_id, {})
workflow_name = workflow_context.get('workflow_name', 'Unknown')
is_multi_step = self.step_counters[trace_id] > 1
# Create evaluation record
eval_record = EvaluationRecord(
trace_id=trace_id,
span_id=span.span_id,
workflow_name=workflow_name,
timestamp=datetime.datetime.now().isoformat(),
# Input data
agent_instructions=agent_instructions,
user_input=user_input,
conversation_history=conversation_history,
tools_available=self._extract_available_tools(span_data),
model_name=getattr(span_data, 'model', 'unknown'),
full_context=full_context,
# Output data
llm_output=llm_output,
tool_calls=[self._serialize_tool_call(tc) for tc in tool_calls],
finish_reason=finish_reason,
# Workflow tracking
parent_span_id=span.parent_id,
workflow_step_number=self.step_counters[trace_id],
is_multi_step_workflow=is_multi_step,
# Performance metrics
latency_ms=latency_ms,
tokens_used=tokens_used,
cost_estimate=self._estimate_cost(tokens_used, getattr(span_data, 'model', ''))
)
self.evaluation_records.append(eval_record)
def _process_agent_span(self, span: Span) -> None:
"""Track agent context for richer evaluation data"""
# Can be used to capture additional agent context if needed
pass
def _build_full_context(self, messages: List[Dict]) -> str:
"""Build the complete context that was sent to the LLM"""
context_parts = []
for msg in messages:
role = msg.get('role', 'unknown')
content = msg.get('content', '')
context_parts.append(f"{role.upper()}: {content}")
return "\n\n".join(context_parts)
def _extract_available_tools(self, span_data) -> List[str]:
"""Extract list of tools available to the agent"""
tools = getattr(span_data, 'tools', []) or []
return [tool.get('function', {}).get('name', 'unknown') for tool in tools]
def _serialize_tool_call(self, tool_call) -> Dict[str, Any]:
"""Serialize tool call for CSV storage"""
if hasattr(tool_call, 'function'):
return {
'name': tool_call.function.name,
'arguments': tool_call.function.arguments
}
return {'name': 'unknown', 'arguments': '{}'}
def _estimate_cost(self, tokens: Optional[int], model: str) -> Optional[float]:
"""Estimate cost based on token usage and model"""
if not tokens:
return None
# Simplified cost estimation (update with current pricing)
cost_per_1k_tokens = {
'gpt-4o': 0.005,
'gpt-4o-mini': 0.0015,
'gpt-3.5-turbo': 0.001,
}
for model_key, cost in cost_per_1k_tokens.items():
if model_key in model.lower():
return (tokens / 1000) * cost
return None
def _save_trace_records(self, trace_id: str) -> None:
"""Save evaluation records for a specific trace to CSV"""
# Filter records for this trace
trace_records = [r for r in self.evaluation_records if r.trace_id == trace_id]
if not trace_records:
return
# Create filename with timestamp and trace ID
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"eval_data_{timestamp}_{trace_id[:8]}.csv"
filepath = self.output_dir / filename
# Convert to DataFrame for easy CSV export
records_dict = [asdict(record) for record in trace_records]
# Flatten complex fields for CSV
flattened_records = []
for record in records_dict:
flattened = record.copy()
# Convert list/dict fields to JSON strings for CSV
flattened['conversation_history'] = json.dumps(record['conversation_history'])
flattened['tools_available'] = json.dumps(record['tools_available'])
flattened['tool_calls'] = json.dumps(record['tool_calls'])
flattened_records.append(flattened)
# Save to CSV
df = pd.DataFrame(flattened_records)
df.to_csv(filepath, index=False)
print(f"Saved {len(trace_records)} evaluation records to {filepath}")
# Also save a human-readable JSON version
json_filepath = filepath.with_suffix('.json')
with open(json_filepath, 'w') as f:
json.dump(records_dict, f, indent=2, default=str)
print(f"Saved detailed JSON to {json_filepath}")
def export_all_data(self) -> str:
"""Export all collected data to a master CSV file"""
if not self.evaluation_records:
print("No evaluation records to export")
return ""
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
master_filepath = self.output_dir / f"all_eval_data_{timestamp}.csv"
# Convert all records to DataFrame
records_dict = [asdict(record) for record in self.evaluation_records]
# Flatten for CSV
flattened_records = []
for record in records_dict:
flattened = record.copy()
flattened['conversation_history'] = json.dumps(record['conversation_history'])
flattened['tools_available'] = json.dumps(record['tools_available'])
flattened['tool_calls'] = json.dumps(record['tool_calls'])
flattened_records.append(flattened)
df = pd.DataFrame(flattened_records)
df.to_csv(master_filepath, index=False)
print(f"Exported {len(self.evaluation_records)} total records to {master_filepath}")
return str(master_filepath)
def shutdown(self) -> None:
"""Clean shutdown - export any remaining data"""
self.export_all_data()
def setup_evaluation_collection():
    """Set up the evaluation data collection system"""
# Create the collector
collector = EvaluationDataCollector(output_dir="eval_data")
# Add it as a trace processor (in addition to OpenAI's default)
add_trace_processor(collector)
return collector
async def example_agent_run_with_evaluation():
    """Example of how to run agents with evaluation data collection"""
# Set up evaluation collection
collector = setup_evaluation_collection()
# Create your agent
agent = Agent(
name="Customer Service Agent",
instructions="""You are a helpful customer service agent.
Answer questions politely and provide accurate information.
If you need to use tools, explain what you're doing.""",
model="gpt-4o-mini"
)
# Example inputs for evaluation
test_inputs = [
"What is your return policy?",
"I have a problem with my recent order #12345",
"Can you help me track my shipment?",
"I want to cancel my subscription"
]
# Run each test input
for i, user_input in enumerate(test_inputs, 1):
# Use trace context to group related runs
from agents import trace
with trace(f"Customer_Service_Test_{i}"):
result = await Runner.run(agent, user_input)
print(f"Test {i} - Input: {user_input}")
print(f"Output: {result.final_output}")
print("-" * 50)
# Export all collected data
master_file = collector.export_all_data()
print(f"All evaluation data saved to: {master_file}")
# Clean shutdown
collector.shutdown()
return master_file
def add_ideal_answers_to_csv(csv_filepath: str, ideal_answers: Dict[str, str]) -> str:
    """
    Add ideal answers to your evaluation CSV
Args:
csv_filepath: Path to the evaluation CSV
ideal_answers: Dict mapping user_input to ideal_answer
"""
df = pd.read_csv(csv_filepath)
# Add ideal answers based on user input
df['ideal_answer'] = df['user_input'].map(ideal_answers).fillna("")
# Save updated CSV
updated_filepath = csv_filepath.replace('.csv', '_with_ideals.csv')
df.to_csv(updated_filepath, index=False)
print(f"Updated CSV with ideal answers: {updated_filepath}")
return updated_filepath
example_ideal_answers = {
    "What is your return policy?": """Our return policy allows returns within 30 days of purchase. Items must be in original condition with tags attached. Refunds are processed within 5-7 business days.""",
"I have a problem with my recent order #12345": """I'd be happy to help with your order.
Let me look up order #12345 for you. Can you please provide your email address
associated with the order so I can access the details?""",
"Can you help me track my shipment?": """I can definitely help you track your shipment.
Please provide your order number or tracking number, and I'll get you the latest
shipping information.""",
"I want to cancel my subscription": """I can help you cancel your subscription.
To ensure I'm accessing the correct account, please verify your email address.
Note that cancellation will take effect at the end of your current billing cycle."""
}
```
The system maintains workflow context across multiple agent interactions, automatically handling the complexity of multi-step flows while providing clean, analyzable output 5.
## Key Benefits of This Approach

Based on your research, this implementation provides several critical advantages 2:
- Zero Code Disruption: Your existing agent implementation requires no modifications
- Complete Data Capture: Captures 100% of agent interactions with full context
- Evaluation-Ready Format: Direct export to spreadsheet-compatible CSV files
- Local Data Control: No dependency on external services or dashboard access
- Cost-Free Operation: No additional licensing or service fees
- Workflow Intelligence: Automatic handling of complex multi-step agent flows
To implement this solution in your project, follow the complete setup guide and use the provided example integration code 3. The implementation captures exactly what you need for thorough agent evaluation while maintaining the simplicity and performance of your existing codebase.
README.md follows:
This implementation automatically collects traces from your OpenAI Agents SDK applications for evaluation purposes.
- Install dependencies:
  ```bash
  pip install -r requirements.txt
  ```
- Add evaluation collection to your app:
  ```python
  from evaluation_trace_collector import setup_evaluation_collection

  collector = setup_evaluation_collection()
  ```
- Run your agents normally - evaluation data is automatically collected!
- Export and analyze:
  ```python
  master_file = collector.export_all_data()
  # Open the CSV in Excel/Google Sheets for evaluation
  ```
- `evaluation_trace_collector.py` - Main implementation
- `example_integration.py` - Shows how to integrate with existing code
- `trace-collection-guide.md` - Complete usage guide
- `requirements.txt` - Python dependencies
Perfect CSV format for evaluations with:
- Complete LLM input context
- LLM outputs
- Fields for your ideal answers
- Performance metrics
- Multi-step workflow support
Ready for analysis in any spreadsheet or evaluation tool!
trace-collection-guide.md Follows:
Based on your research, here's a complete, actionable plan to collect traces from your OpenAI Agents SDK Python project for evaluation purposes.
```
# 1. Install required packages
pip install openai-agents pandas

# 2. Set up your trace collector (add this to your main app)
from evaluation_trace_collector import setup_evaluation_collection

# Initialize evaluation data collection
collector = setup_evaluation_collection()

# 3. Run your agents normally - traces are automatically collected!
# Your existing agent code doesn't need to change
```
The implementation captures exactly what you need for evaluations:
- Agent instructions
- User input
- Conversation history
- Available tools
- Model name
- Full context window content (complete prompt sent to LLM)
- LLM response text
- Tool calls made
- Finish reason
- Performance metrics (latency, tokens, cost)
Each row contains one LLM interaction with columns for:
- `user_input` - What the user asked
- `full_context` - Complete context sent to LLM
- `llm_output` - What the LLM responded
- `ideal_answer` - Empty field for your hand-crafted ideal responses
- Plus metadata (trace_id, timestamps, performance metrics)
For agents with multiple LLM calls in one workflow:
- Each LLM call gets its own row
- `workflow_step_number` tracks the sequence
- `trace_id` groups all steps from same workflow
- `is_multi_step_workflow` flag identifies complex flows
```python
# Your existing agent code with trace collection added
collector = setup_evaluation_collection()

# Run your agents normally
result = await Runner.run(agent, "User question here")

# Data is automatically saved to CSV files in eval_data/ folder
```
```python
# Define what the perfect response should be
ideal_answers = {
    "What is your return policy?": "Our return policy allows...",
    "How do I track my order?": "You can track your order by...",
    # Add more ideal responses
}

# Add them to your CSV
updated_csv = add_ideal_answers_to_csv("eval_data/your_file.csv", ideal_answers)
```
Open the CSV in Excel, Google Sheets, or your analytics tool. You have:
- Column A: User input
- Column B: Full LLM context
- Column C: Actual LLM output
- Column D: Your ideal answer
- Columns E+: Metadata, performance metrics
Perfect for:
- Manual evaluation scoring
- LLM-as-judge evaluation
- Performance analysis
- Error pattern identification
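For example, a small pandas pass that adds a quick exact-match signal and surfaces error patterns; the column names come from the collector above and the filename is an example:

```python
import pandas as pd

df = pd.read_csv("eval_data/your_file_with_ideals.csv")  # example filename

# Quick exact-match pass as a first scoring signal
df["exact_match"] = (
    df["llm_output"].fillna("").str.strip().str.lower()
    == df["ideal_answer"].fillna("").str.strip().str.lower()
)

# Error-pattern view: which workflows and steps miss most often?
summary = (
    df.groupby(["workflow_name", "workflow_step_number"])["exact_match"]
    .mean()
    .sort_values()
)
print(summary.head(10))
```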
```python
# Extend the EvaluationRecord dataclass to add your own fields
@dataclass
class MyEvaluationRecord(EvaluationRecord):
    custom_score: float = 0.0
    category: str = ""
    complexity_level: int = 1
```
The CSV format works directly with:
- OpenAI's evaluation tools
- LangSmith evaluation
- Custom evaluation scripts
- Any spreadsheet software
```python
# Export all data at once
master_file = collector.export_all_data()

# Or export per trace automatically
# (happens automatically when each trace completes)
```
```
eval_data/
├── eval_data_20250601_143022_1a2b3c4d.csv   # Individual trace files
├── eval_data_20250601_143022_1a2b3c4d.json  # Detailed JSON version
├── eval_data_20250601_143055_5e6f7g8h.csv   # Another trace
└── all_eval_data_20250601_150000.csv        # Master file with everything
```
- Zero Code Changes: Your existing agent code doesn't need modification
- Complete Context: Captures everything that went into the LLM context window
- Evaluation Ready: Perfect format for manual or automated evaluation
- Workflow Aware: Handles complex multi-step agent workflows
- Analytics Friendly: CSV format works with any analysis tool
- Performance Tracking: Built-in latency, token, and cost tracking
- Dashboard: Good for debugging, can't export data programmatically
- This Solution: Gives you complete local control over your evaluation data
- Coverage: Captures 100% of your interactions vs. relying on dashboard access
This implementation is based on the latest OpenAI Agents SDK tracing capabilities and gives you exactly what you need for comprehensive evaluation workflows.
This approach transforms your OpenAI Agents SDK project into a comprehensive evaluation data collection system, giving you the structured, analyzable datasets needed for rigorous agent performance assessment and improvement.
Citations: 1 https://ppl-ai-file-upload.s3.amazonaws.com/web/direct-files/attachments/1696097/4fd7a056-b954-46f0-9328-9edc5b815fed/paste-2.txt 2 https://openai.github.io/openai-agents-python/tracing/ 3 https://openai.github.io/openai-agents-python/ 4 https://github.com/openai/openai-agents-python 5 https://docs.smith.langchain.com/observability/how_to_guides/trace_with_openai_agents_sdk 6 https://openai.github.io/openai-agents-python/ref/tracing/setup/ 7 https://aws.amazon.com/blogs/dotnet/developing-custom-processors-using-opentelemetry-in-net-8/ [8] https://techdocs.broadcom.com/us/en/ca-mainframe-software/automation/ca-workload-automation-esp-edition/12-0/reference/initialization-parameters/trace-activate-trace-mechanism.html [9] https://getstream.io/blog/local-openai-agents/ [10] https://github.com/openai/openai-agents-python/blob/main/docs/tracing.md [11] https://docs.arize.com/phoenix/tracing/integrations-tracing/openai-agents-sdk [12] https://docs.smith.langchain.com/reference/python/wrappers/langsmith.wrappers._openai_agents.OpenAIAgentsTracingProcessor [13] https://weave-docs.wandb.ai/guides/integrations/openai_agents/ [14] https://doc.rust-lang.org/beta/nightly-rustc/rustc_span/struct.Span.html [15] https://cran.r-project.org/web/packages/ganGenerativeData/ganGenerativeData.pdf [16] https://mlflow.org/docs/latest/tracing/integrations/openai-agent [17] https://edgedelta.com/company/blog/what-are-spans-in-distributed-tracing [18] https://community.openai.com/t/how-can-i-go-about-saving-output-in-csv/422152 [19] https://community.openai.com/t/load-a-csv-file-for-various-independent-queries-via-api/1112775 [20] https://docs.usagepanda.com/usecases/export-openai-requests-to-csv.html [21] https://www.youtube.com/watch?v=VVdzQs-FeHE [22] https://chromewebstore.google.com/detail/openai-api-usage-data-exp/afpkbnmljkkbmhfpcchlilcgnikdpemj?authuser=2 [23] https://docs.webmethods.io/on-premises/webmethods-broker/en/10.5.0/broker-webhelp/pif-webhelp/to-using_doc_trace_21.html [24] https://www.enghouseinteractive.com/products/insights-and-analytics/automated-agent-evaluation/ [25] https://python.langchain.com/v0.1/docs/integrations/toolkits/csv/ [26] https://android.googlesource.com/platform/external/perfetto/+/refs/heads/master-cuttlefish-testing-release/docs/trace-processor.md [27] https://opentelemetry-python.readthedocs.io/en/latest/sdk/trace.html [28] https://openai.github.io/openai-agents-python/ref/tracing/span_data/ [29] https://copper-mountain-technologies.gitbooks.io/vna-for-smarties/content/analyzer_data_output/trace_data_csv_file.html [30] https://wandb.ai/onlineinference/genai-research/reports/AI-agent-evaluation-Metrics-strategies-and-best-practices--VmlldzoxMjM0NjQzMQ
Answer from Perplexity: pplx.ai/share