Add OpenTelemetry observability with local JSONL traces (#347)
Co-authored-by: 0xallam <ahmed39652003@gmail.com>
This commit is contained in:
@@ -47,6 +47,11 @@ class Config:
|
||||
|
||||
# Telemetry
|
||||
strix_telemetry = "1"
|
||||
strix_otel_telemetry = None
|
||||
strix_posthog_telemetry = None
|
||||
traceloop_base_url = None
|
||||
traceloop_api_key = None
|
||||
traceloop_headers = None
|
||||
|
||||
# Config file override (set via --config CLI arg)
|
||||
_config_file_override: Path | None = None
|
||||
|
||||
@@ -413,8 +413,6 @@ def display_completion_message(args: argparse.Namespace, results_path: Path) ->
|
||||
if tracer and tracer.scan_results:
|
||||
scan_completed = tracer.scan_results.get("scan_completed", False)
|
||||
|
||||
has_vulnerabilities = tracer and len(tracer.vulnerability_reports) > 0
|
||||
|
||||
completion_text = Text()
|
||||
if scan_completed:
|
||||
completion_text.append("Penetration test completed", style="bold #22c55e")
|
||||
@@ -439,13 +437,12 @@ def display_completion_message(args: argparse.Namespace, results_path: Path) ->
|
||||
if stats_text.plain:
|
||||
panel_parts.extend(["\n", stats_text])
|
||||
|
||||
if scan_completed or has_vulnerabilities:
|
||||
results_text = Text()
|
||||
results_text.append("\n")
|
||||
results_text.append("Output", style="dim")
|
||||
results_text.append(" ")
|
||||
results_text.append(str(results_path), style="#60a5fa")
|
||||
panel_parts.extend(["\n", results_text])
|
||||
results_text = Text()
|
||||
results_text.append("\n")
|
||||
results_text.append("Output", style="dim")
|
||||
results_text.append(" ")
|
||||
results_text.append(str(results_path), style="#60a5fa")
|
||||
panel_parts.extend(["\n", results_text])
|
||||
|
||||
panel_content = Text.assemble(*panel_parts)
|
||||
|
||||
|
||||
23
strix/telemetry/flags.py
Normal file
23
strix/telemetry/flags.py
Normal file
@@ -0,0 +1,23 @@
|
||||
from strix.config import Config
|
||||
|
||||
|
||||
_DISABLED_VALUES = {"0", "false", "no", "off"}
|
||||
|
||||
|
||||
def _is_enabled(raw_value: str | None, default: str = "1") -> bool:
|
||||
value = (raw_value if raw_value is not None else default).strip().lower()
|
||||
return value not in _DISABLED_VALUES
|
||||
|
||||
|
||||
def is_otel_enabled() -> bool:
|
||||
explicit = Config.get("strix_otel_telemetry")
|
||||
if explicit is not None:
|
||||
return _is_enabled(explicit)
|
||||
return _is_enabled(Config.get("strix_telemetry"), default="1")
|
||||
|
||||
|
||||
def is_posthog_enabled() -> bool:
|
||||
explicit = Config.get("strix_posthog_telemetry")
|
||||
if explicit is not None:
|
||||
return _is_enabled(explicit)
|
||||
return _is_enabled(Config.get("strix_telemetry"), default="1")
|
||||
@@ -6,7 +6,7 @@ from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Any
|
||||
from uuid import uuid4
|
||||
|
||||
from strix.config import Config
|
||||
from strix.telemetry.flags import is_posthog_enabled
|
||||
|
||||
|
||||
if TYPE_CHECKING:
|
||||
@@ -19,7 +19,7 @@ _SESSION_ID = uuid4().hex[:16]
|
||||
|
||||
|
||||
def _is_enabled() -> bool:
|
||||
return (Config.get("strix_telemetry") or "1").lower() not in ("0", "false", "no", "off")
|
||||
return is_posthog_enabled()
|
||||
|
||||
|
||||
def _is_first_run() -> bool:
|
||||
|
||||
@@ -1,20 +1,40 @@
|
||||
import json
|
||||
import logging
|
||||
import threading
|
||||
from datetime import UTC, datetime
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Any, Optional
|
||||
from typing import Any, Callable, Optional
|
||||
from uuid import uuid4
|
||||
|
||||
from opentelemetry import trace
|
||||
from opentelemetry.trace import SpanContext, SpanKind
|
||||
|
||||
from strix.config import Config
|
||||
from strix.telemetry import posthog
|
||||
from strix.telemetry.flags import is_otel_enabled
|
||||
from strix.telemetry.utils import (
|
||||
TelemetrySanitizer,
|
||||
append_jsonl_record,
|
||||
bootstrap_otel,
|
||||
format_span_id,
|
||||
format_trace_id,
|
||||
get_events_write_lock,
|
||||
)
|
||||
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Callable
|
||||
try:
|
||||
from traceloop.sdk import Traceloop
|
||||
except ImportError: # pragma: no cover - exercised when dependency is absent
|
||||
Traceloop = None # type: ignore[assignment,unused-ignore]
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_global_tracer: Optional["Tracer"] = None
|
||||
|
||||
_OTEL_BOOTSTRAP_LOCK = threading.Lock()
|
||||
_OTEL_BOOTSTRAPPED = False
|
||||
_OTEL_REMOTE_ENABLED = False
|
||||
|
||||
def get_global_tracer() -> Optional["Tracer"]:
|
||||
return _global_tracer
|
||||
@@ -52,16 +72,225 @@ class Tracer:
|
||||
"status": "running",
|
||||
}
|
||||
self._run_dir: Path | None = None
|
||||
self._events_file_path: Path | None = None
|
||||
self._next_execution_id = 1
|
||||
self._next_message_id = 1
|
||||
self._saved_vuln_ids: set[str] = set()
|
||||
self._run_completed_emitted = False
|
||||
self._telemetry_enabled = is_otel_enabled()
|
||||
self._sanitizer = TelemetrySanitizer()
|
||||
|
||||
self._otel_tracer: Any = None
|
||||
self._remote_export_enabled = False
|
||||
|
||||
self.caido_url: str | None = None
|
||||
self.vulnerability_found_callback: Callable[[dict[str, Any]], None] | None = None
|
||||
|
||||
self._setup_telemetry()
|
||||
self._emit_run_started_event()
|
||||
|
||||
@property
|
||||
def events_file_path(self) -> Path:
|
||||
if self._events_file_path is None:
|
||||
self._events_file_path = self.get_run_dir() / "events.jsonl"
|
||||
return self._events_file_path
|
||||
|
||||
def _active_events_file_path(self) -> Path:
|
||||
active = get_global_tracer()
|
||||
if active and active._events_file_path is not None:
|
||||
return active._events_file_path
|
||||
return self.events_file_path
|
||||
|
||||
def _get_events_write_lock(self, output_path: Path | None = None) -> threading.Lock:
|
||||
path = output_path or self.events_file_path
|
||||
return get_events_write_lock(path)
|
||||
|
||||
def _active_run_metadata(self) -> dict[str, Any]:
|
||||
active = get_global_tracer()
|
||||
if active:
|
||||
return active.run_metadata
|
||||
return self.run_metadata
|
||||
|
||||
def _setup_telemetry(self) -> None:
|
||||
global _OTEL_BOOTSTRAPPED, _OTEL_REMOTE_ENABLED
|
||||
|
||||
if not self._telemetry_enabled:
|
||||
self._otel_tracer = None
|
||||
self._remote_export_enabled = False
|
||||
return
|
||||
|
||||
run_dir = self.get_run_dir()
|
||||
self._events_file_path = run_dir / "events.jsonl"
|
||||
base_url = (Config.get("traceloop_base_url") or "").strip()
|
||||
api_key = (Config.get("traceloop_api_key") or "").strip()
|
||||
headers_raw = Config.get("traceloop_headers") or ""
|
||||
|
||||
(
|
||||
self._otel_tracer,
|
||||
self._remote_export_enabled,
|
||||
_OTEL_BOOTSTRAPPED,
|
||||
_OTEL_REMOTE_ENABLED,
|
||||
) = bootstrap_otel(
|
||||
bootstrapped=_OTEL_BOOTSTRAPPED,
|
||||
remote_enabled_state=_OTEL_REMOTE_ENABLED,
|
||||
bootstrap_lock=_OTEL_BOOTSTRAP_LOCK,
|
||||
traceloop=Traceloop,
|
||||
base_url=base_url,
|
||||
api_key=api_key,
|
||||
headers_raw=headers_raw,
|
||||
output_path_getter=self._active_events_file_path,
|
||||
run_metadata_getter=self._active_run_metadata,
|
||||
sanitizer=self._sanitize_data,
|
||||
write_lock_getter=self._get_events_write_lock,
|
||||
tracer_name="strix.telemetry.tracer",
|
||||
)
|
||||
|
||||
def _set_association_properties(self, properties: dict[str, Any]) -> None:
|
||||
if Traceloop is None:
|
||||
return
|
||||
sanitized = self._sanitize_data(properties)
|
||||
try:
|
||||
Traceloop.set_association_properties(sanitized)
|
||||
except Exception: # noqa: BLE001
|
||||
logger.debug("Failed to set Traceloop association properties")
|
||||
|
||||
def _sanitize_data(self, data: Any, key_hint: str | None = None) -> Any:
|
||||
return self._sanitizer.sanitize(data, key_hint=key_hint)
|
||||
|
||||
def _append_event_record(self, record: dict[str, Any]) -> None:
|
||||
try:
|
||||
append_jsonl_record(self.events_file_path, record)
|
||||
except OSError:
|
||||
logger.exception("Failed to append JSONL event record")
|
||||
|
||||
def _enrich_actor(self, actor: dict[str, Any] | None) -> dict[str, Any] | None:
|
||||
if not actor:
|
||||
return None
|
||||
|
||||
enriched = dict(actor)
|
||||
if "agent_name" in enriched:
|
||||
return enriched
|
||||
|
||||
agent_id = enriched.get("agent_id")
|
||||
if not isinstance(agent_id, str):
|
||||
return enriched
|
||||
|
||||
agent_data = self.agents.get(agent_id, {})
|
||||
agent_name = agent_data.get("name")
|
||||
if isinstance(agent_name, str) and agent_name:
|
||||
enriched["agent_name"] = agent_name
|
||||
|
||||
return enriched
|
||||
|
||||
def _emit_event(
|
||||
self,
|
||||
event_type: str,
|
||||
actor: dict[str, Any] | None = None,
|
||||
payload: Any | None = None,
|
||||
status: str | None = None,
|
||||
error: Any | None = None,
|
||||
source: str = "strix.tracer",
|
||||
include_run_metadata: bool = False,
|
||||
) -> None:
|
||||
if not self._telemetry_enabled:
|
||||
return
|
||||
|
||||
enriched_actor = self._enrich_actor(actor)
|
||||
sanitized_actor = self._sanitize_data(enriched_actor) if enriched_actor else None
|
||||
sanitized_payload = self._sanitize_data(payload) if payload is not None else None
|
||||
sanitized_error = self._sanitize_data(error) if error is not None else None
|
||||
|
||||
trace_id: str | None = None
|
||||
span_id: str | None = None
|
||||
parent_span_id: str | None = None
|
||||
|
||||
current_context = trace.get_current_span().get_span_context()
|
||||
if isinstance(current_context, SpanContext) and current_context.is_valid:
|
||||
parent_span_id = format_span_id(current_context.span_id)
|
||||
|
||||
if self._otel_tracer is not None:
|
||||
try:
|
||||
with self._otel_tracer.start_as_current_span(
|
||||
f"strix.{event_type}",
|
||||
kind=SpanKind.INTERNAL,
|
||||
) as span:
|
||||
span_context = span.get_span_context()
|
||||
trace_id = format_trace_id(span_context.trace_id)
|
||||
span_id = format_span_id(span_context.span_id)
|
||||
|
||||
span.set_attribute("strix.event_type", event_type)
|
||||
span.set_attribute("strix.source", source)
|
||||
span.set_attribute("strix.run_id", self.run_id)
|
||||
span.set_attribute("strix.run_name", self.run_name or "")
|
||||
|
||||
if status:
|
||||
span.set_attribute("strix.status", status)
|
||||
if sanitized_actor is not None:
|
||||
span.set_attribute(
|
||||
"strix.actor",
|
||||
json.dumps(sanitized_actor, ensure_ascii=False),
|
||||
)
|
||||
if sanitized_payload is not None:
|
||||
span.set_attribute(
|
||||
"strix.payload",
|
||||
json.dumps(sanitized_payload, ensure_ascii=False),
|
||||
)
|
||||
if sanitized_error is not None:
|
||||
span.set_attribute(
|
||||
"strix.error",
|
||||
json.dumps(sanitized_error, ensure_ascii=False),
|
||||
)
|
||||
except Exception: # noqa: BLE001
|
||||
logger.debug("Failed to create OTEL span for event type '%s'", event_type)
|
||||
|
||||
if trace_id is None:
|
||||
trace_id = format_trace_id(uuid4().int & ((1 << 128) - 1)) or uuid4().hex
|
||||
if span_id is None:
|
||||
span_id = format_span_id(uuid4().int & ((1 << 64) - 1)) or uuid4().hex[:16]
|
||||
|
||||
record = {
|
||||
"timestamp": datetime.now(UTC).isoformat(),
|
||||
"event_type": event_type,
|
||||
"run_id": self.run_id,
|
||||
"trace_id": trace_id,
|
||||
"span_id": span_id,
|
||||
"parent_span_id": parent_span_id,
|
||||
"actor": sanitized_actor,
|
||||
"payload": sanitized_payload,
|
||||
"status": status,
|
||||
"error": sanitized_error,
|
||||
"source": source,
|
||||
}
|
||||
if include_run_metadata:
|
||||
record["run_metadata"] = self._sanitize_data(self.run_metadata)
|
||||
self._append_event_record(record)
|
||||
|
||||
def set_run_name(self, run_name: str) -> None:
|
||||
self.run_name = run_name
|
||||
self.run_id = run_name
|
||||
self.run_metadata["run_name"] = run_name
|
||||
self.run_metadata["run_id"] = run_name
|
||||
self._run_dir = None
|
||||
self._events_file_path = None
|
||||
self._run_completed_emitted = False
|
||||
self._set_association_properties({"run_id": self.run_id, "run_name": self.run_name or ""})
|
||||
self._emit_run_started_event()
|
||||
|
||||
def _emit_run_started_event(self) -> None:
|
||||
if not self._telemetry_enabled:
|
||||
return
|
||||
|
||||
self._emit_event(
|
||||
"run.started",
|
||||
payload={
|
||||
"run_name": self.run_name,
|
||||
"start_time": self.start_time,
|
||||
"local_jsonl_path": str(self.events_file_path),
|
||||
"remote_export_enabled": self._remote_export_enabled,
|
||||
},
|
||||
status="running",
|
||||
include_run_metadata=True,
|
||||
)
|
||||
|
||||
def get_run_dir(self) -> Path:
|
||||
if self._run_dir is None:
|
||||
@@ -134,6 +363,12 @@ class Tracer:
|
||||
self.vulnerability_reports.append(report)
|
||||
logger.info(f"Added vulnerability report: {report_id} - {title}")
|
||||
posthog.finding(severity)
|
||||
self._emit_event(
|
||||
"finding.created",
|
||||
payload={"report": report},
|
||||
status=report["severity"],
|
||||
source="strix.findings",
|
||||
)
|
||||
|
||||
if self.vulnerability_found_callback:
|
||||
self.vulnerability_found_callback(report)
|
||||
@@ -178,11 +413,24 @@ class Tracer:
|
||||
"""
|
||||
|
||||
logger.info("Updated scan final fields")
|
||||
self._emit_event(
|
||||
"finding.reviewed",
|
||||
payload={
|
||||
"scan_completed": True,
|
||||
"vulnerability_count": len(self.vulnerability_reports),
|
||||
},
|
||||
status="completed",
|
||||
source="strix.findings",
|
||||
)
|
||||
self.save_run_data(mark_complete=True)
|
||||
posthog.end(self, exit_reason="finished_by_tool")
|
||||
|
||||
def log_agent_creation(
|
||||
self, agent_id: str, name: str, task: str, parent_id: str | None = None
|
||||
self,
|
||||
agent_id: str,
|
||||
name: str,
|
||||
task: str,
|
||||
parent_id: str | None = None,
|
||||
) -> None:
|
||||
agent_data: dict[str, Any] = {
|
||||
"id": agent_id,
|
||||
@@ -196,6 +444,13 @@ class Tracer:
|
||||
}
|
||||
|
||||
self.agents[agent_id] = agent_data
|
||||
self._emit_event(
|
||||
"agent.created",
|
||||
actor={"agent_id": agent_id, "agent_name": name},
|
||||
payload={"task": task, "parent_id": parent_id},
|
||||
status="running",
|
||||
source="strix.agents",
|
||||
)
|
||||
|
||||
def log_chat_message(
|
||||
self,
|
||||
@@ -217,9 +472,21 @@ class Tracer:
|
||||
}
|
||||
|
||||
self.chat_messages.append(message_data)
|
||||
self._emit_event(
|
||||
"chat.message",
|
||||
actor={"agent_id": agent_id, "role": role},
|
||||
payload={"message_id": message_id, "content": content, "metadata": metadata or {}},
|
||||
status="logged",
|
||||
source="strix.chat",
|
||||
)
|
||||
return message_id
|
||||
|
||||
def log_tool_execution_start(self, agent_id: str, tool_name: str, args: dict[str, Any]) -> int:
|
||||
def log_tool_execution_start(
|
||||
self,
|
||||
agent_id: str,
|
||||
tool_name: str,
|
||||
args: dict[str, Any],
|
||||
) -> int:
|
||||
execution_id = self._next_execution_id
|
||||
self._next_execution_id += 1
|
||||
|
||||
@@ -241,18 +508,67 @@ class Tracer:
|
||||
if agent_id in self.agents:
|
||||
self.agents[agent_id]["tool_executions"].append(execution_id)
|
||||
|
||||
self._emit_event(
|
||||
"tool.execution.started",
|
||||
actor={
|
||||
"agent_id": agent_id,
|
||||
"tool_name": tool_name,
|
||||
"execution_id": execution_id,
|
||||
},
|
||||
payload={"args": args},
|
||||
status="running",
|
||||
source="strix.tools",
|
||||
)
|
||||
|
||||
return execution_id
|
||||
|
||||
def update_tool_execution(
|
||||
self, execution_id: int, status: str, result: Any | None = None
|
||||
self,
|
||||
execution_id: int,
|
||||
status: str,
|
||||
result: Any | None = None,
|
||||
) -> None:
|
||||
if execution_id in self.tool_executions:
|
||||
self.tool_executions[execution_id]["status"] = status
|
||||
self.tool_executions[execution_id]["result"] = result
|
||||
self.tool_executions[execution_id]["completed_at"] = datetime.now(UTC).isoformat()
|
||||
if execution_id not in self.tool_executions:
|
||||
return
|
||||
|
||||
tool_data = self.tool_executions[execution_id]
|
||||
tool_data["status"] = status
|
||||
tool_data["result"] = result
|
||||
tool_data["completed_at"] = datetime.now(UTC).isoformat()
|
||||
|
||||
tool_name = str(tool_data.get("tool_name", "unknown"))
|
||||
agent_id = str(tool_data.get("agent_id", "unknown"))
|
||||
error_payload = result if status in {"error", "failed"} else None
|
||||
|
||||
self._emit_event(
|
||||
"tool.execution.updated",
|
||||
actor={
|
||||
"agent_id": agent_id,
|
||||
"tool_name": tool_name,
|
||||
"execution_id": execution_id,
|
||||
},
|
||||
payload={"result": result},
|
||||
status=status,
|
||||
error=error_payload,
|
||||
source="strix.tools",
|
||||
)
|
||||
|
||||
if tool_name == "create_vulnerability_report":
|
||||
finding_status = "reviewed" if status == "completed" else "rejected"
|
||||
self._emit_event(
|
||||
"finding.reviewed",
|
||||
actor={"agent_id": agent_id, "tool_name": tool_name},
|
||||
payload={"execution_id": execution_id, "result": result},
|
||||
status=finding_status,
|
||||
error=error_payload,
|
||||
source="strix.findings",
|
||||
)
|
||||
|
||||
def update_agent_status(
|
||||
self, agent_id: str, status: str, error_message: str | None = None
|
||||
self,
|
||||
agent_id: str,
|
||||
status: str,
|
||||
error_message: str | None = None,
|
||||
) -> None:
|
||||
if agent_id in self.agents:
|
||||
self.agents[agent_id]["status"] = status
|
||||
@@ -260,6 +576,15 @@ class Tracer:
|
||||
if error_message:
|
||||
self.agents[agent_id]["error_message"] = error_message
|
||||
|
||||
self._emit_event(
|
||||
"agent.status.updated",
|
||||
actor={"agent_id": agent_id},
|
||||
payload={"error_message": error_message},
|
||||
status=status,
|
||||
error=error_message,
|
||||
source="strix.agents",
|
||||
)
|
||||
|
||||
def set_scan_config(self, config: dict[str, Any]) -> None:
|
||||
self.scan_config = config
|
||||
self.run_metadata.update(
|
||||
@@ -269,13 +594,29 @@ class Tracer:
|
||||
"max_iterations": config.get("max_iterations", 200),
|
||||
}
|
||||
)
|
||||
self.get_run_dir()
|
||||
self._set_association_properties(
|
||||
{
|
||||
"run_id": self.run_id,
|
||||
"run_name": self.run_name or "",
|
||||
"targets": config.get("targets", []),
|
||||
"max_iterations": config.get("max_iterations", 200),
|
||||
}
|
||||
)
|
||||
self._emit_event(
|
||||
"run.configured",
|
||||
payload={"scan_config": config},
|
||||
status="configured",
|
||||
source="strix.run",
|
||||
)
|
||||
|
||||
def save_run_data(self, mark_complete: bool = False) -> None: # noqa: PLR0912, PLR0915
|
||||
def save_run_data(self, mark_complete: bool = False) -> None:
|
||||
try:
|
||||
run_dir = self.get_run_dir()
|
||||
if mark_complete:
|
||||
self.end_time = datetime.now(UTC).isoformat()
|
||||
if self.end_time is None:
|
||||
self.end_time = datetime.now(UTC).isoformat()
|
||||
self.run_metadata["end_time"] = self.end_time
|
||||
self.run_metadata["status"] = "completed"
|
||||
|
||||
if self.final_scan_result:
|
||||
penetration_test_report_file = run_dir / "penetration_test_report.md"
|
||||
@@ -286,7 +627,8 @@ class Tracer:
|
||||
)
|
||||
f.write(f"{self.final_scan_result}\n")
|
||||
logger.info(
|
||||
f"Saved final penetration test report to: {penetration_test_report_file}"
|
||||
"Saved final penetration test report to: %s",
|
||||
penetration_test_report_file,
|
||||
)
|
||||
|
||||
if self.vulnerability_reports:
|
||||
@@ -302,7 +644,10 @@ class Tracer:
|
||||
severity_order = {"critical": 0, "high": 1, "medium": 2, "low": 3, "info": 4}
|
||||
sorted_reports = sorted(
|
||||
self.vulnerability_reports,
|
||||
key=lambda x: (severity_order.get(x["severity"], 5), x["timestamp"]),
|
||||
key=lambda report: (
|
||||
severity_order.get(report["severity"], 5),
|
||||
report["timestamp"],
|
||||
),
|
||||
)
|
||||
|
||||
for report in new_reports:
|
||||
@@ -329,8 +674,8 @@ class Tracer:
|
||||
f.write(f"**{label}:** {value}\n")
|
||||
|
||||
f.write("\n## Description\n\n")
|
||||
desc = report.get("description") or "No description provided."
|
||||
f.write(f"{desc}\n\n")
|
||||
description = report.get("description") or "No description provided."
|
||||
f.write(f"{description}\n\n")
|
||||
|
||||
if report.get("impact"):
|
||||
f.write("## Impact\n\n")
|
||||
@@ -404,11 +749,25 @@ class Tracer:
|
||||
|
||||
if new_reports:
|
||||
logger.info(
|
||||
f"Saved {len(new_reports)} new vulnerability report(s) to: {vuln_dir}"
|
||||
"Saved %d new vulnerability report(s) to: %s",
|
||||
len(new_reports),
|
||||
vuln_dir,
|
||||
)
|
||||
logger.info(f"Updated vulnerability index: {vuln_csv_file}")
|
||||
logger.info("Updated vulnerability index: %s", vuln_csv_file)
|
||||
|
||||
logger.info(f"📊 Essential scan data saved to: {run_dir}")
|
||||
logger.info("📊 Essential scan data saved to: %s", run_dir)
|
||||
if mark_complete and not self._run_completed_emitted:
|
||||
self._emit_event(
|
||||
"run.completed",
|
||||
payload={
|
||||
"duration_seconds": self._calculate_duration(),
|
||||
"vulnerability_count": len(self.vulnerability_reports),
|
||||
},
|
||||
status="completed",
|
||||
source="strix.run",
|
||||
include_run_metadata=True,
|
||||
)
|
||||
self._run_completed_emitted = True
|
||||
|
||||
except (OSError, RuntimeError):
|
||||
logger.exception("Failed to save scan data")
|
||||
|
||||
413
strix/telemetry/utils.py
Normal file
413
strix/telemetry/utils.py
Normal file
@@ -0,0 +1,413 @@
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import threading
|
||||
from collections.abc import Callable, Sequence
|
||||
from datetime import UTC, datetime
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from opentelemetry import trace
|
||||
from opentelemetry.sdk.trace import ReadableSpan, TracerProvider
|
||||
from opentelemetry.sdk.trace.export import (
|
||||
BatchSpanProcessor,
|
||||
SimpleSpanProcessor,
|
||||
SpanExporter,
|
||||
SpanExportResult,
|
||||
)
|
||||
from scrubadub import Scrubber
|
||||
from scrubadub.detectors import RegexDetector
|
||||
from scrubadub.filth import Filth
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_REDACTED = "[REDACTED]"
|
||||
_SCREENSHOT_OMITTED = "[SCREENSHOT_OMITTED]"
|
||||
_SCREENSHOT_KEY_PATTERN = re.compile(r"screenshot", re.IGNORECASE)
|
||||
_SENSITIVE_KEY_PATTERN = re.compile(
|
||||
r"(api[_-]?key|token|secret|password|authorization|cookie|session|credential|private[_-]?key)",
|
||||
re.IGNORECASE,
|
||||
)
|
||||
_SENSITIVE_TOKEN_PATTERN = re.compile(
|
||||
r"(?i)\b("
|
||||
r"bearer\s+[a-z0-9._-]+|"
|
||||
r"sk-[a-z0-9_-]{8,}|"
|
||||
r"gh[pousr]_[a-z0-9_-]{12,}|"
|
||||
r"xox[baprs]-[a-z0-9-]{12,}"
|
||||
r")\b"
|
||||
)
|
||||
_SCRUBADUB_PLACEHOLDER_PATTERN = re.compile(r"\{\{[^}]+\}\}")
|
||||
_EVENTS_FILE_LOCKS_LOCK = threading.Lock()
|
||||
_EVENTS_FILE_LOCKS: dict[str, threading.Lock] = {}
|
||||
_NOISY_OTEL_CONTENT_PREFIXES = (
|
||||
"gen_ai.prompt.",
|
||||
"gen_ai.completion.",
|
||||
"llm.input_messages.",
|
||||
"llm.output_messages.",
|
||||
)
|
||||
_NOISY_OTEL_EXACT_KEYS = {
|
||||
"llm.input",
|
||||
"llm.output",
|
||||
"llm.prompt",
|
||||
"llm.completion",
|
||||
}
|
||||
|
||||
|
||||
class _SecretFilth(Filth): # type: ignore[misc]
|
||||
type = "secret"
|
||||
|
||||
|
||||
class _SecretTokenDetector(RegexDetector): # type: ignore[misc]
|
||||
name = "strix_secret_token_detector"
|
||||
filth_cls = _SecretFilth
|
||||
regex = _SENSITIVE_TOKEN_PATTERN
|
||||
|
||||
|
||||
class TelemetrySanitizer:
|
||||
def __init__(self) -> None:
|
||||
self._scrubber = Scrubber(detector_list=[_SecretTokenDetector])
|
||||
|
||||
def sanitize(self, data: Any, key_hint: str | None = None) -> Any: # noqa: PLR0911
|
||||
if data is None:
|
||||
return None
|
||||
|
||||
if isinstance(data, dict):
|
||||
sanitized: dict[str, Any] = {}
|
||||
for key, value in data.items():
|
||||
key_str = str(key)
|
||||
if _SCREENSHOT_KEY_PATTERN.search(key_str):
|
||||
sanitized[key_str] = _SCREENSHOT_OMITTED
|
||||
elif _SENSITIVE_KEY_PATTERN.search(key_str):
|
||||
sanitized[key_str] = _REDACTED
|
||||
else:
|
||||
sanitized[key_str] = self.sanitize(value, key_hint=key_str)
|
||||
return sanitized
|
||||
|
||||
if isinstance(data, list):
|
||||
return [self.sanitize(item, key_hint=key_hint) for item in data]
|
||||
|
||||
if isinstance(data, tuple):
|
||||
return [self.sanitize(item, key_hint=key_hint) for item in data]
|
||||
|
||||
if isinstance(data, str):
|
||||
if key_hint and _SENSITIVE_KEY_PATTERN.search(key_hint):
|
||||
return _REDACTED
|
||||
|
||||
cleaned = self._scrubber.clean(data)
|
||||
return _SCRUBADUB_PLACEHOLDER_PATTERN.sub(_REDACTED, cleaned)
|
||||
|
||||
if isinstance(data, int | float | bool):
|
||||
return data
|
||||
|
||||
return str(data)
|
||||
|
||||
|
||||
def format_trace_id(trace_id: int | None) -> str | None:
|
||||
if trace_id is None or trace_id == 0:
|
||||
return None
|
||||
return f"{trace_id:032x}"
|
||||
|
||||
|
||||
def format_span_id(span_id: int | None) -> str | None:
|
||||
if span_id is None or span_id == 0:
|
||||
return None
|
||||
return f"{span_id:016x}"
|
||||
|
||||
|
||||
def iso_from_unix_ns(unix_ns: int | None) -> str | None:
|
||||
if unix_ns is None:
|
||||
return None
|
||||
try:
|
||||
return datetime.fromtimestamp(unix_ns / 1_000_000_000, tz=UTC).isoformat()
|
||||
except (OSError, OverflowError, ValueError):
|
||||
return None
|
||||
|
||||
|
||||
|
||||
def get_events_write_lock(output_path: Path) -> threading.Lock:
|
||||
path_key = str(output_path.resolve(strict=False))
|
||||
with _EVENTS_FILE_LOCKS_LOCK:
|
||||
lock = _EVENTS_FILE_LOCKS.get(path_key)
|
||||
if lock is None:
|
||||
lock = threading.Lock()
|
||||
_EVENTS_FILE_LOCKS[path_key] = lock
|
||||
return lock
|
||||
|
||||
|
||||
def reset_events_write_locks() -> None:
|
||||
with _EVENTS_FILE_LOCKS_LOCK:
|
||||
_EVENTS_FILE_LOCKS.clear()
|
||||
|
||||
|
||||
def append_jsonl_record(output_path: Path, record: dict[str, Any]) -> None:
|
||||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
with get_events_write_lock(output_path), output_path.open("a", encoding="utf-8") as f:
|
||||
f.write(json.dumps(record, ensure_ascii=False) + "\n")
|
||||
|
||||
|
||||
def default_resource_attributes() -> dict[str, str]:
|
||||
return {
|
||||
"service.name": "strix-agent",
|
||||
"service.namespace": "strix",
|
||||
}
|
||||
|
||||
|
||||
def parse_traceloop_headers(raw_headers: str) -> dict[str, str]:
|
||||
headers = raw_headers.strip()
|
||||
if not headers:
|
||||
return {}
|
||||
|
||||
if headers.startswith("{"):
|
||||
try:
|
||||
parsed = json.loads(headers)
|
||||
except json.JSONDecodeError:
|
||||
logger.warning("Invalid TRACELOOP_HEADERS JSON, ignoring custom headers")
|
||||
return {}
|
||||
if isinstance(parsed, dict):
|
||||
return {str(key): str(value) for key, value in parsed.items() if value is not None}
|
||||
logger.warning("TRACELOOP_HEADERS JSON must be an object, ignoring custom headers")
|
||||
return {}
|
||||
|
||||
result: dict[str, str] = {}
|
||||
for part in headers.split(","):
|
||||
key, sep, value = part.partition("=")
|
||||
if not sep:
|
||||
continue
|
||||
key = key.strip()
|
||||
value = value.strip()
|
||||
if key and value:
|
||||
result[key] = value
|
||||
return result
|
||||
|
||||
|
||||
def prune_otel_span_attributes(attributes: dict[str, Any]) -> dict[str, Any]:
|
||||
"""Drop high-volume LLM payload attributes to keep JSONL event files compact."""
|
||||
filtered: dict[str, Any] = {}
|
||||
filtered_count = 0
|
||||
|
||||
for key, value in attributes.items():
|
||||
key_str = str(key)
|
||||
if key_str in _NOISY_OTEL_EXACT_KEYS:
|
||||
filtered_count += 1
|
||||
continue
|
||||
|
||||
if key_str.endswith(".content") and key_str.startswith(_NOISY_OTEL_CONTENT_PREFIXES):
|
||||
filtered_count += 1
|
||||
continue
|
||||
|
||||
filtered[key_str] = value
|
||||
|
||||
if filtered_count:
|
||||
filtered["strix.filtered_attributes_count"] = filtered_count
|
||||
|
||||
return filtered
|
||||
|
||||
|
||||
class JsonlSpanExporter(SpanExporter): # type: ignore[misc]
|
||||
"""Append OTEL spans to JSONL for local run artifacts."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
output_path_getter: Callable[[], Path],
|
||||
run_metadata_getter: Callable[[], dict[str, Any]],
|
||||
sanitizer: Callable[[Any], Any],
|
||||
write_lock_getter: Callable[[Path], threading.Lock],
|
||||
):
|
||||
self._output_path_getter = output_path_getter
|
||||
self._run_metadata_getter = run_metadata_getter
|
||||
self._sanitize = sanitizer
|
||||
self._write_lock_getter = write_lock_getter
|
||||
|
||||
def export(self, spans: Sequence[ReadableSpan]) -> SpanExportResult:
|
||||
records: list[dict[str, Any]] = []
|
||||
for span in spans:
|
||||
attributes = prune_otel_span_attributes(dict(span.attributes or {}))
|
||||
if "strix.event_type" in attributes:
|
||||
# Tracer events are written directly in Tracer._emit_event.
|
||||
continue
|
||||
records.append(self._span_to_record(span, attributes))
|
||||
|
||||
if not records:
|
||||
return SpanExportResult.SUCCESS
|
||||
|
||||
try:
|
||||
output_path = self._output_path_getter()
|
||||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
with self._write_lock_getter(output_path), output_path.open("a", encoding="utf-8") as f:
|
||||
for record in records:
|
||||
f.write(json.dumps(record, ensure_ascii=False) + "\n")
|
||||
except OSError:
|
||||
logger.exception("Failed to write OTEL span records to JSONL")
|
||||
return SpanExportResult.FAILURE
|
||||
|
||||
return SpanExportResult.SUCCESS
|
||||
|
||||
def shutdown(self) -> None:
|
||||
return None
|
||||
|
||||
def force_flush(self, timeout_millis: int = 30_000) -> bool: # noqa: ARG002
|
||||
return True
|
||||
|
||||
def _span_to_record(
|
||||
self,
|
||||
span: ReadableSpan,
|
||||
attributes: dict[str, Any],
|
||||
) -> dict[str, Any]:
|
||||
span_context = span.get_span_context()
|
||||
parent_context = span.parent
|
||||
|
||||
status = None
|
||||
if span.status and span.status.status_code:
|
||||
status = span.status.status_code.name.lower()
|
||||
|
||||
event_type = str(attributes.get("gen_ai.operation.name", span.name))
|
||||
run_metadata = self._run_metadata_getter()
|
||||
run_id_attr = (
|
||||
attributes.get("strix.run_id")
|
||||
or attributes.get("strix_run_id")
|
||||
or run_metadata.get("run_id")
|
||||
or span.resource.attributes.get("strix.run_id")
|
||||
)
|
||||
|
||||
record: dict[str, Any] = {
|
||||
"timestamp": iso_from_unix_ns(span.end_time) or datetime.now(UTC).isoformat(),
|
||||
"event_type": event_type,
|
||||
"run_id": str(run_id_attr or run_metadata.get("run_id") or ""),
|
||||
"trace_id": format_trace_id(span_context.trace_id),
|
||||
"span_id": format_span_id(span_context.span_id),
|
||||
"parent_span_id": format_span_id(parent_context.span_id if parent_context else None),
|
||||
"actor": None,
|
||||
"payload": None,
|
||||
"status": status,
|
||||
"error": None,
|
||||
"source": "otel.span",
|
||||
"span_name": span.name,
|
||||
"span_kind": span.kind.name.lower(),
|
||||
"attributes": self._sanitize(attributes),
|
||||
}
|
||||
|
||||
if span.events:
|
||||
record["otel_events"] = self._sanitize(
|
||||
[
|
||||
{
|
||||
"name": event.name,
|
||||
"timestamp": iso_from_unix_ns(event.timestamp),
|
||||
"attributes": dict(event.attributes or {}),
|
||||
}
|
||||
for event in span.events
|
||||
]
|
||||
)
|
||||
|
||||
return record
|
||||
|
||||
|
||||
def bootstrap_otel(
|
||||
*,
|
||||
bootstrapped: bool,
|
||||
remote_enabled_state: bool,
|
||||
bootstrap_lock: threading.Lock,
|
||||
traceloop: Any,
|
||||
base_url: str,
|
||||
api_key: str,
|
||||
headers_raw: str,
|
||||
output_path_getter: Callable[[], Path],
|
||||
run_metadata_getter: Callable[[], dict[str, Any]],
|
||||
sanitizer: Callable[[Any], Any],
|
||||
write_lock_getter: Callable[[Path], threading.Lock],
|
||||
tracer_name: str = "strix.telemetry.tracer",
|
||||
) -> tuple[Any, bool, bool, bool]:
|
||||
with bootstrap_lock:
|
||||
if bootstrapped:
|
||||
return (
|
||||
trace.get_tracer(tracer_name),
|
||||
remote_enabled_state,
|
||||
bootstrapped,
|
||||
remote_enabled_state,
|
||||
)
|
||||
|
||||
local_exporter = JsonlSpanExporter(
|
||||
output_path_getter=output_path_getter,
|
||||
run_metadata_getter=run_metadata_getter,
|
||||
sanitizer=sanitizer,
|
||||
write_lock_getter=write_lock_getter,
|
||||
)
|
||||
local_processor = SimpleSpanProcessor(local_exporter)
|
||||
|
||||
headers = parse_traceloop_headers(headers_raw)
|
||||
remote_enabled = bool(base_url and api_key)
|
||||
otlp_headers = headers
|
||||
if remote_enabled:
|
||||
otlp_headers = {"Authorization": f"Bearer {api_key}"}
|
||||
otlp_headers.update(headers)
|
||||
|
||||
otel_init_ok = False
|
||||
if traceloop:
|
||||
try:
|
||||
from traceloop.sdk.instruments import Instruments
|
||||
|
||||
init_kwargs: dict[str, Any] = {
|
||||
"app_name": "strix-agent",
|
||||
"processor": local_processor,
|
||||
"telemetry_enabled": False,
|
||||
"resource_attributes": default_resource_attributes(),
|
||||
"block_instruments": {
|
||||
Instruments.URLLIB3,
|
||||
Instruments.REQUESTS,
|
||||
},
|
||||
}
|
||||
if remote_enabled:
|
||||
init_kwargs.update(
|
||||
{
|
||||
"api_endpoint": base_url,
|
||||
"api_key": api_key,
|
||||
"headers": headers,
|
||||
}
|
||||
)
|
||||
import io
|
||||
import sys
|
||||
|
||||
_stdout = sys.stdout
|
||||
sys.stdout = io.StringIO()
|
||||
try:
|
||||
traceloop.init(**init_kwargs)
|
||||
finally:
|
||||
sys.stdout = _stdout
|
||||
otel_init_ok = True
|
||||
except Exception:
|
||||
logger.exception("Failed to initialize Traceloop/OpenLLMetry")
|
||||
remote_enabled = False
|
||||
|
||||
if not otel_init_ok:
|
||||
from opentelemetry.sdk.resources import Resource
|
||||
|
||||
provider = TracerProvider(resource=Resource.create(default_resource_attributes()))
|
||||
provider.add_span_processor(local_processor)
|
||||
if remote_enabled:
|
||||
try:
|
||||
from opentelemetry.exporter.otlp.proto.http.trace_exporter import (
|
||||
OTLPSpanExporter,
|
||||
)
|
||||
|
||||
endpoint = base_url.rstrip("/") + "/v1/traces"
|
||||
provider.add_span_processor(
|
||||
BatchSpanProcessor(
|
||||
OTLPSpanExporter(endpoint=endpoint, headers=otlp_headers)
|
||||
)
|
||||
)
|
||||
except Exception:
|
||||
logger.exception("Failed to configure OTLP HTTP exporter")
|
||||
remote_enabled = False
|
||||
|
||||
try:
|
||||
trace.set_tracer_provider(provider)
|
||||
otel_init_ok = True
|
||||
except Exception:
|
||||
logger.exception("Failed to set OpenTelemetry tracer provider")
|
||||
remote_enabled = False
|
||||
|
||||
otel_tracer = trace.get_tracer(tracer_name)
|
||||
if otel_init_ok:
|
||||
return otel_tracer, remote_enabled, True, remote_enabled
|
||||
|
||||
return otel_tracer, remote_enabled, bootstrapped, remote_enabled_state
|
||||
Reference in New Issue
Block a user