Compare commits
1 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| | 40939859b9 | | |
4
package-lock.json
generated
4
package-lock.json
generated
@@ -1,12 +1,12 @@
|
||||
{
|
||||
"name": "@companion-ai/feynman",
|
||||
"version": "0.2.31",
|
||||
"version": "0.2.32",
|
||||
"lockfileVersion": 3,
|
||||
"requires": true,
|
||||
"packages": {
|
||||
"": {
|
||||
"name": "@companion-ai/feynman",
|
||||
"version": "0.2.31",
|
||||
"version": "0.2.32",
|
||||
"hasInstallScript": true,
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@companion-ai/feynman",
|
||||
"version": "0.2.31",
|
||||
"version": "0.2.32",
|
||||
"description": "Research-first CLI agent built on Pi and alphaXiv",
|
||||
"license": "MIT",
|
||||
"type": "module",
|
||||
|
||||
@@ -19,6 +19,8 @@ Analyze the research question using extended thinking. Develop a research strate
|
||||
- Source types and time periods that matter
|
||||
- Acceptance criteria: what evidence would make the answer "sufficient"
|
||||
|
||||
Make the scale decision before assigning owners in the plan. If the topic is a narrow "what is X" explainer, the plan must use lead-owned direct search tasks only; do not allocate researcher subagents in the task ledger.
|
||||
|
||||
Derive a short slug from the topic (lowercase, hyphens, no filler words, ≤5 words — e.g. "cloud-sandbox-pricing" not "deepresearch-plan"). Write the plan to `outputs/.plans/<slug>.md` as a self-contained artifact. Use this same slug for all artifacts in this run.
|
||||
If `CHANGELOG.md` exists, read the most recent relevant entries before finalizing the plan. Once the workflow becomes multi-round or spans enough work to merit resume support, append concise entries to `CHANGELOG.md` after meaningful progress and before stopping.
|
||||
|
||||
@@ -61,15 +63,19 @@ Do not stop after planning. If live search, subagents, web access, alphaXiv, or
|
||||
|
||||
| Query type | Execution |
|
||||
|---|---|
|
||||
| Single fact or narrow question | Search directly yourself, no subagents, 3-10 tool calls |
|
||||
| Single fact or narrow question, including "what is X" explainers | Search directly yourself, no subagents, 3-10 tool calls |
|
||||
| Direct comparison (2-3 items) | 2 parallel `researcher` subagents |
|
||||
| Broad survey or multi-faceted topic | 3-4 parallel `researcher` subagents |
|
||||
| Complex multi-domain research | 4-6 parallel `researcher` subagents |
|
||||
|
||||
Never spawn subagents for work you can do in 5 tool calls.
|
||||
For "what is X" explainer topics, you MUST NOT spawn researcher subagents unless the user explicitly asks for comprehensive coverage, current landscape, benchmarks, or production deployment.
|
||||
Do not inflate a simple explainer into a multi-agent survey.
|
||||
|
||||
## 3. Spawn researchers
|
||||
|
||||
Skip this section entirely when the scale decision chose direct search/no subagents. In that case, gather evidence yourself with search/fetch/paper tools, write notes directly to `<slug>-research-direct.md`, and continue to Section 4.
|
||||
|
||||
Launch parallel `researcher` subagents via `subagent`. Each gets a structured brief with:
|
||||
- **Objective:** what to find
|
||||
- **Output format:** numbered sources, evidence table, inline source references
|
||||
@@ -78,12 +84,16 @@ Launch parallel `researcher` subagents via `subagent`. Each gets a structured br
|
||||
- **Task IDs:** the specific ledger rows they own and must report back on
|
||||
|
||||
Assign each researcher a clearly disjoint dimension — different source types, geographic scopes, time periods, or technical angles. Never duplicate coverage.
|
||||
Keep `subagent` tool-call JSON small and valid. For detailed task instructions, write a per-researcher brief first, e.g. `outputs/.plans/<slug>-T1.md`, then pass a short task string that points to that brief and the required output file. Do not place multi-paragraph instructions inside the `subagent` JSON.
|
||||
Use only supported `subagent` keys. Do not add extra keys such as `artifacts` unless the tool schema explicitly exposes them.
|
||||
When using parallel researchers, always set `failFast: false` so one blocked researcher does not abort the whole workflow.
|
||||
Do not name exact tool commands in subagent tasks unless those tool names are visible in the current tool set. Prefer broad guidance such as "use paper search and web search"; if a PDF parser or paper fetch fails, the researcher must continue from metadata, abstracts, and web sources and mark PDF parsing as blocked.
|
||||
|
||||
```
|
||||
{
|
||||
tasks: [
|
||||
{ agent: "researcher", task: "...", output: "<slug>-research-web.md" },
|
||||
{ agent: "researcher", task: "...", output: "<slug>-research-papers.md" }
|
||||
{ agent: "researcher", task: "Read outputs/.plans/<slug>-T1.md and write <slug>-research-web.md.", output: "<slug>-research-web.md" },
|
||||
{ agent: "researcher", task: "Read outputs/.plans/<slug>-T2.md and write <slug>-research-papers.md.", output: "<slug>-research-papers.md" }
|
||||
],
|
||||
concurrency: 4,
|
||||
failFast: false
|
||||
@@ -150,25 +160,29 @@ Save this draft to `outputs/.drafts/<slug>-draft.md`.
|
||||
Spawn the `verifier` agent to post-process YOUR draft. The verifier agent adds inline citations, verifies every source URL, and produces the final output:
|
||||
|
||||
```
|
||||
{ agent: "verifier", task: "Add inline citations to <slug>-draft.md using the research files as source material. Verify every URL.", output: "<slug>-brief.md" }
|
||||
{ agent: "verifier", task: "Add inline citations to outputs/.drafts/<slug>-draft.md using the research files as source material. Verify every URL. Write the complete cited brief to outputs/.drafts/<slug>-cited.md.", output: "outputs/.drafts/<slug>-cited.md" }
|
||||
```
|
||||
|
||||
The verifier agent does not rewrite the report — it only anchors claims to sources and builds the numbered Sources section.
|
||||
This step is mandatory and must complete before any reviewer runs. Do not run the `verifier` and `reviewer` in the same parallel `subagent` call.
|
||||
After the verifier returns, verify on disk that `outputs/.drafts/<slug>-cited.md` exists. If the verifier wrote to a different path, find the cited file, move or copy it to `outputs/.drafts/<slug>-cited.md`, and use that path from this point forward.
|
||||
|
||||
## 7. Verify
|
||||
|
||||
Spawn the `reviewer` agent against the cited draft. The reviewer checks for:
|
||||
Only after `outputs/.drafts/<slug>-cited.md` exists, spawn the `reviewer` agent against that cited draft. The reviewer checks for:
|
||||
- Unsupported claims that slipped past citation
|
||||
- Logical gaps or contradictions between sections
|
||||
- Single-source claims on critical findings
|
||||
- Overstated confidence relative to evidence quality
|
||||
|
||||
```
|
||||
{ agent: "reviewer", task: "Verify <slug>-brief.md — flag any claims that lack sufficient source backing, identify logical gaps, and check that confidence levels match evidence strength. This is a verification pass, not a peer review.", output: "<slug>-verification.md" }
|
||||
{ agent: "reviewer", task: "Verify outputs/.drafts/<slug>-cited.md — flag any claims that lack sufficient source backing, identify logical gaps, and check that confidence levels match evidence strength. This is a verification pass, not a peer review.", output: "<slug>-verification.md" }
|
||||
```
|
||||
|
||||
If the reviewer flags FATAL issues, fix them in the brief before delivering. MAJOR issues get noted in the Open Questions section. MINOR issues are accepted.
|
||||
After fixes, run at least one more review-style verification pass if any FATAL issues were found. Do not assume one fix solved everything.
|
||||
When applying reviewer fixes, do not issue one giant `edit` tool call with many replacements. Use small localized edits only when there are 1-3 simple corrections. For section rewrites, table rewrites, or more than 3 substantive fixes, read the cited draft and write a corrected full file to `outputs/.drafts/<slug>-revised.md` instead. Then run the follow-up review against `outputs/.drafts/<slug>-revised.md`.
|
||||
The final candidate is `outputs/.drafts/<slug>-revised.md` if it exists; otherwise it is `outputs/.drafts/<slug>-cited.md`.
|
||||
|
||||
## 8. Deliver
|
||||
|
||||
@@ -196,11 +210,11 @@ Write a provenance record alongside it as `<slug>.provenance.md`:
|
||||
Before you stop, verify on disk that all of these exist:
|
||||
- `outputs/.plans/<slug>.md`
|
||||
- `outputs/.drafts/<slug>-draft.md`
|
||||
- `<slug>-brief.md` intermediate cited brief
|
||||
- `outputs/.drafts/<slug>-cited.md` intermediate cited brief
|
||||
- `outputs/<slug>.md` or `papers/<slug>.md` final promoted deliverable
|
||||
- `outputs/<slug>.provenance.md` or `papers/<slug>.provenance.md` provenance sidecar
|
||||
|
||||
Do not stop at `<slug>-brief.md` alone. If the cited brief exists but the promoted final output or provenance sidecar does not, create them before responding.
|
||||
Do not stop at the cited or revised draft alone. If the cited/revised brief exists but the promoted final output or provenance sidecar does not, create them before responding.
|
||||
If full verification could not be completed, still create the final deliverable and provenance sidecar with `Verification: BLOCKED` or `PASS WITH NOTES` and list the missing checks. Never end with only an explanation in chat.
|
||||
|
||||
## Background execution
|
||||
|
||||
@@ -238,6 +238,75 @@ export function patchPiSubagentsSource(relativePath, source) {
|
||||
"\t\t...(modelOverrides[index] ? { model: modelOverrides[index] } : {}),",
|
||||
].join("\n"),
|
||||
);
|
||||
patched = replaceAll(
|
||||
patched,
|
||||
[
|
||||
"\t\t\t\tcwd: t.cwd,",
|
||||
"\t\t\t\t...(modelOverrides[i] ? { model: modelOverrides[i] } : {}),",
|
||||
].join("\n"),
|
||||
[
|
||||
"\t\t\t\tcwd: t.cwd,",
|
||||
"\t\t\t\toutput: t.output,",
|
||||
"\t\t\t\t...(modelOverrides[i] ? { model: modelOverrides[i] } : {}),",
|
||||
].join("\n"),
|
||||
);
|
||||
patched = replaceAll(
|
||||
patched,
|
||||
[
|
||||
"\t\tcwd: t.cwd,",
|
||||
"\t\t...(modelOverrides[i] ? { model: modelOverrides[i] } : {}),",
|
||||
].join("\n"),
|
||||
[
|
||||
"\t\tcwd: t.cwd,",
|
||||
"\t\toutput: t.output,",
|
||||
"\t\t...(modelOverrides[i] ? { model: modelOverrides[i] } : {}),",
|
||||
].join("\n"),
|
||||
);
|
||||
patched = replaceAll(
|
||||
patched,
|
||||
[
|
||||
"\t\tconst behaviors = agentConfigs.map((c, i) =>",
|
||||
"\t\t\tresolveStepBehavior(c, { skills: skillOverrides[i] }),",
|
||||
"\t\t);",
|
||||
].join("\n"),
|
||||
[
|
||||
"\t\tconst behaviors = agentConfigs.map((c, i) =>",
|
||||
"\t\t\tresolveStepBehavior(c, { output: tasks[i]?.output, skills: skillOverrides[i] }),",
|
||||
"\t\t);",
|
||||
].join("\n"),
|
||||
);
|
||||
patched = replaceAll(
|
||||
patched,
|
||||
"\tconst behaviors = agentConfigs.map((config) => resolveStepBehavior(config, {}));",
|
||||
"\tconst behaviors = agentConfigs.map((config, i) => resolveStepBehavior(config, { output: tasks[i]?.output, skills: skillOverrides[i] }));",
|
||||
);
|
||||
patched = replaceAll(
|
||||
patched,
|
||||
[
|
||||
"\t\tconst taskCwd = resolveParallelTaskCwd(task, input.paramsCwd, input.worktreeSetup, index);",
|
||||
"\t\treturn runSync(input.ctx.cwd, input.agents, task.agent, input.taskTexts[index]!, {",
|
||||
].join("\n"),
|
||||
[
|
||||
"\t\tconst taskCwd = resolveParallelTaskCwd(task, input.paramsCwd, input.worktreeSetup, index);",
|
||||
"\t\tconst outputPath = typeof input.behaviors[index]?.output === \"string\"",
|
||||
"\t\t\t? resolveSingleOutputPath(input.behaviors[index]?.output, input.ctx.cwd, taskCwd)",
|
||||
"\t\t\t: undefined;",
|
||||
"\t\tconst taskText = injectSingleOutputInstruction(input.taskTexts[index]!, outputPath);",
|
||||
"\t\treturn runSync(input.ctx.cwd, input.agents, task.agent, taskText, {",
|
||||
].join("\n"),
|
||||
);
|
||||
patched = replaceAll(
|
||||
patched,
|
||||
[
|
||||
"\t\t\tmaxOutput: input.maxOutput,",
|
||||
"\t\t\tmaxSubagentDepth: input.maxSubagentDepths[index],",
|
||||
].join("\n"),
|
||||
[
|
||||
"\t\t\tmaxOutput: input.maxOutput,",
|
||||
"\t\t\toutputPath,",
|
||||
"\t\t\tmaxSubagentDepth: input.maxSubagentDepths[index],",
|
||||
].join("\n"),
|
||||
);
|
||||
break;
|
||||
case "schemas.ts":
|
||||
patched = replaceAll(
|
||||
|
||||
@@ -147,8 +147,8 @@ function prepareWorkspace(packageSpecs) {
|
||||
const result = spawnSync(
|
||||
process.env.npm_execpath ? process.execPath : "npm",
|
||||
process.env.npm_execpath
|
||||
? [process.env.npm_execpath, "install", "--prefer-offline", "--no-audit", "--no-fund", "--no-dry-run", "--legacy-peer-deps", "--loglevel", "error", "--prefix", workspaceDir, ...packageSpecs]
|
||||
: ["install", "--prefer-offline", "--no-audit", "--no-fund", "--no-dry-run", "--legacy-peer-deps", "--loglevel", "error", "--prefix", workspaceDir, ...packageSpecs],
|
||||
? [process.env.npm_execpath, "install", "--prefer-online", "--no-audit", "--no-fund", "--no-dry-run", "--legacy-peer-deps", "--loglevel", "error", "--prefix", workspaceDir, ...packageSpecs]
|
||||
: ["install", "--prefer-online", "--no-audit", "--no-fund", "--no-dry-run", "--legacy-peer-deps", "--loglevel", "error", "--prefix", workspaceDir, ...packageSpecs],
|
||||
{ stdio: "inherit", env: childNpmInstallEnv() },
|
||||
);
|
||||
if (result.status !== 0) {
|
||||
|
||||
@@ -72,6 +72,35 @@ test("deepresearch workflow requires durable artifacts even when blocked", () =>
|
||||
assert.match(deepResearchPrompt, /Never end with only an explanation in chat/i);
|
||||
});
|
||||
|
||||
test("deepresearch citation and review stages are sequential and avoid giant edits", () => {
|
||||
const deepResearchPrompt = readFileSync(join(repoRoot, "prompts", "deepresearch.md"), "utf8");
|
||||
|
||||
assert.match(deepResearchPrompt, /must complete before any reviewer runs/i);
|
||||
assert.match(deepResearchPrompt, /Do not run the `verifier` and `reviewer` in the same parallel `subagent` call/i);
|
||||
assert.match(deepResearchPrompt, /outputs\/\.drafts\/<slug>-cited\.md/i);
|
||||
assert.match(deepResearchPrompt, /do not issue one giant `edit` tool call/i);
|
||||
assert.match(deepResearchPrompt, /outputs\/\.drafts\/<slug>-revised\.md/i);
|
||||
assert.match(deepResearchPrompt, /The final candidate is `outputs\/\.drafts\/<slug>-revised\.md` if it exists/i);
|
||||
});
|
||||
|
||||
test("deepresearch keeps subagent tool calls small and skips subagents for narrow explainers", () => {
|
||||
const deepResearchPrompt = readFileSync(join(repoRoot, "prompts", "deepresearch.md"), "utf8");
|
||||
|
||||
assert.match(deepResearchPrompt, /including "what is X" explainers/i);
|
||||
assert.match(deepResearchPrompt, /Make the scale decision before assigning owners/i);
|
||||
assert.match(deepResearchPrompt, /lead-owned direct search tasks only/i);
|
||||
assert.match(deepResearchPrompt, /MUST NOT spawn researcher subagents/i);
|
||||
assert.match(deepResearchPrompt, /Do not inflate a simple explainer into a multi-agent survey/i);
|
||||
assert.match(deepResearchPrompt, /Skip this section entirely when the scale decision chose direct search\/no subagents/i);
|
||||
assert.match(deepResearchPrompt, /<slug>-research-direct\.md/i);
|
||||
assert.match(deepResearchPrompt, /Keep `subagent` tool-call JSON small and valid/i);
|
||||
assert.match(deepResearchPrompt, /write a per-researcher brief first/i);
|
||||
assert.match(deepResearchPrompt, /Do not place multi-paragraph instructions inside the `subagent` JSON/i);
|
||||
assert.match(deepResearchPrompt, /Do not add extra keys such as `artifacts`/i);
|
||||
assert.match(deepResearchPrompt, /always set `failFast: false`/i);
|
||||
assert.match(deepResearchPrompt, /if a PDF parser or paper fetch fails/i);
|
||||
});
|
||||
|
||||
test("workflow prompts do not introduce implicit confirmation gates", () => {
|
||||
const workflowPrompts = [
|
||||
"audit.md",
|
||||
|
||||
@@ -171,6 +171,71 @@ test("patchPiSubagentsSource preserves output on top-level parallel tasks", () =
|
||||
assert.doesNotMatch(patched, /resolvePiAgentDir/);
|
||||
});
|
||||
|
||||
test("patchPiSubagentsSource preserves output in async parallel task handoff", () => {
|
||||
const input = [
|
||||
"function run(tasks: TaskParam[]) {",
|
||||
"\tconst modelOverrides = tasks.map(() => undefined);",
|
||||
"\tconst skillOverrides = tasks.map(() => undefined);",
|
||||
"\tconst parallelTasks = tasks.map((t, i) => ({",
|
||||
"\t\tagent: t.agent,",
|
||||
"\t\ttask: params.context === \"fork\" ? wrapForkTask(taskTexts[i]!) : taskTexts[i]!,",
|
||||
"\t\tcwd: t.cwd,",
|
||||
"\t\t...(modelOverrides[i] ? { model: modelOverrides[i] } : {}),",
|
||||
"\t\t...(skillOverrides[i] !== undefined ? { skill: skillOverrides[i] } : {}),",
|
||||
"\t}));",
|
||||
"}",
|
||||
].join("\n");
|
||||
|
||||
const patched = patchPiSubagentsSource("subagent-executor.ts", input);
|
||||
|
||||
assert.match(patched, /\n\t\toutput: t\.output,/);
|
||||
});
|
||||
|
||||
test("patchPiSubagentsSource uses task output when resolving foreground parallel behavior", () => {
|
||||
const input = [
|
||||
"async function run(tasks: TaskParam[]) {",
|
||||
"\tconst skillOverrides = tasks.map((t) => normalizeSkillInput(t.skill));",
|
||||
"\tif (params.clarify === true && ctx.hasUI) {",
|
||||
"\t\tconst behaviors = agentConfigs.map((c, i) =>",
|
||||
"\t\t\tresolveStepBehavior(c, { skills: skillOverrides[i] }),",
|
||||
"\t\t);",
|
||||
"\t}",
|
||||
"\tconst behaviors = agentConfigs.map((config) => resolveStepBehavior(config, {}));",
|
||||
"}",
|
||||
].join("\n");
|
||||
|
||||
const patched = patchPiSubagentsSource("subagent-executor.ts", input);
|
||||
|
||||
assert.match(patched, /resolveStepBehavior\(c, \{ output: tasks\[i\]\?\.output, skills: skillOverrides\[i\] \}\)/);
|
||||
assert.match(patched, /resolveStepBehavior\(config, \{ output: tasks\[i\]\?\.output, skills: skillOverrides\[i\] \}\)/);
|
||||
assert.doesNotMatch(patched, /resolveStepBehavior\(config, \{\}\)/);
|
||||
});
|
||||
|
||||
test("patchPiSubagentsSource passes foreground parallel output paths into runSync", () => {
|
||||
const input = [
|
||||
"async function runForegroundParallelTasks(input: ForegroundParallelRunInput): Promise<SingleResult[]> {",
|
||||
"\treturn mapConcurrent(input.tasks, input.concurrencyLimit, async (task, index) => {",
|
||||
"\t\tconst overrideSkills = input.skillOverrides[index];",
|
||||
"\t\tconst effectiveSkills = overrideSkills === undefined ? input.behaviors[index]?.skills : overrideSkills;",
|
||||
"\t\tconst taskCwd = resolveParallelTaskCwd(task, input.paramsCwd, input.worktreeSetup, index);",
|
||||
"\t\treturn runSync(input.ctx.cwd, input.agents, task.agent, input.taskTexts[index]!, {",
|
||||
"\t\t\tcwd: taskCwd,",
|
||||
"\t\t\tsignal: input.signal,",
|
||||
"\t\t\tmaxOutput: input.maxOutput,",
|
||||
"\t\t\tmaxSubagentDepth: input.maxSubagentDepths[index],",
|
||||
"\t\t});",
|
||||
"\t});",
|
||||
"}",
|
||||
].join("\n");
|
||||
|
||||
const patched = patchPiSubagentsSource("subagent-executor.ts", input);
|
||||
|
||||
assert.match(patched, /const outputPath = typeof input\.behaviors\[index\]\?\.output === "string"/);
|
||||
assert.match(patched, /const taskText = injectSingleOutputInstruction\(input\.taskTexts\[index\]!, outputPath\)/);
|
||||
assert.match(patched, /runSync\(input\.ctx\.cwd, input\.agents, task\.agent, taskText, \{/);
|
||||
assert.match(patched, /\n\t\t\toutputPath,/);
|
||||
});
|
||||
|
||||
test("patchPiSubagentsSource documents output in top-level task schema", () => {
|
||||
const input = [
|
||||
"export const TaskItem = Type.Object({ ",
|
||||
|
||||
Reference in New Issue
Block a user