"use strict";
var __create = Object.create;
var __defProp = Object.defineProperty;
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
var __getOwnPropNames = Object.getOwnPropertyNames;
var __getProtoOf = Object.getPrototypeOf;
var __hasOwnProp = Object.prototype.hasOwnProperty;
var __export = (target, all) => {
for (var name in all)
__defProp(target, name, { get: all[name], enumerable: true });
};
var __copyProps = (to, from, except, desc) => {
if (from && typeof from === "object" || typeof from === "function") {
for (let key of __getOwnPropNames(from))
if (!__hasOwnProp.call(to, key) && key !== except)
__defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
}
return to;
};
var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
// If the importer is in node compatibility mode or this is not an ESM
// file that has been converted to a CommonJS file using a Babel-
// compatible transform (i.e. "__esModule" has not been set), then set
// "default" to the CommonJS "module.exports" for node compatibility.
isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
mod
));
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
var __async = (__this, __arguments, generator) => {
return new Promise((resolve, reject) => {
var fulfilled = (value) => {
try {
step(generator.next(value));
} catch (e) {
reject(e);
}
};
var rejected = (value) => {
try {
step(generator.throw(value));
} catch (e) {
reject(e);
}
};
var step = (x) => x.done ? resolve(x.value) : Promise.resolve(x.value).then(fulfilled, rejected);
step((generator = generator.apply(__this, __arguments)).next());
});
};
// src/index.ts
var src_exports = {};
__export(src_exports, {
ActionPlanner: () => ActionPlanner,
AnthropicPlanner: () => AnthropicPlanner,
BrowserAgent: () => BrowserAgent,
pauseForInput: () => pauseForInput
});
module.exports = __toCommonJS(src_exports);
// src/browser.ts
var import_selenium_webdriver2 = require("selenium-webdriver");
// src/util.ts
var import_selenium_webdriver = require("selenium-webdriver");
function pauseForInput() {
return __async(this, null, function* () {
console.log("Press any key to continue...");
yield new Promise((resolve) => {
process.stdin.setRawMode(true);
process.stdin.resume();
process.stdin.once("data", () => {
process.stdin.setRawMode(false);
process.stdin.pause();
resolve();
});
});
});
}
function parseXdotool(xdotoolCommand) {
const modifiers = [];
const keys = [];
const keyParts = xdotoolCommand.split("+");
for (const keyPart of keyParts) {
switch (keyPart.toLowerCase()) {
case "ctrl":
modifiers.push(import_selenium_webdriver.Key.CONTROL);
break;
case "alt":
modifiers.push(import_selenium_webdriver.Key.ALT);
break;
case "shift":
modifiers.push(import_selenium_webdriver.Key.SHIFT);
break;
case "super":
case "command":
case "meta":
modifiers.push(import_selenium_webdriver.Key.META);
break;
case "null":
keys.push(import_selenium_webdriver.Key.NULL);
break;
case "cancel":
keys.push(import_selenium_webdriver.Key.CANCEL);
break;
case "help":
keys.push(import_selenium_webdriver.Key.HELP);
break;
case "backspace":
case "back_space":
keys.push(import_selenium_webdriver.Key.BACK_SPACE);
break;
case "tab":
keys.push(import_selenium_webdriver.Key.TAB);
break;
case "clear":
keys.push(import_selenium_webdriver.Key.CLEAR);
break;
case "return":
case "enter":
keys.push(import_selenium_webdriver.Key.RETURN);
break;
case "pause":
keys.push(import_selenium_webdriver.Key.PAUSE);
break;
case "escape":
keys.push(import_selenium_webdriver.Key.ESCAPE);
break;
case "space":
keys.push(import_selenium_webdriver.Key.SPACE);
break;
case "pageup":
case "page_up":
keys.push(import_selenium_webdriver.Key.PAGE_UP);
break;
case "pagedown":
case "page_down":
keys.push(import_selenium_webdriver.Key.PAGE_DOWN);
break;
case "end":
keys.push(import_selenium_webdriver.Key.END);
break;
case "home":
keys.push(import_selenium_webdriver.Key.HOME);
break;
case "left":
case "arrowleft":
case "arrow_left":
keys.push(import_selenium_webdriver.Key.ARROW_LEFT);
break;
case "up":
case "arrowup":
case "arrow_up":
keys.push(import_selenium_webdriver.Key.ARROW_UP);
break;
case "right":
case "arrowright":
case "arrow_right":
keys.push(import_selenium_webdriver.Key.ARROW_RIGHT);
break;
case "down":
case "arrowdown":
case "arrow_down":
keys.push(import_selenium_webdriver.Key.ARROW_DOWN);
break;
case "insert":
keys.push(import_selenium_webdriver.Key.INSERT);
break;
case "delete":
keys.push(import_selenium_webdriver.Key.DELETE);
break;
case "semicolon":
keys.push(import_selenium_webdriver.Key.SEMICOLON);
break;
case "equals":
keys.push(import_selenium_webdriver.Key.EQUALS);
break;
case "kp_0":
keys.push(import_selenium_webdriver.Key.NUMPAD0);
break;
case "kp_1":
keys.push(import_selenium_webdriver.Key.NUMPAD1);
break;
case "kp_2":
keys.push(import_selenium_webdriver.Key.NUMPAD2);
break;
case "kp_3":
keys.push(import_selenium_webdriver.Key.NUMPAD3);
break;
case "kp_4":
keys.push(import_selenium_webdriver.Key.NUMPAD4);
break;
case "kp_5":
keys.push(import_selenium_webdriver.Key.NUMPAD5);
break;
case "kp_6":
keys.push(import_selenium_webdriver.Key.NUMPAD6);
break;
case "kp_7":
keys.push(import_selenium_webdriver.Key.NUMPAD7);
break;
case "kp_8":
keys.push(import_selenium_webdriver.Key.NUMPAD8);
break;
case "kp_9":
keys.push(import_selenium_webdriver.Key.NUMPAD9);
break;
case "multiply":
keys.push(import_selenium_webdriver.Key.MULTIPLY);
break;
case "add":
keys.push(import_selenium_webdriver.Key.ADD);
break;
case "separator":
keys.push(import_selenium_webdriver.Key.SEPARATOR);
break;
case "subtract":
keys.push(import_selenium_webdriver.Key.SUBTRACT);
break;
case "decimal":
keys.push(import_selenium_webdriver.Key.DECIMAL);
break;
case "divide":
keys.push(import_selenium_webdriver.Key.DIVIDE);
break;
case "f1":
keys.push(import_selenium_webdriver.Key.F1);
break;
case "f2":
keys.push(import_selenium_webdriver.Key.F2);
break;
case "f3":
keys.push(import_selenium_webdriver.Key.F3);
break;
case "f4":
keys.push(import_selenium_webdriver.Key.F4);
break;
case "f5":
keys.push(import_selenium_webdriver.Key.F5);
break;
case "f6":
keys.push(import_selenium_webdriver.Key.F6);
break;
case "f7":
keys.push(import_selenium_webdriver.Key.F7);
break;
case "f8":
keys.push(import_selenium_webdriver.Key.F8);
break;
case "f9":
keys.push(import_selenium_webdriver.Key.F9);
break;
case "f10":
keys.push(import_selenium_webdriver.Key.F10);
break;
case "f11":
keys.push(import_selenium_webdriver.Key.F11);
break;
case "f12":
keys.push(import_selenium_webdriver.Key.F12);
break;
default:
keys.push(keyPart);
}
}
return { modifiers, keys };
}
// src/browser.ts
var ActionPlanner = class {
};
var BrowserAgent = class {
constructor(driver, actionPlanner, goal, options) {
this.additionalContext = "None";
this.additionalInstructions = [];
this.waitAfterStepMS = 500;
this.pauseAfterEachAction = false;
this.maxSteps = 50;
this._status = "initial";
this.history = [];
this.tabs = {};
this.driver = driver;
this.planner = actionPlanner;
this.goal = goal;
if (options) {
if (options.additionalContext !== void 0) {
if (typeof options.additionalContext !== "string") {
this.additionalContext = JSON.stringify(options.additionalContext);
} else {
this.additionalContext = options.additionalContext;
}
}
if (options.additionalInstructions !== void 0) {
this.additionalInstructions = options.additionalInstructions;
}
if (options.waitAfterStepMS !== void 0) {
this.waitAfterStepMS = options.waitAfterStepMS;
}
if (options.pauseAfterEachAction !== void 0) {
this.pauseAfterEachAction = options.pauseAfterEachAction;
}
if (options.maxSteps !== void 0) {
this.maxSteps = options.maxSteps;
}
}
}
getState() {
return __async(this, null, function* () {
const size = yield this.driver.executeScript("return { x: window.innerWidth, y: window.innerHeight }");
const screenshot = yield this.driver.takeScreenshot();
const mousePosition = yield this.getMousePosition();
const scrollPosition = yield this.getScrollPosition();
const tabs = yield this.driver.getAllWindowHandles();
const currentTab = yield this.driver.getWindowHandle();
const browserTabs = [];
for (const tab of tabs) {
yield this.driver.switchTo().window(tab);
const tabUrl = yield this.driver.getCurrentUrl();
const tabTitle = yield this.driver.getTitle();
const isActive = tab === currentTab;
let tabId;
let isNew;
if (this.tabs[tab]) {
tabId = this.tabs[tab].id;
isNew = false;
} else {
tabId = Object.keys(this.tabs).length;
isNew = true;
}
const browserTab = {
handle: tab,
url: tabUrl,
title: tabTitle,
active: isActive,
new: isNew,
id: tabId
};
this.tabs[tab] = browserTab;
browserTabs.push(browserTab);
}
yield this.driver.switchTo().window(currentTab);
return {
screenshot,
height: size.y,
width: size.x,
scrollbar: scrollPosition,
tabs: browserTabs,
active_tab: currentTab,
mouse: mousePosition
};
});
}
getAction(currentState) {
return __async(this, null, function* () {
return yield this.planner.planAction(this.goal, this.additionalContext, this.additionalInstructions, currentState, this.history);
});
}
getScrollPosition() {
return __async(this, null, function* () {
const [offset, height] = yield this.driver.executeScript("return [window.pageYOffset/document.documentElement.scrollHeight , window.innerHeight/document.documentElement.scrollHeight]");
return {
height,
offset
};
});
}
getMousePosition() {
return __async(this, null, function* () {
const listenScript = `
window.last_mouse_x = 0;
window.last_mouse_y = 0;
window.addEventListener('mousemove', function onMouseMove(ev) {
window.last_mouse_x = ev.clientX;
window.last_mouse_y = ev.clientY;
window.removeEventListener('mousemove', onMouseMove);
});`;
yield this.driver.executeScript(listenScript);
yield this.driver.actions().move({ x: 3, y: 3, origin: import_selenium_webdriver2.Origin.POINTER }).perform();
yield this.driver.actions().move({ x: -3, y: -3, origin: import_selenium_webdriver2.Origin.POINTER }).perform();
yield new Promise((resolve) => setTimeout(resolve, 100));
const [x, y] = yield this.driver.executeScript("return [window.last_mouse_x, window.last_mouse_y]");
if (typeof x === "number" && typeof y === "number") {
return { x, y };
}
return { x: 0, y: 0 };
});
}
takeAction(action, lastState) {
return __async(this, null, function* () {
const actions = this.driver.actions({ async: true });
switch (action.action) {
case "key":
if (!action.text) throw new Error("Text is required for key action");
const parsedKeyStrokes = parseXdotool(action.text);
let keyAction = actions;
for (const modifier of parsedKeyStrokes.modifiers) {
keyAction = keyAction.keyDown(modifier);
}
for (const key of parsedKeyStrokes.keys) {
keyAction = keyAction.sendKeys(key);
}
for (const modifier of parsedKeyStrokes.modifiers.reverse()) {
keyAction = keyAction.keyUp(modifier);
}
yield keyAction.perform();
break;
case "type":
if (!action.text) throw new Error("Text is required for type action");
yield actions.sendKeys(action.text).perform();
break;
case "mouse_move":
if (!action.coordinate) throw new Error("Coordinate is required for mouse_move action");
yield actions.move({ x: action.coordinate[0], y: action.coordinate[1] }).perform();
break;
case "left_click":
yield actions.click().perform();
break;
case "left_click_drag":
if (!action.coordinate) throw new Error("Coordinate is required for left_click_drag action");
yield actions.press().move({ x: action.coordinate[0], y: action.coordinate[1] }).release().perform();
break;
case "right_click":
yield actions.contextClick().perform();
break;
case "middle_click":
console.log("Middle mouse click not supported");
break;
case "double_click":
yield actions.doubleClick().perform();
break;
case "screenshot":
case "cursor_position":
break;
case "scroll_down":
yield this.driver.executeScript(`window.scrollBy(0, ${lastState.height / 2})`);
break;
case "scroll_up":
yield this.driver.executeScript(`window.scrollBy(0, -${lastState.height / 2})`);
break;
case "switch_tab":
if (!action.text) throw new Error("Text is required for switch_tab action");
const targetId = parseInt(action.text);
const tabHandle = Object.keys(this.tabs).find((handle) => this.tabs[handle].id === targetId);
if (!tabHandle) throw new Error(`No tab found with id: ${action.text}`);
yield this.driver.switchTo().window(tabHandle);
break;
default:
throw new Error(`Unsupported action: ${action.action}`);
}
});
}
step() {
return __async(this, null, function* () {
const currentState = yield this.getState();
const nextAction = yield this.getAction(currentState);
if (nextAction.action === "success") {
this._status = "success";
return;
} else if (nextAction.action === "failure") {
this._status = "failed";
return;
} else {
this._status = "running";
yield this.takeAction(nextAction, currentState);
}
this.history.push({
state: currentState,
action: nextAction
});
});
}
start() {
return __async(this, null, function* () {
yield this.driver.actions().move({ x: 1, y: 1, origin: import_selenium_webdriver2.Origin.VIEWPORT }).perform();
while (["initial", "running"].includes(this._status) && this.history.length <= this.maxSteps) {
yield this.step();
yield this.driver.sleep(this.waitAfterStepMS);
if (this.pauseAfterEachAction) {
yield pauseForInput();
}
}
});
}
get status() {
return this._status;
}
};
// src/planners/anthropic.ts
var import_sdk = __toESM(require("@anthropic-ai/sdk"), 1);
var import_sharp = __toESM(require("sharp"), 1);
var import_fs = __toESM(require("fs"), 1);
var cursor64 = "iVBORw0KGgoAAAANSUhEUgAAAAoAAAAQCAYAAAAvf+5AAAAAw3pUWHRSYXcgcHJvZmlsZSB0eXBlIGV4aWYAAHjabVBRDsMgCP33FDuC8ijF49i1S3aDHX9YcLFLX+ITeOSJpOPzfqVHBxVOvKwqVSQbuHKlZoFmRzu5ZD55rvX8Uk9Dz2Ql2A1PVaJ/1MvPwK9m0TIZ6TOE7SpUDn/9M4qH0CciC/YwqmEEcqGEQYsvSNV1/sJ25CvUTxqBjzGJU86rbW9f7B0QHSjIxoD6AOiHE1oXjAlqjQVyxmTMkJjEFnK3p4H0BSRiWUv/cuYLAAABhWlDQ1BJQ0MgcHJvZmlsZQAAeJx9kT1Iw0AYht+2SqVUHCwo0iFD1cWCqIijVqEIFUKt0KqDyaV/0KQhSXFxFFwLDv4sVh1cnHV1cBUEwR8QZwcnRRcp8buk0CLGg7t7eO97X+6+A/yNClPNrnFA1SwjnUwI2dyqEHxFCFEM0DoqMVOfE8UUPMfXPXx8v4vzLO+6P0evkjcZ4BOIZ5luWMQbxNObls55nzjCSpJCfE48ZtAFiR+5Lrv8xrnosJ9nRoxMep44QiwUO1juYFYyVOIp4piiapTvz7qscN7irFZqrHVP/sJwXltZ5jrNKJJYxBJECJBRQxkVWIjTrpFiIk3nCQ//kOMXySWTqwxGjgVUoUJy/OB/8Lu3ZmFywk0KJ4DuF9v+GAaCu0Czbtvfx7bdPAECz8CV1vZXG8DMJ+n1thY7Avq2gYvrtibvAZc7wOCTLhmSIwVo+gsF4P2MvikH9N8CoTW3b61znD4AGepV6gY4OARGipS97vHuns6+/VvT6t8Ph1lyr0hzlCAAAA14aVRYdFhNTDpjb20uYWRvYmUueG1wAAAAAAA8P3hwYWNrZXQgYmVnaW49Iu+7vyIgaWQ9Ilc1TTBNcENlaGlIenJlU3pOVGN6a2M5ZCI/Pgo8eDp4bXBtZXRhIHhtbG5zOng9ImFkb2JlOm5zOm1ldGEvIiB4OnhtcHRrPSJYTVAgQ29yZSA0LjQuMC1FeGl2MiI+CiA8cmRmOlJERiB4bWxuczpyZGY9Imh0dHA6Ly93d3cudzMub3JnLzE5OTkvMDIvMjItcmRmLXN5bnRheC1ucyMiPgogIDxyZGY6RGVzY3JpcHRpb24gcmRmOmFib3V0PSIiCiAgICB4bWxuczp4bXBNTT0iaHR0cDovL25zLmFkb2JlLmNvbS94YXAvMS4wL21tLyIKICAgIHhtbG5zOnN0RXZ0PSJodHRwOi8vbnMuYWRvYmUuY29tL3hhcC8xLjAvc1R5cGUvUmVzb3VyY2VFdmVudCMiCiAgICB4bWxuczpkYz0iaHR0cDovL3B1cmwub3JnL2RjL2VsZW1lbnRzLzEuMS8iCiAgICB4bWxuczpHSU1QPSJodHRwOi8vd3d3LmdpbXAub3JnL3htcC8iCiAgICB4bWxuczp0aWZmPSJodHRwOi8vbnMuYWRvYmUuY29tL3RpZmYvMS4wLyIKICAgIHhtbG5zOnhtcD0iaHR0cDovL25zLmFkb2JlLmNvbS94YXAvMS4wLyIKICAgeG1wTU06RG9jdW1lbnRJRD0iZ2ltcDpkb2NpZDpnaW1wOjFiYzFkZjE3LWM5YmMtNGYzZi1hMmEzLTlmODkyNWNiZjY4OSIKICAgeG1wTU06SW5zdGFuY2VJRD0ieG1wLmlpZDo4YTUyMWJhMC00YmNlLTQzZWEtYjgyYS04ZGM2MTBjYmZlOTgiCiAgIHhtcE1NOk9yaWdpbmFsRG9jdW1lbnRJRD0ieG1wLmRpZDplODQ3ZjUxNC00MWVlLTQ2ZjYtOTllNC1kNjI3MjMxMjhlZTIiCiAgIGRjOkZvcm1hdD0iaW1hZ2UvcG5nIgogICBHSU1QOkFQST0iMi4wIgogICBHSU1QOlBsYXRmb3JtPSJMaW51eCIKICAgR0lNUDpUaW1lU3RhbXA9IjE3MzAxNTc3NjY5MTI3ODciCiAgIEdJTVA6VmVyc2lvbj0iMi4xMC4zOCIKICAgdGlmZjpPcmllbnRhdGlvbj0iMSIKICAgeG1wOkNyZWF0b3JUb29sPSJHSU1QIDIuMTAiCiAgIHhtcDpNZXRhZGF0YURhdGU9IjIwMjQ6MTA6MjhUMTY6MjI6NDYtMDc6MDAiCiAgIHhtcDpNb2RpZnlEYXRlPSIyMDI0OjEwOjI4VDE2OjIyOjQ2LTA3OjAwIj4KICAgPHhtcE1NOkhpc3Rvcnk+CiAgICA8cmRmOlNlcT4KICAgICA8cmRmOmxpCiAgICAgIHN0RXZ0OmFjdGlvbj0ic2F2ZWQiCiAgICAgIHN0RXZ0OmNoYW5nZWQ9Ii8iCiAgICAgIHN0RXZ0Omluc3RhbmNlSUQ9InhtcC5paWQ6ZTVjOTM2ZDYtYjMzYi00NzM4LTlhNWUtYjM3YTA5MzdjZDAxIgogICAgICBzdEV2dDpzb2Z0d2FyZUFnZW50PSJHaW1wIDIuMTAgKExpbnV4KSIKICAgICAgc3RFdnQ6d2hlbj0iMjAyNC0xMC0yOFQxNjoyMjo0Ni0wNzowMCIvPgogICAgPC9yZGY6U2VxPgogICA8L3htcE1NOkhpc3Rvcnk+CiAgPC9yZGY6RGVzY3JpcHRpb24+CiA8L3JkZjpSREY+CjwveDp4bXBtZXRhPgogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAKICAgICAgICAgICAgICAgICAgICAgICAgICAgCjw/eHBhY2tldCBlbmQ9InciPz5/5aQ8AAAABmJLR0QAcgByAAAtJLTuAAAACXBIWXMAAABZAAAAWQGqnamGAAAAB3RJTUUH6AocFxYuv5vOJAAAAHhJREFUKM+NzzEOQXEMB+DPYDY5iEVMIpzDfRxC3mZyBK7gChZnELGohaR58f7a7dd8bVq4YaVQgTvWFVjCUcXxA28qcBBHFUcVRwWPPuFfXVsbt0PPnLBL+dKHL+wxxhSPhBcZznuDXYKH1uGzBJ+YtPAZRyy/jTd7qEoydWUQ7QAAAABJRU5ErkJggg==";
var cursorBuffer = Buffer.from(cursor64, "base64");
var AnthropicPlanner = class extends ActionPlanner {
constructor(options) {
var _a, _b;
super();
this.screenshotHistory = 1;
this.mouseJitterReduction = 5;
this.inputTokenUsage = 0;
this.outputTokenUsage = 0;
this.debug = false;
if (options == null ? void 0 : options.client) {
this.client = options.client;
} else if (options == null ? void 0 : options.apiKey) {
this.client = new import_sdk.default({ apiKey: options.apiKey });
} else {
this.client = new import_sdk.default();
}
this.screenshotHistory = (_a = options == null ? void 0 : options.screenshotHistory) != null ? _a : this.screenshotHistory;
this.mouseJitterReduction = (_b = options == null ? void 0 : options.screenshotHistory) != null ? _b : this.mouseJitterReduction;
this.debugImagePath = options == null ? void 0 : options.debugImagePath;
}
formatSystemPrompt(goal, additionalContext, additionalInstructions) {
const prompt = `
* You are a computer use tool that is controlling a browser in fullscreen mode to complete a goal for the user. The goal is listed below in .
* The browser operates in fullscreen mode, meaning you cannot use standard browser UI elements like STOP, REFRESH, BACK, or the address bar. You must accomplish your task solely by interacting with the website's user interface or calling "switch_tab" or "stop_browsing"
* After each action, you will be provided with mouse position, open tabs, and a screenshot of the active browser tab.
* Use the Page_down or Page_up keys to scroll through the webpage. If the website is scrollable, a gray rectangle-shaped scrollbar will appear on the right edge of the screenshot. Ensure you have scrolled through the entire page before concluding that content is unavailable.
* The mouse cursor will appear as a black arrow in the screenshot. Use its position to confirm whether your mouse movement actions have been executed successfully. Ensure the cursor is correctly positioned over the intended UI element before executing a click command.
* After each action, you will receive information about open browser tabs. This information will be in the form of a list of JSON objects, each representing a browser tab with the following fields:
- "tab_id": An integer that identifies the tab within the browser. Use this ID to switch between tabs.
- "title": A string representing the title of the webpage loaded in the tab.
- "active_tab": A boolean indicating whether this tab is currently active. You will receive a screenshot of the active tab.
- "new_tab": A boolean indicating whether the tab was opened as a result of the last action.
* Follow all directions from the section below.
* The current date is ${(/* @__PURE__ */ new Date()).toISOString()}.
The user will ask you to perform a task and you should use their browser to do so. After each step, analyze the screenshot and carefully evaluate if you have achieved the right outcome. Explicitly show your thinking for EACH function call: "I have evaluated step X..." If not correct, try again. Only when you confirm a step was executed correctly should you move on to the next one. You should always call a tool! Always return a tool call. Remember call the stop_browsing tool when you have achieved the goal of the task. Use keyboard shortcuts to navigate whenever possible.
* After moving the mouse to the desired location, always perform a left-click to ensure the action is completed.
* You will use information provided in user's to fill out forms on the way to your goal.
* Ensure that any UI element is completely visible on the screen before attempting to interact with it.
${additionalInstructions.map((instruction) => `* ${instruction}`).join("\n")}
`;
return prompt.trim();
}
createToolUseId() {
const prefix = "toolu_01";
const characters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";
const idLength = 22;
let result = prefix;
for (let i = 0; i < idLength; i++) {
result += characters.charAt(Math.floor(Math.random() * characters.length));
}
return result;
}
getDimensions(screenshot) {
return __async(this, null, function* () {
const imgBuffer = Buffer.from(screenshot, "base64");
const sharpImage = (0, import_sharp.default)(imgBuffer);
const originalMeta = yield sharpImage.metadata();
return {
x: originalMeta.width,
y: originalMeta.height
};
});
}
markScreenshot(imgBuffer, mousePosition, scrollbar) {
return __async(this, null, function* () {
const sharpImage = (0, import_sharp.default)(imgBuffer);
const originalMeta = yield sharpImage.metadata();
const width = originalMeta.width;
const height = originalMeta.height;
if (width < 20 || height < 20) {
return imgBuffer;
}
const scrollbarWidth = Math.min(10, width);
const scrollbarHeight = Math.min(Math.floor(height * scrollbar.height), height);
const scrollbarTop = Math.floor(height * scrollbar.offset);
const scrollbarBuffer = yield (0, import_sharp.default)({
create: {
width: scrollbarWidth,
height: scrollbarHeight,
channels: 4,
background: { r: 128, g: 128, b: 128, alpha: 0.7 }
}
}).png().toBuffer();
const markedImage = yield sharpImage.composite([
{
input: scrollbarBuffer,
top: scrollbarTop,
left: width - scrollbarWidth
},
{
input: cursorBuffer,
top: Math.max(0, mousePosition.y),
left: Math.max(0, mousePosition.x)
}
]);
const outputBuffer = yield markedImage.toBuffer();
return outputBuffer;
});
}
resizeScreenshot(screenshotBuffer) {
return __async(this, null, function* () {
const sharpImage = (0, import_sharp.default)(screenshotBuffer);
const resizedImg = yield sharpImage.resize(1280, 800, { fit: "inside" });
const imgBuffer = yield resizedImg.toBuffer();
return imgBuffer;
});
}
resizeImageToDimensions(screenshotBuffer, newDim) {
return __async(this, null, function* () {
const sharpImage = (0, import_sharp.default)(screenshotBuffer);
const resizedImg = yield sharpImage.resize(newDim.x, newDim.y, { fit: "fill" });
const imgBuffer = yield resizedImg.toBuffer();
return imgBuffer;
});
}
getScalingRatio(origSize) {
const aspectRatio = origSize.x / origSize.y;
let newWidth;
let newHeight;
if (aspectRatio > 1280 / 800) {
newWidth = 1280;
newHeight = Math.round(1280 / aspectRatio);
} else {
newHeight = 800;
newWidth = Math.round(800 * aspectRatio);
}
const widthRatio = origSize.x / newWidth;
const heightRatio = origSize.y / newHeight;
return {
ratio: { x: widthRatio, y: heightRatio },
oldSize: { x: origSize.x, y: origSize.y },
newSize: { x: newWidth, y: newHeight }
};
}
browserToLLMCoordinates(inputCoords, scaling) {
return {
x: Math.min(Math.max(Math.floor(inputCoords.x / scaling.ratio.x), 1), scaling.newSize.x),
y: Math.min(Math.max(Math.floor(inputCoords.y / scaling.ratio.y), 1), scaling.newSize.y)
};
}
llmToBrowserCoordinates(inputCoords, scaling) {
return {
x: Math.min(Math.max(Math.floor(inputCoords.x * scaling.ratio.x), 1), scaling.oldSize.x),
y: Math.min(Math.max(Math.floor(inputCoords.y * scaling.ratio.y), 1), scaling.oldSize.y)
};
}
formatStateIntoMsg(toolCallId, currentState, options) {
return __async(this, null, function* () {
let resultText = "";
const contentSubMsg = [];
if (options.mousePosition) {
const imgDim = { x: currentState.width, y: currentState.height };
const scaling = this.getScalingRatio(imgDim);
const scaledCoord = this.browserToLLMCoordinates(currentState.mouse, scaling);
resultText += `Mouse location: ${JSON.stringify(scaledCoord)}
`;
}
if (options.tabs) {
const tabsAsDicts = currentState.tabs.map((tab) => ({
tab_id: tab.id,
title: tab.title,
active_tab: tab.active,
new_tab: tab.new
}));
resultText += `
Open Browser Tabs: ${JSON.stringify(tabsAsDicts)}
`;
}
if (options.screenshot) {
const imgBuffer = Buffer.from(currentState.screenshot, "base64");
const viewportImage = yield this.resizeImageToDimensions(imgBuffer, { x: currentState.width, y: currentState.height });
const markedImage = yield this.markScreenshot(viewportImage, currentState.mouse, currentState.scrollbar);
const resized = yield this.resizeScreenshot(markedImage);
if (this.debugImagePath) {
import_fs.default.writeFileSync(this.debugImagePath, resized, "base64");
}
contentSubMsg.push({
type: "image",
source: {
type: "base64",
media_type: "image/png",
data: resized.toString("base64")
}
});
}
if (!resultText) {
resultText = "Action was performed.";
}
contentSubMsg.unshift({
type: "text",
text: resultText.trim()
});
return {
role: "user",
content: [
{
type: "tool_result",
tool_use_id: toolCallId,
content: contentSubMsg
}
]
};
});
}
flattenBrowserStepToAction(step) {
if (step.action.action === "scroll_down") {
return {
action: "key",
text: "Page_Down"
};
}
if (step.action.action === "scroll_up") {
return {
action: "key",
text: "Page_Up"
};
}
const val = {
action: step.action.action
};
if (step.action.text) {
val.text = step.action.text;
}
if (step.action.coordinate) {
const imgDim = { x: step.state.width, y: step.state.height };
const scaling = this.getScalingRatio(imgDim);
const llmCoordinates = this.browserToLLMCoordinates(
{ x: step.action.coordinate[0], y: step.action.coordinate[1] },
scaling
);
val.coordinate = [llmCoordinates.x, llmCoordinates.y];
}
return val;
}
formatIntoMessages(goal, additionalContext, currentState, sessionHistory) {
return __async(this, null, function* () {
var _a;
const messages = [];
let toolId = this.createToolUseId();
const user_prompt = `Please complete the following task:
${goal}
Using the supporting contextual data:
${additionalContext}
`;
const msg0 = { role: "user", content: [{ type: "text", text: user_prompt.trim() }] };
const msg1 = {
role: "assistant",
content: [
{
type: "tool_use",
id: toolId,
name: "computer",
input: {
action: "screenshot",
reasoning: "Grab a view of the browser to understand what we are looking at."
}
}
]
};
messages.push(msg0);
messages.push(msg1);
for (let pastStepIdx = 0; pastStepIdx < sessionHistory.length; pastStepIdx++) {
const pastStep = sessionHistory[pastStepIdx];
const options = {
mousePosition: false,
screenshot: false,
tabs: true
};
if (pastStepIdx <= sessionHistory.length - this.screenshotHistory) {
options.tabs = false;
}
const resultMsg = yield this.formatStateIntoMsg(toolId, pastStep.state, options);
messages.push(resultMsg);
toolId = (_a = pastStep.action.id) != null ? _a : this.createToolUseId();
const actionMsg = {
role: "assistant",
content: [
{
type: "tool_use",
id: toolId,
name: "computer",
input: this.flattenBrowserStepToAction(pastStep)
}
]
};
messages.push(actionMsg);
}
const currentStateMessage = yield this.formatStateIntoMsg(toolId, currentState, { mousePosition: true, screenshot: true, tabs: true });
messages.push(currentStateMessage);
return messages;
});
}
parseAction(message, scaling, currentState) {
var _a, _b, _c, _d;
const reasoning = message.content.filter((content) => content.type === "text").map((content) => content.text).join(" ");
const lastMessage = message.content[message.content.length - 1];
if (typeof lastMessage === "string") {
return {
action: "failure",
reasoning: lastMessage,
id: this.createToolUseId()
};
}
if (lastMessage.type !== "tool_use") {
return {
action: "failure",
reasoning,
id: this.createToolUseId()
};
}
if (lastMessage.name === "stop_browsing") {
const input = lastMessage.input;
if (!input.success) {
return {
action: "failure",
reasoning,
text: (_a = input.error) != null ? _a : "Unknown error",
id: lastMessage.id
};
}
return {
action: "success",
reasoning,
text: (_b = input.error) != null ? _b : "Unknown error",
id: lastMessage.id
};
}
if (lastMessage.name !== "computer") {
return {
action: "failure",
reasoning,
text: "Wrong message called",
id: lastMessage.id
};
}
const { action, text } = lastMessage.input;
let coordinate;
let rawCoord = lastMessage.input.coordinate;
if (typeof rawCoord == "string") {
console.log("Coordinate is a string:", rawCoord);
rawCoord = JSON.parse(rawCoord);
}
if (typeof rawCoord == "object") {
if ("x" in rawCoord && "y" in rawCoord) {
console.log("Coordinate object has x and y properties");
coordinate = [rawCoord.x, rawCoord.y];
} else if (Array.isArray(rawCoord)) {
coordinate = [rawCoord[0], rawCoord[1]];
}
}
switch (action) {
case "key":
if (["page_down", "pagedown"].includes((_c = text == null ? void 0 : text.toLocaleLowerCase().trim()) != null ? _c : "")) {
return {
action: "scroll_down",
reasoning,
id: lastMessage.id
};
} else if (["page_up", "pageup"].includes((_d = text == null ? void 0 : text.toLocaleLowerCase().trim()) != null ? _d : "")) {
return {
action: "scroll_up",
reasoning,
id: lastMessage.id
};
}
// Explicit fallthrough
case "type":
if (!text) {
return {
action: "failure",
reasoning,
text: `No text provided for ${action}`,
id: lastMessage.id
};
} else {
return {
action,
reasoning,
text,
id: lastMessage.id
};
}
case "mouse_move":
if (!coordinate) {
return {
action: "failure",
reasoning,
text: "No coordinate provided",
id: lastMessage.id
};
} else {
const browserCoordinates = this.llmToBrowserCoordinates({ x: coordinate[0], y: coordinate[1] }, scaling);
const xJitter = Math.abs(browserCoordinates.x - currentState.mouse.x);
const yJitter = Math.abs(browserCoordinates.y - currentState.mouse.y);
if (xJitter <= this.mouseJitterReduction && yJitter <= this.mouseJitterReduction) {
console.log("Mouse jitter detected, overriding with click");
return {
action: "left_click",
reasoning,
id: lastMessage.id
};
}
}
case "left_click_drag":
if (!coordinate) {
return {
action: "failure",
reasoning,
text: "No coordinate provided",
id: lastMessage.id
};
} else {
const browserCoordinates = this.llmToBrowserCoordinates({ x: coordinate[0], y: coordinate[1] }, scaling);
return {
action,
reasoning,
coordinate: [browserCoordinates.x, browserCoordinates.y],
id: lastMessage.id
};
}
case "switch_tab":
const tabId = parseInt(text != null ? text : "", 10);
if (isNaN(tabId)) {
return {
action: "failure",
reasoning,
text: "Invalid tab ID provided for switch_tab",
id: lastMessage.id
};
}
return {
action: "switch_tab",
reasoning,
text,
id: lastMessage.id
};
case "left_click":
case "right_click":
case "middle_click":
case "double_click":
case "screenshot":
case "cursor_position":
return {
action,
reasoning,
id: lastMessage.id
};
default:
return {
action: "failure",
reasoning,
text: `Unsupported computer action: ${action}`,
id: lastMessage.id
};
}
}
planAction(goal, additionalContext, additionalInstructions, currentState, sessionHistory) {
return __async(this, null, function* () {
const systemPrompt = this.formatSystemPrompt(goal, additionalContext, additionalInstructions);
const messages = yield this.formatIntoMessages(goal, additionalContext, currentState, sessionHistory);
const scaling = this.getScalingRatio({ x: currentState.width, y: currentState.height });
this.printMessagesWithoutScreenshots(messages);
const response = yield this.client.beta.messages.create({
model: "claude-3-5-sonnet-20241022",
system: systemPrompt,
max_tokens: 1024,
tools: [
{
type: "computer_20241022",
name: "computer",
display_width_px: currentState.width,
display_height_px: currentState.height,
display_number: 1
},
{
name: "switch_tab",
description: "Call this function to switch the active browser tab to a new one",
input_schema: {
type: "object",
properties: {
tab_id: {
type: "integer",
description: "The ID of the tab to switch to"
}
},
required: ["tab_id"]
}
},
{
name: "stop_browsing",
description: "Call this function when you have achieved the goal of the task.",
input_schema: {
type: "object",
properties: {
success: {
type: "boolean",
description: "Whether the task was successful"
},
error: {
type: "string",
description: "The error message if the task was not successful"
}
},
required: ["success"]
}
}
],
messages,
betas: ["computer-use-2024-10-22"]
});
console.log(`Token usage - Input: ${response.usage.input_tokens}, Output: ${response.usage.output_tokens}`);
this.inputTokenUsage += response.usage.input_tokens;
this.outputTokenUsage += response.usage.output_tokens;
console.log(`Cumulative token usage - Input: ${this.inputTokenUsage}, Output: ${this.outputTokenUsage}, Total: ${this.inputTokenUsage + this.outputTokenUsage}`);
const action = this.parseAction(response, scaling, currentState);
console.log(action);
return action;
});
}
printMessagesWithoutScreenshots(msg) {
const msgCopy = JSON.parse(JSON.stringify(msg));
for (const message of msgCopy) {
if (message.content) {
for (const outerContent of message.content) {
if (outerContent.content) {
outerContent.content = outerContent.content.filter((content) => content.type !== "image");
}
}
}
}
for (const message of msgCopy) {
console.log(JSON.stringify(message, null, 2));
}
}
};
// Annotate the CommonJS export names for ESM import in node:
0 && (module.exports = {
ActionPlanner,
AnthropicPlanner,
BrowserAgent,
pauseForInput
});
//# sourceMappingURL=index.cjs.map