fix: make read tool more mem efficient (#14009)
This commit is contained in:
@@ -1,6 +1,7 @@
|
||||
import z from "zod"
|
||||
import * as fs from "fs"
|
||||
import * as path from "path"
|
||||
import { createInterface } from "readline"
|
||||
import { Tool } from "./tool"
|
||||
import { LSP } from "../lsp"
|
||||
import { FileTime } from "../file/time"
|
||||
@@ -11,7 +12,9 @@ import { InstructionPrompt } from "../session/instruction"
|
||||
|
||||
const DEFAULT_READ_LIMIT = 2000
|
||||
const MAX_LINE_LENGTH = 2000
|
||||
const MAX_LINE_SUFFIX = `... (line truncated to ${MAX_LINE_LENGTH} chars)`
|
||||
const MAX_BYTES = 50 * 1024
|
||||
const MAX_BYTES_LABEL = `${MAX_BYTES / 1024} KB`
|
||||
|
||||
export const ReadTool = Tool.define("read", {
|
||||
description: DESCRIPTION,
|
||||
@@ -134,27 +137,53 @@ export const ReadTool = Tool.define("read", {
|
||||
}
|
||||
}
|
||||
|
||||
const isBinary = await isBinaryFile(filepath, file)
|
||||
const isBinary = await isBinaryFile(filepath, stat.size)
|
||||
if (isBinary) throw new Error(`Cannot read binary file: ${filepath}`)
|
||||
|
||||
const stream = fs.createReadStream(filepath, { encoding: "utf8" })
|
||||
const rl = createInterface({
|
||||
input: stream,
|
||||
// Note: we use the crlfDelay option to recognize all instances of CR LF
|
||||
// ('\r\n') in file as a single line break.
|
||||
crlfDelay: Infinity,
|
||||
})
|
||||
|
||||
const limit = params.limit ?? DEFAULT_READ_LIMIT
|
||||
const offset = params.offset ?? 1
|
||||
const start = offset - 1
|
||||
const lines = await file.text().then((text) => text.split("\n"))
|
||||
if (start >= lines.length) throw new Error(`Offset ${offset} is out of range for this file (${lines.length} lines)`)
|
||||
|
||||
const raw: string[] = []
|
||||
let bytes = 0
|
||||
let lines = 0
|
||||
let truncatedByBytes = false
|
||||
for (let i = start; i < Math.min(lines.length, start + limit); i++) {
|
||||
const line = lines[i].length > MAX_LINE_LENGTH ? lines[i].substring(0, MAX_LINE_LENGTH) + "..." : lines[i]
|
||||
const size = Buffer.byteLength(line, "utf-8") + (raw.length > 0 ? 1 : 0)
|
||||
if (bytes + size > MAX_BYTES) {
|
||||
truncatedByBytes = true
|
||||
break
|
||||
let hasMoreLines = false
|
||||
try {
|
||||
for await (const text of rl) {
|
||||
lines += 1
|
||||
if (lines <= start) continue
|
||||
|
||||
if (raw.length >= limit) {
|
||||
hasMoreLines = true
|
||||
continue
|
||||
}
|
||||
|
||||
const line = text.length > MAX_LINE_LENGTH ? text.substring(0, MAX_LINE_LENGTH) + MAX_LINE_SUFFIX : text
|
||||
const size = Buffer.byteLength(line, "utf-8") + (raw.length > 0 ? 1 : 0)
|
||||
if (bytes + size > MAX_BYTES) {
|
||||
truncatedByBytes = true
|
||||
hasMoreLines = true
|
||||
break
|
||||
}
|
||||
|
||||
raw.push(line)
|
||||
bytes += size
|
||||
}
|
||||
raw.push(line)
|
||||
bytes += size
|
||||
} finally {
|
||||
rl.close()
|
||||
stream.destroy()
|
||||
}
|
||||
|
||||
if (lines < offset && !(lines === 0 && offset === 1)) {
|
||||
throw new Error(`Offset ${offset} is out of range for this file (${lines} lines)`)
|
||||
}
|
||||
|
||||
const content = raw.map((line, index) => {
|
||||
@@ -165,15 +194,15 @@ export const ReadTool = Tool.define("read", {
|
||||
let output = [`<path>${filepath}</path>`, `<type>file</type>`, "<content>"].join("\n")
|
||||
output += content.join("\n")
|
||||
|
||||
const totalLines = lines.length
|
||||
const totalLines = lines
|
||||
const lastReadLine = offset + raw.length - 1
|
||||
const hasMoreLines = totalLines > lastReadLine
|
||||
const nextOffset = lastReadLine + 1
|
||||
const truncated = hasMoreLines || truncatedByBytes
|
||||
|
||||
if (truncatedByBytes) {
|
||||
output += `\n\n(Output truncated at ${MAX_BYTES} bytes. Use 'offset' parameter to read beyond line ${lastReadLine})`
|
||||
output += `\n\n(Output capped at ${MAX_BYTES_LABEL}. Showing lines ${offset}-${lastReadLine}. Use offset=${nextOffset} to continue.)`
|
||||
} else if (hasMoreLines) {
|
||||
output += `\n\n(File has more lines. Use 'offset' parameter to read beyond line ${lastReadLine})`
|
||||
output += `\n\n(Showing lines ${offset}-${lastReadLine} of ${totalLines}. Use offset=${nextOffset} to continue.)`
|
||||
} else {
|
||||
output += `\n\n(End of file - total ${totalLines} lines)`
|
||||
}
|
||||
@@ -199,7 +228,7 @@ export const ReadTool = Tool.define("read", {
|
||||
},
|
||||
})
|
||||
|
||||
async function isBinaryFile(filepath: string, file: Bun.BunFile): Promise<boolean> {
|
||||
async function isBinaryFile(filepath: string, fileSize: number): Promise<boolean> {
|
||||
const ext = path.extname(filepath).toLowerCase()
|
||||
// binary check for common non-text extensions
|
||||
switch (ext) {
|
||||
@@ -236,22 +265,25 @@ async function isBinaryFile(filepath: string, file: Bun.BunFile): Promise<boolea
|
||||
break
|
||||
}
|
||||
|
||||
const stat = await file.stat()
|
||||
const fileSize = stat.size
|
||||
if (fileSize === 0) return false
|
||||
|
||||
const bufferSize = Math.min(4096, fileSize)
|
||||
const buffer = await file.arrayBuffer()
|
||||
if (buffer.byteLength === 0) return false
|
||||
const bytes = new Uint8Array(buffer.slice(0, bufferSize))
|
||||
const fh = await fs.promises.open(filepath, "r")
|
||||
try {
|
||||
const sampleSize = Math.min(4096, fileSize)
|
||||
const bytes = Buffer.alloc(sampleSize)
|
||||
const result = await fh.read(bytes, 0, sampleSize, 0)
|
||||
if (result.bytesRead === 0) return false
|
||||
|
||||
let nonPrintableCount = 0
|
||||
for (let i = 0; i < bytes.length; i++) {
|
||||
if (bytes[i] === 0) return true
|
||||
if (bytes[i] < 9 || (bytes[i] > 13 && bytes[i] < 32)) {
|
||||
nonPrintableCount++
|
||||
let nonPrintableCount = 0
|
||||
for (let i = 0; i < result.bytesRead; i++) {
|
||||
if (bytes[i] === 0) return true
|
||||
if (bytes[i] < 9 || (bytes[i] > 13 && bytes[i] < 32)) {
|
||||
nonPrintableCount++
|
||||
}
|
||||
}
|
||||
// If >30% non-printable characters, consider it binary
|
||||
return nonPrintableCount / result.bytesRead > 0.3
|
||||
} finally {
|
||||
await fh.close()
|
||||
}
|
||||
// If >30% non-printable characters, consider it binary
|
||||
return nonPrintableCount / bytes.length > 0.3
|
||||
}
|
||||
|
||||
@@ -211,8 +211,8 @@ describe("tool.read truncation", () => {
|
||||
const read = await ReadTool.init()
|
||||
const result = await read.execute({ filePath: path.join(tmp.path, "large.json") }, ctx)
|
||||
expect(result.metadata.truncated).toBe(true)
|
||||
expect(result.output).toContain("Output truncated at")
|
||||
expect(result.output).toContain("bytes")
|
||||
expect(result.output).toContain("Output capped at")
|
||||
expect(result.output).toContain("Use offset=")
|
||||
},
|
||||
})
|
||||
})
|
||||
@@ -230,7 +230,8 @@ describe("tool.read truncation", () => {
|
||||
const read = await ReadTool.init()
|
||||
const result = await read.execute({ filePath: path.join(tmp.path, "many-lines.txt"), limit: 10 }, ctx)
|
||||
expect(result.metadata.truncated).toBe(true)
|
||||
expect(result.output).toContain("File has more lines")
|
||||
expect(result.output).toContain("Showing lines 1-10 of 100")
|
||||
expect(result.output).toContain("Use offset=11")
|
||||
expect(result.output).toContain("line0")
|
||||
expect(result.output).toContain("line9")
|
||||
expect(result.output).not.toContain("line10")
|
||||
@@ -267,6 +268,10 @@ describe("tool.read truncation", () => {
|
||||
fn: async () => {
|
||||
const read = await ReadTool.init()
|
||||
const result = await read.execute({ filePath: path.join(tmp.path, "offset.txt"), offset: 10, limit: 5 }, ctx)
|
||||
expect(result.output).toContain("10: line10")
|
||||
expect(result.output).toContain("14: line14")
|
||||
expect(result.output).not.toContain("9: line10")
|
||||
expect(result.output).not.toContain("15: line15")
|
||||
expect(result.output).toContain("line10")
|
||||
expect(result.output).toContain("line14")
|
||||
expect(result.output).not.toContain("line0")
|
||||
@@ -293,6 +298,40 @@ describe("tool.read truncation", () => {
|
||||
})
|
||||
})
|
||||
|
||||
test("allows reading empty file at default offset", async () => {
|
||||
await using tmp = await tmpdir({
|
||||
init: async (dir) => {
|
||||
await Bun.write(path.join(dir, "empty.txt"), "")
|
||||
},
|
||||
})
|
||||
await Instance.provide({
|
||||
directory: tmp.path,
|
||||
fn: async () => {
|
||||
const read = await ReadTool.init()
|
||||
const result = await read.execute({ filePath: path.join(tmp.path, "empty.txt") }, ctx)
|
||||
expect(result.metadata.truncated).toBe(false)
|
||||
expect(result.output).toContain("End of file - total 0 lines")
|
||||
},
|
||||
})
|
||||
})
|
||||
|
||||
test("throws when offset > 1 for empty file", async () => {
|
||||
await using tmp = await tmpdir({
|
||||
init: async (dir) => {
|
||||
await Bun.write(path.join(dir, "empty.txt"), "")
|
||||
},
|
||||
})
|
||||
await Instance.provide({
|
||||
directory: tmp.path,
|
||||
fn: async () => {
|
||||
const read = await ReadTool.init()
|
||||
await expect(read.execute({ filePath: path.join(tmp.path, "empty.txt"), offset: 2 }, ctx)).rejects.toThrow(
|
||||
"Offset 2 is out of range for this file (0 lines)",
|
||||
)
|
||||
},
|
||||
})
|
||||
})
|
||||
|
||||
test("does not mark final directory page as truncated", async () => {
|
||||
await using tmp = await tmpdir({
|
||||
init: async (dir) => {
|
||||
@@ -324,7 +363,7 @@ describe("tool.read truncation", () => {
|
||||
fn: async () => {
|
||||
const read = await ReadTool.init()
|
||||
const result = await read.execute({ filePath: path.join(tmp.path, "long-line.txt") }, ctx)
|
||||
expect(result.output).toContain("...")
|
||||
expect(result.output).toContain("(line truncated to 2000 chars)")
|
||||
expect(result.output.length).toBeLessThan(3000)
|
||||
},
|
||||
})
|
||||
@@ -425,3 +464,40 @@ describe("tool.read loaded instructions", () => {
|
||||
})
|
||||
})
|
||||
})
|
||||
|
||||
describe("tool.read binary detection", () => {
|
||||
test("rejects text extension files with null bytes", async () => {
|
||||
await using tmp = await tmpdir({
|
||||
init: async (dir) => {
|
||||
const bytes = Buffer.from([0x68, 0x65, 0x6c, 0x6c, 0x6f, 0x00, 0x77, 0x6f, 0x72, 0x6c, 0x64])
|
||||
await Bun.write(path.join(dir, "null-byte.txt"), bytes)
|
||||
},
|
||||
})
|
||||
await Instance.provide({
|
||||
directory: tmp.path,
|
||||
fn: async () => {
|
||||
const read = await ReadTool.init()
|
||||
await expect(read.execute({ filePath: path.join(tmp.path, "null-byte.txt") }, ctx)).rejects.toThrow(
|
||||
"Cannot read binary file",
|
||||
)
|
||||
},
|
||||
})
|
||||
})
|
||||
|
||||
test("rejects known binary extensions", async () => {
|
||||
await using tmp = await tmpdir({
|
||||
init: async (dir) => {
|
||||
await Bun.write(path.join(dir, "module.wasm"), "not really wasm")
|
||||
},
|
||||
})
|
||||
await Instance.provide({
|
||||
directory: tmp.path,
|
||||
fn: async () => {
|
||||
const read = await ReadTool.init()
|
||||
await expect(read.execute({ filePath: path.join(tmp.path, "module.wasm") }, ctx)).rejects.toThrow(
|
||||
"Cannot read binary file",
|
||||
)
|
||||
},
|
||||
})
|
||||
})
|
||||
})
|
||||
|
||||
Reference in New Issue
Block a user