claude-code内置工具Read核心代码

寒霜 · 2025-11-14 · 2025-12-29

核心算法实现

1. 智能文件类型检测

/**
 * Classifies a file by combining three signals: its lower-cased extension,
 * the MIME type reported by `mime.getType`, and the signature returned by
 * `readFileSignature`. The resolved type is then used to derive the category,
 * encoding, read strategy and processing hints.
 *
 * @param filePath - Path of the file to inspect.
 * @returns The resolved file type together with the values derived from it.
 */
function detectFileType(filePath: string): FileTypeInfo {
  // Collect the detection signals: extension, MIME type, file-header signature.
  const ext = path.extname(filePath).toLowerCase();
  const mimeGuess = mime.getType(filePath);
  const signature = readFileSignature(filePath);

  // Merge the signals into a single verdict.
  const resolvedType = determineFileType(ext, mimeGuess, signature);

  // Everything below is derived from the resolved type, in field order.
  const category = categorizeFileType(resolvedType);
  const encoding = detectEncoding(filePath, resolvedType);
  const readStrategy = selectReadStrategy(resolvedType);
  const processingHints = getProcessingHints(resolvedType);

  return {
    type: resolvedType,
    category,
    encoding,
    readStrategy,
    processingHints
  };
}

// File type categories.
// Each member's value is the lowercase string form of its name.
enum FileCategory {
  TEXT = 'text',           // plain-text files
  CODE = 'code',           // source-code files
  BINARY = 'binary',       // binary files
  IMAGE = 'image',         // image files
  NOTEBOOK = 'notebook',   // Jupyter notebooks
  DOCUMENT = 'document',   // document files
  ARCHIVE = 'archive',     // archive / compressed files
  UNKNOWN = 'unknown'      // unrecognized type
}

2. 智能编码检测机制

/**
 * Determines the text encoding of a file.
 *
 * A byte-order mark, when present, decides the encoding with confidence 1.0.
 * Otherwise an 8 KB sample is scored against a fixed list of candidate
 * encodings and the highest-scoring one wins; on a tie the candidate listed
 * first is kept.
 *
 * @param filePath - Path of the file to analyse.
 * @returns The chosen encoding, its confidence and the detection method. For
 *   the statistical method, `alternatives` lists every candidate scoring
 *   above 0.7 (the winner included when it clears that bar).
 */
function detectTextEncoding(filePath: string): EncodingInfo {
  // Step 1: a BOM is authoritative, so return straight away.
  const bom = detectBOM(filePath);
  if (bom.hasBOM) {
    return { encoding: bom.encoding, confidence: 1.0, method: 'BOM' };
  }

  // Step 2: no BOM, so fall back to statistical analysis of an 8 KB sample.
  const sample = readFileSample(filePath, 8192);
  const candidates = [
    'utf-8',
    'utf-16le',
    'utf-16be',
    'gbk',
    'gb2312',
    'shift_jis',
    'euc-kr',
    'iso-8859-1'
  ];

  // Step 3: score every candidate in list order.
  const scored: { encoding: string; confidence: number }[] = [];
  for (const encoding of candidates) {
    scored.push({
      encoding,
      confidence: calculateEncodingConfidence(sample, encoding)
    });
  }

  // Step 4: keep the strictly highest score; earlier candidates win ties.
  let winner = scored[0];
  for (let i = 1; i < scored.length; i++) {
    if (scored[i].confidence > winner.confidence) {
      winner = scored[i];
    }
  }

  const strongCandidates = scored.filter(entry => entry.confidence > 0.7);

  return {
    encoding: winner.encoding,
    confidence: winner.confidence,
    method: 'statistical',
    alternatives: strongCandidates
  };
}

3. 分块读取优化算法

/**
 * Streams a file as a sequence of processed chunks.
 *
 * The chunk size comes from `calculateOptimalChunkSize` and is used as the
 * read stream's `highWaterMark`. Every raw chunk is passed through
 * `processFileChunk` before being yielded with progress information.
 *
 * Notes on the yielded fields:
 * - `index` is the 1-based ordinal of the chunk (the counter is incremented
 *   before yielding), whereas `processFileChunk` receives the 0-based position
 *   and the byte offset at which the chunk starts.
 * - `bytesRead` counts bytes, not characters, even when `options.encoding`
 *   makes the stream yield decoded strings.
 *
 * After each yield, if the heap usage exceeds `options.maxMemoryUsage`, a
 * garbage collection is requested (when `global.gc` is exposed) and reading
 * pauses until `waitForMemoryRelease` resolves.
 *
 * @param filePath - Path of the file to read.
 * @param options - Read options; `encoding`, `maxMemoryUsage` and
 *   `memoryPressureThreshold` are consulted here.
 * @returns An async generator of chunk descriptors.
 * @throws ReadError with `FILE_READ_ERROR` when streaming or chunk processing
 *   fails. A failure of the initial `fs.stat` call is not wrapped.
 */
async function* readFileInChunks(
  filePath: string,
  options: ReadOptions
): AsyncGenerator<FileChunk> {

  const fileStats = await fs.stat(filePath);
  const fileSize = fileStats.size;

  // Pick a chunk size suited to the file size and memory budget.
  const chunkSize = calculateOptimalChunkSize(fileSize, options);

  const encoding = options.encoding as BufferEncoding;
  const readStream = fs.createReadStream(filePath, {
    encoding,
    highWaterMark: chunkSize
  });

  let bytesRead = 0;
  let chunkIndex = 0;

  try {
    for await (const chunk of readStream) {
      const processedChunk = await processFileChunk(
        chunk,
        chunkIndex,
        bytesRead,
        fileSize,
        options
      );

      // With an encoding set the stream yields strings, whose `.length` is a
      // count of UTF-16 code units. Convert back to a byte count so progress
      // and `isLast` stay comparable with the on-disk size. (For input that is
      // not valid in the given encoding the re-encoded length is only an
      // approximation.)
      bytesRead += typeof chunk === 'string'
        ? Buffer.byteLength(chunk, encoding)
        : chunk.length;
      chunkIndex++;

      yield {
        index: chunkIndex,
        data: processedChunk,
        bytesRead: bytesRead,
        totalBytes: fileSize,
        progress: bytesRead / fileSize,
        isLast: bytesRead >= fileSize
      };

      // Memory-pressure check before pulling the next chunk.
      if (process.memoryUsage().heapUsed > options.maxMemoryUsage) {
        // `global.gc` only exists when the runtime exposes it.
        if (global.gc) {
          global.gc();
        }

        // Pause reading until memory has been released.
        await waitForMemoryRelease(options.memoryPressureThreshold);
      }
    }
  } catch (error) {
    // Anything can be thrown, so narrow before reading `.message`.
    const reason = error instanceof Error ? error.message : String(error);
    throw new ReadError(
      ReadErrorType.FILE_READ_ERROR,
      `Failed to read file in chunks: ${reason}`,
      { filePath, bytesRead, chunkIndex }
    );
  }
}

/**
 * Chooses a read-chunk size (in whole bytes) for a file of the given size.
 *
 * - Files under 1 MB are read in one piece.
 * - Files under 10 MB use a 64 KB chunk.
 * - Larger files aim for roughly 100 chunks, clamped to 4 KB .. 2 MB.
 *
 * The result is additionally capped at 10% of the remaining memory budget
 * (`options.maxMemoryUsage` minus the current heap usage) and never drops
 * below 4 KB. It is always a finite integer so it can be used directly as a
 * stream `highWaterMark`.
 *
 * @param fileSize - Size of the file in bytes.
 * @param options - Read options; only `maxMemoryUsage` is consulted. When it
 *   is missing or not finite, the memory cap is skipped.
 * @returns The chunk size in bytes (an integer, at least 4096).
 */
function calculateOptimalChunkSize(
  fileSize: number,
  options: ReadOptions
): number {
  const baseChunkSize = 64 * 1024; // 64KB
  const maxChunkSize = 2 * 1024 * 1024; // 2MB
  const minChunkSize = 4 * 1024; // 4KB

  let chunkSize: number;

  if (fileSize < 1024 * 1024) {
    // Small file: read it in one go.
    chunkSize = fileSize;
  } else if (fileSize < 10 * 1024 * 1024) {
    // Medium file: use the base chunk size.
    chunkSize = baseChunkSize;
  } else {
    // Large file: aim for about 100 chunks within the min/max bounds.
    chunkSize = Math.min(
      maxChunkSize,
      Math.max(minChunkSize, fileSize / 100)
    );
  }

  // Cap at 10% of the remaining memory budget. A missing budget would turn
  // the whole computation into NaN, so only apply the cap when it is usable.
  const availableMemory = options.maxMemoryUsage - process.memoryUsage().heapUsed;
  if (Number.isFinite(availableMemory)) {
    chunkSize = Math.min(chunkSize, availableMemory * 0.1);
  }

  // Streams reject a fractional highWaterMark, so round down to whole bytes.
  const wholeBytes = Math.floor(chunkSize);
  return Number.isFinite(wholeBytes)
    ? Math.max(minChunkSize, wholeBytes)
    : minChunkSize;
}

4. readFileState状态管理

/**
 * Records the result of a file read in the shared state tracker.
 *
 * The entry is keyed by the absolute path and stores the content as a string,
 * the current time, file-system stats, a content hash, the options used for
 * the read and a per-path read counter. Afterwards, entries older than
 * `options.stateRetentionTime` are pruned via `cleanupExpiredFileStates`.
 *
 * @param filePath - Path of the file that was read (resolved to absolute).
 * @param content - What was read; Buffers are decoded with `options.encoding`
 *   (falling back to 'utf8') before being stored.
 * @param options - The options the read was performed with.
 * @param readFileState - Tracker object that is mutated in place.
 */
function updateReadFileState(
  filePath: string,
  content: string | Buffer,
  options: ReadOptions,
  readFileState: FileStateTracker
): void {
  const key = path.resolve(filePath);
  const stats = fs.statSync(key);

  // The hash is taken over the content as received, before any decoding.
  const digest = calculateContentHash(content);

  const encodingName = options.encoding || 'utf8';

  let text: string;
  if (typeof content === 'string') {
    text = content;
  } else {
    text = content.toString(encodingName);
  }

  const recordedAt = Date.now();
  const requestedRange = {
    offset: options.offset,
    limit: options.limit,
    encoding: options.encoding
  };

  const fileType = detectFileType(key);
  // Continue the counter from the entry being replaced, if there is one.
  const earlierReads = readFileState[key]?.metadata?.readCount || 0;
  const accessedAt = Date.now();

  readFileState[key] = {
    content: text,
    timestamp: recordedAt,
    fileSystemTimestamp: stats.mtimeMs,
    size: stats.size,
    encoding: encodingName,
    contentHash: digest,
    readOptions: requestedRange,
    metadata: {
      fileType,
      readCount: earlierReads + 1,
      lastAccessTime: accessedAt,
      permissions: stats.mode
    }
  };

  // Drop entries that have outlived the retention window.
  cleanupExpiredFileStates(readFileState, options.stateRetentionTime);
}

/**
 * Removes tracker entries whose `timestamp` is older than the retention
 * window and, when anything was removed, reports the cleanup through
 * `logFileStateCleanup`.
 *
 * @param readFileState - Tracker object that is pruned in place.
 * @param retentionTime - Maximum entry age in milliseconds.
 */
function cleanupExpiredFileStates(
  readFileState: FileStateTracker,
  retentionTime: number
): void {
  const now = Date.now();

  // Decide what is stale from a snapshot, then delete in a second step.
  const stalePaths: string[] = Object.entries(readFileState)
    .filter(([, state]) => now - state.timestamp > retentionTime)
    .map(([trackedPath]) => trackedPath);

  stalePaths.forEach(stalePath => {
    delete readFileState[stalePath];
  });

  // Nothing removed means nothing to report.
  if (stalePaths.length === 0) {
    return;
  }

  logFileStateCleanup({
    cleanupTime: now,
    expiredCount: stalePaths.length,
    remainingCount: Object.keys(readFileState).length
  });
}

5. 特殊文件类型处理

Jupyter Notebook处理

/**
 * Reads a Jupyter notebook through the dedicated `NotebookRead` tool and
 * returns it as formatted text plus summary metadata.
 *
 * @param filePath - Path of the `.ipynb` file.
 * @param options - Read options; `cellId` is forwarded to `NotebookRead` as
 *   `cell_id`.
 * @returns The formatted notebook text with the cell count, `nbformat`
 *   version and kernelspec.
 * @throws ReadError with `NOTEBOOK_READ_ERROR` when reading or formatting
 *   fails; the original failure's message is embedded in the error message.
 */
async function readJupyterNotebook(
  filePath: string,
  options: ReadOptions
): Promise<NotebookReadResult> {

  try {
    const notebookContent = await NotebookRead({
      notebook_path: filePath,
      cell_id: options.cellId
    });

    const formattedContent = formatNotebookContent(notebookContent);

    return {
      success: true,
      content: formattedContent,
      metadata: {
        cellCount: notebookContent.cells?.length || 0,
        notebookVersion: notebookContent.nbformat,
        kernelInfo: notebookContent.metadata?.kernelspec
      }
    };
  } catch (error) {
    // Anything can be thrown, so narrow before reading `.message`; a
    // non-Error value is reported via its string form instead of "undefined".
    const reason = error instanceof Error ? error.message : String(error);
    throw new ReadError(
      ReadErrorType.NOTEBOOK_READ_ERROR,
      `Failed to read Jupyter notebook: ${reason}`,
      { filePath, options }
    );
  }
}

/**
 * Renders parsed notebook data as Markdown-like text.
 *
 * The output starts with a title line, a kernel line and a separator. Each
 * cell then contributes a numbered heading with its type, its source, the
 * `text` of any outputs that carry one, and a closing separator. All pieces
 * are joined with newlines.
 *
 * @param notebookData - Parsed notebook object; `metadata` and `cells` are
 *   both optional.
 * @returns The rendered text.
 */
function formatNotebookContent(notebookData: any): string {
  // Text fields may be a single string or a list of fragments.
  const asText = (value: any): any =>
    Array.isArray(value) ? value.join('') : value;

  // Produces the pieces contributed by one cell.
  const renderCell = (cell: any, position: number): any[] => {
    const pieces: any[] = [`## Cell ${position + 1} (${cell.cell_type})`];

    if (cell.source) {
      pieces.push(asText(cell.source));
    }

    // Only outputs that carry a `text` field are rendered.
    if (cell.outputs && cell.outputs.length > 0) {
      pieces.push('\n### Output:');
      cell.outputs.forEach((output: any) => {
        if (output.text) {
          pieces.push(asText(output.text));
        }
      });
    }

    pieces.push('\n---\n');
    return pieces;
  };

  const meta = notebookData.metadata;
  const header: any[] = [
    `# Jupyter Notebook: ${meta?.title || 'Untitled'}`,
    `Kernel: ${meta?.kernelspec?.display_name || 'Unknown'}`,
    '---\n'
  ];

  const cells = notebookData.cells;
  const body: any[] = cells && Array.isArray(cells) ? cells.flatMap(renderCell) : [];

  return [...header, ...body].join('\n');
}

图像文件处理

/**
 * Reads an image file and returns its content together with basic metadata.
 *
 * @param filePath - Path of the image file.
 * @param options - Read options; when `returnBase64` is truthy the content is
 *   returned as a `data:` URL, otherwise as the raw Buffer.
 * @returns The image content plus format, dimensions, byte size and MIME type.
 * @throws ReadError with `IMAGE_READ_ERROR` when reading the file or
 *   extracting metadata fails; the original failure's message is embedded in
 *   the error message.
 */
async function readImageFile(
  filePath: string,
  options: ReadOptions
): Promise<ImageReadResult> {

  try {
    // Load the raw bytes of the image.
    const imageBuffer = await fs.readFile(filePath);

    const imageMetadata = await getImageMetadata(imageBuffer);

    // Either wrap the bytes in a data URL or hand back the Buffer as is.
    const imageContent = options.returnBase64
      ? `data:${imageMetadata.mimeType};base64,${imageBuffer.toString('base64')}`
      : imageBuffer;

    return {
      success: true,
      content: imageContent,
      metadata: {
        format: imageMetadata.format,
        width: imageMetadata.width,
        height: imageMetadata.height,
        size: imageBuffer.length,
        mimeType: imageMetadata.mimeType
      }
    };
  } catch (error) {
    // Anything can be thrown, so narrow before reading `.message`; a
    // non-Error value is reported via its string form instead of "undefined".
    const reason = error instanceof Error ? error.message : String(error);
    throw new ReadError(
      ReadErrorType.IMAGE_READ_ERROR,
      `Failed to read image file: ${reason}`,
      { filePath, options }
    );
  }
}

/**
 * Extracts basic metadata from an in-memory image.
 *
 * The format comes from `detectImageFormat` and the dimensions from
 * `extractImageDimensions`. The MIME type is `image/<format>`, except for
 * format names whose registered media subtype differs from the short name
 * (`jpg` becomes `image/jpeg`, `svg` becomes `image/svg+xml`).
 *
 * @param buffer - The image bytes.
 * @returns Format, width, height and MIME type, plus `hasAlpha` (defaults to
 *   `false`) and `colorDepth` (defaults to 8) when the dimension data does
 *   not provide them.
 */
async function getImageMetadata(buffer: Buffer): Promise<ImageMetadata> {
  const format = detectImageFormat(buffer);

  const dimensions = await extractImageDimensions(buffer, format);

  // Short names that are not themselves valid media subtypes.
  // NOTE(review): assumes detectImageFormat returns lowercase names; confirm.
  const subtypeOverrides = new Map<string, string>([
    ['jpg', 'jpeg'],
    ['svg', 'svg+xml']
  ]);
  const subtype = subtypeOverrides.get(format) ?? format;

  return {
    format: format,
    width: dimensions.width,
    height: dimensions.height,
    mimeType: `image/${subtype}`,
    hasAlpha: dimensions.hasAlpha || false,
    colorDepth: dimensions.colorDepth || 8
  };
}