核心算法实现
1. 智能文件类型检测
/**
 * Builds the type descriptor for a file.
 *
 * Three signals are gathered from the path — the lower-cased extension, the
 * MIME type reported by `mime.getType`, and the result of
 * `readFileSignature` — and handed to `determineFileType`. Every other field
 * of the result is derived from the type that call returns.
 *
 * @param filePath Path of the file to inspect.
 * @returns The resolved type plus its category, encoding, read strategy and
 *          processing hints.
 */
function detectFileType(filePath: string): FileTypeInfo {
  // Arguments are evaluated left to right: extension, MIME type, signature.
  const type = determineFileType(
    path.extname(filePath).toLowerCase(),
    mime.getType(filePath),
    readFileSignature(filePath)
  );

  const category = categorizeFileType(type);
  const encoding = detectEncoding(filePath, type);
  const readStrategy = selectReadStrategy(type);
  const processingHints = getProcessingHints(type);

  return { type, category, encoding, readStrategy, processingHints };
}
/**
 * Coarse file categories.
 *
 * This is a string enum: each member's runtime value is the lower-case form
 * of its name, so values stay readable when logged or serialized.
 *
 * NOTE(review): presumably this is what `categorizeFileType` returns for the
 * `category` field built in `detectFileType` — confirm against that helper.
 */
enum FileCategory {
TEXT = 'text',
CODE = 'code',
BINARY = 'binary',
IMAGE = 'image',
NOTEBOOK = 'notebook',
DOCUMENT = 'document',
ARCHIVE = 'archive',
// Fallback member for files that match none of the categories above.
UNKNOWN = 'unknown'
}
2. 智能编码检测机制
/**
 * Determines the text encoding of a file.
 *
 * A byte-order mark, when `detectBOM` reports one, is trusted outright
 * (confidence 1.0, method 'BOM'). Otherwise an 8192-byte sample is scored
 * against a fixed list of candidate encodings with
 * `calculateEncodingConfidence`, and the highest-scoring candidate is
 * returned with method 'statistical'.
 *
 * @param filePath Path of the file to inspect.
 * @returns The chosen encoding and its confidence. For the statistical path,
 *          `alternatives` holds every candidate scoring above 0.7 (the
 *          winner included, if it qualifies).
 */
function detectTextEncoding(filePath: string): EncodingInfo {
  const bom = detectBOM(filePath);
  if (bom.hasBOM) {
    return { encoding: bom.encoding, confidence: 1.0, method: 'BOM' };
  }

  const sample = readFileSample(filePath, 8192);

  // Candidates are scored in this order; on a tie the earlier entry wins.
  const scored = [
    'utf-8',
    'utf-16le',
    'utf-16be',
    'gbk',
    'gb2312',
    'shift_jis',
    'euc-kr',
    'iso-8859-1'
  ].map((name) => {
    const confidence = calculateEncodingConfidence(sample, name);
    return { encoding: name, confidence };
  });

  const winner = scored.reduce((leader, contender) => {
    // Strict comparison keeps the current leader when scores are equal.
    if (contender.confidence > leader.confidence) {
      return contender;
    }
    return leader;
  });

  const alternatives = scored.filter((entry) => entry.confidence > 0.7);

  return {
    encoding: winner.encoding,
    confidence: winner.confidence,
    method: 'statistical',
    alternatives
  };
}
3. 分块读取优化算法
/**
 * Streams a file as a sequence of processed chunks.
 *
 * The file is stat-ed once to learn its size, a chunk size is chosen with
 * `calculateOptimalChunkSize`, and the file is then read through a read
 * stream whose `highWaterMark` is that chunk size. Each raw chunk is passed
 * through `processFileChunk` before being yielded together with progress
 * information.
 *
 * After each yield the heap usage is compared with `options.maxMemoryUsage`;
 * when it is exceeded, a GC is requested (only if `global.gc` is exposed) and
 * the generator waits on `waitForMemoryRelease`.
 *
 * @param filePath Path of the file to read.
 * @param options  Read options; `encoding`, `maxMemoryUsage` and
 *                 `memoryPressureThreshold` are read here, the rest is
 *                 forwarded to the helpers.
 * @yields One `FileChunk` per stream chunk. `index` is 1-based as yielded
 *         (the counter is incremented before the yield), whereas
 *         `processFileChunk` receives the 0-based position.
 * @throws ReadError (`FILE_READ_ERROR`) for any failure raised while
 *         iterating the stream or processing a chunk. Failures of the
 *         initial stat call are not wrapped.
 */
async function* readFileInChunks(
  filePath: string,
  options: ReadOptions
): AsyncGenerator<FileChunk> {
  const fileStats = await fs.stat(filePath);
  const fileSize = fileStats.size;
  const chunkSize = calculateOptimalChunkSize(fileSize, options);
  const encoding = options.encoding as BufferEncoding;
  const readStream = fs.createReadStream(filePath, {
    encoding,
    highWaterMark: chunkSize
  });
  let bytesRead = 0;
  let chunkIndex = 0;
  try {
    for await (const chunk of readStream) {
      const processedChunk = await processFileChunk(
        chunk,
        chunkIndex,
        bytesRead,
        fileSize,
        options
      );
      // With an encoding set the stream yields strings, whose `.length` counts
      // UTF-16 code units rather than bytes. Measure the encoded size so that
      // progress and `isLast` stay correct for multi-byte text.
      bytesRead +=
        typeof chunk === 'string'
          ? Buffer.byteLength(chunk, encoding)
          : chunk.length;
      chunkIndex++;
      yield {
        index: chunkIndex,
        data: processedChunk,
        bytesRead: bytesRead,
        totalBytes: fileSize,
        // A zero-byte stat size would otherwise produce Infinity/NaN.
        progress: fileSize > 0 ? bytesRead / fileSize : 1,
        isLast: bytesRead >= fileSize
      };
      if (process.memoryUsage().heapUsed > options.maxMemoryUsage) {
        // `global.gc` only exists when Node is started with --expose-gc.
        if (global.gc) {
          global.gc();
        }
        await waitForMemoryRelease(options.memoryPressureThreshold);
      }
    }
  } catch (error: unknown) {
    // Anything can be thrown; only Error instances carry a `message`.
    const reason = error instanceof Error ? error.message : String(error);
    throw new ReadError(
      ReadErrorType.FILE_READ_ERROR,
      `Failed to read file in chunks: ${reason}`,
      { filePath, bytesRead, chunkIndex }
    );
  }
}
/**
 * Chooses a read chunk size (in bytes) for a file of the given size.
 *
 * - Files under 1 MiB are read in a single chunk of `fileSize` bytes.
 * - Files under 10 MiB use a fixed 64 KiB chunk.
 * - Larger files use 1% of the file size, clamped to [4 KiB, 2 MiB].
 *
 * The size is then capped at 10% of the remaining heap budget
 * (`options.maxMemoryUsage` minus current heap usage) and never drops below
 * 4 KiB.
 *
 * The result is always a finite integer: it is used as a stream
 * `highWaterMark`, which must be an integer. A missing or non-finite memory
 * budget is ignored instead of turning the result into `NaN`.
 *
 * @param fileSize Size of the file in bytes.
 * @param options  Read options; only `maxMemoryUsage` is read.
 * @returns Chunk size in whole bytes, at least 4096.
 */
function calculateOptimalChunkSize(
  fileSize: number,
  options: ReadOptions
): number {
  const baseChunkSize = 64 * 1024;
  const maxChunkSize = 2 * 1024 * 1024;
  const minChunkSize = 4 * 1024;

  let chunkSize: number;
  if (fileSize < 1024 * 1024) {
    chunkSize = fileSize;
  } else if (fileSize < 10 * 1024 * 1024) {
    chunkSize = baseChunkSize;
  } else {
    chunkSize = Math.min(
      maxChunkSize,
      Math.max(minChunkSize, fileSize / 100)
    );
  }

  const availableMemory = options.maxMemoryUsage - process.memoryUsage().heapUsed;
  // Skip the memory cap when no usable budget was configured (undefined/NaN/Infinity).
  if (Number.isFinite(availableMemory)) {
    chunkSize = Math.min(chunkSize, availableMemory * 0.1);
  }

  // Both fileSize / 100 and the 10% memory cap can be fractional.
  const wholeBytes = Math.max(minChunkSize, Math.floor(chunkSize));
  return Number.isFinite(wholeBytes) ? wholeBytes : minChunkSize;
}
4. readFileState状态管理
/**
 * Records the result of a file read in the shared state tracker.
 *
 * The entry is keyed by the absolute path and replaces any previous entry
 * for that path, carrying over only its read counter (incremented by one).
 * Once the entry is stored, expired entries are pruned via
 * `cleanupExpiredFileStates` using `options.stateRetentionTime`.
 *
 * @param filePath      Path that was read; resolved to an absolute path.
 * @param content       Content that was read. Buffers are decoded with
 *                      `options.encoding`, falling back to 'utf8'.
 * @param options       Options of the read; `offset`, `limit` and `encoding`
 *                      are stored alongside the content.
 * @param readFileState Tracker object that is mutated in place.
 */
function updateReadFileState(
  filePath: string,
  content: string | Buffer,
  options: ReadOptions,
  readFileState: FileStateTracker
): void {
  const resolvedPath = path.resolve(filePath);
  const stats = fs.statSync(resolvedPath);
  const hash = calculateContentHash(content);

  // The steps below run in the same order as the fields they populate.
  const text =
    typeof content === 'string'
      ? content
      : content.toString(options.encoding || 'utf8');
  const recordedAt = Date.now();
  const fileType = detectFileType(resolvedPath);
  // Read the previous counter before the entry is overwritten.
  const previousReads = readFileState[resolvedPath]?.metadata?.readCount || 0;
  const accessedAt = Date.now();

  readFileState[resolvedPath] = {
    content: text,
    timestamp: recordedAt,
    fileSystemTimestamp: stats.mtimeMs,
    size: stats.size,
    encoding: options.encoding || 'utf8',
    contentHash: hash,
    readOptions: {
      offset: options.offset,
      limit: options.limit,
      encoding: options.encoding
    },
    metadata: {
      fileType,
      readCount: previousReads + 1,
      lastAccessTime: accessedAt,
      permissions: stats.mode
    }
  };

  cleanupExpiredFileStates(readFileState, options.stateRetentionTime);
}
/**
 * Removes tracker entries whose `timestamp` is older than the retention
 * window.
 *
 * Expired paths are collected first and deleted afterwards, so the tracker
 * is not mutated while it is being enumerated. A summary is passed to
 * `logFileStateCleanup` only when at least one entry was removed.
 *
 * @param readFileState Tracker object that is pruned in place.
 * @param retentionTime Maximum entry age, compared against `Date.now()`
 *                      minus the entry's `timestamp`.
 */
function cleanupExpiredFileStates(
  readFileState: FileStateTracker,
  retentionTime: number
): void {
  const now = Date.now();

  const stalePaths: string[] = Object.entries(readFileState)
    .filter(([, state]) => now - state.timestamp > retentionTime)
    .map(([trackedPath]) => trackedPath);

  if (stalePaths.length === 0) {
    return;
  }

  stalePaths.forEach((stalePath) => {
    delete readFileState[stalePath];
  });

  // remainingCount is taken after the deletions above.
  logFileStateCleanup({
    cleanupTime: now,
    expiredCount: stalePaths.length,
    remainingCount: Object.keys(readFileState).length
  });
}
5. 特殊文件类型处理
Jupyter Notebook处理
/**
 * Reads a Jupyter notebook and returns it as formatted text.
 *
 * The notebook is loaded through `NotebookRead` (passing the path and the
 * optional `options.cellId`) and rendered with `formatNotebookContent`.
 *
 * @param filePath Path of the notebook file.
 * @param options  Read options; only `cellId` is read here.
 * @returns The formatted content plus the cell count (0 when the notebook
 *          has no `cells` array), `nbformat` value and kernelspec metadata.
 * @throws ReadError (`NOTEBOOK_READ_ERROR`) when loading or formatting
 *         fails; both steps are inside the guarded region.
 */
async function readJupyterNotebook(
  filePath: string,
  options: ReadOptions
): Promise<NotebookReadResult> {
  try {
    const notebookContent = await NotebookRead({
      notebook_path: filePath,
      cell_id: options.cellId
    });
    const formattedContent = formatNotebookContent(notebookContent);
    return {
      success: true,
      content: formattedContent,
      metadata: {
        cellCount: notebookContent.cells?.length || 0,
        notebookVersion: notebookContent.nbformat,
        kernelInfo: notebookContent.metadata?.kernelspec
      }
    };
  } catch (error: unknown) {
    // Anything can be thrown; only Error instances carry a `message`.
    const reason = error instanceof Error ? error.message : String(error);
    throw new ReadError(
      ReadErrorType.NOTEBOOK_READ_ERROR,
      `Failed to read Jupyter notebook: ${reason}`,
      { filePath, options }
    );
  }
}
/**
 * Renders a parsed Jupyter notebook as a Markdown-like text document.
 *
 * The document starts with a title line (`metadata.title` or 'Untitled'), a
 * kernel line (`metadata.kernelspec.display_name` or 'Unknown') and a
 * separator. Each cell then contributes a numbered heading with its
 * `cell_type`, its source, and — when it has outputs — an "Output" section.
 *
 * Outputs are rendered from the first of these that is present:
 * 1. `text` (stream outputs),
 * 2. `data['text/plain']` (execute_result / display_data outputs),
 * 3. for `output_type === 'error'`, "ename: evalue" followed by the
 *    traceback lines.
 * Outputs with none of these (e.g. image-only display data) add nothing.
 *
 * @param notebookData Parsed notebook JSON; must be non-null.
 * @returns The rendered text, sections joined by newlines.
 */
function formatNotebookContent(notebookData: any): string {
  // Multi-line strings may arrive as one string or as an array of fragments.
  const joinFragments = (value: any): any =>
    Array.isArray(value) ? value.join('') : value;

  // Returns the text to emit for one output, or undefined when there is none.
  const renderOutput = (output: any): any => {
    if (output.text) {
      return joinFragments(output.text);
    }
    const plainText = output.data?.['text/plain'];
    if (plainText) {
      return joinFragments(plainText);
    }
    if (output.output_type === 'error') {
      const headline = [output.ename, output.evalue].filter(Boolean).join(': ');
      const traceback = Array.isArray(output.traceback)
        ? output.traceback.join('\n')
        : '';
      const rendered = [headline, traceback].filter(Boolean).join('\n');
      return rendered === '' ? undefined : rendered;
    }
    return undefined;
  };

  const sections: string[] = [];
  sections.push(`# Jupyter Notebook: ${notebookData.metadata?.title || 'Untitled'}`);
  sections.push(`Kernel: ${notebookData.metadata?.kernelspec?.display_name || 'Unknown'}`);
  sections.push('---\n');

  if (notebookData.cells && Array.isArray(notebookData.cells)) {
    notebookData.cells.forEach((cell: any, index: number) => {
      sections.push(`## Cell ${index + 1} (${cell.cell_type})`);
      if (cell.source) {
        sections.push(joinFragments(cell.source));
      }
      if (cell.outputs && cell.outputs.length > 0) {
        sections.push('\n### Output:');
        cell.outputs.forEach((output: any) => {
          const rendered = renderOutput(output);
          if (rendered !== undefined) {
            sections.push(rendered);
          }
        });
      }
      sections.push('\n---\n');
    });
  }

  return sections.join('\n');
}
图像文件处理
/**
 * Reads an image file into memory and returns it with basic metadata.
 *
 * The whole file is loaded into a buffer and inspected with
 * `getImageMetadata`. When `options.returnBase64` is truthy the content is a
 * `data:` URL built from the detected MIME type and the base64-encoded
 * bytes; otherwise it is the raw buffer.
 *
 * @param filePath Path of the image file.
 * @param options  Read options; only `returnBase64` is read here.
 * @returns The content plus format, dimensions, byte size and MIME type.
 * @throws ReadError (`IMAGE_READ_ERROR`) when reading the file or
 *         extracting its metadata fails.
 */
async function readImageFile(
  filePath: string,
  options: ReadOptions
): Promise<ImageReadResult> {
  try {
    const imageBuffer = await fs.readFile(filePath);
    const imageMetadata = await getImageMetadata(imageBuffer);
    const imageContent = options.returnBase64
      ? `data:${imageMetadata.mimeType};base64,${imageBuffer.toString('base64')}`
      : imageBuffer;
    return {
      success: true,
      content: imageContent,
      metadata: {
        format: imageMetadata.format,
        width: imageMetadata.width,
        height: imageMetadata.height,
        size: imageBuffer.length,
        mimeType: imageMetadata.mimeType
      }
    };
  } catch (error: unknown) {
    // Anything can be thrown; only Error instances carry a `message`.
    const reason = error instanceof Error ? error.message : String(error);
    throw new ReadError(
      ReadErrorType.IMAGE_READ_ERROR,
      `Failed to read image file: ${reason}`,
      { filePath, options }
    );
  }
}
/**
 * Derives image metadata from raw image bytes.
 *
 * The format comes from `detectImageFormat`; dimensions and pixel
 * properties come from `extractImageDimensions`. A falsy `hasAlpha` is
 * reported as `false` and a falsy `colorDepth` as 8.
 *
 * NOTE(review): the MIME type is built as `image/<format>`; confirm that
 * `detectImageFormat` returns registered subtype names (e.g. 'jpeg' rather
 * than 'jpg').
 *
 * @param buffer Raw bytes of the image.
 * @returns Format, width, height, MIME type, alpha flag and color depth.
 */
async function getImageMetadata(buffer: Buffer): Promise<ImageMetadata> {
  const format = detectImageFormat(buffer);
  const measured = await extractImageDimensions(buffer, format);

  const metadata = {
    format,
    width: measured.width,
    height: measured.height,
    mimeType: `image/${format}`,
    hasAlpha: measured.hasAlpha || false,
    colorDepth: measured.colorDepth || 8
  };
  return metadata;
}