
Two-Phase Repository Walk with Byte Budget

Split processing into a path/size scan, structure analysis, and budgeted read/parse of only the needed chunks, keeping memory usage controllable even for large repositories.


When analyzing large codebases, loading every file into memory at once quickly exhausts available memory. By first collecting only paths and file sizes, then completing structure analysis, and finally reading, parsing, and freeing files in chunks bounded by a byte budget, you keep peak memory usage predictable.

Code

import { promises as fs } from 'node:fs'
import { join, extname } from 'node:path'

interface FileEntry {
  path: string
  size: number
}

interface ParsedFile {
  path: string
  content: string
  ast?: unknown // AST or analysis result
}

interface WalkOptions {
  /** Extensions to parse (e.g., ['.ts', '.js']) */
  parseableExtensions: string[]
  /** Patterns to ignore (e.g., node_modules, .git) */
  ignorePatterns: RegExp[]
  /** Maximum bytes per chunk */
  chunkBudget: number
  /** Maximum single file size (skip files exceeding this) */
  maxFileSize: number
}

/**
 * Phase 1: Scan only paths and sizes
 */
async function scanPaths(
  rootPath: string,
  options: WalkOptions
): Promise<FileEntry[]> {
  const entries: FileEntry[] = []

  async function walk(dir: string): Promise<void> {
    const items = await fs.readdir(dir, { withFileTypes: true })
    for (const item of items) {
      const fullPath = join(dir, item.name)
      // Check ignore patterns early so ignored trees are never descended or stat'd
      if (options.ignorePatterns.some((pattern) => pattern.test(fullPath))) {
        continue
      }
      if (item.isDirectory()) {
        await walk(fullPath)
      } else if (item.isFile()) {
        const stat = await fs.stat(fullPath)
        // Size check (exclude huge files early)
        if (stat.size > options.maxFileSize) {
          continue
        }
        entries.push({
          path: fullPath,
          size: stat.size
        })
      }
    }
  }

  await walk(rootPath)
  return entries
}

/**
 * Phase 2: Structure analysis and filtering
 */
function analyzeStructure(
  entries: FileEntry[],
  options: WalkOptions
): FileEntry[] {
  // Filter by extension
  const parseableFiles = entries.filter((entry) => {
    const ext = extname(entry.path)
    return options.parseableExtensions.includes(ext)
  })
  // Sort by size (processing smaller files first yields early results)
  return parseableFiles.toSorted((a, b) => a.size - b.size)
}

/**
 * Phase 3: Chunk by byte budget and read/parse sequentially
 */
async function parseWithBudget(
  entries: FileEntry[],
  options: WalkOptions,
  parseFile: (content: string, path: string) => unknown
): Promise<ParsedFile[]> {
  const results: ParsedFile[] = []
  let currentBudget = 0
  let chunk: FileEntry[] = []

  for (const entry of entries) {
    // Flush the current chunk before it would exceed the budget
    // (the length guard avoids processing an empty chunk when a single
    // entry is larger than the budget)
    if (chunk.length > 0 && currentBudget + entry.size > options.chunkBudget) {
      const parsed = await processChunk(chunk, parseFile)
      results.push(...parsed)
      // Reset (allows GC to reclaim memory)
      chunk = []
      currentBudget = 0
    }
    chunk.push(entry)
    currentBudget += entry.size
  }

  // Process final chunk
  if (chunk.length > 0) {
    const parsed = await processChunk(chunk, parseFile)
    results.push(...parsed)
  }
  return results
}

/**
 * Batch read and parse files within a chunk
 */
async function processChunk(
  chunk: FileEntry[],
  parseFile: (content: string, path: string) => unknown
): Promise<ParsedFile[]> {
  // Parallel reading (concurrent I/O within the chunk hides disk latency)
  const readPromises = chunk.map(async (entry) => {
    try {
      const content = await fs.readFile(entry.path, 'utf-8')
      const ast = parseFile(content, entry.path)
      return { path: entry.path, content, ast }
    } catch (error) {
      console.warn(`Failed to parse ${entry.path}:`, error)
      return null
    }
  })
  const parsed = await Promise.all(readPromises)
  return parsed.filter((result): result is ParsedFile => result !== null)
}

/**
 * Unified interface
 */
export async function walkRepository(
  rootPath: string,
  options: WalkOptions,
  parseFile: (content: string, path: string) => unknown
): Promise<ParsedFile[]> {
  // Phase 1: Scan paths and sizes
  const entries = await scanPaths(rootPath, options)
  // Phase 2: Structure analysis
  const parseableFiles = analyzeStructure(entries, options)
  // Phase 3: Parse sequentially by byte budget
  return parseWithBudget(parseableFiles, options, parseFile)
}

Usage

import { parse } from '@typescript-eslint/typescript-estree'

// Analyze a TypeScript repository
const results = await walkRepository(
  './my-project',
  {
    parseableExtensions: ['.ts', '.tsx', '.js', '.jsx'],
    ignorePatterns: [/node_modules/, /\.git/, /dist/, /build/],
    chunkBudget: 50 * 1024 * 1024, // 50 MB per chunk
    maxFileSize: 10 * 1024 * 1024 // skip files over 10 MB
  },
  (content, path) => {
    // Apply the AST parser of your choice
    return parse(content, {
      filePath: path,
      jsx: true
    })
  }
)

console.log(`Parsed ${results.length} files`)

How It Works

  1. Phase 1 (Scan): Recursively walk directories with fs.readdir and fs.stat, collecting only path and size. File contents are not read.
  2. Phase 2 (Structure Analysis): Extract parseable files from collected metadata and optimize processing order (by size, priority, etc.).
  3. Phase 3 (Chunking + Parse): Divide files into chunks not exceeding byte budget, then read → parse → free each chunk.
  4. Parallel Reading: Files within a chunk are read concurrently with Promise.all to reduce I/O wait time.
  5. Memory Release: Reset variables after chunk processing so GC can reclaim memory.
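The chunking step of Phase 3 can be isolated as a pure function, which makes the budget logic easy to reason about and test on its own. The following is a minimal sketch; `chunkByBudget` is an illustrative name, not a function used in the code above.

```typescript
interface FileEntry {
  path: string
  size: number
}

/** Greedily group entries into chunks whose total size stays within the budget. */
function chunkByBudget(entries: FileEntry[], budget: number): FileEntry[][] {
  const chunks: FileEntry[][] = []
  let current: FileEntry[] = []
  let used = 0
  for (const entry of entries) {
    // Flush the current chunk before it would exceed the budget
    if (current.length > 0 && used + entry.size > budget) {
      chunks.push(current)
      current = []
      used = 0
    }
    current.push(entry)
    used += entry.size
  }
  if (current.length > 0) chunks.push(current)
  return chunks
}

const chunks = chunkByBudget(
  [
    { path: 'a.ts', size: 3 },
    { path: 'b.ts', size: 4 },
    { path: 'c.ts', size: 1 }
  ],
  5
)
console.log(chunks.map((c) => c.map((e) => e.path)))
// → [ [ 'a.ts' ], [ 'b.ts', 'c.ts' ] ]
```

Note that an entry larger than the budget still gets its own chunk rather than being dropped, which is why the `maxFileSize` filter in Phase 1 matters.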

Benefits

  • Memory Control: Avoid reading all files at once; set upper limit with byte budget
  • Early Filtering: Exclude unnecessary files using only stats, reducing I/O
  • Progress Tracking: Report progress per chunk, making it easy to update UI or handle cancellation
  • I/O Efficiency: Optimize disk wait time with parallel reading within chunks
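One caveat on the parallel reading: `Promise.all` opens every file in a chunk at once, so a budget full of tiny files can mean thousands of concurrent file handles. If that is a concern, a concurrency-limited map can replace the unbounded `Promise.all` in `processChunk`. This is a sketch under that assumption; `mapWithConcurrency` is a hypothetical helper, not part of the snippet above.

```typescript
/**
 * Map over items with at most `limit` calls to `fn` in flight,
 * preserving input order in the results.
 */
async function mapWithConcurrency<T, R>(
  items: T[],
  limit: number,
  fn: (item: T) => Promise<R>
): Promise<R[]> {
  const results: R[] = new Array(items.length)
  let next = 0
  // Each worker repeatedly claims the next unprocessed index
  async function worker(): Promise<void> {
    while (next < items.length) {
      const i = next++
      results[i] = await fn(items[i])
    }
  }
  const workers = Array.from(
    { length: Math.min(limit, items.length) },
    () => worker()
  )
  await Promise.all(workers)
  return results
}

const doubled = await mapWithConcurrency([1, 2, 3, 4], 2, async (n) => n * 2)
console.log(doubled) // → [ 2, 4, 6, 8 ]
```

Writing into `results[i]` rather than pushing keeps output order stable even though workers finish out of order.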

Caveats

Determining which files to prioritize in Phase 2 requires some domain knowledge. For example, reading package.json or config files first can inform the processing strategy for the remaining files. chunkBudget and maxFileSize also need tuning to the memory limits of the target environment. Finally, note that ParsedFile retains the full content string: if downstream consumers only need the ASTs, drop content from the results, or the accumulated results array will hold every file's text and undo the per-chunk memory bound.

Applications

  • Code indexer: Extract and index function/type definitions across repository
  • Static analyzer: Apply lint rules to large codebase in bulk
  • AST-based migration tool: Apply automatic refactoring to entire codebase
  • Bulk linter: Check tens of thousands of files without memory failure
  • Large-repo ingestion: Analyze changes in large repositories in CI/CD