Files
pugz/src/lexer.zig

1481 lines
50 KiB
Zig
Raw Normal View History

//! Pug Lexer - Tokenizes Pug template source into a stream of tokens.
//!
//! The lexer handles indentation-based nesting (emitting indent/dedent tokens),
//! Pug-specific syntax (tags, classes, IDs, attributes), and text content
//! including interpolation markers.
const std = @import("std");
/// All possible token types produced by the lexer.
pub const TokenType = enum {
// Structure tokens for indentation-based nesting
indent, // Increased indentation level
dedent, // Decreased indentation level
newline, // Line terminator
eof, // End of source
// Element tokens
tag, // HTML tag name: div, p, a, span, etc.
class, // Class selector: .classname
id, // ID selector: #idname
// Attribute tokens for (attr=value) syntax
lparen, // Opening paren: (
rparen, // Closing paren: )
attr_name, // Attribute name: href, class, data-id
attr_eq, // Assignment: = or !=
attr_value, // Attribute value (quoted or unquoted)
comma, // Attribute separator: ,
// Text content tokens
text, // Plain text content
buffered_text, // Escaped output: = expr
unescaped_text, // Raw output: != expr
pipe_text, // Piped text: | text
dot_block, // Text block marker: .
literal_html, // Literal HTML: <tag>...
self_close, // Self-closing marker: /
// Interpolation tokens for #{} and !{} syntax
interp_start, // Escaped interpolation: #{
interp_start_unesc, // Unescaped interpolation: !{
interp_end, // Interpolation end: }
// Tag interpolation tokens for #[tag text] syntax
tag_interp_start, // Tag interpolation start: #[
tag_interp_end, // Tag interpolation end: ]
// Control flow keywords
kw_if,
kw_else,
kw_unless,
kw_each,
kw_for, // alias for each
kw_while,
kw_in,
kw_case,
kw_when,
kw_default,
// Template structure keywords
kw_doctype,
kw_mixin,
kw_block,
kw_extends,
kw_include,
kw_append,
kw_prepend,
// Mixin invocation: +mixinName
mixin_call,
// Comment tokens
comment, // Rendered comment: //
comment_unbuffered, // Silent comment: //-
// Miscellaneous
colon, // Block expansion: :
ampersand_attrs, // Attribute spread: &attributes
};
/// A single token with its type, value, and source location.
pub const Token = struct {
type: TokenType,
value: []const u8, // Slice into source (no allocation)
line: usize,
column: usize,
};
/// Errors that can occur during lexing.
pub const LexerError = error{
UnterminatedString,
UnmatchedBrace,
OutOfMemory,
};
/// Static map for keyword lookup. Using comptime perfect hashing would be ideal,
/// but a simple StaticStringMap is efficient for small keyword sets.
const keywords = std.StaticStringMap(TokenType).initComptime(.{
.{ "if", .kw_if },
.{ "else", .kw_else },
.{ "unless", .kw_unless },
.{ "each", .kw_each },
.{ "for", .kw_for },
.{ "while", .kw_while },
.{ "case", .kw_case },
.{ "when", .kw_when },
.{ "default", .kw_default },
.{ "doctype", .kw_doctype },
.{ "mixin", .kw_mixin },
.{ "block", .kw_block },
.{ "extends", .kw_extends },
.{ "include", .kw_include },
.{ "append", .kw_append },
.{ "prepend", .kw_prepend },
.{ "in", .kw_in },
});
/// Lexer for Pug template syntax.
///
/// Converts source text into a sequence of tokens. Handles:
/// - Indentation tracking with indent/dedent tokens
/// - Tag, class, and ID shorthand syntax
/// - Attribute parsing within parentheses
/// - Text content and interpolation
/// - Comments and keywords
pub const Lexer = struct {
source: []const u8,
pos: usize,
line: usize,
column: usize,
indent_stack: std.ArrayListUnmanaged(usize),
tokens: std.ArrayListUnmanaged(Token),
allocator: std.mem.Allocator,
at_line_start: bool,
current_indent: usize,
in_raw_block: bool,
raw_block_indent: usize,
raw_block_started: bool,
/// Creates a new lexer for the given source.
/// Does not allocate; allocations happen during tokenize().
pub fn init(allocator: std.mem.Allocator, source: []const u8) Lexer {
return .{
.source = source,
.pos = 0,
.line = 1,
.column = 1,
.indent_stack = .empty,
.tokens = .empty,
.allocator = allocator,
.at_line_start = true,
.current_indent = 0,
.in_raw_block = false,
.raw_block_indent = 0,
.raw_block_started = false,
};
}
/// Releases all allocated memory (tokens and indent stack).
/// Call this when done with the lexer, typically via defer.
pub fn deinit(self: *Lexer) void {
self.indent_stack.deinit(self.allocator);
self.tokens.deinit(self.allocator);
}
/// Tokenizes the source and returns the token slice.
///
/// Returns a slice of tokens owned by the Lexer. The slice remains valid
/// until deinit() is called. On error, calls reset() via errdefer to
/// restore the lexer to a clean state for potential retry or inspection.
pub fn tokenize(self: *Lexer) ![]Token {
// Pre-allocate with estimated capacity: ~1 token per 10 chars is a reasonable heuristic
const estimated_tokens = @max(16, self.source.len / 10);
try self.tokens.ensureTotalCapacity(self.allocator, estimated_tokens);
try self.indent_stack.ensureTotalCapacity(self.allocator, 16); // Reasonable nesting depth
try self.indent_stack.append(self.allocator, 0);
errdefer self.reset();
while (!self.isAtEnd()) {
try self.scanToken();
}
// Emit dedents for any remaining indentation levels
while (self.indent_stack.items.len > 1) {
_ = self.indent_stack.pop();
try self.addToken(.dedent, "");
}
try self.addToken(.eof, "");
return self.tokens.items;
}
/// Resets lexer state while retaining allocated capacity.
/// Called on error to restore clean state for reuse.
pub fn reset(self: *Lexer) void {
self.tokens.clearRetainingCapacity();
self.indent_stack.clearRetainingCapacity();
self.pos = 0;
self.line = 1;
self.column = 1;
self.at_line_start = true;
self.current_indent = 0;
}
/// Appends a token to the output list.
fn addToken(self: *Lexer, token_type: TokenType, value: []const u8) !void {
try self.tokens.append(self.allocator, .{
.type = token_type,
.value = value,
.line = self.line,
.column = self.column,
});
}
/// Main token dispatch. Processes one token based on current character.
/// Handles indentation at line start, then dispatches to specific scanners.
fn scanToken(self: *Lexer) !void {
if (self.at_line_start) {
// In raw block mode, handle indentation specially
if (self.in_raw_block) {
// Remember position before consuming indent
const line_start = self.pos;
const indent = self.measureIndent();
self.current_indent = indent;
if (indent > self.raw_block_indent) {
// First line in raw block - emit indent token
if (!self.raw_block_started) {
self.raw_block_started = true;
try self.indent_stack.append(self.allocator, indent);
try self.addToken(.indent, "");
}
// Scan line as raw text, preserving relative indentation
try self.scanRawLineFrom(line_start);
self.at_line_start = false;
return;
} else {
// Exiting raw block - emit dedent and process normally
self.in_raw_block = false;
self.raw_block_started = false;
if (self.indent_stack.items.len > 1) {
_ = self.indent_stack.pop();
try self.addToken(.dedent, "");
}
try self.processIndentation();
self.at_line_start = false;
return;
}
}
try self.processIndentation();
self.at_line_start = false;
}
if (self.isAtEnd()) return;
const c = self.peek();
// Whitespace (not at line start - already handled)
if (c == ' ' or c == '\t') {
self.advance();
return;
}
// Newline: emit token and mark next line start
if (c == '\n') {
try self.addToken(.newline, "\n");
self.advance();
self.line += 1;
self.column = 1;
self.at_line_start = true;
return;
}
// Handle \r\n (Windows) and \r (old Mac)
if (c == '\r') {
self.advance();
if (self.peek() == '\n') {
self.advance();
}
try self.addToken(.newline, "\n");
self.line += 1;
self.column = 1;
self.at_line_start = true;
return;
}
// Comments: // or //-
if (c == '/' and self.peekNext() == '/') {
try self.scanComment();
return;
}
// Self-closing marker: / at end of tag (before newline or space)
if (c == '/') {
const next = self.peekNext();
if (next == '\n' or next == '\r' or next == ' ' or next == 0) {
self.advance();
try self.addToken(.self_close, "/");
return;
}
}
// Dot: either .class or . (text block)
if (c == '.') {
const next = self.peekNext();
if (next == '\n' or next == '\r' or next == 0) {
self.advance();
try self.addToken(.dot_block, ".");
// Mark that we're entering a raw text block
self.in_raw_block = true;
self.raw_block_indent = self.current_indent;
return;
}
if (isAlpha(next) or next == '-' or next == '_') {
try self.scanClass();
return;
}
}
// Hash: either #id, #{interpolation}, or #[tag interpolation]
if (c == '#') {
const next = self.peekNext();
if (next == '{') {
self.advance();
self.advance();
try self.addToken(.interp_start, "#{");
return;
}
if (next == '[') {
self.advance();
self.advance();
try self.addToken(.tag_interp_start, "#[");
return;
}
if (isAlpha(next) or next == '-' or next == '_') {
try self.scanId();
return;
}
}
// Unescaped interpolation: !{
if (c == '!' and self.peekNext() == '{') {
self.advance();
self.advance();
try self.addToken(.interp_start_unesc, "!{");
return;
}
// Attributes: (...)
if (c == '(') {
try self.scanAttributes();
return;
}
// Pipe text: | text
if (c == '|') {
try self.scanPipeText();
return;
}
// Literal HTML: lines starting with <
if (c == '<') {
try self.scanLiteralHtml();
return;
}
// Buffered output: = expression
if (c == '=') {
self.advance();
try self.addToken(.buffered_text, "=");
try self.scanInlineText();
return;
}
// Unescaped output: != expression
if (c == '!' and self.peekNext() == '=') {
self.advance();
self.advance();
try self.addToken(.unescaped_text, "!=");
try self.scanInlineText();
return;
}
// Mixin call: +name
if (c == '+') {
try self.scanMixinCall();
return;
}
// Block expansion: tag: nested
if (c == ':') {
self.advance();
try self.addToken(.colon, ":");
return;
}
// Attribute spread: &attributes(obj)
if (c == '&') {
try self.scanAmpersandAttrs();
return;
}
// Interpolation end
if (c == '}') {
self.advance();
try self.addToken(.interp_end, "}");
return;
}
// Tag name or keyword
if (isAlpha(c) or c == '_') {
try self.scanTagOrKeyword();
return;
}
// Fallback: treat remaining content as text
try self.scanInlineText();
}
/// Processes leading whitespace at line start to emit indent/dedent tokens.
/// Measures indentation at current position and advances past whitespace.
/// Returns the indent level (spaces=1, tabs=2).
fn measureIndent(self: *Lexer) usize {
var indent: usize = 0;
// Count spaces (1 each) and tabs (2 each)
while (!self.isAtEnd()) {
const c = self.peek();
if (c == ' ') {
indent += 1;
self.advance();
} else if (c == '\t') {
indent += 2;
self.advance();
} else {
break;
}
}
return indent;
}
/// Processes leading whitespace at line start to emit indent/dedent tokens.
/// Tracks indentation levels on a stack to handle nested blocks.
fn processIndentation(self: *Lexer) !void {
const indent = self.measureIndent();
// Empty lines don't affect indentation
if (!self.isAtEnd() and (self.peek() == '\n' or self.peek() == '\r')) {
return;
}
// Comment-only lines preserve current indent context
if (!self.isAtEnd() and self.peek() == '/' and self.peekNext() == '/') {
self.current_indent = indent;
return;
}
self.current_indent = indent;
const current_stack_indent = self.indent_stack.items[self.indent_stack.items.len - 1];
if (indent > current_stack_indent) {
// Deeper nesting: push new level and emit indent
try self.indent_stack.append(self.allocator, indent);
try self.addToken(.indent, "");
} else if (indent < current_stack_indent) {
// Shallower nesting: pop levels and emit dedents
while (self.indent_stack.items.len > 1 and
self.indent_stack.items[self.indent_stack.items.len - 1] > indent)
{
_ = self.indent_stack.pop();
try self.addToken(.dedent, "");
// Exit raw block mode when dedenting to or below original level
if (self.in_raw_block and indent <= self.raw_block_indent) {
self.in_raw_block = false;
}
}
}
}
/// Scans a comment (// or //-) until end of line.
/// Unbuffered comments (//-) are not rendered in output.
fn scanComment(self: *Lexer) !void {
self.advance(); // skip first /
self.advance(); // skip second /
const is_unbuffered = self.peek() == '-';
if (is_unbuffered) {
self.advance();
}
const start = self.pos;
while (!self.isAtEnd() and self.peek() != '\n' and self.peek() != '\r') {
self.advance();
}
const value = self.source[start..self.pos];
try self.addToken(if (is_unbuffered) .comment_unbuffered else .comment, value);
}
/// Scans a class selector: .classname
/// After the class, checks for inline text if no more selectors follow.
fn scanClass(self: *Lexer) !void {
self.advance(); // skip .
const start = self.pos;
while (!self.isAtEnd()) {
const c = self.peek();
if (isAlphaNumeric(c) or c == '-' or c == '_') {
self.advance();
} else {
break;
}
}
try self.addToken(.class, self.source[start..self.pos]);
// Check for inline text after class (if no more selectors/attrs follow)
try self.tryInlineTextAfterSelector();
}
/// Scans an ID selector: #idname
/// After the ID, checks for inline text if no more selectors follow.
fn scanId(self: *Lexer) !void {
self.advance(); // skip #
const start = self.pos;
while (!self.isAtEnd()) {
const c = self.peek();
if (isAlphaNumeric(c) or c == '-' or c == '_') {
self.advance();
} else {
break;
}
}
try self.addToken(.id, self.source[start..self.pos]);
// Check for inline text after ID (if no more selectors/attrs follow)
try self.tryInlineTextAfterSelector();
}
/// Scans attribute list: (name=value, name2=value2, boolean)
/// Also handles mixin arguments: ('value', expr, name=value)
/// Handles quoted strings, expressions, and boolean attributes.
fn scanAttributes(self: *Lexer) !void {
self.advance(); // skip (
try self.addToken(.lparen, "(");
while (!self.isAtEnd() and self.peek() != ')') {
self.skipWhitespaceInAttrs();
if (self.peek() == ')') break;
// Comma separator
if (self.peek() == ',') {
self.advance();
try self.addToken(.comma, ",");
continue;
}
// Check for bare value (mixin argument): starts with quote or digit
const c = self.peek();
if (c == '"' or c == '\'' or c == '`' or c == '{' or c == '[' or isDigit(c)) {
// This is a bare value (mixin argument), not name=value
try self.scanAttrValue();
continue;
}
// Check for rest parameter: ...name
const name_start = self.pos;
if (c == '.' and self.peekAt(1) == '.' and self.peekAt(2) == '.') {
// Skip the three dots, include them in attr_name
self.advance();
self.advance();
self.advance();
}
// Attribute name (supports data-*, @event, :bind)
while (!self.isAtEnd()) {
const ch = self.peek();
if (isAlphaNumeric(ch) or ch == '-' or ch == '_' or ch == ':' or ch == '@') {
self.advance();
} else {
break;
}
}
if (self.pos > name_start) {
try self.addToken(.attr_name, self.source[name_start..self.pos]);
} else {
// No attribute name found - skip unknown character to prevent infinite loop
// This can happen with operators like + in expressions
self.advance();
continue;
}
self.skipWhitespaceInAttrs();
// Value assignment: = or !=
if (self.peek() == '!' and self.peekNext() == '=') {
self.advance();
self.advance();
try self.addToken(.attr_eq, "!=");
self.skipWhitespaceInAttrs();
try self.scanAttrValue();
} else if (self.peek() == '=') {
self.advance();
try self.addToken(.attr_eq, "=");
self.skipWhitespaceInAttrs();
try self.scanAttrValue();
}
// No = means boolean attribute (e.g., checked, disabled)
}
if (self.peek() == ')') {
self.advance();
try self.addToken(.rparen, ")");
// Check for inline text after attributes: a(href='...') Click me
if (self.peek() == ' ') {
const next = self.peekAt(1);
// Don't consume if followed by selector, attr, or special syntax
if (next != '.' and next != '#' and next != '(' and next != '=' and next != ':' and
next != '\n' and next != '\r' and next != 0)
{
self.advance(); // skip space
try self.scanInlineText();
}
}
}
}
/// Scans an attribute value: "string", 'string', `template`, {object}, or expression.
/// Handles expression continuation with operators like + for string concatenation.
/// Emits a single token for the entire expression (e.g., "btn btn-" + type).
fn scanAttrValue(self: *Lexer) !void {
const start = self.pos;
// Scan the complete expression including operators
while (!self.isAtEnd()) {
const c = self.peek();
if (c == '"' or c == '\'') {
// Quoted string
const quote = c;
self.advance();
while (!self.isAtEnd() and self.peek() != quote) {
if (self.peek() == '\\' and self.peekNext() == quote) {
self.advance(); // skip backslash
}
self.advance();
}
if (self.peek() == quote) self.advance();
} else if (c == '`') {
// Template literal
self.advance();
while (!self.isAtEnd() and self.peek() != '`') {
self.advance();
}
if (self.peek() == '`') self.advance();
} else if (c == '{') {
// Object literal - scan matching braces
var depth: usize = 0;
while (!self.isAtEnd()) {
const ch = self.peek();
if (ch == '{') depth += 1;
if (ch == '}') {
depth -= 1;
self.advance();
if (depth == 0) break;
continue;
}
self.advance();
}
} else if (c == '[') {
// Array literal - scan matching brackets
var depth: usize = 0;
while (!self.isAtEnd()) {
const ch = self.peek();
if (ch == '[') depth += 1;
if (ch == ']') {
depth -= 1;
self.advance();
if (depth == 0) break;
continue;
}
self.advance();
}
} else if (c == '(') {
// Function call - scan matching parens
var depth: usize = 0;
while (!self.isAtEnd()) {
const ch = self.peek();
if (ch == '(') depth += 1;
if (ch == ')') {
depth -= 1;
self.advance();
if (depth == 0) break;
continue;
}
self.advance();
}
} else if (c == ')' or c == ',') {
// End of attribute value
break;
} else if (c == ' ' or c == '\t') {
// Whitespace - check if followed by operator (continue) or not (end)
const ws_start = self.pos;
while (self.peek() == ' ' or self.peek() == '\t') {
self.advance();
}
const next = self.peek();
if (next == '+' or next == '-' or next == '*' or next == '/') {
// Operator follows - continue scanning (include whitespace)
continue;
} else {
// Not an operator - rewind and end
self.pos = ws_start;
break;
}
} else if (c == '+' or c == '-' or c == '*' or c == '/') {
// Operator - include it and continue
self.advance();
} else if (c == '\n' or c == '\r') {
// Newline ends the value
break;
} else {
// Regular character (alphanumeric, etc.)
self.advance();
}
}
const value = std.mem.trim(u8, self.source[start..self.pos], " \t");
if (value.len > 0) {
try self.addToken(.attr_value, value);
}
}
/// Scans an object literal {...} handling nested braces.
/// Returns error if braces are unmatched.
fn scanObjectLiteral(self: *Lexer) !void {
const start = self.pos;
var brace_depth: usize = 0;
while (!self.isAtEnd()) {
const c = self.peek();
if (c == '{') {
brace_depth += 1;
} else if (c == '}') {
if (brace_depth == 0) {
// Unmatched closing brace - shouldn't happen if called correctly
return LexerError.UnmatchedBrace;
}
brace_depth -= 1;
if (brace_depth == 0) {
self.advance();
break;
}
}
self.advance();
}
// Check for unterminated object literal
if (brace_depth > 0) {
return LexerError.UnterminatedString;
}
try self.addToken(.attr_value, self.source[start..self.pos]);
}
/// Scans an array literal [...] handling nested brackets.
fn scanArrayLiteral(self: *Lexer) !void {
const start = self.pos;
var bracket_depth: usize = 0;
while (!self.isAtEnd()) {
const c = self.peek();
if (c == '[') {
bracket_depth += 1;
} else if (c == ']') {
if (bracket_depth == 0) {
return LexerError.UnmatchedBrace;
}
bracket_depth -= 1;
if (bracket_depth == 0) {
self.advance();
break;
}
}
self.advance();
}
if (bracket_depth > 0) {
return LexerError.UnterminatedString;
}
try self.addToken(.attr_value, self.source[start..self.pos]);
}
/// Skips whitespace within attribute lists (allows multi-line attributes).
/// Properly tracks line and column for error reporting.
fn skipWhitespaceInAttrs(self: *Lexer) void {
while (!self.isAtEnd()) {
const c = self.peek();
switch (c) {
' ', '\t' => self.advance(),
'\n' => {
self.pos += 1;
self.line += 1;
self.column = 1;
},
'\r' => {
self.pos += 1;
if (!self.isAtEnd() and self.source[self.pos] == '\n') {
self.pos += 1;
}
self.line += 1;
self.column = 1;
},
else => break,
}
}
}
/// Scans pipe text: | followed by text content.
fn scanPipeText(self: *Lexer) !void {
self.advance(); // skip |
if (self.peek() == ' ') self.advance(); // skip optional space
try self.addToken(.pipe_text, "|");
try self.scanInlineText();
}
/// Scans literal HTML: lines starting with < are passed through as-is.
fn scanLiteralHtml(self: *Lexer) !void {
const start = self.pos;
// Scan until end of line
while (!self.isAtEnd() and self.peek() != '\n' and self.peek() != '\r') {
self.advance();
}
const html = self.source[start..self.pos];
try self.addToken(.literal_html, html);
}
/// Scans a raw line of text (used inside dot blocks).
/// Captures everything until end of line as a single text token.
/// Preserves indentation relative to the base raw block indent.
/// Takes line_start position to include proper indentation from source.
fn scanRawLineFrom(self: *Lexer, line_start: usize) !void {
// Scan until end of line
while (!self.isAtEnd() and self.peek() != '\n' and self.peek() != '\r') {
self.advance();
}
// Include all content from line_start, preserving the indentation from source
if (self.pos > line_start) {
const text = self.source[line_start..self.pos];
try self.addToken(.text, text);
}
}
/// Scans inline text until end of line, handling interpolation markers.
/// Uses iterative approach instead of recursion to avoid stack overflow.
fn scanInlineText(self: *Lexer) !void {
if (self.peek() == ' ') self.advance(); // skip leading space
while (!self.isAtEnd() and self.peek() != '\n' and self.peek() != '\r') {
const start = self.pos;
// Scan until interpolation or end of line
while (!self.isAtEnd() and self.peek() != '\n' and self.peek() != '\r') {
const c = self.peek();
const next = self.peekNext();
// Check for interpolation start: #{, !{, or #[
if ((c == '#' or c == '!') and next == '{') {
break;
}
if (c == '#' and next == '[') {
break;
}
self.advance();
}
// Emit text before interpolation (if any)
if (self.pos > start) {
try self.addToken(.text, self.source[start..self.pos]);
}
// Handle interpolation if found
if (!self.isAtEnd() and self.peek() != '\n' and self.peek() != '\r') {
const c = self.peek();
if (c == '#' and self.peekNext() == '{') {
self.advance();
self.advance();
try self.addToken(.interp_start, "#{");
try self.scanInterpolationContent();
} else if (c == '!' and self.peekNext() == '{') {
self.advance();
self.advance();
try self.addToken(.interp_start_unesc, "!{");
try self.scanInterpolationContent();
} else if (c == '#' and self.peekNext() == '[') {
self.advance();
self.advance();
try self.addToken(.tag_interp_start, "#[");
try self.scanTagInterpolation();
}
}
}
}
/// Scans tag interpolation content: #[tag(attrs) text]
/// This needs to handle the tag, optional attributes, optional text, and closing ]
fn scanTagInterpolation(self: *Lexer) !void {
// Skip whitespace
while (self.peek() == ' ' or self.peek() == '\t') {
self.advance();
}
// Scan tag name
if (isAlpha(self.peek()) or self.peek() == '_') {
const tag_start = self.pos;
while (!self.isAtEnd()) {
const c = self.peek();
if (isAlphaNumeric(c) or c == '-' or c == '_') {
self.advance();
} else {
break;
}
}
try self.addToken(.tag, self.source[tag_start..self.pos]);
}
// Scan classes and ids (inline to avoid circular dependencies)
while (self.peek() == '.' or self.peek() == '#') {
if (self.peek() == '.') {
// Inline class scanning
self.advance(); // skip .
const class_start = self.pos;
while (!self.isAtEnd()) {
const c = self.peek();
if (isAlphaNumeric(c) or c == '-' or c == '_') {
self.advance();
} else {
break;
}
}
try self.addToken(.class, self.source[class_start..self.pos]);
} else if (self.peek() == '#' and self.peekNext() != '[' and self.peekNext() != '{') {
// Inline id scanning
self.advance(); // skip #
const id_start = self.pos;
while (!self.isAtEnd()) {
const c = self.peek();
if (isAlphaNumeric(c) or c == '-' or c == '_') {
self.advance();
} else {
break;
}
}
try self.addToken(.id, self.source[id_start..self.pos]);
} else {
break;
}
}
// Scan attributes if present (inline to avoid circular dependencies)
if (self.peek() == '(') {
self.advance(); // skip (
try self.addToken(.lparen, "(");
while (!self.isAtEnd() and self.peek() != ')') {
// Skip whitespace
while (self.peek() == ' ' or self.peek() == '\t' or self.peek() == '\n' or self.peek() == '\r') {
if (self.peek() == '\n' or self.peek() == '\r') {
self.line += 1;
self.column = 1;
}
self.advance();
}
if (self.peek() == ')') break;
// Comma separator
if (self.peek() == ',') {
self.advance();
try self.addToken(.comma, ",");
continue;
}
// Attribute name
const name_start = self.pos;
while (!self.isAtEnd()) {
const c = self.peek();
if (isAlphaNumeric(c) or c == '-' or c == '_' or c == ':' or c == '@') {
self.advance();
} else {
break;
}
}
if (self.pos > name_start) {
try self.addToken(.attr_name, self.source[name_start..self.pos]);
}
// Skip whitespace
while (self.peek() == ' ' or self.peek() == '\t') {
self.advance();
}
// Value assignment
if (self.peek() == '!' and self.peekNext() == '=') {
self.advance();
self.advance();
try self.addToken(.attr_eq, "!=");
while (self.peek() == ' ' or self.peek() == '\t') {
self.advance();
}
try self.scanAttrValue();
} else if (self.peek() == '=') {
self.advance();
try self.addToken(.attr_eq, "=");
while (self.peek() == ' ' or self.peek() == '\t') {
self.advance();
}
try self.scanAttrValue();
}
}
if (self.peek() == ')') {
self.advance();
try self.addToken(.rparen, ")");
}
}
// Skip whitespace before text content
while (self.peek() == ' ' or self.peek() == '\t') {
self.advance();
}
// Scan text content until ] (handling nested #[ ])
if (self.peek() != ']') {
const text_start = self.pos;
var bracket_depth: usize = 1;
while (!self.isAtEnd() and bracket_depth > 0) {
const c = self.peek();
if (c == '#' and self.peekNext() == '[') {
bracket_depth += 1;
self.advance();
} else if (c == ']') {
bracket_depth -= 1;
if (bracket_depth == 0) break;
} else if (c == '\n' or c == '\r') {
break;
}
self.advance();
}
if (self.pos > text_start) {
try self.addToken(.text, self.source[text_start..self.pos]);
}
}
// Emit closing ]
if (self.peek() == ']') {
self.advance();
try self.addToken(.tag_interp_end, "]");
}
}
/// Scans interpolation content between { and }, handling nested braces.
fn scanInterpolationContent(self: *Lexer) !void {
const start = self.pos;
var brace_depth: usize = 1;
while (!self.isAtEnd() and brace_depth > 0) {
const c = self.peek();
if (c == '{') {
brace_depth += 1;
} else if (c == '}') {
brace_depth -= 1;
if (brace_depth == 0) break;
}
self.advance();
}
try self.addToken(.text, self.source[start..self.pos]);
if (!self.isAtEnd() and self.peek() == '}') {
self.advance();
try self.addToken(.interp_end, "}");
}
}
/// Scans a mixin call: +mixinName
fn scanMixinCall(self: *Lexer) !void {
self.advance(); // skip +
const start = self.pos;
while (!self.isAtEnd()) {
const c = self.peek();
if (isAlphaNumeric(c) or c == '-' or c == '_') {
self.advance();
} else {
break;
}
}
try self.addToken(.mixin_call, self.source[start..self.pos]);
}
/// Scans &attributes syntax for attribute spreading.
fn scanAmpersandAttrs(self: *Lexer) !void {
const start = self.pos;
const remaining = self.source.len - self.pos;
if (remaining >= 11 and std.mem.eql(u8, self.source[self.pos..][0..11], "&attributes")) {
self.pos += 11;
self.column += 11;
try self.addToken(.ampersand_attrs, "&attributes");
// Parse the (...) that follows &attributes
if (self.peek() == '(') {
self.advance(); // skip (
const obj_start = self.pos;
var paren_depth: usize = 1;
while (!self.isAtEnd() and paren_depth > 0) {
const c = self.peek();
if (c == '(') {
paren_depth += 1;
} else if (c == ')') {
paren_depth -= 1;
}
if (paren_depth > 0) self.advance();
}
try self.addToken(.attr_value, self.source[obj_start..self.pos]);
if (self.peek() == ')') self.advance(); // skip )
}
} else {
// Lone & treated as text
self.advance();
try self.addToken(.text, self.source[start..self.pos]);
}
}
/// Checks if inline text follows after a class/ID selector.
/// Only scans inline text if the next char is space followed by non-selector content.
fn tryInlineTextAfterSelector(self: *Lexer) !void {
if (self.peek() != ' ') return;
const next = self.peekAt(1);
// Don't consume if followed by another selector, attribute, or special syntax
if (next == '.' or next == '#' or next == '(' or next == '=' or next == ':' or
next == '\n' or next == '\r' or next == 0)
{
return;
}
self.advance(); // skip space
try self.scanInlineText();
}
/// Scans a tag name or keyword, then optionally inline text.
/// Uses static map for O(1) keyword lookup.
fn scanTagOrKeyword(self: *Lexer) !void {
const start = self.pos;
while (!self.isAtEnd()) {
const c = self.peek();
if (isAlphaNumeric(c) or c == '-' or c == '_') {
self.advance();
} else {
break;
}
}
const value = self.source[start..self.pos];
// O(1) keyword lookup using static map
const token_type = keywords.get(value) orelse .tag;
try self.addToken(token_type, value);
// Keywords that take expressions: scan rest of line as text
// This allows `if user.description` to keep the dot notation intact
switch (token_type) {
.kw_if, .kw_unless, .kw_each, .kw_for, .kw_while, .kw_case, .kw_when, .kw_doctype, .kw_extends, .kw_include => {
// Skip whitespace after keyword
while (self.peek() == ' ' or self.peek() == '\t') {
self.advance();
}
// Scan rest of line as expression/path text
if (!self.isAtEnd() and self.peek() != '\n') {
try self.scanExpressionText();
}
},
.tag => {
// Tags may have inline text: p Hello world
if (self.peek() == ' ') {
const next = self.peekAt(1);
// Don't consume text if followed by selector/attr syntax
// Note: # followed by { is interpolation, not ID selector
const is_id_selector = next == '#' and self.peekAt(2) != '{';
if (next != '.' and !is_id_selector and next != '(' and next != '=' and next != ':') {
self.advance();
try self.scanInlineText();
}
}
},
else => {},
}
}
/// Scans expression text (rest of line) preserving dots and other chars.
fn scanExpressionText(self: *Lexer) !void {
const start = self.pos;
// Scan until end of line
while (!self.isAtEnd() and self.peek() != '\n') {
self.advance();
}
const text = self.source[start..self.pos];
if (text.len > 0) {
try self.addToken(.text, text);
}
}
// ─────────────────────────────────────────────────────────────────────────
// Helper functions for character inspection and position management
// ─────────────────────────────────────────────────────────────────────────
/// Returns true if at end of source.
inline fn isAtEnd(self: *const Lexer) bool {
return self.pos >= self.source.len;
}
/// Returns current character or 0 if at end.
inline fn peek(self: *const Lexer) u8 {
if (self.pos >= self.source.len) return 0;
return self.source[self.pos];
}
/// Returns next character or 0 if at/past end.
inline fn peekNext(self: *const Lexer) u8 {
if (self.pos + 1 >= self.source.len) return 0;
return self.source[self.pos + 1];
}
/// Returns character at pos + offset or 0 if out of bounds.
inline fn peekAt(self: *const Lexer, offset: usize) u8 {
const target = self.pos + offset;
if (target >= self.source.len) return 0;
return self.source[target];
}
/// Advances position and column by one.
inline fn advance(self: *Lexer) void {
if (self.pos < self.source.len) {
self.pos += 1;
self.column += 1;
}
}
};
// ─────────────────────────────────────────────────────────────────────────────
// Character classification utilities (inlined for performance)
// ─────────────────────────────────────────────────────────────────────────────
inline fn isAlpha(c: u8) bool {
return (c >= 'a' and c <= 'z') or (c >= 'A' and c <= 'Z');
}
inline fn isDigit(c: u8) bool {
return c >= '0' and c <= '9';
}
inline fn isAlphaNumeric(c: u8) bool {
return isAlpha(c) or isDigit(c);
}
// ─────────────────────────────────────────────────────────────────────────────
// Tests
// ─────────────────────────────────────────────────────────────────────────────
test "tokenize simple tag" {
const allocator = std.testing.allocator;
var lexer = Lexer.init(allocator, "div");
defer lexer.deinit();
const tokens = try lexer.tokenize();
try std.testing.expectEqual(@as(usize, 2), tokens.len);
try std.testing.expectEqual(TokenType.tag, tokens[0].type);
try std.testing.expectEqualStrings("div", tokens[0].value);
}
test "tokenize tag with class" {
const allocator = std.testing.allocator;
var lexer = Lexer.init(allocator, "div.container");
defer lexer.deinit();
const tokens = try lexer.tokenize();
try std.testing.expectEqual(TokenType.tag, tokens[0].type);
try std.testing.expectEqual(TokenType.class, tokens[1].type);
try std.testing.expectEqualStrings("container", tokens[1].value);
}
test "tokenize tag with id" {
const allocator = std.testing.allocator;
var lexer = Lexer.init(allocator, "div#main");
defer lexer.deinit();
const tokens = try lexer.tokenize();
try std.testing.expectEqual(TokenType.tag, tokens[0].type);
try std.testing.expectEqual(TokenType.id, tokens[1].type);
try std.testing.expectEqualStrings("main", tokens[1].value);
}
test "tokenize nested tags" {
const allocator = std.testing.allocator;
var lexer = Lexer.init(allocator,
\\div
\\ p Hello
);
defer lexer.deinit();
const tokens = try lexer.tokenize();
var found_indent = false;
var found_dedent = false;
for (tokens) |token| {
if (token.type == .indent) found_indent = true;
if (token.type == .dedent) found_dedent = true;
}
try std.testing.expect(found_indent);
try std.testing.expect(found_dedent);
}
test "tokenize attributes" {
const allocator = std.testing.allocator;
var lexer = Lexer.init(allocator, "a(href=\"/link\" target=\"_blank\")");
defer lexer.deinit();
const tokens = try lexer.tokenize();
try std.testing.expectEqual(TokenType.tag, tokens[0].type);
try std.testing.expectEqual(TokenType.lparen, tokens[1].type);
try std.testing.expectEqual(TokenType.attr_name, tokens[2].type);
try std.testing.expectEqualStrings("href", tokens[2].value);
try std.testing.expectEqual(TokenType.attr_eq, tokens[3].type);
try std.testing.expectEqual(TokenType.attr_value, tokens[4].type);
// Quotes are preserved in token value for expression evaluation
try std.testing.expectEqualStrings("\"/link\"", tokens[4].value);
}
test "tokenize interpolation" {
const allocator = std.testing.allocator;
var lexer = Lexer.init(allocator, "p Hello #{name}!");
defer lexer.deinit();
const tokens = try lexer.tokenize();
var found_interp_start = false;
var found_interp_end = false;
for (tokens) |token| {
if (token.type == .interp_start) found_interp_start = true;
if (token.type == .interp_end) found_interp_end = true;
}
try std.testing.expect(found_interp_start);
try std.testing.expect(found_interp_end);
}
test "tokenize multiple interpolations" {
const allocator = std.testing.allocator;
var lexer = Lexer.init(allocator, "p #{a} and #{b} and #{c}");
defer lexer.deinit();
const tokens = try lexer.tokenize();
var interp_count: usize = 0;
for (tokens) |token| {
if (token.type == .interp_start) interp_count += 1;
}
try std.testing.expectEqual(@as(usize, 3), interp_count);
}
test "tokenize if keyword" {
const allocator = std.testing.allocator;
var lexer = Lexer.init(allocator, "if condition");
defer lexer.deinit();
const tokens = try lexer.tokenize();
try std.testing.expectEqual(TokenType.kw_if, tokens[0].type);
}
test "tokenize each keyword" {
const allocator = std.testing.allocator;
var lexer = Lexer.init(allocator, "each item in items");
defer lexer.deinit();
const tokens = try lexer.tokenize();
try std.testing.expectEqual(TokenType.kw_each, tokens[0].type);
// Rest of line is captured as text for parser to handle
try std.testing.expectEqual(TokenType.text, tokens[1].type);
try std.testing.expectEqualStrings("item in items", tokens[1].value);
}
test "tokenize mixin call" {
const allocator = std.testing.allocator;
var lexer = Lexer.init(allocator, "+button");
defer lexer.deinit();
const tokens = try lexer.tokenize();
try std.testing.expectEqual(TokenType.mixin_call, tokens[0].type);
try std.testing.expectEqualStrings("button", tokens[0].value);
}
test "tokenize comment" {
const allocator = std.testing.allocator;
var lexer = Lexer.init(allocator, "// This is a comment");
defer lexer.deinit();
const tokens = try lexer.tokenize();
try std.testing.expectEqual(TokenType.comment, tokens[0].type);
}
test "tokenize unbuffered comment" {
const allocator = std.testing.allocator;
var lexer = Lexer.init(allocator, "//- Hidden comment");
defer lexer.deinit();
const tokens = try lexer.tokenize();
try std.testing.expectEqual(TokenType.comment_unbuffered, tokens[0].type);
}
test "tokenize object literal in attributes" {
const allocator = std.testing.allocator;
var lexer = Lexer.init(allocator, "div(style={color: 'red', nested: {a: 1}})");
defer lexer.deinit();
const tokens = try lexer.tokenize();
// Find the attr_value token with object literal
var found_object = false;
for (tokens) |token| {
if (token.type == .attr_value and token.value.len > 0 and token.value[0] == '{') {
found_object = true;
break;
}
}
try std.testing.expect(found_object);
}
test "tokenize dot block" {
const allocator = std.testing.allocator;
var lexer = Lexer.init(allocator,
\\script.
\\ if (usingPug)
\\ console.log('hi')
);
defer lexer.deinit();
const tokens = try lexer.tokenize();
var found_dot_block = false;
var text_count: usize = 0;
for (tokens) |token| {
if (token.type == .dot_block) found_dot_block = true;
if (token.type == .text) text_count += 1;
}
try std.testing.expect(found_dot_block);
try std.testing.expectEqual(@as(usize, 2), text_count);
}