diff --git a/build.zig.zon b/build.zig.zon index c5d5b80..1e55c44 100644 --- a/build.zig.zon +++ b/build.zig.zon @@ -1,6 +1,6 @@ .{ .name = .pugz, - .version = "0.3.6", + .version = "0.3.7", .fingerprint = 0x822db0790e17621d, // Changing this has security and trust implications. .minimum_zig_version = "0.15.2", .dependencies = .{}, diff --git a/docs/CLAUDE.md b/docs/CLAUDE.md index 6a7c06f..e029853 100644 --- a/docs/CLAUDE.md +++ b/docs/CLAUDE.md @@ -44,13 +44,28 @@ Source → Lexer → Tokens → StripComments → Parser → AST → Linker → **codegen.zig**, **template.zig**, and **zig_codegen.zig** all consume the AST from the parser. When fixing bugs related to AST structure (like attribute handling, class merging, etc.), prefer fixing in **parser.zig** so all three rendering paths benefit from the fix automatically. Only fix in the individual codegen modules if the behavior should differ between rendering modes. +### Shared Utilities in runtime.zig + +The `runtime.zig` module is the single source of truth for shared utilities used across all rendering modes: + +- **`isHtmlEntity(str)`** - Checks if string starts with valid HTML entity (`&name;`, `&#digits;`, `&#xhex;`) +- **`appendTextEscaped(allocator, output, str)`** - Escapes text content (`<`, `>`, `&`) preserving existing entities +- **`isXhtmlDoctype(val)`** - Checks if doctype is XHTML (xml, strict, transitional, frameset, 1.1, basic, mobile) +- **`escapeChar(c)`** - O(1) lookup table for HTML character escaping +- **`appendEscaped(allocator, output, str)`** - Escapes all HTML special chars including quotes +- **`doctypes`** - StaticStringMap of doctype names to DOCTYPE strings +- **`whitespace_sensitive_tags`** - Tags where whitespace matters (pre, textarea, script, style, code) + +The `codegen.zig` module provides: +- **`void_elements`** - StaticStringMap of HTML5 void/self-closing elements (br, img, input, etc.) 
+ ### Core Modules | Module | File | Purpose | |--------|------|---------| | **Lexer** | `src/lexer.zig` | Tokenizes Pug source into tokens | | **Parser** | `src/parser.zig` | Builds AST from tokens | -| **Runtime** | `src/runtime.zig` | Shared utilities (HTML escaping, etc.) | +| **Runtime** | `src/runtime.zig` | Shared utilities (HTML escaping, entity detection, doctype helpers) | | **Error** | `src/error.zig` | Error formatting with source context | | **Walk** | `src/walk.zig` | AST traversal with visitor pattern | | **Strip Comments** | `src/strip_comments.zig` | Token filtering for comments | diff --git a/src/codegen.zig b/src/codegen.zig index 194f277..e718066 100644 --- a/src/codegen.zig +++ b/src/codegen.zig @@ -18,6 +18,8 @@ const runtime = @import("runtime.zig"); pub const escapeChar = runtime.escapeChar; pub const doctypes = runtime.doctypes; pub const whitespace_sensitive_tags = runtime.whitespace_sensitive_tags; +pub const isHtmlEntity = runtime.isHtmlEntity; +pub const isXhtmlDoctype = runtime.isXhtmlDoctype; // Import error types const pug_error = @import("error.zig"); @@ -157,6 +159,7 @@ pub const Compiler = struct { fn writeTextEscaped(self: *Compiler, str: []const u8) CompilerError!void { // For text content - escapes < > & (NOT quotes) // Preserves existing HTML entities like ’ or & + // Uses shared isHtmlEntity from runtime.zig var i: usize = 0; while (i < str.len) { const c = str[i]; @@ -165,7 +168,7 @@ pub const Compiler = struct { '>' => try self.write(">"), '&' => { // Check if this is already an HTML entity - if (isHtmlEntity(str[i..])) { + if (runtime.isHtmlEntity(str[i..])) { // Pass through the entity as-is try self.writeChar(c); } else { @@ -178,66 +181,6 @@ pub const Compiler = struct { } } - fn isHtmlEntity(str: []const u8) bool { - // Check if str starts with a valid HTML entity: &name; or &#digits; or &#xhex; - if (str.len < 3 or str[0] != '&') return false; - - var i: usize = 1; - - // Numeric entity: &#digits; or &#xhex; - if 
(str[i] == '#') { - i += 1; - if (i >= str.len) return false; - - // Hex entity: &#x...; - if (str[i] == 'x' or str[i] == 'X') { - i += 1; - if (i >= str.len) return false; - // Need at least one hex digit - var has_hex = false; - while (i < str.len and i < 10) : (i += 1) { - const ch = str[i]; - if (ch == ';') return has_hex; - if ((ch >= '0' and ch <= '9') or - (ch >= 'a' and ch <= 'f') or - (ch >= 'A' and ch <= 'F')) - { - has_hex = true; - } else { - return false; - } - } - return false; - } - - // Decimal entity: &#digits; - var has_digit = false; - while (i < str.len and i < 10) : (i += 1) { - const ch = str[i]; - if (ch == ';') return has_digit; - if (ch >= '0' and ch <= '9') { - has_digit = true; - } else { - return false; - } - } - return false; - } - - // Named entity: &name; - var has_alpha = false; - while (i < str.len and i < 32) : (i += 1) { - const ch = str[i]; - if (ch == ';') return has_alpha; - if ((ch >= 'a' and ch <= 'z') or (ch >= 'A' and ch <= 'Z') or (ch >= '0' and ch <= '9')) { - has_alpha = true; - } else { - return false; - } - } - return false; - } - fn prettyIndent(self: *Compiler) CompilerError!void { if (self.options.pretty and !self.escape_pretty) { try self.writeChar('\n'); diff --git a/src/runtime.zig b/src/runtime.zig index 77c9f7b..a2d001d 100644 --- a/src/runtime.zig +++ b/src/runtime.zig @@ -282,6 +282,107 @@ pub inline fn escapeChar(c: u8) ?[]const u8 { return escape_table[c]; } +// ============================================================================ +// HTML Entity Detection and Text Escaping - shared across codegen modules +// ============================================================================ + +/// Check if string starts with a valid HTML entity: &name; or &#digits; or &#xhex; +/// Used to preserve existing entities during text escaping. +/// Shared across codegen.zig and template.zig. 
+pub fn isHtmlEntity(str: []const u8) bool {
+    if (str.len < 3 or str[0] != '&') return false;
+
+    var i: usize = 1;
+
+    // Numeric reference: "&#digits;" or "&#xhex;" (str.len >= 3, so str[1] is in bounds)
+    if (str[i] == '#') {
+        i += 1;
+        if (i >= str.len) return false;
+
+        // Hex form "&#x...;": needs at least one hex digit; ';' must appear before index 10
+        if (str[i] == 'x' or str[i] == 'X') {
+            i += 1;
+            if (i >= str.len) return false;
+            var has_hex = false;
+            while (i < str.len and i < 10) : (i += 1) {
+                const ch = str[i];
+                if (ch == ';') return has_hex;
+                if ((ch >= '0' and ch <= '9') or
+                    (ch >= 'a' and ch <= 'f') or
+                    (ch >= 'A' and ch <= 'F'))
+                {
+                    has_hex = true;
+                } else {
+                    return false;
+                }
+            }
+            return false;
+        }
+
+        // Decimal form "&#digits;": needs at least one digit; ';' must appear before index 10
+        var has_digit = false;
+        while (i < str.len and i < 10) : (i += 1) {
+            const ch = str[i];
+            if (ch == ';') return has_digit;
+            if (ch >= '0' and ch <= '9') {
+                has_digit = true;
+            } else {
+                return false;
+            }
+        }
+        return false;
+    }
+
+    // Named reference "&name;": the longest HTML name has 31 chars, so ';' may sit at index 32
+    var has_alpha = false;
+    while (i < str.len and i <= 32) : (i += 1) {
+        const ch = str[i];
+        if (ch == ';') return has_alpha;
+        if ((ch >= 'a' and ch <= 'z') or (ch >= 'A' and ch <= 'Z') or (ch >= '0' and ch <= '9')) {
+            has_alpha = true;
+        } else {
+            return false;
+        }
+    }
+    return false;
+}
+
+/// Append `str` to `output` escaped for HTML text content: `<` -> `&lt;`, `>` -> `&gt;`, `&` -> `&amp;`.
+/// Quotes are left alone; an `&` that already starts an entity (e.g. `&#8217;` or `&amp;`) is kept as-is.
+/// Shared across codegen.zig and template.zig. Fails only with the allocator's error.
+pub fn appendTextEscaped(allocator: Allocator, output: *ArrayListUnmanaged(u8), str: []const u8) Allocator.Error!void {
+    var i: usize = 0;
+    while (i < str.len) {
+        const c = str[i];
+        switch (c) {
+            '<' => try output.appendSlice(allocator, "&lt;"),
+            '>' => try output.appendSlice(allocator, "&gt;"),
+            '&' => {
+                if (isHtmlEntity(str[i..])) {
+                    try output.append(allocator, c);
+                } else {
+                    try output.appendSlice(allocator, "&amp;");
+                }
+            },
+            else => try output.append(allocator, c),
+        }
+        i += 1;
+    }
+}
+
+/// Check if a doctype value corresponds to XHTML (non-terse mode).
+/// Returns true for XHTML doctypes, false for HTML5. +/// Shared across codegen.zig, template.zig, and zig_codegen.zig. +pub fn isXhtmlDoctype(val: []const u8) bool { + return mem.eql(u8, val, "xml") or + mem.eql(u8, val, "strict") or + mem.eql(u8, val, "transitional") or + mem.eql(u8, val, "frameset") or + mem.eql(u8, val, "1.1") or + mem.eql(u8, val, "basic") or + mem.eql(u8, val, "mobile"); +} + /// Attribute entry for attrs function pub const AttrEntry = struct { key: []const u8, diff --git a/src/template.zig b/src/template.zig index 7706a14..bfc0a83 100644 --- a/src/template.zig +++ b/src/template.zig @@ -10,6 +10,7 @@ const pug = @import("pug.zig"); const parser = @import("parser.zig"); const Node = parser.Node; const runtime = @import("runtime.zig"); +const codegen = @import("codegen.zig"); const mixin_mod = @import("mixin.zig"); pub const MixinRegistry = mixin_mod.MixinRegistry; @@ -194,14 +195,7 @@ fn detectDoctype(node: *Node, ctx: *RenderContext) void { if (node.type == .Doctype) { if (node.val) |val| { // XHTML doctypes use non-terse mode - if (std.mem.eql(u8, val, "xml") or - std.mem.eql(u8, val, "strict") or - std.mem.eql(u8, val, "transitional") or - std.mem.eql(u8, val, "frameset") or - std.mem.eql(u8, val, "1.1") or - std.mem.eql(u8, val, "basic") or - std.mem.eql(u8, val, "mobile")) - { + if (runtime.isXhtmlDoctype(val)) { ctx.terse = false; } } @@ -826,7 +820,7 @@ fn processInterpolation(allocator: Allocator, output: *std.ArrayListUnmanaged(u8 '<' => try output.appendSlice(allocator, "<"), '>' => try output.appendSlice(allocator, ">"), '&' => { - if (isHtmlEntity(text[i..])) { + if (runtime.isHtmlEntity(text[i..])) { try output.append(allocator, c); } else { try output.appendSlice(allocator, "&"); @@ -872,92 +866,15 @@ fn getFieldValue(data: anytype, name: []const u8) ?[]const u8 { /// Escape for text content - escapes < > & (NOT quotes) /// Preserves existing HTML entities like ’ +/// Uses shared appendTextEscaped from runtime.zig. 
fn appendTextEscaped(allocator: Allocator, output: *std.ArrayListUnmanaged(u8), str: []const u8) Allocator.Error!void { - var i: usize = 0; - while (i < str.len) { - const c = str[i]; - switch (c) { - '<' => try output.appendSlice(allocator, "<"), - '>' => try output.appendSlice(allocator, ">"), - '&' => { - if (isHtmlEntity(str[i..])) { - try output.append(allocator, c); - } else { - try output.appendSlice(allocator, "&"); - } - }, - else => try output.append(allocator, c), - } - i += 1; - } -} - -/// Check if string starts with a valid HTML entity -fn isHtmlEntity(str: []const u8) bool { - if (str.len < 3 or str[0] != '&') return false; - - var i: usize = 1; - - // Numeric entity: &#digits; or &#xhex; - if (str[i] == '#') { - i += 1; - if (i >= str.len) return false; - - if (str[i] == 'x' or str[i] == 'X') { - i += 1; - if (i >= str.len) return false; - var has_hex = false; - while (i < str.len and i < 10) : (i += 1) { - const ch = str[i]; - if (ch == ';') return has_hex; - if ((ch >= '0' and ch <= '9') or - (ch >= 'a' and ch <= 'f') or - (ch >= 'A' and ch <= 'F')) - { - has_hex = true; - } else { - return false; - } - } - return false; - } - - var has_digit = false; - while (i < str.len and i < 10) : (i += 1) { - const ch = str[i]; - if (ch == ';') return has_digit; - if (ch >= '0' and ch <= '9') { - has_digit = true; - } else { - return false; - } - } - return false; - } - - // Named entity: &name; - var has_alpha = false; - while (i < str.len and i < 32) : (i += 1) { - const ch = str[i]; - if (ch == ';') return has_alpha; - if ((ch >= 'a' and ch <= 'z') or (ch >= 'A' and ch <= 'Z') or (ch >= '0' and ch <= '9')) { - has_alpha = true; - } else { - return false; - } - } - return false; + try runtime.appendTextEscaped(allocator, output, str); } +/// Check if tag is a void (self-closing) HTML element. +/// Uses shared void_elements from codegen.zig. 
fn isSelfClosing(name: []const u8) bool { - const self_closing_tags = [_][]const u8{ - "area", "base", "br", "col", "embed", "hr", "img", "input", - "link", "meta", "param", "source", "track", "wbr", - }; - for (self_closing_tags) |tag| { - if (std.mem.eql(u8, name, tag)) return true; - } - return false; + return codegen.void_elements.has(name); } // ============================================================================ diff --git a/src/tpl_compiler/zig_codegen.zig b/src/tpl_compiler/zig_codegen.zig index d2ab183..27d22be 100644 --- a/src/tpl_compiler/zig_codegen.zig +++ b/src/tpl_compiler/zig_codegen.zig @@ -603,14 +603,8 @@ pub const Codegen = struct { if (node.type == .Doctype) { if (node.val) |val| { // XHTML doctypes use non-terse mode - if (std.mem.eql(u8, val, "xml") or - std.mem.eql(u8, val, "strict") or - std.mem.eql(u8, val, "transitional") or - std.mem.eql(u8, val, "frameset") or - std.mem.eql(u8, val, "1.1") or - std.mem.eql(u8, val, "basic") or - std.mem.eql(u8, val, "mobile")) - { + // Uses shared isXhtmlDoctype from runtime.zig + if (runtime.isXhtmlDoctype(val)) { self.terse = false; } }