Forji/Forji/ForjiTests/HTMLParserTests.swift
2026-03-09 20:27:08 +01:00

336 lines
9.5 KiB
Swift

import Foundation
import Testing
@testable import Forji
/// Tests for how `MermaidParser.parse` splits Markdown containing raw HTML
/// into `.text`, `.html`, and `.mermaid` segments.
///
/// Tests that subscript into the parsed result first gate on the expected
/// count with `#require`, so an unexpected segment count is reported as a
/// test failure instead of trapping on an out-of-bounds index.
@MainActor
struct HTMLParserTests {
    // MARK: - Block-level extraction

    /// A `<details>` block between Markdown lines yields exactly one `.html`
    /// segment that contains both the opening and the closing tag.
    @Test func detailsBlockExtracted() throws {
        let markdown = """
        # Title
        Some text.
        <details>
        <summary>Click me</summary>
        Hidden content here.
        </details>
        More text.
        """
        let segments = MermaidParser.parse(markdown)
        let htmlSegments = segments.filter { if case .html = $0 { true } else { false } }
        // `#require` stops the test on failure; a plain `#expect` would carry
        // on and trap on the subscript below when no segment was produced.
        try #require(htmlSegments.count == 1)
        if case .html(let content) = htmlSegments[0] {
            #expect(content.contains("<details>"))
            #expect(content.contains("</details>"))
        }
    }

    /// A `<table>` block between Markdown lines yields exactly one `.html`
    /// segment that contains both the opening and the closing tag.
    @Test func tableBlockExtracted() throws {
        let markdown = """
        Before table.
        <table>
        <tr><th>Header</th></tr>
        <tr><td>Cell</td></tr>
        </table>
        After table.
        """
        let segments = MermaidParser.parse(markdown)
        let htmlSegments = segments.filter { if case .html = $0 { true } else { false } }
        // Guard the subscript below against an empty result.
        try #require(htmlSegments.count == 1)
        if case .html(let content) = htmlSegments[0] {
            #expect(content.contains("<table>"))
            #expect(content.contains("</table>"))
        }
    }

    /// A `<div>` with attributes yields exactly one `.html` segment.
    @Test func divBlockExtracted() {
        let markdown = """
        Text.
        <div align="center">
        Content
        </div>
        More.
        """
        let segments = MermaidParser.parse(markdown)
        let htmlSegments = segments.filter { if case .html = $0 { true } else { false } }
        #expect(htmlSegments.count == 1)
    }

    /// A `<picture>` block with nested `<source>` and `<img>` yields exactly
    /// one `.html` segment.
    @Test func pictureBlockExtracted() {
        let markdown = """
        Text.
        <picture>
        <source media="(prefers-color-scheme: dark)" srcset="dark.png">
        <img src="light.png">
        </picture>
        More.
        """
        let segments = MermaidParser.parse(markdown)
        let htmlSegments = segments.filter { if case .html = $0 { true } else { false } }
        #expect(htmlSegments.count == 1)
    }

    // MARK: - Inline HTML extraction

    /// A line containing `<kbd>` tags yields exactly one `.html` segment
    /// whose content includes the `<kbd>` tag.
    @Test func kbdInParagraphExtracted() throws {
        let markdown = """
        Some text.
        Press <kbd>Ctrl</kbd>+<kbd>C</kbd> to copy.
        More text.
        """
        let segments = MermaidParser.parse(markdown)
        let htmlSegments = segments.filter { if case .html = $0 { true } else { false } }
        // Guard the subscript below against an empty result.
        try #require(htmlSegments.count == 1)
        if case .html(let content) = htmlSegments[0] {
            #expect(content.contains("<kbd>"))
        }
    }

    /// A line containing `<sub>` and `<sup>` tags yields exactly one `.html`
    /// segment.
    @Test func supSubInParagraphExtracted() {
        let markdown = """
        Normal paragraph.
        Water is H<sub>2</sub>O and E=mc<sup>2</sup>.
        Another normal paragraph.
        """
        let segments = MermaidParser.parse(markdown)
        let htmlSegments = segments.filter { if case .html = $0 { true } else { false } }
        #expect(htmlSegments.count == 1)
    }

    /// A standalone `<img>` line yields exactly one `.html` segment.
    @Test func standaloneImgExtracted() {
        let markdown = """
        Text before.
        <img src="image.png" alt="test" width="200">
        Text after.
        """
        let segments = MermaidParser.parse(markdown)
        let htmlSegments = segments.filter { if case .html = $0 { true } else { false } }
        #expect(htmlSegments.count == 1)
    }

    // MARK: - No false positives

    /// Markdown without any HTML parses to a single `.text` segment.
    @Test func pureMarkdownRemainsText() throws {
        let markdown = """
        # Hello
        This is **bold** and *italic* text.
        - Item 1
        - Item 2
        [A link](https://example.com)
        """
        let segments = MermaidParser.parse(markdown)
        // Stop here on a wrong count so `segments[0]` cannot trap on an
        // empty result.
        try #require(segments.count == 1)
        if case .text = segments[0] {} else {
            Issue.record("Expected .text segment")
        }
    }

    /// Bare `<` and `>` comparison operators do not create an `.html`
    /// segment; the input stays a single `.text` segment.
    @Test func angleBracketsInMathNotExtracted() throws {
        let markdown = "If x < 10 and y > 5 then do something."
        let segments = MermaidParser.parse(markdown)
        // Stop here on a wrong count so `segments[0]` cannot trap on an
        // empty result.
        try #require(segments.count == 1)
        if case .text = segments[0] {} else {
            Issue.record("Expected .text segment for math angle brackets")
        }
    }

    /// HTML tags inside a fenced code block produce no `.html` segments.
    @Test func htmlInsideFencedCodeBlockNotExtracted() {
        let markdown = """
        Some text.
        ```html
        <details>
        <summary>Example</summary>
        This is code.
        </details>
        ```
        More text.
        """
        let segments = MermaidParser.parse(markdown)
        let htmlSegments = segments.filter { if case .html = $0 { true } else { false } }
        #expect(htmlSegments.isEmpty, "HTML inside fenced code blocks should not be extracted")
    }

    /// HTML tags inside inline code spans produce no `.html` segments.
    @Test func htmlInsideInlineCodeNotExtracted() {
        let markdown = "Use `<details>` tag for collapsible sections and `<kbd>` for keyboard keys."
        let segments = MermaidParser.parse(markdown)
        let htmlSegments = segments.filter { if case .html = $0 { true } else { false } }
        #expect(htmlSegments.isEmpty, "HTML inside inline code should not be extracted")
    }

    /// Pipe-syntax Markdown tables produce no `.html` segments.
    @Test func markdownTablesRemainText() {
        let markdown = """
        | Header 1 | Header 2 |
        |----------|----------|
        | Cell 1 | Cell 2 |
        """
        let segments = MermaidParser.parse(markdown)
        let htmlSegments = segments.filter { if case .html = $0 { true } else { false } }
        #expect(htmlSegments.isEmpty, "Markdown tables should remain as .text")
    }

    // MARK: - Mixed content

    /// Input mixing Markdown, a `<details>` block, and a mermaid fence yields
    /// at least one segment of each kind.
    @Test func markdownPlusBlockHTMLPlusMermaid() {
        let markdown = """
        # Title
        Regular paragraph.
        <details>
        <summary>Expand</summary>
        Hidden content.
        </details>
        ```mermaid
        graph TD
        A --> B
        ```
        Final text.
        """
        let segments = MermaidParser.parse(markdown)
        var hasText = false
        var hasHTML = false
        var hasMermaid = false
        // Exhaustive switch (no `default`) so a newly added segment case
        // forces this test to be revisited.
        for segment in segments {
            switch segment {
            case .text: hasText = true
            case .html: hasHTML = true
            case .mermaid: hasMermaid = true
            }
        }
        #expect(hasText, "Should have text segments")
        #expect(hasHTML, "Should have html segments")
        #expect(hasMermaid, "Should have mermaid segments")
    }

    // MARK: - Edge cases

    /// An empty input parses to a single empty `.text` segment.
    @Test func emptyStringProducesTextSegment() {
        let segments = MermaidParser.parse("")
        #expect(segments == [.text("")])
    }

    /// Block tags nested inside one another are extracted as one `.html`
    /// segment rather than one per tag.
    @Test func nestedTagsExtractedAsOneBlock() {
        let markdown = """
        Text.
        <details>
        <summary>Outer</summary>
        <div>
        <table>
        <tr><td>Nested</td></tr>
        </table>
        </div>
        </details>
        More.
        """
        let segments = MermaidParser.parse(markdown)
        let htmlSegments = segments.filter { if case .html = $0 { true } else { false } }
        #expect(htmlSegments.count == 1, "Nested tags should be extracted as a single block")
    }

    /// A self-closing `<img ... />` line yields exactly one `.html` segment.
    @Test func selfClosingTagsExtracted() {
        let markdown = """
        Text before.
        <img src="test.png" />
        Text after.
        """
        let segments = MermaidParser.parse(markdown)
        let htmlSegments = segments.filter { if case .html = $0 { true } else { false } }
        #expect(htmlSegments.count == 1)
    }

    // MARK: - Security: dangerous HTML is segmented as .html (so DOMPurify can sanitize it)

    /// A `<script>` nested in a `<div>` ends up in an `.html` segment.
    @Test func scriptTagSegmentedAsHTML() {
        let markdown = """
        Safe text.
        <div><script>alert("xss")</script></div>
        More safe text.
        """
        let segments = MermaidParser.parse(markdown)
        let htmlSegments = segments.filter { if case .html = $0 { true } else { false } }
        #expect(!htmlSegments.isEmpty, "Script inside block tag should be segmented as .html for sanitization")
    }

    /// An `<img>` carrying an `onerror` handler ends up in an `.html` segment.
    @Test func imgWithOnerrorSegmentedAsHTML() {
        let markdown = """
        Text.
        <img src="x" onerror="alert('xss')">
        More text.
        """
        let segments = MermaidParser.parse(markdown)
        let htmlSegments = segments.filter { if case .html = $0 { true } else { false } }
        #expect(!htmlSegments.isEmpty, "img with onerror should be segmented as .html for sanitization")
    }

    /// A `<div>` carrying an `onclick` handler ends up in an `.html` segment.
    @Test func dangerousInlineAttributesSegmentedAsHTML() {
        let markdown = """
        Normal paragraph.
        <div onclick="alert('xss')">Click me</div>
        Another paragraph.
        """
        let segments = MermaidParser.parse(markdown)
        let htmlSegments = segments.filter { if case .html = $0 { true } else { false } }
        #expect(!htmlSegments.isEmpty, "div with onclick should be segmented as .html for sanitization")
    }

    // MARK: - Segment ordering

    /// Alternating text and HTML blocks come back in source order:
    /// text, html, text, html, text.
    @Test func segmentOrderPreserved() throws {
        let markdown = """
        First paragraph.
        <table>
        <tr><td>Cell</td></tr>
        </table>
        Middle paragraph.
        <details>
        <summary>Details</summary>
        Content.
        </details>
        Last paragraph.
        """
        let segments = MermaidParser.parse(markdown)
        // Indices 0...4 are read below; bail out with a recorded failure
        // instead of trapping if the parser returned fewer segments.
        try #require(segments.count == 5)
        if case .text = segments[0] {} else { Issue.record("Expected .text at 0") }
        if case .html = segments[1] {} else { Issue.record("Expected .html at 1") }
        if case .text = segments[2] {} else { Issue.record("Expected .text at 2") }
        if case .html = segments[3] {} else { Issue.record("Expected .html at 3") }
        if case .text = segments[4] {} else { Issue.record("Expected .text at 4") }
    }
}