// Mirror of https://codeberg.org/secana/Forji.git
// Synced 2026-06-17 13:53:53 -07:00
// 336 lines · 9.5 KiB · Swift
import Foundation
|
|
import Testing
|
|
@testable import Forji
|
|
|
|
/// Tests for how `MermaidParser.parse` splits markdown into `.text`, `.html`
/// and `.mermaid` segments when the input contains raw HTML.
///
/// Count checks that guard a later subscript use `#require` instead of
/// `#expect`: `#expect` records a failure but keeps running, so an unexpected
/// count would otherwise trap on the subscript and crash the test process
/// rather than report a failure.
@MainActor
struct HTMLParserTests {

    // MARK: - Block-level extraction

    /// A `<details>` block between paragraphs yields exactly one `.html`
    /// segment containing both the opening and closing tag.
    @Test func detailsBlockExtracted() throws {
        let markdown = """
        # Title

        Some text.

        <details>
        <summary>Click me</summary>
        Hidden content here.
        </details>

        More text.
        """
        let segments = MermaidParser.parse(markdown)
        let htmlSegments = segments.filter { if case .html = $0 { true } else { false } }
        // Stop here on a wrong count so the subscript below cannot trap.
        try #require(htmlSegments.count == 1)
        if case .html(let content) = htmlSegments[0] {
            #expect(content.contains("<details>"))
            #expect(content.contains("</details>"))
        }
    }

    /// A raw `<table>` block yields exactly one `.html` segment containing
    /// both the opening and closing tag.
    @Test func tableBlockExtracted() throws {
        let markdown = """
        Before table.

        <table>
        <tr><th>Header</th></tr>
        <tr><td>Cell</td></tr>
        </table>

        After table.
        """
        let segments = MermaidParser.parse(markdown)
        let htmlSegments = segments.filter { if case .html = $0 { true } else { false } }
        // Stop here on a wrong count so the subscript below cannot trap.
        try #require(htmlSegments.count == 1)
        if case .html(let content) = htmlSegments[0] {
            #expect(content.contains("<table>"))
            #expect(content.contains("</table>"))
        }
    }

    /// A `<div>` carrying an attribute yields exactly one `.html` segment.
    @Test func divBlockExtracted() {
        let markdown = """
        Text.

        <div align="center">
        Content
        </div>

        More.
        """
        let segments = MermaidParser.parse(markdown)
        let htmlSegments = segments.filter { if case .html = $0 { true } else { false } }
        #expect(htmlSegments.count == 1)
    }

    /// A `<picture>` block with nested `<source>` / `<img>` yields exactly
    /// one `.html` segment.
    @Test func pictureBlockExtracted() {
        let markdown = """
        Text.

        <picture>
        <source media="(prefers-color-scheme: dark)" srcset="dark.png">
        <img src="light.png">
        </picture>

        More.
        """
        let segments = MermaidParser.parse(markdown)
        let htmlSegments = segments.filter { if case .html = $0 { true } else { false } }
        #expect(htmlSegments.count == 1)
    }

    // MARK: - Inline HTML extraction

    /// A paragraph containing `<kbd>` tags yields exactly one `.html`
    /// segment, and that segment still contains the `<kbd>` tag.
    @Test func kbdInParagraphExtracted() throws {
        let markdown = """
        Some text.

        Press <kbd>Ctrl</kbd>+<kbd>C</kbd> to copy.

        More text.
        """
        let segments = MermaidParser.parse(markdown)
        let htmlSegments = segments.filter { if case .html = $0 { true } else { false } }
        // Stop here on a wrong count so the subscript below cannot trap.
        try #require(htmlSegments.count == 1)
        if case .html(let content) = htmlSegments[0] {
            #expect(content.contains("<kbd>"))
        }
    }

    /// A paragraph containing `<sub>` / `<sup>` yields exactly one `.html`
    /// segment.
    @Test func supSubInParagraphExtracted() {
        let markdown = """
        Normal paragraph.

        Water is H<sub>2</sub>O and E=mc<sup>2</sup>.

        Another normal paragraph.
        """
        let segments = MermaidParser.parse(markdown)
        let htmlSegments = segments.filter { if case .html = $0 { true } else { false } }
        #expect(htmlSegments.count == 1)
    }

    /// A standalone `<img>` line yields exactly one `.html` segment.
    @Test func standaloneImgExtracted() {
        let markdown = """
        Text before.

        <img src="image.png" alt="test" width="200">

        Text after.
        """
        let segments = MermaidParser.parse(markdown)
        let htmlSegments = segments.filter { if case .html = $0 { true } else { false } }
        #expect(htmlSegments.count == 1)
    }

    // MARK: - No false positives

    /// Markdown without any HTML stays a single `.text` segment.
    @Test func pureMarkdownRemainsText() throws {
        let markdown = """
        # Hello

        This is **bold** and *italic* text.

        - Item 1
        - Item 2

        [A link](https://example.com)
        """
        let segments = MermaidParser.parse(markdown)
        // Stop here on a wrong count so the subscript below cannot trap.
        try #require(segments.count == 1)
        if case .text = segments[0] {} else {
            Issue.record("Expected .text segment")
        }
    }

    /// Bare `<` and `>` used as comparison operators are not treated as HTML.
    @Test func angleBracketsInMathNotExtracted() throws {
        let markdown = "If x < 10 and y > 5 then do something."
        let segments = MermaidParser.parse(markdown)
        // Stop here on a wrong count so the subscript below cannot trap.
        try #require(segments.count == 1)
        if case .text = segments[0] {} else {
            Issue.record("Expected .text segment for math angle brackets")
        }
    }

    /// HTML that appears inside a fenced code block produces no `.html`
    /// segment.
    @Test func htmlInsideFencedCodeBlockNotExtracted() {
        let markdown = """
        Some text.

        ```html
        <details>
        <summary>Example</summary>
        This is code.
        </details>
        ```

        More text.
        """
        let segments = MermaidParser.parse(markdown)
        let htmlSegments = segments.filter { if case .html = $0 { true } else { false } }
        #expect(htmlSegments.isEmpty, "HTML inside fenced code blocks should not be extracted")
    }

    /// HTML tags wrapped in inline code spans produce no `.html` segment.
    @Test func htmlInsideInlineCodeNotExtracted() {
        let markdown = "Use `<details>` tag for collapsible sections and `<kbd>` for keyboard keys."
        let segments = MermaidParser.parse(markdown)
        let htmlSegments = segments.filter { if case .html = $0 { true } else { false } }
        #expect(htmlSegments.isEmpty, "HTML inside inline code should not be extracted")
    }

    /// Pipe-syntax markdown tables produce no `.html` segment.
    @Test func markdownTablesRemainText() {
        let markdown = """
        | Header 1 | Header 2 |
        |----------|----------|
        | Cell 1 | Cell 2 |
        """
        let segments = MermaidParser.parse(markdown)
        let htmlSegments = segments.filter { if case .html = $0 { true } else { false } }
        #expect(htmlSegments.isEmpty, "Markdown tables should remain as .text")
    }

    // MARK: - Mixed content

    /// A document mixing markdown, block HTML and a mermaid fence produces at
    /// least one segment of each kind.
    @Test func markdownPlusBlockHTMLPlusMermaid() {
        let markdown = """
        # Title

        Regular paragraph.

        <details>
        <summary>Expand</summary>
        Hidden content.
        </details>

        ```mermaid
        graph TD
        A --> B
        ```

        Final text.
        """
        let segments = MermaidParser.parse(markdown)
        var hasText = false
        var hasHTML = false
        var hasMermaid = false
        // Exhaustive switch (no `default`) so a new segment case fails to compile here.
        for segment in segments {
            switch segment {
            case .text: hasText = true
            case .html: hasHTML = true
            case .mermaid: hasMermaid = true
            }
        }
        #expect(hasText, "Should have text segments")
        #expect(hasHTML, "Should have html segments")
        #expect(hasMermaid, "Should have mermaid segments")
    }

    // MARK: - Edge cases

    /// Parsing the empty string yields a single empty `.text` segment.
    @Test func emptyStringProducesTextSegment() {
        let segments = MermaidParser.parse("")
        #expect(segments == [.text("")])
    }

    /// Nested block tags are reported as one `.html` segment, not one per tag.
    @Test func nestedTagsExtractedAsOneBlock() {
        let markdown = """
        Text.

        <details>
        <summary>Outer</summary>
        <div>
        <table>
        <tr><td>Nested</td></tr>
        </table>
        </div>
        </details>

        More.
        """
        let segments = MermaidParser.parse(markdown)
        let htmlSegments = segments.filter { if case .html = $0 { true } else { false } }
        #expect(htmlSegments.count == 1, "Nested tags should be extracted as a single block")
    }

    /// A self-closing `<img ... />` yields exactly one `.html` segment.
    @Test func selfClosingTagsExtracted() {
        let markdown = """
        Text before.

        <img src="test.png" />

        Text after.
        """
        let segments = MermaidParser.parse(markdown)
        let htmlSegments = segments.filter { if case .html = $0 { true } else { false } }
        #expect(htmlSegments.count == 1)
    }

    // MARK: - Security: dangerous HTML is segmented as .html (so DOMPurify can sanitize it)

    /// A `<script>` nested inside a block tag produces at least one `.html`
    /// segment.
    @Test func scriptTagSegmentedAsHTML() {
        let markdown = """
        Safe text.

        <div><script>alert("xss")</script></div>

        More safe text.
        """
        let segments = MermaidParser.parse(markdown)
        let htmlSegments = segments.filter { if case .html = $0 { true } else { false } }
        #expect(!htmlSegments.isEmpty, "Script inside block tag should be segmented as .html for sanitization")
    }

    /// An `<img>` with an `onerror` handler produces at least one `.html`
    /// segment.
    @Test func imgWithOnerrorSegmentedAsHTML() {
        let markdown = """
        Text.

        <img src="x" onerror="alert('xss')">

        More text.
        """
        let segments = MermaidParser.parse(markdown)
        let htmlSegments = segments.filter { if case .html = $0 { true } else { false } }
        #expect(!htmlSegments.isEmpty, "img with onerror should be segmented as .html for sanitization")
    }

    /// A `<div>` with an inline `onclick` handler produces at least one
    /// `.html` segment.
    @Test func dangerousInlineAttributesSegmentedAsHTML() {
        let markdown = """
        Normal paragraph.

        <div onclick="alert('xss')">Click me</div>

        Another paragraph.
        """
        let segments = MermaidParser.parse(markdown)
        let htmlSegments = segments.filter { if case .html = $0 { true } else { false } }
        #expect(!htmlSegments.isEmpty, "div with onclick should be segmented as .html for sanitization")
    }

    // MARK: - Segment ordering

    /// Segments come back in document order: text, html, text, html, text.
    @Test func segmentOrderPreserved() throws {
        let markdown = """
        First paragraph.

        <table>
        <tr><td>Cell</td></tr>
        </table>

        Middle paragraph.

        <details>
        <summary>Details</summary>
        Content.
        </details>

        Last paragraph.
        """
        let segments = MermaidParser.parse(markdown)
        // Stop here on a wrong count so indices 0...4 below cannot trap.
        try #require(segments.count == 5)
        if case .text = segments[0] {} else { Issue.record("Expected .text at 0") }
        if case .html = segments[1] {} else { Issue.record("Expected .html at 1") }
        if case .text = segments[2] {} else { Issue.record("Expected .text at 2") }
        if case .html = segments[3] {} else { Issue.record("Expected .html at 3") }
        if case .text = segments[4] {} else { Issue.record("Expected .text at 4") }
    }
}
|