Skip to content

Commit 645a7e6

Browse files
committed
Add HTML document parser
This reads an HTML document and produces a `ScopeContext` with all HTML-like features analyzed
1 parent 81a3b5f commit 645a7e6

File tree

4 files changed

+785
-0
lines changed

4 files changed

+785
-0
lines changed
Lines changed: 189 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,189 @@
1+
import dedent from 'dedent'
2+
import { test } from 'vitest'
3+
import { scanHtml } from './scan'
4+
5+
// A `<script>` element nested in markup should produce a child `js` context
// inside the root `html` context.
// NOTE(review): the expected offsets are character positions into `input`, so
// they depend on the exact whitespace inside the template literal below.
test('parses HTML', async ({ expect }) => {
6+
let input = dedent`
7+
<div class="example">
8+
<script>
9+
console.log('Hello, world!')
10+
</script>
11+
</div>
12+
`
13+
14+
// No class attributes are configured, so no `class.attr` scopes are expected
let scope = scanHtml({
15+
input,
16+
offset: 0,
17+
classAttributes: [],
18+
})
19+
20+
// The root spans the whole input; its only child is the script context
expect(scope).toEqual({
21+
kind: 'context',
22+
source: {
23+
scope: [0, 84],
24+
},
25+
meta: {
26+
lang: 'html',
27+
syntax: 'html',
28+
},
29+
children: [
30+
{
31+
kind: 'context',
32+
source: {
33+
scope: [32, 68],
34+
},
35+
meta: {
36+
lang: 'js',
37+
syntax: 'js',
38+
},
39+
children: [],
40+
},
41+
],
42+
})
43+
})
44+
45+
// An HTML comment should be reported as a single `comment` scope on the root
// context.
// NOTE(review): the expected offsets are character positions into `input`, so
// they depend on the exact whitespace inside the template literal below.
test('Identifies HTML comments', async ({ expect }) => {
46+
let input = dedent`
47+
<div class="example">
48+
<!--
49+
<span></span>
50+
-->
51+
</div>
52+
`
53+
54+
// No class attributes are configured, so no `class.attr` scopes are expected
let scope = scanHtml({
55+
input,
56+
offset: 0,
57+
classAttributes: [],
58+
})
59+
60+
// The root spans the whole input; its only child is the comment scope
expect(scope).toEqual({
61+
children: [
62+
{
63+
kind: 'comment',
64+
source: { scope: [24, 52] },
65+
children: [],
66+
},
67+
],
68+
kind: 'context',
69+
meta: {
70+
lang: 'html',
71+
syntax: 'html',
72+
},
73+
source: {
74+
scope: [0, 59],
75+
},
76+
})
77+
})
78+
79+
// Every spelling of a configured class attribute (`class`, `:class`,
// `[class]`, `:[class]`, `className="..."` and `className={...}`) should
// produce a `class.attr` scope. The `:`-prefixed, bracketed and `{...}`
// expression forms are expected to be flagged `static: false`.
// NOTE(review): the expected offsets are character positions into `input`, so
// they depend on the exact whitespace inside the template literal below.
test('Identifies class attributes', async ({ expect }) => {
80+
let input = dedent`
81+
<div class="flex">
82+
<span :class="flex-1"></span>
83+
<span [class]="flex-2"></span>
84+
<span :[class]="flex-3"></span>
85+
<span className="flex-4"></span>
86+
<span
87+
className={clsx(
88+
'flex-5',
89+
{ 'flex-6': true },
90+
{ 'flex-7': false },
91+
)}
92+
></span>
93+
</div>
94+
`
95+
96+
let scope = scanHtml({
97+
input,
98+
offset: 0,
99+
classAttributes: ['class', 'className'],
100+
})
101+
102+
// One `class.attr` child per attribute above, in source order
expect(scope).toEqual({
103+
kind: 'context',
104+
source: {
105+
scope: [0, 275],
106+
},
107+
meta: {
108+
lang: 'html',
109+
syntax: 'html',
110+
},
111+
children: [
112+
{
113+
kind: 'class.attr',
114+
meta: { static: true },
115+
source: { scope: [12, 16] },
116+
children: [],
117+
},
118+
{
119+
kind: 'class.attr',
120+
meta: { static: false },
121+
source: { scope: [35, 41] },
122+
children: [],
123+
},
124+
{
125+
kind: 'class.attr',
126+
meta: { static: false },
127+
source: { scope: [68, 74] },
128+
children: [],
129+
},
130+
{
131+
kind: 'class.attr',
132+
meta: { static: false },
133+
source: { scope: [102, 108] },
134+
children: [],
135+
},
136+
{
137+
kind: 'class.attr',
138+
meta: { static: true },
139+
source: { scope: [137, 143] },
140+
children: [],
141+
},
142+
{
143+
kind: 'class.attr',
144+
meta: { static: false },
145+
source: { scope: [176, 256] },
146+
children: [],
147+
},
148+
],
149+
})
150+
})
151+
152+
// Tag-like text inside a quoted attribute value must not be treated as an
// element: the `<script></script>` in the `class` value should not create a
// `js` context, only a `class.attr` scope.
// NOTE(review): the expected offsets are character positions into `input`, so
// they depend on the exact whitespace inside the template literal below.
test('quotes ignore element detection', async ({ expect }) => {
153+
let input = dedent`
154+
<div class="flex">
155+
<span class="<script></script>"></span>
156+
</div>
157+
`
158+
159+
let scope = scanHtml({
160+
input,
161+
offset: 0,
162+
classAttributes: ['class', 'className'],
163+
})
164+
165+
// Only the two `class.attr` scopes are expected as children
expect(scope).toEqual({
166+
kind: 'context',
167+
source: {
168+
scope: [0, 67],
169+
},
170+
meta: {
171+
lang: 'html',
172+
syntax: 'html',
173+
},
174+
children: [
175+
{
176+
kind: 'class.attr',
177+
meta: { static: true },
178+
source: { scope: [12, 16] },
179+
children: [],
180+
},
181+
{
182+
kind: 'class.attr',
183+
meta: { static: true },
184+
source: { scope: [34, 51] },
185+
children: [],
186+
},
187+
],
188+
})
189+
})
Lines changed: 149 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,149 @@
1+
import { ScopeClassAttribute, ScopeComment, ScopeContext } from '../scope'
2+
import { createHtmlStream, StreamOptions } from './stream'
3+
4+
/**
 * Creates an empty `context` scope tagged as HTML.
 *
 * The scope starts out zero-width at `start` (both ends of its source range
 * are `start`) and has no children; callers adjust the range afterwards.
 */
function newContext(start: number): ScopeContext {
  let scope: ScopeContext = {
    kind: 'context',
    source: { scope: [start, start] },
    meta: { syntax: 'html', lang: 'html' },
    children: [],
  }

  return scope
}
17+
18+
/**
 * Creates an empty `comment` scope.
 *
 * The scope starts out zero-width at `start` (both ends of its source range
 * are `start`) and has no children; callers set the end once it is known.
 */
function newComment(start: number): ScopeComment {
  let scope: ScopeComment = {
    kind: 'comment',
    source: { scope: [start, start] },
    children: [],
  }

  return scope
}
27+
28+
/**
 * Creates a `class.attr` scope covering `[start, end]`.
 *
 * The scope is marked `static: true` by default and has no children; callers
 * flip `meta.static` when the attribute turns out to be dynamic.
 */
function newClassAttr(start: number, end: number): ScopeClassAttribute {
  let scope: ScopeClassAttribute = {
    kind: 'class.attr',
    meta: { static: true },
    source: { scope: [start, end] },
    children: [],
  }

  return scope
}
40+
41+
/** States tracked by the event loop in `scanHtml`. */
const enum State {
42+
// Initial state; also restored after a comment ends, after a `<script>` /
// `<style>` open tag ends, and after a matching `</script>` / `</style>`
Idle,
43+
// Entered on `comment-start`; left on the next `comment-end`
InComment,
44+
// Entered when a `<script` / `<style` tag starts; left on the next
// `element-end`, which marks where the tag's content begins
WaitForTagOpen,
45+
// NOTE(review): not referenced anywhere in the visible code
WaitForTagClose,
46+
}
47+
48+
/**
 * Options accepted by `scanHtml`: everything in `StreamOptions` plus the list
 * of attribute names that should be reported as class attributes.
 *
 * `scanHtml` interpolates each name into a case-insensitive regular expression
 * and also matches the `[name]`, `:name` and `:[name]` spellings.
 */
interface ScanOptions extends StreamOptions {
49+
/** A list of attributes which will get `class.attr` scopes */
50+
classAttributes: string[]
51+
}
52+
53+
/**
 * Scans an HTML-like document and produces the root `ScopeContext` for it.
 *
 * The root context spans `[0, input.length]`. While consuming the events from
 * `createHtmlStream`, the following children are appended to the root:
 *
 * - a nested `context` for the content of each `<script>` (`js`) and `<style>`
 *   (`css`) element, running from the end of the open tag to the start of the
 *   matching close tag. A `lang` or `type` attribute on the open tag replaces
 *   the context's `meta.lang`.
 * - a `comment` scope for each comment, from `comment-start` to `comment-end`
 * - a `class.attr` scope for the value of every attribute matching
 *   `classAttributes`. Values given as expressions, and attributes written as
 *   `:name` or `[name]`, are marked `static: false`.
 *
 * `input` is the document text, `offset` is forwarded unchanged to
 * `createHtmlStream`, and `classAttributes` lists the attribute names that get
 * `class.attr` scopes.
 *
 * @returns The root context for the document
 */
export function scanHtml({ input, offset, classAttributes }: ScanOptions): ScopeContext {
  // Compile a regex to match class attributes in the form of:
  // - class
  // - [class]
  // - :class
  // - :[class]
  //
  // Names are interpolated as-is, so regex syntax inside a name is treated as
  // a pattern rather than matched literally.
  let patternAttrs = classAttributes.flatMap((x) => [x, `\\[${x}\\]`]).flatMap((x) => [x, `:${x}`])
  let isClassAttr = new RegExp(`^(${patternAttrs.join('|')})$`, 'i')

  let root = newContext(0)
  root.source.scope[1] = input.length

  let state = State.Idle

  // `context` is the `<script>` / `<style>` context currently being built.
  // Outside of those elements it is a detached placeholder, never the root.
  let context: ScopeContext = newContext(0)
  let comment: ScopeComment = newComment(0)
  let currentTag = ''
  let currentAttr = ''

  for (let event of createHtmlStream({ input, offset })) {
    // Element attributes
    if (event.kind === 'attr-name') {
      currentAttr = input.slice(event.span[0], event.span[1])
    }

    // Attribute values
    else if (event.kind === 'attr-value' || event.kind === 'attr-expr') {
      if (currentAttr === 'lang' || currentAttr === 'type') {
        // Only attributes on a pending `<script>` / `<style>` open tag may
        // change the language. `lang` / `type` on any other element
        // (e.g. `<input type="text">`) must not touch the root's metadata.
        if (state === State.WaitForTagOpen) {
          context.meta.lang = input.slice(event.span[0], event.span[1])
        }
        continue
      }

      // With no configured attributes the pattern is `^()$`, which would match
      // an empty attribute name, hence the length check
      if (classAttributes.length && isClassAttr.test(currentAttr)) {
        let scope = newClassAttr(event.span[0], event.span[1])

        let isExpression = event.kind === 'attr-expr'
        let isColonBound = currentAttr[0] === ':'
        let isBracketBound = currentAttr[0] === '[' && currentAttr[currentAttr.length - 1] === ']'

        if (isExpression || isColonBound || isBracketBound) {
          scope.meta.static = false
        }

        root.children.push(scope)
      }
    }

    // Comments
    else if (event.kind === 'comment-start') {
      comment = newComment(event.span[0])
      state = State.InComment
    } else if (event.kind === 'comment-end') {
      if (state === State.InComment) {
        comment.source.scope[1] = event.span[1]
        root.children.push(comment)
        state = State.Idle
      }
    }

    // Elements
    else if (event.kind === 'element-start') {
      let tag = input.slice(event.span[0], event.span[1])
      if (tag === '<script') {
        currentTag = tag
        context = newContext(event.span[0])
        context.meta.lang = 'js'
        context.meta.syntax = 'js'
        state = State.WaitForTagOpen
      } else if (tag === '<style') {
        currentTag = tag
        context = newContext(event.span[0])
        context.meta.lang = 'css'
        context.meta.syntax = 'css'
        state = State.WaitForTagOpen
      } else if (tag === '</script') {
        if (currentTag !== '<script') continue
        context.source.scope[1] = event.span[0]
        root.children.push(context)

        // Forget the open tag so a stray second close tag is ignored instead
        // of closing (and re-adding) a context that was already emitted
        currentTag = ''
        context = newContext(event.span[0])
        state = State.Idle
      } else if (tag === '</style') {
        if (currentTag !== '<style') continue
        context.source.scope[1] = event.span[0]
        root.children.push(context)

        // Forget the open tag so a stray second close tag is ignored instead
        // of closing (and re-adding) a context that was already emitted
        currentTag = ''
        context = newContext(event.span[0])
        state = State.Idle
      }
    } else if (event.kind === 'element-end') {
      if (state === State.WaitForTagOpen) {
        // The embedded content begins right after the open tag ends
        context.source.scope[0] = event.span[1]
        state = State.Idle
      }
    }
  }

  return root
}

0 commit comments

Comments
 (0)