parse.spec.ts 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452
  1. import { describe, it, expect } from 'vitest';
  2. import type { Document, Element } from 'domhandler';
  3. import { getParse } from './parse.js';
  4. import { parseDocument as parseWithHtmlparser2 } from 'htmlparser2';
  5. import { parseWithParse5 } from './parsers/parse5-adapter.js';
  /** Options shared by every test in this file: `_useHtmlParser2` is off. */
  const defaultOpts = { _useHtmlParser2: false };

  /**
   * Parser under test, built by handing `getParse` a callback that picks the
   * backend: htmlparser2 when `options._useHtmlParser2` is truthy, otherwise
   * the parse5 adapter (which additionally receives `isDocument` and `context`).
   */
  const parse = getParse((content, options, isDocument, context) => {
    if (options._useHtmlParser2) {
      return parseWithHtmlparser2(content, options);
    }
    return parseWithParse5(content, options, isDocument, context);
  });
  /*
   * HTML fixtures shared by the tests below. Several are interpolated into
   * test titles, so their exact text is part of the reported test names.
   */

  // Tags: a lone element and two adjacent top-level elements.
  const basic = '<html></html>';
  const siblings = '<h2></h2><p></p>';

  // Single tags: self-closed, and the same tag without the closing slash.
  const single = '<br/>';
  const singleWrong = '<br>';

  // Children: an element child and a text child.
  const children = '<html><br/></html>';
  const li = '<li class="durian">Durian</li>';

  // Attributes: with values, and with no value at all.
  const attributes = `<img src="hello.png" alt="man waving">`;
  const noValueAttribute = '<textarea disabled></textarea>';

  // Comments: a plain comment and an IE conditional comment.
  const comment = '<!-- sexy -->';
  const conditional =
    '<!--[if IE 8]>' + '<html class="no-js ie8" lang="en">' + '<![endif]-->';

  // Text: plain text with no markup.
  const text = 'lorem ipsum';

  // Script: with content and attributes, and empty.
  const script = `<script type="text/javascript">alert("hi world!");</script>`;
  const scriptEmpty = '<script></script>';

  // Style: with content and attributes, and empty.
  const style = `<style type="text/css"> h2 { color:blue; } </style>`;
  const styleEmpty = '<style></style>';

  // Directives: a doctype declaration.
  const directive = '<!doctype html>';
  /**
   * Shared assertions for a parsed root node: it has type `'root'`, it has no
   * siblings and no parent, and its first child points back at it as parent.
   *
   * @param root - The document returned by `parse`; expected to have at least
   *   one child, since the first child is dereferenced unconditionally.
   */
  function rootTest(root: Document): void {
    expect(root).toHaveProperty('type', 'root');

    // None of the links leading away from the root may be populated.
    const detachedLinks = ['nextSibling', 'previousSibling', 'parentNode'] as const;
    for (const link of detachedLinks) {
      expect(root[link]).toBe(null);
    }

    const [firstChild] = root.childNodes;
    expect(firstChild.parentNode).toBe(root);
  }
  46. describe('parse', () => {
  describe('evaluate', () => {
    /**
     * Parses `html` with the default options and returns the first top-level
     * node. `isDocument` is forwarded to `parse` unchanged.
     */
    const firstNode = (html: string, isDocument = false) =>
      parse(html, defaultOpts, isDocument, null).children[0];

    /** Same as `firstNode`, typed as an element for tag-level assertions. */
    const firstElement = (html: string, isDocument = false) =>
      firstNode(html, isDocument) as Element;

    it(`should parse basic empty tags: ${basic}`, () => {
      const htmlTag = firstElement(basic, true);

      expect(htmlTag.type).toBe('tag');
      expect(htmlTag.tagName).toBe('html');
      // In document mode the empty <html> is expected to end up with two children.
      expect(htmlTag.childNodes).toHaveLength(2);
    });

    it(`should handle sibling tags: ${siblings}`, () => {
      const topLevel = parse(siblings, defaultOpts, false, null)
        .children as Element[];

      expect(topLevel).toHaveLength(2);
      expect(topLevel[0].tagName).toBe('h2');
      expect(topLevel[1].tagName).toBe('p');
    });

    it(`should handle single tags: ${single}`, () => {
      const br = firstElement(single);

      expect(br.type).toBe('tag');
      expect(br.tagName).toBe('br');
      expect(br.childNodes).toHaveLength(0);
    });

    it(`should handle malformatted single tags: ${singleWrong}`, () => {
      const br = firstElement(singleWrong);

      expect(br.type).toBe('tag');
      expect(br.tagName).toBe('br');
      expect(br.childNodes).toHaveLength(0);
    });

    it(`should handle tags with children: ${children}`, () => {
      const htmlTag = firstElement(children, true);

      expect(htmlTag.type).toBe('tag');
      expect(htmlTag.tagName).toBe('html');
      expect(htmlTag.childNodes).toBeTruthy();

      // The second child is expected to be <body>, holding the single <br>.
      const body = htmlTag.childNodes[1];
      expect(body).toHaveProperty('tagName', 'body');
      expect((body as Element).childNodes).toHaveLength(1);
    });

    it(`should handle tags with children: ${li}`, () => {
      const item = firstElement(li);

      expect(item.childNodes).toHaveLength(1);
      expect(item.childNodes[0]).toHaveProperty('data', 'Durian');
    });

    it(`should handle tags with attributes: ${attributes}`, () => {
      const { attribs } = firstElement(attributes);

      expect(attribs).toBeTruthy();
      expect(attribs).toHaveProperty('src', 'hello.png');
      expect(attribs).toHaveProperty('alt', 'man waving');
    });

    it(`should handle value-less attributes: ${noValueAttribute}`, () => {
      const { attribs } = firstElement(noValueAttribute);

      expect(attribs).toBeTruthy();
      // A value-less attribute is expected to be stored as an empty string.
      expect(attribs).toHaveProperty('disabled', '');
    });

    it(`should handle comments: ${comment}`, () => {
      const node = firstNode(comment);

      expect(node.type).toBe('comment');
      expect(node).toHaveProperty('data', ' sexy ');
    });

    it(`should handle conditional comments: ${conditional}`, () => {
      const node = firstNode(conditional);
      // Expected data is the fixture with its comment delimiters stripped.
      const innerText = conditional.replace('<!--', '').replace('-->', '');

      expect(node.type).toBe('comment');
      expect(node).toHaveProperty('data', innerText);
    });

    it(`should handle text: ${text}`, () => {
      const node = firstNode(text);

      expect(node.type).toBe('text');
      expect(node).toHaveProperty('data', 'lorem ipsum');
    });

    it(`should handle script tags: ${script}`, () => {
      const scriptTag = firstElement(script);

      expect(scriptTag.type).toBe('script');
      expect(scriptTag.tagName).toBe('script');
      expect(scriptTag.attribs).toHaveProperty('type', 'text/javascript');
      expect(scriptTag.childNodes).toHaveLength(1);

      const [source] = scriptTag.childNodes;
      expect(source.type).toBe('text');
      expect(source).toHaveProperty('data', 'alert("hi world!");');
    });

    it(`should handle style tags: ${style}`, () => {
      const styleTag = firstElement(style);

      expect(styleTag.type).toBe('style');
      expect(styleTag.tagName).toBe('style');
      expect(styleTag.attribs).toHaveProperty('type', 'text/css');
      expect(styleTag.childNodes).toHaveLength(1);

      const [rules] = styleTag.childNodes;
      expect(rules.type).toBe('text');
      expect(rules).toHaveProperty('data', ' h2 { color:blue; } ');
    });

    it(`should handle directives: ${directive}`, () => {
      const node = firstNode(directive, true);

      expect(node.type).toBe('directive');
      expect(node).toHaveProperty('data', '!DOCTYPE html');
      expect(node).toHaveProperty('name', '!doctype');
    });
  });
  describe('.parse', () => {
    /** Parses `html` as a fragment (`isDocument = false`) with the default options. */
    const parseFragment = (html: string) =>
      parse(html, defaultOpts, false, null);

    /** Parses `html` as a full document (`isDocument = true`) with the default options. */
    const parseAsDocument = (html: string) =>
      parse(html, defaultOpts, true, null);

    // Root wrapping: every kind of input must come back inside a well-formed root.
    it(`should add root to: ${basic}`, () => {
      const root = parseAsDocument(basic);
      rootTest(root);
      expect(root.childNodes).toHaveLength(1);
      expect(root.childNodes[0]).toHaveProperty('tagName', 'html');
    });

    it(`should add root to: ${siblings}`, () => {
      const root = parseFragment(siblings);
      rootTest(root);
      expect(root.childNodes).toHaveLength(2);
      expect(root.childNodes[0]).toHaveProperty('tagName', 'h2');
      expect(root.childNodes[1]).toHaveProperty('tagName', 'p');
      expect(root.childNodes[1].parent).toBe(root);
    });

    it(`should add root to: ${comment}`, () => {
      const root = parseFragment(comment);
      rootTest(root);
      expect(root.childNodes).toHaveLength(1);
      expect(root.childNodes[0].type).toBe('comment');
    });

    it(`should add root to: ${text}`, () => {
      const root = parseFragment(text);
      rootTest(root);
      expect(root.childNodes).toHaveLength(1);
      expect(root.childNodes[0].type).toBe('text');
    });

    it(`should add root to: ${scriptEmpty}`, () => {
      const root = parseFragment(scriptEmpty);
      rootTest(root);
      expect(root.childNodes).toHaveLength(1);
      expect(root.childNodes[0].type).toBe('script');
    });

    it(`should add root to: ${styleEmpty}`, () => {
      const root = parseFragment(styleEmpty);
      rootTest(root);
      expect(root.childNodes).toHaveLength(1);
      expect(root.childNodes[0].type).toBe('style');
    });

    it(`should add root to: ${directive}`, () => {
      const root = parseAsDocument(directive);
      rootTest(root);
      // In document mode the doctype is expected to be followed by one more node.
      expect(root.childNodes).toHaveLength(2);
      expect(root.childNodes[0].type).toBe('directive');
    });

    it('should simply return root', () => {
      const oldroot = parseAsDocument(basic);
      // Feeding an already-parsed root back in must return the same object.
      const root = parse(oldroot, defaultOpts, true, null);
      expect(root).toBe(oldroot);
      rootTest(root);
      expect(root.childNodes).toHaveLength(1);
      expect(root.childNodes[0]).toHaveProperty('tagName', 'html');
    });

    it('should expose the DOM level 1 API', () => {
      const root = parseFragment('<div><a></a><span></span><p></p></div>')
        .childNodes[0] as Element;
      const childNodes = root.childNodes as Element[];

      expect(childNodes).toHaveLength(3);
      expect(root.tagName).toBe('div');
      expect(root.firstChild).toBe(childNodes[0]);
      expect(root.lastChild).toBe(childNodes[2]);

      // Each child is empty and linked to its neighbours and to the parent <div>.
      const expectedTags = ['a', 'span', 'p'];
      expectedTags.forEach((tagName, i) => {
        const node = childNodes[i];
        expect(node.tagName).toBe(tagName);
        // Out-of-range neighbours are `undefined`; the DOM reports them as `null`.
        expect(node.previousSibling).toBe(childNodes[i - 1] ?? null);
        expect(node.nextSibling).toBe(childNodes[i + 1] ?? null);
        expect(node.parentNode).toBe(root);
        expect(node.childNodes).toHaveLength(0);
        expect(node.firstChild).toBe(null);
        expect(node.lastChild).toBe(null);
      });
    });

    it('Should parse less than or equal sign', () => {
      const { childNodes } = parseFragment('<i>A</i><=<i>B</i>');

      expect(childNodes[0]).toHaveProperty('tagName', 'i');
      expect((childNodes[0] as Element).childNodes[0]).toHaveProperty(
        'data',
        'A',
      );
      // The bare "<=" must survive as text rather than open a tag.
      expect(childNodes[1]).toHaveProperty('data', '<=');
      expect(childNodes[2]).toHaveProperty('tagName', 'i');
      expect((childNodes[2] as Element).childNodes[0]).toHaveProperty(
        'data',
        'B',
      );
    });

    it('Should ignore unclosed CDATA', () => {
      const root = parseFragment(
        '<a></a><script>foo //<![CDATA[ bar</script><b></b>',
      );
      const childNodes = root.childNodes as Element[];

      expect(childNodes[0].tagName).toBe('a');
      expect(childNodes[1].tagName).toBe('script');
      expect(childNodes[1].childNodes[0]).toHaveProperty(
        'data',
        'foo //<![CDATA[ bar',
      );
      expect(childNodes[2].tagName).toBe('b');
    });

    it('Should add <head> to documents', () => {
      const root = parseAsDocument('<html></html>');
      const childNodes = root.childNodes as Element[];

      expect(childNodes[0].tagName).toBe('html');
      expect(childNodes[0].childNodes[0]).toHaveProperty('tagName', 'head');
    });

    it('Should implicitly create <tr> around <td>', () => {
      const root = parseFragment('<table><td>bar</td></tr></table>');
      const [table] = root.childNodes as Element[];

      expect(table.tagName).toBe('table');
      expect(table.childNodes.length).toBe(1);

      // Walk down one level at a time: table > tbody > tr > td > text.
      const tbody = table.childNodes[0] as Element;
      expect(tbody).toHaveProperty('tagName', 'tbody');

      const tr = tbody.childNodes[0] as Element;
      expect(tr).toHaveProperty('tagName', 'tr');

      const td = tr.childNodes[0] as Element;
      expect(td.tagName).toBe('td');
      expect(td.childNodes[0]).toHaveProperty('data', 'bar');
    });

    it('Should parse custom tag <line>', () => {
      const root = parseFragment('<line>test</line>');
      const childNodes = root.childNodes as Element[];

      expect(childNodes.length).toBe(1);
      expect(childNodes[0].tagName).toBe('line');
      expect(childNodes[0].childNodes[0]).toHaveProperty('data', 'test');
    });

    it('Should properly parse misnested table tags', () => {
      const root = parseFragment(
        '<tr><td>i1</td></tr><tr><td>i2</td></td></tr><tr><td>i3</td></td></tr>',
      );
      const rows = root.childNodes as Element[];

      expect(rows.length).toBe(3);
      // Row N (1-based) must hold a single cell whose text is "iN".
      rows.forEach((row, i) => {
        expect(row.tagName).toBe('tr');
        expect(row.childNodes[0]).toHaveProperty('tagName', 'td');
        expect((row.childNodes[0] as Element).childNodes[0]).toHaveProperty(
          'data',
          `i${i + 1}`,
        );
      });
    });

    it('Should correctly parse data url attributes', () => {
      const html =
        '<div style=\'font-family:"butcherman-caps"; src:url(data:font/opentype;base64,AAEA...);\'></div>';
      const expectedAttr =
        'font-family:"butcherman-caps"; src:url(data:font/opentype;base64,AAEA...);';
      const childNodes = parseFragment(html).childNodes as Element[];

      expect(childNodes[0].attribs).toHaveProperty('style', expectedAttr);
    });

    it('Should treat <xmp> tag content as text', () => {
      const childNodes = parseFragment('<xmp><h2></xmp>')
        .childNodes as Element[];

      expect(childNodes[0].childNodes[0]).toHaveProperty('data', '<h2>');
    });

    it('Should correctly parse malformed numbered entities', () => {
      const childNodes = parseFragment('<p>z&#</p>').childNodes as Element[];

      expect(childNodes[0].childNodes[0]).toHaveProperty('data', 'z&#');
    });

    it('Should correctly parse mismatched headings', () => {
      const { childNodes } = parseFragment('<h2>Test</h3><div></div>');

      expect(childNodes.length).toBe(2);
      expect(childNodes[0]).toHaveProperty('tagName', 'h2');
      expect(childNodes[1]).toHaveProperty('tagName', 'div');
    });

    it('Should correctly parse tricky <pre> content', () => {
      const root = parseFragment(
        '<pre>\nA <- factor(A, levels = c("c","a","b"))\n</pre>',
      );
      const childNodes = root.childNodes as Element[];

      expect(childNodes.length).toBe(1);
      expect(childNodes[0].tagName).toBe('pre');
      // The newline right after <pre> is expected to be dropped; the trailing one is kept.
      expect(childNodes[0].childNodes[0]).toHaveProperty(
        'data',
        'A <- factor(A, levels = c("c","a","b"))\n',
      );
    });

    it('should pass the options for including the location info to parse5', () => {
      const root = parse(
        '<p>Hello</p>',
        { ...defaultOpts, sourceCodeLocationInfo: true },
        false,
        null,
      );
      const location = root.children[0].sourceCodeLocation;

      expect(typeof location).toBe('object');
      // '<p>Hello</p>' is 12 characters long, so the element ends at offset 12.
      expect(location?.endOffset).toBe(12);
    });
  });
  382. });