From 6526dc5896de635a2ef41ac9536ba1c74b8f7c9f Mon Sep 17 00:00:00 2001 From: Abdullah Atta Date: Fri, 5 Apr 2024 17:13:35 +0500 Subject: [PATCH 1/2] cache node matchers to improve tag matching performance --- src/from_dom.ts | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/src/from_dom.ts b/src/from_dom.ts index fba9639..1711e14 100644 --- a/src/from_dom.ts +++ b/src/from_dom.ts @@ -196,6 +196,7 @@ export class DOMParser { /// Parse a document from the content of a DOM node. parse(dom: DOMNode, options: ParseOptions = {}): Node { let context = new ParseContext(this, options, false) + context.generateMatchers(dom as HTMLElement, this.tags) context.addAll(dom, options.from, options.to) return context.finish() as Node } @@ -208,6 +209,7 @@ export class DOMParser { /// the left of the input and the end of nodes at the end. parseSlice(dom: DOMNode, options: ParseOptions = {}) { let context = new ParseContext(this, options, true) + context.generateMatchers(dom as HTMLElement, this.tags) context.addAll(dom, options.from, options.to) return Slice.maxOpen(context.finish() as Fragment) } @@ -216,7 +218,7 @@ export class DOMParser { matchTag(dom: DOMNode, context: ParseContext, after?: ParseRule) { for (let i = after ? this.tags.indexOf(after) + 1 : 0; i < this.tags.length; i++) { let rule = this.tags[i] - if (matches(dom, rule.tag!) && + if (context.matchesNode(dom, rule.tag!) && (rule.namespace === undefined || (dom as HTMLElement).namespaceURI == rule.namespace) && (!rule.context || context.matchesContext(rule.context))) { if (rule.getAttrs) { @@ -398,6 +400,7 @@ class ParseContext { find: {node: DOMNode, offset: number, pos?: number}[] | undefined needsBlock: boolean nodes: NodeContext[] + matchers: Record<string, (node: HTMLElement) => boolean> = {}; constructor( // The parser we are using. 
@@ -675,6 +678,7 @@ class ParseContext { } finish() { + this.matchers = {} this.open = 0 this.closeExtra(this.isOpen) return this.nodes[0].finish(this.isOpen || this.options.topOpen) @@ -795,6 +799,30 @@ class ParseContext { if (level == upto) break } } + + /// Match a node against a CSS selector + matchesNode(node: DOMNode, selector: string) { + return this.matchers[selector] ? this.matchers[selector](node as HTMLElement) : matches(node, selector) + } + + /// Generates matchers based on the given parse rules. This is much, much + /// faster than matching each node individually. + generateMatchers(dom: HTMLElement, rules: ParseRule[]) { + for (const rule of rules) { + if (!rule.tag) continue + if (blockTags[rule.tag] || listTags[rule.tag]) + // for simple selectors like li, p etc. we can just do a simple + // tag name check. + this.matchers[rule.tag] = (node) => node.tagName === rule.tag + else { + // for more complex selectors, we collect all the matching nodes + // just once instead of calling `matches` over and over again for + // each node. + const nodes = new Set(dom.querySelectorAll(rule.tag).values()) + this.matchers[rule.tag] = (node) => nodes.has(node) + } + } + } } // Kludge to work around directly nested list nodes produced by some From 087fef73ce7003f9b3d3c4f259cc7744266f366b Mon Sep 17 00:00:00 2001 From: Abdullah Atta Date: Fri, 5 Apr 2024 20:16:17 +0500 Subject: [PATCH 2/2] fix tag name matcher not working --- src/from_dom.ts | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/from_dom.ts b/src/from_dom.ts index 1711e14..160a73f 100644 --- a/src/from_dom.ts +++ b/src/from_dom.ts @@ -810,11 +810,12 @@ class ParseContext { generateMatchers(dom: HTMLElement, rules: ParseRule[]) { for (const rule of rules) { if (!rule.tag) continue - if (blockTags[rule.tag] || listTags[rule.tag]) + if (blockTags[rule.tag] || listTags[rule.tag]) { + const upperCaseTag = rule.tag.toUpperCase() // for simple selectors like li, p etc. 
we can just do a simple // tag name check. - this.matchers[rule.tag] = (node) => node.tagName === rule.tag - else { + this.matchers[rule.tag] = (node) => node.tagName === upperCaseTag + } else { // for more complex selectors, we collect all the matching nodes // just once instead of calling `matches` over and over again for // each node.