-
-
Notifications
You must be signed in to change notification settings - Fork 11.4k
Replaced cheerio with lightweight html-utils #26819
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,219 @@ | ||
| const {parseDocument} = require('htmlparser2'); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Based on my reading of the Cheerio source, it uses Parse5 unless you're in XML mode. Why use |
||
| const {selectAll} = require('css-select'); | ||
| const {removeElement, textContent, getChildren} = require('domutils'); | ||
| const render = require('dom-serializer').default; | ||
|
|
||
| /** | ||
| * Lightweight HTML manipulation utility that replaces cheerio. | ||
| * Uses the same underlying packages (htmlparser2, css-select, domutils, dom-serializer) | ||
| * but without the full jQuery-like wrapper, avoiding version conflicts with juice. | ||
| * | ||
| * Usage: | ||
| * const {load} = require('./lib/html-utils'); | ||
| * const $ = load(html); | ||
| * $('a').each((i, el) => { ... }); | ||
| * const result = $.html(); | ||
| */ | ||
|
|
||
| class WrappedSet { | ||
| constructor(elements, root) { | ||
| this._elements = Array.isArray(elements) ? elements : [elements]; | ||
| this._root = root; | ||
|
Comment on lines
+20
to
+21
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: can we use proper JavaScript private members ( |
||
| this.length = this._elements.length; | ||
| // Support numeric indexing (e.g. $('a')[0]) | ||
| for (let i = 0; i < this._elements.length; i++) { | ||
| this[i] = this._elements[i]; | ||
| } | ||
| } | ||
|
|
||
| get(index) { | ||
| if (index === undefined) { | ||
| return this._elements; | ||
| } | ||
| return this._elements[index]; | ||
| } | ||
|
|
||
| toArray() { | ||
| return this._elements.slice(); | ||
| } | ||
|
|
||
| first() { | ||
| return new WrappedSet(this._elements.length ? [this._elements[0]] : [], this._root); | ||
| } | ||
|
|
||
| each(fn) { | ||
| this._elements.forEach((el, i) => { | ||
| fn(i, el); | ||
| }); | ||
| return this; | ||
| } | ||
|
|
||
| map(fn) { | ||
| return this._elements.map((el, i) => fn(i, el)); | ||
| } | ||
|
|
||
| attr(name, value) { | ||
| if (value === undefined) { | ||
| // Getter | ||
| const el = this._elements[0]; | ||
| if (!el || !el.attribs) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. question: when is |
||
| return undefined; | ||
| } | ||
| return el.attribs[name]; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: Do we want to protect against keys on |
||
| } | ||
| // Setter | ||
| for (const el of this._elements) { | ||
| if (el.attribs) { | ||
| el.attribs[name] = value; | ||
| } | ||
| } | ||
| return this; | ||
| } | ||
|
|
||
| removeAttr(name) { | ||
| for (const el of this._elements) { | ||
| if (el.attribs) { | ||
| delete el.attribs[name]; | ||
| } | ||
| } | ||
| return this; | ||
| } | ||
|
|
||
| remove() { | ||
| for (const el of this._elements) { | ||
| removeElement(el); | ||
| } | ||
| return this; | ||
| } | ||
|
|
||
| addClass(className) { | ||
| for (const el of this._elements) { | ||
| if (el.attribs) { | ||
| const existing = el.attribs.class || ''; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: the ternary on the line below makes this |
||
| const classes = existing ? existing.split(/\s+/) : []; | ||
| if (!classes.includes(className)) { | ||
| classes.push(className); | ||
| } | ||
| el.attribs.class = classes.join(' '); | ||
| } | ||
| } | ||
| return this; | ||
| } | ||
|
|
||
| text() { | ||
| if (this._elements.length === 0) { | ||
| return ''; | ||
| } | ||
| return this._elements.map(el => textContent(el)).join(''); | ||
| } | ||
coderabbitai[bot] marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| html(content) { | ||
| if (content === undefined) { | ||
| // Get inner HTML | ||
| if (this._elements.length === 0) { | ||
| return ''; | ||
| } | ||
| const el = this._elements[0]; | ||
| const children = getChildren(el); | ||
| return render(children, {decodeEntities: false}); | ||
| } | ||
| // Set inner HTML | ||
| for (const el of this._elements) { | ||
| const parsed = parseDocument(content, {decodeEntities: false}); | ||
| const newChildren = getChildren(parsed); | ||
| el.children = newChildren; | ||
| for (let j = 0; j < newChildren.length; j++) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: I've only seen |
||
| newChildren[j].parent = el; | ||
| newChildren[j].prev = j > 0 ? newChildren[j - 1] : null; | ||
| newChildren[j].next = j < newChildren.length - 1 ? newChildren[j + 1] : null; | ||
| } | ||
| } | ||
| return this; | ||
| } | ||
|
|
||
| find(selector) { | ||
| const results = []; | ||
| for (const el of this._elements) { | ||
| results.push(...selectAll(selector, el)); | ||
| } | ||
| return new WrappedSet(results, this._root); | ||
| } | ||
|
|
||
| before(htmlStr) { | ||
| for (const el of this._elements) { | ||
| const parent = el.parent; | ||
| if (!parent) { | ||
| continue; | ||
| } | ||
|
|
||
| const parsed = parseDocument(htmlStr, {decodeEntities: false}); | ||
| const newNodes = getChildren(parsed); | ||
| if (newNodes.length === 0) { | ||
| continue; | ||
| } | ||
|
|
||
| const idx = parent.children.indexOf(el); | ||
| parent.children.splice(idx, 0, ...newNodes); | ||
|
|
||
| // Rebuild prev/next links only for the affected window | ||
| const end = Math.min(parent.children.length, idx + newNodes.length + 1); | ||
| for (let i = Math.max(0, idx - 1); i < end; i++) { | ||
| const node = parent.children[i]; | ||
| node.parent = parent; | ||
| node.prev = i > 0 ? parent.children[i - 1] : null; | ||
| node.next = i < parent.children.length - 1 ? parent.children[i + 1] : null; | ||
|
Comment on lines
+162
to
+164
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: I wonder if this could be DRYed out with what's above in |
||
| } | ||
| } | ||
| return this; | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Load an HTML string and return a query function. | ||
| * @param {string} html | ||
| * @returns {Function & {html: Function}} A jQuery-like query function with .html() method | ||
| */ | ||
| function load(html) { | ||
| const root = parseDocument(html, {decodeEntities: false}); | ||
|
|
||
| function $(selectorOrElement) { | ||
| if (typeof selectorOrElement === 'string') { | ||
| const elements = selectAll(selectorOrElement, root); | ||
| return new WrappedSet(elements, root); | ||
| } | ||
| if (selectorOrElement === null || selectorOrElement === undefined) { | ||
| return new WrappedSet([], root); | ||
| } | ||
| if (Array.isArray(selectorOrElement)) { | ||
| return new WrappedSet(selectorOrElement, root); | ||
| } | ||
| // Wrap a raw element | ||
| return new WrappedSet(selectorOrElement, root); | ||
| } | ||
|
Comment on lines
+179
to
+192
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🧩 Analysis chain🏁 Script executed: #!/bin/bash
# Find places where optional element extraction might be wrapped back into $()
rg -nP '\.get\(\d+\)' ghost/core -g '!**/node_modules/**'
rg -nP '\$\(\s*[A-Za-z_$][A-Za-z0-9_$]*\s*\)' ghost/core -g '!**/node_modules/**'Repository: TryGhost/Ghost Length of output: 50371 🏁 Script executed: # Read the html-utils.js file around the $ function and WrappedSet class
head -n 250 ghost/core/core/server/lib/html-utils.js | tail -n +100Repository: TryGhost/Ghost Length of output: 3297 🏁 Script executed: # Check the WrappedSet constructor
grep -n "class WrappedSet\|function WrappedSet" ghost/core/core/server/lib/html-utils.js -A 20Repository: TryGhost/Ghost Length of output: 691 🏁 Script executed: # Look at the test file to understand expected behavior
sed -n '1,100p' ghost/core/test/unit/server/lib/html-utils.test.jsRepository: TryGhost/Ghost Length of output: 4030 🏁 Script executed: # Check test cases around line 76 and surrounding tests
sed -n '70,100p' ghost/core/test/unit/server/lib/html-utils.test.jsRepository: TryGhost/Ghost Length of output: 1233 🏁 Script executed: # Look for any usages where $() might receive null/undefined
grep -n "$(null\|$(undefined" ghost/core/core/server -r -g '!**/node_modules/**' 2>/dev/null || echo "No direct patterns found"Repository: TryGhost/Ghost Length of output: 126 🏁 Script executed: # Look at the complete html-utils.js file
wc -l ghost/core/core/server/lib/html-utils.jsRepository: TryGhost/Ghost Length of output: 102 🏁 Script executed: # Read the WrappedSet class methods to see if they assume valid elements
sed -n '18,150p' ghost/core/core/server/lib/html-utils.jsRepository: TryGhost/Ghost Length of output: 3580 🏁 Script executed: # Check email-renderer.js usage around line 409
sed -n '400,420p' ghost/core/core/server/services/email-service/email-renderer.jsRepository: TryGhost/Ghost Length of output: 902 🏁 Script executed: # Check mention-sending-service.js usage around line 191
sed -n '185,200p' ghost/core/core/server/services/mentions/mention-sending-service.jsRepository: TryGhost/Ghost Length of output: 603 🏁 Script executed: # Check the removeElement function
grep -n "function removeElement\|const removeElement" ghost/core/core/server/lib/html-utils.js -A 10Repository: TryGhost/Ghost Length of output: 40 🏁 Script executed: # Check if there are any tests that call $() with null/undefined
grep -n "null\|undefined" ghost/core/test/unit/server/lib/html-utils.test.js | head -20Repository: TryGhost/Ghost Length of output: 532 🏁 Script executed: # Look at the complete test file to see all tests
sed -n '1,250p' ghost/core/test/unit/server/lib/html-utils.test.js | tail -100Repository: TryGhost/Ghost Length of output: 4100 🏁 Script executed: # Check the removeElement function to see if it handles null/undefined safely
sed -n '1,212p' ghost/core/core/server/lib/html-utils.js | grep -B5 -A15 "removeElement"Repository: TryGhost/Ghost Length of output: 1270 🏁 Script executed: # Check textContent and getChildren functions
sed -n '1,212p' ghost/core/core/server/lib/html-utils.js | head -30Repository: TryGhost/Ghost Length of output: 1093 Handle nullish and array inputs in the When 🔧 Proposed fix function $(selectorOrElement) {
if (typeof selectorOrElement === 'string') {
const elements = selectAll(selectorOrElement, root);
return new WrappedSet(elements, root);
}
+ if (selectorOrElement === null || selectorOrElement === undefined) {
+ return new WrappedSet([], root);
+ }
+ if (Array.isArray(selectorOrElement)) {
+ return new WrappedSet(selectorOrElement, root);
+ }
// Wrap a raw element
return new WrappedSet(selectorOrElement, root);
}🤖 Prompt for AI Agents |
||
|
|
||
| $.html = function () { | ||
| return render(root, {decodeEntities: false}); | ||
| }; | ||
|
|
||
| $.text = function () { | ||
| return textContent(root); | ||
| }; | ||
|
|
||
| $.root = root; | ||
|
|
||
| return $; | ||
| } | ||
|
|
||
| /** | ||
| * Query HTML without loading - equivalent to cheerio(selector, html) | ||
| * @param {string} selector | ||
| * @param {string} html | ||
| * @returns {WrappedSet} | ||
| */ | ||
| function select(selector, html) { | ||
| const root = parseDocument(html, {decodeEntities: false}); | ||
| const elements = selectAll(selector, root); | ||
| return new WrappedSet(elements, root); | ||
| } | ||
|
|
||
| module.exports = {load, select}; | ||
kevinansfield marked this conversation as resolved.
Show resolved
Hide resolved
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'd love to see JSDoc types for this file. Happy to help with that.