Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions ghost/core/core/frontend/services/rss/generate-feed.js
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,10 @@ const generateTags = function generateTags(data) {
};

const generateItem = function generateItem(post) {
const cheerio = require('cheerio');
const {load} = require('../../../server/lib/html-utils');

const itemUrl = routerManager.getUrlByResourceId(post.id, {absolute: true});
const htmlContent = cheerio.load(post.html || '');
const htmlContent = load(post.html || '');
const item = {
title: post.title,
// @TODO: DRY this up with data/meta/index & other excerpt code
Expand Down
219 changes: 219 additions & 0 deletions ghost/core/core/server/lib/html-utils.js
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd love to see JSDoc types for this file. Happy to help with that.

Original file line number Diff line number Diff line change
@@ -0,0 +1,219 @@
const {parseDocument} = require('htmlparser2');
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Based on my reading of the Cheerio source, it uses Parse5 unless you're in XML mode. Why use htmlparser2 here?

const {selectAll} = require('css-select');
const {removeElement, textContent, getChildren} = require('domutils');
const render = require('dom-serializer').default;

/**
* Lightweight HTML manipulation utility that replaces cheerio.
* Uses the same underlying packages (htmlparser2, css-select, domutils, dom-serializer)
* but without the full jQuery-like wrapper, avoiding version conflicts with juice.
*
* Usage:
* const {load} = require('./lib/html-utils');
* const $ = load(html);
* $('a').each((i, el) => { ... });
* const result = $.html();
*/

class WrappedSet {
constructor(elements, root) {
this._elements = Array.isArray(elements) ? elements : [elements];
this._root = root;
Comment on lines +20 to +21
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: can we use proper JavaScript private members (this.#elements and this.#root) instead?

this.length = this._elements.length;
// Support numeric indexing (e.g. $('a')[0])
for (let i = 0; i < this._elements.length; i++) {
this[i] = this._elements[i];
}
}

get(index) {
if (index === undefined) {
return this._elements;
}
return this._elements[index];
}

toArray() {
return this._elements.slice();
}

first() {
return new WrappedSet(this._elements.length ? [this._elements[0]] : [], this._root);
}

each(fn) {
this._elements.forEach((el, i) => {
fn(i, el);
});
return this;
}

map(fn) {
return this._elements.map((el, i) => fn(i, el));
}

attr(name, value) {
if (value === undefined) {
// Getter
const el = this._elements[0];
if (!el || !el.attribs) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

question: when is el.attribs missing? This question applies everywhere it's checked in this file.

return undefined;
}
return el.attribs[name];
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: Do we want to protect against keys on Object.prototype? For example, $('a').get('hasOwnProperty') could return an unexpected value.

}
// Setter
for (const el of this._elements) {
if (el.attribs) {
el.attribs[name] = value;
}
}
return this;
}

removeAttr(name) {
for (const el of this._elements) {
if (el.attribs) {
delete el.attribs[name];
}
}
return this;
}

remove() {
for (const el of this._elements) {
removeElement(el);
}
return this;
}

addClass(className) {
for (const el of this._elements) {
if (el.attribs) {
const existing = el.attribs.class || '';
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: the ternary on the line below makes this || '' unnecessary.

const classes = existing ? existing.split(/\s+/) : [];
if (!classes.includes(className)) {
classes.push(className);
}
el.attribs.class = classes.join(' ');
}
}
return this;
}

text() {
if (this._elements.length === 0) {
return '';
}
return this._elements.map(el => textContent(el)).join('');
}

html(content) {
if (content === undefined) {
// Get inner HTML
if (this._elements.length === 0) {
return '';
}
const el = this._elements[0];
const children = getChildren(el);
return render(children, {decodeEntities: false});
}
// Set inner HTML
for (const el of this._elements) {
const parsed = parseDocument(content, {decodeEntities: false});
const newChildren = getChildren(parsed);
el.children = newChildren;
for (let j = 0; j < newChildren.length; j++) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: I've only seen j used as an iteration index when i has already been used. Maybe we could change this to i?

newChildren[j].parent = el;
newChildren[j].prev = j > 0 ? newChildren[j - 1] : null;
newChildren[j].next = j < newChildren.length - 1 ? newChildren[j + 1] : null;
}
}
return this;
}

find(selector) {
const results = [];
for (const el of this._elements) {
results.push(...selectAll(selector, el));
}
return new WrappedSet(results, this._root);
}

before(htmlStr) {
for (const el of this._elements) {
const parent = el.parent;
if (!parent) {
continue;
}

const parsed = parseDocument(htmlStr, {decodeEntities: false});
const newNodes = getChildren(parsed);
if (newNodes.length === 0) {
continue;
}

const idx = parent.children.indexOf(el);
parent.children.splice(idx, 0, ...newNodes);

// Rebuild prev/next links only for the affected window
const end = Math.min(parent.children.length, idx + newNodes.length + 1);
for (let i = Math.max(0, idx - 1); i < end; i++) {
const node = parent.children[i];
node.parent = parent;
node.prev = i > 0 ? parent.children[i - 1] : null;
node.next = i < parent.children.length - 1 ? parent.children[i + 1] : null;
Comment on lines +162 to +164
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: I wonder if this could be DRYed out with what's above in html().

}
}
return this;
}
}

/**
* Load an HTML string and return a query function.
* @param {string} html
* @returns {Function & {html: Function}} A jQuery-like query function with .html() method
*/
function load(html) {
const root = parseDocument(html, {decodeEntities: false});

function $(selectorOrElement) {
if (typeof selectorOrElement === 'string') {
const elements = selectAll(selectorOrElement, root);
return new WrappedSet(elements, root);
}
if (selectorOrElement === null || selectorOrElement === undefined) {
return new WrappedSet([], root);
}
if (Array.isArray(selectorOrElement)) {
return new WrappedSet(selectorOrElement, root);
}
// Wrap a raw element
return new WrappedSet(selectorOrElement, root);
}
Comment on lines +179 to +192
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
# Find places where optional element extraction might be wrapped back into $()
rg -nP '\.get\(\d+\)' ghost/core -g '!**/node_modules/**'
rg -nP '\$\(\s*[A-Za-z_$][A-Za-z0-9_$]*\s*\)' ghost/core -g '!**/node_modules/**'

Repository: TryGhost/Ghost

Length of output: 50371


🏁 Script executed:

# Read the html-utils.js file around the $ function and WrappedSet class
head -n 250 ghost/core/core/server/lib/html-utils.js | tail -n +100

Repository: TryGhost/Ghost

Length of output: 3297


🏁 Script executed:

# Check the WrappedSet constructor
grep -n "class WrappedSet\|function WrappedSet" ghost/core/core/server/lib/html-utils.js -A 20

Repository: TryGhost/Ghost

Length of output: 691


🏁 Script executed:

# Look at the test file to understand expected behavior
sed -n '1,100p' ghost/core/test/unit/server/lib/html-utils.test.js

Repository: TryGhost/Ghost

Length of output: 4030


🏁 Script executed:

# Check test cases around line 76 and surrounding tests
sed -n '70,100p' ghost/core/test/unit/server/lib/html-utils.test.js

Repository: TryGhost/Ghost

Length of output: 1233


🏁 Script executed:

# Look for any usages where $() might receive null/undefined
grep -n "$(null\|$(undefined" ghost/core/core/server -r -g '!**/node_modules/**' 2>/dev/null || echo "No direct patterns found"

Repository: TryGhost/Ghost

Length of output: 126


🏁 Script executed:

# Look at the complete html-utils.js file
wc -l ghost/core/core/server/lib/html-utils.js

Repository: TryGhost/Ghost

Length of output: 102


🏁 Script executed:

# Read the WrappedSet class methods to see if they assume valid elements
sed -n '18,150p' ghost/core/core/server/lib/html-utils.js

Repository: TryGhost/Ghost

Length of output: 3580


🏁 Script executed:

# Check email-renderer.js usage around line 409
sed -n '400,420p' ghost/core/core/server/services/email-service/email-renderer.js

Repository: TryGhost/Ghost

Length of output: 902


🏁 Script executed:

# Check mention-sending-service.js usage around line 191
sed -n '185,200p' ghost/core/core/server/services/mentions/mention-sending-service.js

Repository: TryGhost/Ghost

Length of output: 603


🏁 Script executed:

# Check the removeElement function
grep -n "function removeElement\|const removeElement" ghost/core/core/server/lib/html-utils.js -A 10

Repository: TryGhost/Ghost

Length of output: 40


🏁 Script executed:

# Check if there are any tests that call $() with null/undefined
grep -n "null\|undefined" ghost/core/test/unit/server/lib/html-utils.test.js | head -20

Repository: TryGhost/Ghost

Length of output: 532


🏁 Script executed:

# Look at the complete test file to see all tests
sed -n '1,250p' ghost/core/test/unit/server/lib/html-utils.test.js | tail -100

Repository: TryGhost/Ghost

Length of output: 4100


🏁 Script executed:

# Check the removeElement function to see if it handles null/undefined safely
sed -n '1,212p' ghost/core/core/server/lib/html-utils.js | grep -B5 -A15 "removeElement"

Repository: TryGhost/Ghost

Length of output: 1270


🏁 Script executed:

# Check textContent and getChildren functions
sed -n '1,212p' ghost/core/core/server/lib/html-utils.js | head -30

Repository: TryGhost/Ghost

Length of output: 1093


Handle nullish and array inputs in the $() function to return empty sets and prevent incorrect .length values.

When selectorOrElement is null, undefined, or an array, the function should treat these as empty sets rather than wrapping them as single invalid elements. Currently, passing null/undefined creates a WrappedSet with .length = 1 and an invalid element at index 0, causing incorrect behavior in .length checks and runtime failures in methods like .remove(), .text(), and .html().

🔧 Proposed fix
     function $(selectorOrElement) {
         if (typeof selectorOrElement === 'string') {
             const elements = selectAll(selectorOrElement, root);
             return new WrappedSet(elements, root);
         }
+        if (selectorOrElement === null || selectorOrElement === undefined) {
+            return new WrappedSet([], root);
+        }
+        if (Array.isArray(selectorOrElement)) {
+            return new WrappedSet(selectorOrElement, root);
+        }
         // Wrap a raw element
         return new WrappedSet(selectorOrElement, root);
     }
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@ghost/core/core/server/lib/html-utils.js` around lines 178 - 185, The $()
wrapper currently builds a WrappedSet around any input including null/undefined
or arrays, producing a bogus length of 1; change the function so that when
selectorOrElement is nullish (selectorOrElement == null) or is an Array
(Array.isArray(selectorOrElement)), it returns new WrappedSet([], root) instead
of wrapping the raw value; keep the existing path that calls
selectAll(selectorOrElement, root) and the fallback raw-element wrapping for
real single elements, but short-circuit these nullish/array cases to prevent
invalid .length and runtime errors in WrappedSet methods like remove(), text(),
and html().


$.html = function () {
return render(root, {decodeEntities: false});
};

$.text = function () {
return textContent(root);
};

$.root = root;

return $;
}

/**
* Query HTML without loading - equivalent to cheerio(selector, html)
* @param {string} selector
* @param {string} html
* @returns {WrappedSet}
*/
function select(selector, html) {
const root = parseDocument(html, {decodeEntities: false});
const elements = selectAll(selector, root);
return new WrappedSet(elements, root);
}

module.exports = {load, select};
14 changes: 5 additions & 9 deletions ghost/core/core/server/services/email-service/email-renderer.js
Original file line number Diff line number Diff line change
Expand Up @@ -66,11 +66,7 @@ function escapeRegExp(string) {
return string.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}

// This aids with lazyloading the cheerio dependency
function cheerioLoad(html) {
const cheerio = require('cheerio');
return cheerio.load(html);
}
const htmlUtils = require('../../lib/html-utils');

/**
* @typedef {string|null} Segment
Expand Down Expand Up @@ -305,7 +301,7 @@ class EmailRenderer {
return allowedSegments;
}

const $ = cheerioLoad(html);
const $ = htmlUtils.load(html);

let allSegments = $('[data-gh-segment]')
.get()
Expand Down Expand Up @@ -397,7 +393,7 @@ class EmailRenderer {
}
}

let $ = cheerioLoad(html);
let $ = htmlUtils.load(html);

// Remove parts of the HTML not applicable to the current segment - We do this
// before rendering the template as the preheader for the email may be generated
Expand Down Expand Up @@ -496,7 +492,7 @@ class EmailRenderer {
// juice will explicitly set the width/height attributes to `auto` on the <img /> tag
// This is not supported by Outlook, so we need to reset the width/height attributes to the original values
// Other clients will ignore the width/height attributes and use the inlined CSS instead
$ = cheerioLoad(html);
$ = htmlUtils.load(html);
const originalImageSizes = $('img').get().map((image) => {
const src = image.attribs.src;
const width = image.attribs.width;
Expand All @@ -513,7 +509,7 @@ class EmailRenderer {
html = juice(html, {inlinePseudoElements: true, removeStyleTags: true});

// happens after inlining of CSS so we can change element types without worrying about styling
$ = cheerioLoad(html);
$ = htmlUtils.load(html);

// Reset any `height="auto"` or `width="auto"` attributes to their original values before inlining CSS
const imageTags = $('img').get();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,8 +61,8 @@ module.exports = class MentionDiscoveryService {
return null;
}

const cheerio = require('cheerio');
const $ = cheerio.load(response.body);
const {load} = require('../../lib/html-utils');
const $ = load(response.body);

// must be first <link> OR <a> element with rel=webmention
href = $('a[rel="webmention"],link[rel="webmention"]').first().attr('href');
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -182,8 +182,8 @@ module.exports = class MentionSendingService {
* @returns {URL[]}
*/
getLinks(html) {
const cheerio = require('cheerio');
const $ = cheerio.load(html);
const {load} = require('../../lib/html-utils');
const $ = load(html);
const urls = [];
const siteUrl = this.siteUrl;

Expand Down
4 changes: 2 additions & 2 deletions ghost/core/core/server/services/mentions/mention.js
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,8 @@ module.exports = class Mention {

if (contentType.includes('text/html')) {
try {
const cheerio = require('cheerio');
const $ = cheerio.load(html);
const htmlUtils = require('../../lib/html-utils');
const $ = htmlUtils.load(html);
const hasTargetUrl = $('a[href*="' + this.target.href + '"], img[src*="' + this.target.href + '"], video[src*="' + this.target.href + '"]').length > 0;
this.#verified = hasTargetUrl;

Expand Down
4 changes: 2 additions & 2 deletions ghost/core/core/server/services/oembed/oembed-service.js
Original file line number Diff line number Diff line change
Expand Up @@ -377,12 +377,12 @@ class OEmbedService {
*/
async fetchOembedData(url, html, cardType) {
// Lazy require the library to keep boot quick
const cheerio = require('cheerio');
const {select} = require('../../lib/html-utils');

// check for <link rel="alternate" type="application/json+oembed"> element
let oembedUrl;
try {
oembedUrl = cheerio('link[type="application/json+oembed"]', html).attr('href');
oembedUrl = select('link[type="application/json+oembed"]', html).attr('href');
} catch (e) {
return this.unknownProvider(url);
}
Expand Down
6 changes: 5 additions & 1 deletion ghost/core/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@
"cache-manager-ioredis": "2.1.0",
"chalk": "4.1.2",
"charset": "1.0.1",
"cheerio": "0.22.0",
"css-select": "5.1.0",
"clsx": "2.1.1",
"cluster-key-slot": "1.1.2",
"common-tags": "1.8.2",
Expand All @@ -145,7 +145,10 @@
"csso": "5.0.5",
"csv-writer": "1.6.0",
"date-fns": "2.30.0",
"dom-serializer": "2.0.0",
"domhandler": "5.0.3",
"dompurify": "3.3.0",
"domutils": "3.2.2",
"downsize": "0.0.8",
"entities": "4.5.0",
"express": "4.21.2",
Expand All @@ -167,6 +170,7 @@
"heic-convert": "2.1.0",
"html-to-text": "5.1.1",
"html5parser": "2.0.2",
"htmlparser2": "10.0.0",
"human-number": "2.0.7",
"iconv-lite": "0.6.3",
"image-size": "1.2.1",
Expand Down
Loading
Loading