diff --git a/packages/url-utils/package.json b/packages/url-utils/package.json index e774204e0..52496a3ea 100644 --- a/packages/url-utils/package.json +++ b/packages/url-utils/package.json @@ -37,7 +37,9 @@ "typescript": "5.9.3" }, "dependencies": { - "cheerio": "1.2.0", + "domhandler": "5.0.3", + "domutils": "3.2.2", + "htmlparser2": "10.1.0", "lodash": "4.17.23", "moment": "2.30.1", "moment-timezone": "0.6.0", diff --git a/packages/url-utils/src/utils/html-transform.ts b/packages/url-utils/src/utils/html-transform.ts index c585b7b42..56561fd5a 100644 --- a/packages/url-utils/src/utils/html-transform.ts +++ b/packages/url-utils/src/utils/html-transform.ts @@ -1,7 +1,7 @@ -import type {AnyNode} from 'domhandler'; +import type {Element, AnyNode} from 'domhandler'; +import {parseDocument} from 'htmlparser2'; +import {findAll, getAttributeValue, hasAttrib} from 'domutils'; import type {HtmlTransformOptions, HtmlTransformOptionsInput, UrlTransformFunction} from './types'; -// eslint-disable-next-line @typescript-eslint/no-require-imports -const cheerio = require('cheerio'); export const transformAttributes = [ 'href', @@ -55,7 +55,7 @@ function htmlTransform( return html; } - const htmlContent = cheerio.load(html, {decodeEntities: false}); + const dom = parseDocument(html, {decodeEntities: false}); // replacements is keyed with the attr name + original relative value so // that we can implement skips for untouchable urls @@ -85,22 +85,39 @@ function htmlTransform( replacements[key].push(replacement); } - transformAttributes.forEach((attributeName: string) => { - htmlContent('[' + attributeName + ']').each((ix: number, el: AnyNode) => { + function isInsideCode(el: AnyNode): boolean { + let node: AnyNode | null = el.parent as AnyNode | null; + while (node) { + if ('name' in node && node.name === 'code') { + return true; + } + node = node.parent as AnyNode | null; + } + return false; + } + + const elements = findAll((el: Element) => { + return transformAttributes.some(attr => hasAttrib(el, attr)); + }, dom.childNodes); + + for (const el of elements) { + for (const attributeName of transformAttributes) { + if (!hasAttrib(el, attributeName)) { + continue; + } + + const originalValue = getAttributeValue(el, attributeName) || ''; + // ignore elems and html inside of elements - const elementName = 'name' in el ? el.name : null; - if (elementName === 'stream' || htmlContent(el).closest('code').length) { + if (el.name === 'stream' || isInsideCode(el)) { addReplacement({ name: attributeName, - originalValue: htmlContent(el).attr(attributeName), + originalValue, skip: true }); - return; + continue; } - const elWrapper = htmlContent(el); - const originalValue = elWrapper.attr(attributeName); - if (attributeName === 'srcset' || attributeName === 'style') { let urls: string[]; @@ -137,8 +154,8 @@ function htmlTransform( }); } } - }); - }); + } + } // Loop over all replacements and use a regex to replace urls in the original html string. // Allows indentation and formatting to be kept compared to using DOM manipulation and render diff --git a/packages/url-utils/test/unit/utils/html-absolute-to-relative.test.js b/packages/url-utils/test/unit/utils/html-absolute-to-relative.test.js index 45935f407..15ee59b27 100644 --- a/packages/url-utils/test/unit/utils/html-absolute-to-relative.test.js +++ b/packages/url-utils/test/unit/utils/html-absolute-to-relative.test.js @@ -5,7 +5,7 @@ require('../../utils'); const sinon = require('sinon'); const rewire = require('rewire'); -const cheerio = require('cheerio'); +const htmlparser2 = require('htmlparser2'); const htmlTransformModule = rewire('../../../lib/utils/html-transform'); const htmlAbsToRelModule = rewire('../../../lib/utils/html-absolute-to-relative'); htmlAbsToRelModule.__set__('html_transform_1', htmlTransformModule); @@ -193,46 +193,46 @@ describe('utils: htmlAbsoluteToRelative()', function () { }); describe('DOM parsing is skipped', function () { - let cheerioLoadSpy, cheerioRestore, rewiredFn; + let parseDocumentSpy, parseDocumentRestore, rewiredFn; before(function () { rewiredFn = htmlAbsToRelModule.default; }); beforeEach(function () { - const cheerioProxy = {load: (...args) => cheerio.load(...args)}; - cheerioLoadSpy = sinon.spy(cheerioProxy, 'load'); - cheerioRestore = htmlTransformModule.__set__('cheerio', cheerioProxy); + const htmlparser2Proxy = {parseDocument: (...args) => htmlparser2.parseDocument(...args)}; + parseDocumentSpy = sinon.spy(htmlparser2Proxy, 'parseDocument'); + parseDocumentRestore = htmlTransformModule.__set__('htmlparser2_1', htmlparser2Proxy); }); afterEach(function () { - cheerioLoadSpy.restore(); - cheerioRestore(); + parseDocumentSpy.restore(); + parseDocumentRestore(); }); it('when html has no absolute URLs matching siteUrl', function () { const url = 'http://my-ghost-blog.com/'; rewiredFn('', url, options); - cheerioLoadSpy.called.should.be.false('blank html triggered parse'); + parseDocumentSpy.called.should.be.false('blank html triggered parse'); rewiredFn('test', url, options); - cheerioLoadSpy.called.should.be.false('hash url triggered parse'); + parseDocumentSpy.called.should.be.false('hash url triggered parse'); rewiredFn('test)', url, options); - cheerioLoadSpy.called.should.be.false('external url triggered parse'); + parseDocumentSpy.called.should.be.false('external url triggered parse'); rewiredFn('test)', url, options); - cheerioLoadSpy.calledOnce.should.be.true('site url didn\'t trigger parse'); + parseDocumentSpy.calledOnce.should.be.true('site url didn\'t trigger parse'); // ignores protocol when ignoreProtocol: true rewiredFn('test)', url, options); - cheerioLoadSpy.calledTwice.should.be.true('site url with different protocol didn\'t trigger parse'); + parseDocumentSpy.calledTwice.should.be.true('site url with different protocol didn\'t trigger parse'); // respects protocol when ignoreProtocol: false options.ignoreProtocol = false; rewiredFn('test)', url, options); - cheerioLoadSpy.calledTwice.should.be.true('site url with different protocol triggered parse when ignoreProtocol is false'); + parseDocumentSpy.calledTwice.should.be.true('site url with different protocol triggered parse when ignoreProtocol is false'); }); }); }); diff --git a/packages/url-utils/test/unit/utils/html-absolute-to-transform-ready.test.js b/packages/url-utils/test/unit/utils/html-absolute-to-transform-ready.test.js index 5fc7773ba..9e7e9ad1d 100644 --- a/packages/url-utils/test/unit/utils/html-absolute-to-transform-ready.test.js +++ b/packages/url-utils/test/unit/utils/html-absolute-to-transform-ready.test.js @@ -5,7 +5,7 @@ require('../../utils'); const rewire = require('rewire'); const sinon = require('sinon'); -const cheerio = require('cheerio'); +const htmlparser2 = require('htmlparser2'); const htmlTransformModule = rewire('../../../lib/utils/html-transform'); const htmlAbsToTRModule = rewire('../../../lib/utils/html-absolute-to-transform-ready'); htmlAbsToTRModule.__set__('html_transform_1', htmlTransformModule); @@ -348,46 +348,46 @@ describe('utils: htmlAbsoluteToTransformReady()', function () { }); describe('DOM parsing is skipped', function () { - let cheerioLoadSpy, cheerioRestore, rewiredFn; + let parseDocumentSpy, parseDocumentRestore, rewiredFn; before(function () { rewiredFn = htmlAbsToTRModule.default; }); beforeEach(function () { - const cheerioProxy = {load: (...args) => cheerio.load(...args)}; - cheerioLoadSpy = sinon.spy(cheerioProxy, 'load'); - cheerioRestore = htmlTransformModule.__set__('cheerio', cheerioProxy); + const htmlparser2Proxy = {parseDocument: (...args) => htmlparser2.parseDocument(...args)}; + parseDocumentSpy = sinon.spy(htmlparser2Proxy, 'parseDocument'); + parseDocumentRestore = htmlTransformModule.__set__('htmlparser2_1', htmlparser2Proxy); }); afterEach(function () { - cheerioLoadSpy.restore(); - cheerioRestore(); + parseDocumentSpy.restore(); + parseDocumentRestore(); }); it('when html has no absolute URLs matching siteUrl', function () { const url = 'http://my-ghost-blog.com/'; rewiredFn('', url, options); - cheerioLoadSpy.called.should.be.false('blank html triggered parse'); + parseDocumentSpy.called.should.be.false('blank html triggered parse'); rewiredFn('test', url, options); - cheerioLoadSpy.called.should.be.false('hash url triggered parse'); + parseDocumentSpy.called.should.be.false('hash url triggered parse'); rewiredFn('test)', url, options); - cheerioLoadSpy.called.should.be.false('external url triggered parse'); + parseDocumentSpy.called.should.be.false('external url triggered parse'); rewiredFn('test)', url, options); - cheerioLoadSpy.calledOnce.should.be.true('site url didn\'t trigger parse'); + parseDocumentSpy.calledOnce.should.be.true('site url didn\'t trigger parse'); // ignores protocol when ignoreProtocol: true rewiredFn('test)', url, options); - cheerioLoadSpy.calledTwice.should.be.true('site url with different protocol didn\'t trigger parse'); + parseDocumentSpy.calledTwice.should.be.true('site url with different protocol didn\'t trigger parse'); // respects protocol when ignoreProtocol: false options.ignoreProtocol = false; rewiredFn('test)', url, options); - cheerioLoadSpy.calledTwice.should.be.true('site url with different protocol triggered parse when ignoreProtocol is false'); + parseDocumentSpy.calledTwice.should.be.true('site url with different protocol triggered parse when ignoreProtocol is false'); }); it('when html contains CDN URLs, parsing is NOT skipped', function () { @@ -396,16 +396,16 @@ describe('utils: htmlAbsoluteToTransformReady()', function () { const mediaCdn = 'https://cdn.ghost.io/media'; const filesCdn = 'https://cdn.ghost.io/files'; - cheerioLoadSpy.resetHistory(); + parseDocumentSpy.resetHistory(); // HTML with ONLY image CDN URL should trigger parsing rewiredFn(``, url, { ...options, imageBaseUrl: imagesCdn }); - cheerioLoadSpy.calledOnce.should.be.true('image CDN URL didn\'t trigger parse'); + parseDocumentSpy.calledOnce.should.be.true('image CDN URL didn\'t trigger parse'); - cheerioLoadSpy.resetHistory(); + parseDocumentSpy.resetHistory(); // HTML with ONLY media CDN URL should trigger parsing rewiredFn(`