Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion packages/url-utils/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,9 @@
"typescript": "5.9.3"
},
"dependencies": {
"cheerio": "1.2.0",
"domhandler": "5.0.3",
"domutils": "3.2.2",
"htmlparser2": "10.1.0",
"lodash": "4.17.23",
"moment": "2.30.1",
"moment-timezone": "0.6.0",
Expand Down
47 changes: 32 additions & 15 deletions packages/url-utils/src/utils/html-transform.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import type {AnyNode} from 'domhandler';
import type {Element, AnyNode} from 'domhandler';
import {parseDocument} from 'htmlparser2';
import {findAll, getAttributeValue, hasAttrib} from 'domutils';
import type {HtmlTransformOptions, HtmlTransformOptionsInput, UrlTransformFunction} from './types';
// eslint-disable-next-line @typescript-eslint/no-require-imports
const cheerio = require('cheerio');

export const transformAttributes = [
'href',
Expand Down Expand Up @@ -55,7 +55,7 @@ function htmlTransform(
return html;
}

const htmlContent = cheerio.load(html, {decodeEntities: false});
const dom = parseDocument(html, {decodeEntities: false});

// replacements is keyed with the attr name + original relative value so
// that we can implement skips for untouchable urls
Expand Down Expand Up @@ -85,22 +85,39 @@ function htmlTransform(
replacements[key].push(replacement);
}

transformAttributes.forEach((attributeName: string) => {
htmlContent('[' + attributeName + ']').each((ix: number, el: AnyNode) => {
/**
 * Reports whether `el` has an ancestor whose tag name is `code`.
 *
 * The walk starts at `el.parent`, so `el` itself is never tested, and it
 * stops at the first node whose `parent` is falsy.
 *
 * @param el - Node whose ancestor chain is inspected.
 * @returns `true` if any ancestor exposes `name === 'code'`, otherwise `false`.
 */
function isInsideCode(el: AnyNode): boolean {
    for (let ancestor: AnyNode | null = el.parent; ancestor; ancestor = ancestor.parent) {
        // Only nodes that carry a `name` property can match; others are stepped over.
        const isCodeElement = 'name' in ancestor && ancestor.name === 'code';
        if (isCodeElement) {
            return true;
        }
    }

    return false;
}

const elements = findAll((el: Element) => {
return transformAttributes.some(attr => hasAttrib(el, attr));
}, dom.childNodes);

for (const el of elements) {
for (const attributeName of transformAttributes) {
if (!hasAttrib(el, attributeName)) {
continue;
}

const originalValue = getAttributeValue(el, attributeName) || '';

// ignore <stream> elems and html inside of <code> elements
const elementName = 'name' in el ? el.name : null;
if (elementName === 'stream' || htmlContent(el).closest('code').length) {
if (el.name === 'stream' || isInsideCode(el)) {
addReplacement({
name: attributeName,
originalValue: htmlContent(el).attr(attributeName),
originalValue,
skip: true
});
return;
continue;
}

const elWrapper = htmlContent(el);
const originalValue = elWrapper.attr(attributeName);

if (attributeName === 'srcset' || attributeName === 'style') {
let urls: string[];

Expand Down Expand Up @@ -137,8 +154,8 @@ function htmlTransform(
});
}
}
});
});
}
}

// Loop over all replacements and use a regex to replace urls in the original html string.
// Allows indentation and formatting to be kept compared to using DOM manipulation and render
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ require('../../utils');
const sinon = require('sinon');
const rewire = require('rewire');

const cheerio = require('cheerio');
const htmlparser2 = require('htmlparser2');
const htmlTransformModule = rewire('../../../lib/utils/html-transform');
const htmlAbsToRelModule = rewire('../../../lib/utils/html-absolute-to-relative');
htmlAbsToRelModule.__set__('html_transform_1', htmlTransformModule);
Expand Down Expand Up @@ -193,46 +193,46 @@ describe('utils: htmlAbsoluteToRelative()', function () {
});

describe('DOM parsing is skipped', function () {
let cheerioLoadSpy, cheerioRestore, rewiredFn;
let parseDocumentSpy, parseDocumentRestore, rewiredFn;

before(function () {
rewiredFn = htmlAbsToRelModule.default;
});

beforeEach(function () {
const cheerioProxy = {load: (...args) => cheerio.load(...args)};
cheerioLoadSpy = sinon.spy(cheerioProxy, 'load');
cheerioRestore = htmlTransformModule.__set__('cheerio', cheerioProxy);
const htmlparser2Proxy = {parseDocument: (...args) => htmlparser2.parseDocument(...args)};
parseDocumentSpy = sinon.spy(htmlparser2Proxy, 'parseDocument');
parseDocumentRestore = htmlTransformModule.__set__('htmlparser2_1', htmlparser2Proxy);
});

afterEach(function () {
cheerioLoadSpy.restore();
cheerioRestore();
parseDocumentSpy.restore();
parseDocumentRestore();
});

it('when html has no absolute URLs matching siteUrl', function () {
const url = 'http://my-ghost-blog.com/';

rewiredFn('', url, options);
cheerioLoadSpy.called.should.be.false('blank html triggered parse');
parseDocumentSpy.called.should.be.false('blank html triggered parse');

rewiredFn('<a href="#test">test</a>', url, options);
cheerioLoadSpy.called.should.be.false('hash url triggered parse');
parseDocumentSpy.called.should.be.false('hash url triggered parse');

rewiredFn('<a href="https://example.com">test</a>)', url, options);
cheerioLoadSpy.called.should.be.false('external url triggered parse');
parseDocumentSpy.called.should.be.false('external url triggered parse');

rewiredFn('<a href="http://my-ghost-blog.com">test</a>)', url, options);
cheerioLoadSpy.calledOnce.should.be.true('site url didn\'t trigger parse');
parseDocumentSpy.calledOnce.should.be.true('site url didn\'t trigger parse');

// ignores protocol when ignoreProtocol: true
rewiredFn('<a href="https://my-ghost-blog.com">test</a>)', url, options);
cheerioLoadSpy.calledTwice.should.be.true('site url with different protocol didn\'t trigger parse');
parseDocumentSpy.calledTwice.should.be.true('site url with different protocol didn\'t trigger parse');

// respects protocol when ignoreProtocol: false
options.ignoreProtocol = false;
rewiredFn('<a href="https://my-ghost-blog.com">test</a>)', url, options);
cheerioLoadSpy.calledTwice.should.be.true('site url with different protocol triggered parse when ignoreProtocol is false');
parseDocumentSpy.calledTwice.should.be.true('site url with different protocol triggered parse when ignoreProtocol is false');
});
});
});
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ require('../../utils');
const rewire = require('rewire');
const sinon = require('sinon');

const cheerio = require('cheerio');
const htmlparser2 = require('htmlparser2');
const htmlTransformModule = rewire('../../../lib/utils/html-transform');
const htmlAbsToTRModule = rewire('../../../lib/utils/html-absolute-to-transform-ready');
htmlAbsToTRModule.__set__('html_transform_1', htmlTransformModule);
Expand Down Expand Up @@ -348,46 +348,46 @@ describe('utils: htmlAbsoluteToTransformReady()', function () {
});

describe('DOM parsing is skipped', function () {
let cheerioLoadSpy, cheerioRestore, rewiredFn;
let parseDocumentSpy, parseDocumentRestore, rewiredFn;

before(function () {
rewiredFn = htmlAbsToTRModule.default;
});

beforeEach(function () {
const cheerioProxy = {load: (...args) => cheerio.load(...args)};
cheerioLoadSpy = sinon.spy(cheerioProxy, 'load');
cheerioRestore = htmlTransformModule.__set__('cheerio', cheerioProxy);
const htmlparser2Proxy = {parseDocument: (...args) => htmlparser2.parseDocument(...args)};
parseDocumentSpy = sinon.spy(htmlparser2Proxy, 'parseDocument');
parseDocumentRestore = htmlTransformModule.__set__('htmlparser2_1', htmlparser2Proxy);
});

afterEach(function () {
cheerioLoadSpy.restore();
cheerioRestore();
parseDocumentSpy.restore();
parseDocumentRestore();
});

it('when html has no absolute URLs matching siteUrl', function () {
const url = 'http://my-ghost-blog.com/';

rewiredFn('', url, options);
cheerioLoadSpy.called.should.be.false('blank html triggered parse');
parseDocumentSpy.called.should.be.false('blank html triggered parse');

rewiredFn('<a href="#test">test</a>', url, options);
cheerioLoadSpy.called.should.be.false('hash url triggered parse');
parseDocumentSpy.called.should.be.false('hash url triggered parse');

rewiredFn('<a href="https://example.com">test</a>)', url, options);
cheerioLoadSpy.called.should.be.false('external url triggered parse');
parseDocumentSpy.called.should.be.false('external url triggered parse');

rewiredFn('<a href="http://my-ghost-blog.com">test</a>)', url, options);
cheerioLoadSpy.calledOnce.should.be.true('site url didn\'t trigger parse');
parseDocumentSpy.calledOnce.should.be.true('site url didn\'t trigger parse');

// ignores protocol when ignoreProtocol: true
rewiredFn('<a href="https://my-ghost-blog.com">test</a>)', url, options);
cheerioLoadSpy.calledTwice.should.be.true('site url with different protocol didn\'t trigger parse');
parseDocumentSpy.calledTwice.should.be.true('site url with different protocol didn\'t trigger parse');

// respects protocol when ignoreProtocol: false
options.ignoreProtocol = false;
rewiredFn('<a href="https://my-ghost-blog.com">test</a>)', url, options);
cheerioLoadSpy.calledTwice.should.be.true('site url with different protocol triggered parse when ignoreProtocol is false');
parseDocumentSpy.calledTwice.should.be.true('site url with different protocol triggered parse when ignoreProtocol is false');
});

it('when html contains CDN URLs, parsing is NOT skipped', function () {
Expand All @@ -396,36 +396,36 @@ describe('utils: htmlAbsoluteToTransformReady()', function () {
const mediaCdn = 'https://cdn.ghost.io/media';
const filesCdn = 'https://cdn.ghost.io/files';

cheerioLoadSpy.resetHistory();
parseDocumentSpy.resetHistory();

// HTML with ONLY image CDN URL should trigger parsing
rewiredFn(`<img src="${imagesCdn}/content/images/photo.jpg">`, url, {
...options,
imageBaseUrl: imagesCdn
});
cheerioLoadSpy.calledOnce.should.be.true('image CDN URL didn\'t trigger parse');
parseDocumentSpy.calledOnce.should.be.true('image CDN URL didn\'t trigger parse');

cheerioLoadSpy.resetHistory();
parseDocumentSpy.resetHistory();

// HTML with ONLY media CDN URL should trigger parsing
rewiredFn(`<video src="${mediaCdn}/content/media/video.mp4">`, url, {
...options,
staticMediaUrlPrefix: 'content/media',
mediaBaseUrl: mediaCdn
});
cheerioLoadSpy.calledOnce.should.be.true('media CDN URL didn\'t trigger parse');
parseDocumentSpy.calledOnce.should.be.true('media CDN URL didn\'t trigger parse');

cheerioLoadSpy.resetHistory();
parseDocumentSpy.resetHistory();

// HTML with ONLY files CDN URL should trigger parsing
rewiredFn(`<a href="${filesCdn}/content/files/doc.pdf">Download</a>`, url, {
...options,
staticFilesUrlPrefix: 'content/files',
filesBaseUrl: filesCdn
});
cheerioLoadSpy.calledOnce.should.be.true('files CDN URL didn\'t trigger parse');
parseDocumentSpy.calledOnce.should.be.true('files CDN URL didn\'t trigger parse');

cheerioLoadSpy.resetHistory();
parseDocumentSpy.resetHistory();

// HTML with multiple CDN URLs but no site URL should trigger parsing
rewiredFn(`
Expand All @@ -437,21 +437,21 @@ describe('utils: htmlAbsoluteToTransformReady()', function () {
imageBaseUrl: imagesCdn,
mediaBaseUrl: mediaCdn
});
cheerioLoadSpy.calledOnce.should.be.true('multiple CDN URLs didn\'t trigger parse');
parseDocumentSpy.calledOnce.should.be.true('multiple CDN URLs didn\'t trigger parse');
});

it('when html has no matching URLs (no site or CDN), parsing is skipped', function () {
const url = 'http://my-ghost-blog.com/';
const imagesCdn = 'https://cdn.ghost.io/images';

cheerioLoadSpy.resetHistory();
parseDocumentSpy.resetHistory();

// External URL with CDN configured should not trigger parsing
rewiredFn('<a href="https://example.com">test</a>', url, {
...options,
imageBaseUrl: imagesCdn
});
cheerioLoadSpy.called.should.be.false('external url triggered parse even with CDN configured');
parseDocumentSpy.called.should.be.false('external url triggered parse even with CDN configured');
});
});
});
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ require('../../utils');
const sinon = require('sinon');
const rewire = require('rewire');

const cheerio = require('cheerio');
const htmlparser2 = require('htmlparser2');
const htmlTransformModule = rewire('../../../lib/utils/html-transform');
const htmlRelToAbsModule = rewire('../../../lib/utils/html-relative-to-absolute');
htmlRelToAbsModule.__set__('html_transform_1', htmlTransformModule);
Expand Down Expand Up @@ -252,47 +252,47 @@ describe('utils: htmlRelativeToAbsolute()', function () {
});

describe('DOM parsing is skipped', function () {
let cheerioLoadSpy, cheerioRestore, rewiredFn;
let parseDocumentSpy, parseDocumentRestore, rewiredFn;

before(function () {
rewiredFn = htmlRelToAbsModule.default;
});

beforeEach(function () {
const cheerioProxy = {load: (...args) => cheerio.load(...args)};
cheerioLoadSpy = sinon.spy(cheerioProxy, 'load');
cheerioRestore = htmlTransformModule.__set__('cheerio', cheerioProxy);
const htmlparser2Proxy = {parseDocument: (...args) => htmlparser2.parseDocument(...args)};
parseDocumentSpy = sinon.spy(htmlparser2Proxy, 'parseDocument');
parseDocumentRestore = htmlTransformModule.__set__('htmlparser2_1', htmlparser2Proxy);
});

afterEach(function () {
cheerioLoadSpy.restore();
cheerioRestore();
parseDocumentSpy.restore();
parseDocumentRestore();
});

it('when html has no attributes that would be transformed', function () {
const url = 'http://my-ghost-blog.com/';

rewiredFn('', url, itemPath, options);
cheerioLoadSpy.called.should.be.false('blank html triggered parse');
parseDocumentSpy.called.should.be.false('blank html triggered parse');

rewiredFn('<p>HTML without links</p>', url, itemPath, options);
cheerioLoadSpy.called.should.be.false('html with no links triggered parse');
parseDocumentSpy.called.should.be.false('html with no links triggered parse');

rewiredFn('<a href="#test">test</a>', url, itemPath, options);
cheerioLoadSpy.callCount.should.equal(1, 'href didn\'t trigger parse');
parseDocumentSpy.callCount.should.equal(1, 'href didn\'t trigger parse');

rewiredFn('<img src="/image.png">', url, itemPath, options);
cheerioLoadSpy.callCount.should.equal(2, 'src didn\'t trigger parse');
parseDocumentSpy.callCount.should.equal(2, 'src didn\'t trigger parse');

rewiredFn('<img srcset="/image-4x.png 4x, /image-2x.png 2x">)', url, itemPath, options);
cheerioLoadSpy.callCount.should.equal(3, 'srcset didn\'t trigger parse');
parseDocumentSpy.callCount.should.equal(3, 'srcset didn\'t trigger parse');

rewiredFn('<div style="background-image: url(\'/image.png\')"></div>', url, itemPath, options);
cheerioLoadSpy.callCount.should.equal(4, 'style didn\'t trigger parse');
parseDocumentSpy.callCount.should.equal(4, 'style didn\'t trigger parse');

options.assetsOnly = true;
rewiredFn('<a href="/my-post/">post</a>', url, itemPath, options);
cheerioLoadSpy.callCount.should.equal(4, 'href triggered parse when no url matches asset path');
parseDocumentSpy.callCount.should.equal(4, 'href triggered parse when no url matches asset path');
});
});
});
Loading
Loading