From d923ebf00f02603654514bfa4b26fe72d92c1157 Mon Sep 17 00:00:00 2001 From: Alexei Novikov Date: Sat, 4 Jun 2016 19:40:36 +0200 Subject: [PATCH 1/5] It is possible to send a URL containing unescaped Unicode characters, which would make Readability.js crash. Proactively encoding the URL guards against this. A missing or empty url parameter is passed through unencoded, because encodeURI(undefined) would yield the truthy string "undefined" and defeat the missing-URL check that follows. --- server.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server.js b/server.js index 8358af6..4cd4e2c 100644 --- a/server.js +++ b/server.js @@ -36,7 +36,7 @@ app.get("/api", function(req, res) { }); app.get("/api/get", function(req, res) { - var url = req.query.url, + var url = req.query.url && encodeURI(req.query.url), sanitize = boolArg(req.query.sanitize), userAgent = req.query.userAgent; if (!url) { From 9ffd34fd3d3b9349fcec1e54082da7658a958f26 Mon Sep 17 00:00:00 2001 From: Alexei Novikov Date: Sat, 4 Jun 2016 19:53:22 +0200 Subject: [PATCH 2/5] As it is now, Readability.js seems to be unable to detect the text direction. This is a fix to the problem, given that the direction CSS property is defined at the body tag level of a document. 
--- phantom-scrape.js | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/phantom-scrape.js b/phantom-scrape.js index d1c1ce5..2261e90 100644 --- a/phantom-scrape.js +++ b/phantom-scrape.js @@ -23,6 +23,17 @@ function outputJSON(object) { */ function runReadability(url, userAgent, pageContent) { var location = document.location; + var getDirection = function(document) { + var bodyTags = document.getElementsByTagName('body'); + if (bodyTags.length === 0) { + return ""; + } + if (window.getComputedStyle) { + return window.getComputedStyle(bodyTags[0], null).getPropertyValue('direction'); + } + return ""; + }; + var uri = { spec: location.href, host: location.host, @@ -37,6 +48,9 @@ function runReadability(url, userAgent, pageContent) { if (result) { result.userAgent = userAgent; result.isProbablyReaderable = isProbablyReaderable; + if (result.dir === undefined || result.dir === '') { + result.dir = getDirection(document); + } } else { result = { error: { From b43762acedef5013c8350d0a17f04b1323817c2e Mon Sep 17 00:00:00 2001 From: Alexei Novikov Date: Sat, 4 Jun 2016 20:14:27 +0200 Subject: [PATCH 3/5] In a use case of Readability.js being run by the server side, this service would probably be the only part that sees the original HTML document. So it is important to read some crucial meta-data here. In my case I need to know the document's language. --- README.md | 2 ++ phantom-scrape.js | 15 +++++++++++++++ 2 files changed, 17 insertions(+) diff --git a/README.md b/README.md index 3e1c7dc..f62aa44 100644 --- a/README.md +++ b/README.md @@ -65,6 +65,7 @@ Content sanitization enabled: { "byline":"Nicolas Perriault —", "content":"

So finally you're testing", + "language": "en", "length":2867, "title":"Get your Frontend JavaScript Code Covered | Code", "uri":"https://nicolas.perriault.net/code/2013/get-your-frontend-javascript-code-covered/", @@ -77,6 +78,7 @@ Content sanitization disabled (default): { "byline":"Nicolas Perriault —", "content":"

\n

So finally you're…", + "language": "en", "length":3851, "title":"Get your Frontend JavaScript Code Covered | Code", "uri":"https://nicolas.perriault.net/code/2013/get-your-frontend-javascript-code-covered/", diff --git a/phantom-scrape.js b/phantom-scrape.js index 2261e90..6a1d519 100644 --- a/phantom-scrape.js +++ b/phantom-scrape.js @@ -23,6 +23,20 @@ function outputJSON(object) { */ function runReadability(url, userAgent, pageContent) { var location = document.location; + var getLanguage = function(document) { + var lang = document.documentElement.lang; + if (lang !== undefined) { + return lang; + } + var metas = document.getElementsByTagName('meta'); + for (var i=0; i Date: Sat, 4 Jun 2016 22:02:14 +0200 Subject: [PATCH 4/5] Added IntelliJ-specific ignores --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 93f1361..bdba0b6 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,4 @@ node_modules npm-debug.log +.idea +*.iml \ No newline at end of file From 277e21a1a0812d0b485f4d0dc294d7687227093e Mon Sep 17 00:00:00 2001 From: Alexei Novikov Date: Mon, 6 Jun 2016 17:24:57 +0200 Subject: [PATCH 5/5] One second request timeout is not enough on some pages. Making it three secs. --- phantom-scrape.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/phantom-scrape.js b/phantom-scrape.js index 6a1d519..232dcec 100644 --- a/phantom-scrape.js +++ b/phantom-scrape.js @@ -101,7 +101,7 @@ if (userAgent) { page.settings.loadImages = false; // ensure we don't waste time trying to load slow/missing resources -page.settings.resourceTimeout = 1000; +page.settings.resourceTimeout = 3000; page.onConsoleMessage = function(msg) { consoleLogs.push(msg);