From d923ebf00f02603654514bfa4b26fe72d92c1157 Mon Sep 17 00:00:00 2001
From: Alexei Novikov <alexei.novikov@dzhaworks.com>
Date: Sat, 4 Jun 2016 19:40:36 +0200
Subject: [PATCH 1/5] It is possible to send a URL containing unescaped
 Unicode characters, which would make Readability.js crash. Proactively
 encoding the URL guards against this.

---
 server.js | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/server.js b/server.js
index 8358af6..4cd4e2c 100644
--- a/server.js
+++ b/server.js
@@ -36,7 +36,7 @@ app.get("/api", function(req, res) {
 });
 
 app.get("/api/get", function(req, res) {
-  var url = req.query.url,
+  var url = req.query.url && encodeURI(req.query.url), // keep a missing url falsy: encodeURI(undefined) === "undefined"
       sanitize = boolArg(req.query.sanitize),
       userAgent = req.query.userAgent;
   if (!url) {

From 9ffd34fd3d3b9349fcec1e54082da7658a958f26 Mon Sep 17 00:00:00 2001
From: Alexei Novikov <alexei.novikov@dzhaworks.com>
Date: Sat, 4 Jun 2016 19:53:22 +0200
Subject: [PATCH 2/5] As it is now, Readability.js seems to be unable to detect
 the text direction. This fixes the problem, provided that the CSS direction
 property is defined at the body tag level of a document.

---
 phantom-scrape.js | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/phantom-scrape.js b/phantom-scrape.js
index d1c1ce5..2261e90 100644
--- a/phantom-scrape.js
+++ b/phantom-scrape.js
@@ -23,6 +23,17 @@ function outputJSON(object) {
  */
 function runReadability(url, userAgent, pageContent) {
   var location = document.location;
+  var getDirection = function(document) {
+    var bodyTags = document.getElementsByTagName('body');
+    if (bodyTags.length === 0) {
+      return "";
+    }
+    if (window.getComputedStyle) {
+      return window.getComputedStyle(bodyTags[0], null).getPropertyValue('direction');
+    }
+    return "";
+  };
+
   var uri = {
     spec: location.href,
     host: location.host,
@@ -37,6 +48,9 @@ function runReadability(url, userAgent, pageContent) {
     if (result) {
       result.userAgent = userAgent;
       result.isProbablyReaderable = isProbablyReaderable;
+      if (result.dir === undefined || result === '') {
+        result.dir = getDirection(document);
+      }
     } else {
       result = {
         error: {

From b43762acedef5013c8350d0a17f04b1323817c2e Mon Sep 17 00:00:00 2001
From: Alexei Novikov <alexei.novikov@dzhaworks.com>
Date: Sat, 4 Jun 2016 20:14:27 +0200
Subject: [PATCH 3/5] In a use case of Readability.js being run on the server
 side, this service would probably be the only part that sees the original
 HTML document.  So it is important to read some crucial metadata here. In my
 case I need to know the document's language.

---
 README.md         |  2 ++
 phantom-scrape.js | 15 +++++++++++++++
 2 files changed, 17 insertions(+)

diff --git a/README.md b/README.md
index 3e1c7dc..f62aa44 100644
--- a/README.md
+++ b/README.md
@@ -65,6 +65,7 @@ Content sanitization enabled:
     {
       "byline":"Nicolas Perriault —",
       "content":"<p><strong>So finally you&#39;re <a href=\"https://nicolas.perriault.net/code/2013/testing-frontend-javascript-code-using-mocha-chai-and-sinon/\">testing",
+      "language": "en",
       "length":2867,
       "title":"Get your Frontend JavaScript Code Covered | Code",
       "uri":"https://nicolas.perriault.net/code/2013/get-your-frontend-javascript-code-covered/",
@@ -77,6 +78,7 @@ Content sanitization disabled (default):
     {
       "byline":"Nicolas Perriault —",
       "content":"<div id=\"readability-page-1\" class=\"page\"><section class=\"\">\n<p><strong>So finally you're…",
+      "language": "en",
       "length":3851,
       "title":"Get your Frontend JavaScript Code Covered | Code",
       "uri":"https://nicolas.perriault.net/code/2013/get-your-frontend-javascript-code-covered/",
diff --git a/phantom-scrape.js b/phantom-scrape.js
index 2261e90..6a1d519 100644
--- a/phantom-scrape.js
+++ b/phantom-scrape.js
@@ -23,6 +23,20 @@ function outputJSON(object) {
  */
 function runReadability(url, userAgent, pageContent) {
   var location = document.location;
+  var getLanguage = function(document) { // language from <html lang>, else <meta name="language">, else ""
+    var lang = document.documentElement.lang;
+    if (lang) { // .lang is "" (never undefined) when the attribute is absent, so test truthiness
+      return lang;
+    }
+    var metas = document.getElementsByTagName('meta');
+    for (var i=0; i<metas.length; i++) {
+      if (metas[i].getAttribute("name") === "language") {
+        return metas[i].getAttribute("content") || ""; // getAttribute yields null without a content attribute
+      }
+    }
+    return "";
+  };
+
   var getDirection = function(document) {
     var bodyTags = document.getElementsByTagName('body');
     if (bodyTags.length === 0) {
@@ -48,6 +62,7 @@ function runReadability(url, userAgent, pageContent) {
     if (result) {
       result.userAgent = userAgent;
       result.isProbablyReaderable = isProbablyReaderable;
+      result.language = getLanguage(document);
       if (result.dir === undefined || result === '') {
         result.dir = getDirection(document);
       }

From 6ed23a21f7f35a1145342d9aed26fb7e0bff93dd Mon Sep 17 00:00:00 2001
From: Alexei Novikov <alexei.novikov@dzhaworks.com>
Date: Sat, 4 Jun 2016 22:02:14 +0200
Subject: [PATCH 4/5] Added IntelliJ-specific ignores

---
 .gitignore | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.gitignore b/.gitignore
index 93f1361..bdba0b6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,4 @@
 node_modules
 npm-debug.log
+.idea
+*.iml
\ No newline at end of file

From 277e21a1a0812d0b485f4d0dc294d7687227093e Mon Sep 17 00:00:00 2001
From: Alexei Novikov <alexei.novikov@dzhaworks.com>
Date: Mon, 6 Jun 2016 17:24:57 +0200
Subject: [PATCH 5/5] A one-second request timeout is not enough on some pages.
 Making it three seconds.

---
 phantom-scrape.js | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/phantom-scrape.js b/phantom-scrape.js
index 6a1d519..232dcec 100644
--- a/phantom-scrape.js
+++ b/phantom-scrape.js
@@ -101,7 +101,7 @@ if (userAgent) {
 page.settings.loadImages = false;
 
 // ensure we don't waste time trying to load slow/missing resources
-page.settings.resourceTimeout = 1000;
+page.settings.resourceTimeout = 3000;
 
 page.onConsoleMessage = function(msg) {
   consoleLogs.push(msg);