From dbd38f5bb0b4605b79479126ec2450f958401dbe Mon Sep 17 00:00:00 2001 From: Filioglo Alexandr Date: Wed, 15 Jul 2020 17:45:06 +0300 Subject: [PATCH 1/6] Add MaxDocumentLength and custom UserAgent support --- goscraper.go | 50 ++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 46 insertions(+), 4 deletions(-) diff --git a/goscraper.go b/goscraper.go index abbe60e..b6e3b57 100644 --- a/goscraper.go +++ b/goscraper.go @@ -2,6 +2,7 @@ package goscraper import ( "bytes" + "errors" "fmt" "io" "net/http" @@ -18,10 +19,16 @@ var ( fragmentRegexp = regexp.MustCompile("#!(.*)") ) +type ScraperOptions struct { + MaxDocumentLength int64 + UserAgent string +} + type Scraper struct { Url *url.URL EscapedFragmentUrl *url.URL MaxRedirect int + Options ScraperOptions } type Document struct { @@ -38,12 +45,12 @@ type DocumentPreview struct { Link string } -func Scrape(uri string, maxRedirect int) (*Document, error) { +func Scrape(uri string, maxRedirect int, options ScraperOptions) (*Document, error) { u, err := url.Parse(uri) if err != nil { return nil, err } - return (&Scraper{Url: u, MaxRedirect: maxRedirect}).Scrape() + return (&Scraper{Url: u, MaxRedirect: maxRedirect, Options: options}).Scrape() } func (scraper *Scraper) Scrape() (*Document, error) { @@ -109,6 +116,16 @@ func (scraper *Scraper) toFragmentUrl() error { } func (scraper *Scraper) getDocument() (*Document, error) { + addUserAgent := func(req *http.Request) *http.Request { + userAgent := "GoScraper" + if len(scraper.Options.UserAgent) != 0 { + userAgent = scraper.Options.UserAgent + } + req.Header.Add("User-Agent", userAgent) + + return req + } + scraper.MaxRedirect -= 1 if strings.Contains(scraper.Url.String(), "#!") { scraper.toFragmentUrl() @@ -117,11 +134,31 @@ func (scraper *Scraper) getDocument() (*Document, error) { scraper.EscapedFragmentUrl = scraper.Url } + if scraper.Options.MaxDocumentLength > 0 { + // We try first to check content length (if it's present) - and if isn't - already limit by body size + req, err := http.NewRequest("HEAD", scraper.getUrl(), nil) + if err != nil { + return nil, err + } + req = addUserAgent(req) + + resp, err := http.DefaultClient.Do(req) + if resp != nil { + defer resp.Body.Close() + } + if err != nil { + return nil, err + } + if resp.ContentLength > scraper.Options.MaxDocumentLength { + return nil, errors.New("Content-Length exceed limits") + } + } + req, err := http.NewRequest("GET", scraper.getUrl(), nil) if err != nil { return nil, err } - req.Header.Add("User-Agent", "GoScraper") + req = addUserAgent(req) resp, err := http.DefaultClient.Do(req) if resp != nil { @@ -135,6 +172,11 @@ func (scraper *Scraper) getDocument() (*Document, error) { scraper.EscapedFragmentUrl = nil scraper.Url = resp.Request.URL } + + if scraper.Options.MaxDocumentLength > 0 { + resp.Body = http.MaxBytesReader(nil, resp.Body, scraper.Options.MaxDocumentLength) + } + b, err := convertUTF8(resp.Body, resp.Header.Get("content-type")) if err != nil { return nil, err @@ -197,7 +239,7 @@ func (scraper *Scraper) parseDocument(doc *Document) error { if cleanStr(attr.Key) == "rel" && cleanStr(attr.Val) == "canonical" { canonical = true } - if cleanStr(attr.Key) == "rel" && strings.Contains(cleanStr(attr.Val), "icon") { + if cleanStr(attr.Key) == "rel" && strings.Contains(cleanStr(attr.Val), "icon") { hasIcon = true } if cleanStr(attr.Key) == "href" { From c5a417ac61c90cfa97aecac9d99d2fb20b825259 Mon Sep 17 00:00:00 2001 From: Alexandr Filioglo Date: Fri, 7 Aug 2020 13:18:03 +0300 Subject: [PATCH 2/6] Fixes and 
improvements (#1)

- added support for og:type
- fixed incorrect handling of relative paths
- made the Content-Length check in HEAD requests "silent" (non-fatal)
---
 goscraper.go | 33 +++++++++++++++++++--------------
 1 file changed, 19 insertions(+), 14 deletions(-)

diff --git a/goscraper.go b/goscraper.go
index b6e3b57..a50b64e 100644
--- a/goscraper.go
+++ b/goscraper.go
@@ -41,6 +41,7 @@ type DocumentPreview struct {
 	Name        string
 	Title       string
 	Description string
+	Type        string
 	Images      []string
 	Link        string
 }
@@ -137,20 +138,18 @@ func (scraper *Scraper) getDocument() (*Document, error) {
 	if scraper.Options.MaxDocumentLength > 0 {
 		// We try first to check content length (if it's present) - and if isn't - already limit by body size
 		req, err := http.NewRequest("HEAD", scraper.getUrl(), nil)
-		if err != nil {
-			return nil, err
-		}
-		req = addUserAgent(req)
+		if err == nil {
+			req = addUserAgent(req)
 
-		resp, err := http.DefaultClient.Do(req)
-		if resp != nil {
-			defer resp.Body.Close()
-		}
-		if err != nil {
-			return nil, err
-		}
-		if resp.ContentLength > scraper.Options.MaxDocumentLength {
-			return nil, errors.New("Content-Length exceed limits")
+			resp, err := http.DefaultClient.Do(req)
+			if resp != nil {
+				defer resp.Body.Close()
+			}
+			if err == nil {
+				if resp.ContentLength > scraper.Options.MaxDocumentLength {
+					return nil, errors.New("Content-Length exceed limits")
+				}
+			}
 		}
 	}
 
@@ -280,6 +279,8 @@ func (scraper *Scraper) parseDocument(doc *Document) error {
 					doc.Preview.Name = content
 				case "og:title":
 					doc.Preview.Title = content
+				case "og:type":
+					doc.Preview.Type = content
 				case "og:description":
 					doc.Preview.Description = content
 				case "description":
@@ -322,7 +323,11 @@ func (scraper *Scraper) parseDocument(doc *Document) error {
 						return err
 					}
 					if !imgUrl.IsAbs() {
-						doc.Preview.Images = append(doc.Preview.Images, fmt.Sprintf("%s://%s%s", scraper.Url.Scheme, scraper.Url.Host, imgUrl.Path))
+						if string(imgUrl.Path[0]) == "/" {
+							doc.Preview.Images = append(doc.Preview.Images, fmt.Sprintf("%s://%s%s", scraper.Url.Scheme, scraper.Url.Host, imgUrl.Path))
+						} else {
+							doc.Preview.Images = append(doc.Preview.Images, fmt.Sprintf("%s://%s/%s", scraper.Url.Scheme, scraper.Url.Host, imgUrl.Path))
+						}
 					} else {
 						doc.Preview.Images = append(doc.Preview.Images, attr.Val)
 					}

From 5044043eb93a4cb78ee2345bdfa1f7121c88aa5a Mon Sep 17 00:00:00 2001
From: Alexandr Filioglo
Date: Fri, 11 Sep 2020 17:26:50 +0300
Subject: [PATCH 3/6] Added support for response headers & added builder.
Split `Scrape` into 2 different functions `GetDocument` and `ParseDocument` (#2) --- goscraper.go | 84 ++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 81 insertions(+), 3 deletions(-) diff --git a/goscraper.go b/goscraper.go index a50b64e..868fc28 100644 --- a/goscraper.go +++ b/goscraper.go @@ -19,6 +19,63 @@ var ( fragmentRegexp = regexp.MustCompile("#!(.*)") ) +type scrapeSettings struct { + userAgent string + maxDocumentLength int64 + url string + maxRedirect int +} + +type ScrapeBuilder interface { + SetUserAgent(string) ScrapeBuilder + SetMaxDocumentLength(int64) ScrapeBuilder + SetUrl(string) ScrapeBuilder + SetMaxRedirect(int) ScrapeBuilder + Build() (*Scraper, error) +} + +type scrapeBuilder struct { + scrapeSettings scrapeSettings +} + +func (b *scrapeBuilder) Build() (*Scraper, error) { + u, err := url.Parse(b.scrapeSettings.url) + if err != nil { + return nil, err + } + return &Scraper{ + Url: u, + MaxRedirect: b.scrapeSettings.maxRedirect, + Options: ScraperOptions{ + MaxDocumentLength: b.scrapeSettings.maxDocumentLength, + UserAgent: b.scrapeSettings.userAgent, + }}, nil +} + +func (b *scrapeBuilder) SetUrl(s string) ScrapeBuilder { + b.scrapeSettings.url = s + return b +} + +func (b *scrapeBuilder) SetMaxRedirect(i int) ScrapeBuilder { + b.scrapeSettings.maxRedirect = i + return b +} + +func (b *scrapeBuilder) SetMaxDocumentLength(maxDocLength int64) ScrapeBuilder { + b.scrapeSettings.maxDocumentLength = maxDocLength + return b +} + +func (b *scrapeBuilder) SetUserAgent(s string) ScrapeBuilder { + b.scrapeSettings.userAgent = s + return b +} + +func NewScrapeBuilder() ScrapeBuilder { + return &scrapeBuilder{scrapeSettings{userAgent: "GoScraper"}} +} + type ScraperOptions struct { MaxDocumentLength int64 UserAgent string @@ -32,8 +89,13 @@ type Scraper struct { } type Document struct { - Body bytes.Buffer - Preview DocumentPreview + Body bytes.Buffer + Preview DocumentPreview + ResHeader ResHeaders +} + +type ResHeaders struct { + ContentType string } type DocumentPreview struct { @@ -66,6 +128,18 @@ func (scraper *Scraper) Scrape() (*Document, error) { return doc, nil } +func (scraper *Scraper) GetDocument() (*Document, error) { + return scraper.getDocument() +} + +func (scraper *Scraper) ParseDocument(doc *Document) (*Document, error) { + err := scraper.parseDocument(doc) + if err != nil { + return nil, err + } + return doc, nil +} + func (scraper *Scraper) getUrl() string { if scraper.EscapedFragmentUrl != nil { return scraper.EscapedFragmentUrl.String() @@ -180,7 +254,11 @@ func (scraper *Scraper) getDocument() (*Document, error) { if err != nil { return nil, err } - doc := &Document{Body: b, Preview: DocumentPreview{Link: scraper.Url.String()}} + doc := &Document{ + Body: b, + Preview: DocumentPreview{Link: scraper.Url.String()}, + ResHeader: ResHeaders{ContentType: resp.Header.Get("content-type")}, + } return doc, nil } From afaa757a65510e19e4b0182d174bce424c897226 Mon Sep 17 00:00:00 2001 From: Alexandr Filioglo Date: Tue, 20 Oct 2020 16:43:12 +0300 Subject: [PATCH 4/6] Added Scrape service interface --- .gitignore | 105 +++++++++++++++++++++++++++++++++++++++++++++++++++ goscraper.go | 14 +++++-- 2 files changed, 116 insertions(+), 3 deletions(-) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..f3f7ad1 --- /dev/null +++ b/.gitignore @@ -0,0 +1,105 @@ +# Created by .ignore support plugin (hsz.mobi) +### VisualStudioCode template +.vscode/* +!.vscode/settings.json +!.vscode/tasks.json 
+!.vscode/launch.json
+!.vscode/extensions.json
+*.code-workspace
+
+# Local History for Visual Studio Code
+.history/
+
+### JetBrains template
+# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
+# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
+
+# User-specific stuff
+.idea/**/workspace.xml
+.idea/**/tasks.xml
+.idea/**/usage.statistics.xml
+.idea/**/dictionaries
+.idea/**/shelf
+.idea
+
+
+# Generated files
+.idea/**/contentModel.xml
+
+# Sensitive or high-churn files
+.idea/**/dataSources/
+.idea/**/dataSources.ids
+.idea/**/dataSources.local.xml
+.idea/**/sqlDataSources.xml
+.idea/**/dynamic.xml
+.idea/**/uiDesigner.xml
+.idea/**/dbnavigator.xml
+
+# Gradle
+.idea/**/gradle.xml
+.idea/**/libraries
+
+# Gradle and Maven with auto-import
+# When using Gradle or Maven with auto-import, you should exclude module files,
+# since they will be recreated, and may cause churn. Uncomment if using
+# auto-import.
+# .idea/artifacts
+# .idea/compiler.xml
+# .idea/jarRepositories.xml
+# .idea/modules.xml
+# .idea/*.iml
+# .idea/modules
+# *.iml
+# *.ipr
+
+# CMake
+cmake-build-*/
+
+# Mongo Explorer plugin
+.idea/**/mongoSettings.xml
+
+# File-based project format
+*.iws
+
+# IntelliJ
+out/
+
+# mpeltonen/sbt-idea plugin
+.idea_modules/
+
+# JIRA plugin
+atlassian-ide-plugin.xml
+
+# Cursive Clojure plugin
+.idea/replstate.xml
+
+# Crashlytics plugin (for Android Studio and IntelliJ)
+com_crashlytics_export_strings.xml
+crashlytics.properties
+crashlytics-build.properties
+fabric.properties
+
+# Editor-based Rest Client
+.idea/httpRequests
+
+# Android studio 3.1+ serialized cache file
+.idea/caches/build_file_checksums.ser
+
+### Go template
+# Binaries for programs and plugins
+*.exe
+*.exe~
+*.dll
+*.so
+*.dylib
+
+# Test binary, built with `go test -c`
+*.test
+
+# Output of the go coverage tool, specifically when used with LiteIDE
+*.out
+
+# Dependency directories (remove the comment below to include it)
+# vendor/
+
+tests/
diff --git a/goscraper.go b/goscraper.go
index 868fc28..c8871f8 100644
--- a/goscraper.go
+++ b/goscraper.go
@@ -31,14 +31,14 @@ type ScrapeBuilder interface {
 	SetMaxDocumentLength(int64) ScrapeBuilder
 	SetUrl(string) ScrapeBuilder
 	SetMaxRedirect(int) ScrapeBuilder
-	Build() (*Scraper, error)
+	Build() (ScrapeService, error)
 }
 
 type scrapeBuilder struct {
 	scrapeSettings scrapeSettings
 }
 
-func (b *scrapeBuilder) Build() (*Scraper, error) {
+func (b *scrapeBuilder) Build() (ScrapeService, error) {
 	u, err := url.Parse(b.scrapeSettings.url)
 	if err != nil {
 		return nil, err
 	}
@@ -73,7 +73,9 @@ func (b *scrapeBuilder) SetUserAgent(s string) ScrapeBuilder {
 }
 
 func NewScrapeBuilder() ScrapeBuilder {
-	return &scrapeBuilder{scrapeSettings{userAgent: "GoScraper"}}
+	return &scrapeBuilder{
+		scrapeSettings: scrapeSettings{userAgent: "GoScraper"},
+	}
 }
 
 type ScraperOptions struct {
@@ -108,6 +110,12 @@ type DocumentPreview struct {
 	Link        string
 }
 
+type ScrapeService interface {
+	Scrape() (*Document, error)
+	GetDocument() (*Document, error)
+	ParseDocument(doc *Document) (*Document, error)
+}
+
 func Scrape(uri string, maxRedirect int, options ScraperOptions) (*Document, error) {
 	u, err := url.Parse(uri)
 	if err != nil {
 		return nil, err
 	}

From 38c9e83c465d5e7b67913621ff8e060125032c22 Mon Sep 17 00:00:00 2001
From: Alexandr Filioglo
Date: Wed, 18 Nov 2020 10:06:20 +0200
Subject: [PATCH 5/6] Fixed incorrect creation of URLs from non-absolute ones

Because we "re-create" the new link using only the scheme, host,
and path, there is a risk of losing other parts of the original link.

Previously `/some/path.png?param=value` was transformed into
`http://mydomain.com/some/path.png`; now the output is
`http://mydomain.com/some/path.png?param=value`.
---
 goscraper.go | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/goscraper.go b/goscraper.go
index c8871f8..a466f30 100644
--- a/goscraper.go
+++ b/goscraper.go
@@ -382,10 +382,8 @@ func (scraper *Scraper) parseDocument(doc *Document) error {
 						return err
 					}
 					if !ogImgUrl.IsAbs() {
-						ogImgUrl, err = url.Parse(fmt.Sprintf("%s://%s%s", scraper.Url.Scheme, scraper.Url.Host, ogImgUrl.Path))
-						if err != nil {
-							return err
-						}
+						ogImgUrl.Host = scraper.Url.Host
+						ogImgUrl.Scheme = scraper.Url.Scheme
 					}
 
 					doc.Preview.Images = []string{ogImgUrl.String()}

From 44a43d859e3ddccb808a6bf0e94b0729ec0c6be9 Mon Sep 17 00:00:00 2001
From: Alexandr Filioglo
Date: Mon, 18 Jan 2021 10:10:00 +0200
Subject: [PATCH 6/6] Add support for "deeper" search of optional properties
 (#5)

For some URLs we could not, for some strange reason, get the `og:type`
data; YouTube links were one example. It turned out that YouTube keeps
this metadata in the body rather than in the head, as most other
services do. Because the previous criterion for breaking out of the
token-processing loop was "we have a title, a description and an
og:image, and we have passed the head", optional meta tags appearing
after the head were never processed.

Now we can control how many tokens are processed before the loop breaks
(or break earlier once the required optional fields have been found).
---
 goscraper.go | 31 ++++++++++++++++++++++++++-----
 1 file changed, 26 insertions(+), 5 deletions(-)

diff --git a/goscraper.go b/goscraper.go
index a466f30..f9e5f7f 100644
--- a/goscraper.go
+++ b/goscraper.go
@@ -24,6 +24,7 @@ type scrapeSettings struct {
 	maxDocumentLength int64
 	url               string
 	maxRedirect       int
+	maxTokenDepth     int
 }
 
 type ScrapeBuilder interface {
@@ -32,6 +33,7 @@ type ScrapeBuilder interface {
 	SetUrl(string) ScrapeBuilder
 	SetMaxRedirect(int) ScrapeBuilder
 	Build() (ScrapeService, error)
+	SetMaxTokenDepth(int) ScrapeBuilder
 }
 
 type scrapeBuilder struct {
@@ -49,6 +51,7 @@ func (b *scrapeBuilder) Build() (ScrapeService, error) {
 		Options: ScraperOptions{
 			MaxDocumentLength: b.scrapeSettings.maxDocumentLength,
 			UserAgent:         b.scrapeSettings.userAgent,
+			MaxTokenDepth:     b.scrapeSettings.maxTokenDepth,
 		}}, nil
 }
 
@@ -62,6 +65,11 @@ func (b *scrapeBuilder) SetMaxRedirect(i int) ScrapeBuilder {
 	return b
 }
 
+func (b *scrapeBuilder) SetMaxTokenDepth(i int) ScrapeBuilder {
+	b.scrapeSettings.maxTokenDepth = i
+	return b
+}
+
 func (b *scrapeBuilder) SetMaxDocumentLength(maxDocLength int64) ScrapeBuilder {
 	b.scrapeSettings.maxDocumentLength = maxDocLength
 	return b
@@ -81,6 +89,7 @@
 type ScraperOptions struct {
 	MaxDocumentLength int64
 	UserAgent         string
+	MaxTokenDepth     int
 }
 
 type Scraper struct {
@@ -286,7 +295,8 @@ func convertUTF8(content io.Reader, contentType string) (bytes.Buffer, error) {
 
 func (scraper *Scraper) parseDocument(doc *Document) error {
 	t := html.NewTokenizer(&doc.Body)
-	var ogImage bool
+	var hasOgImage bool
+	var hasOgType bool
 	var headPassed bool
 	var hasFragment bool
 	var hasCanonical bool
@@ -298,6 +308,7 @@ func (scraper *Scraper) parseDocument(doc *Document) error {
 	doc.Preview.Name = scraper.Url.Host
 	// set default icon to web root if not found
 	doc.Preview.Icon = fmt.Sprintf("%s://%s%s", scraper.Url.Scheme, scraper.Url.Host,
"/favicon.ico") + depth := 0 for { tokenType := t.Next() if tokenType == html.ErrorToken { @@ -342,6 +353,7 @@ func (scraper *Scraper) parseDocument(doc *Document) error { doc.Preview.Icon = href } } + depth = 0 case "meta": if len(token.Attr) != 2 { @@ -367,6 +379,7 @@ func (scraper *Scraper) parseDocument(doc *Document) error { doc.Preview.Title = content case "og:type": doc.Preview.Type = content + hasOgType = true case "og:description": doc.Preview.Description = content case "description": @@ -376,7 +389,7 @@ func (scraper *Scraper) parseDocument(doc *Document) error { case "og:url": doc.Preview.Link = content case "og:image": - ogImage = true + hasOgImage = true ogImgUrl, err := url.Parse(content) if err != nil { return err @@ -389,6 +402,7 @@ func (scraper *Scraper) parseDocument(doc *Document) error { doc.Preview.Images = []string{ogImgUrl.String()} } + depth = 0 case "title": if tokenType == html.StartTagToken { @@ -398,6 +412,7 @@ func (scraper *Scraper) parseDocument(doc *Document) error { doc.Preview.Title = token.Data } } + depth = 0 case "img": for _, attr := range token.Attr { @@ -418,6 +433,7 @@ func (scraper *Scraper) parseDocument(doc *Document) error { } } + depth = 0 } if hasCanonical && headPassed && scraper.MaxRedirect > 0 { @@ -448,10 +464,15 @@ func (scraper *Scraper) parseDocument(doc *Document) error { return scraper.parseDocument(doc) } - if len(doc.Preview.Title) > 0 && len(doc.Preview.Description) > 0 && ogImage && headPassed { - return nil + if len(doc.Preview.Title) > 0 && len(doc.Preview.Description) > 0 && hasOgImage && headPassed { + if scraper.Options.MaxTokenDepth == 0 { + return nil + } + if hasOgType || depth >= scraper.Options.MaxTokenDepth { + return nil + } + depth++ } - } return nil