From dbd38f5bb0b4605b79479126ec2450f958401dbe Mon Sep 17 00:00:00 2001 From: Filioglo Alexandr Date: Wed, 15 Jul 2020 17:45:06 +0300 Subject: [PATCH 1/6] Add MaxDocumentLength and custom UserAgent support --- goscraper.go | 50 ++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 46 insertions(+), 4 deletions(-) diff --git a/goscraper.go b/goscraper.go index abbe60e..b6e3b57 100644 --- a/goscraper.go +++ b/goscraper.go @@ -2,6 +2,7 @@ package goscraper import ( "bytes" + "errors" "fmt" "io" "net/http" @@ -18,10 +19,16 @@ var ( fragmentRegexp = regexp.MustCompile("#!(.*)") ) +type ScraperOptions struct { + MaxDocumentLength int64 + UserAgent string +} + type Scraper struct { Url *url.URL EscapedFragmentUrl *url.URL MaxRedirect int + Options ScraperOptions } type Document struct { @@ -38,12 +45,12 @@ type DocumentPreview struct { Link string } -func Scrape(uri string, maxRedirect int) (*Document, error) { +func Scrape(uri string, maxRedirect int, options ScraperOptions) (*Document, error) { u, err := url.Parse(uri) if err != nil { return nil, err } - return (&Scraper{Url: u, MaxRedirect: maxRedirect}).Scrape() + return (&Scraper{Url: u, MaxRedirect: maxRedirect, Options: options}).Scrape() } func (scraper *Scraper) Scrape() (*Document, error) { @@ -109,6 +116,16 @@ func (scraper *Scraper) toFragmentUrl() error { } func (scraper *Scraper) getDocument() (*Document, error) { + addUserAgent := func(req *http.Request) *http.Request { + userAgent := "GoScraper" + if len(scraper.Options.UserAgent) != 0 { + userAgent = scraper.Options.UserAgent + } + req.Header.Add("User-Agent", userAgent) + + return req + } + scraper.MaxRedirect -= 1 if strings.Contains(scraper.Url.String(), "#!") { scraper.toFragmentUrl() @@ -117,11 +134,31 @@ func (scraper *Scraper) getDocument() (*Document, error) { scraper.EscapedFragmentUrl = scraper.Url } + if scraper.Options.MaxDocumentLength > 0 { + // We try first to check content length (if it's present) - and if isn't - already limit by body size + req, err := http.NewRequest("HEAD", scraper.getUrl(), nil) + if err != nil { + return nil, err + } + req = addUserAgent(req) + + resp, err := http.DefaultClient.Do(req) + if resp != nil { + defer resp.Body.Close() + } + if err != nil { + return nil, err + } + if resp.ContentLength > scraper.Options.MaxDocumentLength { + return nil, errors.New("Content-Length exceed limits") + } + } + req, err := http.NewRequest("GET", scraper.getUrl(), nil) if err != nil { return nil, err } - req.Header.Add("User-Agent", "GoScraper") + req = addUserAgent(req) resp, err := http.DefaultClient.Do(req) if resp != nil { @@ -135,6 +172,11 @@ func (scraper *Scraper) getDocument() (*Document, error) { scraper.EscapedFragmentUrl = nil scraper.Url = resp.Request.URL } + + if scraper.Options.MaxDocumentLength > 0 { + resp.Body = http.MaxBytesReader(nil, resp.Body, scraper.Options.MaxDocumentLength) + } + b, err := convertUTF8(resp.Body, resp.Header.Get("content-type")) if err != nil { return nil, err @@ -197,7 +239,7 @@ func (scraper *Scraper) parseDocument(doc *Document) error { if cleanStr(attr.Key) == "rel" && cleanStr(attr.Val) == "canonical" { canonical = true } - if cleanStr(attr.Key) == "rel" && strings.Contains(cleanStr(attr.Val), "icon") { + if cleanStr(attr.Key) == "rel" && strings.Contains(cleanStr(attr.Val), "icon") { hasIcon = true } if cleanStr(attr.Key) == "href" { From c5a417ac61c90cfa97aecac9d99d2fb20b825259 Mon Sep 17 00:00:00 2001 From: Alexandr Filioglo Date: Fri, 7 Aug 2020 13:18:03 +0300 Subject: [PATCH 2/6] Fixes and 
improvements (#1)

- added support for og:type
- fixed incorrect handling of relative paths
- made the Content-Length check in HEAD requests "silent" (non-fatal)
---
 goscraper.go | 33 +++++++++++++++++++--------------
 1 file changed, 19 insertions(+), 14 deletions(-)

diff --git a/goscraper.go b/goscraper.go
index b6e3b57..a50b64e 100644
--- a/goscraper.go
+++ b/goscraper.go
@@ -41,6 +41,7 @@ type DocumentPreview struct {
 	Name        string
 	Title       string
 	Description string
+	Type        string
 	Images      []string
 	Link        string
 }
@@ -137,20 +138,18 @@ func (scraper *Scraper) getDocument() (*Document, error) {
 	if scraper.Options.MaxDocumentLength > 0 {
 		// We try first to check content length (if it's present) - and if isn't - already limit by body size
 		req, err := http.NewRequest("HEAD", scraper.getUrl(), nil)
-		if err != nil {
-			return nil, err
-		}
-		req = addUserAgent(req)
+		if err == nil {
+			req = addUserAgent(req)
 
-		resp, err := http.DefaultClient.Do(req)
-		if resp != nil {
-			defer resp.Body.Close()
-		}
-		if err != nil {
-			return nil, err
-		}
-		if resp.ContentLength > scraper.Options.MaxDocumentLength {
-			return nil, errors.New("Content-Length exceed limits")
+			resp, err := http.DefaultClient.Do(req)
+			if resp != nil {
+				defer resp.Body.Close()
+			}
+			if err == nil {
+				if resp.ContentLength > scraper.Options.MaxDocumentLength {
+					return nil, errors.New("Content-Length exceed limits")
+				}
+			}
 		}
 	}
 
@@ -280,6 +279,8 @@ func (scraper *Scraper) parseDocument(doc *Document) error {
 					doc.Preview.Name = content
 				case "og:title":
 					doc.Preview.Title = content
+				case "og:type":
+					doc.Preview.Type = content
 				case "og:description":
 					doc.Preview.Description = content
 				case "description":
@@ -322,7 +323,11 @@ func (scraper *Scraper) parseDocument(doc *Document) error {
 						return err
 					}
 					if !imgUrl.IsAbs() {
-						doc.Preview.Images = append(doc.Preview.Images, fmt.Sprintf("%s://%s%s", scraper.Url.Scheme, scraper.Url.Host, imgUrl.Path))
+						if string(imgUrl.Path[0]) == "/" {
+							doc.Preview.Images = append(doc.Preview.Images, fmt.Sprintf("%s://%s%s", scraper.Url.Scheme, scraper.Url.Host, imgUrl.Path))
+						} else {
+							doc.Preview.Images = append(doc.Preview.Images, fmt.Sprintf("%s://%s/%s", scraper.Url.Scheme, scraper.Url.Host, imgUrl.Path))
+						}
 					} else {
 						doc.Preview.Images = append(doc.Preview.Images, attr.Val)
 					}

From 5044043eb93a4cb78ee2345bdfa1f7121c88aa5a Mon Sep 17 00:00:00 2001
From: Alexandr Filioglo
Date: Fri, 11 Sep 2020 17:26:50 +0300
Subject: [PATCH 3/6] Added support for response headers & added builder.
Split `Scrape` into 2 different functions `GetDocument` and `ParseDocument` (#2) --- goscraper.go | 84 ++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 81 insertions(+), 3 deletions(-) diff --git a/goscraper.go b/goscraper.go index a50b64e..868fc28 100644 --- a/goscraper.go +++ b/goscraper.go @@ -19,6 +19,63 @@ var ( fragmentRegexp = regexp.MustCompile("#!(.*)") ) +type scrapeSettings struct { + userAgent string + maxDocumentLength int64 + url string + maxRedirect int +} + +type ScrapeBuilder interface { + SetUserAgent(string) ScrapeBuilder + SetMaxDocumentLength(int64) ScrapeBuilder + SetUrl(string) ScrapeBuilder + SetMaxRedirect(int) ScrapeBuilder + Build() (*Scraper, error) +} + +type scrapeBuilder struct { + scrapeSettings scrapeSettings +} + +func (b *scrapeBuilder) Build() (*Scraper, error) { + u, err := url.Parse(b.scrapeSettings.url) + if err != nil { + return nil, err + } + return &Scraper{ + Url: u, + MaxRedirect: b.scrapeSettings.maxRedirect, + Options: ScraperOptions{ + MaxDocumentLength: b.scrapeSettings.maxDocumentLength, + UserAgent: b.scrapeSettings.userAgent, + }}, nil +} + +func (b *scrapeBuilder) SetUrl(s string) ScrapeBuilder { + b.scrapeSettings.url = s + return b +} + +func (b *scrapeBuilder) SetMaxRedirect(i int) ScrapeBuilder { + b.scrapeSettings.maxRedirect = i + return b +} + +func (b *scrapeBuilder) SetMaxDocumentLength(maxDocLength int64) ScrapeBuilder { + b.scrapeSettings.maxDocumentLength = maxDocLength + return b +} + +func (b *scrapeBuilder) SetUserAgent(s string) ScrapeBuilder { + b.scrapeSettings.userAgent = s + return b +} + +func NewScrapeBuilder() ScrapeBuilder { + return &scrapeBuilder{scrapeSettings{userAgent: "GoScraper"}} +} + type ScraperOptions struct { MaxDocumentLength int64 UserAgent string @@ -32,8 +89,13 @@ type Scraper struct { } type Document struct { - Body bytes.Buffer - Preview DocumentPreview + Body bytes.Buffer + Preview DocumentPreview + ResHeader ResHeaders +} + +type ResHeaders struct { + ContentType string } type DocumentPreview struct { @@ -66,6 +128,18 @@ func (scraper *Scraper) Scrape() (*Document, error) { return doc, nil } +func (scraper *Scraper) GetDocument() (*Document, error) { + return scraper.getDocument() +} + +func (scraper *Scraper) ParseDocument(doc *Document) (*Document, error) { + err := scraper.parseDocument(doc) + if err != nil { + return nil, err + } + return doc, nil +} + func (scraper *Scraper) getUrl() string { if scraper.EscapedFragmentUrl != nil { return scraper.EscapedFragmentUrl.String() @@ -180,7 +254,11 @@ func (scraper *Scraper) getDocument() (*Document, error) { if err != nil { return nil, err } - doc := &Document{Body: b, Preview: DocumentPreview{Link: scraper.Url.String()}} + doc := &Document{ + Body: b, + Preview: DocumentPreview{Link: scraper.Url.String()}, + ResHeader: ResHeaders{ContentType: resp.Header.Get("content-type")}, + } return doc, nil } From afaa757a65510e19e4b0182d174bce424c897226 Mon Sep 17 00:00:00 2001 From: Alexandr Filioglo Date: Tue, 20 Oct 2020 16:43:12 +0300 Subject: [PATCH 4/6] Added Scrape service interface --- .gitignore | 105 +++++++++++++++++++++++++++++++++++++++++++++++++++ goscraper.go | 14 +++++-- 2 files changed, 116 insertions(+), 3 deletions(-) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..f3f7ad1 --- /dev/null +++ b/.gitignore @@ -0,0 +1,105 @@ +# Created by .ignore support plugin (hsz.mobi) +### VisualStudioCode template +.vscode/* +!.vscode/settings.json +!.vscode/tasks.json 
+!.vscode/launch.json
+!.vscode/extensions.json
+*.code-workspace
+
+# Local History for Visual Studio Code
+.history/
+
+### JetBrains template
+# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
+# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
+
+# User-specific stuff
+.idea/**/workspace.xml
+.idea/**/tasks.xml
+.idea/**/usage.statistics.xml
+.idea/**/dictionaries
+.idea/**/shelf
+.idea
+
+
+# Generated files
+.idea/**/contentModel.xml
+
+# Sensitive or high-churn files
+.idea/**/dataSources/
+.idea/**/dataSources.ids
+.idea/**/dataSources.local.xml
+.idea/**/sqlDataSources.xml
+.idea/**/dynamic.xml
+.idea/**/uiDesigner.xml
+.idea/**/dbnavigator.xml
+
+# Gradle
+.idea/**/gradle.xml
+.idea/**/libraries
+
+# Gradle and Maven with auto-import
+# When using Gradle or Maven with auto-import, you should exclude module files,
+# since they will be recreated, and may cause churn. Uncomment if using
+# auto-import.
+# .idea/artifacts
+# .idea/compiler.xml
+# .idea/jarRepositories.xml
+# .idea/modules.xml
+# .idea/*.iml
+# .idea/modules
+# *.iml
+# *.ipr
+
+# CMake
+cmake-build-*/
+
+# Mongo Explorer plugin
+.idea/**/mongoSettings.xml
+
+# File-based project format
+*.iws
+
+# IntelliJ
+out/
+
+# mpeltonen/sbt-idea plugin
+.idea_modules/
+
+# JIRA plugin
+atlassian-ide-plugin.xml
+
+# Cursive Clojure plugin
+.idea/replstate.xml
+
+# Crashlytics plugin (for Android Studio and IntelliJ)
+com_crashlytics_export_strings.xml
+crashlytics.properties
+crashlytics-build.properties
+fabric.properties
+
+# Editor-based Rest Client
+.idea/httpRequests
+
+# Android studio 3.1+ serialized cache file
+.idea/caches/build_file_checksums.ser
+
+### Go template
+# Binaries for programs and plugins
+*.exe
+*.exe~
+*.dll
+*.so
+*.dylib
+
+# Test binary, built with `go test -c`
+*.test
+
+# Output of the go coverage tool, specifically when used with LiteIDE
+*.out
+
+# Dependency directories (remove the comment below to include it)
+# vendor/
+
+tests/
diff --git a/goscraper.go b/goscraper.go
index 868fc28..c8871f8 100644
--- a/goscraper.go
+++ b/goscraper.go
@@ -31,14 +31,14 @@ type ScrapeBuilder interface {
 	SetMaxDocumentLength(int64) ScrapeBuilder
 	SetUrl(string) ScrapeBuilder
 	SetMaxRedirect(int) ScrapeBuilder
-	Build() (*Scraper, error)
+	Build() (ScrapeService, error)
 }
 
 type scrapeBuilder struct {
 	scrapeSettings scrapeSettings
 }
 
-func (b *scrapeBuilder) Build() (*Scraper, error) {
+func (b *scrapeBuilder) Build() (ScrapeService, error) {
 	u, err := url.Parse(b.scrapeSettings.url)
 	if err != nil {
 		return nil, err
 	}
@@ -73,7 +73,9 @@ func (b *scrapeBuilder) SetUserAgent(s string) ScrapeBuilder {
 }
 
 func NewScrapeBuilder() ScrapeBuilder {
-	return &scrapeBuilder{scrapeSettings{userAgent: "GoScraper"}}
+	return &scrapeBuilder{
+		scrapeSettings: scrapeSettings{userAgent: "GoScraper"},
+	}
 }
 
 type ScraperOptions struct {
@@ -108,6 +110,12 @@ type DocumentPreview struct {
 	Link        string
 }
 
+type ScrapeService interface {
+	Scrape() (*Document, error)
+	GetDocument() (*Document, error)
+	ParseDocument(doc *Document) (*Document, error)
+}
+
 func Scrape(uri string, maxRedirect int, options ScraperOptions) (*Document, error) {
 	u, err := url.Parse(uri)
 	if err != nil {
 		return nil, err
 	}

From 38c9e83c465d5e7b67913621ff8e060125032c22 Mon Sep 17 00:00:00 2001
From: Alexandr Filioglo
Date: Wed, 18 Nov 2020 10:06:20 +0200
Subject: [PATCH 5/6] Fixed incorrect creation of URLs from non-absolute ones

Because we "re-create" the new link using only the scheme, host,
and path, there is a risk of losing other parts of the original link.

Previously `/some/path.png?param=value` was transformed into
`http://mydomain.com/some/path.png`; now the output is
`http://mydomain.com/some/path.png?param=value`.
---
 goscraper.go | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/goscraper.go b/goscraper.go
index c8871f8..a466f30 100644
--- a/goscraper.go
+++ b/goscraper.go
@@ -382,10 +382,8 @@ func (scraper *Scraper) parseDocument(doc *Document) error {
 						return err
 					}
 					if !ogImgUrl.IsAbs() {
-						ogImgUrl, err = url.Parse(fmt.Sprintf("%s://%s%s", scraper.Url.Scheme, scraper.Url.Host, ogImgUrl.Path))
-						if err != nil {
-							return err
-						}
+						ogImgUrl.Host = scraper.Url.Host
+						ogImgUrl.Scheme = scraper.Url.Scheme
 					}
 
 					doc.Preview.Images = []string{ogImgUrl.String()}

From 44a43d859e3ddccb808a6bf0e94b0729ec0c6be9 Mon Sep 17 00:00:00 2001
From: Alexandr Filioglo
Date: Mon, 18 Jan 2021 10:10:00 +0200
Subject: [PATCH 6/6] Add support for "deeper" search of optional properties
 (#5)

For some URLs we could not, for some strange reason, get the `og:type`
data; YouTube links were one example. It turned out that YouTube keeps
this metadata in the body rather than in the head, as most other
services do. Because the previous criterion for breaking out of the
token-processing loop was "we have a title, a description and an
og:image, and we have passed the head", optional meta tags appearing
after the head were never processed.

Now we can control how many tokens are processed before the loop breaks
(or break earlier once the required optional fields have been found).
---
 goscraper.go | 31 ++++++++++++++++++++++++++-----
 1 file changed, 26 insertions(+), 5 deletions(-)

diff --git a/goscraper.go b/goscraper.go
index a466f30..f9e5f7f 100644
--- a/goscraper.go
+++ b/goscraper.go
@@ -24,6 +24,7 @@ type scrapeSettings struct {
 	maxDocumentLength int64
 	url               string
 	maxRedirect       int
+	maxTokenDepth     int
 }
 
 type ScrapeBuilder interface {
@@ -32,6 +33,7 @@ type ScrapeBuilder interface {
 	SetUrl(string) ScrapeBuilder
 	SetMaxRedirect(int) ScrapeBuilder
 	Build() (ScrapeService, error)
+	SetMaxTokenDepth(int) ScrapeBuilder
 }
 
 type scrapeBuilder struct {
@@ -49,6 +51,7 @@ func (b *scrapeBuilder) Build() (ScrapeService, error) {
 		Options: ScraperOptions{
 			MaxDocumentLength: b.scrapeSettings.maxDocumentLength,
 			UserAgent:         b.scrapeSettings.userAgent,
+			MaxTokenDepth:     b.scrapeSettings.maxTokenDepth,
 		}}, nil
 }
 
@@ -62,6 +65,11 @@ func (b *scrapeBuilder) SetMaxRedirect(i int) ScrapeBuilder {
 	return b
 }
 
+func (b *scrapeBuilder) SetMaxTokenDepth(i int) ScrapeBuilder {
+	b.scrapeSettings.maxTokenDepth = i
+	return b
+}
+
 func (b *scrapeBuilder) SetMaxDocumentLength(maxDocLength int64) ScrapeBuilder {
 	b.scrapeSettings.maxDocumentLength = maxDocLength
 	return b
@@ -81,6 +89,7 @@
 type ScraperOptions struct {
 	MaxDocumentLength int64
 	UserAgent         string
+	MaxTokenDepth     int
 }
 
 type Scraper struct {
@@ -286,7 +295,8 @@ func convertUTF8(content io.Reader, contentType string) (bytes.Buffer, error) {
 
 func (scraper *Scraper) parseDocument(doc *Document) error {
 	t := html.NewTokenizer(&doc.Body)
-	var ogImage bool
+	var hasOgImage bool
+	var hasOgType bool
 	var headPassed bool
 	var hasFragment bool
 	var hasCanonical bool
@@ -298,6 +308,7 @@ func (scraper *Scraper) parseDocument(doc *Document) error {
 	doc.Preview.Name = scraper.Url.Host
 	// set default icon to web root if not found
 	doc.Preview.Icon = fmt.Sprintf("%s://%s%s", scraper.Url.Scheme, scraper.Url.Host,
"/favicon.ico") + depth := 0 for { tokenType := t.Next() if tokenType == html.ErrorToken { @@ -342,6 +353,7 @@ func (scraper *Scraper) parseDocument(doc *Document) error { doc.Preview.Icon = href } } + depth = 0 case "meta": if len(token.Attr) != 2 { @@ -367,6 +379,7 @@ func (scraper *Scraper) parseDocument(doc *Document) error { doc.Preview.Title = content case "og:type": doc.Preview.Type = content + hasOgType = true case "og:description": doc.Preview.Description = content case "description": @@ -376,7 +389,7 @@ func (scraper *Scraper) parseDocument(doc *Document) error { case "og:url": doc.Preview.Link = content case "og:image": - ogImage = true + hasOgImage = true ogImgUrl, err := url.Parse(content) if err != nil { return err @@ -389,6 +402,7 @@ func (scraper *Scraper) parseDocument(doc *Document) error { doc.Preview.Images = []string{ogImgUrl.String()} } + depth = 0 case "title": if tokenType == html.StartTagToken { @@ -398,6 +412,7 @@ func (scraper *Scraper) parseDocument(doc *Document) error { doc.Preview.Title = token.Data } } + depth = 0 case "img": for _, attr := range token.Attr { @@ -418,6 +433,7 @@ func (scraper *Scraper) parseDocument(doc *Document) error { } } + depth = 0 } if hasCanonical && headPassed && scraper.MaxRedirect > 0 { @@ -448,10 +464,15 @@ func (scraper *Scraper) parseDocument(doc *Document) error { return scraper.parseDocument(doc) } - if len(doc.Preview.Title) > 0 && len(doc.Preview.Description) > 0 && ogImage && headPassed { - return nil + if len(doc.Preview.Title) > 0 && len(doc.Preview.Description) > 0 && hasOgImage && headPassed { + if scraper.Options.MaxTokenDepth == 0 { + return nil + } + if hasOgType || depth >= scraper.Options.MaxTokenDepth { + return nil + } + depth++ } - } return nil