From ea0bfa4b2acf002483ed2b90d8e60a181c368443 Mon Sep 17 00:00:00 2001 From: Horacio Duran Date: Mon, 8 Apr 2019 18:30:15 -0300 Subject: [PATCH] Add preferTag option to `Scrape` function Add the possibility to get the title from the `<title>` tag even if the `og:title` meta tag is present since sometimes it can be a bit disappointing in terms of info. Additionally, added a few comments to make lint happier. --- goscraper.go | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/goscraper.go b/goscraper.go index 9c03bb9..a7c7e9d 100644 --- a/goscraper.go +++ b/goscraper.go @@ -18,17 +18,22 @@ var ( fragmentRegexp = regexp.MustCompile("#!(.*)") ) +// Scraper contains the necessary information to trigger the scraping process +// on a given URL. type Scraper struct { Url *url.URL EscapedFragmentUrl *url.URL MaxRedirect int + PreferTag bool } +// Document holds the Body of a page and its preview. type Document struct { Body bytes.Buffer Preview DocumentPreview } +// DocumentPreview holds preview metadata for a given site. type DocumentPreview struct { Icon string Name string @@ -38,7 +43,11 @@ type DocumentPreview struct { Link string } -func Scrape(uri string, maxRedirect int) (*Document, error) { +// Scrape will return a document containing, if possible, the preview for the passed +// in URL, or wherever it redirects within maxRedirect hops, or an error if not possible. +// If preferTag is true some bits of info, such as title, might be taken from +// the specific HTML tag even if the meta tag is present. +func Scrape(uri string, maxRedirect int, preferTag bool) (*Document, error) { u, err := url.Parse(uri) if err != nil { return nil, err @@ -46,6 +55,7 @@ func Scrape(uri string, maxRedirect int) (*Document, error) { return (&Scraper{Url: u, MaxRedirect: maxRedirect}).Scrape() } +// Scrape scrapes the document contained in this Scraper.
func (scraper *Scraper) Scrape() (*Document, error) { doc, err := scraper.getDocument() if err != nil { @@ -267,7 +277,7 @@ func (scraper *Scraper) parseDocument(doc *Document) error { if tokenType == html.StartTagToken { t.Next() token = t.Token() - if len(doc.Preview.Title) == 0 { + if len(doc.Preview.Title) == 0 || scraper.PreferTag { doc.Preview.Title = token.Data } }