diff --git a/go.mod b/go.mod index ce9c8fb..d186b46 100644 --- a/go.mod +++ b/go.mod @@ -2,17 +2,20 @@ module github.com/firede/agent-fetch go 1.25.0 +require ( + github.com/JohannesKaufmann/html-to-markdown/v2 v2.5.0 + github.com/chromedp/cdproto v0.0.0-20250724212937-08a3db8b4327 + github.com/chromedp/chromedp v0.14.2 + github.com/go-shiori/go-readability v0.0.0-20251205110129-5db1dc9836f0 +) + require ( github.com/JohannesKaufmann/dom v0.2.0 // indirect - github.com/JohannesKaufmann/html-to-markdown/v2 v2.5.0 // indirect github.com/andybalholm/cascadia v1.3.3 // indirect github.com/araddon/dateparse v0.0.0-20210429162001-6b43995a97de // indirect - github.com/chromedp/cdproto v0.0.0-20250724212937-08a3db8b4327 // indirect - github.com/chromedp/chromedp v0.14.2 // indirect github.com/chromedp/sysutil v1.1.0 // indirect github.com/go-json-experiment/json v0.0.0-20250725192818-e39067aee2d2 // indirect github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c // indirect - github.com/go-shiori/go-readability v0.0.0-20251205110129-5db1dc9836f0 // indirect github.com/gobwas/httphead v0.1.0 // indirect github.com/gobwas/pool v0.2.1 // indirect github.com/gobwas/ws v1.4.0 // indirect diff --git a/go.sum b/go.sum index 6a67d35..4e3ea9e 100644 --- a/go.sum +++ b/go.sum @@ -13,6 +13,8 @@ github.com/chromedp/chromedp v0.14.2/go.mod h1:rHzAv60xDE7VNy/MYtTUrYreSc0ujt2O1 github.com/chromedp/sysutil v1.1.0 h1:PUFNv5EcprjqXZD9nJb9b/c9ibAbxiYo4exNWZyipwM= github.com/chromedp/sysutil v1.1.0/go.mod h1:WiThHUdltqCNKGc4gaU50XgYjwjYIhKWoHGPTUfWTJ8= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/go-json-experiment/json v0.0.0-20250725192818-e39067aee2d2 h1:iizUGZ9pEquQS5jTGkh4AqeeHCMbfbjeb0zMt0aEFzs= github.com/go-json-experiment/json v0.0.0-20250725192818-e39067aee2d2/go.mod h1:TiCD2a1pcmjd7YnhGH0f/zKNcCD06B029pHhzV23c2M= github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c h1:wpkoddUomPfHiOziHZixGO5ZBS73cKqVzZipfrLmO1w= @@ -28,13 +30,25 @@ github.com/gobwas/ws v1.4.0/go.mod h1:G3gNqMNtPppf5XUz7O4shetPpcZ1VJ7zt18dlUeakr github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f h1:3BSP1Tbs2djlpprl7wCLuiqMaUh5SJkkzI2gDs+FgLs= github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f/go.mod h1:Pcatq5tYkCW2Q6yrR2VRHlbHpZ/R4/7qyL1TCF7vl14= github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80 h1:6Yzfa6GP0rIo/kULo2bwGEkFvCePZ3qHDDTC3/J9Swo= +github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80/go.mod h1:imJHygn/1yfhB7XSJJKlFZKl/J+dCPAknuiaGOshXAs= github.com/mattn/go-runewidth v0.0.10/go.mod h1:RAqKPSqVFrSLVXbA8x7dzmKdmGzieGRCM46jaSJTDAk= +github.com/orisano/pixelmatch v0.0.0-20220722002657-fb0b55479cde h1:x0TT0RDC7UhAVbbWWBzr41ElhJx5tXPWkIHA2HWPRuw= +github.com/orisano/pixelmatch v0.0.0-20220722002657-fb0b55479cde/go.mod h1:nZgzbfBr3hhjoZnS66nKrHmduYNpc34ny7RK4z5/HM0= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/rivo/uniseg v0.1.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc= github.com/scylladb/termtables v0.0.0-20191203121021-c4c0b6d42ff4/go.mod h1:C1a7PQSMz9NShzorzCiG2fk9+xuCgLkPeCvMHYR2OWg= +github.com/sebdah/goldie/v2 v2.8.0 h1:dZb9wR8q5++oplmEiJT+U/5KyotVD+HNGCAc5gNr8rc= +github.com/sebdah/goldie/v2 v2.8.0/go.mod h1:oZ9fp0+se1eapSRjfYbsV/0Hqhbuu3bJVvKI/NNtssI= +github.com/sergi/go-diff v1.4.0 h1:n/SP9D5ad1fORl+llWyN+D6qoUETXNZARKjyY2/KVCw= +github.com/sergi/go-diff v1.4.0/go.mod h1:A0bzQcvG0E7Rwjx0REVgAGH58e96+X0MeOfepqsbeW4= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY= github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= +github.com/yuin/goldmark v1.7.13 h1:GPddIs617DnBLFFVJFgpo1aBfe/4xcvMc3SB5t/D0pA= +github.com/yuin/goldmark v1.7.13/go.mod h1:ip/1k0VRfGynBgxOz0yCqHrbZXhcjxyuS66Brc7iBKg= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/crypto v0.13.0/go.mod h1:y6Z2r+Rw4iayiXXAIxJIDAJ1zMW4yaTpebo8fPOliYc= @@ -107,3 +121,5 @@ golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxb golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/internal/fetcher/fetcher.go b/internal/fetcher/fetcher.go index 232f56b..b3b6152 100644 --- a/internal/fetcher/fetcher.go +++ b/internal/fetcher/fetcher.go @@ -3,6 +3,7 @@ package fetcher import ( "bytes" "context" + "encoding/json" "errors" "fmt" "io" @@ -21,14 +22,18 @@ import ( ) const ( - ModeAuto = "auto" - ModeStatic = "static" - ModeBrowser = "browser" + ModeAuto = "auto" + ModeStatic = "static" + ModeBrowser = "browser" + maxMarkdownSampleSize = 12000 ) var ( ErrUnsupportedMode = errors.New("unsupported mode") ErrNoContent = errors.New("no content could be extracted") + ErrHTTPStatus = errors.New("unexpected HTTP status code") + + browserHTMLToMarkdownFn = browserHTMLToMarkdown ) type Config struct { @@ -89,6 +94,9 @@ func Fetch(ctx context.Context, rawURL string, cfg Config) (Result, error) { func fetchAuto(ctx context.Context, rawURL string, cfg Config) (Result, error) { resp, err := fetchHTTP(ctx, rawURL, cfg, true) if err != nil { + if errors.Is(err, ErrHTTPStatus) { + return fetchBrowserOnly(ctx, rawURL, cfg) + } return Result{}, err } @@ -101,14 +109,7 @@ func fetchAuto(ctx context.Context, rawURL string, cfg Config) (Result, error) { return Result{Markdown: md, Source: "http-static", FinalURL: resp.FinalURL}, nil } - browMD, finalURL, err := browserHTMLToMarkdown(ctx, rawURL, cfg) - if err != nil { - return Result{}, err - } - if strings.TrimSpace(browMD) == "" { - return Result{}, ErrNoContent - } - return Result{Markdown: browMD, Source: "browser", FinalURL: finalURL}, nil + return fetchBrowserOnly(ctx, rawURL, cfg) } func fetchStaticOnly(ctx context.Context, rawURL string, cfg Config) (Result, error) { @@ -133,7 +134,7 @@ func fetchStaticOnly(ctx context.Context, rawURL string, cfg Config) (Result, er } func fetchBrowserOnly(ctx context.Context, rawURL string, cfg Config) (Result, error) { - md, finalURL, err := browserHTMLToMarkdown(ctx, rawURL, cfg) + md, finalURL, err := browserHTMLToMarkdownFn(ctx, rawURL, cfg) if err != nil { return Result{}, err } @@ -180,6 +181,9 @@ func fetchHTTP(ctx context.Context, rawURL string, cfg Config, preferMarkdown bo if resp.Request != nil && resp.Request.URL != nil { finalURL = resp.Request.URL.String() } + if resp.StatusCode >= http.StatusBadRequest { + return responseData{}, fmt.Errorf("%w: %d %s (%s)", ErrHTTPStatus, resp.StatusCode, http.StatusText(resp.StatusCode), finalURL) + } return responseData{ Body: body, @@ -287,8 +291,12 @@ func isLikelyMarkdown(body []byte, contentType string) bool { } sample := trimmed - if len(sample) > 12000 { - sample = sample[:12000] + if len(sample) > maxMarkdownSampleSize { + sample = sample[:maxMarkdownSampleSize] + } + lcType := strings.ToLower(contentType) + if looksLikeJSONPayload(sample, lcType) { + return false } lower := strings.ToLower(sample) @@ -309,7 +317,6 @@ func isLikelyMarkdown(body []byte, contentType string) bool { return true } - lcType := strings.ToLower(contentType) if strings.Contains(lcType, "text/markdown") && htmlTagCount == 0 { return true } @@ -324,6 +331,25 @@ func isLikelyMarkdown(body []byte, contentType string) bool { return false } +func looksLikeJSONPayload(sample, contentType string) bool { + if strings.Contains(contentType, "json") { + return true + } + + trim := strings.TrimSpace(sample) + if trim == "" { + return false + } + if trim[0] != '{' && trim[0] != '[' { + return false + } + if json.Valid([]byte(trim)) { + return true + } + // Treat truncated JSON-like payloads as structured data instead of markdown. + return strings.Contains(trim, "\":") +} + var htmlTagRe = regexp.MustCompile(`]*)?>`) func markdownScore(input string) int { @@ -442,7 +468,12 @@ func toCDPHeaders(h http.Header) network.Headers { res := make(network.Headers, len(keys)) for _, k := range keys { - res[k] = strings.Join(h.Values(k), ", ") + vals := h.Values(k) + if strings.EqualFold(k, "Cookie") { + res[k] = strings.Join(vals, "; ") + continue + } + res[k] = strings.Join(vals, ", ") } return res } diff --git a/internal/fetcher/fetcher_test.go b/internal/fetcher/fetcher_test.go index 18c33e8..354881c 100644 --- a/internal/fetcher/fetcher_test.go +++ b/internal/fetcher/fetcher_test.go @@ -2,6 +2,7 @@ package fetcher import ( "context" + "errors" "fmt" "net/http" "net/http/httptest" @@ -13,6 +14,7 @@ import ( func TestIsLikelyMarkdown(t *testing.T) { md := []byte("# Title\n\n- item\n\nThis is markdown.\n") html := []byte("

Title

") + jsonPayload := []byte(`{"items":[{"id":1,"name":"alpha"},{"id":2,"name":"beta"},{"id":3,"name":"gamma"}],"meta":{"total":203,"note":"this payload is intentionally long enough to trigger old fallback behavior"}}`) if !isLikelyMarkdown(md, "text/plain") { t.Fatal("expected markdown sample to be detected as markdown") @@ -20,6 +22,9 @@ func TestIsLikelyMarkdown(t *testing.T) { if isLikelyMarkdown(html, "text/html") { t.Fatal("expected HTML sample to not be treated as markdown") } + if isLikelyMarkdown(jsonPayload, "application/json") { + t.Fatal("expected JSON sample to not be treated as markdown") + } } func TestStaticHTMLToMarkdown(t *testing.T) { @@ -104,3 +109,72 @@ func TestFetchStaticConvertsHTML(t *testing.T) { t.Fatalf("unexpected markdown output: %q", res.Markdown) } } + +func TestFetchReturnsErrorOnHTTPStatus(t *testing.T) { + ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.Header().Set("Content-Type", "text/html") + w.WriteHeader(http.StatusNotFound) + fmt.Fprint(w, "

Not Found

") + })) + defer ts.Close() + + cfg := DefaultConfig() + cfg.Mode = ModeStatic + cfg.Timeout = 5 * time.Second + + _, err := Fetch(context.Background(), ts.URL, cfg) + if !errors.Is(err, ErrHTTPStatus) { + t.Fatalf("expected ErrHTTPStatus, got %v", err) + } +} + +func TestFetchAutoFallsBackToBrowserOnHTTPStatus(t *testing.T) { + originalBrowserFn := browserHTMLToMarkdownFn + browserHTMLToMarkdownFn = func(_ context.Context, _ string, _ Config) (string, string, error) { + return "# Browser Rendered\n", "https://browser.example/final", nil + } + defer func() { + browserHTMLToMarkdownFn = originalBrowserFn + }() + + ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.Header().Set("Content-Type", "text/html") + w.WriteHeader(http.StatusForbidden) + fmt.Fprint(w, "

Forbidden

") + })) + defer ts.Close() + + cfg := DefaultConfig() + cfg.Mode = ModeAuto + cfg.Timeout = 5 * time.Second + + res, err := Fetch(context.Background(), ts.URL, cfg) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if res.Source != "browser" { + t.Fatalf("expected source browser, got %q", res.Source) + } + if res.FinalURL != "https://browser.example/final" { + t.Fatalf("unexpected final URL: %q", res.FinalURL) + } + if !strings.Contains(res.Markdown, "Browser Rendered") { + t.Fatalf("unexpected markdown output: %q", res.Markdown) + } +} + +func TestToCDPHeadersCookieFormatting(t *testing.T) { + h := make(http.Header) + h.Add("Cookie", "a=1") + h.Add("Cookie", "b=2") + h.Add("X-Test", "one") + h.Add("X-Test", "two") + + got := toCDPHeaders(h) + if got["Cookie"] != "a=1; b=2" { + t.Fatalf("expected cookie header to use '; ' separator, got %v", got["Cookie"]) + } + if got["X-Test"] != "one, two" { + t.Fatalf("expected generic headers to use ', ' separator, got %v", got["X-Test"]) + } +}