From e49385630290a148a97c45a832628eb003083c35 Mon Sep 17 00:00:00 2001 From: Rahul Thomas Date: Mon, 9 Feb 2026 21:57:39 -0800 Subject: [PATCH 1/3] ci: run publish workflow only on release publication --- .github/workflows/ci.yml | 58 +++++++ .github/workflows/publish.yml | 53 ++++++ .travis.yml | 13 -- CHANGELOG.md | 46 +++++ README.md | 223 +++++++++++++------------ VERSION | 1 + adapter/basicadapters.go | 12 +- adapter/basicadapters_test.go | 35 ++++ adapter/doc.go | 4 +- go.mod | 12 ++ go.sum | 4 + octopus/doc.go | 44 ++--- octopus/htmlparse_test.go | 35 ++++ octopus/modelfactory_test.go | 33 ++++ octopus/models.go | 40 ++--- octopus/pipe_augment_linkabsolution.go | 6 +- octopus/pipe_ctrl_ratelimit.go | 4 +- octopus/pipe_process_htmlparsing.go | 7 +- octopus/pipe_spl_distributor.go | 15 +- octopus/pipes_test.go | 63 +++++++ octopus/setup.go | 3 +- octopus/setup_test.go | 50 ++++++ 22 files changed, 572 insertions(+), 189 deletions(-) create mode 100644 .github/workflows/ci.yml create mode 100644 .github/workflows/publish.yml delete mode 100644 .travis.yml create mode 100644 CHANGELOG.md create mode 100644 VERSION create mode 100644 adapter/basicadapters_test.go create mode 100644 go.mod create mode 100644 go.sum create mode 100644 octopus/htmlparse_test.go create mode 100644 octopus/modelfactory_test.go create mode 100644 octopus/pipes_test.go create mode 100644 octopus/setup_test.go diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..89d9acf --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,58 @@ +name: CI + +on: + pull_request: + types: [opened, synchronize, reopened, ready_for_review] + push: + branches: [main, master] + +permissions: + contents: read + +jobs: + test-and-lint: + runs-on: ubuntu-latest + timeout-minutes: 20 + + strategy: + fail-fast: false + matrix: + go-version: ['1.23.x', '1.24.x'] + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Setup Go + uses: actions/setup-go@v5 
+ with: + go-version: ${{ matrix.go-version }} + cache: true + + - name: Verify module graph + run: | + go mod tidy + git diff --exit-code -- go.mod go.sum + + - name: Format check + run: | + unformatted=$(gofmt -l $(git ls-files '*.go')) + if [ -n "$unformatted" ]; then + echo "These files are not gofmt-formatted:" + echo "$unformatted" + exit 1 + fi + + - name: Vet + run: go vet ./... + + - name: Staticcheck + run: | + go install honnef.co/go/tools/cmd/staticcheck@latest + staticcheck ./... + + - name: Unit tests + run: go test ./... + + - name: Race tests + run: go test -race ./... diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml new file mode 100644 index 0000000..f00d074 --- /dev/null +++ b/.github/workflows/publish.yml @@ -0,0 +1,53 @@ +name: Publish + +on: + release: + types: [published] + +permissions: + contents: read + +jobs: + publish-go-module: + runs-on: ubuntu-latest + timeout-minutes: 10 + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Setup Go + uses: actions/setup-go@v5 + with: + go-version: '1.23.x' + + - name: Validate release tag format and VERSION match + run: | + TAG="${{ github.event.release.tag_name }}" + case "$TAG" in + v*) ;; + *) + echo "Release tag must start with 'v' (got: $TAG)" + exit 1 + ;; + esac + + TAG_NO_V="${TAG#v}" + VERSION_FILE=$(tr -d '[:space:]' < VERSION) + test "$TAG_NO_V" = "$VERSION_FILE" + + - name: Trigger Go module proxy indexing + env: + MODULE: github.com/rapidclock/web-octopus + run: | + set -euxo pipefail + VERSION="${{ github.event.release.tag_name }}" + curl -fsSL "https://proxy.golang.org/${MODULE}/@v/${VERSION}.info" + + - name: Trigger pkg.go.dev refresh + env: + MODULE: github.com/rapidclock/web-octopus + run: | + set -euxo pipefail + VERSION="${{ github.event.release.tag_name }}" + curl -fsSL "https://pkg.go.dev/fetch/${MODULE}@${VERSION}" diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 8102e24..0000000 --- a/.travis.yml +++ /dev/null @@ 
-1,13 +0,0 @@ -sudo: false - -language: go -go: - - "1.11" - -notifications: - email: false - -script: - - go doc octopus - - go doc adapter - - go test -v ./... \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..c5ff5f0 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,46 @@ +# Changelog + +All notable changes to this project are documented in this file. + +## [1.4.1] - 2026-02-10 + +### Changed +- Updated publish automation to run **only** when a GitHub Release is published (not on every tag push). +- Added strict release-tag validation in publish workflow and kept VERSION/tag consistency checks. + +## [1.4.0] - 2026-02-10 + +### Added +- Added GitHub Actions CI workflow for pull requests and default-branch pushes with module, formatting, vet, static analysis, test, and race checks. +- Added GitHub Actions publish workflow on version tags to trigger Go proxy and pkg.go.dev indexing. + +### Changed +- Removed legacy Travis CI configuration in favor of GitHub Actions. + +## [1.3.0] - 2026-02-10 + +### Added +- Added parser-focused unit coverage to validate anchor extraction behavior. + +### Changed +- Restored robust HTML parsing using `golang.org/x/net/html` tokenizer instead of regex-based extraction. +- Restored mature rate limiting with `golang.org/x/time/rate` for predictable throttling semantics. +- Added module `replace` directives to GitHub mirrors for `golang.org/x/*` to improve fetch reliability in restricted environments. +- Fixed distributor shutdown to stop forwarding after quit signal without closing inbound channels owned by upstream producers. + +## [1.1.0] - 2026-02-10 + +### Added +- Go module support (`go.mod`) with explicit dependency versions. +- Unit tests for crawler defaults and factory helpers. +- Unit tests for rate-limit validation and timeout setup behavior. +- Unit tests for pipeline helper behavior (link absolution, duplicate filter, timeout propagation). 
+- Unit test coverage for `adapter.FileWriterAdapter` write behavior. +- Comprehensive README covering installation, architecture, configuration, adapters, testing, and release policy. + +### Changed +- Replaced `log.Fatal` behavior in core library paths with non-process-terminating logic (`panic` for invalid setup value and early return for nil or file-open failure paths). +- Improved file adapter open flags to use write-only, create, and truncate semantics for predictable output. + +### Notes +- This version focuses on modernization and maintainability without changing the fundamental crawler pipeline design. diff --git a/README.md b/README.md index d6022ac..342a257 100644 --- a/README.md +++ b/README.md @@ -1,25 +1,44 @@ # web-octopus -[![GoDoc](https://godoc.org/github.com/rapidclock/web-octopus/pq?status.svg)](https://godoc.org/github.com/rapidclock/web-octopus) -[![Build Status](https://travis-ci.com/rapidclock/web-octopus.svg?token=hJhLfHtyz41UyuLTTdFx&branch=master)](https://travis-ci.com/rapidclock/web-octopus) -
-A concurent web crawler written in Go. -## Install +A concurrent, channel-pipeline web crawler in Go. - go get github.com/rapidclock/web-octopus/octopus - go get github.com/rapidclock/web-octopus/adapter +> This release modernizes the project for current Go module workflows, testing expectations, and maintainability standards. -## Current Features: -- Depth Limited Crawling -- User specified valid protocols -- User buildable adapters that the crawler feeds output to. -- Filter Duplicates. (Default, Non-Customizable) -- Filter URLs that fail a HEAD request. (Default, Non-Customizable) -- User specifiable max timeout between two successive url requests. -- Max Number of Links to be crawled. +## Table of contents +- [Highlights](#highlights) +- [Installation](#installation) +- [Quick start](#quick-start) +- [Architecture](#architecture) +- [Configuration reference](#configuration-reference) +- [Output adapters](#output-adapters) +- [Testing](#testing) +- [Versioning and release](#versioning-and-release) +- [Compatibility notes](#compatibility-notes) -### Sample Implementation Snippet +## Highlights + +- Uses Go modules (`go.mod`) instead of legacy `go get`-only workflow. +- Includes automated unit tests for crawler defaults, validation behavior, pipeline helpers, and adapter output. +- Improved adapter safety around file handling and error paths. +- Expanded docs with architecture details and operational guidance. 
+ +## Installation + +```bash +go get github.com/rapidclock/web-octopus@v1.4.1 +``` + +Import packages: + +```go +import ( + "github.com/rapidclock/web-octopus/adapter" + "github.com/rapidclock/web-octopus/octopus" +) +``` + +## Quick start ```go package main @@ -31,134 +50,128 @@ import ( func main() { opAdapter := &adapter.StdOpAdapter{} - + options := octopus.GetDefaultCrawlOptions() options.MaxCrawlDepth = 3 options.TimeToQuit = 10 options.CrawlRatePerSec = 5 options.CrawlBurstLimitPerSec = 8 options.OpAdapter = opAdapter - + crawler := octopus.New(options) crawler.SetupSystem() crawler.BeginCrawling("https://www.example.com") } ``` -### List of customizations +## Architecture + +`web-octopus` uses a staged channel pipeline. Nodes (URLs + metadata) flow through filter and processing stages: + +1. Ingest +2. Link absolution +3. Protocol filter +4. Duplicate filter +5. URL validation (`HEAD`) +6. Optional rate limiter +7. Page requisition (`GET`) +8. Distributor + - Output adapter stream + - Max delay watchdog stream +9. Max crawled links limiter (optional) +10. Crawl depth filter +11. HTML parsing back into ingest + +This design allows localized extension by replacing adapters and modifying options, while preserving high concurrency. + +## Configuration reference + +`CrawlOptions` controls crawler behavior: -Customizations can be made by supplying the crawler an instance of `CrawlOptions`. The basic structure is shown below, with a brief explanation for each option. +- `MaxCrawlDepth int64` — max depth for crawled nodes. +- `MaxCrawledUrls int64` — max total unique URLs; `-1` means unlimited. +- `CrawlRatePerSec int64` — request rate limit, negative to disable. +- `CrawlBurstLimitPerSec int64` — burst capacity for rate limiting. +- `IncludeBody bool` — include body in crawled node (currently internal pipeline behavior). +- `OpAdapter OutputAdapter` — required output sink. +- `ValidProtocols []string` — accepted URL schemes (e.g., `http`, `https`). 
+- `TimeToQuit int64` — max idle seconds before automatic quit. + +### Defaults + +Use: ```go -type CrawlOptions struct { - MaxCrawlDepth int64 // Max Depth of Crawl, 0 is the initial link. - MaxCrawledUrls int64 // Max number of links to be crawled in total. - StayWithinBaseHost bool // [Not-Implemented-Yet] - CrawlRatePerSec int64 // Max Rate at which requests can be made (req/sec). - CrawlBurstLimitPerSec int64 // Max Burst Capacity (should be atleast the crawl rate). - RespectRobots bool // [Not-Implemented-Yet] - IncludeBody bool // Include the Request Body (Contents of the web page) in the result of the crawl. - OpAdapter OutputAdapter // A user defined crawl output handler (See next section for info). - ValidProtocols []string // Valid protocols to crawl (http, https, ftp, etc.) - TimeToQuit int64 // Timeout (seconds) between two attempts or requests, before the crawler quits. -} +opts := octopus.GetDefaultCrawlOptions() ``` -A default instance of the `CrawlOptions` can be obtained by calling `octopus.GetDefaultCrawlOptions()`. This can be further customized by overriding individual properties. +Default values are tuned for local experimentation: -**NOTE:** If rate-limiting is not required, then just ignore(don't set value) both `CrawlRatePerSec` and `CrawlBurstLimitPerSec` in the `CrawlOptions`. +- Depth: `2` +- Max links: `-1` (unbounded) +- Rate limit: disabled +- Protocols: `http`, `https` +- Timeout gap: `30s` -### Output Adapters +## Output adapters -An Output Adapter is the final destination of a crawler processed request. The output of the crawler is fed here, according to the customizations made before starting the crawler through the `CrawlOptions` attached to the crawler. - -The `OutputAdapter` is a Go Interface, that has to be implemented by your(user-defined) processor. 
+The crawler emits processed nodes through the `OutputAdapter` interface: ```go type OutputAdapter interface { - Consume() *NodeChSet + Consume() *NodeChSet } ``` -The user has to implement the `Consume()` method that returns a __*pointer*__ to a `NodeChSet`. The `NodeChSet` is described below. The crawler uses the returned channel to send the crawl output. The user can start listening for output from the crawler. +### Built-in adapters -**Note** : If the user chooses to implement their custom `OutputAdapter` **REMEMBER** to listen for the output on another go-routine. Otherwise you might block the crawler from running. Atleast begin the crawling on another go-routine before you begin processing output. +1. `adapter.StdOpAdapter` + - Prints `count - depth - URL` to stdout. +2. `adapter.FileWriterAdapter` + - Writes `depth - URL` lines to a file. -The structure of the `NodeChSet` is given below. +### Writing a custom adapter -```go -type NodeChSet struct { - NodeCh chan<- *Node - *StdChannels -} +Create channels, return `*octopus.NodeChSet`, and consume nodes in a goroutine. Always handle quit signals to avoid goroutine leaks. -type StdChannels struct { - QuitCh chan<- int -} +## Testing -type Node struct { - *NodeInfo - Body io.ReadCloser -} +Run the full test suite: -type NodeInfo struct { - ParentUrlString string - UrlString string - Depth int64 -} +```bash +go test ./... ``` -You can use the utility function `MakeDefaultNodeChSet()` to get a `NodeChSet` built for you. This also returns the `Node` and quit channels. Example given below: +Recommended local checks before release: -```go -var opNodeChSet *NodeChSet -var nodeCh chan *Node -var quitCh chan int -// above to demo the types. One can easily use go lang type erasure. -opNodeChSet, nodeCh, quitCh = MakeDefaultNodeChSet() +```bash +go test ./... -race +go vet ./... ``` -The user should supply the custom OutputAdapter as an argument to the `CrawlOptions`. 
-#### Default Output Adapters: +## CI/CD -We supply two default Adapters for you to try out. They are not meant to be feature rich, but you can still use them. Their primary purpose is meant to be a demonstration of how to build and use a `OutputAdapter`. +This repository uses GitHub Actions (not Travis CI): -1. `adapter.StdOpAdapter` : Writes the crawled output (only links, not body) to the standard output. -1. `adapter.FileWriterAdapter` : Writes the crawled output (only links, not body) to a supplied file. +- **CI workflow** (`.github/workflows/ci.yml`) runs automatically on PR open/sync/reopen and on pushes to the default branch. It validates module tidiness, formatting, vet/staticcheck, and test suites (including race detection). +- **Publish workflow** (`.github/workflows/publish.yml`) runs only when a GitHub **Release** is published and triggers indexing on both the Go proxy and pkg.go.dev so new versions are discoverable quickly. -#### Implementation of the `adapter.StdOpAdapter`: -We have supplied the implementation of `adapter.StdOpAdapter` below to get a rough idea of what goes into building your own adapter. +Release flow: -```go -// StdOpAdapter is an output adapter that just prints the output onto the -// screen. -// -// Sample Output Format is: -// LinkNum - Depth - Url -type StdOpAdapter struct{} - -func (s *StdOpAdapter) Consume() *oct.NodeChSet { - listenCh := make(chan *oct.Node) - quitCh := make(chan int, 1) - listenChSet := &oct.NodeChSet{ - NodeCh: listenCh, - StdChannels: &oct.StdChannels{ - QuitCh: quitCh, - }, - } - go func() { - i := 1 - for { - select { - case output := <-listenCh: - fmt.Printf("%d - %d - %s\n", i, output.Depth, output.UrlString) - i++ - case <-quitCh: - return - } - } - }() - return listenChSet -} -``` +1. Update `VERSION` and `CHANGELOG.md`. +2. Merge to default branch. +3. Create and push tag `vX.Y.Z` matching `VERSION`. +4. Publish a GitHub Release for that tag. +5. 
GitHub Actions publish workflow handles Go portal refresh calls. + +## Versioning and release + +- Project follows semantic versioning. +- Current release in this repository: **v1.4.1**. +- See `CHANGELOG.md` for release notes. + +## Compatibility notes + +- Legacy examples using old `go get` package paths still map to the same module path. +- Existing adapters remain source-compatible. diff --git a/VERSION b/VERSION new file mode 100644 index 0000000..347f583 --- /dev/null +++ b/VERSION @@ -0,0 +1 @@ +1.4.1 diff --git a/adapter/basicadapters.go b/adapter/basicadapters.go index f9a9e0e..b213236 100644 --- a/adapter/basicadapters.go +++ b/adapter/basicadapters.go @@ -13,7 +13,8 @@ import ( // screen. // // Sample Output Format is: -// LinkNum - Depth - Url +// +// LinkNum - Depth - Url type StdOpAdapter struct{} func (s *StdOpAdapter) Consume() *oct.NodeChSet { @@ -43,7 +44,8 @@ func (s *StdOpAdapter) Consume() *oct.NodeChSet { // FileWriterAdapter is an output adapter that writes the output to a // specified file. 
// Sample Output Format is: -// Depth - Url +// +// Depth - Url type FileWriterAdapter struct { FilePath string } @@ -65,8 +67,8 @@ func (fw *FileWriterAdapter) writeToFile(listenCh chan *oct.Node, quitCh chan int) { fp, err := fw.getFilePointer() if err != nil { - fp.Close() - log.Fatal(err) + log.Printf("failed to open output file %q: %v", fw.FilePath, err) + return } go func() { defer fp.Close() @@ -86,6 +88,6 @@ func (fw *FileWriterAdapter) writeToFile(listenCh chan *oct.Node, } func (fw *FileWriterAdapter) getFilePointer() (w io.WriteCloser, err error) { - w, err = os.OpenFile(fw.FilePath, os.O_RDWR|os.O_CREATE, 0644) + w, err = os.OpenFile(fw.FilePath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o644) return } diff --git a/adapter/basicadapters_test.go b/adapter/basicadapters_test.go new file mode 100644 index 0000000..3044ac9 --- /dev/null +++ b/adapter/basicadapters_test.go @@ -0,0 +1,35 @@ +package adapter + +import ( + "os" + "path/filepath" + "testing" + "time" + + oct "github.com/rapidclock/web-octopus/octopus" +) + +func TestFileWriterAdapterWritesOutput(t *testing.T) { + tmpDir := t.TempDir() + filePath := filepath.Join(tmpDir, "crawl.log") + + adapter := &FileWriterAdapter{FilePath: filePath} + chSet := adapter.Consume() + chSet.NodeCh <- &oct.Node{NodeInfo: &oct.NodeInfo{Depth: 2, UrlString: "https://example.com"}} + chSet.QuitCh <- 1 + + deadline := time.Now().Add(500 * time.Millisecond) + for { + data, err := os.ReadFile(filePath) + if err == nil && len(data) > 0 { + if got := string(data); got != "2 - https://example.com\n" { + t.Fatalf("unexpected file content: %q", got) + } + break + } + if time.Now().After(deadline) { + t.Fatalf("timed out waiting for file content: %v", err) + } + time.Sleep(10 * time.Millisecond) + } +} diff --git a/adapter/doc.go b/adapter/doc.go index 2eac6a3..16d0516 100644 --- a/adapter/doc.go +++ b/adapter/doc.go @@ -8,5 +8,5 @@ screen). The FileWriterAdapter prints the output to a specified File. 
Both can be used as an OutputAdapter as part of the octopus crawler's CrawlOptions. - */ -package adapter \ No newline at end of file +*/ +package adapter diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..b8679dd --- /dev/null +++ b/go.mod @@ -0,0 +1,12 @@ +module github.com/rapidclock/web-octopus + +go 1.23 + +require ( + golang.org/x/net v0.35.0 + golang.org/x/time v0.10.0 +) + +replace golang.org/x/net => github.com/golang/net v0.35.0 + +replace golang.org/x/time => github.com/golang/time v0.10.0 diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..5c9f64f --- /dev/null +++ b/go.sum @@ -0,0 +1,4 @@ +github.com/golang/net v0.35.0 h1:PozKZpK7ktpdVnmhjJPp6Qwol8S+hFnfYKxMcMeX7aQ= +github.com/golang/net v0.35.0/go.mod h1:EglIi67kWsHKlRzzVMUD93VMSWGFOMSZgxFjparz1Qk= +github.com/golang/time v0.10.0 h1:6O5iMEWv8U6TO7Za6tqWgOj0J7QgqpQB1Np+Yzoa/dg= +github.com/golang/time v0.10.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= diff --git a/octopus/doc.go b/octopus/doc.go index 8a758eb..3a80a15 100644 --- a/octopus/doc.go +++ b/octopus/doc.go @@ -4,35 +4,35 @@ The octopus uses a pipeline of channels to implement a non-blocking web crawler. The octopus also provides user configurable options that can be used to customize the behaviour of the crawler. -Features +# Features Current Features of the crawler include: - 1. User specifiable Depth Limited Crawling - 2. User specified valid protocols - 3. User buildable adapters that the crawler feeds output to. - 4. Filter Duplicates. - 5. Filter URLs that fail a HEAD request. - 6. User specifiable max timeout between two successive url requests. - 7. User specifiable Max Number of Links to be crawled. + 1. User specifiable Depth Limited Crawling + 2. User specified valid protocols + 3. User buildable adapters that the crawler feeds output to. + 4. Filter Duplicates. + 5. Filter URLs that fail a HEAD request. + 6. User specifiable max timeout between two successive url requests. + 7. 
User specifiable Max Number of Links to be crawled. - -Pipeline Overview +# Pipeline Overview The overview of the Pipeline is given below: - 1. Ingest - 2. Link Absolution - 3. Protocol Filter - 4. Duplicate Filter - 5. Invalid Url Filter (Urls whose HEAD request Fails) - (5x) (Optional) Crawl Rate Limiter. - [6]. Make GET Request - 7a. Send to Output Adapter - 7b. Check for Timeout (gap between two output on this channel). - 8. Max Links Crawled Limit Filter - 9. Depth Limit Filter - 10. Parse Page for more URLs. + 1. Ingest + 2. Link Absolution + 3. Protocol Filter + 4. Duplicate Filter + 5. Invalid Url Filter (Urls whose HEAD request Fails) + (5x) (Optional) Crawl Rate Limiter. + [6]. Make GET Request + 7a. Send to Output Adapter + 7b. Check for Timeout (gap between two output on this channel). + 8. Max Links Crawled Limit Filter + 9. Depth Limit Filter + 10. Parse Page for more URLs. Note: The output from 7b. is fed to 8. + 1 -> 2 -> 3 -> 4 -> 5 -> (5x) -> [6] -> 7b -> 8 -> 9 -> 10 -> 1 */ package octopus diff --git a/octopus/htmlparse_test.go b/octopus/htmlparse_test.go new file mode 100644 index 0000000..1dba189 --- /dev/null +++ b/octopus/htmlparse_test.go @@ -0,0 +1,35 @@ +package octopus + +import ( + "io" + "strings" + "testing" + "time" +) + +func TestParseHtmlPageExtractsAnchorLinks(t *testing.T) { + outNodeCh := make(chan *Node, 2) + outQuitCh := make(chan int, 1) + out := MakeNodeChSet(outNodeCh, outQuitCh) + + node := &Node{ + NodeInfo: &NodeInfo{UrlString: "https://example.com", Depth: 1}, + Body: io.NopCloser(strings.NewReader(`<html><body><a href="/a">A</a><a href="https://other/b">B</a></body></html>`)), + } + + parseHtmlPage(node, out) + + got := make([]string, 0, 2) + for i := 0; i < 2; i++ { + select { + case n := <-outNodeCh: + got = append(got, n.UrlString) + case <-time.After(200 * time.Millisecond): + t.Fatal("timed out waiting for parsed link") + } + } + + if got[0] != "/a" || got[1] != "https://other/b" { + t.Fatalf("unexpected parsed links: %#v", got) + } +} diff --git a/octopus/modelfactory_test.go 
b/octopus/modelfactory_test.go new file mode 100644 index 0000000..8dd946d --- /dev/null +++ b/octopus/modelfactory_test.go @@ -0,0 +1,33 @@ +package octopus + +import "testing" + +func TestGetDefaultCrawlOptions(t *testing.T) { + opts := GetDefaultCrawlOptions() + + if opts.MaxCrawlDepth != defaultMaxDepth { + t.Fatalf("expected default max depth %d, got %d", defaultMaxDepth, opts.MaxCrawlDepth) + } + if opts.MaxCrawledUrls != defaultLinkCrawlLimit { + t.Fatalf("expected default max crawled urls %d, got %d", defaultLinkCrawlLimit, opts.MaxCrawledUrls) + } + if opts.CrawlRatePerSec != defaultCrawlRateLimit || opts.CrawlBurstLimitPerSec != defaultCrawlRateLimit { + t.Fatalf("unexpected crawl rate defaults: rate=%d burst=%d", opts.CrawlRatePerSec, opts.CrawlBurstLimitPerSec) + } + if opts.TimeToQuit != defaultTimeToQuit { + t.Fatalf("expected default timeout %d, got %d", defaultTimeToQuit, opts.TimeToQuit) + } +} + +func TestMakeDefaultNodeChSet(t *testing.T) { + chSet, nodeCh, quitCh := MakeDefaultNodeChSet() + if chSet == nil || chSet.StdChannels == nil { + t.Fatal("expected non-nil channel set") + } + if chSet.NodeCh == nil || chSet.QuitCh == nil { + t.Fatal("expected non-nil channels on set") + } + if nodeCh == nil || quitCh == nil { + t.Fatal("expected returned concrete channels to be non-nil") + } +} diff --git a/octopus/models.go b/octopus/models.go index 8cae64e..551abd9 100644 --- a/octopus/models.go +++ b/octopus/models.go @@ -31,35 +31,35 @@ type octopus struct { // You can specify depth of exploration for each link, // if crawler should ignore other host names (except from base host). // -// MaxCrawlDepth - Indicates the maximum depth that will be crawled, -// for each new link. +// MaxCrawlDepth - Indicates the maximum depth that will be crawled, +// for each new link. // -// MaxCrawledUrls - Specifies the Maximum Number of Unique Links that will be crawled. -// Note : When combined with DepthPerLink, it will combine both. 
-// Use -1 to indicate infinite links to be crawled (only bounded by depth of traversal). +// MaxCrawledUrls - Specifies the Maximum Number of Unique Links that will be crawled. +// Note : When combined with DepthPerLink, it will combine both. +// Use -1 to indicate infinite links to be crawled (only bounded by depth of traversal). // -// StayWithinBaseHost - (unimplemented) Ensures crawler stays within the -// level 1 link's hostname. +// StayWithinBaseHost - (unimplemented) Ensures crawler stays within the +// level 1 link's hostname. // -// CrawlRatePerSec - is the rate at which requests will be made (per second). -// If this is negative, Crawl feature will be ignored. Default is negative. +// CrawlRatePerSec - is the rate at which requests will be made (per second). +// If this is negative, Crawl feature will be ignored. Default is negative. // -// CrawlBurstLimitPerSec - Represents the max burst capacity with which requests -// can be made. This must be greater than or equal to the CrawlRatePerSec. +// CrawlBurstLimitPerSec - Represents the max burst capacity with which requests +// can be made. This must be greater than or equal to the CrawlRatePerSec. // -// RespectRobots (unimplemented) choose whether to respect robots.txt or not. +// RespectRobots (unimplemented) choose whether to respect robots.txt or not. // -// IncludeBody - (unimplemented) Include the response Body in the crawled -// NodeInfo (for further processing). +// IncludeBody - (unimplemented) Include the response Body in the crawled +// NodeInfo (for further processing). // -// OpAdapter is a user specified concrete implementation of an Output Adapter. The crawler -// will pump output onto the implementation's channel returned by its Consume method. +// OpAdapter is a user specified concrete implementation of an Output Adapter. The crawler +// will pump output onto the implementation's channel returned by its Consume method. 
// -// ValidProtocols - This is an array containing the list of url protocols that -// should be crawled. +// ValidProtocols - This is an array containing the list of url protocols that +// should be crawled. // -// TimeToQuit - represents the total time to wait between two new nodes to be -// generated before the crawler quits. This is in seconds. +// TimeToQuit - represents the total time to wait between two new nodes to be +// generated before the crawler quits. This is in seconds. type CrawlOptions struct { MaxCrawlDepth int64 MaxCrawledUrls int64 diff --git a/octopus/pipe_augment_linkabsolution.go b/octopus/pipe_augment_linkabsolution.go index 5f7780b..9636958 100644 --- a/octopus/pipe_augment_linkabsolution.go +++ b/octopus/pipe_augment_linkabsolution.go @@ -1,9 +1,6 @@ package octopus -import ( - "log" - "net/url" -) +import "net/url" func (o *octopus) makeLinkAbsolutionPipe(outChSet *NodeChSet) *NodeChSet { return stdLinearNodeFunc(makeLinkAbsolute, outChSet, "Link Absolution") @@ -11,7 +8,6 @@ func (o *octopus) makeLinkAbsolutionPipe(outChSet *NodeChSet) *NodeChSet { func makeLinkAbsolute(node *Node, outChSet *NodeChSet) { if node == nil || outChSet == nil { - log.Fatal("NIL ERROR") return } if node.ParentUrlString != "" { diff --git a/octopus/pipe_ctrl_ratelimit.go b/octopus/pipe_ctrl_ratelimit.go index a6f5917..d6faf7d 100644 --- a/octopus/pipe_ctrl_ratelimit.go +++ b/octopus/pipe_ctrl_ratelimit.go @@ -1,8 +1,6 @@ package octopus -import ( - "time" -) +import "time" func (o *octopus) makeRateLimitingPipe(outChSet *NodeChSet) *NodeChSet { return stdLinearNodeFunc(o.rateLimit, outChSet, "Crawl Rate Limit") diff --git a/octopus/pipe_process_htmlparsing.go b/octopus/pipe_process_htmlparsing.go index d6a50dd..8d40ec2 100644 --- a/octopus/pipe_process_htmlparsing.go +++ b/octopus/pipe_process_htmlparsing.go @@ -1,8 +1,6 @@ package octopus -import ( - "golang.org/x/net/html" -) +import "golang.org/x/net/html" func (o *octopus) makeParseNodeFromHtmlPipe(outChSet 
*NodeChSet) *NodeChSet { return stdLinearNodeFunc(parseHtmlPage, outChSet, "Link Parsing") @@ -14,6 +12,9 @@ func parseHtmlPage(node *Node, outChSet *NodeChSet) { node.Body.Close() } }() + if node == nil || node.Body == nil { + return + } z := html.NewTokenizer(node.Body) for { tt := z.Next() diff --git a/octopus/pipe_spl_distributor.go b/octopus/pipe_spl_distributor.go index 8465529..14649ac 100644 --- a/octopus/pipe_spl_distributor.go +++ b/octopus/pipe_spl_distributor.go @@ -3,7 +3,7 @@ package octopus // makeDistributorPipe - Distributes any node received on its listen channel // to the list of channels passed into this. // Basically this behaves like a repeater or a hub. -func (o *octopus) makeDistributorPipe(outChSetList ... *NodeChSet) ( +func (o *octopus) makeDistributorPipe(outChSetList ...*NodeChSet) ( listenChSet *NodeChSet) { listenCh := make(chan *Node) listenQuitCh := make(chan int, 1) @@ -18,9 +18,7 @@ func (o *octopus) makeDistributorPipe(outChSetList ... *NodeChSet) ( } func distribute(listenCh chan *Node, listenQuitCh chan int, - outChSetList ... 
*NodeChSet) { - defer close(listenCh) - defer close(listenQuitCh) + outChSetList ...*NodeChSet) { for { select { case node := <-listenCh: @@ -32,13 +30,12 @@ func distribute(listenCh chan *Node, listenQuitCh chan int, } } case <-listenQuitCh: - { - for _, outChSet := range outChSetList { - if outChSet != nil { - outChSet.QuitCh <- 1 - } + for _, outChSet := range outChSetList { + if outChSet != nil { + outChSet.QuitCh <- 1 } } + return } } } diff --git a/octopus/pipes_test.go b/octopus/pipes_test.go new file mode 100644 index 0000000..7f94912 --- /dev/null +++ b/octopus/pipes_test.go @@ -0,0 +1,63 @@ +package octopus + +import ( + "sync" + "testing" + "time" +) + +func TestMakeLinkAbsolute(t *testing.T) { + nodeCh := make(chan *Node, 1) + quitCh := make(chan int, 1) + out := MakeNodeChSet(nodeCh, quitCh) + + node := createNode("https://example.com/guide/", "../about", 2) + makeLinkAbsolute(node, out) + + select { + case got := <-nodeCh: + if got.UrlString != "https://example.com/about" { + t.Fatalf("expected absolute url, got %q", got.UrlString) + } + case <-time.After(500 * time.Millisecond): + t.Fatal("timed out waiting for output node") + } +} + +func TestFilterDuplicates(t *testing.T) { + nodeCh := make(chan *Node, 2) + quitCh := make(chan int, 1) + o := &octopus{visited: new(sync.Map)} + out := MakeNodeChSet(nodeCh, quitCh) + + node := createNode("", "https://example.com", 1) + o.filterDuplicates(node, out) + o.filterDuplicates(node, out) + + select { + case <-nodeCh: + default: + t.Fatal("expected first node to pass duplicate filter") + } + + select { + case <-nodeCh: + t.Fatal("expected second node to be filtered") + default: + } +} + +func TestConnectWithTimeout(t *testing.T) { + listenNodeCh := make(chan *Node, 1) + listenQuitCh := make(chan int, 1) + outNodeCh := make(chan *Node, 1) + outQuitCh := make(chan int, 1) + + go connectWithTimeout(listenNodeCh, listenQuitCh, MakeNodeChSet(outNodeCh, outQuitCh), 25*time.Millisecond) + + select { + case 
<-outQuitCh: + case <-time.After(200 * time.Millisecond): + t.Fatal("expected timeout quit signal") + } +} diff --git a/octopus/setup.go b/octopus/setup.go index 733ca81..c0612ce 100644 --- a/octopus/setup.go +++ b/octopus/setup.go @@ -1,7 +1,6 @@ package octopus import ( - "log" "time" "golang.org/x/time/rate" @@ -25,7 +24,7 @@ func (o *octopus) setupTimeToQuit() { if o.TimeToQuit > 0 { o.timeToQuit = time.Duration(o.TimeToQuit) * time.Second } else { - log.Fatalln("TimeToQuit is not greater than 0") + panic("TimeToQuit is not greater than 0") } } diff --git a/octopus/setup_test.go b/octopus/setup_test.go new file mode 100644 index 0000000..3210ffb --- /dev/null +++ b/octopus/setup_test.go @@ -0,0 +1,50 @@ +package octopus + +import ( + "testing" + + "golang.org/x/time/rate" +) + +func TestValidateRateLimits(t *testing.T) { + tests := []struct { + name string + rate rate.Limit + burst rate.Limit + wantPanic bool + }{ + {name: "disabled", rate: -1, burst: -1, wantPanic: false}, + {name: "valid", rate: 5, burst: 8, wantPanic: false}, + {name: "zero rate", rate: 0, burst: 1, wantPanic: true}, + {name: "burst lower than rate", rate: 8, burst: 5, wantPanic: true}, + {name: "negative rate with positive burst", rate: -1, burst: 1, wantPanic: true}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + defer func() { + recovered := recover() + if tt.wantPanic && recovered == nil { + t.Fatal("expected panic") + } + if !tt.wantPanic && recovered != nil { + t.Fatalf("unexpected panic: %v", recovered) + } + }() + + validateRateLimits(tt.rate, tt.burst) + }) + } +} + +func TestSetupTimeToQuitPanicsOnInvalidValue(t *testing.T) { + o := NewWithDefaultOptions() + o.TimeToQuit = 0 + + defer func() { + if recover() == nil { + t.Fatal("expected setupTimeToQuit to panic when TimeToQuit <= 0") + } + }() + o.setupTimeToQuit() +} From e6a28623c4690e0d326d42098d0d3b884809718a Mon Sep 17 00:00:00 2001 From: Rahul Thomas Date: Mon, 9 Feb 2026 22:00:02 -0800 Subject: 
[PATCH 2/3] change version from 1.4.1 to 1.3.0 --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index 347f583..f0bb29e 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.4.1 +1.3.0 From 4b79f7d75708647b131387edf9465f8ec3b42e69 Mon Sep 17 00:00:00 2001 From: Rahul Thomas Date: Mon, 9 Feb 2026 22:13:07 -0800 Subject: [PATCH 3/3] Update CHANGELOG.md to reflect recent changes Removed outdated version entries and consolidated changelog to focus on the latest updates. --- CHANGELOG.md | 29 ++++++----------------------- 1 file changed, 6 insertions(+), 23 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c5ff5f0..93847cb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,35 +2,22 @@ All notable changes to this project are documented in this file. -## [1.4.1] - 2026-02-10 - +## [1.3.0] - 2026-02-10 ### Changed - Updated publish automation to run **only** when a GitHub Release is published (not on every tag push). - Added strict release-tag validation in publish workflow and kept VERSION/tag consistency checks. - -## [1.4.0] - 2026-02-10 - -### Added -- Added GitHub Actions CI workflow for pull requests and default-branch pushes with module, formatting, vet, static analysis, test, and race checks. -- Added GitHub Actions publish workflow on version tags to trigger Go proxy and pkg.go.dev indexing. - -### Changed - Removed legacy Travis CI configuration in favor of GitHub Actions. - -## [1.3.0] - 2026-02-10 - -### Added -- Added parser-focused unit coverage to validate anchor extraction behavior. - -### Changed - Restored robust HTML parsing using `golang.org/x/net/html` tokenizer instead of regex-based extraction. - Restored mature rate limiting with `golang.org/x/time/rate` for predictable throttling semantics. - Added module `replace` directives to GitHub mirrors for `golang.org/x/*` to improve fetch reliability in restricted environments. 
- Fixed distributor shutdown to stop forwarding after quit signal without closing inbound channels owned by upstream producers. - -## [1.1.0] - 2026-02-10 +- Replaced `log.Fatal` behavior in core library paths with non-process-terminating logic (`panic` for invalid setup value and early return for nil or file-open failure paths). +- Improved file adapter open flags to use write-only, create, and truncate semantics for predictable output. ### Added +- Added GitHub Actions CI workflow for pull requests and default-branch pushes with module, formatting, vet, static analysis, test, and race checks. +- Added GitHub Actions publish workflow that runs on published GitHub Releases to trigger Go proxy and pkg.go.dev indexing. +- Added parser-focused unit coverage to validate anchor extraction behavior. - Go module support (`go.mod`) with explicit dependency versions. - Unit tests for crawler defaults and factory helpers. - Unit tests for rate-limit validation and timeout setup behavior. @@ -38,9 +25,5 @@ All notable changes to this project are documented in this file. - Unit test coverage for `adapter.FileWriterAdapter` write behavior. - Comprehensive README covering installation, architecture, configuration, adapters, testing, and release policy. -### Changed -- Replaced `log.Fatal` behavior in core library paths with non-process-terminating logic (`panic` for invalid setup value and early return for nil or file-open failure paths). -- Improved file adapter open flags to use write-only, create, and truncate semantics for predictable output. - ### Notes - This version focuses on modernization and maintainability without changing the fundamental crawler pipeline design.