diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000..a8b1490
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,68 @@
+name: CI
+
+on:
+ pull_request:
+ types: [opened, synchronize, reopened, ready_for_review]
+ push:
+ branches: [main, master]
+
+permissions:
+ contents: read
+
+concurrency:
+ group: ci-${{ github.workflow }}-${{ github.ref }}
+ cancel-in-progress: ${{ github.event_name == 'pull_request' }}
+
+jobs:
+ test-and-lint:
+ runs-on: ubuntu-latest
+ timeout-minutes: 20
+
+ strategy:
+ fail-fast: false
+ matrix:
+ go-version: ['1.24.x', '1.25.x']
+
+ steps:
+ - name: Checkout
+ uses: actions/checkout@v4
+
+ - name: Setup Go
+ uses: actions/setup-go@v5
+ with:
+ go-version: ${{ matrix.go-version }}
+ cache: true
+
+ - name: Verify module graph
+ if: matrix.go-version == '1.24.x'
+ run: |
+ go mod tidy
+ git diff --exit-code -- go.mod go.sum
+
+ - name: Format check
+ run: |
+ unformatted=$(gofmt -l $(git ls-files '*.go'))
+ if [ -n "$unformatted" ]; then
+ echo "These files are not gofmt-formatted:"
+ echo "$unformatted"
+ exit 1
+ fi
+
+ - name: Vet
+ run: go vet ./...
+
+ - name: Staticcheck
+ if: matrix.go-version == '1.24.x'
+ run: |
+ go install honnef.co/go/tools/cmd/staticcheck@v0.6.1
+ TOOLBIN="$(go env GOBIN)"
+ if [ -z "$TOOLBIN" ]; then
+ TOOLBIN="$(go env GOPATH)/bin"
+ fi
+ "$TOOLBIN/staticcheck" ./...
+
+ - name: Unit tests
+ run: go test ./...
+
+ - name: Race tests
+ run: go test -race ./...
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
new file mode 100644
index 0000000..0983dbe
--- /dev/null
+++ b/.github/workflows/publish.yml
@@ -0,0 +1,91 @@
+name: Publish
+
+on:
+ release:
+ types: [published]
+
+permissions:
+ contents: read
+
+jobs:
+ publish-go-module:
+ if: github.event.release.prerelease == false
+ runs-on: ubuntu-latest
+ timeout-minutes: 10
+
+ steps:
+ - name: Checkout release tag
+ uses: actions/checkout@v4
+ with:
+ ref: refs/tags/${{ github.event.release.tag_name }}
+ fetch-depth: 0
+
+ - name: Setup Go
+ uses: actions/setup-go@v5
+ with:
+ go-version: '1.24.x'
+
+ - name: Validate release tag format and VERSION match
+ run: |
+ TAG="${{ github.event.release.tag_name }}"
+ case "$TAG" in
+ v*) ;;
+ *)
+ echo "Release tag must start with 'v' (got: $TAG)"
+ exit 1
+ ;;
+ esac
+
+ TAG_NO_V="${TAG#v}"
+ VERSION_FILE=$(tr -d '[:space:]' < VERSION)
+ if ! test "$TAG_NO_V" = "$VERSION_FILE"; then
+ echo "Release tag version does not match VERSION file"
+ echo " tag version (without 'v'): $TAG_NO_V"
+ echo " VERSION file: $VERSION_FILE"
+ exit 1
+ fi
+
+ - name: Verify release tag is on main
+ run: |
+ set -euo pipefail
+ git fetch origin main:refs/remotes/origin/main
+ MAIN_SHA=$(git rev-parse origin/main)
+ if ! test "${GITHUB_SHA}" = "${MAIN_SHA}"; then
+ echo "Release tag is not at main HEAD; refusing to publish"
+ echo " tag: ${{ github.event.release.tag_name }}"
+ echo " tag sha: ${GITHUB_SHA}"
+ echo " main head: ${MAIN_SHA}"
+ exit 1
+ fi
+
+ - name: Trigger Go module proxy indexing
+ env:
+ MODULE: github.com/rapidclock/web-octopus
+ run: |
+ set -euo pipefail
+ VERSION="${{ github.event.release.tag_name }}"
+ URL="https://proxy.golang.org/${MODULE}/@v/${VERSION}.info"
+ for i in 1 2 3 4 5; do
+ if curl -fsSL "$URL" >/dev/null; then
+ exit 0
+ fi
+ sleep $((i * 2))
+ done
+ echo "failed to fetch module info from proxy after retries"
+ exit 1
+
+ - name: Trigger pkg.go.dev refresh
+ env:
+ MODULE: github.com/rapidclock/web-octopus
+ run: |
+ set -euo pipefail
+ VERSION="${{ github.event.release.tag_name }}"
+ URL="https://pkg.go.dev/fetch/${MODULE}@${VERSION}"
+ for i in 1 2 3 4 5; do
+ if curl -fsSL -X POST "$URL" >/dev/null; then
+ exit 0
+ fi
+ sleep $((i * 2))
+ done
+ echo "failed to trigger pkg.go.dev refresh after retries"
+ exit 1
diff --git a/.travis.yml b/.travis.yml
deleted file mode 100644
index 8102e24..0000000
--- a/.travis.yml
+++ /dev/null
@@ -1,13 +0,0 @@
-sudo: false
-
-language: go
-go:
- - "1.11"
-
-notifications:
- email: false
-
-script:
- - go doc octopus
- - go doc adapter
- - go test -v ./...
\ No newline at end of file
diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 0000000..5cdbe49
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,31 @@
+# Changelog
+
+All notable changes to this project are documented in this file.
+
+## [1.3.0] - 2026-02-10
+
+### Added
+- Added GitHub Actions CI workflow for pull requests and default-branch pushes with module, formatting, vet, static analysis, test, and race checks.
+- Added GitHub Actions publish workflow driven by GitHub Release publication to trigger Go proxy and pkg.go.dev indexing.
+
+### Changed
+- Removed legacy Travis CI configuration in favor of GitHub Actions.
+- Hardened CI reliability by adding workflow concurrency cancellation, running module/static checks on the primary Go version, and pinning staticcheck.
+- Hardened publish reliability by validating release tag format against `VERSION`, checking out the exact release tag, skipping prereleases, and adding retry logic for Go proxy/pkg.go.dev refresh calls.
+
+## [1.1.0] - 2026-02-10
+
+### Added
+- Go module support (`go.mod`) with explicit dependency versions.
+- Unit tests for crawler defaults and factory helpers.
+- Unit tests for rate-limit validation and timeout setup behavior.
+- Unit tests for pipeline helper behavior (link absolution, duplicate filter, timeout propagation).
+- Unit test coverage for `adapter.FileWriterAdapter` write behavior.
+- Comprehensive README covering installation, architecture, configuration, adapters, testing, and release policy.
+
+### Changed
+- Replaced `log.Fatal` behavior in core library paths with non-process-terminating logic (`panic` for invalid setup value and early return for nil or file-open failure paths).
+- Improved file adapter open flags to use write-only, create, and truncate semantics for predictable output.
+
+### Notes
+- This version focuses on modernization and maintainability without changing the fundamental crawler pipeline design.
diff --git a/README.md b/README.md
index d6022ac..1e18f60 100644
--- a/README.md
+++ b/README.md
@@ -1,25 +1,46 @@
# web-octopus
-[](https://godoc.org/github.com/rapidclock/web-octopus)
-[](https://travis-ci.com/rapidclock/web-octopus)
-
-A concurent web crawler written in Go.
+[![Go Reference](https://pkg.go.dev/badge/github.com/rapidclock/web-octopus.svg)](https://pkg.go.dev/github.com/rapidclock/web-octopus@v1.3.0)
-## Install
- go get github.com/rapidclock/web-octopus/octopus
- go get github.com/rapidclock/web-octopus/adapter
+A concurrent, channel-pipeline web crawler in Go.
-## Current Features:
-- Depth Limited Crawling
-- User specified valid protocols
-- User buildable adapters that the crawler feeds output to.
-- Filter Duplicates. (Default, Non-Customizable)
-- Filter URLs that fail a HEAD request. (Default, Non-Customizable)
-- User specifiable max timeout between two successive url requests.
-- Max Number of Links to be crawled.
+> This release modernizes the project for current Go module workflows, testing expectations, and maintainability standards.
+## Table of contents
-### Sample Implementation Snippet
+- [Highlights](#highlights)
+- [Installation](#installation)
+- [Quick start](#quick-start)
+- [Architecture](#architecture)
+- [Configuration reference](#configuration-reference)
+- [Output adapters](#output-adapters)
+- [Testing](#testing)
+- [Versioning and release](#versioning-and-release)
+- [Compatibility notes](#compatibility-notes)
+
+## Highlights
+
+- Uses Go modules (`go.mod`) instead of legacy `go get`-only workflow.
+- Includes automated unit tests for crawler defaults, validation behavior, pipeline helpers, and adapter output.
+- Improved adapter safety around file handling and error paths.
+- Expanded docs with architecture details and operational guidance.
+
+## Installation
+
+```bash
+go get github.com/rapidclock/web-octopus@v1.3.0
+```
+
+Import packages:
+
+```go
+import (
+ "github.com/rapidclock/web-octopus/adapter"
+ "github.com/rapidclock/web-octopus/octopus"
+)
+```
+
+## Quick start
```go
package main
@@ -31,134 +52,128 @@ import (
func main() {
opAdapter := &adapter.StdOpAdapter{}
-
+
options := octopus.GetDefaultCrawlOptions()
options.MaxCrawlDepth = 3
options.TimeToQuit = 10
options.CrawlRatePerSec = 5
options.CrawlBurstLimitPerSec = 8
options.OpAdapter = opAdapter
-
+
crawler := octopus.New(options)
crawler.SetupSystem()
crawler.BeginCrawling("https://www.example.com")
}
```
-### List of customizations
+## Architecture
+
+`web-octopus` uses a staged channel pipeline. Nodes (URLs + metadata) flow through filter and processing stages:
+
+1. Ingest
+2. Link absolution
+3. Protocol filter
+4. Duplicate filter
+5. URL validation (`HEAD`)
+6. Optional rate limiter
+7. Page requisition (`GET`)
+8. Distributor
+ - Output adapter stream
+ - Max delay watchdog stream
+9. Max crawled links limiter (optional)
+10. Crawl depth filter
+11. HTML parsing back into ingest
+
+This design allows localized extension by replacing adapters and modifying options, while preserving high concurrency.
+
+## Configuration reference
-Customizations can be made by supplying the crawler an instance of `CrawlOptions`. The basic structure is shown below, with a brief explanation for each option.
+`CrawlOptions` controls crawler behavior:
+
+- `MaxCrawlDepth int64` — max depth for crawled nodes.
+- `MaxCrawledUrls int64` — max total unique URLs; `-1` means unlimited.
+- `CrawlRatePerSec int64` — request rate limit, negative to disable.
+- `CrawlBurstLimitPerSec int64` — burst capacity for rate limiting.
+- `IncludeBody bool` — include body in crawled node (currently internal pipeline behavior).
+- `OpAdapter OutputAdapter` — required output sink.
+- `ValidProtocols []string` — accepted URL schemes (e.g., `http`, `https`).
+- `TimeToQuit int64` — max idle seconds before automatic quit.
+
+### Defaults
+
+Use:
```go
-type CrawlOptions struct {
- MaxCrawlDepth int64 // Max Depth of Crawl, 0 is the initial link.
- MaxCrawledUrls int64 // Max number of links to be crawled in total.
- StayWithinBaseHost bool // [Not-Implemented-Yet]
- CrawlRatePerSec int64 // Max Rate at which requests can be made (req/sec).
- CrawlBurstLimitPerSec int64 // Max Burst Capacity (should be atleast the crawl rate).
- RespectRobots bool // [Not-Implemented-Yet]
- IncludeBody bool // Include the Request Body (Contents of the web page) in the result of the crawl.
- OpAdapter OutputAdapter // A user defined crawl output handler (See next section for info).
- ValidProtocols []string // Valid protocols to crawl (http, https, ftp, etc.)
- TimeToQuit int64 // Timeout (seconds) between two attempts or requests, before the crawler quits.
-}
+opts := octopus.GetDefaultCrawlOptions()
```
-A default instance of the `CrawlOptions` can be obtained by calling `octopus.GetDefaultCrawlOptions()`. This can be further customized by overriding individual properties.
-
-**NOTE:** If rate-limiting is not required, then just ignore(don't set value) both `CrawlRatePerSec` and `CrawlBurstLimitPerSec` in the `CrawlOptions`.
+Default values are tuned for local experimentation:
-### Output Adapters
+- Depth: `2`
+- Max links: `-1` (unbounded)
+- Rate limit: disabled
+- Protocols: `http`, `https`
+- Timeout gap: `30s`
-An Output Adapter is the final destination of a crawler processed request. The output of the crawler is fed here, according to the customizations made before starting the crawler through the `CrawlOptions` attached to the crawler.
+## Output adapters
-The `OutputAdapter` is a Go Interface, that has to be implemented by your(user-defined) processor.
+The crawler emits processed nodes through the `OutputAdapter` interface:
```go
type OutputAdapter interface {
- Consume() *NodeChSet
+ Consume() *NodeChSet
}
```
-The user has to implement the `Consume()` method that returns a __*pointer*__ to a `NodeChSet`. The `NodeChSet` is described below. The crawler uses the returned channel to send the crawl output. The user can start listening for output from the crawler.
+### Built-in adapters
-**Note** : If the user chooses to implement their custom `OutputAdapter` **REMEMBER** to listen for the output on another go-routine. Otherwise you might block the crawler from running. Atleast begin the crawling on another go-routine before you begin processing output.
+1. `adapter.StdOpAdapter`
+ - Prints `count - depth - URL` to stdout.
+2. `adapter.FileWriterAdapter`
+ - Writes `depth - URL` lines to a file.
-The structure of the `NodeChSet` is given below.
+### Writing a custom adapter
-```go
-type NodeChSet struct {
- NodeCh chan<- *Node
- *StdChannels
-}
+Create channels, return `*octopus.NodeChSet`, and consume nodes in a goroutine. Always handle quit signals to avoid goroutine leaks.
-type StdChannels struct {
- QuitCh chan<- int
-}
+## Testing
-type Node struct {
- *NodeInfo
- Body io.ReadCloser
-}
+Run the full test suite:
-type NodeInfo struct {
- ParentUrlString string
- UrlString string
- Depth int64
-}
+```bash
+go test ./...
```
-You can use the utility function `MakeDefaultNodeChSet()` to get a `NodeChSet` built for you. This also returns the `Node` and quit channels. Example given below:
+Recommended local checks before release:
-```go
-var opNodeChSet *NodeChSet
-var nodeCh chan *Node
-var quitCh chan int
-// above to demo the types. One can easily use go lang type erasure.
-opNodeChSet, nodeCh, quitCh = MakeDefaultNodeChSet()
+```bash
+go test ./... -race
+go vet ./...
```
-The user should supply the custom OutputAdapter as an argument to the `CrawlOptions`.
-#### Default Output Adapters:
+## CI/CD
-We supply two default Adapters for you to try out. They are not meant to be feature rich, but you can still use them. Their primary purpose is meant to be a demonstration of how to build and use a `OutputAdapter`.
+This repository uses GitHub Actions (not Travis CI):
-1. `adapter.StdOpAdapter` : Writes the crawled output (only links, not body) to the standard output.
-1. `adapter.FileWriterAdapter` : Writes the crawled output (only links, not body) to a supplied file.
+- **CI workflow** (`.github/workflows/ci.yml`) runs automatically on PR open/sync/reopen and on pushes to the default branch. It validates module tidiness, formatting, vet/staticcheck, and test suites (including race detection).
+- **Publish workflow** (`.github/workflows/publish.yml`) runs only when a GitHub **Release** is published (excluding prereleases), validates tag/version alignment, and triggers indexing on both the Go proxy and pkg.go.dev so new versions are discoverable quickly.
-#### Implementation of the `adapter.StdOpAdapter`:
-We have supplied the implementation of `adapter.StdOpAdapter` below to get a rough idea of what goes into building your own adapter.
+Release flow:
-```go
-// StdOpAdapter is an output adapter that just prints the output onto the
-// screen.
-//
-// Sample Output Format is:
-// LinkNum - Depth - Url
-type StdOpAdapter struct{}
-
-func (s *StdOpAdapter) Consume() *oct.NodeChSet {
- listenCh := make(chan *oct.Node)
- quitCh := make(chan int, 1)
- listenChSet := &oct.NodeChSet{
- NodeCh: listenCh,
- StdChannels: &oct.StdChannels{
- QuitCh: quitCh,
- },
- }
- go func() {
- i := 1
- for {
- select {
- case output := <-listenCh:
- fmt.Printf("%d - %d - %s\n", i, output.Depth, output.UrlString)
- i++
- case <-quitCh:
- return
- }
- }
- }()
- return listenChSet
-}
-```
+1. Update `VERSION` and `CHANGELOG.md`.
+2. Merge to default branch.
+3. Create and push tag `vX.Y.Z` matching `VERSION`.
+4. Publish a GitHub Release for that tag.
+5. GitHub Actions publish workflow handles Go portal refresh calls.
+
+## Versioning and release
+
+- Project follows semantic versioning.
+- Current release in this repository: **v1.3.0**.
+- See `CHANGELOG.md` for release notes.
+
+## Compatibility notes
+
+- Legacy examples using old `go get` package paths still map to the same module path.
+- Existing adapters remain source-compatible.
diff --git a/VERSION b/VERSION
new file mode 100644
index 0000000..f0bb29e
--- /dev/null
+++ b/VERSION
@@ -0,0 +1 @@
+1.3.0
diff --git a/adapter/basicadapters.go b/adapter/basicadapters.go
index f9a9e0e..e79e82e 100644
--- a/adapter/basicadapters.go
+++ b/adapter/basicadapters.go
@@ -13,7 +13,8 @@ import (
// screen.
//
// Sample Output Format is:
-// LinkNum - Depth - Url
+//
+// LinkNum - Depth - Url
type StdOpAdapter struct{}
func (s *StdOpAdapter) Consume() *oct.NodeChSet {
@@ -43,7 +44,8 @@ func (s *StdOpAdapter) Consume() *oct.NodeChSet {
// FileWriterAdapter is an output adapter that writes the output to a
// specified file.
// Sample Output Format is:
-// Depth - Url
+//
+// Depth - Url
type FileWriterAdapter struct {
FilePath string
}
@@ -65,11 +67,26 @@ func (fw *FileWriterAdapter) writeToFile(listenCh chan *oct.Node,
quitCh chan int) {
fp, err := fw.getFilePointer()
if err != nil {
- fp.Close()
- log.Fatal(err)
+ log.Printf("failed to open output file %q: %v", fw.FilePath, err)
+ // Start a fallback goroutine to drain listenCh so producers do not block.
+ go func() {
+ for {
+ select {
+ case <-listenCh:
+ // Discard messages; file is not available.
+ case <-quitCh:
+ return
+ }
+ }
+ }()
+ return
}
go func() {
- defer fp.Close()
+ defer func() {
+ if err := fp.Close(); err != nil {
+ log.Printf("failed to close output file %q: %v", fw.FilePath, err)
+ }
+ }()
for {
select {
case output := <-listenCh:
@@ -86,6 +103,6 @@ func (fw *FileWriterAdapter) writeToFile(listenCh chan *oct.Node,
}
func (fw *FileWriterAdapter) getFilePointer() (w io.WriteCloser, err error) {
- w, err = os.OpenFile(fw.FilePath, os.O_RDWR|os.O_CREATE, 0644)
+ w, err = os.OpenFile(fw.FilePath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o644)
return
}
diff --git a/adapter/basicadapters_test.go b/adapter/basicadapters_test.go
new file mode 100644
index 0000000..3044ac9
--- /dev/null
+++ b/adapter/basicadapters_test.go
@@ -0,0 +1,35 @@
+package adapter
+
+import (
+ "os"
+ "path/filepath"
+ "testing"
+ "time"
+
+ oct "github.com/rapidclock/web-octopus/octopus"
+)
+
+func TestFileWriterAdapterWritesOutput(t *testing.T) {
+ tmpDir := t.TempDir()
+ filePath := filepath.Join(tmpDir, "crawl.log")
+
+ adapter := &FileWriterAdapter{FilePath: filePath}
+ chSet := adapter.Consume()
+ chSet.NodeCh <- &oct.Node{NodeInfo: &oct.NodeInfo{Depth: 2, UrlString: "https://example.com"}}
+ chSet.QuitCh <- 1
+
+ deadline := time.Now().Add(500 * time.Millisecond)
+ for {
+ data, err := os.ReadFile(filePath)
+ if err == nil && len(data) > 0 {
+ if got := string(data); got != "2 - https://example.com\n" {
+ t.Fatalf("unexpected file content: %q", got)
+ }
+ break
+ }
+ if time.Now().After(deadline) {
+ t.Fatalf("timed out waiting for file content: %v", err)
+ }
+ time.Sleep(10 * time.Millisecond)
+ }
+}
diff --git a/adapter/doc.go b/adapter/doc.go
index 2eac6a3..16d0516 100644
--- a/adapter/doc.go
+++ b/adapter/doc.go
@@ -8,5 +8,5 @@ screen). The FileWriterAdapter prints the output to a specified File.
Both can be used as an OutputAdapter as part of the octopus crawler's
CrawlOptions.
- */
-package adapter
\ No newline at end of file
+*/
+package adapter
diff --git a/go.mod b/go.mod
new file mode 100644
index 0000000..050a487
--- /dev/null
+++ b/go.mod
@@ -0,0 +1,8 @@
+module github.com/rapidclock/web-octopus
+
+go 1.24.0
+
+require (
+ golang.org/x/net v0.50.0
+ golang.org/x/time v0.14.0
+)
diff --git a/go.sum b/go.sum
new file mode 100644
index 0000000..9100c1e
--- /dev/null
+++ b/go.sum
@@ -0,0 +1,4 @@
+golang.org/x/net v0.50.0 h1:ucWh9eiCGyDR3vtzso0WMQinm2Dnt8cFMuQa9K33J60=
+golang.org/x/net v0.50.0/go.mod h1:UgoSli3F/pBgdJBHCTc+tp3gmrU4XswgGRgtnwWTfyM=
+golang.org/x/time v0.14.0 h1:MRx4UaLrDotUKUdCIqzPC48t1Y9hANFKIRpNx+Te8PI=
+golang.org/x/time v0.14.0/go.mod h1:eL/Oa2bBBK0TkX57Fyni+NgnyQQN4LitPmob2Hjnqw4=
diff --git a/octopus/doc.go b/octopus/doc.go
index 8a758eb..3a80a15 100644
--- a/octopus/doc.go
+++ b/octopus/doc.go
@@ -4,35 +4,35 @@ The octopus uses a pipeline of channels to implement a non-blocking web crawler.
The octopus also provides user configurable options that can be used to
customize the behaviour of the crawler.
-Features
+# Features
Current Features of the crawler include:
- 1. User specifiable Depth Limited Crawling
- 2. User specified valid protocols
- 3. User buildable adapters that the crawler feeds output to.
- 4. Filter Duplicates.
- 5. Filter URLs that fail a HEAD request.
- 6. User specifiable max timeout between two successive url requests.
- 7. User specifiable Max Number of Links to be crawled.
+ 1. User specifiable Depth Limited Crawling
+ 2. User specified valid protocols
+ 3. User buildable adapters that the crawler feeds output to.
+ 4. Filter Duplicates.
+ 5. Filter URLs that fail a HEAD request.
+ 6. User specifiable max timeout between two successive url requests.
+ 7. User specifiable Max Number of Links to be crawled.
-
-Pipeline Overview
+# Pipeline Overview
The overview of the Pipeline is given below:
- 1. Ingest
- 2. Link Absolution
- 3. Protocol Filter
- 4. Duplicate Filter
- 5. Invalid Url Filter (Urls whose HEAD request Fails)
- (5x) (Optional) Crawl Rate Limiter.
- [6]. Make GET Request
- 7a. Send to Output Adapter
- 7b. Check for Timeout (gap between two output on this channel).
- 8. Max Links Crawled Limit Filter
- 9. Depth Limit Filter
- 10. Parse Page for more URLs.
+ 1. Ingest
+ 2. Link Absolution
+ 3. Protocol Filter
+ 4. Duplicate Filter
+ 5. Invalid Url Filter (Urls whose HEAD request Fails)
+ (5x) (Optional) Crawl Rate Limiter.
+ [6]. Make GET Request
+ 7a. Send to Output Adapter
+ 7b. Check for Timeout (gap between two output on this channel).
+ 8. Max Links Crawled Limit Filter
+ 9. Depth Limit Filter
+ 10. Parse Page for more URLs.
Note: The output from 7b. is fed to 8.
+
1 -> 2 -> 3 -> 4 -> 5 -> (5x) -> [6] -> 7b -> 8 -> 9 -> 10 -> 1
*/
package octopus
diff --git a/octopus/htmlparse_test.go b/octopus/htmlparse_test.go
new file mode 100644
index 0000000..1dba189
--- /dev/null
+++ b/octopus/htmlparse_test.go
@@ -0,0 +1,35 @@
+package octopus
+
+import (
+ "io"
+ "strings"
+ "testing"
+ "time"
+)
+
+func TestParseHtmlPageExtractsAnchorLinks(t *testing.T) {
+ outNodeCh := make(chan *Node, 2)
+ outQuitCh := make(chan int, 1)
+ out := MakeNodeChSet(outNodeCh, outQuitCh)
+
+ node := &Node{
+ NodeInfo: &NodeInfo{UrlString: "https://example.com", Depth: 1},
+ Body: io.NopCloser(strings.NewReader(`