From 87cdb117ba350d947d52879b724de4f7450445ab Mon Sep 17 00:00:00 2001 From: Arnab Chatterjee Date: Tue, 4 Nov 2025 18:07:28 +0530 Subject: [PATCH 1/8] added support for huggingface dataset repositories Signed-off-by: Arnab Chatterjee --- pkg/cmd/kitimport/cmd.go | 2 +- pkg/cmd/kitimport/hfimport.go | 6 +++--- pkg/cmd/kitimport/util.go | 31 +++++++++++++++++++++++-------- pkg/cmd/kitimport/util_test.go | 26 ++++++++++++++++---------- pkg/lib/hf/download.go | 13 +++++++++++-- pkg/lib/hf/list.go | 15 ++++++++++++--- 6 files changed, 66 insertions(+), 27 deletions(-) diff --git a/pkg/cmd/kitimport/cmd.go b/pkg/cmd/kitimport/cmd.go index ea100e92..cf6add5c 100644 --- a/pkg/cmd/kitimport/cmd.go +++ b/pkg/cmd/kitimport/cmd.go @@ -130,7 +130,7 @@ func (opts *importOptions) complete(ctx context.Context, args []string) error { opts.repo = args[0] if opts.tag == "" { - tag, err := extractRepoFromURL(opts.repo) + tag, _, err := extractRepoFromURL(opts.repo) if err != nil { output.Errorf("Could not generate tag from URL: %s", err) return fmt.Errorf("use flag --tag to set a tag for ModelKit") diff --git a/pkg/cmd/kitimport/hfimport.go b/pkg/cmd/kitimport/hfimport.go index 4ab76d07..d1411249 100644 --- a/pkg/cmd/kitimport/hfimport.go +++ b/pkg/cmd/kitimport/hfimport.go @@ -37,7 +37,7 @@ import ( func importUsingHF(ctx context.Context, opts *importOptions) error { // Handle full HF URLs by extracting repository name from URL - repo, err := extractRepoFromURL(opts.repo) + repo, kind, err := extractRepoFromURL(opts.repo) if err != nil { return fmt.Errorf("could not process URL %s: %w", opts.repo, err) } @@ -53,7 +53,7 @@ func importUsingHF(ctx context.Context, opts *importOptions) error { } }() - dirListing, err := hf.ListFiles(ctx, repo, opts.repoRef, opts.token) + dirListing, err := hf.ListFiles(ctx, repo, opts.repoRef, opts.token, kind) if err != nil { return fmt.Errorf("failed to list files from HuggingFace API: %w", err) } @@ -106,7 +106,7 @@ func importUsingHF(ctx context.Context, opts *importOptions) error { if err != nil { return err } - if err := hf.DownloadFiles(ctx, repo, opts.repoRef, tmpDir, toDownload, opts.token, opts.concurrency); err != nil { + if err := hf.DownloadFiles(ctx, repo, opts.repoRef, tmpDir, toDownload, opts.token, opts.concurrency, kind); err != nil { return fmt.Errorf("error downloading repository: %w", err) } diff --git a/pkg/cmd/kitimport/util.go b/pkg/cmd/kitimport/util.go index 5e64d071..1358f99c 100644 --- a/pkg/cmd/kitimport/util.go +++ b/pkg/cmd/kitimport/util.go @@ -163,20 +163,35 @@ func getEditorName() (string, error) { } // extractRepoFromURL attempts to normalize a string or URL into a repository name as is used on GitHub and Huggingface. -// Returns an error we cannot automatically handle the input URL/string. -// - https://example.com/segment1/segment2 --> segment1/segment2 -// - 'organization/repository' --> 'organization/repository' -func extractRepoFromURL(rawUrl string) (string, error) { +// Returns the normalized repository path (org/repo), the kind ("model" or "dataset"), and an error if parsing fails. +// - https://huggingface.co/org/repo --> (org/repo, "model", nil) +// - https://huggingface.co/datasets/org/repo --> (org/repo, "dataset", nil) +// - 'organization/repository' --> (organization/repository, "model", nil) +// - 'datasets/organization/repository' --> (organization/repository, "dataset", nil) +func extractRepoFromURL(rawUrl string) (string, string, error) { u, err := url.Parse(rawUrl) if err != nil { - return "", fmt.Errorf("failed to parse url: %w", err) + return "", "", fmt.Errorf("failed to parse url: %w", err) } path := strings.Trim(u.Path, "/") segments := strings.Split(path, "/") - if len(segments) != 2 { - return "", fmt.Errorf("could not extract organization and repository from '%s'", path) + + var kind string + var normalizedPath string + + if len(segments) == 3 && segments[0] == "datasets" { + kind = "dataset" + normalizedPath = strings.Join(segments[1:], "/") + } else if len(segments) == 2 { + if segments[0] == "datasets" { + return "", "", fmt.Errorf("invalid dataset URL format: expected datasets/org/repo, got '%s'", path) + } + kind = "model" + normalizedPath = path + } else { + return "", "", fmt.Errorf("could not extract organization and repository from '%s'", path) } - return path, nil + return normalizedPath, kind, nil } diff --git a/pkg/cmd/kitimport/util_test.go b/pkg/cmd/kitimport/util_test.go index 53ff6a12..18ceffd3 100644 --- a/pkg/cmd/kitimport/util_test.go +++ b/pkg/cmd/kitimport/util_test.go @@ -26,24 +26,29 @@ import ( func TestExtractRepoFromURL(t *testing.T) { testcases := []struct { input string - expected string + expectedRepo string + expectedKind string expectErrRegexp string }{ - {input: "organization/repository", expected: "organization/repository"}, - {input: "https://example.com/org/repo", expected: "org/repo"}, - {input: "https://huggingface.co/org/repo", expected: "org/repo"}, - {input: "https://github.com/org/repo", expected: "org/repo"}, - {input: "organization/repository.with-dots.and_CAPS", expected: "organization/repository.with-dots.and_CAPS"}, - {input: "https://huggingface.co/org/trailing-slash/", expected: "org/trailing-slash"}, - {input: "https://github.com/org/repo.git", expected: "org/repo.git"}, + {input: "organization/repository", expectedRepo: "organization/repository", expectedKind: "model"}, + {input: "https://example.com/org/repo", expectedRepo: "org/repo", expectedKind: "model"}, + {input: "https://huggingface.co/org/repo", expectedRepo: "org/repo", expectedKind: "model"}, + {input: "https://github.com/org/repo", expectedRepo: "org/repo", expectedKind: "model"}, + {input: "organization/repository.with-dots.and_CAPS", expectedRepo: "organization/repository.with-dots.and_CAPS", expectedKind: "model"}, + {input: "https://huggingface.co/org/trailing-slash/", expectedRepo: "org/trailing-slash", expectedKind: "model"}, + {input: "https://github.com/org/repo.git", expectedRepo: "org/repo.git", expectedKind: "model"}, + {input: "https://huggingface.co/datasets/org/repo", expectedRepo: "org/repo", expectedKind: "dataset"}, + {input: "datasets/org/repo", expectedRepo: "org/repo", expectedKind: "dataset"}, + {input: "https://huggingface.co/datasets/organization/repository", expectedRepo: "organization/repository", expectedKind: "dataset"}, {input: ":///invalidURL", expectErrRegexp: "failed to parse url.*"}, {input: "too/many/path/segments", expectErrRegexp: "could not extract organization and repository from.*"}, {input: "https://github.com/kitops-ml/github.com/kitops-ml/kitops/tree/main", expectErrRegexp: "could not extract organization and repository from.*"}, + {input: "datasets/invalid", expectErrRegexp: "invalid dataset URL format.*"}, } for _, tt := range testcases { t.Run(fmt.Sprintf("handles %s", tt.input), func(t *testing.T) { - actual, actualErr := extractRepoFromURL(tt.input) + actualRepo, actualKind, actualErr := extractRepoFromURL(tt.input) if tt.expectErrRegexp != "" { if !assert.Error(t, actualErr) { return @@ -53,7 +58,8 @@ func TestExtractRepoFromURL(t *testing.T) { if !assert.NoError(t, actualErr) { return } - assert.Equal(t, tt.expected, actual) + assert.Equal(t, tt.expectedRepo, actualRepo) + assert.Equal(t, tt.expectedKind, actualKind) } }) } diff --git a/pkg/lib/hf/download.go b/pkg/lib/hf/download.go index 430c48bb..70dc5c10 100644 --- a/pkg/lib/hf/download.go +++ b/pkg/lib/hf/download.go @@ -35,7 +35,8 @@ import ( ) const ( - resolveURLFmt = "https://huggingface.co/%s/resolve/%s/%s" + modelResolveURLFmt = "https://huggingface.co/%s/resolve/%s/%s" + datasetResolveURLFmt = "https://huggingface.co/datasets/%s/resolve/%s/%s" ) func DownloadFiles( @@ -43,7 +44,8 @@ func DownloadFiles( modelRepo, repoRef, destDir string, files []kfgen.FileListing, token string, - maxConcurrency int) error { + maxConcurrency int, + kind string) error { client := &http.Client{ Timeout: 1 * time.Hour, @@ -55,6 +57,13 @@ func DownloadFiles( progress, plog := output.NewDownloadProgress() + var resolveURLFmt string + if kind == "dataset" { + resolveURLFmt = datasetResolveURLFmt + } else { + resolveURLFmt = modelResolveURLFmt + } + for _, f := range files { if err := sem.Acquire(errCtx, 1); err != nil { semErr = err diff --git a/pkg/lib/hf/list.go b/pkg/lib/hf/list.go index 05a360df..6d8dc44d 100644 --- a/pkg/lib/hf/list.go +++ b/pkg/lib/hf/list.go @@ -31,7 +31,8 @@ import ( ) const ( - treeURLFmt = "https://huggingface.co/api/models/%s/tree/%s" + modelTreeURLFmt = "https://huggingface.co/api/models/%s/tree/%s" + datasetTreeURLFmt = "https://huggingface.co/api/datasets/%s/tree/%s" ) type hfTreeResponse []struct { @@ -45,11 +46,19 @@ type hfErrorResponse struct { Error string `json:"error"` } -func ListFiles(ctx context.Context, modelRepo, ref string, token string) (*kfgen.DirectoryListing, error) { +func ListFiles(ctx context.Context, modelRepo, ref string, token string, kind string) (*kfgen.DirectoryListing, error) { client := &http.Client{ Timeout: 10 * time.Second, } - baseURL, err := url.Parse(fmt.Sprintf(treeURLFmt, modelRepo, ref)) + + var treeURL string + if kind == "dataset" { + treeURL = fmt.Sprintf(datasetTreeURLFmt, modelRepo, ref) + } else { + treeURL = fmt.Sprintf(modelTreeURLFmt, modelRepo, ref) + } + + baseURL, err := url.Parse(treeURL) if err != nil { return nil, fmt.Errorf("failed to parse URL: %w", err) } From 81e20e7ad751633326fcce9368e2de05ea503ab6 Mon Sep 17 00:00:00 2001 From: Arnab Chatterjee Date: Wed, 5 Nov 2025 10:19:11 +0530 Subject: [PATCH 2/8] Resolved Code Comments (Created typed enums, Hf-specific logic seperation, enums use int with iota Signed-off-by: Arnab Chatterjee --- pkg/cmd/kitimport/cmd.go | 18 +++++++---- pkg/cmd/kitimport/hfimport.go | 51 ++++++++++++++++++++++++++++-- pkg/cmd/kitimport/util.go | 33 ++++++------------- pkg/cmd/kitimport/util_test.go | 58 +++++++++++++++++++++++++--------- pkg/lib/hf/download.go | 4 +-- pkg/lib/hf/list.go | 4 +-- pkg/lib/hf/repo.go | 9 ++++++ 7 files changed, 126 insertions(+), 51 deletions(-) create mode 100644 pkg/lib/hf/repo.go diff --git a/pkg/cmd/kitimport/cmd.go b/pkg/cmd/kitimport/cmd.go index cf6add5c..2d3437cd 100644 --- a/pkg/cmd/kitimport/cmd.go +++ b/pkg/cmd/kitimport/cmd.go @@ -130,13 +130,19 @@ func (opts *importOptions) complete(ctx context.Context, args []string) error { opts.repo = args[0] if opts.tag == "" { - tag, _, err := extractRepoFromURL(opts.repo) - if err != nil { - output.Errorf("Could not generate tag from URL: %s", err) - return fmt.Errorf("use flag --tag to set a tag for ModelKit") + var tagRepo string + if repo, _, err := parseHuggingFaceRepo(opts.repo); err == nil { + tagRepo = repo + } else { + repo, err := extractRepoFromURL(opts.repo) + if err != nil { + output.Errorf("Could not generate tag from URL: %s", err) + return fmt.Errorf("use flag --tag to set a tag for ModelKit") + } + tagRepo = repo } - tag = strings.ToLower(tag) - opts.tag = fmt.Sprintf("%s:latest", tag) + tagRepo = strings.ToLower(tagRepo) + opts.tag = fmt.Sprintf("%s:latest", tagRepo) output.Infof("Using tag %s. Use flag --tag to override", opts.tag) } diff --git a/pkg/cmd/kitimport/hfimport.go b/pkg/cmd/kitimport/hfimport.go index d1411249..df0bc2b7 100644 --- a/pkg/cmd/kitimport/hfimport.go +++ b/pkg/cmd/kitimport/hfimport.go @@ -20,8 +20,10 @@ import ( "context" "errors" "fmt" + "net/url" "os" "path/filepath" + "strings" "github.com/kitops-ml/kitops/pkg/artifact" "github.com/kitops-ml/kitops/pkg/lib/constants" @@ -37,7 +39,7 @@ import ( func importUsingHF(ctx context.Context, opts *importOptions) error { // Handle full HF URLs by extracting repository name from URL - repo, kind, err := extractRepoFromURL(opts.repo) + repo, repoType, err := parseHuggingFaceRepo(opts.repo) if err != nil { return fmt.Errorf("could not process URL %s: %w", opts.repo, err) } @@ -53,7 +55,7 @@ func importUsingHF(ctx context.Context, opts *importOptions) error { } }() - dirListing, err := hf.ListFiles(ctx, repo, opts.repoRef, opts.token, kind) + dirListing, err := hf.ListFiles(ctx, repo, opts.repoRef, opts.token, mapRepoType(repoType)) if err != nil { return fmt.Errorf("failed to list files from HuggingFace API: %w", err) } @@ -106,7 +108,7 @@ func importUsingHF(ctx context.Context, opts *importOptions) error { if err != nil { return err } - if err := hf.DownloadFiles(ctx, repo, opts.repoRef, tmpDir, toDownload, opts.token, opts.concurrency, kind); err != nil { + if err := hf.DownloadFiles(ctx, repo, opts.repoRef, tmpDir, toDownload, opts.token, opts.concurrency, mapRepoType(repoType)); err != nil { return fmt.Errorf("error downloading repository: %w", err) } @@ -172,3 +174,46 @@ func kitfileHasCatchallLayer(kitfile *artifact.KitFile) bool { } return false } + +type hfRepoType int + +const ( + hfRepoTypeModel hfRepoType = iota + hfRepoTypeDataset +) + +func mapRepoType(repoType hfRepoType) hf.RepositoryType { + switch repoType { + case hfRepoTypeDataset: + return hf.RepoTypeDataset + default: + return hf.RepoTypeModel + } +} + +func parseHuggingFaceRepo(rawURL string) (string, hfRepoType, error) { + u, err := url.Parse(rawURL) + if err != nil { + return "", 0, fmt.Errorf("failed to parse url: %w", err) + } + + if u.Host != "" && !strings.Contains(u.Host, "huggingface.co") { + return "", 0, fmt.Errorf("not a Hugging Face repository") + } + + path := strings.Trim(u.Path, "/") + segments := strings.FieldsFunc(path, func(r rune) bool { return r == '/' }) + + if len(segments) >= 3 && segments[0] == "datasets" { + return strings.Join(segments[1:3], "/"), hfRepoTypeDataset, nil + } + if len(segments) == 2 && segments[0] == "datasets" { + return "", 0, fmt.Errorf("could not extract repository from path '%s'", path) + } + + if len(segments) >= 2 { + return strings.Join(segments[len(segments)-2:], "/"), hfRepoTypeModel, nil + } + + return "", 0, fmt.Errorf("could not extract repository from path '%s'", path) +} diff --git a/pkg/cmd/kitimport/util.go b/pkg/cmd/kitimport/util.go index 1358f99c..0dc1b063 100644 --- a/pkg/cmd/kitimport/util.go +++ b/pkg/cmd/kitimport/util.go @@ -163,35 +163,22 @@ func getEditorName() (string, error) { } // extractRepoFromURL attempts to normalize a string or URL into a repository name as is used on GitHub and Huggingface. -// Returns the normalized repository path (org/repo), the kind ("model" or "dataset"), and an error if parsing fails. -// - https://huggingface.co/org/repo --> (org/repo, "model", nil) -// - https://huggingface.co/datasets/org/repo --> (org/repo, "dataset", nil) -// - 'organization/repository' --> (organization/repository, "model", nil) -// - 'datasets/organization/repository' --> (organization/repository, "dataset", nil) -func extractRepoFromURL(rawUrl string) (string, string, error) { +// Returns an error we cannot automatically handle the input URL/string. +// - https://example.com/segment1/segment2 --> segment1/segment2 +// - 'organization/repository' --> 'organization/repository' +func extractRepoFromURL(rawUrl string) (string, error) { u, err := url.Parse(rawUrl) if err != nil { - return "", "", fmt.Errorf("failed to parse url: %w", err) + return "", fmt.Errorf("failed to parse url: %w", err) } path := strings.Trim(u.Path, "/") segments := strings.Split(path, "/") - - var kind string - var normalizedPath string - - if len(segments) == 3 && segments[0] == "datasets" { - kind = "dataset" - normalizedPath = strings.Join(segments[1:], "/") - } else if len(segments) == 2 { - if segments[0] == "datasets" { - return "", "", fmt.Errorf("invalid dataset URL format: expected datasets/org/repo, got '%s'", path) - } - kind = "model" - normalizedPath = path - } else { - return "", "", fmt.Errorf("could not extract organization and repository from '%s'", path) + if len(segments) != 2 { + return "", fmt.Errorf("could not extract organization and repository from '%s'", path) } - return normalizedPath, kind, nil + return path, nil } + +// AGENT_MODIFIED: Human review required before merge diff --git a/pkg/cmd/kitimport/util_test.go b/pkg/cmd/kitimport/util_test.go index 18ceffd3..36520337 100644 --- a/pkg/cmd/kitimport/util_test.go +++ b/pkg/cmd/kitimport/util_test.go @@ -26,29 +26,57 @@ import ( func TestExtractRepoFromURL(t *testing.T) { testcases := []struct { input string - expectedRepo string - expectedKind string + expected string expectErrRegexp string }{ - {input: "organization/repository", expectedRepo: "organization/repository", expectedKind: "model"}, - {input: "https://example.com/org/repo", expectedRepo: "org/repo", expectedKind: "model"}, - {input: "https://huggingface.co/org/repo", expectedRepo: "org/repo", expectedKind: "model"}, - {input: "https://github.com/org/repo", expectedRepo: "org/repo", expectedKind: "model"}, - {input: "organization/repository.with-dots.and_CAPS", expectedRepo: "organization/repository.with-dots.and_CAPS", expectedKind: "model"}, - {input: "https://huggingface.co/org/trailing-slash/", expectedRepo: "org/trailing-slash", expectedKind: "model"}, - {input: "https://github.com/org/repo.git", expectedRepo: "org/repo.git", expectedKind: "model"}, - {input: "https://huggingface.co/datasets/org/repo", expectedRepo: "org/repo", expectedKind: "dataset"}, - {input: "datasets/org/repo", expectedRepo: "org/repo", expectedKind: "dataset"}, - {input: "https://huggingface.co/datasets/organization/repository", expectedRepo: "organization/repository", expectedKind: "dataset"}, + {input: "organization/repository", expected: "organization/repository"}, + {input: "https://example.com/org/repo", expected: "org/repo"}, + {input: "https://huggingface.co/org/repo", expected: "org/repo"}, + {input: "https://github.com/org/repo", expected: "org/repo"}, + {input: "organization/repository.with-dots.and_CAPS", expected: "organization/repository.with-dots.and_CAPS"}, + {input: "https://huggingface.co/org/trailing-slash/", expected: "org/trailing-slash"}, + {input: "https://github.com/org/repo.git", expected: "org/repo.git"}, {input: ":///invalidURL", expectErrRegexp: "failed to parse url.*"}, {input: "too/many/path/segments", expectErrRegexp: "could not extract organization and repository from.*"}, {input: "https://github.com/kitops-ml/github.com/kitops-ml/kitops/tree/main", expectErrRegexp: "could not extract organization and repository from.*"}, - {input: "datasets/invalid", expectErrRegexp: "invalid dataset URL format.*"}, } for _, tt := range testcases { t.Run(fmt.Sprintf("handles %s", tt.input), func(t *testing.T) { - actualRepo, actualKind, actualErr := extractRepoFromURL(tt.input) + actual, actualErr := extractRepoFromURL(tt.input) + if tt.expectErrRegexp != "" { + if !assert.Error(t, actualErr) { + return + } + assert.Regexp(t, tt.expectErrRegexp, actualErr.Error()) + } else { + if !assert.NoError(t, actualErr) { + return + } + assert.Equal(t, tt.expected, actual) + } + }) + } +} + +func TestParseHuggingFaceRepo(t *testing.T) { + testcases := []struct { + input string + expectedRepo string + expectedType hfRepoType + expectErrRegexp string + }{ + {input: "https://huggingface.co/org/repo", expectedRepo: "org/repo", expectedType: hfRepoTypeModel}, + {input: "https://huggingface.co/datasets/org/repo", expectedRepo: "org/repo", expectedType: hfRepoTypeDataset}, + {input: "org/repo", expectedRepo: "org/repo", expectedType: hfRepoTypeModel}, + {input: "datasets/org/repo", expectedRepo: "org/repo", expectedType: hfRepoTypeDataset}, + {input: "datasets/only-one-segment", expectErrRegexp: "could not extract repository"}, + {input: "https://example.com/org/repo", expectErrRegexp: "not a Hugging Face repository"}, + } + + for _, tt := range testcases { + t.Run(fmt.Sprintf("handles %s", tt.input), func(t *testing.T) { + actualRepo, actualType, actualErr := parseHuggingFaceRepo(tt.input) if tt.expectErrRegexp != "" { if !assert.Error(t, actualErr) { return @@ -59,7 +87,7 @@ func TestExtractRepoFromURL(t *testing.T) { return } assert.Equal(t, tt.expectedRepo, actualRepo) - assert.Equal(t, tt.expectedKind, actualKind) + assert.Equal(t, tt.expectedType, actualType) } }) } diff --git a/pkg/lib/hf/download.go b/pkg/lib/hf/download.go index 70dc5c10..cb45ce94 100644 --- a/pkg/lib/hf/download.go +++ b/pkg/lib/hf/download.go @@ -45,7 +45,7 @@ func DownloadFiles( files []kfgen.FileListing, token string, maxConcurrency int, - kind string) error { + repoType RepositoryType) error { client := &http.Client{ Timeout: 1 * time.Hour, @@ -58,7 +58,7 @@ func DownloadFiles( progress, plog := output.NewDownloadProgress() var resolveURLFmt string - if kind == "dataset" { + if repoType == RepoTypeDataset { resolveURLFmt = datasetResolveURLFmt } else { resolveURLFmt = modelResolveURLFmt diff --git a/pkg/lib/hf/list.go b/pkg/lib/hf/list.go index 6d8dc44d..6db186fa 100644 --- a/pkg/lib/hf/list.go +++ b/pkg/lib/hf/list.go @@ -46,13 +46,13 @@ type hfErrorResponse struct { Error string `json:"error"` } -func ListFiles(ctx context.Context, modelRepo, ref string, token string, kind string) (*kfgen.DirectoryListing, error) { +func ListFiles(ctx context.Context, modelRepo, ref string, token string, repoType RepositoryType) (*kfgen.DirectoryListing, error) { client := &http.Client{ Timeout: 10 * time.Second, } var treeURL string - if kind == "dataset" { + if repoType == RepoTypeDataset { treeURL = fmt.Sprintf(datasetTreeURLFmt, modelRepo, ref) } else { treeURL = fmt.Sprintf(modelTreeURLFmt, modelRepo, ref) diff --git a/pkg/lib/hf/repo.go b/pkg/lib/hf/repo.go new file mode 100644 index 00000000..9ae07bf9 --- /dev/null +++ b/pkg/lib/hf/repo.go @@ -0,0 +1,9 @@ +package hf + +// RepositoryType represents the kind of Hugging Face repository. +type RepositoryType int + +const ( + RepoTypeModel RepositoryType = iota + RepoTypeDataset +) From b52bd5f340c4e20e0c87dcade4bf224a24cb59f0 Mon Sep 17 00:00:00 2001 From: Arnab Chatterjee Date: Wed, 5 Nov 2025 10:26:46 +0530 Subject: [PATCH 3/8] remove agent marker Signed-off-by: Arnab Chatterjee --- pkg/cmd/kitimport/util.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/pkg/cmd/kitimport/util.go b/pkg/cmd/kitimport/util.go index 0dc1b063..5e64d071 100644 --- a/pkg/cmd/kitimport/util.go +++ b/pkg/cmd/kitimport/util.go @@ -180,5 +180,3 @@ func extractRepoFromURL(rawUrl string) (string, error) { return path, nil } - -// AGENT_MODIFIED: Human review required before merge From 61c15ea398a59435135741fe424b6cfb3af2c6ca Mon Sep 17 00:00:00 2001 From: Arnab Chatterjee Date: Fri, 5 Dec 2025 23:17:08 +0530 Subject: [PATCH 4/8] Add Apache 2.0 license header to repo.go Added the standard Apache License 2.0 header to pkg/lib/hf/repo.go for compliance and clarity regarding licensing. Signed-off-by: Arnab Chatterjee --- pkg/lib/hf/repo.go | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/pkg/lib/hf/repo.go b/pkg/lib/hf/repo.go index 9ae07bf9..4013883d 100644 --- a/pkg/lib/hf/repo.go +++ b/pkg/lib/hf/repo.go @@ -1,3 +1,19 @@ +// Copyright 2025 The KitOps Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// SPDX-License-Identifier: Apache-2.0 + package hf // RepositoryType represents the kind of Hugging Face repository. From eb355335ac19892e5c8bda07e2992839c0d82dd1 Mon Sep 17 00:00:00 2001 From: Arnab Chatterjee Date: Fri, 5 Dec 2025 23:54:33 +0530 Subject: [PATCH 5/8] fixed dco issues Signed-off-by: Arnab Chatterjee --- pkg/cmd/kitimport/util.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/cmd/kitimport/util.go b/pkg/cmd/kitimport/util.go index 5e64d071..776f73d0 100644 --- a/pkg/cmd/kitimport/util.go +++ b/pkg/cmd/kitimport/util.go @@ -34,7 +34,7 @@ import ( kfgen "github.com/kitops-ml/kitops/pkg/lib/kitfile/generate" "github.com/kitops-ml/kitops/pkg/lib/repo/local" "github.com/kitops-ml/kitops/pkg/lib/util" - "github.com/kitops-ml/kitops/pkg/output" + "github.com/kitops-ml/kitops/pkg/output" "oras.land/oras-go/v2/registry" ) From a87b7c44f3b64d0503b6a6f442e0f06a7a27b19a Mon Sep 17 00:00:00 2001 From: Arnab Chatterjee Date: Sat, 6 Dec 2025 14:42:28 +0530 Subject: [PATCH 6/8] formatting issues fixed Signed-off-by: Arnab Chatterjee --- pkg/cmd/kitimport/util.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/cmd/kitimport/util.go b/pkg/cmd/kitimport/util.go index 776f73d0..5e64d071 100644 --- a/pkg/cmd/kitimport/util.go +++ b/pkg/cmd/kitimport/util.go @@ -34,7 +34,7 @@ import ( kfgen "github.com/kitops-ml/kitops/pkg/lib/kitfile/generate" "github.com/kitops-ml/kitops/pkg/lib/repo/local" "github.com/kitops-ml/kitops/pkg/lib/util" - "github.com/kitops-ml/kitops/pkg/output" + "github.com/kitops-ml/kitops/pkg/output" "oras.land/oras-go/v2/registry" ) From 6d2388154d11932bb9721a139ba3c623c9043692 Mon Sep 17 00:00:00 2001 From: Arnab Chatterjee Date: Wed, 10 Dec 2025 09:57:41 +0530 Subject: [PATCH 7/8] resolved comments Signed-off-by: Arnab Chatterjee --- pkg/cmd/kitimport/cmd.go | 3 +- pkg/cmd/kitimport/hfimport.go | 51 ++---------------------------- pkg/cmd/kitimport/util_test.go | 34 -------------------- pkg/lib/hf/repo.go | 38 +++++++++++++++++++++- pkg/lib/hf/repo_test.go | 58 ++++++++++++++++++++++++++++++++++ 5 files changed, 100 insertions(+), 84 deletions(-) create mode 100644 pkg/lib/hf/repo_test.go diff --git a/pkg/cmd/kitimport/cmd.go b/pkg/cmd/kitimport/cmd.go index 2d3437cd..8c9d0d96 100644 --- a/pkg/cmd/kitimport/cmd.go +++ b/pkg/cmd/kitimport/cmd.go @@ -25,6 +25,7 @@ import ( "github.com/kitops-ml/kitops/pkg/lib/constants" "github.com/kitops-ml/kitops/pkg/lib/git" + "github.com/kitops-ml/kitops/pkg/lib/hf" repoutils "github.com/kitops-ml/kitops/pkg/lib/repo/util" "github.com/kitops-ml/kitops/pkg/output" @@ -131,7 +132,7 @@ func (opts *importOptions) complete(ctx context.Context, args []string) error { if opts.tag == "" { var tagRepo string - if repo, _, err := parseHuggingFaceRepo(opts.repo); err == nil { + if repo, _, err := hf.ParseHuggingFaceRepo(opts.repo); err == nil { tagRepo = repo } else { repo, err := extractRepoFromURL(opts.repo) diff --git a/pkg/cmd/kitimport/hfimport.go b/pkg/cmd/kitimport/hfimport.go index df0bc2b7..18c731c1 100644 --- a/pkg/cmd/kitimport/hfimport.go +++ b/pkg/cmd/kitimport/hfimport.go @@ -20,10 +20,8 @@ import ( "context" "errors" "fmt" - "net/url" "os" "path/filepath" - "strings" "github.com/kitops-ml/kitops/pkg/artifact" "github.com/kitops-ml/kitops/pkg/lib/constants" @@ -39,7 +37,7 @@ import ( func importUsingHF(ctx context.Context, opts *importOptions) error { // Handle full HF URLs by extracting repository name from URL - repo, repoType, err := parseHuggingFaceRepo(opts.repo) + repo, repoType, err := hf.ParseHuggingFaceRepo(opts.repo) if err != nil { return fmt.Errorf("could not process URL %s: %w", opts.repo, err) } @@ -55,7 +53,7 @@ func importUsingHF(ctx context.Context, opts *importOptions) error { } }() - dirListing, err := hf.ListFiles(ctx, repo, opts.repoRef, opts.token, mapRepoType(repoType)) + dirListing, err := hf.ListFiles(ctx, repo, opts.repoRef, opts.token, repoType) if err != nil { return fmt.Errorf("failed to list files from HuggingFace API: %w", err) } @@ -108,7 +106,7 @@ func importUsingHF(ctx context.Context, opts *importOptions) error { if err != nil { return err } - if err := hf.DownloadFiles(ctx, repo, opts.repoRef, tmpDir, toDownload, opts.token, opts.concurrency, mapRepoType(repoType)); err != nil { + if err := hf.DownloadFiles(ctx, repo, opts.repoRef, tmpDir, toDownload, opts.token, opts.concurrency, repoType); err != nil { return fmt.Errorf("error downloading repository: %w", err) } @@ -174,46 +172,3 @@ func kitfileHasCatchallLayer(kitfile *artifact.KitFile) bool { } return false } - -type hfRepoType int - -const ( - hfRepoTypeModel hfRepoType = iota - hfRepoTypeDataset -) - -func mapRepoType(repoType hfRepoType) hf.RepositoryType { - switch repoType { - case hfRepoTypeDataset: - return hf.RepoTypeDataset - default: - return hf.RepoTypeModel - } -} - -func parseHuggingFaceRepo(rawURL string) (string, hfRepoType, error) { - u, err := url.Parse(rawURL) - if err != nil { - return "", 0, fmt.Errorf("failed to parse url: %w", err) - } - - if u.Host != "" && !strings.Contains(u.Host, "huggingface.co") { - return "", 0, fmt.Errorf("not a Hugging Face repository") - } - - path := strings.Trim(u.Path, "/") - segments := strings.FieldsFunc(path, func(r rune) bool { return r == '/' }) - - if len(segments) >= 3 && segments[0] == "datasets" { - return strings.Join(segments[1:3], "/"), hfRepoTypeDataset, nil - } - if len(segments) == 2 && segments[0] == "datasets" { - return "", 0, fmt.Errorf("could not extract repository from path '%s'", path) - } - - if len(segments) >= 2 { - return strings.Join(segments[len(segments)-2:], "/"), hfRepoTypeModel, nil - } - - return "", 0, fmt.Errorf("could not extract repository from path '%s'", path) -} diff --git a/pkg/cmd/kitimport/util_test.go b/pkg/cmd/kitimport/util_test.go index 36520337..53ff6a12 100644 --- a/pkg/cmd/kitimport/util_test.go +++ b/pkg/cmd/kitimport/util_test.go @@ -58,37 +58,3 @@ func TestExtractRepoFromURL(t *testing.T) { }) } } - -func TestParseHuggingFaceRepo(t *testing.T) { - testcases := []struct { - input string - expectedRepo string - expectedType hfRepoType - expectErrRegexp string - }{ - {input: "https://huggingface.co/org/repo", expectedRepo: "org/repo", expectedType: hfRepoTypeModel}, - {input: "https://huggingface.co/datasets/org/repo", expectedRepo: "org/repo", expectedType: hfRepoTypeDataset}, - {input: "org/repo", expectedRepo: "org/repo", expectedType: hfRepoTypeModel}, - {input: "datasets/org/repo", expectedRepo: "org/repo", expectedType: hfRepoTypeDataset}, - {input: "datasets/only-one-segment", expectErrRegexp: "could not extract repository"}, - {input: "https://example.com/org/repo", expectErrRegexp: "not a Hugging Face repository"}, - } - - for _, tt := range testcases { - t.Run(fmt.Sprintf("handles %s", tt.input), func(t *testing.T) { - actualRepo, actualType, actualErr := parseHuggingFaceRepo(tt.input) - if tt.expectErrRegexp != "" { - if !assert.Error(t, actualErr) { - return - } - assert.Regexp(t, tt.expectErrRegexp, actualErr.Error()) - } else { - if !assert.NoError(t, actualErr) { - return - } - assert.Equal(t, tt.expectedRepo, actualRepo) - assert.Equal(t, tt.expectedType, actualType) - } - }) - } -} diff --git a/pkg/lib/hf/repo.go b/pkg/lib/hf/repo.go index 4013883d..7028d6e8 100644 --- a/pkg/lib/hf/repo.go +++ b/pkg/lib/hf/repo.go @@ -16,10 +16,46 @@ package hf +import ( + "fmt" + "net/url" + "strings" +) + // RepositoryType represents the kind of Hugging Face repository. type RepositoryType int const ( - RepoTypeModel RepositoryType = iota + RepoTypeUnknown RepositoryType = iota + RepoTypeModel RepoTypeDataset ) + +// ParseHuggingFaceRepo parses a Hugging Face repository URL or path and returns +// the normalized repository path (org/repo) and the repository type. +func ParseHuggingFaceRepo(rawURL string) (string, RepositoryType, error) { + u, err := url.Parse(rawURL) + if err != nil { + return "", RepoTypeUnknown, fmt.Errorf("failed to parse url: %w", err) + } + + if u.Host != "" && !strings.Contains(u.Host, "huggingface.co") { + return "", RepoTypeUnknown, fmt.Errorf("not a Hugging Face repository") + } + + path := strings.Trim(u.Path, "/") + segments := strings.Split(path, "/") + + if len(segments) >= 3 && segments[0] == "datasets" { + return strings.Join(segments[1:3], "/"), RepoTypeDataset, nil + } + if len(segments) == 2 && segments[0] == "datasets" { + return "", RepoTypeUnknown, fmt.Errorf("could not extract repository from path '%s'", path) + } + + if len(segments) >= 2 { + return strings.Join(segments[len(segments)-2:], "/"), RepoTypeModel, nil + } + + return "", RepoTypeUnknown, fmt.Errorf("could not extract repository from path '%s'", path) +} diff --git a/pkg/lib/hf/repo_test.go b/pkg/lib/hf/repo_test.go new file mode 100644 index 00000000..ffaf807a --- /dev/null +++ b/pkg/lib/hf/repo_test.go @@ -0,0 +1,58 @@ +// Copyright 2025 The KitOps Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// SPDX-License-Identifier: Apache-2.0 + +package hf + +import ( + "fmt" + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestParseHuggingFaceRepo(t *testing.T) { + testcases := []struct { + input string + expectedRepo string + expectedType RepositoryType + expectErrRegexp string + }{ + {input: "https://huggingface.co/org/repo", expectedRepo: "org/repo", expectedType: RepoTypeModel}, + {input: "https://huggingface.co/datasets/org/repo", expectedRepo: "org/repo", expectedType: RepoTypeDataset}, + {input: "org/repo", expectedRepo: "org/repo", expectedType: RepoTypeModel}, + {input: "datasets/org/repo", expectedRepo: "org/repo", expectedType: RepoTypeDataset}, + {input: "datasets/only-one-segment", expectErrRegexp: "could not extract repository"}, + {input: "https://example.com/org/repo", expectErrRegexp: "not a Hugging Face repository"}, + } + + for _, tt := range testcases { + t.Run(fmt.Sprintf("handles %s", tt.input), func(t *testing.T) { + actualRepo, actualType, actualErr := ParseHuggingFaceRepo(tt.input) + if tt.expectErrRegexp != "" { + if !assert.Error(t, actualErr) { + return + } + assert.Regexp(t, tt.expectErrRegexp, actualErr.Error()) + } else { + if !assert.NoError(t, actualErr) { + return + } + assert.Equal(t, tt.expectedRepo, actualRepo) + assert.Equal(t, tt.expectedType, actualType) + } + }) + } +} From 6e7091b6f7fd4ead7fae1065a8b38d9594a077da Mon Sep 17 00:00:00 2001 From: Arnab Chatterjee Date: Fri, 16 Jan 2026 00:51:30 +0530 Subject: [PATCH 8/8] feat: enhance Hugging Face URL parsing to support scheme-less and HTTP URLs, improve host validation, and handle trailing slashes. Signed-off-by: Arnab Chatterjee --- pkg/lib/hf/repo.go | 13 +++++++++++-- pkg/lib/hf/repo_test.go | 11 +++++++++++ 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/pkg/lib/hf/repo.go b/pkg/lib/hf/repo.go index 7028d6e8..fb7b1b94 100644 --- a/pkg/lib/hf/repo.go +++ b/pkg/lib/hf/repo.go @@ -39,13 +39,22 @@ func ParseHuggingFaceRepo(rawURL string) (string, RepositoryType, error) { return "", RepoTypeUnknown, fmt.Errorf("failed to parse url: %w", err) } - if u.Host != "" && !strings.Contains(u.Host, "huggingface.co") { - return "", RepoTypeUnknown, fmt.Errorf("not a Hugging Face repository") + // Strict host validation - must be exactly "huggingface.co" or a proper subdomain + if u.Host != "" { + if u.Host != "huggingface.co" && !strings.HasSuffix(u.Host, ".huggingface.co") { + return "", RepoTypeUnknown, fmt.Errorf("not a Hugging Face repository") + } } path := strings.Trim(u.Path, "/") segments := strings.Split(path, "/") + // Handle scheme-less URLs like "huggingface.co/datasets/org/repo" + // When there's no scheme, url.Parse puts everything in the path + if len(segments) > 0 && (segments[0] == "huggingface.co" || strings.HasSuffix(segments[0], ".huggingface.co")) { + segments = segments[1:] // Skip the host part from path + } + if len(segments) >= 3 && segments[0] == "datasets" { return strings.Join(segments[1:3], "/"), RepoTypeDataset, nil } diff --git a/pkg/lib/hf/repo_test.go b/pkg/lib/hf/repo_test.go index ffaf807a..3532d195 100644 --- a/pkg/lib/hf/repo_test.go +++ b/pkg/lib/hf/repo_test.go @@ -36,6 +36,17 @@ func TestParseHuggingFaceRepo(t *testing.T) { {input: "datasets/org/repo", expectedRepo: "org/repo", expectedType: RepoTypeDataset}, {input: "datasets/only-one-segment", expectErrRegexp: "could not extract repository"}, {input: "https://example.com/org/repo", expectErrRegexp: "not a Hugging Face repository"}, + // Test for malicious URL that contains "huggingface.co" in hostname + {input: "https://huggingface.co.evil.com/org/repo", expectErrRegexp: "not a Hugging Face repository"}, + // Test for scheme-less dataset URL + {input: "huggingface.co/datasets/org/repo", expectedRepo: "org/repo", expectedType: RepoTypeDataset}, + // Test for scheme-less model URL + {input: "huggingface.co/org/repo", expectedRepo: "org/repo", expectedType: RepoTypeModel}, + // Test URL with trailing slashes + {input: "https://huggingface.co/org/repo/", expectedRepo: "org/repo", expectedType: RepoTypeModel}, + {input: "https://huggingface.co/datasets/org/repo/", expectedRepo: "org/repo", expectedType: RepoTypeDataset}, + // Test with http (not https) + {input: "http://huggingface.co/org/repo", expectedRepo: "org/repo", expectedType: RepoTypeModel}, } for _, tt := range testcases {