From 0de61a1efc39e56ab78456b5983ce68d605dc9fd Mon Sep 17 00:00:00 2001 From: Bruno Sofiato Date: Thu, 26 Sep 2024 08:33:25 -0300 Subject: [PATCH 1/4] Allow code search by filename Signed-off-by: Bruno Sofiato --- models/fixtures/repo_unit.yml | 21 +++ models/fixtures/repository.yml | 31 ++++ models/fixtures/user.yml | 37 ++++ models/repo/repo_list_test.go | 6 +- models/user/user_test.go | 5 +- modules/indexer/code/bleve/bleve.go | 44 ++++- modules/indexer/code/bleve/token/path/path.go | 101 +++++++++++ .../code/bleve/token/path/path_test.go | 76 ++++++++ .../code/elasticsearch/elasticsearch.go | 75 ++++++-- .../code/elasticsearch/elasticsearch_test.go | 4 +- modules/indexer/code/indexer_test.go | 162 ++++++++++++++++-- modules/indexer/code/internal/util.go | 18 ++ modules/indexer/internal/bleve/util.go | 24 ++- modules/indexer/internal/bleve/util_test.go | 45 +++++ .../org42/search-by-path.git/GIT_COLA_MSG | 1 + .../org42/search-by-path.git/HEAD | 1 + .../org42/search-by-path.git/config | 4 + .../org42/search-by-path.git/description | 1 + .../search-by-path.git/hooks/post-receive | 7 + .../hooks/post-receive.d/gitea | 2 + .../search-by-path.git/hooks/pre-receive | 7 + .../hooks/pre-receive.d/gitea | 2 + .../search-by-path.git/hooks/proc-receive | 7 + .../hooks/proc-receive.d/gitea | 2 + .../org42/search-by-path.git/hooks/update | 7 + .../search-by-path.git/hooks/update.d/gitea | 2 + .../org42/search-by-path.git/info/exclude | 6 + .../org42/search-by-path.git/info/refs | 13 ++ .../search-by-path.git/logs/refs/heads/master | 0 .../objects/info/commit-graph | Bin 0 -> 1772 bytes .../search-by-path.git/objects/info/packs | 2 + ...29256bc27cb2ec73898507df710be7a3cf5.bitmap | Bin 0 -> 674 bytes ...3dc29256bc27cb2ec73898507df710be7a3cf5.idx | Bin 0 -> 2080 bytes ...dc29256bc27cb2ec73898507df710be7a3cf5.pack | Bin 0 -> 6714 bytes ...3dc29256bc27cb2ec73898507df710be7a3cf5.rev | Bin 0 -> 196 bytes .../org42/search-by-path.git/packed-refs | 14 ++ 
tests/integration/api_org_test.go | 4 +- tests/integration/api_repo_test.go | 6 +- 38 files changed, 688 insertions(+), 49 deletions(-) create mode 100644 modules/indexer/code/bleve/token/path/path.go create mode 100644 modules/indexer/code/bleve/token/path/path_test.go create mode 100644 modules/indexer/internal/bleve/util_test.go create mode 100644 tests/gitea-repositories-meta/org42/search-by-path.git/GIT_COLA_MSG create mode 100644 tests/gitea-repositories-meta/org42/search-by-path.git/HEAD create mode 100644 tests/gitea-repositories-meta/org42/search-by-path.git/config create mode 100644 tests/gitea-repositories-meta/org42/search-by-path.git/description create mode 100755 tests/gitea-repositories-meta/org42/search-by-path.git/hooks/post-receive create mode 100755 tests/gitea-repositories-meta/org42/search-by-path.git/hooks/post-receive.d/gitea create mode 100755 tests/gitea-repositories-meta/org42/search-by-path.git/hooks/pre-receive create mode 100755 tests/gitea-repositories-meta/org42/search-by-path.git/hooks/pre-receive.d/gitea create mode 100755 tests/gitea-repositories-meta/org42/search-by-path.git/hooks/proc-receive create mode 100755 tests/gitea-repositories-meta/org42/search-by-path.git/hooks/proc-receive.d/gitea create mode 100755 tests/gitea-repositories-meta/org42/search-by-path.git/hooks/update create mode 100755 tests/gitea-repositories-meta/org42/search-by-path.git/hooks/update.d/gitea create mode 100644 tests/gitea-repositories-meta/org42/search-by-path.git/info/exclude create mode 100644 tests/gitea-repositories-meta/org42/search-by-path.git/info/refs create mode 100644 tests/gitea-repositories-meta/org42/search-by-path.git/logs/refs/heads/master create mode 100644 tests/gitea-repositories-meta/org42/search-by-path.git/objects/info/commit-graph create mode 100644 tests/gitea-repositories-meta/org42/search-by-path.git/objects/info/packs create mode 100644 
tests/gitea-repositories-meta/org42/search-by-path.git/objects/pack/pack-393dc29256bc27cb2ec73898507df710be7a3cf5.bitmap create mode 100644 tests/gitea-repositories-meta/org42/search-by-path.git/objects/pack/pack-393dc29256bc27cb2ec73898507df710be7a3cf5.idx create mode 100644 tests/gitea-repositories-meta/org42/search-by-path.git/objects/pack/pack-393dc29256bc27cb2ec73898507df710be7a3cf5.pack create mode 100644 tests/gitea-repositories-meta/org42/search-by-path.git/objects/pack/pack-393dc29256bc27cb2ec73898507df710be7a3cf5.rev create mode 100644 tests/gitea-repositories-meta/org42/search-by-path.git/packed-refs diff --git a/models/fixtures/repo_unit.yml b/models/fixtures/repo_unit.yml index 8a22db0445c64..f6b6252da1f88 100644 --- a/models/fixtures/repo_unit.yml +++ b/models/fixtures/repo_unit.yml @@ -712,3 +712,24 @@ type: 3 config: "{\"IgnoreWhitespaceConflicts\":false,\"AllowMerge\":true,\"AllowRebase\":true,\"AllowRebaseMerge\":true,\"AllowSquash\":true}" created_unix: 946684810 + +- + id: 108 + repo_id: 62 + type: 1 + config: "{}" + created_unix: 946684810 + +- + id: 109 + repo_id: 62 + type: 2 + config: "{\"EnableTimetracker\":true,\"AllowOnlyContributorsToTrackTime\":true}" + created_unix: 946684810 + +- + id: 110 + repo_id: 62 + type: 3 + config: "{\"IgnoreWhitespaceConflicts\":false,\"AllowMerge\":true,\"AllowRebase\":true,\"AllowRebaseMerge\":true,\"AllowSquash\":true}" + created_unix: 946684810 diff --git a/models/fixtures/repository.yml b/models/fixtures/repository.yml index e141593f41576..b7970cb7c82f6 100644 --- a/models/fixtures/repository.yml +++ b/models/fixtures/repository.yml @@ -1768,3 +1768,34 @@ size: 0 is_fsck_enabled: true close_issues_via_commit_in_any_branch: false + +- + id: 62 + owner_id: 42 + owner_name: org42 + lower_name: search-by-path + name: search-by-path + default_branch: master + num_watches: 0 + num_stars: 0 + num_forks: 0 + num_issues: 0 + num_closed_issues: 0 + num_pulls: 0 + num_closed_pulls: 0 + num_milestones: 0 + 
num_closed_milestones: 0 + num_projects: 0 + num_closed_projects: 0 + is_private: false + is_empty: false + is_archived: false + is_mirror: false + status: 0 + is_fork: false + fork_id: 0 + is_template: false + template_id: 0 + size: 0 + is_fsck_enabled: true + close_issues_via_commit_in_any_branch: false diff --git a/models/fixtures/user.yml b/models/fixtures/user.yml index 8504d88ce5995..c0296deec55bd 100644 --- a/models/fixtures/user.yml +++ b/models/fixtures/user.yml @@ -1517,3 +1517,40 @@ repo_admin_change_team_access: false theme: "" keep_activity_private: false + +- + id: 42 + lower_name: org42 + name: org42 + full_name: Org42 + email: org42@example.com + keep_email_private: false + email_notifications_preference: onmention + passwd: ZogKvWdyEx:password + passwd_hash_algo: dummy + must_change_password: false + login_source: 0 + login_name: org42 + type: 1 + salt: ZogKvWdyEx + max_repo_creation: -1 + is_active: false + is_admin: false + is_restricted: false + allow_git_hook: false + allow_import_local: false + allow_create_organization: true + prohibit_login: false + avatar: avatar42 + avatar_email: org42@example.com + use_custom_avatar: false + num_followers: 0 + num_following: 0 + num_stars: 0 + num_repos: 1 + num_teams: 0 + num_members: 0 + visibility: 0 + repo_admin_change_team_access: false + theme: "" + keep_activity_private: false diff --git a/models/repo/repo_list_test.go b/models/repo/repo_list_test.go index 88cfcde620832..ca6007f6c7882 100644 --- a/models/repo/repo_list_test.go +++ b/models/repo/repo_list_test.go @@ -138,12 +138,12 @@ func getTestCases() []struct { { name: "AllPublic/PublicRepositoriesOfUserIncludingCollaborative", opts: &repo_model.SearchRepoOptions{ListOptions: db.ListOptions{Page: 1, PageSize: 10}, OwnerID: 15, AllPublic: true, Template: optional.Some(false)}, - count: 33, + count: 34, }, { name: "AllPublic/PublicAndPrivateRepositoriesOfUserIncludingCollaborative", opts: &repo_model.SearchRepoOptions{ListOptions: 
db.ListOptions{Page: 1, PageSize: 10}, OwnerID: 15, Private: true, AllPublic: true, AllLimited: true, Template: optional.Some(false)}, - count: 38, + count: 39, }, { name: "AllPublic/PublicAndPrivateRepositoriesOfUserIncludingCollaborativeByName", @@ -158,7 +158,7 @@ func getTestCases() []struct { { name: "AllPublic/PublicRepositoriesOfOrganization", opts: &repo_model.SearchRepoOptions{ListOptions: db.ListOptions{Page: 1, PageSize: 10}, OwnerID: 17, AllPublic: true, Collaborate: optional.Some(false), Template: optional.Some(false)}, - count: 33, + count: 34, }, { name: "AllTemplates", diff --git a/models/user/user_test.go b/models/user/user_test.go index 67efb3859fdf3..bc1abc64512c7 100644 --- a/models/user/user_test.go +++ b/models/user/user_test.go @@ -92,7 +92,10 @@ func TestSearchUsers(t *testing.T) { testOrgSuccess(&user_model.SearchUserOptions{OrderBy: "id ASC", ListOptions: db.ListOptions{Page: 4, PageSize: 2}}, []int64{26, 41}) - testOrgSuccess(&user_model.SearchUserOptions{ListOptions: db.ListOptions{Page: 5, PageSize: 2}}, + testOrgSuccess(&user_model.SearchUserOptions{OrderBy: "id ASC", ListOptions: db.ListOptions{Page: 5, PageSize: 2}}, + []int64{42}) + + testOrgSuccess(&user_model.SearchUserOptions{ListOptions: db.ListOptions{Page: 6, PageSize: 2}}, []int64{}) // test users diff --git a/modules/indexer/code/bleve/bleve.go b/modules/indexer/code/bleve/bleve.go index c17f56d3cff5a..90e5e62bcb4aa 100644 --- a/modules/indexer/code/bleve/bleve.go +++ b/modules/indexer/code/bleve/bleve.go @@ -17,6 +17,7 @@ import ( "code.gitea.io/gitea/modules/charset" "code.gitea.io/gitea/modules/git" "code.gitea.io/gitea/modules/gitrepo" + path_filter "code.gitea.io/gitea/modules/indexer/code/bleve/token/path" "code.gitea.io/gitea/modules/indexer/code/internal" indexer_internal "code.gitea.io/gitea/modules/indexer/internal" inner_bleve "code.gitea.io/gitea/modules/indexer/internal/bleve" @@ -53,6 +54,7 @@ type RepoIndexerData struct { RepoID int64 CommitID string Content 
string + Filename string Language string UpdatedAt time.Time } @@ -64,8 +66,10 @@ func (d *RepoIndexerData) Type() string { const ( repoIndexerAnalyzer = "repoIndexerAnalyzer" + filenameIndexerAnalyzer = "filenameIndexerAnalyzer" + filenameIndexerTokenizer = "filenameIndexerTokenizer" repoIndexerDocType = "repoIndexerDocType" - repoIndexerLatestVersion = 6 + repoIndexerLatestVersion = 7 ) // generateBleveIndexMapping generates a bleve index mapping for the repo indexer @@ -79,6 +83,11 @@ func generateBleveIndexMapping() (mapping.IndexMapping, error) { textFieldMapping.IncludeInAll = false docMapping.AddFieldMappingsAt("Content", textFieldMapping) + fileNamedMapping := bleve.NewTextFieldMapping() + fileNamedMapping.IncludeInAll = false + fileNamedMapping.Analyzer = filenameIndexerAnalyzer + docMapping.AddFieldMappingsAt("Filename", fileNamedMapping) + termFieldMapping := bleve.NewTextFieldMapping() termFieldMapping.IncludeInAll = false termFieldMapping.Analyzer = analyzer_keyword.Name @@ -90,6 +99,7 @@ func generateBleveIndexMapping() (mapping.IndexMapping, error) { docMapping.AddFieldMappingsAt("UpdatedAt", timeFieldMapping) mapping := bleve.NewIndexMapping() + if err := addUnicodeNormalizeTokenFilter(mapping); err != nil { return nil, err } else if err := mapping.AddCustomAnalyzer(repoIndexerAnalyzer, map[string]any{ @@ -100,6 +110,16 @@ func generateBleveIndexMapping() (mapping.IndexMapping, error) { }); err != nil { return nil, err } + + if err := mapping.AddCustomAnalyzer(filenameIndexerAnalyzer, map[string]any{ + "type": analyzer_custom.Name, + "char_filters": []string{}, + "tokenizer": unicode.Name, + "token_filters": []string{unicodeNormalizeName, path_filter.Name, lowercase.Name}, + }); err != nil { + return nil, err + } + mapping.DefaultAnalyzer = repoIndexerAnalyzer mapping.AddDocumentMapping(repoIndexerDocType, docMapping) mapping.AddDocumentMapping("_all", bleve.NewDocumentDisabledMapping()) @@ -174,6 +194,7 @@ func (b *Indexer) addUpdate(ctx 
context.Context, batchWriter git.WriteCloserErro return batch.Index(id, &RepoIndexerData{ RepoID: repo.ID, CommitID: commitSha, + Filename: update.Filename, Content: string(charset.ToUTF8DropErrors(fileContents, charset.ConvertOpts{})), Language: analyze.GetCodeLanguage(update.Filename, fileContents), UpdatedAt: time.Now().UTC(), @@ -240,14 +261,19 @@ func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int keywordQuery query.Query ) - phraseQuery := bleve.NewMatchPhraseQuery(opts.Keyword) - phraseQuery.FieldVal = "Content" - phraseQuery.Analyzer = repoIndexerAnalyzer - keywordQuery = phraseQuery + pathQuery := bleve.NewPrefixQuery(strings.ToLower(opts.Keyword)) + pathQuery.FieldVal = "Filename" + pathQuery.SetBoost(10) + + contentQuery := bleve.NewMatchQuery(opts.Keyword) + contentQuery.FieldVal = "Content" + if opts.IsKeywordFuzzy { - phraseQuery.Fuzziness = inner_bleve.GuessFuzzinessByKeyword(opts.Keyword) + contentQuery.Fuzziness = inner_bleve.GuessFuzzinessByKeyword(opts.Keyword) } + keywordQuery = bleve.NewDisjunctionQuery(contentQuery, pathQuery) + if len(opts.RepoIDs) > 0 { repoQueries := make([]query.Query, 0, len(opts.RepoIDs)) for _, repoID := range opts.RepoIDs { @@ -277,7 +303,7 @@ func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int from, pageSize := opts.GetSkipTake() searchRequest := bleve.NewSearchRequestOptions(indexerQuery, pageSize, from, false) - searchRequest.Fields = []string{"Content", "RepoID", "Language", "CommitID", "UpdatedAt"} + searchRequest.Fields = []string{"Content", "Filename", "RepoID", "Language", "CommitID", "UpdatedAt"} searchRequest.IncludeLocations = true if len(opts.Language) == 0 { @@ -307,6 +333,10 @@ func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int endIndex = locationEnd } } + if len(hit.Locations["Filename"]) > 0 { + startIndex, endIndex = internal.FilenameMatchIndexPos(hit.Fields["Content"].(string)) + } + language := 
hit.Fields["Language"].(string) var updatedUnix timeutil.TimeStamp if t, err := time.Parse(time.RFC3339, hit.Fields["UpdatedAt"].(string)); err == nil { diff --git a/modules/indexer/code/bleve/token/path/path.go b/modules/indexer/code/bleve/token/path/path.go new file mode 100644 index 0000000000000..107e0da1090b3 --- /dev/null +++ b/modules/indexer/code/bleve/token/path/path.go @@ -0,0 +1,101 @@ +// Copyright 2024 The Gitea Authors. All rights reserved. +// SPDX-License-Identifier: MIT + +package path + +import ( + "slices" + "strings" + + "github.com/blevesearch/bleve/v2/analysis" + "github.com/blevesearch/bleve/v2/registry" +) + +const ( + Name = "gitea/path" +) + +type TokenFilter struct{} + +func NewTokenFilter() *TokenFilter { + return &TokenFilter{} +} + +func TokenFilterConstructor(config map[string]any, cache *registry.Cache) (analysis.TokenFilter, error) { + return NewTokenFilter(), nil +} + +func (s *TokenFilter) Filter(input analysis.TokenStream) analysis.TokenStream { + if len(input) == 1 { + // if there is only one token, we dont need to generate the reversed chain + return generatePathTokens(input, false) + } + + normal := generatePathTokens(input, false) + reversed := generatePathTokens(input, true) + + return append(normal, reversed...) +} + +// Generates path tokens from the input tokens. +// This mimics the behavior of the path hierarchy tokenizer in ES. It takes the input tokens and combine them, generating a term for each component +// in tree (e.g., foo/bar/baz.md will generate foo, foo/bar, and foo/bar/baz.md). +// +// If the reverse flag is set, the order of the tokens is reversed (the same input will generate baz.md, baz.md/bar, baz.md/bar/foo). This is useful +// to efficiently search for filenames without supplying the fullpath. 
+func generatePathTokens(input analysis.TokenStream, reversed bool) analysis.TokenStream {
+	terms := make([]string, 0, len(input))
+	longestTerm := 0
+
+	if reversed {
+		slices.Reverse(input) // reverses the caller's stream in place
+	}
+
+	for i := 0; i < len(input); i++ {
+		var sb strings.Builder
+		sb.WriteString(string(input[0].Term))
+
+		for j := 1; j <= i; j++ { // inclusive bound: the i-th term joins components 0..i
+			sb.WriteString("/")
+			sb.WriteString(string(input[j].Term))
+		}
+
+		term := sb.String()
+
+		if longestTerm < len(term) {
+			longestTerm = len(term)
+		}
+
+		terms = append(terms, term)
+	}
+
+	output := make(analysis.TokenStream, 0, len(terms))
+
+	for _, term := range terms {
+		var start, end int
+
+		if reversed {
+			start = 0
+			end = len(term)
+		} else {
+			start = longestTerm - len(term)
+			end = longestTerm
+		}
+
+		token := analysis.Token{
+			Position: 1,
+			Start:    start,
+			End:      end,
+			Type:     analysis.AlphaNumeric,
+			Term:     []byte(term),
+		}
+
+		output = append(output, &token)
+	}
+
+	return output
+}
+
+func init() {
+	registry.RegisterTokenFilter(Name, TokenFilterConstructor)
+}
diff --git a/modules/indexer/code/bleve/token/path/path_test.go b/modules/indexer/code/bleve/token/path/path_test.go
new file mode 100644
index 0000000000000..cc52021ef7f1f
--- /dev/null
+++ b/modules/indexer/code/bleve/token/path/path_test.go
@@ -0,0 +1,76 @@
+// Copyright 2024 The Gitea Authors. All rights reserved.
+// SPDX-License-Identifier: MIT + +package path + +import ( + "fmt" + "testing" + + "github.com/blevesearch/bleve/v2/analysis" + "github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode" + "github.com/stretchr/testify/assert" +) + +type Scenario struct { + Input string + Tokens []string +} + +func TestTokenFilter(t *testing.T) { + scenarios := []struct { + Input string + Terms []string + }{ + { + Input: "Dockerfile", + Terms: []string{"Dockerfile"}, + }, + { + Input: "Dockerfile.rootless", + Terms: []string{"Dockerfile.rootless"}, + }, + { + Input: "a/b/c/Dockerfile.rootless", + Terms: []string{"a", "a/b", "a/b/c", "a/b/c/Dockerfile.rootless", "Dockerfile.rootless", "Dockerfile.rootless/c", "Dockerfile.rootless/c/b", "Dockerfile.rootless/c/b/a"}, + }, + { + Input: "", + Terms: []string{}, + }, + } + + for _, scenario := range scenarios { + t.Run(fmt.Sprintf("ensure terms of '%s'", scenario.Input), func(t *testing.T) { + terms := extractTerms(scenario.Input) + + assert.Len(t, terms, len(scenario.Terms)) + + for _, term := range terms { + assert.Contains(t, scenario.Terms, term) + } + }) + } +} + +func extractTerms(input string) []string { + tokens := tokenize(input) + filteredTokens := filter(tokens) + terms := make([]string, 0, len(filteredTokens)) + + for _, token := range filteredTokens { + terms = append(terms, string(token.Term)) + } + + return terms +} + +func filter(input analysis.TokenStream) analysis.TokenStream { + filter := NewTokenFilter() + return filter.Filter(input) +} + +func tokenize(input string) analysis.TokenStream { + tokenizer := unicode.NewUnicodeTokenizer() + return tokenizer.Tokenize([]byte(input)) +} diff --git a/modules/indexer/code/elasticsearch/elasticsearch.go b/modules/indexer/code/elasticsearch/elasticsearch.go index 5c01034450be7..669a1bafcc908 100644 --- a/modules/indexer/code/elasticsearch/elasticsearch.go +++ b/modules/indexer/code/elasticsearch/elasticsearch.go @@ -30,7 +30,7 @@ import ( ) const ( - esRepoIndexerLatestVersion 
= 1 + esRepoIndexerLatestVersion = 2 // multi-match-types, currently only 2 types are used // Reference: https://www.elastic.co/guide/en/elasticsearch/reference/7.0/query-dsl-multi-match-query.html#multi-match-types esMultiMatchTypeBestFields = "best_fields" @@ -57,12 +57,50 @@ func NewIndexer(url, indexerName string) *Indexer { const ( defaultMapping = `{ + "settings": { + "analysis": { + "analyzer": { + "filename_path_analyzer": { + "tokenizer": "path_tokenizer" + }, + "reversed_filename_path_analyzer": { + "tokenizer": "reversed_path_tokenizer" + } + }, + "tokenizer": { + "path_tokenizer": { + "type": "path_hierarchy", + "delimiter": "/" + }, + "reversed_path_tokenizer": { + "type": "path_hierarchy", + "delimiter": "/", + "reverse": true + } + } + } + }, "mappings": { "properties": { "repo_id": { "type": "long", "index": true }, + "filename": { + "type": "text", + "term_vector": "with_positions_offsets", + "index": true, + "fields": { + "path": { + "type": "text", + "analyzer": "reversed_filename_path_analyzer" + }, + "path_reversed": { + "type": "text", + "analyzer": "filename_path_analyzer" + } + } + }, "content": { "type": "text", "term_vector": "with_positions_offsets", @@ -136,6 +174,7 @@ func (b *Indexer) addUpdate(ctx context.Context, batchWriter git.WriteCloserErro Id(id). Doc(map[string]any{ "repo_id": repo.ID, + "filename": update.Filename, "content": string(charset.ToUTF8DropErrors(fileContents, charset.ConvertOpts{})), "commit_id": sha, "language": analyze.GetCodeLanguage(update.Filename, fileContents), @@ -231,11 +270,11 @@ func (b *Indexer) doDelete(ctx context.Context, repoID int64) error { return err } -// indexPos find words positions for start and the following end on content. It will +// contentMatchIndexPos find words positions for start and the following end on content. It will // return the beginning position of the first start and the ending position of the // first end following the start string. 
// If not found any of the positions, it will return -1, -1. -func indexPos(content, start, end string) (int, int) { +func contentMatchIndexPos(content, start, end string) (int, int) { startIdx := strings.Index(content, start) if startIdx < 0 { return -1, -1 @@ -244,22 +283,29 @@ func indexPos(content, start, end string) (int, int) { if endIdx < 0 { return -1, -1 } - return startIdx, startIdx + len(start) + endIdx + len(end) + return startIdx, (startIdx + len(start) + endIdx + len(end)) - 9 // remove the length since we give Content the original data } func convertResult(searchResult *elastic.SearchResult, kw string, pageSize int) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) { hits := make([]*internal.SearchResult, 0, pageSize) for _, hit := range searchResult.Hits.Hits { + repoID, fileName := internal.ParseIndexerID(hit.Id) + res := make(map[string]any) + if err := json.Unmarshal(hit.Source, &res); err != nil { + return 0, nil, nil, err + } + // FIXME: There is no way to get the position the keyword on the content currently on the same request. // So we get it from content, this may made the query slower. See // https://discuss.elastic.co/t/fetching-position-of-keyword-in-matched-document/94291 var startIndex, endIndex int - c, ok := hit.Highlight["content"] - if ok && len(c) > 0 { + if c, ok := hit.Highlight["filename"]; ok && len(c) > 0 { + startIndex, endIndex = internal.FilenameMatchIndexPos(res["content"].(string)) + } else if c, ok := hit.Highlight["content"]; ok && len(c) > 0 { // FIXME: Since the highlighting content will include and for the keywords, // now we should find the positions. But how to avoid html content which contains the // and tags? If elastic search has handled that? 
- startIndex, endIndex = indexPos(c[0], "", "") + startIndex, endIndex = contentMatchIndexPos(c[0], "", "") if startIndex == -1 { panic(fmt.Sprintf("1===%s,,,%#v,,,%s", kw, hit.Highlight, c[0])) } @@ -267,12 +313,6 @@ func convertResult(searchResult *elastic.SearchResult, kw string, pageSize int) panic(fmt.Sprintf("2===%#v", hit.Highlight)) } - repoID, fileName := internal.ParseIndexerID(hit.Id) - res := make(map[string]any) - if err := json.Unmarshal(hit.Source, &res); err != nil { - return 0, nil, nil, err - } - language := res["language"].(string) hits = append(hits, &internal.SearchResult{ @@ -283,7 +323,7 @@ func convertResult(searchResult *elastic.SearchResult, kw string, pageSize int) UpdatedUnix: timeutil.TimeStamp(res["updated_at"].(float64)), Language: language, StartIndex: startIndex, - EndIndex: endIndex - 9, // remove the length since we give Content the original data + EndIndex: endIndex, Color: enry.GetColor(language), }) } @@ -315,7 +355,10 @@ func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int searchType = esMultiMatchTypeBestFields } - kwQuery := elastic.NewMultiMatchQuery(opts.Keyword, "content").Type(searchType) + kwQuery := elastic.NewBoolQuery().Should( + elastic.NewMultiMatchQuery(opts.Keyword, "content").Type(searchType), + elastic.NewMultiMatchQuery(opts.Keyword, "filename^10").Type(esMultiMatchTypePhrasePrefix), + ) query := elastic.NewBoolQuery() query = query.Must(kwQuery) if len(opts.RepoIDs) > 0 { @@ -341,6 +384,7 @@ func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int Highlight( elastic.NewHighlight(). Field("content"). + Field("filename"). NumOfFragments(0). // return all highting content on fragments HighlighterType("fvh"), ). @@ -373,6 +417,7 @@ func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int Highlight( elastic.NewHighlight(). Field("content"). + Field("filename"). NumOfFragments(0). 
// return all highting content on fragments HighlighterType("fvh"), ). diff --git a/modules/indexer/code/elasticsearch/elasticsearch_test.go b/modules/indexer/code/elasticsearch/elasticsearch_test.go index c6ba93e76d469..a6d2af92b2b11 100644 --- a/modules/indexer/code/elasticsearch/elasticsearch_test.go +++ b/modules/indexer/code/elasticsearch/elasticsearch_test.go @@ -10,7 +10,7 @@ import ( ) func TestIndexPos(t *testing.T) { - startIdx, endIdx := indexPos("test index start and end", "start", "end") + startIdx, endIdx := contentMatchIndexPos("test index start and end", "start", "end") assert.EqualValues(t, 11, startIdx) - assert.EqualValues(t, 24, endIdx) + assert.EqualValues(t, 15, endIdx) } diff --git a/modules/indexer/code/indexer_test.go b/modules/indexer/code/indexer_test.go index 8975c5ce4083b..aca7966e33a83 100644 --- a/modules/indexer/code/indexer_test.go +++ b/modules/indexer/code/indexer_test.go @@ -6,6 +6,7 @@ package code import ( "context" "os" + "slices" "testing" "code.gitea.io/gitea/models/db" @@ -20,53 +21,144 @@ import ( _ "code.gitea.io/gitea/models/activities" "github.com/stretchr/testify/assert" + + _ "github.com/mattn/go-sqlite3" ) +type codeSearchResult struct { + Filename string + Content string +} + func TestMain(m *testing.M) { unittest.MainTest(m) } func testIndexer(name string, t *testing.T, indexer internal.Indexer) { t.Run(name, func(t *testing.T) { - var repoID int64 = 1 - err := index(git.DefaultContext, indexer, repoID) - assert.NoError(t, err) + assert.NoError(t, setupRepositoryIndexes(git.DefaultContext, indexer)) + keywords := []struct { RepoIDs []int64 Keyword string - IDs []int64 Langs int + Results []codeSearchResult }{ { RepoIDs: nil, Keyword: "Description", - IDs: []int64{repoID}, Langs: 1, + Results: []codeSearchResult{ + { + Filename: "README.md", + Content: "# repo1\n\nDescription for repo1", + }, + }, }, { RepoIDs: []int64{2}, Keyword: "Description", - IDs: []int64{}, Langs: 0, }, { RepoIDs: nil, Keyword: "repo1", - 
IDs: []int64{repoID}, Langs: 1, + Results: []codeSearchResult{ + { + Filename: "README.md", + Content: "# repo1\n\nDescription for repo1", + }, + }, }, { RepoIDs: []int64{2}, Keyword: "repo1", - IDs: []int64{}, Langs: 0, }, { RepoIDs: nil, Keyword: "non-exist", - IDs: []int64{}, Langs: 0, }, + { + RepoIDs: []int64{62}, + Keyword: "pineaple", + Langs: 1, + Results: []codeSearchResult{ + { + Filename: "avocado.md", + Content: "# repo1\n\npineaple pie of cucumber juice", + }, + }, + }, + { + RepoIDs: []int64{62}, + Keyword: "avocado.md", + Langs: 1, + Results: []codeSearchResult{ + { + Filename: "avocado.md", + Content: "# repo1\n\npineaple pie of cucumber juice", + }, + }, + }, + { + RepoIDs: []int64{62}, + Keyword: "avo", + Langs: 1, + Results: []codeSearchResult{ + { + Filename: "avocado.md", + Content: "# repo1\n\npineaple pie of cucumber juice", + }, + }, + }, + { + RepoIDs: []int64{62}, + Keyword: "cucumber", + Langs: 1, + Results: []codeSearchResult{ + { + Filename: "cucumber.md", + Content: "Salad is good for your health", + }, + { + Filename: "avocado.md", + Content: "# repo1\n\npineaple pie of cucumber juice", + }, + }, + }, + { + RepoIDs: []int64{62}, + Keyword: "ham", + Langs: 1, + Results: []codeSearchResult{ + { + Filename: "ham.md", + Content: "This is also not cheese", + }, + { + Filename: "potato/ham.md", + Content: "This is not cheese", + }, + }, + }, + { + RepoIDs: []int64{62}, + Keyword: "This is not cheese", + Langs: 1, + Results: []codeSearchResult{ + { + Filename: "potato/ham.md", + Content: "This is not cheese", + }, + { + Filename: "ham.md", + Content: "This is also not cheese", + }, + }, + }, } for _, kw := range keywords { @@ -81,19 +173,37 @@ func testIndexer(name string, t *testing.T, indexer internal.Indexer) { IsKeywordFuzzy: true, }) assert.NoError(t, err) - assert.Len(t, kw.IDs, int(total)) assert.Len(t, langs, kw.Langs) - ids := make([]int64, 0, len(res)) + hits := make([]codeSearchResult, 0, len(res)) + + if total > 0 { + 
assert.NotEmpty(t, kw.Results, "The given scenario does not provide any expected results") + } + for _, hit := range res { - ids = append(ids, hit.RepoID) - assert.EqualValues(t, "# repo1\n\nDescription for repo1", hit.Content) + hits = append(hits, codeSearchResult{ + Filename: hit.Filename, + Content: hit.Content, + }) + } + + lastIndex := -1 + + for _, expected := range kw.Results { + index := slices.Index(hits, expected) + if index == -1 { + assert.Failf(t, "Result not found", "Expected %v in %v", expected, hits) + } else if lastIndex > index { + assert.Failf(t, "Result is out of order", "The order of %v within %v is wrong", expected, hits) + } else { + lastIndex = index + } } - assert.EqualValues(t, kw.IDs, ids) }) } - assert.NoError(t, indexer.Delete(context.Background(), repoID)) + assert.NoError(t, tearDownRepositoryIndexes(indexer)) }) } @@ -136,3 +246,25 @@ func TestESIndexAndSearch(t *testing.T) { testIndexer("elastic_search", t, indexer) } + +func setupRepositoryIndexes(ctx context.Context, indexer internal.Indexer) error { + for _, repoID := range repositoriesToSearch() { + if err := index(ctx, indexer, repoID); err != nil { + return err + } + } + return nil +} + +func tearDownRepositoryIndexes(indexer internal.Indexer) error { + for _, repoID := range repositoriesToSearch() { + if err := indexer.Delete(context.Background(), repoID); err != nil { + return err + } + } + return nil +} + +func repositoriesToSearch() []int64 { + return []int64{1, 62} +} diff --git a/modules/indexer/code/internal/util.go b/modules/indexer/code/internal/util.go index 689c4f4584b14..5b95783d9fcfe 100644 --- a/modules/indexer/code/internal/util.go +++ b/modules/indexer/code/internal/util.go @@ -10,6 +10,10 @@ import ( "code.gitea.io/gitea/modules/log" ) +const ( + filenameMatchNumberOfLines = 7 // Copied from github search +) + func FilenameIndexerID(repoID int64, filename string) string { return internal.Base36(repoID) + "_" + filename } @@ -30,3 +34,17 @@ func 
FilenameOfIndexerID(indexerID string) string { } return indexerID[index+1:] } + +// Given the contents of file, returns the boundaries of its first seven lines. +func FilenameMatchIndexPos(content string) (int, int) { + count := 1 + for i, c := range content { + if c == '\n' { + count++ + if count == filenameMatchNumberOfLines { + return 0, i + } + } + } + return 0, len(content) +} diff --git a/modules/indexer/internal/bleve/util.go b/modules/indexer/internal/bleve/util.go index a2265f86e6b35..79c615a238cd4 100644 --- a/modules/indexer/internal/bleve/util.go +++ b/modules/indexer/internal/bleve/util.go @@ -11,10 +11,15 @@ import ( "code.gitea.io/gitea/modules/util" "github.com/blevesearch/bleve/v2" + "github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode" "github.com/blevesearch/bleve/v2/index/upsidedown" "github.com/ethantkoenig/rupture" ) +const ( + maxFuzziness = 2 +) + // openIndexer open the index at the specified path, checking for metadata // updates and bleve version updates. If index needs to be created (or // re-created), returns (nil, nil) @@ -49,6 +54,23 @@ func openIndexer(path string, latestVersion int) (bleve.Index, int, error) { } func GuessFuzzinessByKeyword(s string) int { + tokenizer := unicode.NewUnicodeTokenizer() + tokens := tokenizer.Tokenize([]byte(s)) + + if len(tokens) > 0 { + fuzziness := maxFuzziness + + for _, token := range tokens { + fuzziness = min(fuzziness, guessFuzzinessByKeyword(string(token.Term))) + } + + return fuzziness + } + + return 0 +} + +func guessFuzzinessByKeyword(s string) int { // according to https://github.com/blevesearch/bleve/issues/1563, the supported max fuzziness is 2 // magic number 4 was chosen to determine the levenshtein distance per each character of a keyword // BUT, when using CJK (eg: `갃갃갃` `啊啊啊`), it mismatches a lot. 
@@ -57,5 +79,5 @@ func GuessFuzzinessByKeyword(s string) int { return 0 } } - return min(2, len(s)/4) + return min(maxFuzziness, len(s)/4) } diff --git a/modules/indexer/internal/bleve/util_test.go b/modules/indexer/internal/bleve/util_test.go new file mode 100644 index 0000000000000..da69917496261 --- /dev/null +++ b/modules/indexer/internal/bleve/util_test.go @@ -0,0 +1,45 @@ +// Copyright 2024 The Gitea Authors. All rights reserved. +// SPDX-License-Identifier: MIT + +package bleve + +import ( + "fmt" + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestBleveIndexAndSearch(t *testing.T) { + scenarios := []struct { + Input string + Fuzziness int + }{ + { + Input: "", + Fuzziness: 0, + }, + { + Input: "Avocado", + Fuzziness: 1, + }, + { + Input: "Geschwindigkeit", + Fuzziness: 2, + }, + { + Input: "non-exist", + Fuzziness: 0, + }, + { + Input: "갃갃갃", + Fuzziness: 0, + }, + } + + for _, scenario := range scenarios { + t.Run(fmt.Sprintf("ensure fuzziness of '%s' is '%d'", scenario.Input, scenario.Fuzziness), func(t *testing.T) { + assert.Equal(t, scenario.Fuzziness, GuessFuzzinessByKeyword(scenario.Input)) + }) + } +} diff --git a/tests/gitea-repositories-meta/org42/search-by-path.git/GIT_COLA_MSG b/tests/gitea-repositories-meta/org42/search-by-path.git/GIT_COLA_MSG new file mode 100644 index 0000000000000..8b137891791fe --- /dev/null +++ b/tests/gitea-repositories-meta/org42/search-by-path.git/GIT_COLA_MSG @@ -0,0 +1 @@ + diff --git a/tests/gitea-repositories-meta/org42/search-by-path.git/HEAD b/tests/gitea-repositories-meta/org42/search-by-path.git/HEAD new file mode 100644 index 0000000000000..cb089cd89a7d7 --- /dev/null +++ b/tests/gitea-repositories-meta/org42/search-by-path.git/HEAD @@ -0,0 +1 @@ +ref: refs/heads/master diff --git a/tests/gitea-repositories-meta/org42/search-by-path.git/config b/tests/gitea-repositories-meta/org42/search-by-path.git/config new file mode 100644 index 0000000000000..07d359d07cf1e --- /dev/null +++ 
b/tests/gitea-repositories-meta/org42/search-by-path.git/config @@ -0,0 +1,4 @@ +[core] + repositoryformatversion = 0 + filemode = true + bare = true diff --git a/tests/gitea-repositories-meta/org42/search-by-path.git/description b/tests/gitea-repositories-meta/org42/search-by-path.git/description new file mode 100644 index 0000000000000..3165765e21991 --- /dev/null +++ b/tests/gitea-repositories-meta/org42/search-by-path.git/description @@ -0,0 +1 @@ +This repository will be used to test code search, diff --git a/tests/gitea-repositories-meta/org42/search-by-path.git/hooks/post-receive b/tests/gitea-repositories-meta/org42/search-by-path.git/hooks/post-receive new file mode 100755 index 0000000000000..4b3d452abcce2 --- /dev/null +++ b/tests/gitea-repositories-meta/org42/search-by-path.git/hooks/post-receive @@ -0,0 +1,7 @@ +#!/usr/bin/env bash +ORI_DIR=`pwd` +SHELL_FOLDER=$(cd "$(dirname "$0")";pwd) +cd "$ORI_DIR" +for i in `ls "$SHELL_FOLDER/post-receive.d"`; do + sh "$SHELL_FOLDER/post-receive.d/$i" +done \ No newline at end of file diff --git a/tests/gitea-repositories-meta/org42/search-by-path.git/hooks/post-receive.d/gitea b/tests/gitea-repositories-meta/org42/search-by-path.git/hooks/post-receive.d/gitea new file mode 100755 index 0000000000000..43a948da3a983 --- /dev/null +++ b/tests/gitea-repositories-meta/org42/search-by-path.git/hooks/post-receive.d/gitea @@ -0,0 +1,2 @@ +#!/usr/bin/env bash +"$GITEA_ROOT/gitea" hook --config="$GITEA_ROOT/$GITEA_CONF" post-receive diff --git a/tests/gitea-repositories-meta/org42/search-by-path.git/hooks/pre-receive b/tests/gitea-repositories-meta/org42/search-by-path.git/hooks/pre-receive new file mode 100755 index 0000000000000..412701305369c --- /dev/null +++ b/tests/gitea-repositories-meta/org42/search-by-path.git/hooks/pre-receive @@ -0,0 +1,7 @@ +#!/usr/bin/env bash +ORI_DIR=`pwd` +SHELL_FOLDER=$(cd "$(dirname "$0")";pwd) +cd "$ORI_DIR" +for i in `ls "$SHELL_FOLDER/pre-receive.d"`; do + sh 
"$SHELL_FOLDER/pre-receive.d/$i" +done \ No newline at end of file diff --git a/tests/gitea-repositories-meta/org42/search-by-path.git/hooks/pre-receive.d/gitea b/tests/gitea-repositories-meta/org42/search-by-path.git/hooks/pre-receive.d/gitea new file mode 100755 index 0000000000000..49d09406364a5 --- /dev/null +++ b/tests/gitea-repositories-meta/org42/search-by-path.git/hooks/pre-receive.d/gitea @@ -0,0 +1,2 @@ +#!/usr/bin/env bash +"$GITEA_ROOT/gitea" hook --config="$GITEA_ROOT/$GITEA_CONF" pre-receive diff --git a/tests/gitea-repositories-meta/org42/search-by-path.git/hooks/proc-receive b/tests/gitea-repositories-meta/org42/search-by-path.git/hooks/proc-receive new file mode 100755 index 0000000000000..af2808b03702f --- /dev/null +++ b/tests/gitea-repositories-meta/org42/search-by-path.git/hooks/proc-receive @@ -0,0 +1,7 @@ +#!/usr/bin/env bash +ORI_DIR=`pwd` +SHELL_FOLDER=$(cd "$(dirname "$0")";pwd) +cd "$ORI_DIR" +for i in `ls "$SHELL_FOLDER/proc-receive.d"`; do + sh "$SHELL_FOLDER/proc-receive.d/$i" +done diff --git a/tests/gitea-repositories-meta/org42/search-by-path.git/hooks/proc-receive.d/gitea b/tests/gitea-repositories-meta/org42/search-by-path.git/hooks/proc-receive.d/gitea new file mode 100755 index 0000000000000..97521c62115db --- /dev/null +++ b/tests/gitea-repositories-meta/org42/search-by-path.git/hooks/proc-receive.d/gitea @@ -0,0 +1,2 @@ +#!/usr/bin/env bash +"$GITEA_ROOT/gitea" hook --config="$GITEA_ROOT/$GITEA_CONF" proc-receive diff --git a/tests/gitea-repositories-meta/org42/search-by-path.git/hooks/update b/tests/gitea-repositories-meta/org42/search-by-path.git/hooks/update new file mode 100755 index 0000000000000..c186fe4a18b0f --- /dev/null +++ b/tests/gitea-repositories-meta/org42/search-by-path.git/hooks/update @@ -0,0 +1,7 @@ +#!/usr/bin/env bash +ORI_DIR=`pwd` +SHELL_FOLDER=$(cd "$(dirname "$0")";pwd) +cd "$ORI_DIR" +for i in `ls "$SHELL_FOLDER/update.d"`; do + sh "$SHELL_FOLDER/update.d/$i" $1 $2 $3 +done \ No newline at end of file 
diff --git a/tests/gitea-repositories-meta/org42/search-by-path.git/hooks/update.d/gitea b/tests/gitea-repositories-meta/org42/search-by-path.git/hooks/update.d/gitea new file mode 100755 index 0000000000000..38101c242664a --- /dev/null +++ b/tests/gitea-repositories-meta/org42/search-by-path.git/hooks/update.d/gitea @@ -0,0 +1,2 @@ +#!/usr/bin/env bash +"$GITEA_ROOT/gitea" hook --config="$GITEA_ROOT/$GITEA_CONF" update $1 $2 $3 diff --git a/tests/gitea-repositories-meta/org42/search-by-path.git/info/exclude b/tests/gitea-repositories-meta/org42/search-by-path.git/info/exclude new file mode 100644 index 0000000000000..a5196d1be8fb5 --- /dev/null +++ b/tests/gitea-repositories-meta/org42/search-by-path.git/info/exclude @@ -0,0 +1,6 @@ +# git ls-files --others --exclude-from=.git/info/exclude +# Lines that start with '#' are comments. +# For a project mostly in C, the following would be a good set of +# exclude patterns (uncomment them if you want to use them): +# *.[oa] +# *~ diff --git a/tests/gitea-repositories-meta/org42/search-by-path.git/info/refs b/tests/gitea-repositories-meta/org42/search-by-path.git/info/refs new file mode 100644 index 0000000000000..6b948c96a8351 --- /dev/null +++ b/tests/gitea-repositories-meta/org42/search-by-path.git/info/refs @@ -0,0 +1,13 @@ +90c1019714259b24fb81711d4416ac0f18667dfa refs/heads/DefaultBranch +985f0301dba5e7b34be866819cd15ad3d8f508ee refs/heads/branch2 +65f1bf27bc3bf70f64657658635e66094edbcb4d refs/heads/develop +65f1bf27bc3bf70f64657658635e66094edbcb4d refs/heads/feature/1 +78fb907e3a3309eae4fe8fef030874cebbf1cd5e refs/heads/home-md-img-check +3731fe53b763859aaf83e703ee731f6b9447ff1e refs/heads/master +62fb502a7172d4453f0322a2cc85bddffa57f07a refs/heads/pr-to-update +4649299398e4d39a5c09eb4f534df6f1e1eb87cc refs/heads/sub-home-md-img-check +3fa2f829675543ecfc16b2891aebe8bf0608a8f4 refs/notes/commits +4a357436d925b5c974181ff12a994538ddc5a269 refs/pull/2/head +5f22f7d0d95d614d25a5b68592adb345a4b5c7fd refs/pull/3/head 
+62fb502a7172d4453f0322a2cc85bddffa57f07a refs/pull/5/head +65f1bf27bc3bf70f64657658635e66094edbcb4d refs/tags/v1.1 diff --git a/tests/gitea-repositories-meta/org42/search-by-path.git/logs/refs/heads/master b/tests/gitea-repositories-meta/org42/search-by-path.git/logs/refs/heads/master new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/gitea-repositories-meta/org42/search-by-path.git/objects/info/commit-graph b/tests/gitea-repositories-meta/org42/search-by-path.git/objects/info/commit-graph new file mode 100644 index 0000000000000000000000000000000000000000..b38715bb92b034596ebd83954969d4ac88da755e GIT binary patch literal 1772 zcmZ>E5Aa}QWMT04ba7*V02d(J2f}1=advSGfv{N>++7@vAZ)fZ5E?|X-9WI1C5sX0 zD}0y1Lcr03y@C@%nCFIhS`8@7-k2uVVDERrWWP|nng>@1vDQY<_5}-;z1e) z=7#@*weZfl*icC+h}t;hc+{SMG7EV|-q&#biQOzYnJzrsIMrGDJ6zQ_7I ze@bduL~>jjr{C? zX|=y*#4|JAUiy5q_lvZ~ITxcY-}uV$?#j-(IxEjjTmj7K`=`1r|2(gEdustOF@i7< zu%t78Y`wlWy61NWzZ;+Vw!byseKh=7CrTizN#wg5B$jMsak*fL9mPsBt z5f9YQ0X2swg)9BuovQAI>i=F8+|4i+IG4uzFVfgb8mNXHs)i#@c&~{5)1>B@thCv+_BT)s3tWxDhV5GVvTg-MU3vm6^Gu?zFDi5EezgrG4^_hw zaqeCH=U@CW$EW}CVOqFj-^qL43Ppcz-$!;&3d0Otj=*iQ^LrM&d}b#8P?CK{--~XU zWyoq$A6iRKnWQH)^;m^k?L$fH>Xx-#969PgQQb4^IFL^?n4!sWr&=?L$zMr$c~6w; J6d~FCUjTa)2z>wm literal 0 HcmV?d00001 diff --git a/tests/gitea-repositories-meta/org42/search-by-path.git/objects/info/packs b/tests/gitea-repositories-meta/org42/search-by-path.git/objects/info/packs new file mode 100644 index 0000000000000..b2af8c8378a44 --- /dev/null +++ b/tests/gitea-repositories-meta/org42/search-by-path.git/objects/info/packs @@ -0,0 +1,2 @@ +P pack-393dc29256bc27cb2ec73898507df710be7a3cf5.pack + diff --git a/tests/gitea-repositories-meta/org42/search-by-path.git/objects/pack/pack-393dc29256bc27cb2ec73898507df710be7a3cf5.bitmap b/tests/gitea-repositories-meta/org42/search-by-path.git/objects/pack/pack-393dc29256bc27cb2ec73898507df710be7a3cf5.bitmap new file mode 100644 index 
0000000000000000000000000000000000000000..1fdef225e830cf65fa66e30e13f64bcc31b5feb8 GIT binary patch literal 674 zcmZ?r4Dn@PWME}rVBog2Jv1q7kNRo7;}$alYQGEYtFrkD5(i=?C!2TaXGl&Ce zRJr;eP$>o#G&%nN4KOJ%&4|Kd0K0*K!2u|WVj_Ek1Wcs}SO=g`C wvVrC%>~g5o%UlPOhtVzm3=C5o7#PYeFQr{vDB+MdI5Or2 z@_DF$d4XoYFdvW&!~8%t3=06+=vWXaMh+GN>O;oDKt3)k0+b^aivjf@V{srK8A|~9 zzYYU}c~9mPsBt5wE2$>sCN_^G9q`r_}2g z``I{Fd~vWXJH~Q4Y_f8jAKQZJ)+=d~Jeb@(H7Cz_a(Pw^=WG99-)|ouzHUF`Wm;l( zQ+4af5()W_S~FcO?jBu~>Fp7?Ht>E{?R?dfl_rsuuUuk*hF8|rQ2F3t=;UpWb5(2NxuWM3X86|+A}LHI@7xM{;%*4RjD8M ztM9S?&YzN67Lgp6#_4zav~Skpr50g=M_+fgot!2$$7H3=uC>1_eov^gGUj~s{aW-Tex-}GCn)u zAmelq)!8b)8w+J!#Mbajq}BeKaOXnpg7S;c&n6gc)s?SYryVc3Y04x$p{d6z)M_6} zT35HM?c&H$|2ZR`neq10=bOD>q&3dD7-uStP>^fRCO;@|M#Nc zZicbIxisE?k;YcicOy%5_|u9e%4?^bURo%1fOYa)qsPX*%cF!kBYh1u^nEKoc%Cu) zWBU0Qf6Vdee|(r0?$~$op0`5LpWFA70zO2gaZY}vV5_?=vVPaU`Qf*lyX{I`wrskk zz{_m0o{z1O|MJhvZ@*r(T-SY}>FmvFk(Wmn*6b>I*t$14jdkOk#^apJdsSHGJ4CL@ zZ(Fj);Kl7bR$DAxoXwB->Xq2O-5wut@8Ofn&t6LY>AEOjd~)6;k+X`1C#v$E9Z_8O zo?+K1t0n7i)GvS7<=SUclMrwMSoB>67Qeq37(^cdi(M}u4a)z(dPEc`$6N-KuLM@F z!19tW7sw7}VBj_eivI>yo$G zc^~J#kNT>y<{CBE7_(}wQWcX>0001pFM}?H(mmr->~FhLQG$?@9fs|v{3C1;Al;Fi z*!Ix-)J9D04l*-ISyf@Fj!NKPjz@3B%2uWgQ5TkwzzL^x)>zJUx%MF{{en zrh7B>PK{jth5X7At#|4IN*!TggrqEiU)5maO6m4&IX+Gfs_enr>|&ix(rmTiv)KtmiMy-{~D6`Q((cuF;^DRL@J}>{1S28Gm;|ZfIU!b zz=?;{vFwnYsQ?_b!Kh2F_N6iysjJA)C^|3IH^n17IJL$m5*$!+HUtSRrY6nPjUrJn zX)5T2AE%OTbIY=r+97^LN`OjH4yT9(&FVWAv-sRO2|c(Wk)oY7)v5OY{{FhL)!k{6 zL7yJ*TOPs6#9n_sQ?UkmuEIXh88>;^8;8zriOaMiL#nE4YHr z!nTWA^I@K&+DJwwuImASnePB7+-Ets%bP3ka@QplW^Pm_$@7^n`SvhX^2^^Is z5$}%VEz62ghEoMHlT2xL)O8G;TugJ=j^36C@Cxh|QX;r^aqjBdb@)TMk3LnWa~K#S z$DZ%FiSC3N`uJKyS5Nd2vDHxbSK2>$9aXYHV#5|)?@-C{$r++lAeF%AhyW`bf#^l> z-Qt$s>g-;hb$B#OF#Tj3QcMfh>Yb{q+@A~8VTOP|?uI)bgc;I9rRE^HjaKNWI`qbR z9yFE%+CIhB6eZ8&s|N^^WK-(Mohzkkl_^~(H!5w%Z-{eE?5HJG>lGsf_g9*e1U*0YRRE8x=M_OB(BH`jRo|30ut@RwNZ`2c8DGc7KlMUi zlG9eS))7vlp6lzME$h0ASP#iVu;Gg(;rugCd->F9na8EN6_2RZG>}qEU0L6!O1vHn 
zndM{YEGgP!$a4vz&eAwl!#eA^s!#b0i_1^!7B15!iZG8)_BlZ^kRe0QZWI?Z2oKDPVoj8)+p zI*!x$>`Ewr8%p<6Lj}=5rl?M10YL;wjvkJpXen=y8KTrf`@`k%qw3z!WVsKuVK^`h z$B`s-M@`_RXRPtSx~b>wAu57)MmTluNo!VBHg%7tramFJV00OVqF*eevV zL}X)O?TDT^1CG#`vidfb!2BFxpLnw-x39{4m^dGzlV{Yn7|lltV}E!`_}3Re4nN1F z(`JBmTJ1AW2xS+hs)dKW2w+$(%sbG$H7N+V(^WTB(%Ss92cHMj5M#|k>Y^p?S^-`$ z7yzd_68sxQT7@R<)&A;|RiHf1Dhim{+`D}S5eu*jQ=J_g)lO$dkSCF*Ezz9SU zR&00RPG8U3sF_;tYDK{xY!q|3psi_E#F0I_SgA!t`H&$>iSBDPG5*8%i|O> z& zu1JWTA3O)rowpfv>+qgyNdEf+SMaIz+1udVOFb-hxQSt8G4(L?FC%m7?< zbG_j1a0>Dv49=nu5%-0n9B+R(p=YAL5F24*4YVh+nRnGp@}c@8OyK87A8s4P=VzY_ za#6Yf-Hq6;F+ub3$hA>l>R3NQ_v*2J5Xaw)$ zjCH1UVxv0YcPS}~bBQST+bg5B7$~;fRs@vl9kU&#^kD{_jg54w-G3rsa@o~a(>2!) z^f*rXK%A-TeZB08FFaZY!`{dPvUonaO#L}INujsh*$D%PT-lw4`&p|i-gdoz;{XpR zj_Izkaon)K_@#E7B>9hjhAYdY0g^Y~QYUH~7N`uY@?_PiRU}dC$FE7FNER)iYgLn8 z!`zWGxkO+sW_kXLa5h!<5OcQPcj7ei?2PhYi`OBe1^U}{>J!;aMCF014Mjp4e16=yEwa&|EhGO z$M--G6A45mP{m8BeOAXfo}Rzn>8dE<_}&-f`Em<6@EM9)WeE|$kRHEfAQD)4D~P;t z)_kR>{I&8TraFCtTWjy~tEL7`KYq_Ts5!`2oWk%%W1UQI#62y{ac&=?dWoG7B7T2F z`vGU>URct&VAm^7PKKi7=P4*t6@yiM$Y>%z=uT5b%6OI2oz$5!hG2KrihUtgpOVha zr08n)y7H`@5W4vLmH#>-5=6^2cI&r67dj)lR z&TJ9yB)Cr_52G)R`=;sP3mcD(OO7A>n>t4T2ioVYG@ja&MZViKDB4dWIfC8tCq2Ip260%*>EC!qL(Vg& z&HN7RM&p}57^U~rbQhd_J&;IDrz4sU@Jq+Vo7rK{_1D1t`z1!p*{7rtbxE_;OZgW? 
zt6Oo~Zgy9*F$hR?D{sli3}p8KyNPT;XM&BRe)FI#Qn!`28;>hs!dgh`B1E#wZ%OSH ztT7-ge0c7>}X!LsgMkkn&P|Vk{ot-=RJE-T6LuH4VMLQsbHr7aB06gC|JZDK~G8 z#=Z}JX|lX^P}vHdfO$`z$oHItf|a9w?Ng$_ApuLIKHw{|^@SmdR`Ye0;T#;dDUM!GwSfWQ<;3h)E{pHaVzD{0 zLZ*+hhV8%Iw!SsbmD~S$Jw@?FB_vVgTf6Z`3oVOE5nt6cnY>PnD^{h}qDNu{&2J5_ z1(PfB0lM4l>7V5uaWJgZ^3(oz4{E91fANr$Pyc3O#ZUgUQa9KfrXGNq8kEYZ2U;XX z-aT0DaNX~waCRsaILI1gLJB{%o`42+*O3sQ2SP2`@Lp#aL$*tXUMW^1dz8iumS+Y) zHWlORAT;{%v?-D85NfqpN#!BQS0=&=`0=L~z=ylDt~NTdbjw@Iv`3WJ z2*%QPg@V6CQN<&il{6-tuAj-I3>D20p8m$=+K(eSG_jzM9f=8N`aDu8BVZ&N7*AMAy}V8T*+7aymP$O``~rHS4l(L%2*gu6Jb-CcYtq&F56u=BucQ ztBMY02AIygB^tQPLss0vyc{0DMguBuGC41`brvfJOz}wSJhJ^8Xu?0uT6yf!@kb0i zP)e)xDu4em0tPEi<#0Q0<(QI74FanUU`L#;^JQT>{R<@ovHZMYORzk)4qt}DTDq$K z6(`g^?fK%vSJCIcY>nr=lyoQOjJC)37v6Et)0F1s7WiADeQWSGmpA;qHysOqe_*ex z89S+ru3^_&&Y4BGLP5lVjwi?%pi|C?pZzgX% z^JeRJCvOuCmz|TXE?#iMJvo+_AJ@4d2Z_3oro{ZP>(XfmNd=q45I->UlQB`O zyli>TFHVO9d;Sn>r;>o(G~uKU6XKZA8zkM&45!g|PJct=^|3<;n^s&s3}@0kt2aJ* zkPU?5*3cX>?G!L75dd#S>lCv7N~6(b@~ORIfWoY$z5qFMAFUj5z`jDRLx=L~f{7P) z@H%U$avKaGf#$3e4%}P46=E^^%nu#A7dP}Vf~D33lcP$^CUi5*`^RnRT)V)ygGh!E zp=rTn>+xWXGRGdGESt?O2Kzya;Jxa6e_cCssdqmI?(cHDiCd{m_g8uL(2lw~CHwuC z|1=I2udezjgvPx%=TpLXWv={dsU1P7@!?OHn3%Gy_M~y*1bqJFgq9Bk;Oy`Rip{+$ z9|55Qq2=Ml+jFq(E(FD+kX|n>JYG+=v;;a?6B>)o5y3WwRl19wBdOTJy=TP%C?_z{ zleJ09saG`)FfT8D{}Zb2`>Iv(f>U8DLt$Y3Znyi_v@-H)g5ZhOYNW_s#_|%`BV6eC z8BtA*i>|CS3V^%1zH-z-j=G4D=YeEM?P<^1o9Dsm9F+ew7H=UtWsgJgBwh!<0wX;? 
zjEvM$;QnpI1x6o{{Is-sTy^qCGGAk2?0ug)HY8`QTRjgh5$eMB#Fec-@Ky;pE0uVlXOEwD~-d|Ih3?MQX27`i~C+%5WYNyY9{RH*CHw$Da?@VtPAVW?v!YJlurL&4S z_Gmq7DRq|KFUpEIQg@h7eAoXIHjW=}XKpa_5g|u6y|a*u)si_<_zk%QuoVB=`kQ^y zF_HN$V|*S|JCr=!zyLHD&4yoOhlsv(#lE5~_XfXUt3M|#sKSjZ=l3`4M>}njhmi`4 z%*Mvdxx0gMCnG;sP8VKOGHcDpJEs$gfREKTB>PXlOTKU#v=23L6Mf7yd3X8L^7`D) zR5nb|yKm28-Ct*2W4vh}&6`a(oVixuN~@C2_I$B2ku^E&B%$&Ud58dSF ztMIbBFH^X{Vi!493%awfpzgKM6fQ4Lmwo&z|2`X)K?v#?Ujag(<;@qn$< zjt|sEQnyc&<(9s;$2Kn!Ov1bu^gFi2z-2^>md-ZEBhJ(R`@|XdV4gwh{i8Wk6Qq3A z55iIBxY_m|{(8`dE}x8RVq@Jg&SJUqdd%%pDj#LPRHi4cmjAoq5;IJVMrwK*pjlz( zW4p;@N6v@BYm-4Urv%>lp9MH@*bd)MogcP6BtYWiCWgftD)PT<6~fdvOC+Nop2uHD zw+mfM^gW^#IV!$OeCZoq#*~0P)GvLbP*Dn&6`)8pe!%%pk9$N(6myNLMA4VQ8~dYk zugJGaBuoqGUv}HC8^NQ}w%Fd?7U5atZo7*C)(#Qesc_7tky%k0Hf;l)>5GJO)9pu> zVHxPJ&>y^)2(F|J+9zoV=sLW^k|-=#$(-VYRKt&DzGI2oB1kSQa(LsX0wu&Mr96V! z815MyoKPqSo0=@ssEV@U@cgKfG4E&Uk#O*P*0R7!y2{Ojv%!sq59ByBrSS?03Qh^? zk)k7MRTouf8uy@&8tEN2KJHla6A}ZKIV0?NPGavU$!=I;@b;rFxn_RcR6MMC3>?Ok zd_e#~2sV@H;F}zWzo~@pEx9h16K}8$XYvk}(F_=VAOL>Y@et-B{!3>_c5L>AO7#Z@ zU)ny&9@~?+Fmq3Z?NP^xy;@X2%V%uDV&s3pr)SQtS6Hs*AR9B7P6qeEkN9cjf?Oi1 z=>A~AefD4ohNFw~nae-%POs7R$=pTCTs!xN38;!NUN{z?R?X{NBO z*Jq{kO+Key3^V?0aKoXo3HX)=mtx9Oiod4h)G*^a#BT2c9d1Uc#QQ0C95>X zFbn{swP7YV{R10*fdF#*GB&(J+8^*jHlt={HZ%bymFOn5|C#=OW`t-_6Pm@{Ghpfp zkxNie=Z+3G-%;QKyr4)OGopenH`?6He43o%L$nq-<9qqur~3)Wf?X2`nl^Zo_Axkg zncL#hf?MJDtxN5oaJR$-`f8Slh~F)ckxWr2J;{P!_oRaCyXWq>8L-MBW1lUBJHU<| zRUb{sP(>y|)Il|-%7Hs!5XB9>f#S$Ln3Qxgd-w-!lpCyhF5&w+kuqddQyR+L+0s*0VScLUjj)7N{l;L%k_Tns($Nb znSEuojKABThcuPiJ>1ZkSM{ZFBDvSd=bNZ4#+?Jk`%BFz4EoJK3F+PG#?;7>t$Fiq zE}ui+PY_r&%Mf@}bqM%42flJiww9E~chd`tlNvBA#L&z+o4^Tbi*FeCS^<`MI`~*v zqeZoBkKeI;%-`y-ZjZMM*tscN`OsOVcK>}DNx~qi>1l96@=|zho!P?ARb%shbsPV@ zicizA#7*%th%iyUGd#)?YDw6+gK-#aFj|3{p<4dKD^|g%pVBw{je`F(6Jgc6Sj@h{ zF_ghH$dq^b7ZVh$Wk$LukA7(@In=|`{zGLbf@_WSufNUsxuALtbE%G+;k(feyt0PShB1!l zeiRw=k8u}dglY0l^xPO`AA7Y0?c^lYJ|9-vHEq`fdt>{R5Y8^x=?*IaICj5JZp)>m$qTc2xXrgGbeo*Synva{Ft2jpjP5q1yQ( 
z#(zEFD95bKGQtEI9-Gv7QFb6Y5+DhT$yWbT7yeHSCZlw6K2h>+A=sj5-(tIP&}Mfp z4}I61Fy9st08}ByssE4MB}3w8Ht=5j5U5b#0gT9UU;9khX8Ef1$I%{9kc145r6NXCE+?fx8 z8P>}4Z!Nc~s6%%%u9wO%iL0}T2te*fkZb<`B^Ehu)6 F`X9zpyc+-j literal 0 HcmV?d00001 diff --git a/tests/gitea-repositories-meta/org42/search-by-path.git/objects/pack/pack-393dc29256bc27cb2ec73898507df710be7a3cf5.rev b/tests/gitea-repositories-meta/org42/search-by-path.git/objects/pack/pack-393dc29256bc27cb2ec73898507df710be7a3cf5.rev new file mode 100644 index 0000000000000000000000000000000000000000..869860ba611c41e8af0cad3c759ce94e9cacd2a6 GIT binary patch literal 196 zcmWIYbctYKU|@t|Rv;|`#GF7Z2*m6_%nQVPKr9Z#JV49^#QZ=k2gK4q4ARRC#L7S{ z55$TeE+Wg?F|5;{u!wN literal 0 HcmV?d00001 diff --git a/tests/gitea-repositories-meta/org42/search-by-path.git/packed-refs b/tests/gitea-repositories-meta/org42/search-by-path.git/packed-refs new file mode 100644 index 0000000000000..70e69af1e1018 --- /dev/null +++ b/tests/gitea-repositories-meta/org42/search-by-path.git/packed-refs @@ -0,0 +1,14 @@ +# pack-refs with: peeled fully-peeled sorted +90c1019714259b24fb81711d4416ac0f18667dfa refs/heads/DefaultBranch +985f0301dba5e7b34be866819cd15ad3d8f508ee refs/heads/branch2 +65f1bf27bc3bf70f64657658635e66094edbcb4d refs/heads/develop +65f1bf27bc3bf70f64657658635e66094edbcb4d refs/heads/feature/1 +78fb907e3a3309eae4fe8fef030874cebbf1cd5e refs/heads/home-md-img-check +3731fe53b763859aaf83e703ee731f6b9447ff1e refs/heads/master +62fb502a7172d4453f0322a2cc85bddffa57f07a refs/heads/pr-to-update +4649299398e4d39a5c09eb4f534df6f1e1eb87cc refs/heads/sub-home-md-img-check +3fa2f829675543ecfc16b2891aebe8bf0608a8f4 refs/notes/commits +4a357436d925b5c974181ff12a994538ddc5a269 refs/pull/2/head +5f22f7d0d95d614d25a5b68592adb345a4b5c7fd refs/pull/3/head +62fb502a7172d4453f0322a2cc85bddffa57f07a refs/pull/5/head +65f1bf27bc3bf70f64657658635e66094edbcb4d refs/tags/v1.1 diff --git a/tests/integration/api_org_test.go b/tests/integration/api_org_test.go index 
70d3a446f7688..fff121490c9ca 100644 --- a/tests/integration/api_org_test.go +++ b/tests/integration/api_org_test.go @@ -177,7 +177,7 @@ func TestAPIGetAll(t *testing.T) { var apiOrgList []*api.Organization DecodeJSON(t, resp, &apiOrgList) - assert.Len(t, apiOrgList, 12) + assert.Len(t, apiOrgList, 13) assert.Equal(t, "Limited Org 36", apiOrgList[1].FullName) assert.Equal(t, "limited", apiOrgList[1].Visibility) @@ -186,7 +186,7 @@ func TestAPIGetAll(t *testing.T) { resp = MakeRequest(t, req, http.StatusOK) DecodeJSON(t, resp, &apiOrgList) - assert.Len(t, apiOrgList, 8) + assert.Len(t, apiOrgList, 9) assert.Equal(t, "org 17", apiOrgList[0].FullName) assert.Equal(t, "public", apiOrgList[0].Visibility) } diff --git a/tests/integration/api_repo_test.go b/tests/integration/api_repo_test.go index 716da762e542d..93c9ca0920d49 100644 --- a/tests/integration/api_repo_test.go +++ b/tests/integration/api_repo_test.go @@ -94,9 +94,9 @@ func TestAPISearchRepo(t *testing.T) { }{ { name: "RepositoriesMax50", requestURL: "/api/v1/repos/search?limit=50&private=false", expectedResults: expectedResults{ - nil: {count: 35}, - user: {count: 35}, - user2: {count: 35}, + nil: {count: 36}, + user: {count: 36}, + user2: {count: 36}, }, }, { From 77919711aa6f78d3699a77e15c2ab98143bca880 Mon Sep 17 00:00:00 2001 From: Bruno Sofiato Date: Thu, 10 Oct 2024 13:55:07 -0300 Subject: [PATCH 2/4] Included documentation about fuzziness' semantics within Bleve --- modules/indexer/internal/bleve/util.go | 3 +++ modules/indexer/internal/bleve/util_test.go | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/modules/indexer/internal/bleve/util.go b/modules/indexer/internal/bleve/util.go index 79c615a238cd4..b426b39bc20db 100644 --- a/modules/indexer/internal/bleve/util.go +++ b/modules/indexer/internal/bleve/util.go @@ -53,6 +53,9 @@ func openIndexer(path string, latestVersion int) (bleve.Index, int, error) { return index, 0, nil } +// GuessFuzzinessByKeyword guesses the fuzziness to use when searching for the given
phrase. The fuzziness is based on the Levenshtein distance and determines how many chars +// may differ between two strings while they are still considered equivalent. +// Given a phrase, its shortest word determines its fuzziness. If a phrase uses CJK (eg: `갃갃갃` `啊啊啊`), the fuzziness is zero. func GuessFuzzinessByKeyword(s string) int { tokenizer := unicode.NewUnicodeTokenizer() tokens := tokenizer.Tokenize([]byte(s)) diff --git a/modules/indexer/internal/bleve/util_test.go b/modules/indexer/internal/bleve/util_test.go index da69917496261..ae0b12c08d42b 100644 --- a/modules/indexer/internal/bleve/util_test.go +++ b/modules/indexer/internal/bleve/util_test.go @@ -10,10 +10,10 @@ import ( "github.com/stretchr/testify/assert" ) -func TestBleveIndexAndSearch(t *testing.T) { +func TestBleveGuessFuzzinessByKeyword(t *testing.T) { scenarios := []struct { Input string - Fuzziness int + Fuzziness int // See util.go for the definition of fuzziness in this particular context }{ { Input: "", From 37dc7699564a14bba8d30af0b9ea08bb43cb3169 Mon Sep 17 00:00:00 2001 From: Bruno Sofiato Date: Thu, 10 Oct 2024 19:26:31 -0300 Subject: [PATCH 3/4] Adding repo structure to description --- .../org42/search-by-path.git/description | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tests/gitea-repositories-meta/org42/search-by-path.git/description b/tests/gitea-repositories-meta/org42/search-by-path.git/description index 3165765e21991..382e2d7f10128 100644 --- a/tests/gitea-repositories-meta/org42/search-by-path.git/description +++ b/tests/gitea-repositories-meta/org42/search-by-path.git/description @@ -1 +1,8 @@ -This repository will be used to test code search, +This repository will be used to test code search. The snippet below shows its directory structure + +.
+├── avocado.md +├── cucumber.md +├── ham.md +└── potato + └── ham.md From 765e2f45169d1fed8f3cf2dcafade1c10faf8a0f Mon Sep 17 00:00:00 2001 From: Bruno Sofiato Date: Fri, 11 Oct 2024 16:20:20 -0300 Subject: [PATCH 4/4] Included some descriptions for the code search test scenarios --- modules/indexer/code/indexer_test.go | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/modules/indexer/code/indexer_test.go b/modules/indexer/code/indexer_test.go index aca7966e33a83..5b33528dcde04 100644 --- a/modules/indexer/code/indexer_test.go +++ b/modules/indexer/code/indexer_test.go @@ -44,6 +44,8 @@ func testIndexer(name string, t *testing.T, indexer internal.Indexer) { Langs int Results []codeSearchResult }{ + // Search for an exact match on the contents of a file + // This scenario yields a single result (the file README.md on the repo '1') { RepoIDs: nil, Keyword: "Description", @@ -55,11 +57,15 @@ func testIndexer(name string, t *testing.T, indexer internal.Indexer) { }, }, }, + // Search for an exact match on the contents of a file within the repo '2'. + // This scenario yields no results { RepoIDs: []int64{2}, Keyword: "Description", Langs: 0, }, + // Search for an exact match on the contents of a file + // This scenario yields a single result (the file README.md on the repo '1') { RepoIDs: nil, Keyword: "repo1", @@ -71,16 +77,22 @@ func testIndexer(name string, t *testing.T, indexer internal.Indexer) { }, }, }, + // Search for an exact match on the contents of a file within the repo '2'. + // This scenario yields no results { RepoIDs: []int64{2}, Keyword: "repo1", Langs: 0, }, + // Search for a non-existing term. + // This scenario yields no results { RepoIDs: nil, Keyword: "non-exist", Langs: 0, }, + // Search for an exact match on the contents of a file within the repo '62'. 
+ // This scenario yields a single result (the file avocado.md on the repo '62') { RepoIDs: []int64{62}, Keyword: "pineaple", @@ -92,6 +104,8 @@ func testIndexer(name string, t *testing.T, indexer internal.Indexer) { }, }, }, + // Search for an exact match on the filename within the repo '62'. + // This scenario yields a single result (the file avocado.md on the repo '62') { RepoIDs: []int64{62}, Keyword: "avocado.md", @@ -103,6 +117,8 @@ func testIndexer(name string, t *testing.T, indexer internal.Indexer) { }, }, }, + // Search for a partial match on the filename within the repo '62'. + // This scenario yields a single result (the file avocado.md on the repo '62') { RepoIDs: []int64{62}, Keyword: "avo", @@ -114,6 +130,8 @@ func testIndexer(name string, t *testing.T, indexer internal.Indexer) { }, }, }, + // Search for matches on both the contents and the filenames within the repo '62'. + // This scenario yields two results: the first result is based on the filename (cucumber.md) while the second is based on the contents { RepoIDs: []int64{62}, Keyword: "cucumber", @@ -129,6 +147,8 @@ func testIndexer(name string, t *testing.T, indexer internal.Indexer) { }, }, }, + // Search for matches on the filenames within the repo '62'. + // This scenario yields two results (both are based on filename, the first one is an exact match) { RepoIDs: []int64{62}, Keyword: "ham", @@ -144,6 +164,8 @@ func testIndexer(name string, t *testing.T, indexer internal.Indexer) { }, }, }, + // Search for matches on the contents of files within the repo '62'. + // This scenario yields two results (both are based on contents, the first one is an exact match whereas the second is a 'fuzzy' one) { RepoIDs: []int64{62}, Keyword: "This is not cheese",