Skip to content

Commit ea4c139

Browse files
lafriksCirnoT
andauthored
Change language statistics to save size instead of percentage (#11681)
* Change language statistics to save size instead of percentage in database Co-Authored-By: Cirno the Strongest <1447794+CirnoT@users.noreply.github.com> * Do not exclude if only language * Fix edge cases with special languages Co-authored-by: Cirno the Strongest <1447794+CirnoT@users.noreply.github.com>
1 parent 4395c60 commit ea4c139

File tree

5 files changed

+149
-37
lines changed

5 files changed

+149
-37
lines changed

models/migrations/migrations.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -212,6 +212,8 @@ var migrations = []Migration{
212212
NewMigration("Add ResolveDoerID to Comment table", addResolveDoerIDCommentColumn),
213213
// v139 -> v140
214214
NewMigration("prepend refs/heads/ to issue refs", prependRefsHeadsToIssueRefs),
215+
// v140 -> v141
216+
NewMigration("Save detected language file size to database instead of percent", fixLanguageStatsToSaveSize),
215217
}
216218

217219
// GetCurrentDBVersion returns the current db version

models/migrations/v140.go

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
// Copyright 2020 The Gitea Authors. All rights reserved.
2+
// Use of this source code is governed by a MIT-style
3+
// license that can be found in the LICENSE file.
4+
5+
package migrations
6+
7+
import (
8+
"fmt"
9+
10+
"code.gitea.io/gitea/modules/setting"
11+
12+
"xorm.io/xorm"
13+
)
14+
15+
func fixLanguageStatsToSaveSize(x *xorm.Engine) error {
16+
// LanguageStat see models/repo_language_stats.go
17+
type LanguageStat struct {
18+
Size int64 `xorm:"NOT NULL DEFAULT 0"`
19+
}
20+
21+
// RepoIndexerType specifies the repository indexer type
22+
type RepoIndexerType int
23+
24+
const (
25+
// RepoIndexerTypeCode code indexer
26+
RepoIndexerTypeCode RepoIndexerType = iota // 0
27+
// RepoIndexerTypeStats repository stats indexer
28+
RepoIndexerTypeStats // 1
29+
)
30+
31+
// RepoIndexerStatus see models/repo_indexer.go
32+
type RepoIndexerStatus struct {
33+
IndexerType RepoIndexerType `xorm:"INDEX(s) NOT NULL DEFAULT 0"`
34+
}
35+
36+
if err := x.Sync2(new(LanguageStat)); err != nil {
37+
return fmt.Errorf("Sync2: %v", err)
38+
}
39+
40+
x.Delete(&RepoIndexerStatus{IndexerType: RepoIndexerTypeStats})
41+
42+
// Delete language stat statuses
43+
truncExpr := "TRUNCATE TABLE"
44+
if setting.Database.UseSQLite3 {
45+
truncExpr = "DELETE FROM"
46+
}
47+
48+
// Delete language stats
49+
if _, err := x.Exec(fmt.Sprintf("%s language_stat", truncExpr)); err != nil {
50+
return err
51+
}
52+
53+
sess := x.NewSession()
54+
defer sess.Close()
55+
return dropTableColumns(sess, "language_stat", "percentage")
56+
}

models/repo_language_stats.go

Lines changed: 82 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -20,11 +20,28 @@ type LanguageStat struct {
2020
CommitID string
2121
IsPrimary bool
2222
Language string `xorm:"VARCHAR(30) UNIQUE(s) INDEX NOT NULL"`
23-
Percentage float32 `xorm:"NUMERIC(5,2) NOT NULL DEFAULT 0"`
23+
Percentage float32 `xorm:"-"`
24+
Size int64 `xorm:"NOT NULL DEFAULT 0"`
2425
Color string `xorm:"-"`
2526
CreatedUnix timeutil.TimeStamp `xorm:"INDEX CREATED"`
2627
}
2728

29+
// specialLanguages is the set of language names that are left out of the
// percentage calculation whenever the repository also contains at least one
// language that is not in this set. Only list languages here that are not
// normally regarded as code (markup, data and configuration formats, plain
// text, and the catch-all "other" bucket).
var specialLanguages = map[string]struct{}{
	"INI":      {},
	"JSON":     {},
	"Markdown": {},
	"SQL":      {},
	"SVG":      {},
	"TOML":     {},
	"Text":     {},
	"XML":      {},
	"YAML":     {},
	"other":    {},
}
44+
2845
// LanguageStatList defines a list of language statistics
2946
type LanguageStatList []*LanguageStat
3047

@@ -34,12 +51,53 @@ func (stats LanguageStatList) loadAttributes() {
3451
}
3552
}
3653

54+
func (stats LanguageStatList) getLanguagePercentages() map[string]float32 {
55+
langPerc := make(map[string]float32)
56+
var otherPerc float32 = 100
57+
var total int64
58+
// Check that repository has at least one non-special language
59+
var skipSpecial bool
60+
for _, stat := range stats {
61+
if _, ok := specialLanguages[stat.Language]; !ok {
62+
skipSpecial = true
63+
break
64+
}
65+
}
66+
for _, stat := range stats {
67+
// Exclude specific languages from percentage calculation
68+
if _, ok := specialLanguages[stat.Language]; ok && skipSpecial {
69+
continue
70+
}
71+
total += stat.Size
72+
}
73+
if total > 0 {
74+
for _, stat := range stats {
75+
// Exclude specific languages from percentage calculation
76+
if _, ok := specialLanguages[stat.Language]; ok && skipSpecial {
77+
continue
78+
}
79+
perc := float32(math.Round(float64(stat.Size)/float64(total)*1000) / 10)
80+
if perc <= 0.1 {
81+
continue
82+
}
83+
otherPerc -= perc
84+
langPerc[stat.Language] = perc
85+
}
86+
otherPerc = float32(math.Round(float64(otherPerc)*10) / 10)
87+
} else {
88+
otherPerc = 100
89+
}
90+
if otherPerc > 0 {
91+
langPerc["other"] = otherPerc
92+
}
93+
return langPerc
94+
}
95+
3796
func (repo *Repository) getLanguageStats(e Engine) (LanguageStatList, error) {
3897
stats := make(LanguageStatList, 0, 6)
39-
if err := e.Where("`repo_id` = ?", repo.ID).Desc("`percentage`").Find(&stats); err != nil {
98+
if err := e.Where("`repo_id` = ?", repo.ID).Desc("`size`").Find(&stats); err != nil {
4099
return nil, err
41100
}
42-
stats.loadAttributes()
43101
return stats, nil
44102
}
45103

@@ -54,13 +112,18 @@ func (repo *Repository) GetTopLanguageStats(limit int) (LanguageStatList, error)
54112
if err != nil {
55113
return nil, err
56114
}
115+
perc := stats.getLanguagePercentages()
57116
topstats := make(LanguageStatList, 0, limit)
58117
var other float32
59118
for i := range stats {
119+
if _, ok := perc[stats[i].Language]; !ok {
120+
continue
121+
}
60122
if stats[i].Language == "other" || len(topstats) >= limit {
61-
other += stats[i].Percentage
123+
other += perc[stats[i].Language]
62124
continue
63125
}
126+
stats[i].Percentage = perc[stats[i].Language]
64127
topstats = append(topstats, stats[i])
65128
}
66129
if other > 0 {
@@ -71,11 +134,12 @@ func (repo *Repository) GetTopLanguageStats(limit int) (LanguageStatList, error)
71134
Percentage: float32(math.Round(float64(other)*10) / 10),
72135
})
73136
}
137+
topstats.loadAttributes()
74138
return topstats, nil
75139
}
76140

77141
// UpdateLanguageStats updates the language statistics for repository
78-
func (repo *Repository) UpdateLanguageStats(commitID string, stats map[string]float32) error {
142+
func (repo *Repository) UpdateLanguageStats(commitID string, stats map[string]int64) error {
79143
sess := x.NewSession()
80144
if err := sess.Begin(); err != nil {
81145
return err
@@ -87,24 +151,24 @@ func (repo *Repository) UpdateLanguageStats(commitID string, stats map[string]fl
87151
return err
88152
}
89153
var topLang string
90-
var p float32
91-
for lang, perc := range stats {
92-
if perc > p {
93-
p = perc
154+
var s int64
155+
for lang, size := range stats {
156+
if size > s {
157+
s = size
94158
topLang = strings.ToLower(lang)
95159
}
96160
}
97161

98-
for lang, perc := range stats {
162+
for lang, size := range stats {
99163
upd := false
100164
llang := strings.ToLower(lang)
101165
for _, s := range oldstats {
102166
// Update already existing language
103167
if strings.ToLower(s.Language) == llang {
104168
s.CommitID = commitID
105169
s.IsPrimary = llang == topLang
106-
s.Percentage = perc
107-
if _, err := sess.ID(s.ID).Cols("`commit_id`", "`percentage`", "`is_primary`").Update(s); err != nil {
170+
s.Size = size
171+
if _, err := sess.ID(s.ID).Cols("`commit_id`", "`size`", "`is_primary`").Update(s); err != nil {
108172
return err
109173
}
110174
upd = true
@@ -114,11 +178,11 @@ func (repo *Repository) UpdateLanguageStats(commitID string, stats map[string]fl
114178
// Insert new language
115179
if !upd {
116180
if _, err := sess.Insert(&LanguageStat{
117-
RepoID: repo.ID,
118-
CommitID: commitID,
119-
IsPrimary: llang == topLang,
120-
Language: lang,
121-
Percentage: perc,
181+
RepoID: repo.ID,
182+
CommitID: commitID,
183+
IsPrimary: llang == topLang,
184+
Language: lang,
185+
Size: size,
122186
}); err != nil {
123187
return err
124188
}
@@ -153,7 +217,7 @@ func CopyLanguageStat(originalRepo, destRepo *Repository) error {
153217
return err
154218
}
155219
RepoLang := make(LanguageStatList, 0, 6)
156-
if err := sess.Where("`repo_id` = ?", originalRepo.ID).Desc("`percentage`").Find(&RepoLang); err != nil {
220+
if err := sess.Where("`repo_id` = ?", originalRepo.ID).Desc("`size`").Find(&RepoLang); err != nil {
157221
return err
158222
}
159223
if len(RepoLang) > 0 {

modules/git/repo_language_stats.go

Lines changed: 6 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@ import (
88
"bytes"
99
"io"
1010
"io/ioutil"
11-
"math"
1211

1312
"code.gitea.io/gitea/modules/analyze"
1413

@@ -21,7 +20,7 @@ import (
2120
const fileSizeLimit int64 = 16 * 1024 * 1024
2221

2322
// GetLanguageStats calculates language stats for git repository at specified commit
24-
func (repo *Repository) GetLanguageStats(commitID string) (map[string]float32, error) {
23+
func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, error) {
2524
r, err := git.PlainOpen(repo.Path)
2625
if err != nil {
2726
return nil, err
@@ -43,7 +42,6 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]float32, e
4342
}
4443

4544
sizes := make(map[string]int64)
46-
var total int64
4745
err = tree.Files().ForEach(func(f *object.File) error {
4846
if enry.IsVendor(f.Name) || enry.IsDotFile(f.Name) ||
4947
enry.IsDocumentation(f.Name) || enry.IsConfiguration(f.Name) {
@@ -60,33 +58,22 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]float32, e
6058

6159
language := analyze.GetCodeLanguage(f.Name, content)
6260
if language == enry.OtherLanguage || language == "" {
63-
return nil
61+
language = "other"
6462
}
6563

6664
sizes[language] += f.Size
67-
total += f.Size
6865

6966
return nil
7067
})
7168
if err != nil {
7269
return nil, err
7370
}
7471

75-
stats := make(map[string]float32)
76-
var otherPerc float32 = 100
77-
for language, size := range sizes {
78-
perc := float32(math.Round(float64(size)/float64(total)*1000) / 10)
79-
if perc <= 0.1 {
80-
continue
81-
}
82-
otherPerc -= perc
83-
stats[language] = perc
72+
if len(sizes) == 0 {
73+
sizes["other"] = 0
8474
}
85-
otherPerc = float32(math.Round(float64(otherPerc)*10) / 10)
86-
if otherPerc > 0 {
87-
stats["other"] = otherPerc
88-
}
89-
return stats, nil
75+
76+
return sizes, nil
9077
}
9178

9279
func readFile(f *object.File, limit int64) ([]byte, error) {

modules/indexer/stats/indexer_test.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,9 @@ func TestRepoStatsIndex(t *testing.T) {
3434

3535
repo, err := models.GetRepositoryByID(1)
3636
assert.NoError(t, err)
37+
status, err := repo.GetIndexerStatus(models.RepoIndexerTypeStats)
38+
assert.NoError(t, err)
39+
assert.Equal(t, "65f1bf27bc3bf70f64657658635e66094edbcb4d", status.CommitSha)
3740
langs, err := repo.GetTopLanguageStats(5)
3841
assert.NoError(t, err)
3942
assert.Len(t, langs, 1)

0 commit comments

Comments
 (0)