Skip to content

Commit 7ba1581

Browse files
Authored by: zeripath, lafriks, 6543, a1012112796, lunny
Use cat-file --batch in GetLanguageStats (#14685)
* Use cat-file --batch in GetLanguageStats

  This PR moves to using a single `cat-file --batch` in GetLanguageStats,
  significantly reducing the number of processes spawned during language
  stat processing.

  Signed-off-by: Andrew Thornton <art27@cantab.net>

* placate lint

  Signed-off-by: Andrew Thornton <art27@cantab.net>

* Update modules/git/repo_language_stats_nogogit.go

Co-authored-by: a1012112796 <1012112796@qq.com>
Co-authored-by: Lauris BH <lauris@nix.lv>
Co-authored-by: 6543 <6543@obermui.de>
Co-authored-by: Lunny Xiao <xiaolunwen@gmail.com>
1 parent fe628d8 commit 7ba1581

File tree

1 file changed

+100
-30
lines changed

1 file changed

+100
-30
lines changed

modules/git/repo_language_stats_nogogit.go

Lines changed: 100 additions & 30 deletions
Original file line number | Diff line number | Diff line change
@@ -7,9 +7,11 @@
77
package git
88

99
import (
10+
"bufio"
1011
"bytes"
1112
"io"
12-
"io/ioutil"
13+
"math"
14+
"strings"
1315

1416
"code.gitea.io/gitea/modules/analyze"
1517

@@ -18,16 +20,60 @@ import (
1820

1921
// GetLanguageStats calculates language stats for git repository at specified commit
2022
func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, error) {
21-
// FIXME: We can be more efficient here...
22-
//
23-
// We're expecting that we will be reading a lot of blobs and the trees
24-
// Thus we should use a shared `cat-file --batch` to get all of this data
25-
// And keep the buffers around with resets as necessary.
26-
//
27-
// It's more complicated so...
28-
commit, err := repo.GetCommit(commitID)
23+
// We will feed the commit IDs in order into cat-file --batch, followed by blobs as necessary.
24+
// so let's create a batch stdin and stdout
25+
26+
batchStdinReader, batchStdinWriter := io.Pipe()
27+
batchStdoutReader, batchStdoutWriter := io.Pipe()
28+
defer func() {
29+
_ = batchStdinReader.Close()
30+
_ = batchStdinWriter.Close()
31+
_ = batchStdoutReader.Close()
32+
_ = batchStdoutWriter.Close()
33+
}()
34+
35+
go func() {
36+
stderr := strings.Builder{}
37+
err := NewCommand("cat-file", "--batch").RunInDirFullPipeline(repo.Path, batchStdoutWriter, &stderr, batchStdinReader)
38+
if err != nil {
39+
_ = batchStdoutWriter.CloseWithError(ConcatenateError(err, (&stderr).String()))
40+
_ = batchStdinReader.CloseWithError(ConcatenateError(err, (&stderr).String()))
41+
} else {
42+
_ = batchStdoutWriter.Close()
43+
_ = batchStdinReader.Close()
44+
}
45+
}()
46+
47+
// For simplicity's sake we'll use a buffered reader
48+
batchReader := bufio.NewReader(batchStdoutReader)
49+
50+
writeID := func(id string) error {
51+
_, err := batchStdinWriter.Write([]byte(id))
52+
if err != nil {
53+
return err
54+
}
55+
_, err = batchStdinWriter.Write([]byte{'\n'})
56+
return err
57+
}
58+
59+
if err := writeID(commitID); err != nil {
60+
return nil, err
61+
}
62+
shaBytes, typ, size, err := ReadBatchLine(batchReader)
63+
if typ != "commit" {
64+
log("Unable to get commit for: %s. Err: %v", commitID, err)
65+
return nil, ErrNotExist{commitID, ""}
66+
}
67+
68+
sha, err := NewIDFromString(string(shaBytes))
2969
if err != nil {
30-
log("Unable to get commit for: %s", commitID)
70+
log("Unable to get commit for: %s. Err: %v", commitID, err)
71+
return nil, ErrNotExist{commitID, ""}
72+
}
73+
74+
commit, err := CommitFromReader(repo, sha, io.LimitReader(batchReader, size))
75+
if err != nil {
76+
log("Unable to get commit for: %s. Err: %v", commitID, err)
3177
return nil, err
3278
}
3379

@@ -38,17 +84,45 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err
3884
return nil, err
3985
}
4086

87+
contentBuf := bytes.Buffer{}
88+
var content []byte
4189
sizes := make(map[string]int64)
4290
for _, f := range entries {
91+
contentBuf.Reset()
92+
content = contentBuf.Bytes()
4393
if f.Size() == 0 || enry.IsVendor(f.Name()) || enry.IsDotFile(f.Name()) ||
4494
enry.IsDocumentation(f.Name()) || enry.IsConfiguration(f.Name()) {
4595
continue
4696
}
4797

4898
// If content can not be read or file is too big just do detection by filename
49-
var content []byte
99+
50100
if f.Size() <= bigFileSize {
51-
content, _ = readFile(f, fileSizeLimit)
101+
if err := writeID(f.ID.String()); err != nil {
102+
return nil, err
103+
}
104+
_, _, size, err := ReadBatchLine(batchReader)
105+
if err != nil {
106+
log("Error reading blob: %s Err: %v", f.ID.String(), err)
107+
return nil, err
108+
}
109+
110+
sizeToRead := size
111+
discard := int64(0)
112+
if size > fileSizeLimit {
113+
sizeToRead = fileSizeLimit
114+
discard = size - fileSizeLimit
115+
}
116+
117+
_, err = contentBuf.ReadFrom(io.LimitReader(batchReader, sizeToRead))
118+
if err != nil {
119+
return nil, err
120+
}
121+
content = contentBuf.Bytes()
122+
err = discardFull(batchReader, discard)
123+
if err != nil {
124+
return nil, err
125+
}
52126
}
53127
if enry.IsGenerated(f.Name(), content) {
54128
continue
@@ -86,24 +160,20 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err
86160
return sizes, nil
87161
}
88162

89-
func readFile(entry *TreeEntry, limit int64) ([]byte, error) {
90-
// FIXME: We can probably be a little more efficient here... see above
91-
r, err := entry.Blob().DataAsync()
92-
if err != nil {
93-
return nil, err
94-
}
95-
defer r.Close()
96-
97-
if limit <= 0 {
98-
return ioutil.ReadAll(r)
163+
func discardFull(rd *bufio.Reader, discard int64) error {
164+
if discard > math.MaxInt32 {
165+
n, err := rd.Discard(math.MaxInt32)
166+
discard -= int64(n)
167+
if err != nil {
168+
return err
169+
}
99170
}
100-
101-
size := entry.Size()
102-
if limit > 0 && size > limit {
103-
size = limit
171+
for discard > 0 {
172+
n, err := rd.Discard(int(discard))
173+
discard -= int64(n)
174+
if err != nil {
175+
return err
176+
}
104177
}
105-
buf := bytes.NewBuffer(nil)
106-
buf.Grow(int(size))
107-
_, err = io.Copy(buf, io.LimitReader(r, limit))
108-
return buf.Bytes(), err
178+
return nil
109179
}

0 commit comments

Comments
 (0)