4
4
"context"
5
5
"errors"
6
6
"fmt"
7
- "strconv"
8
7
"time"
9
8
10
9
"github.com/coder/quartz"
@@ -33,11 +32,15 @@ type PlaybackManager struct {
33
32
// readinessCheck checks if a waiting or replaying partition can be
34
33
// switched to ready.
35
34
readinessCheck PartitionReadinessCheck
36
- logger log.Logger
35
+ // zone is used to discard our own records.
36
+ zone string
37
+ logger log.Logger
37
38
38
39
// Metrics.
39
- recordsProcessed * prometheus.CounterVec
40
40
lag prometheus.Histogram
41
+ recordsFetched prometheus.Counter
42
+ recordsDiscarded prometheus.Counter
43
+ recordsInvalid prometheus.Counter
41
44
42
45
// Used for tests.
43
46
clock quartz.Clock
@@ -49,6 +52,7 @@ func NewPlaybackManager(
49
52
partitionManager * PartitionManager ,
50
53
usage * UsageStore ,
51
54
readinessCheck PartitionReadinessCheck ,
55
+ zone string ,
52
56
logger log.Logger ,
53
57
reg prometheus.Registerer ,
54
58
) * PlaybackManager {
@@ -57,15 +61,9 @@ func NewPlaybackManager(
57
61
partitionManager : partitionManager ,
58
62
usage : usage ,
59
63
readinessCheck : readinessCheck ,
64
+ zone : zone ,
60
65
logger : logger ,
61
66
clock : quartz .NewReal (),
62
- recordsProcessed : promauto .With (reg ).NewCounterVec (
63
- prometheus.CounterOpts {
64
- Name : "loki_ingest_limits_records_processed_total" ,
65
- Help : "The total number of records processed." ,
66
- },
67
- []string {"partition" },
68
- ),
69
67
lag : promauto .With (reg ).NewHistogram (
70
68
prometheus.HistogramOpts {
71
69
Name : "loki_ingest_limits_lag_seconds" ,
@@ -76,6 +74,24 @@ func NewPlaybackManager(
76
74
Buckets : prometheus .ExponentialBuckets (0.125 , 2 , 18 ),
77
75
},
78
76
),
77
+ recordsFetched : promauto .With (reg ).NewCounter (
78
+ prometheus.CounterOpts {
79
+ Name : "loki_ingest_limits_records_fetched_total" ,
80
+ Help : "The total number of records fetched." ,
81
+ },
82
+ ),
83
+ recordsDiscarded : promauto .With (reg ).NewCounter (
84
+ prometheus.CounterOpts {
85
+ Name : "loki_ingest_limits_records_discarded_total" ,
86
+ Help : "The total number of records discarded." ,
87
+ },
88
+ ),
89
+ recordsInvalid : promauto .With (reg ).NewCounter (
90
+ prometheus.CounterOpts {
91
+ Name : "loki_ingest_limits_records_invalid_total" ,
92
+ Help : "The total number of invalid records." ,
93
+ },
94
+ ),
79
95
}
80
96
}
81
97
@@ -121,26 +137,26 @@ func (m *PlaybackManager) processFetchTopicPartition(ctx context.Context) func(k
121
137
return
122
138
}
123
139
logger := log .With (m .logger , "partition" , p .Partition )
140
+ m .recordsFetched .Add (float64 (len (p .Records )))
141
+ // We need the state of the partition so we can discard any records
142
+ // that we produced (unless replaying) and mark a replaying partition
143
+ // as ready once it has finished replaying.
124
144
state , ok := m .partitionManager .GetState (p .Partition )
125
145
if ! ok {
146
+ m .recordsDiscarded .Add (float64 (len (p .Records )))
126
147
level .Warn (logger ).Log ("msg" , "discarding records for partition as the partition is not assigned to this client" )
127
148
return
128
149
}
129
- if state == PartitionReplaying {
130
- // TODO(grobinson): For now we just consume records when replaying
131
- // a partition. In a future commit this will be moved outside of
132
- // this check, and records will be consumed both when replaying
133
- // newly assigned partitions and merging records from other zones.
134
- for _ , r := range p .Records {
135
- if err := m .processRecord (ctx , r ); err != nil {
136
- level .Error (logger ).Log ("msg" , "failed to process record" , "err" , err .Error ())
137
- }
150
+ for _ , r := range p .Records {
151
+ if err := m .processRecord (ctx , state , r ); err != nil {
152
+ level .Error (logger ).Log ("msg" , "failed to process record" , "err" , err .Error ())
138
153
}
139
- m .recordsProcessed .
140
- WithLabelValues (strconv .FormatInt (int64 (p .Partition ), 10 )).
141
- Add (float64 (len (p .Records )))
142
- m .lag .Observe (m .clock .Since (p .Records [len (p .Records )- 1 ].Timestamp ).Seconds ())
143
- passed , err := m .readinessCheck (p .Partition , p .Records [len (p .Records )- 1 ])
154
+ }
155
+ // Get the last record (has the latest offset and timestamp).
156
+ lastRecord := p .Records [len (p .Records )- 1 ]
157
+ m .lag .Observe (m .clock .Since (lastRecord .Timestamp ).Seconds ())
158
+ if state == PartitionReplaying {
159
+ passed , err := m .readinessCheck (p .Partition , lastRecord )
144
160
if err != nil {
145
161
level .Error (logger ).Log ("msg" , "failed to run readiness check" , "err" , err .Error ())
146
162
} else if passed {
@@ -151,11 +167,17 @@ func (m *PlaybackManager) processFetchTopicPartition(ctx context.Context) func(k
151
167
}
152
168
}
153
169
154
- func (m * PlaybackManager ) processRecord (_ context.Context , r * kgo.Record ) error {
// processRecord decodes the value of a single fetched record as a
// proto.StreamMetadataRecord and applies it to the usage store.
//
// Parameters:
//   - the context argument is accepted but unused (it is bound to _).
//   - state is the state of the partition the record was fetched from, as
//     looked up by the caller.
//   - r is the record; its Value is unmarshaled and its Timestamp is
//     forwarded to the usage store.
//
// Returns a "corrupted record" error wrapping the unmarshal error when the
// value cannot be decoded, and nil otherwise, including when the record is
// discarded.
170
+ func (m * PlaybackManager ) processRecord (_ context.Context , state PartitionState , r * kgo.Record ) error {
155
171
s := proto.StreamMetadataRecord {}
156
172
if err := s .Unmarshal (r .Value ); err != nil {
173
// Count the undecodable record before reporting it to the caller.
+ m .recordsInvalid .Inc ()
157
174
return fmt .Errorf ("corrupted record: %w" , err )
158
175
}
176
// The zone comparison only applies when the partition is ready; in any
// other state, records from our own zone fall through to the usage update.
+ if state == PartitionReady && m .zone == s .Zone {
177
+ // Discard our own records so we don't count the same streams twice.
178
+ m .recordsDiscarded .Inc ()
179
+ return nil
180
+ }
159
181
// Record the stream against its tenant using the record's own timestamp.
// NOTE(review): s.Metadata is passed on without a nil check; confirm that
// Update tolerates a nil entry or that producers always set Metadata.
m .usage .Update (s .Tenant , []* proto.StreamMetadata {s .Metadata }, r .Timestamp , nil )
160
182
return nil
161
183
}
0 commit comments