-
-
Notifications
You must be signed in to change notification settings - Fork 1.4k
[client] introduce client-side event aggregation #6394
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
60bcf7d
243e934
b654a75
101ae3c
8f99362
42e0007
12a8943
a593e32
d9d585e
598558c
98ce097
b21f7f7
e3f9396
c875aa6
fae5b7e
9dc5e77
07c527f
a93cb66
7295e2e
67d1419
ca4ce0a
e89c0f5
0286c17
3d6fc3b
9ea463e
1721a4f
5dc159e
0e95b6d
fd763ec
17cc13f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -9,12 +9,14 @@ import ( | |
| "sync" | ||
| "time" | ||
|
|
||
| "github.com/cenkalti/backoff/v4" | ||
| "github.com/google/uuid" | ||
| log "github.com/sirupsen/logrus" | ||
| "google.golang.org/protobuf/types/known/timestamppb" | ||
|
|
||
| "github.com/netbirdio/netbird/client/internal/netflow/conntrack" | ||
| "github.com/netbirdio/netbird/client/internal/netflow/logger" | ||
| "github.com/netbirdio/netbird/client/internal/netflow/store" | ||
| nftypes "github.com/netbirdio/netbird/client/internal/netflow/types" | ||
| "github.com/netbirdio/netbird/client/internal/peer" | ||
| "github.com/netbirdio/netbird/flow/client" | ||
|
|
@@ -23,14 +25,16 @@ import ( | |
|
|
||
| // Manager handles netflow tracking and logging | ||
| type Manager struct { | ||
| mux sync.Mutex | ||
| shutdownWg sync.WaitGroup | ||
| logger nftypes.FlowLogger | ||
| flowConfig *nftypes.FlowConfig | ||
| conntrack nftypes.ConnTracker | ||
| receiverClient *client.GRPCClient | ||
| publicKey []byte | ||
| cancel context.CancelFunc | ||
| mux sync.Mutex | ||
| shutdownWg sync.WaitGroup | ||
| logger nftypes.FlowLogger | ||
| flowConfig *nftypes.FlowConfig | ||
| conntrack nftypes.ConnTracker | ||
| receiverClient *client.GRPCClient | ||
| eventsWithoutAcks nftypes.Store | ||
| publicKey []byte | ||
| cancel context.CancelFunc | ||
| retryInterval time.Duration | ||
| } | ||
|
|
||
| // NewManager creates a new netflow manager | ||
|
|
@@ -48,9 +52,11 @@ func NewManager(iface nftypes.IFaceMapper, publicKey []byte, statusRecorder *pee | |
| } | ||
|
|
||
| return &Manager{ | ||
| logger: flowLogger, | ||
| conntrack: ct, | ||
| publicKey: publicKey, | ||
| logger: flowLogger, | ||
| conntrack: ct, | ||
| publicKey: publicKey, | ||
| retryInterval: time.Second, | ||
| eventsWithoutAcks: store.NewMemoryStore(), | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -107,7 +113,7 @@ func (m *Manager) resetClient() error { | |
| ctx, cancel := context.WithCancel(context.Background()) | ||
| m.cancel = cancel | ||
|
|
||
| m.shutdownWg.Add(2) | ||
| m.shutdownWg.Add(3) | ||
| go func() { | ||
| defer m.shutdownWg.Done() | ||
| m.receiveACKs(ctx, flowClient) | ||
|
|
@@ -116,6 +122,10 @@ func (m *Manager) resetClient() error { | |
| defer m.shutdownWg.Done() | ||
| m.startSender(ctx) | ||
| }() | ||
| go func() { | ||
| defer m.shutdownWg.Done() | ||
| m.startRetries(ctx) | ||
| }() | ||
|
|
||
| return nil | ||
| } | ||
|
|
@@ -207,13 +217,15 @@ func (m *Manager) startSender(ctx context.Context) { | |
| case <-ctx.Done(): | ||
| return | ||
| case <-ticker.C: | ||
| events := m.logger.GetEvents() | ||
| collectedEvents := m.logger.ResetAggregationWindow() | ||
| events := collectedEvents.GetAggregatedEvents() | ||
| for _, event := range events { | ||
| if err := m.send(event); err != nil { | ||
| log.Errorf("failed to send flow event to server: %v", err) | ||
| continue | ||
| } else { | ||
| log.Tracef("sent flow event: %s", event.ID) | ||
| } | ||
| log.Tracef("sent flow event: %s", event.ID) | ||
| m.eventsWithoutAcks.StoreEvent(event) | ||
| } | ||
| } | ||
| } | ||
|
|
@@ -227,7 +239,7 @@ func (m *Manager) receiveACKs(ctx context.Context, client *client.GRPCClient) { | |
| return nil | ||
| } | ||
| log.Tracef("received flow event ack: %s", id) | ||
| m.logger.DeleteEvents([]uuid.UUID{id}) | ||
| m.eventsWithoutAcks.DeleteEvents([]uuid.UUID{id}) | ||
| return nil | ||
| }) | ||
|
|
||
|
|
@@ -236,6 +248,43 @@ func (m *Manager) receiveACKs(ctx context.Context, client *client.GRPCClient) { | |
| } | ||
| } | ||
|
|
||
| // We effectively never drop events (see MaxInterval), which makes eventsWithoutAcks unbounded. | ||
| // We may want to limit the max size of the store, and start dropping oldest events when the threshold is reached. | ||
| func (m *Manager) startRetries(ctx context.Context) { | ||
| timer := time.NewTimer(m.retryInterval) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Use Line 254 honors the configurable retry interval only for the initial timer; Line 283 hard-codes all later normal passes to one second. That makes short-interval tests or future overrides ineffective after the first fire. Proposed fix- timer = time.NewTimer(time.Second)
+ timer = time.NewTimer(m.retryInterval)Also applies to: 283-283 🤖 Prompt for AI Agents |
||
| retryBackoff := backoff.WithContext(&backoff.ExponentialBackOff{ | ||
| InitialInterval: 1 * time.Second, | ||
| RandomizationFactor: 0.5, | ||
| Multiplier: 1.7, | ||
| MaxInterval: m.flowConfig.Interval / 2, | ||
| MaxElapsedTime: 3 * 30 * 24 * time.Hour, // 3 months | ||
| Stop: backoff.Stop, | ||
| Clock: backoff.SystemClock, | ||
| }, ctx) | ||
| defer timer.Stop() | ||
|
|
||
| for { | ||
| select { | ||
| case <-ctx.Done(): | ||
| return | ||
| case <-timer.C: | ||
| for _, e := range m.eventsWithoutAcks.GetEvents() { | ||
| if e.Timestamp.Add(time.Second).After(time.Now()) { | ||
| // grace period on retries to avoid early retries | ||
| // do not retry if the event is less than 1 sec old | ||
| continue | ||
| } | ||
| if err := m.send(e); err != nil { | ||
| timer = time.NewTimer(retryBackoff.NextBackOff()) //nolint:staticcheck,wastedassign | ||
| break | ||
| } | ||
| } | ||
| retryBackoff.Reset() | ||
| timer = time.NewTimer(time.Second) | ||
| } | ||
| } | ||
| } | ||
|
|
||
| func (m *Manager) send(event *nftypes.Event) error { | ||
| m.mux.Lock() | ||
| client := m.receiverClient | ||
|
|
@@ -250,9 +299,11 @@ func (m *Manager) send(event *nftypes.Event) error { | |
|
|
||
| func toProtoEvent(publicKey []byte, event *nftypes.Event) *proto.FlowEvent { | ||
| protoEvent := &proto.FlowEvent{ | ||
| EventId: event.ID[:], | ||
| Timestamp: timestamppb.New(event.Timestamp), | ||
| PublicKey: publicKey, | ||
| EventId: event.ID[:], | ||
| Timestamp: timestamppb.New(event.Timestamp), | ||
| PublicKey: publicKey, | ||
| WindowStart: timestamppb.New(event.WindowStart), | ||
| WindowEnd: timestamppb.New(event.WindowEnd), | ||
| FlowFields: &proto.FlowFields{ | ||
| FlowId: event.FlowID[:], | ||
| RuleId: event.RuleID, | ||
|
|
@@ -267,6 +318,9 @@ func toProtoEvent(publicKey []byte, event *nftypes.Event) *proto.FlowEvent { | |
| TxBytes: event.TxBytes, | ||
| SourceResourceId: event.SourceResourceID, | ||
| DestResourceId: event.DestResourceID, | ||
| NumOfStarts: event.NumOfStarts, | ||
| NumOfEnds: event.NumOfEnds, | ||
| NumOfDrops: event.NumOfDrops, | ||
| }, | ||
| } | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Bound the un-acked event store before retrying indefinitely.
When ACKs stop, every sent aggregate remains in
eventsWithoutAcksuntil an ACK arrives. The inline comment already notes the store is unbounded; add a max size/age policy before this can grow client memory without limit during receiver outages.I can help draft a bounded store policy or open a follow-up issue if you want.
Also applies to: 271-283
🤖 Prompt for AI Agents