blob: 6d8a93ce9fdf789ac7551a60a642afa5bd8a50b9 [file] [log] [blame]
khenaidoo26721882021-08-11 17:42:52 -04001/*
Joey Armstrong9cdee9f2024-01-03 04:56:14 -05002 * Copyright 2021-2024 Open Networking Foundation (ONF) and the ONF Contributors
khenaidoo26721882021-08-11 17:42:52 -04003 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16package grpc
17
18import (
19 "context"
20 "fmt"
21 "reflect"
22 "strings"
23 "sync"
24 "time"
25
26 grpc_middleware "github.com/grpc-ecosystem/go-grpc-middleware"
27 grpc_opentracing "github.com/grpc-ecosystem/go-grpc-middleware/tracing/opentracing"
Abhay Kumar685507d2025-10-06 09:04:09 +000028 grpc_prometheus "github.com/grpc-ecosystem/go-grpc-prometheus"
khenaidoo0927c722021-12-15 16:49:32 -050029 "github.com/jhump/protoreflect/dynamic/grpcdynamic"
30 "github.com/jhump/protoreflect/grpcreflect"
khenaidoo26721882021-08-11 17:42:52 -040031 "github.com/opencord/voltha-lib-go/v7/pkg/log"
32 "github.com/opencord/voltha-lib-go/v7/pkg/probe"
khenaidoo0927c722021-12-15 16:49:32 -050033 "github.com/opencord/voltha-protos/v5/go/adapter_service"
khenaidoob9503212021-12-08 14:22:21 -050034 "github.com/opencord/voltha-protos/v5/go/common"
khenaidooa5feb8e2021-10-19 17:29:22 -040035 "github.com/opencord/voltha-protos/v5/go/core_service"
36 "github.com/opencord/voltha-protos/v5/go/olt_inter_adapter_service"
37 "github.com/opencord/voltha-protos/v5/go/onu_inter_adapter_service"
khenaidoo26721882021-08-11 17:42:52 -040038 "google.golang.org/grpc"
khenaidoo0927c722021-12-15 16:49:32 -050039 "google.golang.org/grpc/codes"
40 rpb "google.golang.org/grpc/reflection/grpc_reflection_v1alpha"
41 "google.golang.org/grpc/status"
khenaidoo26721882021-08-11 17:42:52 -040042)
43
type (
	// event is a notification posted on a Client's events channel and
	// consumed by the Start loop.
	event byte

	// state is the connection state tracked by a Client.
	state byte
)
khenaidoo0927c722021-12-15 16:49:32 -050046type GetServiceClient func(context.Context, *grpc.ClientConn) interface{}
khenaidoo26721882021-08-11 17:42:52 -040047type RestartedHandler func(ctx context.Context, endPoint string) error
48
// Names of the environment variables that NewClient consults to override the
// default timing parameters of a Client.
const (
	// grpcBackoffInitialInterval names the variable for the first retry delay.
	grpcBackoffInitialInterval = "GRPC_BACKOFF_INITIAL_INTERVAL"
	// grpcBackoffMaxInterval names the variable for the largest retry delay.
	grpcBackoffMaxInterval = "GRPC_BACKOFF_MAX_INTERVAL"
	// grpcBackoffMaxElapsedTime names the variable for the overall retry budget.
	grpcBackoffMaxElapsedTime = "GRPC_BACKOFF_MAX_ELAPSED_TIME"
	// grpcMonitorInterval names the variable for the keep-alive period.
	grpcMonitorInterval = "GRPC_MONITOR_INTERVAL"
)
55
// Default timing parameters applied by NewClient before any environment
// variable override is read.
const (
	// DefaultBackoffInitialInterval is the first delay between connection retries.
	DefaultBackoffInitialInterval = 100 * time.Millisecond
	// DefaultBackoffMaxInterval is the upper bound on the delay between retries.
	DefaultBackoffMaxInterval = 5 * time.Second
	// DefaultBackoffMaxElapsedTime is the overall retry budget; zero means no time limit.
	DefaultBackoffMaxElapsedTime = time.Duration(0)
	// DefaultGRPCMonitorInterval is the period between keep-alive exchanges.
	DefaultGRPCMonitorInterval = 5 * time.Second
)
62
// grpcRecvMsgSizeLimit is the maximum size, in MB, of a message this client
// accepts; connectToEndpoint multiplies it by 1024*1024 when dialing.
// [VOL-5434] Raised to 20 MB from the 4 MB value of
// 'defaultServerMaxReceiveMessageSize'.
const grpcRecvMsgSizeLimit = 20
68
69const (
khenaidoo26721882021-08-11 17:42:52 -040070 eventConnecting = event(iota)
khenaidoo0927c722021-12-15 16:49:32 -050071 eventValidatingConnection
khenaidoo26721882021-08-11 17:42:52 -040072 eventConnected
73 eventDisconnected
74 eventStopped
75 eventError
76
77 stateConnected = state(iota)
khenaidoo0927c722021-12-15 16:49:32 -050078 stateValidatingConnection
khenaidoo26721882021-08-11 17:42:52 -040079 stateConnecting
80 stateDisconnected
81)
82
83type Client struct {
khenaidoob9503212021-12-08 14:22:21 -050084 clientEndpoint string
khenaidoo0927c722021-12-15 16:49:32 -050085 clientContextData string
khenaidoob9503212021-12-08 14:22:21 -050086 serverEndPoint string
khenaidoo0927c722021-12-15 16:49:32 -050087 remoteServiceName string
khenaidoo26721882021-08-11 17:42:52 -040088 connection *grpc.ClientConn
89 connectionLock sync.RWMutex
90 stateLock sync.RWMutex
91 state state
92 service interface{}
93 events chan event
94 onRestart RestartedHandler
95 backoffInitialInterval time.Duration
96 backoffMaxInterval time.Duration
97 backoffMaxElapsedTime time.Duration
khenaidoo26721882021-08-11 17:42:52 -040098 monitorInterval time.Duration
khenaidoo26721882021-08-11 17:42:52 -040099 done bool
khenaidoo0927c722021-12-15 16:49:32 -0500100 livenessLock sync.RWMutex
khenaidoo26721882021-08-11 17:42:52 -0400101 livenessCallback func(timestamp time.Time)
102}
103
104type ClientOption func(*Client)
105
khenaidoo0927c722021-12-15 16:49:32 -0500106func ClientContextData(data string) ClientOption {
107 return func(args *Client) {
108 args.clientContextData = data
109 }
110}
111
112func NewClient(clientEndpoint, serverEndpoint, remoteServiceName string, onRestart RestartedHandler,
113 opts ...ClientOption) (*Client, error) {
khenaidoo26721882021-08-11 17:42:52 -0400114 c := &Client{
khenaidoob9503212021-12-08 14:22:21 -0500115 clientEndpoint: clientEndpoint,
116 serverEndPoint: serverEndpoint,
khenaidoo0927c722021-12-15 16:49:32 -0500117 remoteServiceName: remoteServiceName,
khenaidoo26721882021-08-11 17:42:52 -0400118 onRestart: onRestart,
khenaidoo0927c722021-12-15 16:49:32 -0500119 events: make(chan event, 5),
khenaidoo26721882021-08-11 17:42:52 -0400120 state: stateDisconnected,
121 backoffInitialInterval: DefaultBackoffInitialInterval,
122 backoffMaxInterval: DefaultBackoffMaxInterval,
123 backoffMaxElapsedTime: DefaultBackoffMaxElapsedTime,
124 monitorInterval: DefaultGRPCMonitorInterval,
125 }
126 for _, option := range opts {
127 option(c)
128 }
129
130 // Check for environment variables
131 if err := SetFromEnvVariable(grpcBackoffInitialInterval, &c.backoffInitialInterval); err != nil {
132 logger.Warnw(context.Background(), "failure-reading-env-variable", log.Fields{"error": err, "variable": grpcBackoffInitialInterval})
133 }
134
135 if err := SetFromEnvVariable(grpcBackoffMaxInterval, &c.backoffMaxInterval); err != nil {
136 logger.Warnw(context.Background(), "failure-reading-env-variable", log.Fields{"error": err, "variable": grpcBackoffMaxInterval})
137 }
138
139 if err := SetFromEnvVariable(grpcBackoffMaxElapsedTime, &c.backoffMaxElapsedTime); err != nil {
140 logger.Warnw(context.Background(), "failure-reading-env-variable", log.Fields{"error": err, "variable": grpcBackoffMaxElapsedTime})
141 }
142
143 if err := SetFromEnvVariable(grpcMonitorInterval, &c.monitorInterval); err != nil {
144 logger.Warnw(context.Background(), "failure-reading-env-variable", log.Fields{"error": err, "variable": grpcMonitorInterval})
145 }
146
147 logger.Infow(context.Background(), "initialized-client", log.Fields{"client": c})
148
149 // Sanity check
150 if c.backoffInitialInterval > c.backoffMaxInterval {
151 return nil, fmt.Errorf("initial retry delay %v is greater than maximum retry delay %v", c.backoffInitialInterval, c.backoffMaxInterval)
152 }
153
khenaidoo0927c722021-12-15 16:49:32 -0500154 grpc.EnableTracing = true
155
khenaidoo26721882021-08-11 17:42:52 -0400156 return c, nil
157}
158
159func (c *Client) GetClient() (interface{}, error) {
160 c.connectionLock.RLock()
161 defer c.connectionLock.RUnlock()
162 if c.service == nil {
khenaidoob9503212021-12-08 14:22:21 -0500163 return nil, fmt.Errorf("no connection to %s", c.serverEndPoint)
khenaidoo26721882021-08-11 17:42:52 -0400164 }
165 return c.service, nil
166}
167
168// GetCoreServiceClient is a helper function that returns a concrete service instead of the GetClient() API
169// which returns an interface
khenaidooa5feb8e2021-10-19 17:29:22 -0400170func (c *Client) GetCoreServiceClient() (core_service.CoreServiceClient, error) {
khenaidoo26721882021-08-11 17:42:52 -0400171 c.connectionLock.RLock()
172 defer c.connectionLock.RUnlock()
173 if c.service == nil {
khenaidoob9503212021-12-08 14:22:21 -0500174 return nil, fmt.Errorf("no core connection to %s", c.serverEndPoint)
khenaidoo26721882021-08-11 17:42:52 -0400175 }
khenaidooa5feb8e2021-10-19 17:29:22 -0400176 client, ok := c.service.(core_service.CoreServiceClient)
khenaidoo26721882021-08-11 17:42:52 -0400177 if ok {
178 return client, nil
179 }
180 return nil, fmt.Errorf("invalid-service-%s", reflect.TypeOf(c.service))
181}
182
183// GetOnuAdapterServiceClient is a helper function that returns a concrete service instead of the GetClient() API
184// which returns an interface
khenaidooa5feb8e2021-10-19 17:29:22 -0400185func (c *Client) GetOnuInterAdapterServiceClient() (onu_inter_adapter_service.OnuInterAdapterServiceClient, error) {
khenaidoo26721882021-08-11 17:42:52 -0400186 c.connectionLock.RLock()
187 defer c.connectionLock.RUnlock()
188 if c.service == nil {
khenaidoob9503212021-12-08 14:22:21 -0500189 return nil, fmt.Errorf("no child adapter connection to %s", c.serverEndPoint)
khenaidoo26721882021-08-11 17:42:52 -0400190 }
khenaidooa5feb8e2021-10-19 17:29:22 -0400191 client, ok := c.service.(onu_inter_adapter_service.OnuInterAdapterServiceClient)
khenaidoo26721882021-08-11 17:42:52 -0400192 if ok {
193 return client, nil
194 }
195 return nil, fmt.Errorf("invalid-service-%s", reflect.TypeOf(c.service))
196}
197
198// GetOltAdapterServiceClient is a helper function that returns a concrete service instead of the GetClient() API
199// which returns an interface
khenaidooa5feb8e2021-10-19 17:29:22 -0400200func (c *Client) GetOltInterAdapterServiceClient() (olt_inter_adapter_service.OltInterAdapterServiceClient, error) {
khenaidoo26721882021-08-11 17:42:52 -0400201 c.connectionLock.RLock()
202 defer c.connectionLock.RUnlock()
203 if c.service == nil {
khenaidoob9503212021-12-08 14:22:21 -0500204 return nil, fmt.Errorf("no parent adapter connection to %s", c.serverEndPoint)
khenaidoo26721882021-08-11 17:42:52 -0400205 }
khenaidooa5feb8e2021-10-19 17:29:22 -0400206 client, ok := c.service.(olt_inter_adapter_service.OltInterAdapterServiceClient)
khenaidoo26721882021-08-11 17:42:52 -0400207 if ok {
208 return client, nil
209 }
210 return nil, fmt.Errorf("invalid-service-%s", reflect.TypeOf(c.service))
211}
212
khenaidoo0927c722021-12-15 16:49:32 -0500213// GetAdapterServiceClient is a helper function that returns a concrete service instead of the GetClient() API
214// which returns an interface
215func (c *Client) GetAdapterServiceClient() (adapter_service.AdapterServiceClient, error) {
216 c.connectionLock.RLock()
217 defer c.connectionLock.RUnlock()
218 if c.service == nil {
219 return nil, fmt.Errorf("no adapter service connection to %s", c.serverEndPoint)
220 }
221 client, ok := c.service.(adapter_service.AdapterServiceClient)
222 if ok {
223 return client, nil
224 }
225 return nil, fmt.Errorf("invalid-service-%s", reflect.TypeOf(c.service))
226}
227
khenaidoo26721882021-08-11 17:42:52 -0400228func (c *Client) Reset(ctx context.Context) {
khenaidoo0927c722021-12-15 16:49:32 -0500229 logger.Debugw(ctx, "resetting-client-connection", log.Fields{"api-endpoint": c.serverEndPoint, "client": c.clientEndpoint})
khenaidoo26721882021-08-11 17:42:52 -0400230 c.stateLock.Lock()
231 defer c.stateLock.Unlock()
232 if c.state == stateConnected {
233 c.state = stateDisconnected
234 c.events <- eventDisconnected
235 }
236}
237
khenaidoo0927c722021-12-15 16:49:32 -0500238// executeWithTimeout runs a sending function (sf) along with a receiving one(rf) and returns an error, if any.
nikesh.krishnanb547c1a2023-03-11 03:05:16 +0530239// If the deadline elapses first, it returns a grpc DeadlineExceeded error instead.
khenaidoo0927c722021-12-15 16:49:32 -0500240func (c *Client) executeWithTimeout(sf func(*common.Connection) error, rf func() (interface{}, error), conn *common.Connection, d time.Duration) error {
241 errChan := make(chan error, 1)
242 go func() {
243 err := sf(conn)
244 logger.Debugw(context.Background(), "message-sent", log.Fields{"error": err, "qpi-endpoint": c.serverEndPoint, "client": c.clientEndpoint})
245 if err == nil {
246 response, err := rf()
247 logger.Debugw(context.Background(), "message-received", log.Fields{"error": err, "qpi-endpoint": c.serverEndPoint, "client": c.clientEndpoint, "health": response})
248 }
249 errChan <- err
250 close(errChan)
251 }()
252 t := time.NewTimer(d)
253 select {
254 case <-t.C:
255 return status.Errorf(codes.DeadlineExceeded, "timeout-on-sending-message")
256 case err := <-errChan:
257 if !t.Stop() {
258 <-t.C
khenaidoo26721882021-08-11 17:42:52 -0400259 }
260 return err
261 }
khenaidoo26721882021-08-11 17:42:52 -0400262}
263
khenaidoo0927c722021-12-15 16:49:32 -0500264func (c *Client) monitorConnection(ctx context.Context) {
265 logger.Debugw(ctx, "monitor-connection-started", log.Fields{"qpi-endpoint": c.serverEndPoint, "client": c.clientEndpoint})
khenaidoo26721882021-08-11 17:42:52 -0400266
khenaidoo0927c722021-12-15 16:49:32 -0500267 // If we exit, assume disconnected
268 defer func() {
269 c.stateLock.Lock()
270 if !c.done && (c.state == stateConnected || c.state == stateValidatingConnection) {
271 // Handle only connected state here. We need the validating state to know if we need to backoff before a retry
Sridhar Ravindra729e4b02025-02-10 16:41:14 +0530272 if c.state == stateConnected {
273 c.state = stateDisconnected
274 }
khenaidoo0927c722021-12-15 16:49:32 -0500275 logger.Warnw(ctx, "sending-disconnect-event", log.Fields{"api-endpoint": c.serverEndPoint, "client": c.clientEndpoint, "curr-state": stateConnected, "new-state": c.state})
276 c.events <- eventDisconnected
277 } else {
278 logger.Debugw(ctx, "no-state-change-needed", log.Fields{"api-endpoint": c.serverEndPoint, "client": c.clientEndpoint, "state": c.state, "client-done": c.done})
khenaidoo26721882021-08-11 17:42:52 -0400279 }
khenaidoo0927c722021-12-15 16:49:32 -0500280 c.stateLock.Unlock()
281 logger.Debugw(ctx, "monitor-connection-ended", log.Fields{"api-endpoint": c.serverEndPoint, "client": c.clientEndpoint})
282 }()
283
284 c.connectionLock.RLock()
285 conn := c.connection
286 c.connectionLock.RUnlock()
287 if conn == nil {
288 logger.Errorw(ctx, "connection-nil", log.Fields{"api-endpoint": c.serverEndPoint, "client": c.clientEndpoint})
289 return
khenaidoo26721882021-08-11 17:42:52 -0400290 }
khenaidoo26721882021-08-11 17:42:52 -0400291
khenaidoo0927c722021-12-15 16:49:32 -0500292 // Get a new client using reflection. The server can implement any grpc service, but it
293 // needs to also implement the "StartKeepAliveStream" API
294 grpcReflectClient := grpcreflect.NewClient(ctx, rpb.NewServerReflectionClient(conn))
295 if grpcReflectClient == nil {
296 logger.Errorw(ctx, "grpc-reflect-client-nil", log.Fields{"api-endpoint": c.serverEndPoint, "client": c.clientEndpoint})
297 return
khenaidoo26721882021-08-11 17:42:52 -0400298 }
khenaidoo26721882021-08-11 17:42:52 -0400299
khenaidoo0927c722021-12-15 16:49:32 -0500300 // Get the list of services - there should be 2 services: a server reflection and the voltha service we are interested in
301 services, err := grpcReflectClient.ListServices()
302 if err != nil {
303 logger.Errorw(ctx, "list-services-error", log.Fields{"api-endpoint": c.serverEndPoint, "client": c.clientEndpoint, "error": err})
304 return
305 }
khenaidoo26721882021-08-11 17:42:52 -0400306
khenaidoo0927c722021-12-15 16:49:32 -0500307 // Filter out the service
308 logger.Debugw(ctx, "services", log.Fields{"services": services})
309 serviceOfInterest := ""
310 for _, service := range services {
311 if strings.EqualFold(service, c.remoteServiceName) {
312 serviceOfInterest = service
313 break
314 }
315 }
316 if serviceOfInterest == "" {
317 logger.Errorw(ctx, "no-service-found", log.Fields{"api-endpoint": c.serverEndPoint, "client": c.clientEndpoint, "services": services, "expected-remote-service": c.remoteServiceName})
318 return
319 }
khenaidooaa290962021-10-22 18:14:33 -0400320
khenaidoo0927c722021-12-15 16:49:32 -0500321 // Resolve the service
322 resolvedService, err := grpcReflectClient.ResolveService(serviceOfInterest)
323 if err != nil {
324 logger.Errorw(ctx, "service-error", log.Fields{"api-endpoint": c.serverEndPoint, "client": c.clientEndpoint, "service": resolvedService, "error": err})
325 return
326 }
327
328 // Find the method of interest
329 method := resolvedService.FindMethodByName("GetHealthStatus")
330 if method == nil {
331 logger.Errorw(ctx, "nil-method", log.Fields{"api-endpoint": c.serverEndPoint, "client": c.clientEndpoint, "service": resolvedService})
332 return
333 }
334 logger.Debugw(ctx, "resolved-to-method", log.Fields{"service": resolvedService.GetName(), "method": method.GetName()})
335
336 // Get a dynamic connection
337 dynamicConn := grpcdynamic.NewStub(conn)
338
339 // Get the stream and send this client information
340 streamCtx, streamDone := context.WithCancel(log.WithSpanFromContext(context.Background(), ctx))
341 defer streamDone()
342 stream, err := dynamicConn.InvokeRpcBidiStream(streamCtx, method)
343 if err != nil {
344 logger.Errorw(ctx, "stream-error", log.Fields{"api-endpoint": c.serverEndPoint, "client": c.clientEndpoint, "service": resolvedService, "error": err})
345 return
346 }
347
348 clientInfo := &common.Connection{
349 Endpoint: c.clientEndpoint,
350 ContextInfo: c.clientContextData,
351 KeepAliveInterval: int64(c.monitorInterval),
352 }
353
354 initialConnection := true
khenaidoo26721882021-08-11 17:42:52 -0400355loop:
356 for {
khenaidoo0927c722021-12-15 16:49:32 -0500357 // Let's send a keep alive message with our info
358 err := c.executeWithTimeout(
359 func(conn *common.Connection) error { return stream.SendMsg(conn) },
360 func() (interface{}, error) { return stream.RecvMsg() },
361 clientInfo,
362 c.monitorInterval)
khenaidoo26721882021-08-11 17:42:52 -0400363
khenaidoo0927c722021-12-15 16:49:32 -0500364 if err != nil {
365 // Any error means the far end is gone
366 logger.Errorw(ctx, "sending-stream-error", log.Fields{"error": err, "api-endpoint": c.serverEndPoint, "client": c.clientEndpoint, "context": stream.Context().Err()})
khenaidoo26721882021-08-11 17:42:52 -0400367 break loop
khenaidoo0927c722021-12-15 16:49:32 -0500368 }
369 // Send a connect event
370 if initialConnection {
371 logger.Debugw(ctx, "first-stream-data-sent", log.Fields{"api-endpoint": c.serverEndPoint, "client": c.clientEndpoint})
372 c.events <- eventConnected
373 initialConnection = false
374 }
375 logger.Debugw(ctx, "stream-data-sent", log.Fields{"api-endpoint": c.serverEndPoint, "client": c.clientEndpoint})
376 // Update liveness, if configured
377 c.livenessLock.RLock()
378 if c.livenessCallback != nil {
379 go c.livenessCallback(time.Now())
380 }
381 c.livenessLock.RUnlock()
khenaidoo26721882021-08-11 17:42:52 -0400382
khenaidoo0927c722021-12-15 16:49:32 -0500383 // Wait to send the next keep alive
384 keepAliveTimer := time.NewTimer(time.Duration(clientInfo.KeepAliveInterval))
385 select {
386 case <-ctx.Done():
387 logger.Warnw(ctx, "context-done", log.Fields{"api-endpont": c.serverEndPoint, "client": c.clientEndpoint})
388 break loop
389 case <-stream.Context().Done():
390 logger.Debugw(ctx, "stream-context-done", log.Fields{"api-endpoint": c.serverEndPoint, "stream-info": stream.Context(), "client": c.clientEndpoint})
391 break loop
392 case <-keepAliveTimer.C:
393 continue
khenaidoo26721882021-08-11 17:42:52 -0400394 }
395 }
khenaidoo0927c722021-12-15 16:49:32 -0500396 if stream != nil {
397 if err := stream.CloseSend(); err != nil {
398 logger.Warnw(ctx, "closing-stream-error", log.Fields{"error": err, "api-endpoint": c.serverEndPoint, "client": c.clientEndpoint})
399 }
400 }
khenaidoo26721882021-08-11 17:42:52 -0400401}
402
403// Start kicks off the adapter agent by trying to connect to the adapter
nikesh.krishnanb547c1a2023-03-11 03:05:16 +0530404func (c *Client) Start(ctx context.Context, handler GetServiceClient, retry_interceptor ...grpc.UnaryClientInterceptor) {
khenaidoob9503212021-12-08 14:22:21 -0500405 logger.Debugw(ctx, "Starting GRPC - Client", log.Fields{"api-endpoint": c.serverEndPoint})
khenaidoo26721882021-08-11 17:42:52 -0400406
407 // If the context contains a k8s probe then register services
408 p := probe.GetProbeFromContext(ctx)
409 if p != nil {
khenaidoob9503212021-12-08 14:22:21 -0500410 p.RegisterService(ctx, c.serverEndPoint)
khenaidoo26721882021-08-11 17:42:52 -0400411 }
412
khenaidoo0927c722021-12-15 16:49:32 -0500413 var monitorConnectionCtx context.Context
414 var monitorConnectionDone func()
khenaidoo26721882021-08-11 17:42:52 -0400415
416 initialConnection := true
417 c.events <- eventConnecting
418 backoff := NewBackoff(c.backoffInitialInterval, c.backoffMaxInterval, c.backoffMaxElapsedTime)
419 attempt := 1
420loop:
421 for {
422 select {
423 case <-ctx.Done():
khenaidoo0927c722021-12-15 16:49:32 -0500424 logger.Warnw(ctx, "context-closing", log.Fields{"api_endpoint": c.serverEndPoint, "client": c.clientEndpoint, "context": ctx})
425 c.connectionLock.Lock()
426 if !c.done {
427 c.done = true
428 c.events <- eventStopped
429 close(c.events)
430 }
431 c.connectionLock.Unlock()
432 // break loop
khenaidoo26721882021-08-11 17:42:52 -0400433 case event := <-c.events:
khenaidoo0927c722021-12-15 16:49:32 -0500434 logger.Debugw(ctx, "received-event", log.Fields{"event": event, "api-endpoint": c.serverEndPoint, "client": c.clientEndpoint})
khenaidoofe90ac32021-11-08 18:17:32 -0500435 c.connectionLock.RLock()
436 // On a client stopped, just allow the stop event to go through
437 if c.done && event != eventStopped {
438 c.connectionLock.RUnlock()
khenaidoo0927c722021-12-15 16:49:32 -0500439 logger.Debugw(ctx, "ignoring-event-on-client-stop", log.Fields{"event": event, "api-endpoint": c.serverEndPoint, "client": c.clientEndpoint})
khenaidoofe90ac32021-11-08 18:17:32 -0500440 continue
441 }
442 c.connectionLock.RUnlock()
khenaidoo26721882021-08-11 17:42:52 -0400443 switch event {
444 case eventConnecting:
khenaidoo26721882021-08-11 17:42:52 -0400445 c.stateLock.Lock()
khenaidoo0927c722021-12-15 16:49:32 -0500446 logger.Debugw(ctx, "connection-start", log.Fields{"api-endpoint": c.serverEndPoint, "attempts": attempt, "curr-state": c.state, "client": c.clientEndpoint})
khenaidoo26721882021-08-11 17:42:52 -0400447 if c.state == stateConnected {
448 c.state = stateDisconnected
449 }
450 if c.state != stateConnecting {
451 c.state = stateConnecting
452 go func() {
nikesh.krishnanb547c1a2023-03-11 03:05:16 +0530453 var err error
454 if len(retry_interceptor) > 0 {
455 err = c.connectToEndpoint(ctx, p, retry_interceptor...)
456 } else {
457 err = c.connectToEndpoint(ctx, p)
458 }
459
460 if err != nil {
khenaidoo26721882021-08-11 17:42:52 -0400461 c.stateLock.Lock()
462 c.state = stateDisconnected
463 c.stateLock.Unlock()
khenaidoo0927c722021-12-15 16:49:32 -0500464 logger.Errorw(ctx, "connection-failed", log.Fields{"api-endpoint": c.serverEndPoint, "attempt": attempt, "client": c.clientEndpoint, "error": err})
khenaidoo26721882021-08-11 17:42:52 -0400465
466 // Retry connection after a delay
467 if err = backoff.Backoff(ctx); err != nil {
468 // Context has closed or reached maximum elapsed time, if set
khenaidoo0927c722021-12-15 16:49:32 -0500469 logger.Errorw(ctx, "retry-aborted", log.Fields{"api-endpoint": c.serverEndPoint, "client": c.clientEndpoint, "error": err})
khenaidoo26721882021-08-11 17:42:52 -0400470 return
471 }
472 attempt += 1
khenaidoofe90ac32021-11-08 18:17:32 -0500473 c.connectionLock.RLock()
474 if !c.done {
475 c.events <- eventConnecting
476 }
477 c.connectionLock.RUnlock()
khenaidoo26721882021-08-11 17:42:52 -0400478 }
479 }()
480 }
481 c.stateLock.Unlock()
482
khenaidoo0927c722021-12-15 16:49:32 -0500483 case eventValidatingConnection:
484 logger.Debugw(ctx, "connection-validation", log.Fields{"api-endpoint": c.serverEndPoint, "client": c.clientEndpoint})
485 c.stateLock.Lock()
486 if c.state != stateConnected {
487 c.state = stateValidatingConnection
488 }
489 c.stateLock.Unlock()
490 monitorConnectionCtx, monitorConnectionDone = context.WithCancel(context.Background())
491 go c.monitorConnection(monitorConnectionCtx)
492
khenaidoo26721882021-08-11 17:42:52 -0400493 case eventConnected:
khenaidoo26721882021-08-11 17:42:52 -0400494 attempt = 1
khenaidoo0927c722021-12-15 16:49:32 -0500495 backoff.Reset()
khenaidoo26721882021-08-11 17:42:52 -0400496 c.stateLock.Lock()
khenaidoo0927c722021-12-15 16:49:32 -0500497 logger.Debugw(ctx, "endpoint-connected", log.Fields{"api-endpoint": c.serverEndPoint, "curr-state": c.state, "client": c.clientEndpoint})
khenaidoo26721882021-08-11 17:42:52 -0400498 if c.state != stateConnected {
khenaidoo0927c722021-12-15 16:49:32 -0500499 // Setup the service
500 c.connectionLock.RLock()
501 conn := c.connection
502 c.connectionLock.RUnlock()
503
504 subCtx, cancel := context.WithTimeout(ctx, c.backoffMaxInterval)
505 svc := handler(subCtx, conn)
506 if svc != nil {
507 c.service = svc
508 if p != nil {
509 p.UpdateStatus(ctx, c.serverEndPoint, probe.ServiceStatusRunning)
510 }
511 logger.Infow(ctx, "connected-to-endpoint", log.Fields{"api-endpoint": c.serverEndPoint, "client": c.clientEndpoint})
512 } else {
513 // Should never happen, but just in case
514 logger.Warnw(ctx, "service-is-nil", log.Fields{"api-endpoint": c.serverEndPoint, "client": c.clientEndpoint})
515 c.events <- eventDisconnected
516 }
517 cancel()
khenaidoo26721882021-08-11 17:42:52 -0400518 c.state = stateConnected
519 if initialConnection {
khenaidoo0927c722021-12-15 16:49:32 -0500520 logger.Debugw(ctx, "initial-endpoint-connection", log.Fields{"api-endpoint": c.serverEndPoint, "client": c.clientEndpoint})
khenaidoo26721882021-08-11 17:42:52 -0400521 initialConnection = false
522 } else {
khenaidoo0927c722021-12-15 16:49:32 -0500523 logger.Debugw(ctx, "endpoint-reconnection", log.Fields{"api-endpoint": c.serverEndPoint, "client": c.clientEndpoint})
khenaidoo26721882021-08-11 17:42:52 -0400524 // Trigger any callback on a restart
525 go func() {
khenaidoob9503212021-12-08 14:22:21 -0500526 err := c.onRestart(log.WithSpanFromContext(context.Background(), ctx), c.serverEndPoint)
khenaidoo26721882021-08-11 17:42:52 -0400527 if err != nil {
khenaidoo0927c722021-12-15 16:49:32 -0500528 logger.Errorw(ctx, "unable-to-restart-endpoint", log.Fields{"error": err, "api-endpoint": c.serverEndPoint, "client": c.clientEndpoint})
khenaidoo26721882021-08-11 17:42:52 -0400529 }
530 }()
531 }
532 }
533 c.stateLock.Unlock()
534
535 case eventDisconnected:
536 if p != nil {
khenaidoob9503212021-12-08 14:22:21 -0500537 p.UpdateStatus(ctx, c.serverEndPoint, probe.ServiceStatusNotReady)
khenaidoo26721882021-08-11 17:42:52 -0400538 }
khenaidoo0927c722021-12-15 16:49:32 -0500539 connectionValidationFail := false
540 c.stateLock.Lock()
541 logger.Debugw(ctx, "endpoint-disconnected", log.Fields{"api-endpoint": c.serverEndPoint, "curr-state": c.state, "client": c.clientEndpoint})
Sridhar Ravindra729e4b02025-02-10 16:41:14 +0530542 if c.state == stateValidatingConnection {
khenaidoo0927c722021-12-15 16:49:32 -0500543 connectionValidationFail = true
544 c.state = stateDisconnected
545 }
546 c.stateLock.Unlock()
khenaidoo26721882021-08-11 17:42:52 -0400547
khenaidoo0927c722021-12-15 16:49:32 -0500548 // Stop the streaming connection
549 if monitorConnectionDone != nil {
550 monitorConnectionDone()
551 monitorConnectionDone = nil
552 }
553
554 if connectionValidationFail {
555 // Retry connection after a delay
556 if err := backoff.Backoff(ctx); err != nil {
557 // Context has closed or reached maximum elapsed time, if set
558 logger.Errorw(ctx, "retry-aborted", log.Fields{"api-endpoint": c.serverEndPoint, "client": c.clientEndpoint, "error": err})
559 return
560 }
561 }
562 c.connectionLock.RLock()
563 if !c.done {
Sridhar Ravindra729e4b02025-02-10 16:41:14 +0530564 c.events <- eventConnecting
khenaidoo0927c722021-12-15 16:49:32 -0500565 }
566 c.connectionLock.RUnlock()
khenaidoo26721882021-08-11 17:42:52 -0400567
568 case eventStopped:
khenaidoo0927c722021-12-15 16:49:32 -0500569 logger.Debugw(ctx, "endpoint-stopped", log.Fields{"api-endpoint": c.serverEndPoint, "client": c.clientEndpoint})
570
571 if monitorConnectionDone != nil {
572 monitorConnectionDone()
573 monitorConnectionDone = nil
574 }
575 if err := c.closeConnection(ctx, p); err != nil {
576 logger.Errorw(ctx, "endpoint-closing-connection-failed", log.Fields{"api-endpoint": c.serverEndPoint, "client": c.clientEndpoint, "error": err})
577 }
khenaidoo26721882021-08-11 17:42:52 -0400578 break loop
579 case eventError:
khenaidoo0927c722021-12-15 16:49:32 -0500580 logger.Errorw(ctx, "endpoint-error-event", log.Fields{"api-endpoint": c.serverEndPoint, "client": c.clientEndpoint})
khenaidoo26721882021-08-11 17:42:52 -0400581 default:
khenaidoo0927c722021-12-15 16:49:32 -0500582 logger.Errorw(ctx, "endpoint-unknown-event", log.Fields{"api-endpoint": c.serverEndPoint, "client": c.clientEndpoint, "error": event})
khenaidoo26721882021-08-11 17:42:52 -0400583 }
584 }
585 }
khenaidoo0927c722021-12-15 16:49:32 -0500586
587 // Stop the streaming connection
588 if monitorConnectionDone != nil {
589 logger.Debugw(ctx, "closing-connection-monitoring", log.Fields{"api-endpoint": c.serverEndPoint, "client": c.clientEndpoint})
590 monitorConnectionDone()
591 }
592
593 logger.Infow(ctx, "client-stopped", log.Fields{"api-endpoint": c.serverEndPoint, "client": c.clientEndpoint})
khenaidoo26721882021-08-11 17:42:52 -0400594}
595
nikesh.krishnanb547c1a2023-03-11 03:05:16 +0530596func (c *Client) connectToEndpoint(ctx context.Context, p *probe.Probe, retry_interceptor ...grpc.UnaryClientInterceptor) error {
khenaidoo26721882021-08-11 17:42:52 -0400597 if p != nil {
khenaidoob9503212021-12-08 14:22:21 -0500598 p.UpdateStatus(ctx, c.serverEndPoint, probe.ServiceStatusPreparing)
khenaidoo26721882021-08-11 17:42:52 -0400599 }
600
601 c.connectionLock.Lock()
602 defer c.connectionLock.Unlock()
603
604 if c.connection != nil {
605 _ = c.connection.Close()
606 c.connection = nil
607 }
608
609 c.service = nil
610
611 // Use Interceptors to:
612 // 1. automatically inject
613 // 2. publish Open Tracing Spans by this GRPC Client
614 // 3. detect connection failure on client calls such that the reconnection process can begin
Abhay Kumar685507d2025-10-06 09:04:09 +0000615 interceptor_opts := []grpc.UnaryClientInterceptor{
616 grpc_opentracing.UnaryClientInterceptor(grpc_opentracing.WithTracer(log.ActiveTracerProxy{})),
617 grpc_prometheus.UnaryClientInterceptor,
618 }
nikesh.krishnanb547c1a2023-03-11 03:05:16 +0530619
Abhay Kumar685507d2025-10-06 09:04:09 +0000620 grpc_prometheus.EnableClientHandlingTimeHistogram()
nikesh.krishnanb547c1a2023-03-11 03:05:16 +0530621 if len(retry_interceptor) > 0 {
622 interceptor_opts = append(interceptor_opts, retry_interceptor...)
623 }
khenaidoob9503212021-12-08 14:22:21 -0500624 conn, err := grpc.Dial(c.serverEndPoint,
khenaidoo26721882021-08-11 17:42:52 -0400625 grpc.WithInsecure(),
abhay116c4d42025-03-21 00:35:07 +0530626 grpc.WithDefaultCallOptions(grpc.MaxCallRecvMsgSize(grpcRecvMsgSizeLimit*1024*1024)),
khenaidoo26721882021-08-11 17:42:52 -0400627 grpc.WithStreamInterceptor(grpc_middleware.ChainStreamClient(
628 grpc_opentracing.StreamClientInterceptor(grpc_opentracing.WithTracer(log.ActiveTracerProxy{})),
Abhay Kumar685507d2025-10-06 09:04:09 +0000629 grpc_prometheus.StreamClientInterceptor,
khenaidoo26721882021-08-11 17:42:52 -0400630 )),
nikesh.krishnanb547c1a2023-03-11 03:05:16 +0530631 grpc.WithUnaryInterceptor(grpc_middleware.ChainUnaryClient(interceptor_opts...)),
khenaidoo26721882021-08-11 17:42:52 -0400632 )
633
634 if err == nil {
khenaidoo0927c722021-12-15 16:49:32 -0500635 c.connection = conn
636 c.events <- eventValidatingConnection
637 return nil
638 } else {
639 logger.Warnw(ctx, "no-connection-to-endpoint", log.Fields{"api-endpoint": c.serverEndPoint, "client": c.clientEndpoint, "error": err})
khenaidoo26721882021-08-11 17:42:52 -0400640 }
khenaidoo26721882021-08-11 17:42:52 -0400641
642 if p != nil {
khenaidoob9503212021-12-08 14:22:21 -0500643 p.UpdateStatus(ctx, c.serverEndPoint, probe.ServiceStatusFailed)
khenaidoo26721882021-08-11 17:42:52 -0400644 }
khenaidoo0927c722021-12-15 16:49:32 -0500645 return fmt.Errorf("no connection to api endpoint %s", c.serverEndPoint)
khenaidoo26721882021-08-11 17:42:52 -0400646}
647
648func (c *Client) closeConnection(ctx context.Context, p *probe.Probe) error {
649 if p != nil {
khenaidoob9503212021-12-08 14:22:21 -0500650 p.UpdateStatus(ctx, c.serverEndPoint, probe.ServiceStatusStopped)
khenaidoo26721882021-08-11 17:42:52 -0400651 }
khenaidoo0927c722021-12-15 16:49:32 -0500652 logger.Infow(ctx, "client-closing-connection", log.Fields{"api-endpoint": c.serverEndPoint, "client": c.clientEndpoint})
khenaidoo26721882021-08-11 17:42:52 -0400653
654 c.connectionLock.Lock()
655 defer c.connectionLock.Unlock()
656
657 if c.connection != nil {
658 err := c.connection.Close()
khenaidoo0927c722021-12-15 16:49:32 -0500659 c.service = nil
khenaidoo26721882021-08-11 17:42:52 -0400660 c.connection = nil
661 return err
662 }
663
664 return nil
665}
666
667func (c *Client) Stop(ctx context.Context) {
khenaidoo0927c722021-12-15 16:49:32 -0500668 logger.Infow(ctx, "client-stop-request-event-received", log.Fields{"api-endpoint": c.serverEndPoint, "client": c.clientEndpoint})
khenaidoofe90ac32021-11-08 18:17:32 -0500669 c.connectionLock.Lock()
670 defer c.connectionLock.Unlock()
khenaidoo26721882021-08-11 17:42:52 -0400671 if !c.done {
khenaidoofe90ac32021-11-08 18:17:32 -0500672 c.done = true
khenaidoo26721882021-08-11 17:42:52 -0400673 c.events <- eventStopped
674 close(c.events)
khenaidoo26721882021-08-11 17:42:52 -0400675 }
khenaidoo0927c722021-12-15 16:49:32 -0500676 logger.Infow(ctx, "client-stop-request-event-sent", log.Fields{"api-endpoint": c.serverEndPoint, "client": c.clientEndpoint})
khenaidoo26721882021-08-11 17:42:52 -0400677}
678
679// SetService is used for testing only
680func (c *Client) SetService(srv interface{}) {
681 c.connectionLock.Lock()
682 defer c.connectionLock.Unlock()
683 c.service = srv
684}
685
686func (c *Client) SubscribeForLiveness(callback func(timestamp time.Time)) {
khenaidoo0927c722021-12-15 16:49:32 -0500687 c.livenessLock.Lock()
688 defer c.livenessLock.Unlock()
khenaidoo26721882021-08-11 17:42:52 -0400689 c.livenessCallback = callback
690}