[VOL-5485] update transient state to None if delete-device fails and return err for ChildDeviceLost

Change-Id: I4e4b247f1592a18def550b81c9e86bc7ac73d3dc
Signed-off-by: Akash Reddy Kankanala <akash.kankanala@radisys.com>
diff --git a/VERSION b/VERSION
index 4cc5902..2d6744e 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-3.6.22
+3.6.23
diff --git a/rw_core/core/device/agent.go b/rw_core/core/device/agent.go
index 74c2475..38197e6 100755
--- a/rw_core/core/device/agent.go
+++ b/rw_core/core/device/agent.go
@@ -305,19 +305,27 @@
 
 // onDeleteSuccess is a common callback for scenarios where we receive a nil response following a delete request
 // to an adapter.
-func (agent *Agent) onDeleteSuccess(ctx context.Context, prevState, currState *common.AdminState_Types) {
+func (agent *Agent) onDeleteSuccess(ctx context.Context, prevState, currState *common.AdminState_Types) error {
 	if err := agent.requestQueue.WaitForGreenLight(ctx); err != nil {
 		logger.Errorw(ctx, "delete-device-failure", log.Fields{"device-id": agent.deviceID, "error": err})
+		return err
 	}
 	previousDeviceTransientState := agent.getTransientState()
 	newDevice := agent.cloneDeviceWithoutLock()
 	if err := agent.updateDeviceWithTransientStateAndReleaseLock(ctx, newDevice,
 		core.DeviceTransientState_DELETING_POST_ADAPTER_RESPONSE, previousDeviceTransientState); err != nil {
+		ctx1, cancel1 := context.WithTimeout(context.Background(), agent.rpcTimeout)               // in case ctx was cancelled, updateTransientState would fail, so create a new context for the update
+		if err1 := agent.updateTransientState(ctx1, core.DeviceTransientState_NONE); err1 != nil { // reset the device transient state if the transition handlers fail, so the next retry can go through
+			logger.Errorf(ctx, "failed-to-reset-transient-state-to-none: %s", err1)
+		}
+		cancel1()
 		logger.Errorw(ctx, "delete-device-failure", log.Fields{"device-id": agent.deviceID, "error": err})
+		return err
 	}
 	requestStatus := &common.OperationResp{Code: common.OperationResp_OPERATION_SUCCESS}
 	desc := "adapter-response"
 	agent.logDeviceUpdate(ctx, prevState, currState, requestStatus, nil, desc)
+	return nil
 }
 
 // onDeleteFailure is a common callback for scenarios where we receive an error response following a delete request
@@ -754,6 +762,12 @@
 	// Update device and release lock
 	if err = agent.updateDeviceWithTransientStateAndReleaseLock(ctx, device,
 		currentDeviceTransientState, previousDeviceTransientState); err != nil {
+		ctx1, cancel1 := context.WithTimeout(context.Background(), agent.rpcTimeout)               // in case ctx was cancelled, updateTransientState would fail, so create a new context for the update
+		if err1 := agent.updateTransientState(ctx1, core.DeviceTransientState_NONE); err1 != nil { // reset the device transient state if the transition handlers fail, so the next retry can go through
+			logger.Errorf(ctx, "failed-to-reset-transient-state-to-none: %s", err1)
+		}
+		cancel1()
+
 		desc = err.Error()
 		return err
 	}
@@ -774,10 +788,12 @@
 		}
 		subCtx, cancel := context.WithTimeout(coreutils.WithAllMetadataFromContext(ctx), agent.rpcTimeout)
 		requestStatus.Code = common.OperationResp_OPERATION_IN_PROGRESS
-		if _, err = client.DeleteDevice(subCtx, device); err != nil {
-			agent.onDeleteFailure(subCtx, err, &previousAdminState, &agent.device.AdminState)
+		_, err = client.DeleteDevice(subCtx, device)
+		if (err == nil) || (status.Code(err) == codes.NotFound) {
+			err = agent.onDeleteSuccess(subCtx, &previousAdminState, &agent.device.AdminState) // return the error if the device transition update fails, so that northbound can retry
+
 		} else {
-			agent.onDeleteSuccess(subCtx, &previousAdminState, &agent.device.AdminState)
+			agent.onDeleteFailure(subCtx, err, &previousAdminState, &agent.device.AdminState)
 		}
 		cancel()
 	}
@@ -1084,6 +1100,7 @@
 		rpce := agent.deviceMgr.NewRPCEvent(ctx, agent.deviceID, err.Error(), nil)
 		go agent.deviceMgr.SendRPCEvent(ctx, "RPC_ERROR_RAISE_EVENT", rpce, voltha.EventCategory_COMMUNICATION,
 			nil, time.Now().Unix())
+		return err
 	}
 
 	return nil
@@ -1137,7 +1154,7 @@
 	}
 	if err = agent.deviceMgr.canAdapterRequestProceed(ctx, agent.deviceID); err != nil {
 		logger.Errorw(ctx, "adapter-request-cannot-proceed", log.Fields{"device-id": agent.deviceID, "error": err})
-		return err
+		return nil // since the error from the child-device-lost call is now returned, and canAdapterRequestProceed will fail for forceDeleteDevice of the OLT, return nil here
 	}
 	// send request to adapter
 	client, err := agent.adapterMgr.GetAdapterClient(ctx, agent.adapterEndpoint)
diff --git a/rw_core/core/device/logical_manager.go b/rw_core/core/device/logical_manager.go
index 0d89e8e..6e009f2 100644
--- a/rw_core/core/device/logical_manager.go
+++ b/rw_core/core/device/logical_manager.go
@@ -284,10 +284,12 @@
 	//	retrieve parent device using child device ID
 	// TODO: return (string, have) instead of *string
 	//       also: If not root device, just return device.parentID instead of loading the parent device.
-	if parentDevice := ldMgr.deviceMgr.getParentDevice(ctx, device); parentDevice != nil {
+	parentDevice, err := ldMgr.deviceMgr.getParentDevice(ctx, device)
+	if parentDevice != nil && err == nil {
+
 		return &parentDevice.ParentId, nil
 	}
-	return nil, status.Errorf(codes.NotFound, "%s", device.Id)
+	return nil, status.Errorf(codes.NotFound, "LogicalDeviceId for %s NotFound with Error %v", device.Id, err)
 }
 
 func (ldMgr *LogicalManager) getLogicalDeviceIDFromDeviceID(ctx context.Context, deviceID string) (*string, error) {
diff --git a/rw_core/core/device/manager.go b/rw_core/core/device/manager.go
index e27ea43..0b5df56 100755
--- a/rw_core/core/device/manager.go
+++ b/rw_core/core/device/manager.go
@@ -679,14 +679,18 @@
 	return status.Errorf(codes.NotFound, "%s", device.Id)
 }
 
-func (dMgr *Manager) getParentDevice(ctx context.Context, childDevice *voltha.Device) *voltha.Device {
+func (dMgr *Manager) getParentDevice(ctx context.Context, childDevice *voltha.Device) (*voltha.Device, error) {
 	//	Sanity check
 	if childDevice.Root {
 		// childDevice is the parent device
-		return childDevice
+		return childDevice, nil
 	}
-	parentDevice, _ := dMgr.getDeviceReadOnly(ctx, childDevice.ParentId)
-	return parentDevice
+	parentDevice, err := dMgr.getDeviceReadOnly(ctx, childDevice.ParentId)
+	if err != nil {
+		return nil, err
+	}
+	return parentDevice, nil
+
 }
 
 /*
diff --git a/rw_core/core/device/manager_state_callback.go b/rw_core/core/device/manager_state_callback.go
index 62f79e6..1cb69f2 100644
--- a/rw_core/core/device/manager_state_callback.go
+++ b/rw_core/core/device/manager_state_callback.go
@@ -191,6 +191,7 @@
 		if err := parentAgent.ChildDeviceLost(ctx, curr); err != nil {
 			// Just log the message and let the remaining pipeline proceed.
 			logger.Warnw(ctx, "childDeviceLost", log.Fields{"child-device-id": curr.Id, "parent-device-id": curr.ParentId, "error": err})
+			return err
 		}
 	}
 	// Do not return an error as parent device may also have been deleted.  Let the remaining pipeline proceed.